From 4d2ec827c22028afc947acbe06859a0b6b6af8b9 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Wed, 3 Jun 2026 04:09:10 -0700
Subject: [PATCH 01/22] port fast mps parser to tree

---
 cpp/CMakeLists.txt                            |   30 +-
 cpp/cuopt_cli.cpp                             |   22 +-
 .../cuopt/linear_programming/io/parser.hpp    |   59 +-
 cpp/src/CMakeLists.txt                        |    1 +
 cpp/src/io/CMakeLists.txt                     |   10 +
 .../fast_parse_primitives.hpp                 |  590 ++++
 .../io/experimental_mps_fast/fast_parser.cpp  | 2770 +++++++++++++++++
 .../io/experimental_mps_fast/fast_parser.hpp  |   19 +
 .../fast_parser_adapter.cpp                   |   23 +
 .../io/experimental_mps_fast/file_reader.cpp  |  252 ++
 .../io/experimental_mps_fast/file_reader.hpp  |  168 +
 .../hash_table_smallstr.hpp                   |  330 ++
 .../experimental_mps_fast/lz4_file_reader.cpp |  759 +++++
 .../io/experimental_mps_fast/mmap_region.hpp  |  141 +
 .../mps_section_scanner.cpp                   |  413 +++
 .../mps_section_scanner.hpp                   |   98 +
 .../io/experimental_mps_fast/nvtx_ranges.hpp  |  135 +
 .../io/experimental_mps_fast/simd_compat.hpp  |   10 +
 cpp/tests/linear_programming/parser_test.cpp  |  107 +-
 19 files changed, 5905 insertions(+), 32 deletions(-)
 create mode 100644 cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
 create mode 100644 cpp/src/io/experimental_mps_fast/fast_parser.cpp
 create mode 100644 cpp/src/io/experimental_mps_fast/fast_parser.hpp
 create mode 100644 cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
 create mode 100644 cpp/src/io/experimental_mps_fast/file_reader.cpp
 create mode 100644 cpp/src/io/experimental_mps_fast/file_reader.hpp
 create mode 100644 cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
 create mode 100644 cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
 create mode 100644 cpp/src/io/experimental_mps_fast/mmap_region.hpp
 create mode 100644 cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
 create mode 100644 cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
 create mode 100644 cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
 create mode 100644 cpp/src/io/experimental_mps_fast/simd_compat.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7e2dd099c1..60227547b4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -202,9 +202,11 @@ endif ()
 find_package(OpenMP REQUIRED)
 message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}")
 
-# MPS/QPS parser supports compressed inputs via bzip2 and zlib
+# MPS/QPS parser supports compressed inputs via bzip2 and zlib; the experimental fast MPS parser
+# supports LZ4 via runtime-loaded liblz4.
 option(CUOPT_PARSER_WITH_BZIP2 "Build MPS parser with bzip2 decompression" ON)
 option(CUOPT_PARSER_WITH_ZLIB "Build MPS parser with zlib decompression" ON)
+option(CUOPT_PARSER_WITH_LZ4 "Build experimental fast MPS parser with LZ4 decompression" ON)
 if (CUOPT_PARSER_WITH_BZIP2)
     find_package(BZip2 REQUIRED)
     add_compile_definitions(MPS_PARSER_WITH_BZIP2)
@@ -213,6 +215,10 @@ if (CUOPT_PARSER_WITH_ZLIB)
     find_package(ZLIB REQUIRED)
     add_compile_definitions(MPS_PARSER_WITH_ZLIB)
 endif ()
+if (CUOPT_PARSER_WITH_LZ4)
+    # No headers or link target needed; the experimental reader loads one liblz4 symbol at runtime.
+    add_compile_definitions(MPS_PARSER_WITH_LZ4)
+endif ()
 
 # Debug options
 if (CMAKE_BUILD_TYPE MATCHES Debug)
@@ -250,6 +256,20 @@ else ()
     find_package(RAFT REQUIRED)
 endif ()
 
+rapids_cpm_find(simde 0.8.2
+        CPM_ARGS
+        GIT_REPOSITORY https://github.com/simd-everywhere/simde.git
+        GIT_TAG v0.8.2
+        GIT_SHALLOW TRUE
+        DOWNLOAD_ONLY TRUE
+)
+
+if (NOT TARGET simde::simde)
+    add_library(simde::simde INTERFACE IMPORTED GLOBAL)
+    set_target_properties(simde::simde
+            PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${simde_SOURCE_DIR}")
+endif ()
+
 FetchContent_Declare(
         papilo
         GIT_REPOSITORY "https://github.com/scipopt/papilo.git"
@@ -436,11 +456,18 @@ if (BUILD_TESTS)
 endif ()
 
 set(CUOPT_SRC_FILES)
+set(MPS_FAST_SRC_FILES)
 add_subdirectory(src)
 if (HOST_LINEINFO)
     set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
 endif ()
 
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND
+        CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$")
+    set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
+            APPEND PROPERTY COMPILE_OPTIONS "-mavx2;-maes;-msse4.2")
+endif ()
+
 # Apply -UNDEBUG only to solver source files (not gRPC infrastructure).
 # Must happen before gRPC files are appended to CUOPT_SRC_FILES.
 # Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO).
@@ -596,6 +623,7 @@ target_link_libraries(cuopt
         ${CUDSS_LIB_FILE}
         PRIVATE
         ${CUOPT_PRIVATE_CUDA_LIBS}
+        simde::simde
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )
diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index 37876cac7a..e99462091e 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -90,11 +90,13 @@ inline cuopt::init_logger_t dummy_logger(
  *                  .mps/.qps and their .gz/.bz2 variants → MPS parser;
  *                  anything else is rejected.
  * @param initial_solution_file Path to initial solution file in SOL format
+ * @param mps_reader MPS reader implementation selected by the CLI
  * @param settings Merged solver settings (config file loaded in main, then CLI overrides applied)
  */
 int run_single_file(const std::string& file_path,
                     const std::string& initial_solution_file,
                     bool solve_relaxation,
+                    cuopt::linear_programming::io::mps_reader_type_t mps_reader,
                     cuopt::linear_programming::solver_settings_t<int, double>& settings)
 {
   cuopt::init_logger_t log(settings.get_parameter<std::string>(CUOPT_LOG_FILE),
@@ -108,7 +110,7 @@ int run_single_file(const std::string& file_path,
   {
     CUOPT_LOG_INFO("Reading file %s", base_filename.c_str());
     try {
-      mps_data_model = cuopt::linear_programming::io::read<int, double>(file_path);
+      mps_data_model = cuopt::linear_programming::io::read<int, double>(file_path, mps_reader);
     } catch (const std::logic_error& e) {
       CUOPT_LOG_ERROR("Parser exception: %s", e.what());
       parsing_failed = true;
@@ -285,7 +287,8 @@ int main(int argc, char* argv[])
     .help(
       "input problem file; format dispatched by extension (case-insensitive). "
       "Supported: .lp, .mps, .qps and their .gz / .bz2 compressed variants "
-      "(e.g. .lp.gz, .mps.bz2, .qps.gz)")
+      "(e.g. .lp.gz, .mps.bz2, .qps.gz). Experimental .mps.lz4 inputs require "
+      "--mps-reader fast")
     .nargs(1)
     .required();
 
@@ -303,6 +306,13 @@ int main(int argc, char* argv[])
     .help("path to parameter config file (key = value format, supports all parameters)")
     .default_value(std::string(""));
 
+  program.add_argument("--mps-reader")
+    .help(
+      "MPS reader implementation: default uses the production parser; fast uses the experimental "
+      "SIMD parser for LP/MIP .mps and .mps.lz4 files")
+    .default_value(std::string("default"))
+    .choices("default", "fast");
+
   program.add_argument("--dump-hyper-params")
     .help("print hyper-parameters only in config file format and exit")
     .default_value(false)
@@ -403,6 +413,12 @@ int main(int argc, char* argv[])
   const auto initial_solution_file = program.get<std::string>("--initial-solution");
   const auto solve_relaxation      = program.get<bool>("--relaxation");
   const auto params_file           = program.get<std::string>("--params-file");
+  const auto mps_reader_arg        = program.get<std::string>("--mps-reader");
+
+  auto mps_reader = cuopt::linear_programming::io::mps_reader_type_t::default_reader;
+  if (mps_reader_arg == "fast") {
+    mps_reader = cuopt::linear_programming::io::mps_reader_type_t::fast_experimental;
+  }
 
   cuopt::linear_programming::solver_settings_t<int, double> settings;
   try {
@@ -432,5 +448,5 @@ int main(int argc, char* argv[])
     RAFT_CUDA_TRY(cudaSetDevice(0));
   }
 
-  return run_single_file(file_name, initial_solution_file, solve_relaxation, settings);
+  return run_single_file(file_name, initial_solution_file, solve_relaxation, mps_reader, settings);
 }
diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp
index a63e40f31f..1d47590287 100644
--- a/cpp/include/cuopt/linear_programming/io/parser.hpp
+++ b/cpp/include/cuopt/linear_programming/io/parser.hpp
@@ -17,6 +17,14 @@
 
 namespace cuopt::linear_programming::io {
 
+/**
+ * @brief Selects which MPS reader implementation should be used by dispatching entry points.
+ *
+ * The experimental fast reader is intentionally opt-in. It currently supports LP/MIP problems
+ * from raw .mps and .mps.lz4 files only.
+ */
+enum class mps_reader_type_t { default_reader, fast_experimental };
+
 /**
  * @brief Reads the equation from an MPS or QPS file.
  *
@@ -43,6 +51,18 @@ template <typename i_t, typename f_t>
 mps_data_model_t<i_t, f_t> read_mps(const std::string& mps_file_path,
                                     bool fixed_mps_format = false);
 
+/**
+ * @brief Reads a raw LP/MIP MPS problem with the experimental SIMD-optimized reader.
+ *
+ * This prototype reader supports raw .mps and .mps.lz4 files only. It does not support LP, QPS,
+ * quadratic MPS sections, fixed-format forcing, or .gz/.bz2 compressed inputs.
+ *
+ * @param[in] mps_file_path Path to a raw .mps or .mps.lz4 file.
+ * @return mps_data_model_t A fully formed LP/MIP problem which represents the given file.
+ */
+template <typename i_t, typename f_t>
+mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path);
+
 /**
  * @brief Reads an MPS problem from in-memory file contents.
  *
@@ -107,13 +127,19 @@ mps_data_model_t<i_t, f_t> read_lp(const std::string& lp_file_path);
 template <typename i_t, typename f_t>
 mps_data_model_t<i_t, f_t> read_lp_from_string(std::string_view lp_contents);
 
+template <typename i_t, typename f_t>
+inline mps_data_model_t<i_t, f_t> read(const std::string& path,
+                                       mps_reader_type_t mps_reader,
+                                       bool fixed_mps_format = false);
+
 /**
  * @brief Reads an optimization problem from a file, dispatching on the file
  *        extension. Extension matching is case-insensitive.
  *
  * Routing:
  *   - .mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2 → read_mps()
- *   - .lp,  .lp.gz,  .lp.bz2                            → read_lp()
+ *   - .mps.lz4 → experimental fast MPS reader only
+ *   - .lp,  .lp.gz,  .lp.bz2 → read_lp()
  *   - anything else → std::logic_error
  *
  * This is the entry point of choice for user-facing tools (CLI, C API) that
@@ -126,13 +152,37 @@ mps_data_model_t<i_t, f_t> read_lp_from_string(std::string_view lp_contents);
  */
 template <typename i_t, typename f_t>
 inline mps_data_model_t<i_t, f_t> read(const std::string& path, bool fixed_mps_format = false)
+{
+  return read<i_t, f_t>(path, mps_reader_type_t::default_reader, fixed_mps_format);
+}
+
+template <typename i_t, typename f_t>
+inline mps_data_model_t<i_t, f_t> read(const std::string& path,
+                                       mps_reader_type_t mps_reader,
+                                       bool fixed_mps_format)
 {
   std::string lower(path);
   std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
     return static_cast<char>(std::tolower(c));
   });
-  if (lower.ends_with(".mps") || lower.ends_with(".mps.gz") || lower.ends_with(".mps.bz2") ||
-      lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2")) {
+  const bool is_mps_lz4 = lower.ends_with(".mps.lz4");
+  if (lower.ends_with(".mps") || is_mps_lz4 || lower.ends_with(".mps.gz") ||
+      lower.ends_with(".mps.bz2") || lower.ends_with(".qps") || lower.ends_with(".qps.gz") ||
+      lower.ends_with(".qps.bz2")) {
+    if (mps_reader == mps_reader_type_t::fast_experimental) {
+      if (fixed_mps_format) {
+        throw std::logic_error(
+          "experimental fast MPS reader does not support fixed MPS format forcing");
+      }
+      if (!lower.ends_with(".mps") && !is_mps_lz4) {
+        throw std::logic_error(
+          "experimental fast MPS reader supports raw .mps and .mps.lz4 LP/MIP files only");
+      }
+      return read_mps_fast_experimental<i_t, f_t>(path);
+    }
+    if (is_mps_lz4) {
+      throw std::logic_error(".mps.lz4 inputs require the experimental fast MPS reader");
+    }
     return read_mps<i_t, f_t>(path, fixed_mps_format);
   }
   if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2")) {
@@ -140,7 +190,8 @@ inline mps_data_model_t<i_t, f_t> read(const std::string& path, bool fixed_mps_f
   }
   throw std::logic_error(
     "read: unrecognized input file extension. Supported (case-insensitive): "
-    ".mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2, .lp, .lp.gz, .lp.bz2. "
+    ".mps, .mps.lz4, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2, .lp, .lp.gz, "
+    ".lp.bz2. "
     "Given path: " +
     path);
 }
diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt
index 1ae6988466..6883cce82f 100644
--- a/cpp/src/CMakeLists.txt
+++ b/cpp/src/CMakeLists.txt
@@ -25,3 +25,4 @@ add_subdirectory(branch_and_bound)
 add_subdirectory(cuts)
 
 set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${UTIL_SRC_FILES} PARENT_SCOPE)
+set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE)
diff --git a/cpp/src/io/CMakeLists.txt b/cpp/src/io/CMakeLists.txt
index cc4affa890..4c99b1848b 100644
--- a/cpp/src/io/CMakeLists.txt
+++ b/cpp/src/io/CMakeLists.txt
@@ -3,6 +3,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 
+set(MPS_FAST_SRC_FILES
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_adapter.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/file_reader.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/lz4_file_reader.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/mps_section_scanner.cpp
+)
+
 set(PARSERS_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/data_model_view.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/file_to_string.cpp
@@ -13,6 +21,8 @@ set(PARSERS_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/parser.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/writer.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/utilities/cython_parser.cpp
+  ${MPS_FAST_SRC_FILES}
 )
 
 set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${PARSERS_SRC_FILES} PARENT_SCOPE)
+set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE)
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
new file mode 100644
index 0000000000..9da59e7b44
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -0,0 +1,590 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "simd_compat.hpp"
+
+#include <array>
+#include <cctype>
+#include <cmath>
+#include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <stdexcept>
+#include <string_view>
+#include <utility>
+
+#ifndef __likely
+#define __likely(x) __builtin_expect(!!(x), 1)
+#endif
+
+#ifndef __unlikely
+#define __unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+
+namespace mps_fast {
+
+// decimals[i][d] == d * 10^-(i+1): the contribution of digit d at the (i+1)-th fractional place.
+inline constexpr double decimals[16][10] = {
+  {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9},
+  {0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09},
+  {0.000, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009},
+  {0.0000, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009},
+  {0.00000, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009},
+  {0.000000,
+   0.000001,
+   0.000002,
+   0.000003,
+   0.000004,
+   0.000005,
+   0.000006,
+   0.000007,
+   0.000008,
+   0.000009},
+  {0.0000000,
+   0.0000001,
+   0.0000002,
+   0.0000003,
+   0.0000004,
+   0.0000005,
+   0.0000006,
+   0.0000007,
+   0.0000008,
+   0.0000009},
+  {0.00000000,
+   0.00000001,
+   0.00000002,
+   0.00000003,
+   0.00000004,
+   0.00000005,
+   0.00000006,
+   0.00000007,
+   0.00000008,
+   0.00000009},
+  {0.000000000,
+   0.000000001,
+   0.000000002,
+   0.000000003,
+   0.000000004,
+   0.000000005,
+   0.000000006,
+   0.000000007,
+   0.000000008,
+   0.000000009},
+  {0.0000000000,
+   0.0000000001,
+   0.0000000002,
+   0.0000000003,
+   0.0000000004,
+   0.0000000005,
+   0.0000000006,
+   0.0000000007,
+   0.0000000008,
+   0.0000000009},
+  {0.00000000000,
+   0.00000000001,
+   0.00000000002,
+   0.00000000003,
+   0.00000000004,
+   0.00000000005,
+   0.00000000006,
+   0.00000000007,
+   0.00000000008,
+   0.00000000009},
+  {0.000000000000,
+   0.000000000001,
+   0.000000000002,
+   0.000000000003,
+   0.000000000004,
+   0.000000000005,
+   0.000000000006,
+   0.000000000007,
+   0.000000000008,
+   0.000000000009},
+  {0.0000000000000,
+   0.0000000000001,
+   0.0000000000002,
+   0.0000000000003,
+   0.0000000000004,
+   0.0000000000005,
+   0.0000000000006,
+   0.0000000000007,
+   0.0000000000008,
+   0.0000000000009},
+  {0.00000000000000,
+   0.00000000000001,
+   0.00000000000002,
+   0.00000000000003,
+   0.00000000000004,
+   0.00000000000005,
+   0.00000000000006,
+   0.00000000000007,
+   0.00000000000008,
+   0.00000000000009},
+  {0.000000000000000,
+   0.000000000000001,
+   0.000000000000002,
+   0.000000000000003,
+   0.000000000000004,
+   0.000000000000005,
+   0.000000000000006,
+   0.000000000000007,
+   0.000000000000008,
+   0.000000000000009},
+  {0.0000000000000000,
+   0.0000000000000001,
+   0.0000000000000002,
+   0.0000000000000003,
+   0.0000000000000004,
+   0.0000000000000005,
+   0.0000000000000006,
+   0.0000000000000007,
+   0.0000000000000008,
+   0.0000000000000009}};
+
+inline constexpr int EXP10_TABLE_MAX = 308;
+
+// Compile-time 10^exp by repeated multiplication (exp > 0) or repeated division (exp < 0), one
+// step per unit of exponent; exp == 0 gives exactly 1.0.
+constexpr double constexpr_pow10(int exp)
+{
+  double power = 1.0;
+  for (int steps = exp; steps > 0; --steps) {
+    power *= 10.0;
+  }
+  for (int steps = exp; steps < 0; ++steps) {
+    power /= 10.0;
+  }
+  return power;
+}
+
+// Table of 10^e for e in [-EXP10_TABLE_MAX, EXP10_TABLE_MAX]; entry e + EXP10_TABLE_MAX is 10^e.
+constexpr auto make_exp10_table()
+{
+  std::array<double, EXP10_TABLE_MAX * 2 + 1> powers{};
+  int exp = -EXP10_TABLE_MAX;
+  for (double& slot : powers) { slot = constexpr_pow10(exp++); }
+  return powers;
+}
+
+inline constexpr auto table_exp10 = make_exp10_table();
+
+static inline bool is_digit_byte(char c) noexcept { return static_cast<unsigned>(c - '0') <= 9u; }
+
+static inline double fast_frac_atoi(const char*& data, const char* end)
+{
+  double val = 0.0;
+  // Fraction digits after the '.': the i-th digit adds decimals[i][digit]; stops at a non-digit.
+#define MPS_FAST_FRAC_DIGIT(i)                                   \
+  do {                                                           \
+    if (data >= end || !is_digit_byte(*data)) return val;        \
+    val += decimals[i][static_cast<unsigned char>(*data) & 0xF]; \
+    ++data;                                                      \
+  } while (0)
+
+  MPS_FAST_FRAC_DIGIT(0);
+  MPS_FAST_FRAC_DIGIT(1);
+  MPS_FAST_FRAC_DIGIT(2);
+  MPS_FAST_FRAC_DIGIT(3);
+  MPS_FAST_FRAC_DIGIT(4);
+  MPS_FAST_FRAC_DIGIT(5);
+  MPS_FAST_FRAC_DIGIT(6);
+  MPS_FAST_FRAC_DIGIT(7);
+  MPS_FAST_FRAC_DIGIT(8);
+  MPS_FAST_FRAC_DIGIT(9);
+  MPS_FAST_FRAC_DIGIT(10);
+  MPS_FAST_FRAC_DIGIT(11);
+  MPS_FAST_FRAC_DIGIT(12);
+  MPS_FAST_FRAC_DIGIT(13);
+  MPS_FAST_FRAC_DIGIT(14);
+  MPS_FAST_FRAC_DIGIT(15);
+
+#undef MPS_FAST_FRAC_DIGIT
+  // Only 16 digits contribute; any further digits are consumed and ignored.
+  while (data < end && is_digit_byte(*data)) {
+    ++data;
+  }
+  return val;
+}
+
+// Parses [sign]digits[.digits][(e|E|d|D)[sign]digits] from [data, end), advancing `data`.
+static inline double fast_atof_core(const char*& data, const char* end)
+{
+  double sign = 1.0;
+  if (data < end && *data == '-') {
+    sign = -1.0;
+    ++data;
+  } else if (data < end && *data == '+') {
+    ++data;
+  }
+  // Exact 64-bit accumulation while it cannot wrap; any further digits continue in double.
+  uint64_t int_part = 0;
+  while (data < end && is_digit_byte(*data) && int_part <= (UINT64_MAX - 9) / 10) {
+    int_part = int_part * 10 + static_cast<uint64_t>(*data - '0');
+    ++data;
+  }
+  double result = static_cast<double>(int_part);
+  while (data < end && is_digit_byte(*data)) {
+    result = result * 10.0 + (*data - '0');
+    ++data;
+  }
+  if (data < end && *data == '.') {
+    ++data;
+    result += fast_frac_atoi(data, end);
+  }
+  if (data < end && (*data == 'e' || *data == 'E' || *data == 'd' || *data == 'D')) {
+    ++data;
+    int exp_sign = 1;
+    if (data < end && *data == '-') {
+      exp_sign = -1;
+      ++data;
+    } else if (data < end && *data == '+') {
+      ++data;
+    }
+    // Saturate rather than overflow `int`: beyond this the scale factor is already inf or 0.
+    int exponent = 0;
+    while (data < end && is_digit_byte(*data)) {
+      if (exponent < 100000) { exponent = exponent * 10 + (*data - '0'); }
+      ++data;
+    }
+    exponent *= exp_sign;
+    if (exponent >= -EXP10_TABLE_MAX && exponent <= EXP10_TABLE_MAX) {
+      result *= table_exp10[static_cast<size_t>(exponent + EXP10_TABLE_MAX)];
+    } else if (result != 0.0) {  // a zero mantissa stays 0 instead of 0 * inf = NaN
+      result *= std::pow(10.0, exponent);
+    }
+  }
+  return sign * result;
+}
+
+static inline double fast_atof(const char* data, const char* end)
+{
+  return fast_atof_core(data, end);  // parses a copy of `data`; the caller's pointer is unchanged
+}
+
+static inline double fast_atof_advance(const char*& ptr, const char* end)
+{
+  return fast_atof_core(ptr, end);  // `ptr` is left just past the bytes the parser consumed
+}
+
+// Forward-only cursor over the bytes [start, end); the range is not NUL-terminated.
+struct cursor_t {
+  const char* start;
+  const char* ptr;
+  const char* end;
+
+  cursor_t(const char* data, std::size_t size) : start(data), ptr(data), end(data + size) {}
+
+  bool done() const { return ptr >= end; }
+
+  // 1-based (line, column) of `ptr`, recomputed by rescanning from `start`.
+  std::pair<std::size_t, std::size_t> position() const
+  {
+    std::size_t line       = 1;
+    const char* line_start = start;
+    for (const char* p = start; p < ptr; ++p) {
+      if (*p == '\n') {
+        ++line;
+        line_start = p + 1;
+      }
+    }
+    std::size_t column = static_cast<std::size_t>(ptr - line_start) + 1;
+    return {line, column};
+  }
+
+  // Throws std::runtime_error("<line>:<column>: <formatted msg>"). NOTE(review): the CLI is seen
+  // catching std::logic_error around io::read(); confirm this exception type is handled too.
+  [[noreturn]] void error(const char* msg, ...)
+  {
+    auto [line, col] = position();
+    va_list args;
+    va_start(args, msg);
+    char msg_buf[512];
+    std::vsnprintf(msg_buf, sizeof(msg_buf), msg, args);
+    va_end(args);
+    char buf[1024];
+    std::snprintf(buf, sizeof(buf), "%zu:%zu: %s", line, col, msg_buf);
+    throw std::runtime_error(buf);
+  }
+
+  // Moves forward n bytes; compares with the bytes left because `ptr + n` itself could overflow.
+  void advance(std::size_t n)
+  {
+    const std::size_t remaining = done() ? 0 : static_cast<std::size_t>(end - ptr);
+    if (n > remaining) { throw std::runtime_error("cursor advanced past end of file"); }
+    ptr += n;
+  }
+
+  // skip_ws_mode: first byte > 32 or '\n'; otherwise first byte <= 32; `end` if there is none.
+  template <bool skip_ws_mode>
+  static const char* scalar_scan(const char* p, const char* end)
+  {
+    while (p < end) {
+      unsigned char c = static_cast<unsigned char>(*p);
+      if constexpr (skip_ws_mode) {
+        if (c > 32 || c == '\n') return p;
+      } else {
+        if (c <= 32) return p;
+      }
+      p++;
+    }
+    return end;
+  }
+
+  // Lanes are all-ones where the byte is > 32 as an UNSIGNED value, matching scalar_scan. A plain
+  // cmpgt_epi8 is a signed compare and would treat bytes >= 0x80 (e.g. UTF-8) as whitespace.
+  static inline __attribute__((always_inline)) simde__m256i gt32_unsigned(simde__m256i bytes)
+  {
+    const simde__m256i v33 = simde_mm256_set1_epi8(33);
+    return simde_mm256_cmpeq_epi8(simde_mm256_max_epu8(bytes, v33), bytes);
+  }
+
+  template <bool skip_ws_mode>
+  static const char* simd_scan(const char* p, const char* end)
+  {
+    const simde__m256i vnl = simde_mm256_set1_epi8('\n');
+    while (end - p >= 32) {
+      simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)p);
+      simde__m256i gt32 = gt32_unsigned(data);
+
+      unsigned int mask;
+      if (skip_ws_mode) {
+        simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
+        mask = (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl));
+      } else {
+        mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32);
+      }
+
+      if (mask != 0) { return p + __builtin_ctz(mask); }
+      p += 32;
+    }
+    return scalar_scan<skip_ws_mode>(p, end);
+  }
+
+  void skip_ws() { ptr = simd_scan<true>(ptr, end); }
+
+  void skip_to_eol()
+  {
+    while (!done() && *ptr != '\n') {
+      ptr++;
+    }
+  }
+
+  void skip_comment_line()
+  {
+    skip_to_eol();
+    if (!done()) ptr++;
+  }
+
+  // Returns the field at `ptr`, then skips the bytes <= 32 after it, stopping at '\n'.
+  inline __attribute__((always_inline)) std::string_view read_field()
+  {
+    if (__unlikely(done())) { return {}; }
+
+    const char* field_start = ptr;
+    if (__unlikely(end - ptr < 32)) {
+      ptr                   = scalar_scan<false>(ptr, end);
+      const char* field_end = ptr;
+      if (ptr < end) { skip_ws(); }
+      return std::string_view(field_start, field_end - field_start);
+    }
+
+    const simde__m256i vnl = simde_mm256_set1_epi8('\n');
+    simde__m256i data      = simde_mm256_loadu_si256((const simde__m256i*)ptr);
+    simde__m256i gt32      = gt32_unsigned(data);
+    unsigned int ws_mask   = ~(unsigned int)simde_mm256_movemask_epi8(gt32);
+
+    if (__unlikely(ws_mask == 0)) {
+      ptr                   = simd_scan<false>(ptr + 32, end);
+      const char* field_end = ptr;
+      if (ptr < end) { skip_ws(); }
+      return std::string_view(field_start, field_end - field_start);
+    }
+
+    int field_end_off     = __builtin_ctz(ws_mask);
+    const char* field_end = ptr + field_end_off;
+
+    simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
+    unsigned int stop_mask =
+      (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl));
+    unsigned int after_field = stop_mask & ~((1u << field_end_off) - 1);
+
+    if (__likely(after_field != 0)) {
+      ptr = ptr + __builtin_ctz(after_field);
+    } else {
+      ptr = field_end;
+      if (ptr < end) { skip_ws(); }
+    }
+
+    return std::string_view(field_start, field_end - field_start);
+  }
+
+  inline __attribute__((always_inline)) std::string_view peek_field()
+  {
+    if (__unlikely(done())) { return {}; }
+    const char* field_end = simd_scan<false>(ptr, end);
+    return std::string_view(ptr, field_end - ptr);
+  }
+
+  // Two consecutive read_field() calls: the fallback when the one-load path does not apply.
+  std::pair<std::string_view, std::string_view> read_two_fields_slow()
+  {
+    auto f1 = read_field();
+    auto f2 = read_field();
+    return {f1, f2};
+  }
+
+  // Reads two fields from one 32-byte load when both fields and the separator after the second
+  // one fit inside that load and share a line; otherwise defers to read_two_fields_slow().
+  inline __attribute__((always_inline)) std::pair<std::string_view, std::string_view>
+  read_two_fields()
+  {
+    if (__unlikely(end - ptr < 32)) { return read_two_fields_slow(); }
+
+    const char* field1_start = ptr;
+    const simde__m256i vnl   = simde_mm256_set1_epi8('\n');
+
+    simde__m256i data  = simde_mm256_loadu_si256((const simde__m256i*)ptr);
+    simde__m256i gt32  = gt32_unsigned(data);
+    simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
+
+    unsigned int printable_mask = (unsigned int)simde_mm256_movemask_epi8(gt32);
+    unsigned int ws_mask        = ~printable_mask;
+    unsigned int nl_mask        = (unsigned int)simde_mm256_movemask_epi8(is_nl);
+    unsigned int stop_mask      = printable_mask | nl_mask;
+
+    if (__unlikely(ws_mask == 0)) { return read_two_fields_slow(); }
+    int field1_end_off = __builtin_ctz(ws_mask);
+
+    unsigned int after_field1 = stop_mask & ~((1u << field1_end_off) - 1);
+    if (__unlikely(after_field1 == 0)) { return read_two_fields_slow(); }
+    int field2_start_off = __builtin_ctz(after_field1);
+
+    // Only blanks and then '\n' follow field 1: let read_field() produce the empty second field.
+    if (__unlikely(ptr[field2_start_off] == '\n')) { return read_two_fields_slow(); }
+
+    unsigned int ws_after_field2_start = ws_mask & ~((1u << field2_start_off) - 1);
+    if (__unlikely(ws_after_field2_start == 0)) { return read_two_fields_slow(); }
+    int field2_end_off = __builtin_ctz(ws_after_field2_start);
+
+    unsigned int after_field2 = stop_mask & ~((1u << field2_end_off) - 1);
+    if (__likely(after_field2 != 0)) {
+      ptr = ptr + __builtin_ctz(after_field2);
+    } else {
+      ptr = ptr + field2_end_off;
+      skip_ws();
+    }
+
+    return {std::string_view(field1_start, field1_end_off),
+            std::string_view(field1_start + field2_start_off, field2_end_off - field2_start_off)};
+  }
+
+  bool eol() const { return ptr < end && *ptr == '\n'; }
+};
+
+static inline void expect(cursor_t& cursor, const char* field)
+{
+  if (auto id = cursor.read_field(); __unlikely(id != field))  // id is not NUL-terminated
+    cursor.error("expected '%s', got '%.*s'", field, (int)id.size(), id.empty() ? "" : id.data());
+}
+
+static inline void accept_comment_line(cursor_t& cursor)
+{
+  do {  // each pass eats newlines, then at most one '*' / '$' comment line
+    while (cursor.eol()) {  // eol() is false at end of input, so no separate done() test
+      cursor.advance(1);
+    }
+    if (cursor.done() || (cursor.ptr[0] != '*' && cursor.ptr[0] != '$')) { break; }
+    cursor.skip_comment_line();
+  } while (true);
+}
+
+// Requires the cursor to sit on '\n', then skips blank/comment lines up to the next data line.
+static inline void expect_eol(cursor_t& cursor)
+{
+  if (__unlikely(!cursor.eol())) {
+    // The buffer is not NUL-terminated: print at most 64 bytes of the current line via "%.*s".
+    std::string_view rest(cursor.ptr, cursor.done() ? 0 : cursor.end - cursor.ptr);
+    rest = rest.substr(0, rest.find('\n')).substr(0, 64);
+    cursor.error("expected end of line, got '%.*s'", static_cast<int>(rest.size()), rest.data());
+  }
+  for (;;) {
+    while (cursor.eol()) {
+      cursor.advance(1);
+    }
+    if (__unlikely(cursor.done())) { return; }
+    if (__unlikely(cursor.ptr[0] == '*' || cursor.ptr[0] == '$')) {
+      cursor.skip_comment_line();
+      continue;
+    }
+    if (__likely(cursor.ptr[0] == ' ' && cursor.ptr + 1 < cursor.end)) { cursor.ptr += 1; }
+    if (__unlikely(cursor.done())) { return; }
+    if (__unlikely(!std::isalpha(static_cast<unsigned char>(cursor.ptr[0])))) {
+      cursor.skip_ws();
+      if (cursor.eol()) { continue; }
+    }
+    break;
+  }
+}
+
+static inline std::string_view peek(cursor_t& cursor) { return cursor.peek_field(); }
+
+static inline bool accept(cursor_t& cursor, const char* field)
+{
+  // Look ahead first so that a non-matching field is left unconsumed; a matching one is then
+  // read off the cursor through expect().
+  const bool matches = (peek(cursor) == field);
+  if (matches) { expect(cursor, field); }
+  return matches;
+}
+
+static inline void expect_section(cursor_t& cursor, const char* section)
+{
+  expect(cursor, section);  // errors unless the next field is exactly `section`
+  expect_eol(cursor);       // errors unless the line ends here; moves on to the next data line
+}
+
+static inline double expect_number(cursor_t& cursor)
+{
+  auto num = cursor.read_field();
+  if (num.empty()) { cursor.error("expected number, got empty field"); }  // data() may be null
+  return fast_atof(num.data(), num.data() + num.size());
+}
+
+// Returns the next numeric field, with a shortcut for the literals "1" and "-1" that skips the
+// general parser; all other input is handled by expect_number().
+static inline double expect_number_fast_pm_one(cursor_t& cursor)
+{
+  const char* p = cursor.ptr;
+  const auto n  = cursor.end - p;  // bytes left: p[i] is only read when i < n
+  const int neg = (n > 0 && p[0] == '-') ? 1 : 0;
+  // The literal must be followed by a byte <= ' ' (compared as unsigned) or by the end of input.
+  if (n > neg && p[neg] == '1' && (n == neg + 1 || (unsigned char)p[neg + 1] <= ' ')) {
+    cursor.ptr = p + neg + 1;
+    cursor.skip_ws();
+    return neg ? -1.0 : 1.0;
+  }
+  return expect_number(cursor);
+}
+
+// Optional section header: returns false, consuming nothing, when `section` is not the next field.
+static inline bool accept_section(cursor_t& cursor, const char* section)
+{
+  // Consume the keyword only when it is next, then require the end of its line.
+  if (!accept(cursor, section)) { return false; }
+  expect_eol(cursor);
+  return true;
+}
+
+// Skips a '$' comment at the cursor up to (not including) the newline; returns whether it did.
+static inline bool accept_comment(cursor_t& cursor)
+{
+  const bool at_comment = !cursor.done() && cursor.ptr[0] == '$';
+  if (__likely(!at_comment)) { return false; }
+  cursor.skip_to_eol();
+  return true;
+}
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
new file mode 100644
index 0000000000..bce17a435f
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -0,0 +1,2770 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "fast_parser.hpp"
+#include "fast_parse_primitives.hpp"
+#include "file_reader.hpp"
+#include "hash_table_smallstr.hpp"
+#include "mmap_region.hpp"
+#include "mps_section_scanner.hpp"
+#include "nvtx_ranges.hpp"
+#ifdef MPS_FAST_PERF_COUNTERS
+#include "perf_counters.hpp"
+#endif
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <omp.h>
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <cctype>
+#include <cerrno>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <exception>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#ifndef MADV_COLLAPSE
+#define MADV_COLLAPSE 25
+#endif
+
+namespace mps_fast {
+
+static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS       = 4096;
+static constexpr int MPS_ROWS_THREAD_CAP                  = 16;
+static constexpr int MPS_COLUMNS_THREAD_CAP               = 32;
+static constexpr int MPS_BOUNDS_THREAD_CAP                = 32;
+static constexpr int MPS_NAMES_THREAD_CAP                 = 16;
+static constexpr size_t MPS_BOUNDS_PARALLEL_INIT_MIN_VARS = 16 * 1024 * 1024;
+static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES     = 256ull * 1024ull * 1024ull;
+static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES       = 1 * 1024 * 1024;
+
+static int phase_thread_count(int phase_cap)
+{  // Threads for one phase: the smaller of phase_cap and omp_get_max_threads(), never below one.
+  return std::max(std::min(omp_get_max_threads(), phase_cap), 1);
+}
+
+// =============================================================================
+// RAII Timer for profiling with deferred output
+// =============================================================================
+
+struct TimerEntry {
+  const char* name;   // timer label; only the pointer is stored, the string is not copied
+  double elapsed_ms;  // measured duration in milliseconds
+};
+
+static std::vector<TimerEntry>& get_timer_buffer()
+{  // Returns the single function-local static buffer that collects finished timer records.
+  static std::vector<TimerEntry> buffer;
+  buffer.reserve(100);  // executed on every call; a no-op once capacity is already >= 100
+  return buffer;
+}
+
+static std::mutex& get_timer_mutex()
+{  // Mutex that flush_timers() and ~scoped_timer_t() lock before touching the timer buffer.
+  static std::mutex mutex;
+  return mutex;
+}
+
+static void flush_timers()
+{  // Prints every buffered timing to stderr and empties the buffer, all under the timer mutex.
+  std::lock_guard<std::mutex> guard(get_timer_mutex());
+  std::vector<TimerEntry>& pending = get_timer_buffer();
+  for (size_t i = 0; i < pending.size(); ++i) {
+    std::fprintf(stderr, "[TIMER] %s: %.3f ms\n", pending[i].name, pending[i].elapsed_ms);
+  }
+  pending.clear();
+}
+
+static size_t system_page_size()
+{  // Page size from sysconf(_SC_PAGESIZE), queried once; 4096 is used when the query fails.
+  static const size_t cached = [] {
+    const long reported = sysconf(_SC_PAGESIZE);
+    return reported <= 0 ? static_cast<size_t>(4096) : static_cast<size_t>(reported);
+  }();
+  return cached;
+}
+
+enum class materialize_touch_t {
+  write_2mb,  // touch one byte every 2 MiB
+  write_4kb,  // touch one byte every system page (system_page_size(), not a hard-coded 4 KiB)
+};
+
+// Instantiate a range using mmap anonymous pages with hugepage hints, and materialize the
+// pages by touching them at a fixed stride to nudge the kernel into invoking its THP mechanism.
+static void materialize_hugepages(const char* label,
+                                  void* data,
+                                  size_t bytes,
+                                  materialize_touch_t touch)
+{
+  (void)label;  // label is currently unused
+  if (data == nullptr || bytes == 0) return;
+
+  constexpr size_t two_mb = 2 * 1024 * 1024;
+  size_t page_size        = system_page_size();
+  uintptr_t start         = reinterpret_cast<uintptr_t>(data);
+  uintptr_t end           = start + bytes;
+  uintptr_t aligned_start = start & ~(uintptr_t)(page_size - 1);  // round down to a page boundary
+  uintptr_t aligned_end   = (end + page_size - 1) & ~(uintptr_t)(page_size - 1);  // round up to a page boundary
+  size_t aligned_bytes    = (size_t)(aligned_end - aligned_start);
+
+  errno = 0;
+  madvise(reinterpret_cast<void*>(aligned_start), aligned_bytes, MADV_HUGEPAGE);  // best effort: the result is not checked
+
+  size_t step        = touch == materialize_touch_t::write_2mb ? two_mb : page_size;
+  volatile char* ptr = reinterpret_cast<volatile char*>(data);  // volatile so the self-assignments below are real accesses
+  for (size_t offset = 0; offset < bytes; offset += step) {
+    ptr[offset] = ptr[offset];  // write the byte back unchanged: touches the page, keeps the data
+  }
+  ptr[bytes - 1] = ptr[bytes - 1];  // also touch the final byte of the range
+}
+
+template <typename T>
+static void materialize_vector_hugepages(const char* label,
+                                         std::vector<T>& values,
+                                         materialize_touch_t touch)
+{  // Convenience wrapper: applies materialize_hugepages() to the vector's element storage.
+  materialize_hugepages(label, values.data(), values.size() * sizeof(T), touch);
+}
+
+class scoped_timer_t {  // RAII timer: measures the time between construction and destruction
+ public:
+  scoped_timer_t(const char* name, double* accumulator = nullptr)
+    : name_(name),
+      accumulator_(accumulator),
+      nvtx_(name, nvtx::color_for_name(name)),
+      start_(std::chrono::steady_clock::now())
+  {
+  }
+  // Ends the NVTX range, adds to *accumulator (if given) and records {name, elapsed_ms}.
+  ~scoped_timer_t()
+  {
+    auto end          = std::chrono::steady_clock::now();
+    double elapsed_ms = std::chrono::duration<double, std::milli>(end - start_).count();
+    nvtx_.end();
+    if (accumulator_) { *accumulator_ += elapsed_ms; }
+    std::lock_guard<std::mutex> lock(get_timer_mutex());
+    get_timer_buffer().push_back({name_, elapsed_ms});
+  }
+
+  scoped_timer_t(const scoped_timer_t&)            = delete;
+  scoped_timer_t& operator=(const scoped_timer_t&) = delete;
+
+ private:
+  const char* name_;     // stored by pointer; must outlive the timer and the buffered record
+  double* accumulator_;  // optional running total in milliseconds, may be nullptr
+  nvtx::scoped_range nvtx_;
+  std::chrono::steady_clock::time_point start_;  // steady_clock is monotonic: wall-clock changes cannot skew an interval
+};
+
+static inline bool section_token_matches(const char* p,
+                                         const char* end,
+                                         const char* token,
+                                         size_t len)
+{  // True when [p, end) starts with `token` and the token is followed by `end` or by a byte <= ' '.
+  const size_t avail = (size_t)(end - p);
+  return avail >= len && std::memcmp(p, token, len) == 0 && (avail == len || p[len] <= ' ');
+}
+
+static inline bool is_quadratic_section_start(const char* p, const char* end)
+{  // True when [p, end) begins with one of the keywords QUADOBJ, QMATRIX or QCMATRIX.
+  return section_token_matches(p, end, "QUADOBJ", 7) ||
+         section_token_matches(p, end, "QMATRIX", 7) ||
+         section_token_matches(p, end, "QCMATRIX", 8);
+}
+
+static inline bool is_rhs_section_end(const char* p, const char* end)
+{  // True when [p, end) starts with BOUNDS, RANGES, ENDATA or a quadratic keyword; never reads at or past `end`.
+  switch (p < end ? p[0] : '\0') {  // dispatch on the first byte so most lines cost one compare
+    case 'B': return section_token_matches(p, end, "BOUNDS", 6);
+    case 'Q': return is_quadratic_section_start(p, end);
+    case 'R': return section_token_matches(p, end, "RANGES", 6);
+    case 'E': return section_token_matches(p, end, "ENDATA", 6);
+    default: return false;
+  }
+}
+
+static inline void error_unknown_row(cursor_t& cursor, const char* row_start, const char* section)
+{
+  // Reports the token at row_start (up to the first byte <= ' ' or cursor.end) as an unknown row.
+  const char* stop = row_start;
+  for (; stop < cursor.end && *stop > ' '; ++stop) {}
+  const int name_len = (int)(stop - row_start);
+  cursor.error("unknown row name in %s: %.*s", section, name_len, row_start);
+}
+
+// =============================================================================
+// Parsing state shared across section parsers
+// =============================================================================
+
+// Hash functor for std::string_view keys in unordered_map (forwards to std::hash).
+struct string_view_hash {
+  size_t operator()(std::string_view sv) const { return std::hash<std::string_view>{}(sv); }
+};
+
+static inline size_t next_power_of_2(size_t n)
+{
+  // Smallest power of two >= n (1 for n == 0). Smears the top set bit of n - 1 into all lower
+  // bits, then adds one. Shift amounts stay below the width of size_t, so the code is also
+  // valid where size_t is 32 bits. NOTE: n above the largest power of two wraps to 0.
+  if (n == 0) return 1;
+  n--;
+  for (int shift = 1; shift < std::numeric_limits<size_t>::digits; shift <<= 1) {
+    n |= n >> shift;
+  }
+  return n + 1;
+}
+
+enum class row_index_mode_t {
+  hash,           // row names are resolved through the hash table (row_lookup_hash)
+  dense_ordered,  // row index is computed from the numeric suffix of the name
+};
+
+static inline bool is_decimal_digit(char c) { return (unsigned)(c - '0') <= 9; }  // single unsigned compare: bytes below '0' wrap to large values
+
+static inline size_t decimal_digits_u64(uint64_t value)
+{
+  // Number of base-10 digits needed to write `value`; zero counts as one digit.
+  size_t count = 1;
+  for (uint64_t rest = value / 10; rest != 0; rest /= 10) {
+    ++count;
+  }
+  return count;
+}
+
+static inline bool parse_trailing_u64(std::string_view name,
+                                      std::string_view& prefix,
+                                      uint64_t& value,
+                                      size_t& suffix_width)
+{  // Splits `name` into prefix + trailing decimal run; false (outputs untouched) if no digits or on uint64 overflow.
+  size_t digits_begin = name.size();
+  while (digits_begin > 0 && is_decimal_digit(name[digits_begin - 1])) {
+    --digits_begin;
+  }
+  if (digits_begin == name.size()) { return false; }
+
+  constexpr uint64_t limit = std::numeric_limits<uint64_t>::max();
+  uint64_t accum           = 0;
+  for (char ch : name.substr(digits_begin)) {
+    const uint64_t digit = (uint64_t)(ch - '0');
+    if (accum > (limit - digit) / 10) { return false; }
+    accum = accum * 10 + digit;
+  }
+  prefix       = name.substr(0, digits_begin);
+  value        = accum;
+  suffix_width = name.size() - digits_begin;
+  return true;
+}
+
+static inline bool dense_suffix_is_zero_padded(std::string_view name, size_t suffix_width)
+{  // True when the numeric suffix has more than one digit and its first digit is '0'.
+  return suffix_width > 1 && name[name.size() - suffix_width] == '0';  // assumes suffix_width <= name.size()
+}
+
+static inline bool dense_suffix_width_ok(uint64_t value,
+                                         size_t suffix_width,
+                                         bool zero_padded,
+                                         size_t pad_width)
+{  // Checks that suffix_width is the width `value` would be written with under the given padding rule.
+  const size_t natural = decimal_digits_u64(value);
+  if (!zero_padded) { return suffix_width == natural; }
+  return suffix_width == std::max(pad_width, natural);
+}
+
+template <typename i_t, typename f_t>
+struct parse_state_t {
+  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& problem;
+  cursor_t& cursor;
+
+  // Temporary string_view storage (points into input buffer, no allocation)
+  std::vector<std::string_view> row_names_sv;
+  std::vector<std::string_view> var_names_sv;
+  std::string_view problem_name_sv;
+  std::string_view objective_name_sv;
+  std::vector<std::string_view> ignored_objective_names_sv;
+
+  // Optional dense ordered column index for labels like V0, V1, ...
+  bool col_dense_ordered = false;
+  std::string_view col_dense_prefix;
+  uint64_t col_dense_min_id  = 0;
+  uint64_t col_dense_max_id  = 0;
+  size_t col_dense_pad_width = 0;
+  bool col_dense_zero_padded = false;
+
+  // Row name hash table - sized at runtime based on row count
+  size_t row_hash_buckets = 0;
+  size_t row_hash_mask    = 0;  // buckets - 1, for fast modulo via &
+  mmap_region_t row_hash_region;
+  hash_slot_var_t* row_names_ht = nullptr;
+  // Overflow map for row names longer than HASH_KEY_BYTES
+  std::unordered_map<std::string_view, size_t, string_view_hash> row_names_long;
+
+  // Optional dense ordered row index for labels like R0001, R0002, ...
+  row_index_mode_t row_index_mode = row_index_mode_t::hash;
+  bool row_dense_candidate        = true;
+  std::string_view row_dense_prefix;
+  uint64_t row_dense_min_id  = 0;
+  uint64_t row_dense_max_id  = 0;
+  uint64_t row_dense_base_id = 0;
+  size_t row_dense_pad_width = 0;
+  bool row_dense_zero_padded = false;
+
+  // var_names still uses STL (only used in parse_bounds, not as hot)
+  std::unordered_map<std::string_view, size_t, string_view_hash> var_names_map;
+
+  parse_state_t(cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& p, cursor_t& c)
+    : problem(p), cursor(c)  // both members are references: nothing is copied
+  {
+  }
+
+  void init_row_hash_table()
+  {  // Prefer the dense ordered index; the hash table is only built when that is not usable.
+    const bool dense_ready = init_row_dense_ordered_table();
+    if (!dense_ready) { init_row_hash_table_impl(); }
+  }
+
+  bool row_dense_has_expected_width(uint64_t value, size_t suffix_width) const
+  {  // dense_suffix_width_ok() evaluated with the row padding settings.
+    return dense_suffix_width_ok(value, suffix_width, row_dense_zero_padded, row_dense_pad_width);
+  }
+
+  bool col_dense_has_expected_width(uint64_t value, size_t suffix_width) const
+  {  // dense_suffix_width_ok() evaluated with the column padding settings.
+    return dense_suffix_width_ok(value, suffix_width, col_dense_zero_padded, col_dense_pad_width);
+  }
+
+  bool is_ignored_objective_name(std::string_view name) const
+  {  // Linear search of ignored_objective_names_sv for an exact match.
+    const auto& ignored = ignored_objective_names_sv;
+    return std::find(ignored.begin(), ignored.end(), name) != ignored.end();
+  }
+
+  void add_ignored_objective_name(std::string_view name)
+  {  // Records an extra objective row name once; the active objective name is never recorded.
+    const bool already_known = (name == objective_name_sv) || is_ignored_objective_name(name);
+    if (!already_known) { ignored_objective_names_sv.push_back(name); }
+  }
+
+  void observe_objective_row_name(std::string_view name)
+  {  // While no objective name is set the observed row becomes the objective; later ones are ignored rows.
+    if (!objective_name_sv.empty()) {
+      add_ignored_objective_name(name);
+      return;
+    }
+    objective_name_sv = name;
+  }
+
+  void observe_row_name_for_dense_index(std::string_view name, size_t row_index)
+  {  // Incrementally checks whether row names follow <prefix><base + row_index>; clears row_dense_candidate otherwise.
+    if (!row_dense_candidate) { return; }
+
+    std::string_view prefix;
+    uint64_t value      = 0;
+    size_t suffix_width = 0;
+    if (!parse_trailing_u64(name, prefix, value, suffix_width)) {
+      row_dense_candidate = false;
+      return;
+    }
+
+    if (row_index == 0) {  // the first row fixes the prefix, base id and padding convention
+      row_dense_prefix      = prefix;
+      row_dense_min_id      = value;
+      row_dense_max_id      = value;
+      row_dense_base_id     = value;
+      row_dense_pad_width   = suffix_width;
+      row_dense_zero_padded = dense_suffix_is_zero_padded(name, suffix_width);
+      return;
+    }
+
+    if (prefix != row_dense_prefix) {
+      row_dense_candidate = false;
+      return;
+    }
+
+    if (row_dense_base_id > std::numeric_limits<uint64_t>::max() - row_index) {  // base + row_index would overflow
+      row_dense_candidate = false;
+      return;
+    }
+
+    uint64_t expected = row_dense_base_id + row_index;
+    if (value != expected || !row_dense_has_expected_width(value, suffix_width)) {
+      row_dense_candidate = false;
+      return;
+    }
+
+    row_dense_min_id = std::min(row_dense_min_id, value);
+    row_dense_max_id = std::max(row_dense_max_id, value);
+  }
+
+  bool init_row_dense_ordered_table()
+  {  // Switches to dense_ordered mode when the observed ids form exactly n_rows consecutive values; returns whether it did.
+    scoped_timer_t timer("row_dense_finalize");
+    size_t n_rows = row_names_sv.size();
+    if (!row_dense_candidate || n_rows == 0) { return false; }
+    if (row_dense_max_id < row_dense_min_id) { return false; }
+    uint64_t dense_count = row_dense_max_id - row_dense_min_id + 1;
+    if (dense_count != n_rows) { return false; }
+
+    row_index_mode = row_index_mode_t::dense_ordered;
+    return true;
+  }
+
+  void init_row_hash_table_impl()
+  {  // Allocates the row-name hash table (power-of-two bucket count) and inserts every name in row_names_sv.
+    scoped_timer_t timer("row_hash_init_total");
+    size_t n_rows = row_names_sv.size();
+    // Target load factor of at most 50%: at least 2 * n_rows buckets, and never fewer than 64.
+    row_hash_buckets          = next_power_of_2(std::max((size_t)(n_rows * 2), (size_t)64));
+    row_hash_mask             = row_hash_buckets - 1;
+    size_t row_hash_mmap_size = row_hash_buckets * sizeof(hash_slot_var_t);
+
+    {
+      scoped_timer_t timer("row_hash_mmap");
+      // Use mmap for allocation - the OS provides zeroed pages
+      row_hash_region = mmap_region_t::anonymous(
+        row_hash_mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, "row hash table");
+      row_names_ht = static_cast<hash_slot_var_t*>(row_hash_region.data());
+      // Request huge pages to reduce TLB misses
+      row_hash_region.advise(MADV_HUGEPAGE);
+    }
+
+    // Optionally pre-touch the table at 2 MiB strides to nudge the kernel into allocating huge pages
+#ifdef MPS_FAST_THP_PREFAULT
+    {
+      scoped_timer_t timer("row_hash_thp_prefault");
+      materialize_hugepages(
+        "row_names_ht", row_names_ht, row_hash_region.size(), materialize_touch_t::write_2mb);
+    }
+#endif
+
+    {
+      scoped_timer_t timer("row_hash_insert_all");
+      for (size_t idx = 0; idx < n_rows; ++idx) {
+        row_insert(row_names_sv[idx], idx);
+      }
+    }
+
+    // Optionally ask the kernel to collapse the populated range into THP pages (MADV_COLLAPSE)
+#ifdef MPS_FAST_MADV_COLLAPSE
+    {
+      scoped_timer_t timer("row_hash_madv_collapse");
+      row_hash_region.advise(MADV_COLLAPSE);
+    }
+#endif
+  }
+
+  size_t row_lookup_dense_ordered(std::string_view name) const
+  {  // Maps a name of the form <row_dense_prefix><id> to id - row_dense_min_id; SIZE_MAX when it does not fit the pattern.
+    std::string_view prefix;
+    uint64_t value      = 0;
+    size_t suffix_width = 0;
+    if (!parse_trailing_u64(name, prefix, value, suffix_width)) { return SIZE_MAX; }
+    if (prefix != row_dense_prefix || !row_dense_has_expected_width(value, suffix_width)) {
+      return SIZE_MAX;
+    }
+    if (value < row_dense_min_id || value > row_dense_max_id) { return SIZE_MAX; }
+    return (size_t)(value - row_dense_min_id);
+  }
+
+  size_t row_lookup(std::string_view name) const
+  {  // Row index for `name` using whichever index mode is active; SIZE_MAX when not found.
+    if (__likely(row_index_mode == row_index_mode_t::dense_ordered)) {
+      return row_lookup_dense_ordered(name);
+    }
+    return row_lookup_hash(name);
+  }
+
+  size_t read_row_lookup_dense_ordered(cursor_t& cursor) const
+  {  // Parses a dense row name directly at the cursor; on any mismatch the field is consumed via read_field() and SIZE_MAX is returned.
+    const char* start = cursor.ptr;
+    const char* p     = start;
+
+    size_t prefix_len = row_dense_prefix.size();
+    if (prefix_len > 0) {
+      if ((size_t)(cursor.end - p) < prefix_len ||
+          std::memcmp(p, row_dense_prefix.data(), prefix_len) != 0) {
+        cursor.read_field();  // cursor.ptr has not been moved yet, so this consumes the whole field
+        return SIZE_MAX;
+      }
+      p += prefix_len;
+    }
+
+    const char* digits_start = p;
+    uint64_t value           = 0;
+    while (p < cursor.end && is_decimal_digit(*p)) {
+      uint64_t digit = (uint64_t)(*p - '0');
+      if (value > (std::numeric_limits<uint64_t>::max() - digit) / 10) {  // next digit would overflow uint64_t
+        cursor.ptr = start;
+        cursor.read_field();
+        return SIZE_MAX;
+      }
+      value = value * 10 + digit;
+      p++;
+    }
+
+    size_t suffix_width = (size_t)(p - digits_start);
+    if (suffix_width == 0 || p >= cursor.end || *p > ' ' ||  // note: a name ending exactly at cursor.end is rejected
+        !row_dense_has_expected_width(value, suffix_width) || value < row_dense_min_id ||
+        value > row_dense_max_id) {
+      cursor.ptr = start;
+      cursor.read_field();
+      return SIZE_MAX;
+    }
+
+    cursor.ptr = p;
+    cursor.skip_ws();
+    return (size_t)(value - row_dense_min_id);
+  }
+
+  size_t read_row_lookup(cursor_t& cursor) const
+  {  // Consumes one row-name field from the cursor and returns its row index, or SIZE_MAX when unknown.
+    if (__likely(row_index_mode == row_index_mode_t::dense_ordered)) {
+      return read_row_lookup_dense_ordered(cursor);
+    }
+
+    auto row_name = cursor.read_field();
+    return row_lookup_hash(row_name);
+  }
+
+  size_t row_lookup_hash(std::string_view name) const
+  {  // Hash-mode lookup: names longer than HASH_KEY_BYTES use row_names_long, all others probe row_names_ht.
+    if (__unlikely(name.size() > HASH_KEY_BYTES)) {
+      auto it = row_names_long.find(name);
+      return it != row_names_long.end() ? it->second : SIZE_MAX;
+    }
+    hash_key_t key               = make_key(name.data(), name.size());
+    uint32_t hash                = fnv1a_hash(name.data(), name.size()) & (uint32_t)row_hash_mask;
+    const hash_slot_var_t* slots = row_names_ht;
+    const hash_slot_var_t* slot  = &slots[hash];
+
+    for (size_t i = 0; i < row_hash_buckets; ++i, ++slot) {  // linear probing, at most one full pass
+      if (slot >= &slots[row_hash_buckets]) { slot = &slots[0]; }  // wrap around to the first bucket
+      if (slot->count == 0) { return SIZE_MAX; }  // count == 0 marks an empty slot: the name is absent
+      if (key_cmpeq(slot->key, key)) { return slot->count - 1; }  // row_insert stores index + 1
+    }
+    return SIZE_MAX;
+  }
+
+  size_t col_lookup_dense_ordered(std::string_view name) const
+  {  // Column counterpart of row_lookup_dense_ordered(): id - col_dense_min_id, or SIZE_MAX on any mismatch.
+    std::string_view prefix;
+    uint64_t value      = 0;
+    size_t suffix_width = 0;
+    if (!parse_trailing_u64(name, prefix, value, suffix_width)) { return SIZE_MAX; }
+    if (prefix != col_dense_prefix || !col_dense_has_expected_width(value, suffix_width)) {
+      return SIZE_MAX;
+    }
+    if (value < col_dense_min_id || value > col_dense_max_id) { return SIZE_MAX; }
+    return (size_t)(value - col_dense_min_id);
+  }
+
+  void dense_col_name(size_t idx, std::string& out) const
+  {  // Rebuilds the name of dense column idx into `out`: prefix + decimal id, left-padded with '0' when zero padding is on.
+    uint64_t value = col_dense_min_id + idx;
+    char digits_buf[32];
+    auto [digits_end, ec] = std::to_chars(digits_buf, digits_buf + sizeof(digits_buf), value);
+    if (ec != std::errc()) {  // conversion failed: fall back to the bare prefix
+      out.assign(col_dense_prefix);
+      return;
+    }
+    size_t digits_len = (size_t)(digits_end - digits_buf);
+    size_t width = col_dense_zero_padded ? std::max(col_dense_pad_width, digits_len) : digits_len;
+    out.resize(col_dense_prefix.size() + width);
+    std::memcpy(out.data(), col_dense_prefix.data(), col_dense_prefix.size());
+    char* suffix = out.data() + col_dense_prefix.size();
+    if (width > digits_len) {
+      std::memset(suffix, '0', width - digits_len);  // leading zeros
+      suffix += width - digits_len;
+    }
+    std::memcpy(suffix, digits_buf, digits_len);
+  }
+
+  void row_insert(std::string_view name, size_t index)
+  {  // Inserts name -> index; a name that is already present is overwritten with the new index.
+    if (__unlikely(name.size() > HASH_KEY_BYTES)) {
+      row_names_long[name] = index;
+      return;
+    }
+    hash_key_t key         = make_key(name.data(), name.size());
+    uint32_t hash          = fnv1a_hash(name.data(), name.size()) & (uint32_t)row_hash_mask;
+    hash_slot_var_t* slots = row_names_ht;
+    hash_slot_var_t* slot  = &slots[hash];
+
+    for (size_t i = 0; i < row_hash_buckets; ++i, ++slot) {  // linear probing, at most one full pass
+      if (slot >= &slots[row_hash_buckets]) { slot = &slots[0]; }  // wrap around to the first bucket
+      if (slot->count == 0) {
+        key_store(slot->key, key);            // Writes 32 bytes, including garbage in last 4
+        slot->count = (uint32_t)(index + 1);  // Overwrite last 4 bytes with actual count
+        return;
+      }
+      if (key_cmpeq(slot->key, key)) {
+        slot->count = (uint32_t)(index + 1);  // index + 1 is narrowed to 32 bits here
+        return;
+      }
+    }
+    __builtin_trap();  // reached only if every bucket was probed without finding a free or matching slot
+  }
+};
+
+// =============================================================================
+// Section parsers
+// =============================================================================
+
+template <typename i_t, typename f_t>
+static void parse_name_section(parse_state_t<i_t, f_t>& state)
+{  // Parses the NAME line; the optional problem name is kept as a view in state.problem_name_sv.
+  scoped_timer_t timer("parse_name");
+  if (peek(state.cursor) == "ROWS") { return; }  // no NAME line: leave ROWS for the next parser
+  expect(state.cursor, "NAME");
+  if (!state.cursor.eol()) {
+    state.problem_name_sv = state.cursor.read_field();
+    accept_comment(state.cursor);
+  }
+  expect_eol(state.cursor);
+}
+
+template <typename i_t, typename f_t>
+static void parse_objsense_section(parse_state_t<i_t, f_t>& state)
+{  // Parses the optional OBJSENSE section (MIN or MAX) into state.problem.maximize_.
+  scoped_timer_t timer("parse_objsense");
+  if (accept(state.cursor, "OBJSENSE")) {
+    if (state.cursor.eol()) { expect_eol(state.cursor); }  // the value may sit on the following line
+    const bool is_max = accept(state.cursor, "MAX");
+    if (is_max || accept(state.cursor, "MIN")) {
+      state.problem.maximize_ = is_max;
+    } else {
+      const auto got = state.cursor.read_field();  // not NUL-terminated: print with an explicit length
+      state.cursor.error("expected MIN or MAX, got '%.*s'", (int)got.size(), got.data());
+    }
+    accept_comment(state.cursor);
+    expect_eol(state.cursor);
+  }
+}
+
+template <typename i_t, typename f_t>
+static void parse_objname_section(parse_state_t<i_t, f_t>& state)
+{  // Parses the optional OBJNAME section; the name is kept as a view in state.objective_name_sv.
+  scoped_timer_t timer("parse_objname");
+  if (accept(state.cursor, "OBJNAME")) {
+    if (state.cursor.eol()) { expect_eol(state.cursor); }  // the name may sit on the following line
+    state.objective_name_sv = state.cursor.read_field();
+    accept_comment(state.cursor);
+    expect_eol(state.cursor);
+  }
+}
+
+struct RowChunkBoundary {
+  const char* start;  // first byte of the chunk
+  const char* end;    // one past the last byte of the chunk
+};
+
+struct RowChunkInfo {
+  size_t constraints = 0;      // number of non-'N' rows counted in the chunk
+  bool malformed     = false;  // set when a line could not be parsed
+  std::vector<std::string_view> objective_names;  // names of the 'N' rows, in file order
+  bool has_first_constraint = false;
+  std::string_view first_constraint_name;  // valid only when has_first_constraint is true
+};
+
+static const char* rows_find_next_line(const char* p, const char* end)
+{  // Position just after the next '\n' at or after p; when no newline remains, where the scan stopped.
+  for (; p < end; ++p) {
+    if (*p == '\n') { return p + 1; }
+  }
+  return p;
+}
+
+static bool parse_rows_line_fast(const char*& p,
+                                 const char* end,
+                                 char& row_type,
+                                 std::string_view& row_name)
+{
+  // Parses one ROWS record at p. true: row_type/row_name are set and p is at the next line.
+  // false with p advanced: a skipped line (blank, '*'/'$' comment, trailing whitespace).
+  // false with p restored to its entry value: a record with a type but no name (malformed).
+  const char* const entry = p;
+  while (p < end && *p <= ' ' && *p != '\n')
+    p++;
+  if (p >= end) { return false; }
+  if (*p == '\n' || *p == '*' || *p == '$') {
+    p = rows_find_next_line(p, end);  // for a bare '\n' this simply steps over it
+    return false;
+  }
+  row_type = *p++;
+  while (p < end && *p <= ' ' && *p != '\n')
+    p++;
+  const char* name_start = p;
+  while (p < end && *p > ' ')
+    p++;
+  if (name_start == p) {
+    p = entry;  // lets callers detect the bad record through their `p == before` check
+    return false;
+  }
+  row_name = std::string_view(name_start, (size_t)(p - name_start));
+  // ROWS only uses fields 1-2. Fields 3-6 are ignored by the MPS spec, and
+  // field 3 may start with '$' to comment the rest of the record.
+  p = rows_find_next_line(p, end);
+  return true;
+}
+
+static std::vector<RowChunkBoundary> compute_row_chunk_boundaries(const char* rows_start,
+                                                                  const char* rows_end,
+                                                                  int num_threads)
+{  // Splits [rows_start, rows_end) into num_threads consecutive chunks whose interior boundaries follow a '\n'.
+  scoped_timer_t timer("rows_compute_chunk_boundaries");
+
+  std::vector<RowChunkBoundary> boundaries((size_t)num_threads);  // assumes num_threads >= 1
+  size_t total_size = (size_t)(rows_end - rows_start);
+  size_t chunk_size = total_size / (size_t)num_threads;
+
+  boundaries[0].start = rows_start;
+  for (int t = 0; t < num_threads; ++t) {
+    if (t == num_threads - 1) {
+      boundaries[(size_t)t].end = rows_end;
+    } else {
+      const char* boundary            = rows_start + (size_t)(t + 1) * chunk_size;
+      boundary                        = rows_find_next_line(boundary, rows_end);  // move the even split forward to a line start
+      boundaries[(size_t)t].end       = boundary;
+      boundaries[(size_t)t + 1].start = boundary;
+    }
+  }
+
+  return boundaries;
+}
+
+template <typename i_t, typename f_t>
+static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
+                                             const char* rows_start,
+                                             const char* rows_end,
+                                             int num_threads)
+{  // Two-pass parallel ROWS parse (count per chunk, then fill); returns false, before touching state, if a chunk is malformed.
+  scoped_timer_t timer("parse_rows_parallel");
+
+  auto boundaries = compute_row_chunk_boundaries(rows_start, rows_end, num_threads);
+  std::vector<RowChunkInfo> infos((size_t)num_threads);
+
+  {
+    scoped_timer_t timer("rows_count_parallel");
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < num_threads; ++t) {
+      MPS_NVTX_RANGE(std::string("rows_count_chunk ") + std::to_string(t), nvtx::colors::rows);
+      const char* p   = boundaries[(size_t)t].start;
+      const char* end = boundaries[(size_t)t].end;
+      RowChunkInfo info;
+
+      while (p < end) {
+        char row_type = 0;
+        std::string_view row_name;
+        const char* before = p;
+        if (!parse_rows_line_fast(p, end, row_type, row_name)) {
+          if (p == before) {  // no progress was made: treat the chunk as malformed
+            info.malformed = true;
+            break;
+          }
+          continue;
+        }
+
+        if (row_type == 'N') {
+          info.objective_names.push_back(row_name);
+        } else {
+          if (!info.has_first_constraint) {
+            info.first_constraint_name = row_name;
+            info.has_first_constraint  = true;
+          }
+          info.constraints++;
+        }
+      }
+
+      infos[(size_t)t] = info;
+    }
+  }
+
+  for (const auto& info : infos) {
+    if (info.malformed) { return false; }
+  }
+
+  std::vector<size_t> offsets((size_t)num_threads + 1, 0);  // offsets[t] = index of the first row written by chunk t
+  {
+    scoped_timer_t timer("rows_prefix_sum");
+    for (int t = 0; t < num_threads; ++t) {
+      offsets[(size_t)t + 1] = offsets[(size_t)t] + infos[(size_t)t].constraints;
+    }
+  }
+
+  size_t total_rows = offsets[(size_t)num_threads];
+  {
+    scoped_timer_t timer("rows_resize_outputs");
+    state.row_names_sv.resize(total_rows);
+    state.problem.row_types_.resize(total_rows);
+  }
+
+  if (state.objective_name_sv.empty()) {  // no objective name set yet: the first 'N' row in file order becomes it
+    for (const auto& info : infos) {
+      if (!info.objective_names.empty()) {
+        state.objective_name_sv = info.objective_names.front();
+        break;
+      }
+    }
+  }
+  for (const auto& info : infos) {
+    for (std::string_view name : info.objective_names) {
+      state.add_ignored_objective_name(name);  // skips the active objective and duplicates
+    }
+  }
+
+  bool dense_candidate = total_rows > 0;
+  std::string_view dense_prefix;
+  uint64_t dense_base_id = 0;
+  size_t dense_pad_width = 0;
+  bool dense_zero_padded = false;
+
+  if (dense_candidate) {
+    std::string_view first_name;
+    for (const auto& info : infos) {
+      if (info.has_first_constraint) {
+        first_name = info.first_constraint_name;
+        break;
+      }
+    }
+
+    uint64_t first_value      = 0;
+    size_t first_suffix_width = 0;
+    if (!parse_trailing_u64(first_name, dense_prefix, first_value, first_suffix_width)) {
+      dense_candidate = false;
+    } else {
+      dense_base_id     = first_value;
+      dense_pad_width   = first_suffix_width;
+      dense_zero_padded = dense_suffix_is_zero_padded(first_name, first_suffix_width);
+    }
+  }
+
+  std::vector<uint8_t> dense_ok_by_chunk((size_t)num_threads, 1);  // one flag per chunk, written only by that chunk's iteration
+
+  {
+    scoped_timer_t timer("rows_fill_parallel");
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < num_threads; ++t) {
+      MPS_NVTX_RANGE(std::string("rows_fill_chunk ") + std::to_string(t), nvtx::colors::rows);
+      const char* p   = boundaries[(size_t)t].start;
+      const char* end = boundaries[(size_t)t].end;
+      size_t out      = offsets[(size_t)t];
+
+      bool local_dense_ok = dense_candidate;
+
+      while (p < end) {
+        char row_type = 0;
+        std::string_view row_name;
+        const char* before = p;
+        if (!parse_rows_line_fast(p, end, row_type, row_name)) {
+          if (p == before) {
+            local_dense_ok = false;
+            break;
+          }
+          continue;
+        }
+
+        if (row_type == 'N') { continue; }
+
+        state.row_names_sv[out]       = row_name;
+        state.problem.row_types_[out] = row_type;
+
+        if (local_dense_ok) {
+          std::string_view prefix;
+          uint64_t value      = 0;
+          size_t suffix_width = 0;
+          uint64_t expected   = dense_base_id + out;  // unlike the serial path, this sum is not overflow-checked
+          local_dense_ok =
+            parse_trailing_u64(row_name, prefix, value, suffix_width) && prefix == dense_prefix &&
+            value == expected &&
+            dense_suffix_width_ok(value, suffix_width, dense_zero_padded, dense_pad_width);
+        }
+        out++;
+      }
+
+      dense_ok_by_chunk[(size_t)t] = local_dense_ok ? 1 : 0;
+    }
+  }
+
+  {
+    scoped_timer_t timer("rows_dense_metadata");
+    for (uint8_t ok : dense_ok_by_chunk) {
+      dense_candidate = dense_candidate && ok;
+    }
+    state.row_dense_candidate = dense_candidate;
+    if (dense_candidate) {
+      state.row_dense_prefix      = dense_prefix;
+      state.row_dense_min_id      = dense_base_id;
+      state.row_dense_max_id      = dense_base_id + total_rows - 1;
+      state.row_dense_base_id     = dense_base_id;
+      state.row_dense_pad_width   = dense_pad_width;
+      state.row_dense_zero_padded = dense_zero_padded;
+    }
+  }
+
+  return true;
+}
+
+template <typename i_t, typename f_t>
+static void parse_rows_section_serial_impl(parse_state_t<i_t, f_t>& state, const char* rows_end)
+{  // Single-threaded ROWS parse: consumes records from state.cursor until it reaches rows_end.
+  scoped_timer_t timer("parse_rows_serial");
+
+  while (state.cursor.ptr < rows_end) {
+    auto row_type = state.cursor.ptr[0];
+    state.cursor.advance(1);
+    state.cursor.skip_ws();
+    // NOTE: the row type character is not validated here; whatever byte was read is
+    // stored as-is for every row that is not of type 'N'.
+    // (A check restricting it to E, L, G or N is currently disabled.)
+
+    auto row_name = state.cursor.read_field();
+    // ROWS fields after the row name are unused; tolerate annotations/comments there.
+    state.cursor.skip_to_eol();
+
+    // 'N' type is the objective row - store its name but don't add to constraints
+    if (row_type == 'N') {
+      state.observe_objective_row_name(row_name);
+    } else {
+      size_t row_idx = state.row_names_sv.size();
+      state.row_names_sv.push_back(row_name);
+      state.observe_row_name_for_dense_index(row_name, row_idx);
+      state.problem.row_types_.push_back(row_type);
+    }
+    expect_eol(state.cursor);
+  }
+}
+
+template <typename i_t, typename f_t>
+static void parse_rows_section(parse_state_t<i_t, f_t>& state, const char* rows_end)
+{  // Parses the ROWS section up to rows_end, sizes problem.b_ to the row count and builds the row index.
+  scoped_timer_t timer("parse_rows");
+  expect_section(state.cursor, "ROWS");
+
+  {
+    scoped_timer_t timer("parse_rows_scan");
+    const char* rows_start = state.cursor.ptr;
+
+    size_t rows_bytes    = (size_t)(rows_end - state.cursor.ptr);
+    int num_threads      = phase_thread_count(MPS_ROWS_THREAD_CAP);
+    bool parsed_parallel = false;
+    if (rows_bytes >= 512ull * 1024ull * 1024ull && num_threads > 1) {  // parallel path only for sections of at least 512 MiB
+      parsed_parallel =
+        parse_rows_section_parallel_impl<i_t, f_t>(state, state.cursor.ptr, rows_end, num_threads);
+      if (!parsed_parallel) {  // reset row state and redo the section with the serial parser
+        state.row_names_sv.clear();
+        state.problem.row_types_.clear();
+        state.row_dense_candidate   = true;
+        state.row_dense_prefix      = {};
+        state.row_dense_min_id      = 0;
+        state.row_dense_max_id      = 0;
+        state.row_dense_base_id     = 0;
+        state.row_dense_pad_width   = 0;
+        state.row_dense_zero_padded = false;
+        state.cursor.ptr            = rows_start;
+        parse_rows_section_serial_impl(state, rows_end);
+      }
+    } else {
+      parse_rows_section_serial_impl(state, rows_end);
+    }
+    state.cursor.ptr = rows_end;  // the parallel path never moves the cursor, so position it explicitly
+  }
+
+  state.problem.n_constraints_ = (i_t)state.row_names_sv.size();
+  state.problem.b_.resize((size_t)state.problem.n_constraints_);
+
+  {
+    scoped_timer_t timer("parse_rows_hash_init");
+    state.init_row_hash_table();
+  }
+}
+
+// =============================================================================
+// Parallel COLUMNS parser
+// =============================================================================
+
+// One 'MARKER' line seen while parsing a COLUMNS chunk. The COLUMNS chunk parser
+// records INTORG when the marker's third field is 'INTORG' and INTEND for anything else.
+struct MarkerInfo {
+  enum Type { INTORG, INTEND };
+  Type type;
+  // Chunk-local index of the last variable seen before the marker.
+  size_t after_local_var_idx;  // SIZE_MAX means "before first variable"
+};
+
+// Descriptor of one touched block of per-row counters inside a chunk's counter arena.
+struct RowCountBlock {
+  // Block number: row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS.
+  size_t block_id       = 0;
+  // Index of the block's first slot within ChunkResult::row_count_storage.
+  size_t storage_offset = 0;
+};
+
+// Per-chunk summary used to detect column names of the form <prefix><consecutive id>.
+// Maintained by observe_dense_col_name().
+struct DenseColChunkStats {
+  // Cleared as soon as any observed name breaks the pattern.
+  bool candidate = true;
+  // Prefix of the first observed name; later names must share it.
+  std::string_view prefix;
+  // Numeric suffix of the first and of the most recently accepted name.
+  uint64_t first_id = 0;
+  uint64_t last_id  = 0;
+  // Suffix width and zero-padding flag, both taken from the first observed name.
+  size_t pad_width  = 0;
+  bool zero_padded  = false;
+  // Number of names accepted so far.
+  size_t count      = 0;
+};
+
+// Output of parsing one COLUMNS chunk: a chunk-local column-major matrix plus the
+// metadata needed to merge all chunks into the global CSR.
+struct ChunkResult {
+  // Parallel arrays: coefficient and row index of each constraint nonzero, in file order.
+  std::vector<double> values;
+  std::vector<uint32_t> row_indices;
+  // Offset into values/row_indices where each local column starts, plus one final
+  // end-of-data entry.
+  std::vector<size_t> col_offsets;
+  // Name of each local column, in order of first appearance.
+  std::vector<std::string_view> var_names;
+  // Integer markers encountered in this chunk, in file order.
+  std::vector<MarkerInfo> markers;
+  std::vector<std::pair<size_t, double>> objective_entries;  // local_col_idx -> coefficient
+  // Sparse per-row scratch: each touched 4096-row block stores counts after parsing,
+  // then the same slots become CSR write cursors. This avoids scanning/allocating
+  // chunks*n_rows entries when a chunk only touches clustered row ranges. The
+  // block payloads live in one arena per chunk so scatter has hugepage-friendly
+  // write-position metadata instead of many independent 32 KiB allocations.
+  std::vector<int64_t> row_count_storage;
+  // Touched blocks, in order of first touch.
+  std::vector<RowCountBlock> row_count_blocks;
+  // Per block id: position in row_count_blocks, or a negative value if untouched.
+  std::vector<int32_t> row_count_block_dir;
+  // Column name on the first and on the last coefficient line parsed in this chunk.
+  std::string_view first_var_name;
+  std::string_view last_var_name;
+  // Dense-name statistics over var_names.
+  DenseColChunkStats dense_col_stats;
+};
+
+// Half-open byte range [start, end) of one COLUMNS chunk.
+struct ChunkBoundary {
+  const char* start;
+  const char* end;
+};
+
+// Half-open byte range [start, end) of one line-aligned chunk (used for BOUNDS).
+struct BoundsChunkBoundary {
+  const char* start;
+  const char* end;
+};
+
+// Returns a reference to the counter slot for row_idx inside result's sparse per-row
+// scratch. The first touch of a block appends a zero-filled block of
+// COLUMN_ROW_COUNT_BLOCK_ROWS slots to the arena and registers it in the directory.
+static inline int64_t& column_row_count_slot(ChunkResult& result, size_t row_idx)
+{
+  const size_t block_id     = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
+  const size_t row_in_block = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+
+  int32_t dir_entry = result.row_count_block_dir[block_id];
+  if (__unlikely(dir_entry < 0)) {
+    // Block not materialized yet: claim the next descriptor position first, then
+    // grow the arena and publish the descriptor.
+    dir_entry                            = (int32_t)result.row_count_blocks.size();
+    result.row_count_block_dir[block_id] = dir_entry;
+
+    RowCountBlock fresh;
+    fresh.block_id       = block_id;
+    fresh.storage_offset = result.row_count_storage.size();
+    result.row_count_storage.resize(fresh.storage_offset + COLUMN_ROW_COUNT_BLOCK_ROWS, 0);
+    result.row_count_blocks.push_back(std::move(fresh));
+  }
+
+  const size_t arena_base = result.row_count_blocks[(size_t)dir_entry].storage_offset;
+  return result.row_count_storage[arena_base + row_in_block];
+}
+
+// Feeds one column name into the chunk's dense-name statistics. The chunk remains a
+// candidate only while every name parses as <prefix><number>, shares the first name's
+// prefix, increases the number by exactly one, and passes the suffix-width check.
+static void observe_dense_col_name(DenseColChunkStats& stats, std::string_view name)
+{
+  if (!stats.candidate) { return; }
+
+  std::string_view name_prefix;
+  uint64_t id   = 0;
+  size_t digits = 0;
+  const bool parsed = parse_trailing_u64(name, name_prefix, id, digits);
+  if (!parsed) {
+    stats.candidate = false;
+    return;
+  }
+
+  const bool is_first_name = (stats.count == 0);
+  if (is_first_name) {
+    // The first name fixes the prefix, the starting id and the padding convention.
+    stats.prefix      = name_prefix;
+    stats.first_id    = id;
+    stats.last_id     = id;
+    stats.pad_width   = digits;
+    stats.zero_padded = dense_suffix_is_zero_padded(name, digits);
+    stats.count       = 1;
+    return;
+  }
+
+  // Evaluated left to right with short-circuiting, so last_id + 1 is never computed
+  // when last_id is already the maximum uint64_t.
+  const bool continues_sequence =
+    name_prefix == stats.prefix && stats.last_id != std::numeric_limits<uint64_t>::max() &&
+    id == stats.last_id + 1 &&
+    dense_suffix_width_ok(id, digits, stats.zero_padded, stats.pad_width);
+  if (!continues_sequence) {
+    stats.candidate = false;
+    return;
+  }
+
+  stats.last_id = id;
+  ++stats.count;
+}
+
+// Decides whether a chunk's suffix padding agrees with the global convention taken
+// from the first non-empty chunk.
+static bool dense_col_chunk_padding_compatible(const DenseColChunkStats& stats,
+                                               bool global_zero_padded,
+                                               size_t global_pad_width)
+{
+  // Without global zero padding, a chunk is compatible iff it is not zero padded either.
+  if (!global_zero_padded) { return !stats.zero_padded; }
+
+  // Same width as the global convention is always fine.
+  if (stats.pad_width == global_pad_width) { return true; }
+
+  // Otherwise accept an unpadded chunk whose first id already has at least
+  // global_pad_width decimal digits.
+  return !stats.zero_padded && decimal_digits_u64(stats.first_id) >= global_pad_width;
+}
+
+// Returns the first field (the column name) of the line starting at line_start,
+// without touching any parser state. Leading bytes <= ' ' other than '\n' are
+// skipped; the field then extends over the following run of bytes > ' '.
+// NOTE(review): the comparisons are done on plain char, so the treatment of bytes
+// >= 0x80 depends on the platform's char signedness -- confirm this matches the
+// cursor's field reader.
+static std::string_view peek_line_column_name(const char* line_start, const char* end)
+{
+  const char* name_begin = line_start;
+  for (; name_begin < end; ++name_begin) {
+    const char c = *name_begin;
+    if (c > ' ' || c == '\n') { break; }
+  }
+
+  const char* name_end = name_begin;
+  while (name_end < end && *name_end > ' ') {
+    ++name_end;
+  }
+
+  return std::string_view(name_begin, (size_t)(name_end - name_begin));
+}
+
+// Returns a pointer just past the first '\n' in [p, end), or the position where the
+// scan stopped (end, or p itself when p >= end) if there is no newline.
+static const char* find_next_line(const char* p, const char* end)
+{
+  for (; p < end; ++p) {
+    if (*p == '\n') { return p + 1; }
+  }
+  return p;
+}
+
+// Scans line by line from bounds_body_start and returns the start of the first line
+// that begins a following section (ENDATA, RANGES, or a quadratic section as judged
+// by is_quadratic_section_start), or parse_end if no such line exists.
+static const char* find_bounds_body_end(const char* bounds_body_start, const char* parse_end)
+{
+  // True when the bytes at p spell `keyword` and the keyword is followed either by
+  // the end of the parse range or by a byte <= ' '. The byte after the keyword is
+  // only read when it lies inside [p, parse_end), so a keyword that ends exactly at
+  // parse_end never causes a read past the range.
+  auto line_starts_with_keyword = [parse_end](const char* p, const char* keyword, size_t len) {
+    const size_t remaining = (size_t)(parse_end - p);
+    if (remaining < len || std::memcmp(p, keyword, len) != 0) { return false; }
+    return remaining == len || p[len] <= ' ';
+  };
+
+  const char* p = bounds_body_start;
+  while (p < parse_end) {
+    // The first-byte tests keep the common (non-header) line cheap.
+    if ((*p == 'E' && line_starts_with_keyword(p, "ENDATA", 6)) ||
+        (*p == 'Q' && is_quadratic_section_start(p, parse_end)) ||
+        (*p == 'R' && line_starts_with_keyword(p, "RANGES", 6))) {
+      return p;
+    }
+    p = find_next_line(p, parse_end);
+  }
+  return parse_end;
+}
+
+// Splits [section_start, section_end) into num_threads consecutive chunks of roughly
+// equal byte size. Every interior split point is moved forward to the start of the
+// next line, so no line straddles two chunks; trailing chunks may be empty.
+static std::vector<BoundsChunkBoundary> compute_line_chunk_boundaries(const char* section_start,
+                                                                      const char* section_end,
+                                                                      int num_threads)
+{
+  scoped_timer_t timer("bounds_compute_chunk_boundaries");
+
+  const size_t section_bytes = (size_t)(section_end - section_start);
+  const size_t stride        = section_bytes / (size_t)num_threads;
+
+  std::vector<BoundsChunkBoundary> chunks((size_t)num_threads);
+  chunks[0].start = section_start;
+
+  // Interior boundaries: the end of chunk i is also the start of chunk i + 1.
+  for (int i = 0; i + 1 < num_threads; ++i) {
+    const char* split = find_next_line(section_start + (size_t)(i + 1) * stride, section_end);
+    chunks[(size_t)i].end       = split;
+    chunks[(size_t)i + 1].start = split;
+  }
+
+  // The last chunk always runs to the end of the section.
+  chunks[(size_t)(num_threads - 1)].end = section_end;
+  return chunks;
+}
+
+// Splits the COLUMNS body into num_threads consecutive chunks of roughly equal byte
+// size. Interior split points are pushed forward until the leading field of a line
+// differs from the one at the estimated split, so that the coefficient lines of one
+// column are not divided between two chunks. Runs serially.
+static std::vector<ChunkBoundary> compute_chunk_boundaries(const char* columns_start,
+                                                           const char* columns_end,
+                                                           int num_threads)
+{
+  scoped_timer_t timer("compute_chunk_boundaries");
+
+  const size_t section_bytes = (size_t)(columns_end - columns_start);
+  const size_t stride        = section_bytes / (size_t)num_threads;
+
+  std::vector<ChunkBoundary> chunks(num_threads);
+
+  // Starting from an arbitrary byte position, returns the line start at which the
+  // chunk should end.
+  auto next_column_transition = [columns_end](const char* from) -> const char* {
+    // Align to the first line start after `from` and remember that line's name.
+    const char* line               = find_next_line(from, columns_end);
+    const std::string_view current = peek_line_column_name(line, columns_end);
+
+    while (line < columns_end) {
+      const char* following = find_next_line(line, columns_end);
+      // No further line: stop on the start of the last line.
+      if (following >= columns_end) { break; }
+
+      const std::string_view candidate = peek_line_column_name(following, columns_end);
+      line                             = following;
+      // A transition is a non-empty leading field that differs from the current
+      // name and does not begin with a quote character.
+      if (candidate != current && !candidate.empty() && candidate[0] != '\'') { break; }
+    }
+    return line;
+  };
+
+  const int last = num_threads - 1;
+  for (int t = 0; t < num_threads; ++t) {
+    // Each chunk starts where the previous one ended.
+    chunks[t].start = (t == 0) ? columns_start : chunks[t - 1].end;
+    chunks[t].end =
+      (t == last) ? columns_end : next_column_transition(columns_start + (t + 1) * stride);
+  }
+
+  return chunks;
+}
+
+// Parses the COLUMNS lines in [chunk_start, chunk_end) into a chunk-local result.
+//
+// For every coefficient line the column name is compared with the previous one; a
+// change opens a new local column. Coefficients on known constraint rows are
+// appended to values/row_indices and counted per row; coefficients on the objective
+// row are collected in objective_entries; names accepted by
+// state.is_ignored_objective_name() are dropped; any other row name is reported
+// through state.cursor.error(). 'MARKER' lines are recorded in result.markers and
+// lines whose second field starts with '$' are skipped as comments.
+//
+// `state` is only read here (row lookup, objective names, row count).
+// The returned col_offsets always ends with a final end-of-data entry.
+template <typename i_t, typename f_t>
+static ChunkResult parse_columns_chunk(const char* chunk_start,
+                                       const char* chunk_end,
+                                       const parse_state_t<i_t, f_t>& state)
+{
+  ChunkResult result;
+
+  // Empty chunk: return an empty result with the single terminating offset.
+  if (chunk_start >= chunk_end) {
+    result.col_offsets.push_back(0);
+    return result;
+  }
+
+  // Rough capacity guesses from the chunk size (about 100 bytes per nonzero and
+  // about 10 nonzeros per column) to limit reallocation while parsing.
+  size_t chunk_size     = (size_t)(chunk_end - chunk_start);
+  size_t estimated_nnz  = chunk_size / 100;
+  size_t estimated_cols = estimated_nnz / 10;
+  if (__unlikely(state.problem.n_constraints_ > (i_t)std::numeric_limits<int32_t>::max())) {
+    state.cursor.error("fast COLUMNS path requires <= INT32_MAX rows for chunk row indices");
+  }
+  result.values.reserve(estimated_nnz);
+  result.row_indices.reserve(estimated_nnz);
+  result.col_offsets.reserve(estimated_cols + 1);
+  result.var_names.reserve(estimated_cols);
+  result.objective_entries.reserve(estimated_cols);
+  // One directory entry per block of COLUMN_ROW_COUNT_BLOCK_ROWS rows; -1 = untouched.
+  size_t n_row_blocks = ((size_t)state.problem.n_constraints_ + COLUMN_ROW_COUNT_BLOCK_ROWS - 1) /
+                        COLUMN_ROW_COUNT_BLOCK_ROWS;
+  result.row_count_block_dir.resize(n_row_blocks, -1);
+  size_t estimated_touched_blocks = std::min(n_row_blocks, std::max<size_t>(16, estimated_nnz));
+  result.row_count_blocks.reserve(estimated_touched_blocks);
+  result.row_count_storage.reserve(estimated_touched_blocks * COLUMN_ROW_COUNT_BLOCK_ROWS);
+
+  cursor_t cursor(chunk_start, (size_t)(chunk_end - chunk_start));
+  std::string_view prev_var_name = "";
+
+  cursor.skip_ws();
+
+  while (!cursor.done()) {
+    if (__unlikely(*cursor.ptr == 'R')) {
+      auto next = cursor.peek_field();
+      // RHS section is mandatory right after COLUMNS section
+      if (next == "RHS") { break; }
+    }
+
+    auto [var_name, field2] = cursor.read_two_fields();
+    if (__unlikely(!field2.empty() && field2[0] == '$')) {
+      cursor.skip_to_eol();
+      expect_eol(cursor);
+      continue;
+    }
+
+    // Check for integer marker. field2 may be empty on a malformed line, so its
+    // first byte is only inspected after the emptiness test.
+    if (__unlikely(!field2.empty() && field2[0] == '\'' && field2 == "'MARKER'")) {
+      auto marker_type = cursor.read_field();
+
+      MarkerInfo marker;
+      marker.after_local_var_idx =
+        result.var_names.empty() ? SIZE_MAX : result.var_names.size() - 1;
+
+      // Any marker type other than 'INTORG' is treated as 'INTEND'.
+      if (marker_type == "'INTORG'") {
+        marker.type = MarkerInfo::INTORG;
+      } else {
+        marker.type = MarkerInfo::INTEND;
+      }
+      result.markers.push_back(marker);
+
+      // Skip the rest of the marker line, including its end-of-line byte.
+      while (!cursor.done() && !cursor.eol())
+        cursor.ptr++;
+      if (!cursor.done()) cursor.ptr++;
+      cursor.skip_ws();
+      continue;
+    }
+
+    auto row_name = field2;
+    // quite often in MIPs the coefficient is just a single-digit integer
+    double value;
+    double sign = 1.0;
+    // Bounds-checked: the line may end right after the row name at the chunk end.
+    if (cursor.ptr < cursor.end && cursor.ptr[0] == '-') {
+      sign = -1.0;
+      cursor.advance(1);
+    }
+    if (cursor.ptr + 1 < cursor.end && is_digit_byte(cursor.ptr[0]) && cursor.ptr[1] == '\n') {
+      // Fast path: exactly one digit directly followed by a newline.
+      value = sign * (cursor.ptr[0] - '0');
+      cursor.advance(1);
+    } else {
+      value = sign * fast_atof_advance(cursor.ptr, cursor.end);
+    }
+    // usually EOL directly follows
+    if (__unlikely(!cursor.eol())) { cursor.skip_ws(); }
+    accept_comment(cursor);
+
+    if (result.first_var_name.empty()) { result.first_var_name = var_name; }
+    result.last_var_name = var_name;
+
+    // A change of column name opens a new local column.
+    if (prev_var_name != var_name) {
+      result.var_names.push_back(var_name);
+      observe_dense_col_name(result.dense_col_stats, var_name);
+      result.col_offsets.push_back(result.values.size());
+      prev_var_name = var_name;
+    }
+
+    // Routes one (row name, coefficient) pair to the matrix, the objective, or an error.
+    auto add_entry = [&](std::string_view rn, double val) {
+      size_t row_idx = state.row_lookup(rn);
+      if (__likely(row_idx != SIZE_MAX)) {
+        assert(row_idx <= (size_t)std::numeric_limits<int32_t>::max());
+        result.values.push_back(val);
+        result.row_indices.push_back((uint32_t)row_idx);
+        column_row_count_slot(result, row_idx)++;
+      } else if (__likely(rn == state.objective_name_sv)) {
+        result.objective_entries.push_back({result.var_names.size() - 1, val});
+      } else if (state.is_ignored_objective_name(rn)) {
+        return;
+      } else {
+        state.cursor.error("unknown row name in COLUMNS: %.*s", (int)rn.size(), rn.data());
+      }
+    };
+
+    add_entry(row_name, value);
+
+    // Optional second entry on same line
+    if (!cursor.eol()) {
+      auto row_name2 = cursor.read_field();
+      if (__unlikely(!row_name2.empty() && row_name2[0] == '$')) {
+        cursor.skip_to_eol();
+        expect_eol(cursor);
+        continue;
+      }
+      double value2 = fast_atof_advance(cursor.ptr, cursor.end);
+      cursor.skip_ws();
+      accept_comment(cursor);
+
+      add_entry(row_name2, value2);
+    }
+
+    expect_eol(cursor);
+  }
+
+  // Terminating offset so that column i spans [col_offsets[i], col_offsets[i + 1]).
+  result.col_offsets.push_back(result.values.size());
+
+  return result;
+}
+
+// Fused merge + CSR construction: directly builds CSR from chunks without intermediate global CSC
+//
+// Inputs are the per-chunk results in file order. On return state.problem holds the
+// row-major matrix (A_offsets_, A_indices_, A_), variable types, objective
+// coefficients and the final n_vars_/nnz_; state.var_names_sv is filled unless the
+// column names were recognized as a dense <prefix><id> sequence, in which case only
+// the col_dense_* description is stored. The per-row counters inside `chunks` are
+// overwritten with write positions, so `chunks` must not be reused for counting.
+// num_threads is the thread count used for the parallel phases.
+template <typename i_t, typename f_t>
+static void merge_chunk_results_to_csr(parse_state_t<i_t, f_t>& state,
+                                       std::vector<ChunkResult>& chunks,
+                                       int num_threads)
+{
+  scoped_timer_t timer("merge_chunks_to_csr");
+
+  int num_chunks = (int)chunks.size();
+  if (num_chunks == 0) return;
+
+  i_t n_rows = state.problem.n_constraints_;
+
+  // Step 1: Global column offset of each chunk and total nonzero count
+  std::vector<size_t> global_col_offset(num_chunks + 1);
+  global_col_offset[0] = 0;
+  size_t total_nnz     = 0;
+  {
+    scoped_timer_t timer("columns_global_offsets");
+    for (int t = 0; t < num_chunks; t++) {
+      global_col_offset[t + 1] = global_col_offset[t] + chunks[t].var_names.size();
+      total_nnz += chunks[t].values.size();
+    }
+  }
+  size_t total_cols = global_col_offset[num_chunks];
+
+  // Offsets, column indices and the final dimensions are all stored as i_t. Refuse
+  // inputs that do not fit instead of silently narrowing them below.
+  if (__unlikely(total_nnz > (size_t)std::numeric_limits<i_t>::max() ||
+                 total_cols > (size_t)std::numeric_limits<i_t>::max())) {
+    state.cursor.error("fast COLUMNS path: nonzero or column count exceeds the index type range");
+  }
+
+  // Step 2: Decide whether the column names form one dense <prefix><id> sequence
+  {
+    scoped_timer_t timer("columns_dense_metadata");
+    bool dense_ok   = total_cols > 0;
+    bool have_first = false;
+    std::string_view dense_prefix;
+    uint64_t expected_next_id = 0;
+    uint64_t dense_min_id     = 0;
+    uint64_t dense_max_id     = 0;
+    size_t dense_pad_width    = 0;
+    bool dense_zero_padded    = false;
+
+    for (int t = 0; t < num_chunks && dense_ok; ++t) {
+      const auto& stats = chunks[t].dense_col_stats;
+      if (stats.count == 0) { continue; }
+      if (!stats.candidate || stats.count != chunks[t].var_names.size()) {
+        dense_ok = false;
+        break;
+      }
+      if (!have_first) {
+        // The first non-empty chunk fixes the prefix and padding convention.
+        have_first        = true;
+        dense_prefix      = stats.prefix;
+        expected_next_id  = stats.first_id;
+        dense_min_id      = stats.first_id;
+        dense_pad_width   = stats.pad_width;
+        dense_zero_padded = stats.zero_padded;
+      }
+      // Each chunk must continue exactly where the previous one stopped.
+      if (stats.prefix != dense_prefix || stats.first_id != expected_next_id ||
+          !dense_col_chunk_padding_compatible(stats, dense_zero_padded, dense_pad_width)) {
+        dense_ok = false;
+        break;
+      }
+      if (stats.last_id < stats.first_id || stats.last_id - stats.first_id + 1 != stats.count) {
+        dense_ok = false;
+        break;
+      }
+      dense_max_id = stats.last_id;
+      if (stats.last_id == std::numeric_limits<uint64_t>::max()) {
+        expected_next_id = stats.last_id;
+        dense_ok         = false;
+        break;
+      }
+      expected_next_id = stats.last_id + 1;
+    }
+
+    if (!have_first || dense_max_id < dense_min_id ||
+        dense_max_id - dense_min_id + 1 != total_cols) {
+      dense_ok = false;
+    }
+
+    state.col_dense_ordered = dense_ok;
+    if (dense_ok) {
+      state.col_dense_prefix      = dense_prefix;
+      state.col_dense_min_id      = dense_min_id;
+      state.col_dense_max_id      = dense_max_id;
+      state.col_dense_pad_width   = dense_pad_width;
+      state.col_dense_zero_padded = dense_zero_padded;
+    }
+  }
+
+  // Step 3: Sum row counts (already computed during parsing) and build CSR row_offsets
+  std::vector<i_t> global_row_counts((size_t)n_rows, 0);
+  {
+    scoped_timer_t timer("columns_sum_row_counts");
+    for (int t = 0; t < num_chunks; t++) {
+      for (const auto& block : chunks[t].row_count_blocks) {
+        const int64_t* block_counts = chunks[t].row_count_storage.data() + block.storage_offset;
+        size_t row_base             = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+        // The last block may extend past n_rows; only its valid rows are visited.
+        size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)n_rows - row_base);
+        for (size_t local = 0; local < block_limit; ++local) {
+          global_row_counts[row_base + local] += (i_t)block_counts[local];
+        }
+      }
+    }
+  }
+  {
+    scoped_timer_t timer("columns_build_row_offsets");
+    state.problem.A_offsets_.resize((size_t)n_rows + 1);
+    state.problem.A_offsets_[0] = 0;
+    for (i_t r = 0; r < n_rows; r++) {
+      state.problem.A_offsets_[(size_t)r + 1] =
+        state.problem.A_offsets_[(size_t)r] + global_row_counts[(size_t)r];
+    }
+  }
+
+  // Step 4: Turn each chunk's per-row counts into its first write position in that
+  // row. Chunks are visited in order, so within a row earlier chunks (lower global
+  // column indices) get the earlier slots.
+  {
+    scoped_timer_t timer("columns_counts_to_write_positions");
+    std::fill(global_row_counts.begin(), global_row_counts.end(), i_t{0});
+    for (int t = 0; t < num_chunks; t++) {
+      for (auto& block : chunks[t].row_count_blocks) {
+        int64_t* block_counts = chunks[t].row_count_storage.data() + block.storage_offset;
+        size_t row_base       = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+        size_t block_limit    = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)n_rows - row_base);
+        for (size_t local = 0; local < block_limit; ++local) {
+          int64_t count = block_counts[local];
+          if (count == 0) continue;
+          size_t row          = row_base + local;
+          i_t pos             = state.problem.A_offsets_[row] + global_row_counts[row];
+          block_counts[local] = (int64_t)pos;
+          global_row_counts[row] += (i_t)count;
+        }
+      }
+    }
+  }
+
+  // Step 5: Hand each chunk's write-position arena to materialize_vector_hugepages
+  {
+    scoped_timer_t timer("columns_row_count_storage_hugepages");
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < num_chunks; ++t) {
+      materialize_vector_hugepages(
+        "column_row_count_storage", chunks[t].row_count_storage, materialize_touch_t::write_2mb);
+    }
+  }
+
+  // Step 6: Allocate CSR arrays
+  {
+    scoped_timer_t timer("allocate_csr_arrays");
+
+    // May be unexpectedly slow, even if already reserved() to good fit.
+    // I assume the cause is probably that the pages aren't actually backed when reserve() is called
+    // and the actual physical allocation only happens now
+
+    // evil tweak until we can refactor problem_t
+    // run the zero-init resize() calls in parallel
+
+#pragma omp parallel sections num_threads(4)
+    {
+#pragma omp section
+      {
+        state.problem.A_.resize(total_nnz);
+      }
+#pragma omp section
+      {
+        state.problem.A_indices_.resize(total_nnz);
+      }
+#pragma omp section
+      {
+        if (!state.col_dense_ordered) { state.var_names_sv.resize(total_cols); }
+      }
+#pragma omp section
+      {
+        state.problem.var_types_.resize(total_cols);
+      }
+    }
+  }
+
+  // Step 7: Parallel scatter into CSR + copy var_names
+  {
+    scoped_timer_t timer("scatter_into_csr");
+    {
+      scoped_timer_t matrix_timer("scatter_matrix_entries");
+#ifdef MPS_FAST_PERF_COUNTERS
+      std::vector<perf_counter_snapshot_t> perf_snapshots((size_t)num_chunks);
+#endif
+      // Each chunk advances only its own write positions, and Step 4 gave every
+      // chunk a disjoint slot range per row, so chunks can scatter concurrently.
+#pragma omp parallel for num_threads(num_threads)
+      for (int t = 0; t < num_chunks; t++) {
+#ifdef MPS_FAST_PERF_COUNTERS
+        thread_perf_counters_t perf_counters;
+#endif
+        auto& chunk = chunks[t];
+
+        for (size_t local_col = 0; local_col < chunks[t].var_names.size(); local_col++) {
+          i_t global_col = (i_t)(global_col_offset[t] + local_col);
+
+          size_t col_start = chunks[t].col_offsets[local_col];
+          size_t col_end   = chunks[t].col_offsets[local_col + 1];
+          for (size_t idx = col_start; idx < col_end; idx++) {
+            i_t row                        = (i_t)chunks[t].row_indices[idx];
+            size_t row_idx                 = (size_t)row;
+            size_t block_id                = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
+            size_t local                   = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+            int32_t block_pos              = chunk.row_count_block_dir[block_id];
+            RowCountBlock& block           = chunk.row_count_blocks[(size_t)block_pos];
+            int64_t& write_pos             = chunk.row_count_storage[block.storage_offset + local];
+            i_t dest                       = (i_t)write_pos++;
+            state.problem.A_[dest]         = (f_t)chunks[t].values[idx];
+            state.problem.A_indices_[dest] = global_col;
+          }
+        }
+#ifdef MPS_FAST_PERF_COUNTERS
+        perf_snapshots[(size_t)t] = perf_counters.stop();
+#endif
+      }
+#ifdef MPS_FAST_PERF_COUNTERS
+      print_perf_totals("scatter_matrix_entries", perf_snapshots);
+#endif
+    }
+
+    if (!state.col_dense_ordered) {
+      {
+        scoped_timer_t names_timer("scatter_var_names");
+#pragma omp parallel for num_threads(num_threads)
+        for (int t = 0; t < num_chunks; t++) {
+          for (size_t i = 0; i < chunks[t].var_names.size(); i++) {
+            state.var_names_sv[global_col_offset[t] + i] = chunks[t].var_names[i];
+          }
+        }
+      }
+    } else {
+      // Dense names are not materialized; the timer is still created so the
+      // "scatter_var_names" entry appears in both cases.
+      scoped_timer_t names_timer("scatter_var_names");
+    }
+  }
+
+  // Step 8: Apply integer markers
+  struct GlobalMarker {
+    MarkerInfo::Type type;
+    size_t global_var_idx;
+  };
+  {
+    scoped_timer_t timer("columns_apply_markers");
+    std::vector<GlobalMarker> all_markers;
+
+    for (int t = 0; t < num_chunks; t++) {
+      for (const auto& m : chunks[t].markers) {
+        GlobalMarker gm;
+        gm.type = m.type;
+
+        if (m.after_local_var_idx == SIZE_MAX) {
+          // Marker before any variable in this chunk
+          gm.global_var_idx = (global_col_offset[t] > 0) ? global_col_offset[t] - 1 : SIZE_MAX;
+        } else {
+          gm.global_var_idx = global_col_offset[t] + m.after_local_var_idx;
+        }
+        all_markers.push_back(gm);
+      }
+    }
+
+    // Markers were collected in file order. Several markers can share one position
+    // (e.g. INTEND immediately followed by INTORG), and the one that comes last in
+    // the file must win, so the sort has to keep equal keys in their original order.
+    std::stable_sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) {
+      // SIZE_MAX means "before all variables" - should sort first
+      if (a.global_var_idx == SIZE_MAX && b.global_var_idx != SIZE_MAX) return true;
+      if (b.global_var_idx == SIZE_MAX && a.global_var_idx != SIZE_MAX) return false;
+      return a.global_var_idx < b.global_var_idx;
+    });
+
+    bool is_integer   = false;
+    size_t marker_idx = 0;
+
+    for (size_t v = 0; v < total_cols; v++) {
+      // Consume every marker positioned before variable v; the last one decides.
+      while (marker_idx < all_markers.size() &&
+             (all_markers[marker_idx].global_var_idx == SIZE_MAX ||
+              all_markers[marker_idx].global_var_idx < v)) {
+        if (all_markers[marker_idx].type == MarkerInfo::INTORG) {
+          is_integer = true;
+        } else {
+          is_integer = false;
+        }
+        marker_idx++;
+      }
+      state.problem.var_types_[v] = is_integer ? 'I' : 'C';
+    }
+  }
+
+  // Step 9: Handle objective entries
+  {
+    scoped_timer_t timer("columns_objective_entries");
+    state.problem.c_.resize(total_cols, f_t{0});
+    for (int t = 0; t < num_chunks; t++) {
+      for (const auto& [local_col, coeff] : chunks[t].objective_entries) {
+        size_t global_col = global_col_offset[t] + local_col;
+        if (global_col < total_cols) { state.problem.c_[global_col] = (f_t)coeff; }
+      }
+    }
+  }
+
+  // Store final dimensions; CSR and objective coefficients are already complete.
+  state.problem.n_vars_ = (i_t)total_cols;
+  state.problem.nnz_    = (i_t)total_nnz;
+}
+
+// Parses the whole COLUMNS section: consumes the "COLUMNS" header, splits the body
+// [cursor, columns_end) into per-thread chunks, parses the chunks concurrently,
+// merges them into state.problem, and leaves state.cursor after columns_end (with
+// following whitespace skipped).
+//
+// num_threads <= 0 selects phase_thread_count(MPS_COLUMNS_THREAD_CAP). The count is
+// then reduced so that each chunk covers at least MPS_COLUMNS_MIN_CHUNK_BYTES, and
+// is never less than 1.
+template <typename i_t, typename f_t>
+static void parse_columns_section_parallel(parse_state_t<i_t, f_t>& state,
+                                           int num_threads,
+                                           const char* columns_end)
+{
+  scoped_timer_t timer("parse_columns_parallel");
+
+  if (num_threads <= 0) { num_threads = phase_thread_count(MPS_COLUMNS_THREAD_CAP); }
+
+  // Skip the "COLUMNS" header
+  expect_section(state.cursor, "COLUMNS");
+
+  const char* columns_start    = state.cursor.ptr;
+  size_t columns_bytes         = (size_t)(columns_end - columns_start);
+  // Do not use more threads than there are minimum-sized chunks of input.
+  size_t chunk_limited_threads = std::max<size_t>(1, columns_bytes / MPS_COLUMNS_MIN_CHUNK_BYTES);
+  num_threads = std::max(1, std::min<int>(num_threads, (int)chunk_limited_threads));
+
+  // Compute chunk boundaries
+  auto chunk_bounds = compute_chunk_boundaries(columns_start, columns_end, num_threads);
+
+  // Parse chunks in parallel
+  std::vector<ChunkResult> results(num_threads);
+
+  {
+    scoped_timer_t timer("parse_columns_chunk_parallel");
+#ifdef MPS_FAST_PERF_COUNTERS
+    std::vector<perf_counter_snapshot_t> perf_snapshots((size_t)num_threads);
+#endif
+    {
+      // One chunk per loop iteration; each iteration writes only results[t].
+      // NOTE(review): parse_columns_chunk reports problems through cursor error
+      // helpers; if those throw, the exception leaves this OpenMP region uncaught.
+      // The BOUNDS parser wraps its loop body in try/catch -- confirm whether the
+      // same is needed here.
+#pragma omp parallel for num_threads(num_threads)
+      for (int t = 0; t < num_threads; t++) {
+        MPS_NVTX_RANGE(std::string("columns_chunk ") + std::to_string(t), nvtx::colors::columns);
+#ifdef MPS_FAST_PERF_COUNTERS
+        thread_perf_counters_t perf_counters;
+#endif
+        results[t] =
+          parse_columns_chunk<i_t, f_t>(chunk_bounds[t].start, chunk_bounds[t].end, state);
+#ifdef MPS_FAST_PERF_COUNTERS
+        perf_snapshots[(size_t)t] = perf_counters.stop();
+#endif
+      }
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("parse_columns_chunk_parallel", perf_snapshots);
+#endif
+  }
+
+  // Merge results directly into CSR format
+  merge_chunk_results_to_csr(state, results, num_threads);
+
+  // Update cursor to RHS section
+  state.cursor.ptr = columns_end;
+  state.cursor.skip_ws();
+}
+
+// Parses the RHS section from `cursor`: consumes the "RHS" header, then reads lines
+// until is_rhs_section_end() reports the end. Each line carries a set name (ignored)
+// followed by one or two (row name, value) pairs. Values for known constraint rows
+// go to problem.b_; a value on the objective row sets objective_offset_ to its
+// negation; names accepted by is_ignored_objective_name() are dropped; any other
+// name is reported through error_unknown_row().
+template <typename i_t, typename f_t>
+static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+{
+  scoped_timer_t timer("parse_rhs");
+  expect_section(cursor, "RHS");
+
+  // Stores one parsed value. name_begin points at the row-name field and is only
+  // re-read when the lookup did not resolve to a constraint row.
+  auto store_value = [&](const char* name_begin, size_t row_idx, f_t value) {
+    if (row_idx == SIZE_MAX) {
+      // Recover the row name: the run of bytes > ' ' starting at name_begin.
+      const char* name_end = name_begin;
+      while (name_end < cursor.end && *name_end > ' ') {
+        ++name_end;
+      }
+      const std::string_view row_name(name_begin, (size_t)(name_end - name_begin));
+
+      if (row_name == state.objective_name_sv) {
+        state.problem.objective_offset_ = -value;
+      } else if (!state.is_ignored_objective_name(row_name)) {
+        error_unknown_row(cursor, name_begin, "RHS");
+      }
+      return;
+    }
+    state.problem.b_[row_idx] = value;
+  };
+
+  // Reads one (row name, value) pair at the cursor, stores it, and skips a trailing
+  // comment if present.
+  auto consume_entry = [&]() {
+    const char* name_begin = cursor.ptr;
+    const size_t row_idx   = state.read_row_lookup(cursor);
+    const auto value       = expect_number_fast_pm_one(cursor);
+    store_value(name_begin, row_idx, (f_t)value);
+    accept_comment(cursor);
+  };
+
+  while (cursor.ptr < cursor.end && !is_rhs_section_end(cursor.ptr, cursor.end)) {
+    // The RHS set name is read only to advance past it.
+    (void)cursor.read_field();
+    if (accept_comment(cursor)) {
+      expect_eol(cursor);
+      continue;
+    }
+
+    consume_entry();
+    // Optional second pair on the same line.
+    if (!cursor.eol()) { consume_entry(); }
+    expect_eol(cursor);
+  }
+}
+
+template <typename i_t, typename f_t>
+static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
+                                                cursor_t& cursor,
+                                                const char* bounds_body_start,
+                                                const char* bounds_body_end,
+                                                size_t n_vars)
+{
+  const size_t bounds_bytes = (size_t)(bounds_body_end - bounds_body_start);
+  const int num_threads     = phase_thread_count(MPS_BOUNDS_THREAD_CAP);
+  if (!state.col_dense_ordered || bounds_bytes < MPS_BOUNDS_PARALLEL_MIN_BYTES || num_threads < 2) {
+    return false;
+  }
+
+  MPS_NVTX_RANGE("parse_bounds_parallel_dense", nvtx::colors::bounds);
+
+  struct BoundsParallelStats {
+    size_t lines            = 0;
+    size_t dense_hits       = 0;
+    size_t dense_misses     = 0;
+    size_t comments         = 0;
+    size_t min_var          = SIZE_MAX;
+    size_t max_var          = 0;
+    size_t non_strict_order = 0;
+    bool saw_integer_type   = false;
+    bool saw_negative_upper = false;
+    const char* error_ptr   = nullptr;
+    char error_msg[192]     = {};
+  };
+
+  std::vector<BoundsParallelStats> stats((size_t)num_threads);
+  auto boundaries = compute_line_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads);
+
+  std::vector<uint8_t> bound_seen;
+  {
+    scoped_timer_t timer("bounds_parallel_seen_alloc");
+    bound_seen.resize(n_vars, 0);
+  }
+
+  {
+    scoped_timer_t timer("parse_bounds_parallel_dense");
+    // Duplicate or non-monotone BOUNDS updates are file-order dependent. Parse
+    // optimistically, then accept only if chunk summaries prove strict order.
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+    for (int t = 0; t < num_threads; ++t) {
+      auto& local = stats[(size_t)t];
+      cursor_t cursor(boundaries[(size_t)t].start,
+                      (size_t)(boundaries[(size_t)t].end - boundaries[(size_t)t].start));
+      cursor.skip_ws();
+      size_t prev_var = SIZE_MAX;
+      try {
+        while (cursor.ptr < cursor.end) {
+          if (__unlikely(*cursor.ptr == '$')) {
+            cursor.skip_to_eol();
+            expect_eol(cursor);
+            local.comments++;
+            continue;
+          }
+
+          auto bound_type = cursor.read_field();
+          if (__unlikely(bound_type.empty())) { break; }
+          if (__unlikely(bound_type[0] == '$')) {
+            cursor.skip_to_eol();
+            expect_eol(cursor);
+            local.comments++;
+            continue;
+          }
+
+          auto bound_name = cursor.read_field();
+          (void)bound_name;
+          auto var_name = cursor.read_field();
+          if (__unlikely(!var_name.empty() && var_name[0] == '$')) {
+            cursor.skip_to_eol();
+            expect_eol(cursor);
+            local.comments++;
+            continue;
+          }
+
+          size_t var_idx = state.col_lookup_dense_ordered(var_name);
+          if (__unlikely(var_idx == SIZE_MAX)) {
+            local.dense_misses++;
+            std::snprintf(local.error_msg,
+                          sizeof(local.error_msg),
+                          "unknown variable name in BOUNDS: %.*s",
+                          (int)var_name.size(),
+                          var_name.data());
+            local.error_ptr = cursor.ptr;
+            break;
+          }
+          local.dense_hits++;
+          local.lines++;
+          local.min_var = std::min(local.min_var, var_idx);
+          local.max_var = std::max(local.max_var, var_idx);
+          if (prev_var != SIZE_MAX && var_idx <= prev_var) { local.non_strict_order++; }
+          prev_var = var_idx;
+
+          bool first_bound_for_var = bound_seen[var_idx] == 0;
+          bound_seen[var_idx]      = 1;
+
+          f_t value = 0;
+          accept_comment(cursor);
+          if (!cursor.eol()) {
+            value = (f_t)expect_number_fast_pm_one(cursor);
+            accept_comment(cursor);
+          }
+
+          if (bound_type == "LO") {
+            state.problem.variable_lower_bounds_[var_idx] = value;
+          } else if (bound_type == "UP") {
+            state.problem.variable_upper_bounds_[var_idx] = value;
+            if (first_bound_for_var && value < f_t{0}) {
+              state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+              local.saw_negative_upper                      = true;
+            }
+          } else if (bound_type == "FX") {
+            state.problem.variable_lower_bounds_[var_idx] = value;
+            state.problem.variable_upper_bounds_[var_idx] = value;
+          } else if (bound_type == "FR") {
+            state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+            state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits<f_t>::infinity();
+          } else if (bound_type == "MI") {
+            state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+          } else if (bound_type == "PL") {
+            state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits<f_t>::infinity();
+          } else if (bound_type == "BV") {
+            state.problem.variable_lower_bounds_[var_idx] = 0;
+            state.problem.variable_upper_bounds_[var_idx] = 1;
+            state.problem.var_types_[var_idx]             = 'I';
+            local.saw_integer_type                        = true;
+          } else if (bound_type == "LI") {
+            state.problem.variable_lower_bounds_[var_idx] = value;
+            state.problem.var_types_[var_idx]             = 'I';
+            local.saw_integer_type                        = true;
+          } else if (bound_type == "UI") {
+            state.problem.variable_upper_bounds_[var_idx] = value;
+            if (first_bound_for_var && value < f_t{0}) {
+              state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+              local.saw_negative_upper                      = true;
+            }
+            state.problem.var_types_[var_idx] = 'I';
+            local.saw_integer_type            = true;
+          } else {
+            std::snprintf(local.error_msg,
+                          sizeof(local.error_msg),
+                          "unknown bound type: %.*s",
+                          (int)bound_type.size(),
+                          bound_type.data());
+            local.error_ptr = cursor.ptr;
+            break;
+          }
+
+          expect_eol(cursor);
+        }
+      } catch (const std::exception& e) {
+        std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", e.what());
+        local.error_ptr = cursor.ptr;
+      }
+    }
+  }
+
+  size_t dense_misses     = 0;
+  size_t non_strict_order = 0;
+  size_t overlap_chunks   = 0;
+  size_t prev_max         = SIZE_MAX;
+  for (int t = 0; t < num_threads; ++t) {
+    const auto& local = stats[(size_t)t];
+    if (local.error_ptr != nullptr) {
+      cursor.ptr = local.error_ptr;
+      cursor.error("%s", local.error_msg);
+    }
+    dense_misses += local.dense_misses;
+    non_strict_order += local.non_strict_order;
+    if (local.lines > 0) {
+      if (prev_max != SIZE_MAX && local.min_var <= prev_max) { overlap_chunks++; }
+      prev_max = local.max_var;
+    }
+  }
+
+  const bool order_safe = dense_misses == 0 && non_strict_order == 0 && overlap_chunks == 0;
+
+  if (!order_safe) {
+    cursor.ptr = bounds_body_start;
+    return false;
+  }
+
+  {
+    scoped_timer_t timer("bounds_integer_defaults");
+    for (size_t i = 0; i < n_vars; ++i) {
+      if (!bound_seen[i] && state.problem.var_types_[i] == 'I') {
+        state.problem.variable_lower_bounds_[i] = f_t{0};
+        state.problem.variable_upper_bounds_[i] = f_t{1};
+      }
+    }
+  }
+
+  cursor.ptr = bounds_body_end;
+  return true;
+}
+
+// Parses the optional BOUNDS section into variable_lower_bounds_/variable_upper_bounds_.
+// Both vectors are resized to n_vars first (new entries default to 0 and +infinity). With
+// allow_parallel_dense set, parse_bounds_section_parallel_dense is tried before the serial
+// loop; if it declines, the bounds are refilled with the defaults and re-read serially.
+// Integer variables that never receive an explicit bound finish with bounds [0, 1].
+// Unknown variable names and unknown bound types are reported through cursor.error.
+template <typename i_t, typename f_t>
+static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
+                                 cursor_t& cursor,
+                                 bool allow_parallel_dense = false)
+{
+  size_t n_vars = (size_t)state.problem.n_vars_;
+
+  // Initialize bounds with defaults
+  {
+    scoped_timer_t timer("bounds_init_defaults");
+    const bool parallel_init =
+      n_vars >= MPS_BOUNDS_PARALLEL_INIT_MIN_VARS && omp_get_max_threads() >= 2;
+
+    if (parallel_init) {
+#pragma omp parallel sections num_threads(2)
+      {
+#pragma omp section
+        {
+          state.problem.variable_lower_bounds_.resize(n_vars, f_t{0});
+        }
+#pragma omp section
+        {
+          state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits<f_t>::infinity());
+        }
+      }
+    } else {
+      state.problem.variable_lower_bounds_.resize(n_vars, f_t{0});
+      state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits<f_t>::infinity());
+    }
+  }
+
+  {
+    scoped_timer_t timer("bounds_madvise_pretouch");
+    materialize_vector_hugepages("variable_lower_bounds",
+                                 state.problem.variable_lower_bounds_,
+                                 materialize_touch_t::write_4kb);
+    materialize_vector_hugepages("variable_upper_bounds",
+                                 state.problem.variable_upper_bounds_,
+                                 materialize_touch_t::write_4kb);
+  }
+
+  std::vector<uint64_t> bound_seen((n_vars + 63) / 64, 0);
+  auto has_bound = [&](size_t var_idx) {
+    return (bound_seen[var_idx >> 6] & (uint64_t{1} << (var_idx & 63))) != 0;
+  };
+  auto mark_bound = [&](size_t var_idx) {
+    bound_seen[var_idx >> 6] |= uint64_t{1} << (var_idx & 63);
+  };
+  auto apply_unspecified_integer_bounds = [&]() {
+    scoped_timer_t timer("bounds_integer_defaults");
+    for (size_t i = 0; i < n_vars; ++i) {
+      if (!has_bound(i) && state.problem.var_types_[i] == 'I') {
+        state.problem.variable_lower_bounds_[i] = f_t{0};
+        state.problem.variable_upper_bounds_[i] = f_t{1};
+      }
+    }
+  };
+
+  if (!accept_section(cursor, "BOUNDS")) {
+    apply_unspecified_integer_bounds();
+    return;
+  }
+
+  const char* bounds_body_start = cursor.ptr;
+  const char* bounds_body_end =
+    allow_parallel_dense ? find_bounds_body_end(bounds_body_start, cursor.end) : cursor.end;
+  if (allow_parallel_dense) {
+    if (parse_bounds_section_parallel_dense(
+          state, cursor, bounds_body_start, bounds_body_end, n_vars)) {
+      return;
+    }
+    {
+      scoped_timer_t timer("bounds_parallel_fallback_reset");
+      std::fill(state.problem.variable_lower_bounds_.begin(),
+                state.problem.variable_lower_bounds_.end(),
+                f_t{0});
+      std::fill(state.problem.variable_upper_bounds_.begin(),
+                state.problem.variable_upper_bounds_.end(),
+                std::numeric_limits<f_t>::infinity());
+    }
+  }
+
+  size_t hint_idx = 0;
+  {
+    scoped_timer_t timer("parse_bounds");
+    for (;;) {
+      bool done = cursor.done() || peek(cursor) == "RANGES" || peek(cursor) == "ENDATA" ||
+                  is_quadratic_section_start(cursor.ptr, cursor.end);
+      if (done) break;
+
+      auto bound_type = cursor.read_field();
+      auto bound_name = cursor.read_field();
+      (void)bound_name;
+      auto var_name = cursor.read_field();
+      if (__unlikely(!var_name.empty() && var_name[0] == '$')) {
+        cursor.skip_to_eol();
+        expect_eol(cursor);
+        continue;
+      }
+
+      // optimized lookup using hint (bounds often in same order as columns)
+      size_t var_idx = SIZE_MAX;
+      if (__likely(state.col_dense_ordered)) {
+        var_idx = state.col_lookup_dense_ordered(var_name);
+        if (var_idx == SIZE_MAX) {
+          cursor.error(
+            "unknown variable name in BOUNDS: %.*s", (int)var_name.size(), var_name.data());
+        }
+      } else if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) {
+        var_idx = hint_idx + 1;
+      } else if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) {
+        var_idx = hint_idx;
+      } else {
+        size_t search_start = hint_idx + 2;
+        size_t search_end   = n_vars;
+
+      search_loop:
+        for (size_t i = search_start; i < search_end; ++i) {
+          if (state.var_names_sv[i] == var_name) {
+            var_idx = i;
+            goto found;
+          }
+        }
+        if (search_start != 0) {
+          search_end   = hint_idx;
+          search_start = 0;
+          goto search_loop;
+        }
+        cursor.error(
+          "unknown variable name in BOUNDS: %.*s", (int)var_name.size(), var_name.data());
+      }
+    found:
+      hint_idx                 = var_idx;
+      bool first_bound_for_var = !has_bound(var_idx);
+
+      f_t value = 0;
+      accept_comment(cursor);
+      if (!cursor.eol()) {
+        value = (f_t)expect_number(cursor);
+        accept_comment(cursor);
+      }
+
+      if (bound_type == "LO") {
+        state.problem.variable_lower_bounds_[var_idx] = value;
+      } else if (bound_type == "UP") {
+        state.problem.variable_upper_bounds_[var_idx] = value;
+        if (first_bound_for_var && value < f_t{0}) {
+          state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+        }
+      } else if (bound_type == "FX") {
+        state.problem.variable_lower_bounds_[var_idx] = value;
+        state.problem.variable_upper_bounds_[var_idx] = value;
+      } else if (bound_type == "FR") {
+        state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+        state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits<f_t>::infinity();
+      } else if (bound_type == "MI") {
+        state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+      } else if (bound_type == "PL") {
+        state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits<f_t>::infinity();
+      } else if (bound_type == "BV") {
+        state.problem.variable_lower_bounds_[var_idx] = 0;
+        state.problem.variable_upper_bounds_[var_idx] = 1;
+        state.problem.var_types_[var_idx]             = 'I';
+      } else if (bound_type == "LI") {
+        state.problem.variable_lower_bounds_[var_idx] = value;
+        state.problem.var_types_[var_idx]             = 'I';
+      } else if (bound_type == "UI") {
+        state.problem.variable_upper_bounds_[var_idx] = value;
+        if (first_bound_for_var && value < f_t{0}) {
+          state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+        }
+        state.problem.var_types_[var_idx] = 'I';
+      } else {
+        cursor.error("unknown bound type: %.*s", (int)bound_type.size(), bound_type.data());
+      }
+      mark_bound(var_idx);
+
+      expect_eol(cursor);
+    }
+  }
+  apply_unspecified_integer_bounds();
+}
+
+// Derives constraint_lower_bounds_/constraint_upper_bounds_ from the rows, then applies RANGES.
+template <typename i_t, typename f_t>
+static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+{
+  scoped_timer_t timer("parse_ranges");
+
+  // Seed each constraint's [lower, upper] interval from its row type and right-hand side.
+  // This runs whether or not a RANGES section follows.
+  auto& problem = state.problem;
+  auto& lower   = problem.constraint_lower_bounds_;
+  auto& upper   = problem.constraint_upper_bounds_;
+  const f_t inf = std::numeric_limits<f_t>::infinity();
+  lower.resize((size_t)problem.n_constraints_);
+  upper.resize((size_t)problem.n_constraints_);
+
+  for (i_t row = 0; row < problem.n_constraints_; ++row) {
+    const char kind = problem.row_types_[row];
+    const f_t rhs   = problem.b_[row];
+    // Row types other than E, L and G keep whatever resize() left in their slots.
+    if (kind == 'E' || kind == 'G') { lower[row] = rhs; }
+    if (kind == 'E' || kind == 'L') { upper[row] = rhs; }
+    if (kind == 'L') { lower[row] = -inf; }
+    if (kind == 'G') { upper[row] = inf; }
+  }
+
+  if (!accept_section(cursor, "RANGES")) { return; }
+
+  // Applies one RANGES entry. E rows widen upward for a non-negative value and downward
+  // otherwise; L rows always widen downward; G rows always widen upward. Other row types
+  // are left untouched.
+  auto apply_range = [&](std::string_view row_name, f_t range_val) {
+    const size_t row = state.row_lookup(row_name);
+    if (row == SIZE_MAX) {
+      cursor.error("unknown row name in RANGES: %.*s", (int)row_name.size(), row_name.data());
+    }
+
+    const char kind      = problem.row_types_[row];
+    const f_t width      = std::abs(range_val);
+    const bool grow_up   = kind == 'G' || (kind == 'E' && range_val >= 0);
+    const bool grow_down = kind == 'L' || (kind == 'E' && !(range_val >= 0));
+    // At most one of the two flags is set.
+    if (grow_up) { upper[row] = lower[row] + width; }
+    if (grow_down) { lower[row] = upper[row] - width; }
+  };
+
+  for (;;) {
+    // Stop at end of input or at the header of whichever section comes next.
+    if (cursor.ptr >= cursor.end) { break; }
+    if (peek(cursor) == "BOUNDS" || peek(cursor) == "ENDATA") { break; }
+    if (is_quadratic_section_start(cursor.ptr, cursor.end)) { break; }
+
+    // The leading field is read and discarded.
+    (void)cursor.read_field();
+    if (accept_comment(cursor)) {
+      expect_eol(cursor);
+      continue;
+    }
+
+    const auto first_row  = cursor.read_field();
+    const f_t first_value = (f_t)expect_number(cursor);
+    apply_range(first_row, first_value);
+    accept_comment(cursor);
+
+    // A second (row, value) pair may follow on the same line.
+    if (!cursor.eol()) {
+      const auto second_row = cursor.read_field();
+      if (__unlikely(!second_row.empty() && second_row[0] == '$')) {
+        cursor.skip_to_eol();
+        expect_eol(cursor);
+        continue;
+      }
+      const f_t second_value = (f_t)expect_number(cursor);
+      apply_range(second_row, second_value);
+      accept_comment(cursor);
+    }
+    expect_eol(cursor);
+  }
+}
+
+template <typename i_t, typename f_t>
+static void build_var_name_map_if_needed(parse_state_t<i_t, f_t>& state)
+{
+  if (!state.var_names_map.empty() || state.col_dense_ordered) { return; }  // nothing to build
+  scoped_timer_t timer("quadratic_build_var_name_map");
+  state.var_names_map.reserve((size_t)state.problem.n_vars_ * 2);
+  for (size_t var_idx = 0; var_idx < state.var_names_sv.size(); ++var_idx) {
+    state.var_names_map.emplace(state.var_names_sv[var_idx], var_idx);
+  }
+}
+
+template <typename i_t, typename f_t>
+static size_t lookup_quadratic_var(parse_state_t<i_t, f_t>& state, std::string_view name)
+{
+  if (state.col_dense_ordered) { return state.col_lookup_dense_ordered(name); }
+  const auto hit = state.var_names_map.find(name);  // SIZE_MAX below means "no such variable"
+  return hit != state.var_names_map.end() ? hit->second : SIZE_MAX;
+}
+
+template <typename i_t, typename f_t>
+static void build_quadratic_csr(parse_state_t<i_t, f_t>& state,
+                                const std::vector<std::tuple<i_t, i_t, f_t>>& entries,
+                                bool symmetric_upper_triangular)
+{
+  // Rebuilds the Q_objective_* CSR arrays from (row, col, value) triplets; values are stored
+  // times 0.5, and symmetric_upper_triangular also stores each off-diagonal triplet transposed.
+  scoped_timer_t timer("build_quadratic_csr");
+  const size_t n_vars = (size_t)state.problem.n_vars_;
+  if (entries.empty()) { return; }
+
+  struct Triplet {
+    size_t row;
+    size_t col;
+    size_t order;  // position in emission order; last sort key
+    f_t value;
+  };
+
+  std::vector<Triplet> triplets;
+  triplets.reserve(symmetric_upper_triangular ? entries.size() * 2 : entries.size());
+  for (const auto& [row_i, col_i, value] : entries) {
+    const size_t row = (size_t)row_i;
+    const size_t col = (size_t)col_i;
+    triplets.push_back({row, col, triplets.size(), value});
+    const bool mirror = symmetric_upper_triangular && row != col;
+    if (mirror) { triplets.push_back({col, row, triplets.size(), value}); }
+  }
+
+  std::stable_sort(triplets.begin(), triplets.end(), [](const auto& lhs, const auto& rhs) {
+    return std::tie(lhs.row, lhs.col, lhs.order) < std::tie(rhs.row, rhs.col, rhs.order);
+  });
+
+  auto& values  = state.problem.Q_objective_values_;
+  auto& indices = state.problem.Q_objective_indices_;
+  auto& offsets = state.problem.Q_objective_offsets_;
+  values.clear();
+  indices.clear();
+  offsets.assign(n_vars + 1, i_t{0});
+  values.reserve(triplets.size());
+  indices.reserve(triplets.size());
+
+  // offsets[r] ends up as the number of stored entries whose row is below r.
+  size_t closed_rows = 0;
+  for (const auto& t : triplets) {
+    for (; closed_rows < t.row; ++closed_rows) {
+      offsets[closed_rows + 1] = (i_t)values.size();
+    }
+    values.push_back(t.value * f_t{0.5});
+    indices.push_back((i_t)t.col);
+  }
+  for (; closed_rows < n_vars; ++closed_rows) {
+    offsets[closed_rows + 1] = (i_t)values.size();
+  }
+}
+
+template <typename i_t, typename f_t>
+[[maybe_unused]] static void parse_quadratic_sections(parse_state_t<i_t, f_t>& state,
+                                                      cursor_t& cursor)
+{
+  // Collects QUADOBJ/QMATRIX entries; the CSR is built from QUADOBJ if non-empty, else QMATRIX.
+  scoped_timer_t timer("parse_quadratic_sections");
+  if (cursor.done() || peek(cursor) == "ENDATA") { return; }
+  if (!is_quadratic_section_start(cursor.ptr, cursor.end)) { return; }
+
+  build_var_name_map_if_needed(state);
+  std::vector<std::tuple<i_t, i_t, f_t>> quadobj_entries;
+  std::vector<std::tuple<i_t, i_t, f_t>> qmatrix_entries;
+  std::vector<std::tuple<i_t, i_t, f_t>>* active_entries = nullptr;  // list being filled
+
+  auto resolve = [&](std::string_view name) {
+    const size_t idx = lookup_quadratic_var(state, name);
+    if (idx == SIZE_MAX) {
+      cursor.error("unknown variable name in QUADOBJ/QMATRIX: %.*s", (int)name.size(), name.data());
+    }
+    return idx;
+  };
+  auto skip_line = [&]() {
+    cursor.skip_to_eol();
+    expect_eol(cursor);
+  };
+
+  for (;;) {
+    if (cursor.ptr >= cursor.end || peek(cursor) == "ENDATA") { break; }
+    if (accept_section(cursor, "QUADOBJ")) {
+      active_entries = &quadobj_entries;
+      continue;
+    }
+    if (accept_section(cursor, "QMATRIX")) {
+      active_entries = &qmatrix_entries;
+      continue;
+    }
+    if (active_entries == nullptr) { break; }
+
+    const auto var1 = cursor.read_field();
+    if (__unlikely(var1.empty())) { break; }
+    if (__unlikely(var1[0] == '$')) {
+      skip_line();
+      continue;
+    }
+    const auto var2 = cursor.read_field();
+    if (__unlikely(!var2.empty() && var2[0] == '$')) {
+      skip_line();
+      continue;
+    }
+    const f_t value         = (f_t)expect_number(cursor);
+    const size_t first_idx  = resolve(var1);
+    const size_t second_idx = resolve(var2);
+    active_entries->emplace_back((i_t)first_idx, (i_t)second_idx, value);
+    accept_comment(cursor);
+    expect_eol(cursor);
+  }
+
+  const bool use_quadobj = !quadobj_entries.empty();
+  if (use_quadobj || !qmatrix_entries.empty()) {
+    build_quadratic_csr(state, use_quadobj ? quadobj_entries : qmatrix_entries, use_quadobj);
+  }
+}
+
+template <typename i_t, typename f_t>
+static void set_cursor_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  state.cursor.end = range.end;  // re-aim the state's own cursor at [range.begin, range.end)
+  state.cursor.ptr = range.begin;
+}
+
+template <typename i_t, typename f_t>
+static void parse_header_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  set_cursor_range(state, range);  // aim state.cursor at the header range
+  accept_comment_line(state.cursor);
+  if (state.cursor.done()) { return; }  // range exhausted: no header sections to read
+  parse_name_section(state);
+  parse_objsense_section(state);
+  parse_objname_section(state);
+}
+
+template <typename i_t, typename f_t>
+static void parse_rows_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  set_cursor_range(state, range);  // aim state.cursor at the ROWS range
+  parse_rows_section(state, range.end);
+}
+
+template <typename i_t, typename f_t>
+static void parse_columns_range(parse_state_t<i_t, f_t>& state,
+                                mps_phase_range_t range,
+                                int num_threads = 0)
+{
+  set_cursor_range(state, range);  // aim state.cursor at the COLUMNS range
+  parse_columns_section_parallel(state, num_threads, range.end);  // num_threads forwarded as-is
+}
+
+template <typename i_t, typename f_t>
+static void parse_rhs_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  if (!range.present) { return; }  // no RHS range present: nothing to parse
+  cursor_t cursor(range.begin, (size_t)(range.end - range.begin));  // local cursor over the range
+  parse_rhs_section(state, cursor);
+}
+
+template <typename i_t, typename f_t>
+static void parse_bounds_range(parse_state_t<i_t, f_t>& state,
+                               mps_phase_range_t range,
+                               const char* fallback_ptr)
+{
+  // A present BOUNDS range is parsed in place with the parallel dense path allowed. Without
+  // one, parse_bounds_section still runs (parallel path off) over a 16-byte window at
+  // fallback_ptr; it sets up the default bounds before it looks for the section header.
+  const bool present = range.present;
+  const char* begin  = present ? range.begin : fallback_ptr;
+  cursor_t cursor(begin, present ? (size_t)(range.end - range.begin) : 16);
+  parse_bounds_section(state, cursor, present);
+}
+
+template <typename i_t, typename f_t>
+static void parse_ranges_range(parse_state_t<i_t, f_t>& state,
+                               mps_phase_range_t range,
+                               const char* fallback_ptr)
+{
+  // A present RANGES range is parsed in place. Without one, parse_ranges_section still runs
+  // over a 16-byte window at fallback_ptr; it derives the constraint bounds from the row
+  // types and right-hand sides before it looks for the section header.
+  const char* begin  = range.present ? range.begin : fallback_ptr;
+  const size_t bytes = range.present ? (size_t)(range.end - range.begin) : 16;
+  cursor_t cursor(begin, bytes);
+  parse_ranges_section(state, cursor);
+}
+
+template <typename i_t, typename f_t>
+static void parse_quadratic_range(parse_state_t<i_t, f_t>& state,
+                                  mps_phase_range_t range,
+                                  const char* fallback_ptr)
+{
+  // Quadratic sections are only detected here, never parsed: if the inspected bytes begin a
+  // quadratic section, the file is rejected with std::logic_error.
+  // state is not used by this check.
+  (void)state;
+
+  // Inspect the quadratic range when one is present, else a 16-byte window at fallback_ptr.
+  const char* begin  = range.present ? range.begin : fallback_ptr;
+  const size_t bytes = range.present ? (size_t)(range.end - range.begin) : 16;
+  cursor_t cursor(begin, bytes);
+
+  if (cursor.done()) { return; }
+  if (!is_quadratic_section_start(cursor.ptr, cursor.end)) { return; }
+
+  throw std::logic_error(
+    "experimental fast MPS reader currently supports LP/MIP MPS files only; "
+    "quadratic MPS sections are not supported");
+}
+
+template <typename i_t, typename f_t>
+static void materialize_problem_names(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("materialize_problem_names");
+  int num_threads = phase_thread_count(MPS_NAMES_THREAD_CAP);
+  // Turn the parser's non-owning string_views into owning std::strings on the problem.
+  {
+    scoped_timer_t scalar_timer("materialize_problem_scalar_names");
+    state.problem.objective_name_ = std::string(state.objective_name_sv);
+    state.problem.problem_name_   = std::string(state.problem_name_sv);
+  }
+
+  {
+    scoped_timer_t row_timer("materialize_problem_row_names");
+    const size_t n_rows = state.row_names_sv.size();
+    state.problem.row_names_.resize(n_rows);
+    // Author's rationale for the parallel path: row names usually fit the small-string
+    // buffer, so concurrent assigns mostly stay off the heap.
+    if (n_rows < 1'000'000 || num_threads <= 1) {
+      for (size_t r = 0; r < n_rows; ++r) {
+        state.problem.row_names_[r].assign(state.row_names_sv[r]);
+      }
+    } else {
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (size_t r = 0; r < n_rows; ++r) {
+        state.problem.row_names_[r].assign(state.row_names_sv[r]);
+      }
+    }
+  }
+
+  {
+    scoped_timer_t var_timer("materialize_problem_var_names");
+    size_t n = state.col_dense_ordered ? (size_t)state.problem.n_vars_ : state.var_names_sv.size();
+    state.problem.var_names_.resize(n);
+    const bool use_team = n >= 1'000'000 && num_threads > 1;
+    if (state.col_dense_ordered && use_team) {
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (size_t c = 0; c < n; ++c) {
+        state.dense_col_name(c, state.problem.var_names_[c]);
+      }
+    } else if (state.col_dense_ordered) {
+      for (size_t c = 0; c < n; ++c) {
+        state.dense_col_name(c, state.problem.var_names_[c]);
+      }
+    } else if (use_team) {
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (size_t c = 0; c < n; ++c) {
+        state.problem.var_names_[c].assign(state.var_names_sv[c]);
+      }
+    } else {
+      for (size_t c = 0; c < n; ++c) {
+        state.problem.var_names_[c].assign(state.var_names_sv[c]);
+      }
+    }
+  }
+}
+
+// Parses one MPS stream into an mps_data_model_t using OpenMP tasks: a producer task runs the
+// stream's decode tasks and one parser task per section runs under depend() ordering.
+template <typename Stream, typename i_t, typename f_t>
+static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_stream(
+  Stream& stream, const char* total_timer_name, const char* producer_task_name)
+{
+  auto total_timer = std::make_unique<scoped_timer_t>(total_timer_name);
+  omp_set_max_active_levels(2);
+
+  input_stream_view_t input = stream.view();
+  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> problem;
+  problem.n_vars_                   = 0;
+  problem.n_constraints_            = 0;
+  problem.nnz_                      = 0;
+  problem.maximize_                 = false;
+  problem.objective_scaling_factor_ = f_t{1};
+  problem.objective_offset_         = f_t{0};
+  // Capacity heuristic derived from the stream's size hint (hint / 1000, with lower clamps).
+  std::size_t reserve_size = std::max<std::size_t>(stream.reserve_size_hint(), 1024 * 1024);
+  std::size_t reserve_dim  = std::max((size_t)1000, reserve_size / 1000);
+  problem.A_offsets_.reserve(reserve_dim);
+  problem.b_.reserve(reserve_dim);
+  problem.variable_lower_bounds_.reserve(reserve_dim);
+  problem.variable_upper_bounds_.reserve(reserve_dim);
+  problem.var_types_.reserve(reserve_dim);
+  problem.row_types_.reserve(reserve_dim);
+  problem.row_names_.reserve(reserve_dim);
+  problem.var_names_.reserve(reserve_dim);
+  problem.constraint_lower_bounds_.reserve(reserve_dim);
+  problem.constraint_upper_bounds_.reserve(reserve_dim);
+
+  cursor_t cursor(input.data, 0);
+  parse_state_t<i_t, f_t> state(problem, cursor);
+  state.row_names_sv.reserve(reserve_dim);
+
+  auto phase_end = [](const char*) { flush_timers(); };
+  // Shared error state: the first exception from any task is kept and rethrown at the end.
+  std::mutex task_error_mutex;
+  std::exception_ptr first_task_error = nullptr;
+  std::atomic<bool> task_failed{false};
+
+  auto mark_task_error = [&](std::exception_ptr eptr) {
+    {
+      std::lock_guard<std::mutex> lock(task_error_mutex);
+      if (!first_task_error) { first_task_error = eptr; }
+    }
+    task_failed.store(true, std::memory_order_release);
+  };
+
+  auto run_parser_task = [&](auto&& fn) {
+    if (task_failed.load(std::memory_order_acquire)) { return; }
+    try {
+      fn();
+    } catch (...) {
+      mark_task_error(std::current_exception());
+    }
+  };
+  // Publishes an empty, not-present range for every phase (used when the producer fails).
+  auto unblock_phase_waiters_after_error = [&]() {
+    mps_phase_range_t empty{input.data, input.data, false};
+    input.registry->publish(mps_phase_kind::header, empty);
+    input.registry->publish(mps_phase_kind::rows, empty);
+    input.registry->publish(mps_phase_kind::columns, empty);
+    input.registry->publish(mps_phase_kind::rhs, empty);
+    input.registry->publish(mps_phase_kind::bounds, empty);
+    input.registry->publish(mps_phase_kind::ranges, empty);
+    input.registry->publish(mps_phase_kind::quadratic, empty);
+  };
+  // Placeholder variables used only as depend() targets that order the tasks below.
+  int header_ready = 0, rows_ready = 0, columns_ready = 0;
+  int rhs_ready = 0, bounds_ready = 0, ranges_ready = 0, quadratic_ready = 0;
+  int header_done = 0, rows_done = 0, columns_done = 0;
+  int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0;
+
+#pragma omp parallel num_threads(omp_get_max_threads())
+  {
+    std::string thread_name = "omp-parser-" + std::to_string(omp_get_thread_num());
+    nvtx::name_current_thread(thread_name.c_str());
+
+#pragma omp single
+    {
+      omp_event_handle_t ev_header;
+#pragma omp task detach(ev_header) depend(out : header_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::header, ev_header);
+      }
+      omp_event_handle_t ev_rows;
+#pragma omp task detach(ev_rows) depend(out : rows_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::rows, ev_rows);
+      }
+      omp_event_handle_t ev_columns;
+#pragma omp task detach(ev_columns) depend(out : columns_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::columns, ev_columns);
+      }
+      omp_event_handle_t ev_rhs;
+#pragma omp task detach(ev_rhs) depend(out : rhs_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::rhs, ev_rhs);
+      }
+      omp_event_handle_t ev_bounds;
+#pragma omp task detach(ev_bounds) depend(out : bounds_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::bounds, ev_bounds);
+      }
+      omp_event_handle_t ev_ranges;
+#pragma omp task detach(ev_ranges) depend(out : ranges_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::ranges, ev_ranges);
+      }
+      omp_event_handle_t ev_quadratic;
+#pragma omp task detach(ev_quadratic) depend(out : quadratic_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::quadratic, ev_quadratic);
+      }
+      // Producer task: runs the decode tasks; a failure is recorded and every phase published.
+#pragma omp task
+      {
+        MPS_NVTX_RANGE(producer_task_name, nvtx::colors::io);
+        try {
+          stream.run_decode_tasks();
+        } catch (...) {
+          mark_task_error(std::current_exception());
+          unblock_phase_waiters_after_error();
+        }
+      }
+
+#pragma omp task depend(in : header_ready) depend(out : header_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_header", nvtx::colors::generic);
+          parse_header_range(state, input.registry->range(mps_phase_kind::header));
+          phase_end("header");
+        });
+      }
+
+#pragma omp task depend(in : rows_ready, header_done) depend(out : rows_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_rows", nvtx::colors::rows);
+          parse_rows_range(state, input.registry->range(mps_phase_kind::rows));
+          phase_end("rows");
+        });
+      }
+
+#pragma omp task depend(in : columns_ready, rows_done) depend(out : columns_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_columns", nvtx::colors::columns);
+          parse_columns_range(state, input.registry->range(mps_phase_kind::columns));
+          phase_end("columns");
+        });
+      }
+
+#pragma omp task depend(in : columns_done) depend(out : names_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_materialize_names", nvtx::colors::names);
+          scoped_timer_t timer("materialize_problem_names_task");
+          materialize_problem_names(state);
+        });
+      }
+
+#pragma omp task depend(in : rhs_ready, columns_done) depend(out : rhs_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_rhs", nvtx::colors::rhs);
+          parse_rhs_range(state, input.registry->range(mps_phase_kind::rhs));
+          phase_end("rhs");
+        });
+      }
+
+#pragma omp task depend(in : ranges_ready, rhs_done) depend(out : ranges_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_ranges", nvtx::colors::ranges);
+          parse_ranges_range(state, input.registry->range(mps_phase_kind::ranges), input.data);
+          phase_end("ranges");
+        });
+      }
+
+#pragma omp task depend(in : bounds_ready, columns_done) depend(out : bounds_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_bounds", nvtx::colors::bounds);
+          parse_bounds_range(state, input.registry->range(mps_phase_kind::bounds), input.data);
+          phase_end("bounds");
+        });
+      }
+
+#pragma omp task depend(in : quadratic_ready, columns_done) depend(out : quadratic_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_quadratic", nvtx::colors::generic);
+          parse_quadratic_range(
+            state, input.registry->range(mps_phase_kind::quadratic), input.data);
+          phase_end("quadratic");
+        });
+      }
+
+#pragma omp taskwait
+    }
+  }
+  if (first_task_error) { std::rethrow_exception(first_task_error); }
+  input.size = stream.size();
+  // Sections may appear in either order: resume after whichever present range ends last.
+  const auto rhs_range = input.registry->range(mps_phase_kind::rhs);
+  const char* tail     = rhs_range.present ? rhs_range.end : nullptr;
+  for (auto kind : {mps_phase_kind::ranges, mps_phase_kind::bounds, mps_phase_kind::quadratic}) {
+    const auto later = input.registry->range(kind);
+    if (later.present && (tail == nullptr || tail < later.end)) { tail = later.end; }
+  }
+  cursor.ptr = tail != nullptr ? tail : rhs_range.end;
+  cursor.end = input.data + input.size;
+  if (!cursor.done()) { expect(cursor, "ENDATA"); }
+  total_timer.reset();
+  flush_timers();
+  return problem;
+}
+
+template <typename i_t, typename f_t>
+cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
+  const std::string& path, FileReadMethod read_method)
+{
+  const FileReadMethod method = effective_file_read_method(path, read_method);  // resolved
+  if (method == FileReadMethod::Read) {
+    RawInputStream raw_stream(path);
+    return parse_mps_fast_stream<RawInputStream, i_t, f_t>(
+      raw_stream, "parse_mps_fast_file_raw (total)", "task_raw_read");
+  }
+  if (method == FileReadMethod::Lz4) {
+    Lz4InputStream lz4_stream(path);
+    return parse_mps_fast_stream<Lz4InputStream, i_t, f_t>(
+      lz4_stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode");
+  }
+  throw std::runtime_error("experimental fast MPS reader supports raw and LZ4 inputs only");
+}
+// Explicit instantiations for the four (i_t, f_t) combinations listed below.
+template cuopt::linear_programming::io::mps_data_model_t<int, float> parse_mps_fast_file(
+  const std::string& path, FileReadMethod read_method);
+template cuopt::linear_programming::io::mps_data_model_t<int, double> parse_mps_fast_file(
+  const std::string& path, FileReadMethod read_method);
+template cuopt::linear_programming::io::mps_data_model_t<int64_t, float> parse_mps_fast_file(
+  const std::string& path, FileReadMethod read_method);
+template cuopt::linear_programming::io::mps_data_model_t<int64_t, double> parse_mps_fast_file(
+  const std::string& path, FileReadMethod read_method);
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
new file mode 100644
index 0000000000..20e9901024
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
@@ -0,0 +1,19 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "file_reader.hpp"
+
+#include <cuopt/linear_programming/io/mps_data_model.hpp>
+
+#include <cstddef>
+#include <string>
+
+namespace mps_fast {
+
+template <typename i_t, typename f_t>
+cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
+  const std::string& path, FileReadMethod read_method = FileReadMethod::Read);
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
new file mode 100644
index 0000000000..9e5777efc2
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
@@ -0,0 +1,23 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#include <cuopt/linear_programming/io/parser.hpp>
+
+#include "fast_parser.hpp"
+
+namespace cuopt::linear_programming::io {
+
+template <typename i_t, typename f_t>
+mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path)
+{
+  return mps_fast::parse_mps_fast_file<i_t, f_t>(mps_file_path);
+}
+
+template mps_data_model_t<int, float> read_mps_fast_experimental(const std::string& mps_file_path);
+template mps_data_model_t<int, double> read_mps_fast_experimental(const std::string& mps_file_path);
+
+}  // namespace cuopt::linear_programming::io
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
new file mode 100644
index 0000000000..819b1948bf
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -0,0 +1,252 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "file_reader.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace mps_fast {
+// Arena cursors used by hash_table_push_ptr() (declared extern in hash_table_smallstr.hpp).
+char* string_buffer;      // zero-initialized (null); nothing in this file assigns it
+char* string_buffer_ptr;  // NOTE(review): shared mutable global, not synchronized here
+
+namespace {
+
+constexpr std::size_t raw_input_window_bytes     = 64ull * 1024ull * 1024ull;
+constexpr std::size_t raw_input_max_read_threads = 8;
+
+bool path_has_suffix(const std::string& path, const char* suffix) noexcept
+{
+  // True when `path` ends with `suffix`; a path shorter than the suffix never matches.
+  const std::size_t tail_len = std::strlen(suffix);
+  return path.size() >= tail_len && std::equal(suffix, suffix + tail_len, path.end() - tail_len);
+}
+
+}  // namespace
+
+namespace {
+
+class FileDescriptor {
+ public:
+  explicit FileDescriptor(int fd) : fd_(fd) {}
+  ~FileDescriptor()
+  {
+    if (fd_ >= 0) { ::close(fd_); }
+  }
+
+  FileDescriptor(const FileDescriptor&)            = delete;
+  FileDescriptor& operator=(const FileDescriptor&) = delete;
+
+  int get() const noexcept { return fd_; }
+  bool valid() const noexcept { return fd_ >= 0; }
+
+ private:
+  int fd_;
+};
+
+std::size_t get_file_size(int fd, const std::string& path)
+{
+  struct stat st;
+  if (::fstat(fd, &st) != 0) {
+    throw std::runtime_error("Failed to stat file '" + path + "': " + std::strerror(errno));
+  }
+  return static_cast<std::size_t>(st.st_size);
+}
+
+std::size_t system_page_size()
+{
+  static std::size_t page_size = [] {
+    long value = ::sysconf(_SC_PAGESIZE);
+    return value > 0 ? static_cast<std::size_t>(value) : static_cast<std::size_t>(4096);
+  }();
+  return page_size;
+}
+
+std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
+{
+  // Smallest multiple of `alignment` that is >= value; an alignment of 0 leaves value as is.
+  const std::size_t excess = (alignment != 0) ? value % alignment : 0;
+  if (excess == 0) { return value; }
+  // Bytes missing to reach the next multiple; refuse to wrap around size_t.
+  const std::size_t padding  = alignment - excess;
+  const std::size_t headroom = std::numeric_limits<std::size_t>::max() - value;
+  if (padding > headroom) { throw std::runtime_error("allocation size overflow"); }
+  return value + padding;
+}
+
+}  // namespace
+RawInputStream::RawInputStream(const std::string& path) : path_(path)
+{
+  MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io);
+  fd_ = ::open(path.c_str(), O_RDONLY);
+  if (fd_ < 0) {
+    throw std::runtime_error("Failed to open raw MPS file '" + path + "': " + std::strerror(errno));
+  }
+  try {  // sizing, mapping and allocation below may throw while fd_ is already open
+    file_size_    = get_file_size(fd_, path);
+    window_bytes_ = raw_input_window_bytes;
+    window_count_ = std::max<std::size_t>(1, (file_size_ + window_bytes_ - 1) / window_bytes_);
+    output_mapped_size_ =
+      round_up_to_multiple(std::max<std::size_t>(file_size_, 1), system_page_size());
+    output_region_ = mmap_region_t::anonymous(
+      output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer");
+    output_data_ = output_region_.char_data();
+    output_region_.advise(MADV_HUGEPAGE);
+    block_done_.resize(window_count_, 0);
+    block_end_.resize(window_count_, 0);
+    section_scanner_ =
+      std::make_unique<mps_section_block_scanner_t>(output_data_, window_count_, registry_);
+  } catch (...) {
+    ::close(fd_);  // ~RawInputStream() never runs for a partially constructed object
+    throw;
+  }
+}
+RawInputStream::~RawInputStream()
+{
+  if (fd_ >= 0) { ::close(fd_); }
+}
+
+const char* RawInputStream::data() const noexcept { return output_data_; }
+char* RawInputStream::mutable_data() noexcept { return output_data_; }
+std::size_t RawInputStream::size() const noexcept { return output_view_size_; }
+std::size_t RawInputStream::compressed_size() const noexcept { return file_size_; }
+std::size_t RawInputStream::reserve_size_hint() const noexcept { return file_size_; }
+mps_phase_registry_t& RawInputStream::registry() noexcept { return registry_; }
+input_stream_view_t RawInputStream::view() noexcept
+{
+  return {output_data_, output_data_, output_view_size_, file_size_, &registry_};
+}
+// Reads the file into output_data_ in parallel windows, reporting progress to the scanner.
+void RawInputStream::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("raw_input_run_read_tasks", nvtx::colors::io);
+  if (file_size_ == 0) {
+    output_view_size_ = 0;
+    section_scanner_->publish_ready(0);
+    return;
+  }
+  // Worker count: min(raw_input_max_read_threads, hardware threads, windows), at least 1.
+  std::size_t hw_threads =
+    std::max<std::size_t>(1, static_cast<std::size_t>(std::thread::hardware_concurrency()));
+  std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads);
+  thread_count             = std::max<std::size_t>(1, std::min(thread_count, window_count_));
+  // next_window hands out window indices; first_error keeps the first failure for rethrow.
+  std::atomic_size_t next_window{0};
+  std::exception_ptr first_error = nullptr;
+  std::mutex error_mutex;
+  std::atomic_bool stop{false};
+  // Records only the first exception and asks the remaining workers to stop.
+  auto mark_error = [&](std::exception_ptr eptr) {
+    std::lock_guard<std::mutex> lock(error_mutex);
+    if (!first_error) {
+      first_error = eptr;
+      stop.store(true, std::memory_order_release);
+    }
+  };
+  // Reads window `index`, scans it, then moves ready_bytes_ over the contiguous done prefix.
+  auto read_window = [&](std::size_t index) {
+    std::size_t offset = index * window_bytes_;
+    std::size_t size   = std::min(window_bytes_, file_size_ - offset);
+    std::size_t done   = 0;
+    while (done < size) {
+      ssize_t got =
+        ::pread(fd_, output_data_ + offset + done, size - done, static_cast<off_t>(offset + done));
+      if (got < 0) {
+        if (errno == EINTR) { continue; }
+        throw std::runtime_error("Failed to pread raw MPS file '" + path_ +
+                                 "': " + std::strerror(errno));
+      }
+      if (got == 0) {
+        throw std::runtime_error("Unexpected EOF while reading raw MPS file '" + path_ + "'");
+      }
+      done += static_cast<std::size_t>(got);
+    }
+    // NOTE(review): publish_ready() below runs unlocked, so calls may arrive out of order.
+    section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size);
+    frontier_mutex_.lock();
+    block_done_[index] = 1;
+    block_end_[index]  = offset + size;
+    std::size_t before = ready_bytes_;
+    while (next_block_ < block_done_.size() && block_done_[next_block_]) {
+      ready_bytes_ = block_end_[next_block_];
+      ++next_block_;
+    }
+    std::size_t after = ready_bytes_;
+    frontier_mutex_.unlock();
+    if (after > before) { section_scanner_->publish_ready(after); }
+  };
+  // NOTE(review): if starting a thread throws, workers already running are never joined.
+  std::vector<std::thread> workers;
+  workers.reserve(thread_count);
+  for (std::size_t t = 0; t < thread_count; ++t) {
+    workers.emplace_back([&, t] {
+      std::string thread_name = "raw-input-read-" + std::to_string(t);
+      nvtx::name_current_thread(thread_name.c_str());
+      while (!stop.load(std::memory_order_acquire)) {
+        std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed);
+        if (index >= window_count_) { break; }
+        try {
+          read_window(index);
+        } catch (...) {
+          mark_error(std::current_exception());
+          return;
+        }
+      }
+    });
+  }
+  for (auto& worker : workers) {
+    worker.join();
+  }
+  if (first_error) { std::rethrow_exception(first_error); }
+
+  output_view_size_ = ready_bytes_;
+  section_scanner_->publish_ready(output_view_size_);
+}
+
+bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); }
+
+void drop_file_cache(const std::string& path)
+{
+  MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io);
+  FileDescriptor fd(::open(path.c_str(), O_RDONLY));
+  if (!fd.valid()) { return; }
+
+  ::posix_fadvise(fd.get(), 0, 0, POSIX_FADV_DONTNEED);
+}
+
+FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method)
+{
+  const bool lz4_path = has_lz4_extension(path);  // the extension overrides the request
+  if (!lz4_path && method == FileReadMethod::Lz4) {
+    throw std::runtime_error("lz4 read method requires a .lz4 input: " + path);
+  }
+  return lz4_path ? FileReadMethod::Lz4 : method;
+}
+
+const char* file_read_method_name(FileReadMethod method) noexcept
+{
+  // Lower-case label for a read method. Values outside the enum (possible through a cast)
+  // map to "unknown".
+  if (method == FileReadMethod::Read) { return "read"; }
+  if (method == FileReadMethod::Lz4) { return "lz4"; }
+  return "unknown";
+}
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
new file mode 100644
index 0000000000..3232a23e84
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -0,0 +1,168 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "mmap_region.hpp"
+#include "mps_section_scanner.hpp"
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+namespace mps_fast {
+
+/**
+ * @brief File reading method selection
+ */
+enum class FileReadMethod { Read, Lz4 };
+
+/**
+ * @brief Return the effective method for a path.
+ *
+ * .lz4 inputs are decompressed; all other inputs use raw input reads.
+ */
+FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method);
+
+/**
+ * @brief Human-readable method name.
+ */
+const char* file_read_method_name(FileReadMethod method) noexcept;
+
+/**
+ * @brief True when the file name has an lz4 extension.
+ */
+bool has_lz4_extension(const std::string& path) noexcept;
+
+/**
+ * @brief Ask the OS to evict clean cached pages for this file.
+ *
+ * This is advisory and affects the local client page cache only.
+ */
+void drop_file_cache(const std::string& path);
+
+struct input_stream_view_t {
+  const char* data               = nullptr;
+  char* mutable_data             = nullptr;
+  std::size_t size               = 0;
+  std::size_t compressed_size    = 0;
+  mps_phase_registry_t* registry = nullptr;
+};
+
+class Lz4InputStream {
+ public:
+  explicit Lz4InputStream(const std::string& path);
+  ~Lz4InputStream();
+
+  Lz4InputStream(const Lz4InputStream&)            = delete;
+  Lz4InputStream& operator=(const Lz4InputStream&) = delete;
+
+  const char* data() const noexcept;
+  char* mutable_data() noexcept;
+  std::size_t size() const noexcept;
+  std::size_t compressed_size() const noexcept;
+  std::size_t reserve_size_hint() const noexcept;
+  mps_phase_registry_t& registry() noexcept;
+  input_stream_view_t view() noexcept;
+
+  void run_decode_tasks();
+
+ private:
+  struct Block {
+    std::size_t compressed_offset   = 0;
+    std::size_t compressed_size     = 0;
+    std::size_t read_end_offset     = 0;
+    std::size_t decompressed_offset = 0;
+    std::size_t decompressed_size   = 0;
+    std::size_t index               = 0;
+    bool uncompressed               = false;
+  };
+
+  void commit_up_to(std::size_t bytes);
+
+  std::string path_;
+  int fd_ = -1;
+  mmap_region_t output_region_;
+  std::size_t compressed_size_       = 0;
+  char* output_data_                 = nullptr;
+  std::size_t output_mapped_size_    = 0;
+  std::size_t output_view_size_      = 0;
+  std::size_t output_committed_size_ = 0;
+  std::size_t block_max_size_        = 0;
+  std::size_t content_size_          = 0;
+  std::size_t header_size_           = 0;
+  bool content_size_present_         = false;
+  bool block_checksum_               = false;
+  bool content_checksum_             = false;
+  bool dict_id_                      = false;
+  std::vector<Block> blocks_;
+  mps_phase_registry_t registry_;
+  std::mutex commit_mutex_;
+  std::mutex frontier_mutex_;
+  std::vector<unsigned char> block_done_;
+  std::vector<std::size_t> block_end_;
+  std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
+  std::size_t next_block_  = 0;
+  std::size_t ready_bytes_ = 0;
+
+  struct BatchMetric {
+    std::size_t index                    = 0;
+    std::size_t first_block              = 0;
+    std::size_t blocks                   = 0;
+    std::size_t file_bytes               = 0;
+    std::size_t decompressed_bytes       = 0;
+    double read_ms                       = 0.0;
+    double decode_ms                     = 0.0;
+    double commit_ms                     = 0.0;
+    double frontier_lock_wait_ms         = 0.0;
+    double frontier_update_ms            = 0.0;
+    double section_scan_ms               = 0.0;
+    std::size_t ready_bytes_delta        = 0;
+    std::size_t frontier_blocks_advanced = 0;
+    double total_ms                      = 0.0;
+  };
+  std::vector<BatchMetric> batch_metrics_;
+};
+// Loads an uncompressed file into an anonymous mapping with parallel pread() windows.
+class RawInputStream {
+ public:
+  explicit RawInputStream(const std::string& path);
+  ~RawInputStream();
+  // Not copyable: the destructor closes fd_.
+  RawInputStream(const RawInputStream&)            = delete;
+  RawInputStream& operator=(const RawInputStream&) = delete;
+
+  const char* data() const noexcept;
+  char* mutable_data() noexcept;
+  std::size_t size() const noexcept;
+  std::size_t compressed_size() const noexcept;
+  std::size_t reserve_size_hint() const noexcept;
+  mps_phase_registry_t& registry() noexcept;
+  input_stream_view_t view() noexcept;
+  // Fills the buffer; size() and view() report 0 bytes until this has completed.
+  void run_decode_tasks();
+
+ private:
+  std::string path_;
+  int fd_ = -1;
+  mmap_region_t output_region_;
+  char* output_data_              = nullptr;
+  std::size_t output_mapped_size_ = 0;
+  std::size_t output_view_size_   = 0;
+  std::size_t file_size_          = 0;
+  std::size_t window_bytes_       = 0;
+  std::size_t window_count_       = 0;
+  mps_phase_registry_t registry_;
+  std::mutex frontier_mutex_;
+  std::vector<unsigned char> block_done_;
+  std::vector<std::size_t> block_end_;
+  std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
+  std::size_t next_block_  = 0;
+  std::size_t ready_bytes_ = 0;
+};
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
new file mode 100644
index 0000000000..7aa302da23
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
@@ -0,0 +1,330 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "simd_compat.hpp"
+
+#include <cstdint>
+#include <cstring>
+
+#define __assume(cond)                    \
+  do {                                    \
+    if (!(cond)) __builtin_unreachable(); \
+  } while (0)
+
+#define BUCKET_COUNT (4194304 * 2 * 2 * 4)  // 2^22 * 16 = 2^26 = 67,108,864 slots
+
+// Set to 1 for 32-byte keys, 0 for 16-byte keys
+#ifndef USE_32B_HASH_KEYS
+#define USE_32B_HASH_KEYS 1
+#endif
+
+namespace mps_fast {
+
+static inline uint32_t crcHash(const uint8_t* key, int64_t len)
+{
+  // CRC32-C of key[0, len), folded 8 bytes at a time; the caller guarantees len < 256.
+  __assume(len < 256);
+  uint64_t crc = 0;
+  for (; len >= 8; len -= 8, key += 8) {
+    uint64_t val;
+    std::memcpy(&val, key, sizeof(val));  // memcpy: key need not be 8-byte aligned
+    crc = simde_mm_crc32_u64(crc, val);
+  }
+  // Zero-padded tail of 1-7 bytes. Copying exactly `len` bytes never reads past the key, and
+  // no `~0ULL << len * 8` mask is needed (that shift is undefined when 8 bytes remain).
+  if (len > 0) {
+    uint64_t val = 0;
+    std::memcpy(&val, key, static_cast<size_t>(len));
+    crc = simde_mm_crc32_u64(crc, val);
+  }
+  return static_cast<uint32_t>(crc);
+}
+
+static const simde__m128i aes_seed_128 =
+  simde_mm_set_epi64x(0x9E3779B97F4A7C15ULL, 0xBB67AE8584CAA73BULL);
+static const simde__m256i aes_seed_256 = simde_mm256_set_epi64x(
+  0x9E3779B97F4A7C15ULL, 0xBB67AE8584CAA73BULL, 0x3C6EF372FE94F82BULL, 0xA54FF53A5F1D36F1ULL);
+
+static inline uint32_t aes_hash(simde__m128i key)
+{
+  simde__m128i h      = simde_mm_aesenc_si128(key, aes_seed_128);
+  h                   = simde_mm_aesenc_si128(h, aes_seed_128);
+  simde__m128i folded = simde_mm_xor_si128(h, simde_mm_srli_si128(h, 8));
+  return (uint32_t)simde_mm_cvtsi128_si32(folded);
+}
+
+static inline uint32_t aes_hash(simde__m256i key)
+{
+  simde__m128i lo     = simde_mm256_castsi256_si128(key);
+  simde__m128i hi     = simde_mm256_extracti128_si256(key, 1);
+  simde__m128i h      = simde_mm_xor_si128(lo, hi);
+  h                   = simde_mm_aesenc_si128(h, aes_seed_128);
+  h                   = simde_mm_aesenc_si128(h, aes_seed_128);
+  simde__m128i folded = simde_mm_xor_si128(h, simde_mm_srli_si128(h, 8));
+  return (uint32_t)simde_mm_cvtsi128_si32(folded);
+}
+
+static inline uint32_t crcHash32B(uint64_t q0, uint64_t q1, uint64_t q2, uint64_t q3)
+{
+  uint64_t crc = 0;
+  crc          = simde_mm_crc32_u64(crc, q0);
+  crc          = simde_mm_crc32_u64(crc, q1);
+  crc          = simde_mm_crc32_u64(crc, q2);
+  crc          = simde_mm_crc32_u64(crc, q3);
+
+  return crc;
+}
+
+// 32-bit FNV-1a computed over the bytes in reverse order (last byte first), which is meant
+// to behave better on strings that share a common prefix.
+static inline uint32_t fnv1a_hash(const char* ptr, size_t len)
+{
+  constexpr uint32_t offset_basis = 2166136261u;
+  constexpr uint32_t prime        = 16777619u;
+
+  uint32_t digest = offset_basis;
+  for (size_t remaining = len; remaining != 0; --remaining) {
+    digest ^= static_cast<uint8_t>(ptr[remaining - 1]);
+    digest *= prime;
+  }
+
+  return digest;
+}
+
+struct __attribute__((packed)) hash_slot_32_t {
+  uint32_t count;
+  simde__m256i node;
+};
+
+struct alignas(16) hash_slot_16_t {
+  char key[16];
+  uint32_t count;
+};
+
+static inline bool key_cmpeq_16(const char* slot_key, simde__m128i key)
+{
+  simde__m128i slot_vec = simde_mm_loadu_si128((const simde__m128i*)slot_key);
+  int mask              = simde_mm_movemask_epi8(simde_mm_cmpeq_epi8(slot_vec, key));
+  return mask == 0xFFFF;
+}
+
+// 32-byte aligned slot: 28-byte key + 4-byte count = 32 bytes total (one cache line half)
+struct alignas(32) hash_slot_28_t {
+  char key[28];
+  uint32_t count;
+};
+// Zero-padded key from at most 28 bytes of ptr. NOTE(review): longer names are truncated.
+static inline simde__m256i make_key_28(const char* ptr, size_t len)
+{
+  alignas(32) char buf[32] = {0};
+  size_t copy_len          = len < 28 ? len : 28;
+  std::memcpy(buf, ptr, copy_len);
+  return simde_mm256_load_si256((const simde__m256i*)buf);
+}
+
+// Compare 28-byte keys stored in simde__m256i (ignore last 4 bytes)
+static inline bool key_cmpeq_28(const char* slot_key, simde__m256i key)
+{
+  simde__m256i slot_vec = simde_mm256_loadu_si256((const simde__m256i*)slot_key);
+  int mask              = simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot_vec, key));
+  return (mask & 0x0FFFFFFF) == 0x0FFFFFFF;  // Only check first 28 bytes
+}
+// NOTE(review): 32-byte key_store spans hash_slot_28_t::count too if slot_key is its key.
+#if USE_32B_HASH_KEYS
+using hash_key_t                = simde__m256i;
+using hash_slot_var_t           = hash_slot_28_t;
+constexpr size_t HASH_KEY_BYTES = 28;
+constexpr int HASH_KEY_CMP_MASK = 0x0FFFFFFF;
+#define make_key                 make_key_28
+#define key_cmpeq(slot_key, key) key_cmpeq_28(slot_key, key)
+#define key_store(slot_key, key) simde_mm256_store_si256((simde__m256i*)(slot_key), key)
+#else
+using hash_key_t                = simde__m128i;
+using hash_slot_var_t           = hash_slot_16_t;
+constexpr size_t HASH_KEY_BYTES = 16;
+constexpr int HASH_KEY_CMP_MASK = 0xFFFF;
+#define make_key                 make_key_16
+#define key_cmpeq(slot_key, key) key_cmpeq_16(slot_key, key)
+#define key_store(slot_key, key) simde_mm_store_si128((simde__m128i*)(slot_key), key)
+#endif
+
+// Legacy alias
+using hash_slot_t = hash_slot_32_t;
+
+struct hash_table_t {
+  hash_slot_t slots[BUCKET_COUNT];
+};
+
+static inline void hash_table_push(
+  hash_table_t* table, uint32_t hash, simde__m256i val, int len, const uint8_t* ptr)
+{
+  hash %= BUCKET_COUNT;
+  // Counts one occurrence of the inline key `val`; `len` and `ptr` are not used here.
+  hash_slot_t* slot = &table->slots[hash];
+  // Fast path: the home slot already holds this key.
+  if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == 0xFFFFFFFF) {
+    ++slot->count;
+    return;
+  }
+
+  bool relooped = false;
+  // Linear probe to the end of the table; an empty slot (count 0) is claimed for `val`.
+loop:
+  for (; slot < &table->slots[BUCKET_COUNT]; ++slot) {
+    if (slot->count == 0) {
+      slot->count = 1;
+      slot->node  = val;
+      return;
+    }
+
+    if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == 0xFFFFFFFF) {
+      ++slot->count;
+      return;
+    }
+  }
+  // Nothing found before the end: wrap to slot 0 once, then trap because the table is full.
+  if (!relooped) {
+    relooped = true;
+    slot     = &table->slots[0];
+    goto loop;
+  } else {
+    __builtin_trap();
+  }
+}
+
+extern char* string_buffer;
+extern char* string_buffer_ptr;
+
+// Lookup: returns the stored value (count-1) or SIZE_MAX if not found
+// For small strings <= 32 bytes stored inline in node
+static inline size_t hash_table_lookup(const hash_table_t* table, uint32_t hash, simde__m256i val)
+{
+  hash %= BUCKET_COUNT;
+  const hash_slot_t* slot = &table->slots[hash];
+
+  for (size_t i = 0; i < BUCKET_COUNT; ++i, ++slot) {
+    if (slot >= &table->slots[BUCKET_COUNT]) { slot = &table->slots[0]; }
+
+    if (slot->count == 0) {
+      return SIZE_MAX;  // Not found
+    }
+
+    if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == (int)0xFFFFFFFF) {
+      return slot->count - 1;  // Found, return index
+    }
+  }
+
+  return SIZE_MAX;  // Not found
+}
+
+// Insert with index: stores index+1 in count field (0 means empty)
+static inline void hash_table_insert(hash_table_t* table,
+                                     uint32_t hash,
+                                     simde__m256i val,
+                                     size_t index)
+{
+  hash %= BUCKET_COUNT;
+  hash_slot_t* slot = &table->slots[hash];
+  // NOTE(review): index + 1 is narrowed to uint32_t below; confirm index < 2^32 - 1 holds.
+  for (size_t i = 0; i < BUCKET_COUNT; ++i, ++slot) {
+    if (slot >= &table->slots[BUCKET_COUNT]) { slot = &table->slots[0]; }
+
+    if (slot->count == 0) {
+      slot->count = (uint32_t)(index + 1);
+      slot->node  = val;
+      return;
+    }
+
+    if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == (int)0xFFFFFFFF) {
+      // Already exists, update index
+      slot->count = (uint32_t)(index + 1);
+      return;
+    }
+  }
+
+  __builtin_trap();
+}
+
+// Create simde__m256i key from string_view (zero-padded)
+static inline simde__m256i make_key_32(const char* ptr, size_t len)
+{
+  alignas(32) char buf[32] = {0};
+  if (len > 32) len = 32;
+  memcpy(buf, ptr, len);
+  return simde_mm256_load_si256((const simde__m256i*)buf);
+}
+
+// Create simde__m128i key from string_view (zero-padded, for strings <= 16 bytes)
+static inline simde__m128i make_key_16(const char* ptr, size_t len)
+{
+  alignas(16) char buf[16] = {0};
+  if (len > 16) len = 16;
+  memcpy(buf, ptr, len);
+  return simde_mm_load_si128((const simde__m128i*)buf);
+}
+
+static inline uint64_t m256_u64_lane(simde__m256i value, size_t lane)
+{
+  simde__m256i_private private_value = simde__m256i_to_private(value);
+  return private_value.u64[lane];
+}
+
+static inline void hash_table_push_ptr(hash_table_t* table,
+                                       uint32_t hash,
+                                       int len,
+                                       const uint8_t* ptr)
+{
+  // Counts one occurrence of the len-byte string at ptr, probing linearly from slot `hash`.
+  // A new string is copied to string_buffer_ptr (no capacity check here); traps if table full.
+  // NOTE(review): 8 bytes are always loaded from ptr, so callers presumably pass len >= 8.
+  hash %= BUCKET_COUNT;
+  hash_slot_t* slot      = &table->slots[hash];
+  bool relooped          = false;
+  uint32_t len_in_qwords = (len / 8) + (len % 8 ? 1 : 0);
+
+loop:
+  do {
+    uint64_t node_len = m256_u64_lane(slot->node, 3);
+    uint64_t node_tag = m256_u64_lane(slot->node, 0);
+    // Length differs: claim the slot if it is still empty (tag 0), otherwise keep probing.
+    if (__builtin_expect(node_len != (uint64_t)len, 0)) {
+      if (__builtin_expect(node_tag == 0, 1)) {
+        slot->count = 1;
+        slot->node  = simde_mm256_set_epi64x(len,
+                                            ((uint64_t*)ptr)[0],
+                                            (uint64_t)string_buffer_ptr,
+                                            0u | ((uint64_t)len_in_qwords << 32u));
+        // Lanes 3..0 now hold: length, first 8 bytes, address of the copy, qword count << 32.
+        memcpy(string_buffer_ptr, ptr, len);
+        string_buffer_ptr += len;
+        // Pad
+        string_buffer_ptr += (8 - len % 8) + 8;
+        return;
+      } else
+        continue;
+    }
+    if (m256_u64_lane(slot->node, 2) != ((uint64_t*)ptr)[0])  // First 8 bytes differ
+      continue;
+    // Compare from byte 8, not 16: bytes 8..15 are not covered by the lane-2 check above,
+    // and a `len - 16` size would go negative (huge as size_t) for len < 16.
+    uint8_t* other_ptr = reinterpret_cast<uint8_t*>(m256_u64_lane(slot->node, 1));
+    if (__builtin_expect(len <= 8 || memcmp(ptr + 8, other_ptr + 8, (size_t)len - 8) == 0, 1)) {
+      ++slot->count;
+      return;
+    }
+  } while (++slot < &table->slots[BUCKET_COUNT]);
+
+  if (!relooped) {
+    relooped = true;
+    slot     = &table->slots[0];
+    goto loop;
+  } else {
+    __builtin_trap();
+  }
+}
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
new file mode 100644
index 0000000000..fbe18768af
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -0,0 +1,759 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "file_reader.hpp"
+#include "mps_section_scanner.hpp"
+#include "nvtx_ranges.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <cerrno>
+#include <cstring>
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace mps_fast {
+
+namespace {
+
+constexpr uint32_t lz4_frame_magic                      = 0x184D2204u;
+constexpr uint32_t lz4_uncompressed_block               = 0x80000000u;
+constexpr uint32_t lz4_block_size_mask                  = 0x7FFFFFFFu;
+constexpr std::size_t lz4_pipeline_batch_bytes          = 64ull * 1024ull * 1024ull;
+constexpr std::size_t lz4_input_max_io_threads          = 8;
+constexpr std::size_t lz4_no_content_size_reserve_ratio = 16;
+
+#if defined(MPS_PARSER_WITH_LZ4)
+using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int);
+
+struct lz4_runtime_t {
+  void* handle                          = nullptr;
+  LZ4_decompress_safe_t decompress_safe = nullptr;
+  // Loads liblz4 and resolves LZ4_decompress_safe; throws std::logic_error when unavailable.
+  lz4_runtime_t()
+  {
+    for (const char* soname : {"liblz4.so.1", "liblz4.so"}) {
+      handle = dlopen(soname, RTLD_LAZY);
+      if (handle != nullptr) { break; }
+    }
+    if (handle == nullptr) {
+      throw std::logic_error(
+        "Could not open .mps.lz4 file since liblz4 was not found "
+        "(tried liblz4.so.1, liblz4.so). In order to open .mps.lz4 files "
+        "directly, please ensure liblz4 is installed. Alternatively, decompress "
+        "the .lz4 file manually and open the uncompressed .mps file.");
+    }
+    decompress_safe = reinterpret_cast<LZ4_decompress_safe_t>(dlsym(handle, "LZ4_decompress_safe"));
+    if (decompress_safe == nullptr) {
+      dlclose(handle);  // no destructor runs after a throwing constructor, so release here
+      throw std::logic_error(
+        "Error loading liblz4! Library version might be incompatible. Please decompress "
+        "the .lz4 file manually and open the uncompressed .mps file.");
+    }
+  }
+
+  ~lz4_runtime_t()
+  {
+    if (handle != nullptr) { dlclose(handle); }
+  }
+
+  lz4_runtime_t(const lz4_runtime_t&)            = delete;
+  lz4_runtime_t& operator=(const lz4_runtime_t&) = delete;
+};
+
+const lz4_runtime_t& lz4_runtime()
+{
+  static const lz4_runtime_t runtime;
+  return runtime;
+}
+#endif
+
+int lz4_decompress_safe_runtime(const char* src, char* dst, int compressed_size, int dst_capacity)
+{
+#if defined(MPS_PARSER_WITH_LZ4)
+  return lz4_runtime().decompress_safe(src, dst, compressed_size, dst_capacity);
+#else
+  (void)src;
+  (void)dst;
+  (void)compressed_size;
+  (void)dst_capacity;
+  throw std::logic_error(
+    "Experimental fast MPS parser was built without LZ4 decompression support. "
+    "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
+#endif
+}
+
+void ensure_lz4_runtime_available()
+{
+#if defined(MPS_PARSER_WITH_LZ4)
+  (void)lz4_runtime();
+#else
+  throw std::logic_error(
+    "Experimental fast MPS parser was built without LZ4 decompression support. "
+    "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
+#endif
+}
+
+int open_lz4_fd(const std::string& path)
+{
+  int fd = ::open(path.c_str(), O_RDONLY);
+  if (fd < 0) {
+    throw std::runtime_error("Failed to open LZ4 file '" + path + "': " + std::strerror(errno));
+  }
+  return fd;
+}
+
+std::size_t system_page_size();
+std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment);
+
+class FileDescriptor {
+ public:
+  explicit FileDescriptor(int fd) : fd_(fd) {}
+  ~FileDescriptor()
+  {
+    if (fd_ >= 0) { ::close(fd_); }
+  }
+
+  FileDescriptor(const FileDescriptor&)            = delete;
+  FileDescriptor& operator=(const FileDescriptor&) = delete;
+
+  int get() const noexcept { return fd_; }
+  bool valid() const noexcept { return fd_ >= 0; }
+
+ private:
+  int fd_;
+};
+
+uint32_t read_le32(const char* ptr)
+{
+  uint32_t value = 0;  // little-endian decode, folding from the top byte (ptr[3]) down
+  for (int i = 3; i >= 0; --i) { value = (value << 8) | static_cast<unsigned char>(ptr[i]); }
+  return value;
+}
+
+uint64_t read_le64(const char* ptr)
+{
+  // Decodes 8 bytes at ptr as little-endian by OR-ing each byte into its own bit position.
+  const auto* bytes = reinterpret_cast<const unsigned char*>(ptr);
+  uint64_t value    = 0;
+  for (unsigned i = 0; i < 8; ++i) { value |= static_cast<uint64_t>(bytes[i]) << (8 * i); }
+  return value;
+}
+
+std::size_t block_max_size_from_bd(unsigned char bd)
+{
+  // Maps the BD byte of an LZ4 frame header to the maximum block size in bytes.
+  // Bits 4-6 carry the block-size ID; only IDs 4..7 are accepted:
+  //   4 -> 64 KiB, 5 -> 256 KiB, 6 -> 1 MiB, 7 -> 4 MiB (each step multiplies by four).
+  const unsigned size_id = (bd >> 4) & 0x7u;
+  if (size_id < 4u) { throw std::runtime_error("unsupported LZ4 frame block size ID"); }
+
+  constexpr std::size_t smallest_block = 64ull * 1024ull;
+  return smallest_block << (2u * (size_id - 4u));
+}
+
+std::size_t checked_size(uint64_t value, const char* label)
+{
+  if (value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max())) {
+    throw std::runtime_error(std::string("LZ4 ") + label + " exceeds size_t");
+  }
+  return static_cast<std::size_t>(value);
+}
+
+std::size_t get_file_size(int fd, const std::string& path)
+{
+  struct stat st;
+  if (::fstat(fd, &st) != 0) {
+    throw std::runtime_error("Failed to stat file '" + path + "': " + std::strerror(errno));
+  }
+  if (st.st_size < 0) { throw std::runtime_error("Invalid negative file size for '" + path + "'"); }
+  return static_cast<std::size_t>(st.st_size);
+}
+
+std::size_t system_page_size()
+{
+  static std::size_t page_size = [] {
+    long value = ::sysconf(_SC_PAGESIZE);
+    return value > 0 ? static_cast<std::size_t>(value) : static_cast<std::size_t>(4096);
+  }();
+  return page_size;
+}
+
+// Rounds `value` up to the next multiple of `alignment`.
+// A zero alignment, or a value that already is a multiple, is returned unchanged.
+// Throws std::runtime_error("allocation size overflow") when the rounded value would not fit in
+// std::size_t.
+std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
+{
+  const std::size_t excess = (alignment == 0) ? 0 : value % alignment;
+  if (excess == 0) { return value; }
+  const std::size_t padding  = alignment - excess;
+  const std::size_t headroom = std::numeric_limits<std::size_t>::max() - value;
+  if (padding > headroom) { throw std::runtime_error("allocation size overflow"); }
+  return value + padding;
+}
+
+// Returns a * b, throwing std::runtime_error("<label> size overflow") when the product does not
+// fit in std::size_t.
+std::size_t checked_mul(std::size_t a, std::size_t b, const char* label)
+{
+  if (a == 0) { return 0; }
+  // Largest second factor whose product with `a` is still representable.
+  const std::size_t largest_b = std::numeric_limits<std::size_t>::max() / a;
+  if (b > largest_b) { throw std::runtime_error(std::string(label) + " size overflow"); }
+  return a * b;
+}
+
+// Reads exactly `bytes` bytes from `fd`, starting at absolute file position `offset`, into `dst`.
+// pread() is issued repeatedly until the request is satisfied; calls interrupted by a signal
+// (EINTR) are retried.
+// Returns true on success. Returns false with errno left as set by pread() on failure; reaching
+// end of file before `bytes` bytes were read is reported as false with errno = EIO.
+bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset)
+{
+  // A single pread() cannot report more than SSIZE_MAX bytes.
+  constexpr std::size_t max_request =
+    static_cast<std::size_t>(std::numeric_limits<ssize_t>::max());
+  for (std::size_t done = 0; done < bytes;) {
+    const std::size_t want = std::min(bytes - done, max_request);
+    const ssize_t got      = ::pread(fd, dst + done, want, static_cast<off_t>(offset + done));
+    if (got > 0) {
+      done += static_cast<std::size_t>(got);
+    } else if (got == 0) {
+      errno = EIO;  // unexpected end of file
+      return false;
+    } else if (errno != EINTR) {
+      return false;
+    }
+  }
+  return true;
+}
+
+struct lz4_resident_window_t {  // one contiguous slice of the compressed file held in memory
+  std::size_t index       = 0;  // position of this window in the owning vector
+  std::size_t file_offset = 0;  // offset of the window's first byte within the compressed file
+  std::size_t size        = 0;  // number of bytes stored in `data`
+  std::unique_ptr<char[]> data;  // owning buffer holding the window's bytes
+};
+
+// Read-only view over the compressed file as a sequence of in-memory windows.
+// Every offset passed to a member is an absolute position in the compressed file. Lookup is a
+// binary search, so `windows` must be ordered by ascending, non-overlapping file ranges.
+// The view stores a reference to the caller's vector, which must outlive it.
+class lz4_resident_windows_t {
+ public:
+  explicit lz4_resident_windows_t(std::vector<lz4_resident_window_t>& windows) : windows_(windows)
+  {
+  }
+
+  // Returns a pointer to `size` bytes at `offset` when they all lie inside one window.
+  // Returns nullptr when `size` is zero or the range runs past the end of the window that holds
+  // `offset`. Throws std::runtime_error when no window contains `offset`.
+  const char* ptr_if_contiguous(std::size_t offset, std::size_t size) const
+  {
+    if (size == 0) { return nullptr; }
+    const lz4_resident_window_t& win = window_for_offset(offset);
+    const std::size_t within         = offset - win.file_offset;
+    const bool fits                  = within <= win.size && size <= win.size - within;
+    return fits ? win.data.get() + within : nullptr;
+  }
+
+  // Copies `size` bytes starting at `offset` into `dst`, crossing window boundaries as needed.
+  // Throws std::runtime_error when part of the range is not covered by any window.
+  void copy_to(std::size_t offset, char* dst, std::size_t size) const
+  {
+    std::size_t cursor    = offset;
+    std::size_t remaining = size;
+    char* out             = dst;
+    while (remaining > 0) {
+      const lz4_resident_window_t& win = window_for_offset(cursor);
+      const std::size_t within         = cursor - win.file_offset;
+      const std::size_t chunk          = std::min(win.size - within, remaining);
+      std::memcpy(out, win.data.get() + within, chunk);
+      cursor += chunk;
+      out += chunk;
+      remaining -= chunk;
+    }
+  }
+
+  // Reads one byte at `offset`.
+  uint8_t read_u8(std::size_t offset) const
+  {
+    uint8_t byte = 0;
+    copy_to(offset, reinterpret_cast<char*>(&byte), sizeof(byte));
+    return byte;
+  }
+
+  // Reads a little-endian 32-bit value at `offset`; the four bytes may span two windows.
+  uint32_t read_u32(std::size_t offset) const
+  {
+    char raw[4];
+    copy_to(offset, raw, sizeof(raw));
+    return read_le32(raw);
+  }
+
+  // Reads a little-endian 64-bit value at `offset`; the eight bytes may span two windows.
+  uint64_t read_u64(std::size_t offset) const
+  {
+    char raw[8];
+    copy_to(offset, raw, sizeof(raw));
+    return read_le64(raw);
+  }
+
+ private:
+  // Binary search for the window whose [file_offset, file_offset + size) range holds `offset`.
+  // Throws std::runtime_error when the window list is empty or no window matches.
+  const lz4_resident_window_t& window_for_offset(std::size_t offset) const
+  {
+    if (windows_.empty()) {
+      throw std::runtime_error("LZ4 resident window lookup with no windows");
+    }
+    for (std::size_t lo = 0, hi = windows_.size(); lo < hi;) {
+      const std::size_t mid                  = lo + (hi - lo) / 2;
+      const lz4_resident_window_t& candidate = windows_[mid];
+      const std::size_t begin                = candidate.file_offset;
+      const std::size_t end                  = candidate.file_offset + candidate.size;
+      if (offset >= begin && offset < end) { return candidate; }
+      if (offset < begin) {
+        hi = mid;
+      } else {
+        lo = mid + 1;
+      }
+    }
+    throw std::runtime_error("LZ4 offset outside resident windows");
+  }
+
+  std::vector<lz4_resident_window_t>& windows_;
+};
+
+}  // namespace
+
+// Opens `path`, parses the LZ4 frame header, reserves the address range for the decompressed
+// output and allocates the per-block bookkeeping. No block payload is decoded here.
+//
+// Only frame version 1 with independent blocks is accepted. Throws std::runtime_error when the
+// header is unreadable, truncated or unsupported, or when the output reservation fails.
+//
+// The destructor does not run for an object whose constructor throws, so any failure after the
+// file has been opened closes the descriptor here before the exception propagates.
+Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path)
+{
+  MPS_NVTX_RANGE("lz4_input_construct", nvtx::colors::io);
+  ensure_lz4_runtime_available();
+
+  fd_ = open_lz4_fd(path);
+  try {
+    // Advisory only; the return value is intentionally ignored.
+    ::posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
+
+    compressed_size_ = get_file_size(fd_, path);
+
+    char header[32];
+    // Magic (4) + FLG (1) + BD (1) + header checksum (1) is the smallest possible frame header.
+    if (compressed_size_ < 7) {
+      throw std::runtime_error("LZ4 input is too small to contain a frame header");
+    }
+    std::size_t header_bytes = std::min<std::size_t>(sizeof(header), compressed_size_);
+    if (!pread_full_plain(fd_, header, header_bytes, 0)) {
+      throw std::runtime_error("Failed to read LZ4 frame header '" + path +
+                               "': " + std::strerror(errno));
+    }
+
+    std::size_t offset = 0;
+    uint32_t magic     = read_le32(header + offset);
+    if (magic != lz4_frame_magic) {
+      throw std::runtime_error("unsupported LZ4 input: expected standard LZ4 frame magic");
+    }
+    offset += 4;
+    unsigned char flg = static_cast<unsigned char>(header[offset++]);
+    unsigned char bd  = static_cast<unsigned char>(header[offset++]);
+    unsigned version  = (flg >> 6) & 0x3u;
+    if (version != 1) { throw std::runtime_error("unsupported LZ4 frame version"); }
+    bool block_independent = (flg & 0x20u) != 0;
+    block_checksum_        = (flg & 0x10u) != 0;
+    content_size_present_  = (flg & 0x08u) != 0;
+    content_checksum_      = (flg & 0x04u) != 0;
+    dict_id_               = (flg & 0x01u) != 0;
+    if (!block_independent) {
+      throw std::runtime_error(
+        "parallel LZ4 reader requires independent blocks; compress with -BI");
+    }
+    block_max_size_ = block_max_size_from_bd(bd);
+    if (content_size_present_) {
+      if (offset + 8 > header_bytes) {
+        throw std::runtime_error("truncated LZ4 frame while reading content size");
+      }
+      content_size_ = checked_size(read_le64(header + offset), "content size");
+      offset += 8;
+    }
+    if (dict_id_) {
+      // The dictionary id is skipped, not interpreted.
+      if (offset + 4 > header_bytes) {
+        throw std::runtime_error("truncated LZ4 frame while reading dictionary id");
+      }
+      offset += 4;
+    }
+    // The one-byte header checksum is skipped, not verified.
+    if (offset + 1 > header_bytes) {
+      throw std::runtime_error("truncated LZ4 frame while reading header checksum");
+    }
+    offset += 1;
+    header_size_ = offset;
+
+    // Without a declared content size the output reservation is estimated from the compressed
+    // size, but is never smaller than one block.
+    std::size_t reserve_size = content_size_;
+    if (!content_size_present_) {
+      reserve_size =
+        checked_mul(compressed_size_, lz4_no_content_size_reserve_ratio, "LZ4 output reserve");
+      reserve_size = std::max(reserve_size, block_max_size_);
+    }
+
+    // Reserve address space only (PROT_NONE, MAP_NORESERVE); commit_up_to() later maps usable
+    // pages over this range.
+    constexpr std::size_t huge_alignment = 2 * 1024 * 1024;
+    output_mapped_size_ = round_up_to_multiple(reserve_size, system_page_size());
+    output_region_      = mmap_region_t::anonymous_aligned(output_mapped_size_,
+                                                      huge_alignment,
+                                                      PROT_NONE,
+                                                      MAP_PRIVATE | MAP_NORESERVE,
+                                                      "LZ4 output buffer");
+    output_data_        = output_region_.char_data();
+
+    // One slot per possible block, plus one spare.
+    std::size_t block_slots =
+      std::max<std::size_t>(1, (reserve_size + block_max_size_ - 1) / block_max_size_ + 1);
+    block_done_.resize(block_slots, 0);
+    block_end_.resize(block_slots, 0);
+
+    section_scanner_ =
+      std::make_unique<mps_section_block_scanner_t>(output_data_, block_slots, registry_);
+  } catch (...) {
+    if (fd_ >= 0) { ::close(fd_); }
+    throw;
+  }
+}
+
+// Closes the input file descriptor if one is open.
+Lz4InputStream::~Lz4InputStream()
+{
+  if (fd_ < 0) { return; }
+  ::close(fd_);
+}
+
+const char* Lz4InputStream::data() const noexcept { return output_data_; }  // start of output
+char* Lz4InputStream::mutable_data() noexcept { return output_data_; }  // same pointer, writable
+std::size_t Lz4InputStream::size() const noexcept { return output_view_size_; }  // decoded bytes
+std::size_t Lz4InputStream::compressed_size() const noexcept { return compressed_size_; }  // file size
+std::size_t Lz4InputStream::reserve_size_hint() const noexcept  // estimate of the decoded size
+{
+  return content_size_present_ ? content_size_  // exact value from the frame header when present
+                               : std::max<std::size_t>(compressed_size_ * 6, 1024 * 1024);  // NOTE(review): factor 6 is separate from lz4_no_content_size_reserve_ratio and the product is unchecked - confirm
+}
+mps_phase_registry_t& Lz4InputStream::registry() noexcept { return registry_; }  // phase registry
+input_stream_view_t Lz4InputStream::view() noexcept  // bundles the values above into one view
+{
+  return {output_data_, output_data_, output_view_size_, compressed_size_, &registry_};
+}
+
+// Makes the first `bytes` bytes of the reserved output range readable and writable.
+// The committed prefix only grows: a request at or below the current committed size is a no-op.
+// The new end is rounded up to a page boundary and clamped to the reserved size.
+// Throws std::runtime_error when `bytes` exceeds the reservation or the fixed mapping fails.
+// Calls are serialized by commit_mutex_.
+void Lz4InputStream::commit_up_to(std::size_t bytes)
+{
+  MPS_NVTX_RANGE("lz4_commit_output", nvtx::colors::alloc);
+  std::lock_guard<std::mutex> lock(commit_mutex_);
+  if (bytes <= output_committed_size_) { return; }
+  if (bytes > output_mapped_size_) {
+    throw std::runtime_error("LZ4 output exceeded reserved virtual mapping");
+  }
+  const std::size_t page_rounded = round_up_to_multiple(bytes, system_page_size());
+  const std::size_t commit_end   = std::min(page_rounded, output_mapped_size_);
+  char* const grow_begin         = output_data_ + output_committed_size_;
+  const std::size_t grow_bytes   = commit_end - output_committed_size_;
+  // Replace the reserved pages in [grow_begin, grow_begin + grow_bytes) with usable memory.
+  mmap_region_t::map_fixed_or_throw(grow_begin,
+                                    grow_bytes,
+                                    PROT_READ | PROT_WRITE,
+                                    MAP_PRIVATE | MAP_ANONYMOUS,
+                                    -1,
+                                    0,
+                                    "LZ4 output commit");
+  // Huge-page advice is best effort; its result is ignored.
+  ::madvise(grow_begin, grow_bytes, MADV_HUGEPAGE);
+  output_committed_size_ = commit_end;
+}
+
+// Reads, scans and decodes the whole LZ4 frame with three kinds of threads:
+//  * reader threads pread() fixed-size windows of the compressed file into memory;
+//  * one scanner thread walks the block headers inside those windows, commits output pages and
+//    queues batches of block descriptors;
+//  * decode threads decompress (or copy) every block into the output mapping, pass it to the
+//    section scanner and advance the contiguous "ready" frontier.
+// The first exception raised by any thread stops the others; it is rethrown from this function
+// once every thread has been joined. On success output_view_size_ is set to the ready frontier.
+void Lz4InputStream::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io);
+  std::exception_ptr first_error = nullptr;
+  std::mutex error_mutex;
+  std::atomic_bool stop_workers{false};
+  // Records only the first error and raises the stop flag.
+  auto mark_error = [&](std::exception_ptr eptr) {
+    std::lock_guard<std::mutex> lock(error_mutex);
+    if (!first_error) {
+      first_error = eptr;
+      stop_workers.store(true, std::memory_order_release);
+    }
+  };
+
+  // Split the compressed file into fixed-size windows; the last one may be shorter.
+  const std::size_t window_bytes = lz4_pipeline_batch_bytes;
+  const std::size_t window_count = (compressed_size_ + window_bytes - 1) / window_bytes;
+  std::vector<lz4_resident_window_t> windows(window_count);
+  for (std::size_t i = 0; i < window_count; ++i) {
+    std::size_t offset     = i * window_bytes;
+    std::size_t size       = std::min(window_bytes, compressed_size_ - offset);
+    windows[i].index       = i;
+    windows[i].file_offset = offset;
+    windows[i].size        = size;
+    windows[i].data.reset(new char[size]);
+  }
+
+  const std::size_t io_threads = std::min(lz4_input_max_io_threads, window_count);
+
+  // Everything a decode thread needs to process one block.
+  struct resident_block_desc_t {
+    const char* src                 = nullptr;  // block payload inside a window or a crossing copy
+    std::size_t compressed_size     = 0;        // payload size in the file
+    std::size_t decompressed_offset = 0;        // destination offset in the output mapping
+    std::size_t decompressed_size   = 0;        // destination capacity for this block
+    std::size_t index               = 0;        // block sequence number
+    bool uncompressed               = false;    // payload is stored, not compressed
+  };
+
+  // Reader -> scanner hand-off: window_done[i] is set under window_mutex once window i is loaded.
+  std::atomic_size_t next_window{0};
+  std::vector<unsigned char> window_done(window_count, 0);
+  std::mutex window_mutex;
+  std::condition_variable window_cv;
+
+  // Scanner -> decoder hand-off: batches of descriptors, guarded by desc_mutex.
+  std::deque<std::vector<resident_block_desc_t>> desc_queue;
+  bool scanner_done = false;
+  std::mutex desc_mutex;
+  std::condition_variable desc_cv;
+
+  // Records the error and wakes every thread that may be blocked on a condition variable.
+  auto fail_and_notify = [&](std::exception_ptr eptr) {
+    mark_error(eptr);
+    window_cv.notify_all();
+    desc_cv.notify_all();
+  };
+
+  auto decode_worker = [&](std::size_t tid) {
+    try {
+      std::string thread_name = "lz4-window-decode-" + std::to_string(tid);
+      nvtx::name_current_thread(thread_name.c_str());
+      while (true) {
+        std::vector<resident_block_desc_t> batch;
+        {
+          MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io);
+          std::unique_lock<std::mutex> lock(desc_mutex);
+          desc_cv.wait(lock, [&] {
+            return stop_workers.load(std::memory_order_acquire) || scanner_done ||
+                   !desc_queue.empty();
+          });
+          if (stop_workers.load(std::memory_order_acquire)) { return; }
+          if (desc_queue.empty()) {
+            if (scanner_done) return;
+            continue;
+          }
+          batch = std::move(desc_queue.front());
+          desc_queue.pop_front();
+        }
+
+        MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode);
+        for (const auto& block : batch) {
+          char* dst  = output_data_ + block.decompressed_offset;
+          int actual = 0;
+          {
+            MPS_NVTX_RANGE("lz4_decode_block_payload", nvtx::colors::decode);
+            if (block.uncompressed) {
+              std::memcpy(dst, block.src, block.decompressed_size);
+              actual = static_cast<int>(block.decompressed_size);
+            } else if (block.compressed_size >
+                         static_cast<std::size_t>(std::numeric_limits<int>::max()) ||
+                       block.decompressed_size >
+                         static_cast<std::size_t>(std::numeric_limits<int>::max())) {
+              // The decompressor takes int sizes; anything larger is reported as invalid below.
+              actual = -1;
+            } else {
+              actual = lz4_decompress_safe_runtime(block.src,
+                                                   dst,
+                                                   static_cast<int>(block.compressed_size),
+                                                   static_cast<int>(block.decompressed_size));
+            }
+          }
+          if (actual < 0 || static_cast<std::size_t>(actual) > block.decompressed_size) {
+            throw std::runtime_error("LZ4 input block decompressed to invalid size");
+          }
+
+          std::size_t actual_size = static_cast<std::size_t>(actual);
+          {
+            MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic);
+            section_scanner_->observe_block(block.index, dst, dst + actual_size);
+          }
+          std::size_t before = 0;
+          std::size_t after  = 0;
+          {
+            // Mark this block done and advance the frontier over every consecutive done block.
+            MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic);
+            frontier_mutex_.lock();
+            block_done_[block.index] = 1;
+            block_end_[block.index]  = block.decompressed_offset + actual_size;
+            before                   = ready_bytes_;
+            while (next_block_ < block_done_.size() && block_done_[next_block_]) {
+              ready_bytes_ = block_end_[next_block_];
+              ++next_block_;
+            }
+            after = ready_bytes_;
+            frontier_mutex_.unlock();
+          }
+          if (after > before) {
+            MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic);
+            section_scanner_->publish_ready(after);
+          }
+        }
+      }
+    } catch (...) {
+      fail_and_notify(std::current_exception());
+    }
+  };
+
+  // Reader threads claim window indices from next_window and load them with pread().
+  std::vector<std::thread> readers;
+  readers.reserve(io_threads);
+  for (std::size_t t = 0; t < io_threads; ++t) {
+    readers.emplace_back([&, t] {
+      std::string thread_name = "lz4-window-read-" + std::to_string(t);
+      nvtx::name_current_thread(thread_name.c_str());
+      while (!stop_workers.load(std::memory_order_acquire)) {
+        std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed);
+        if (index >= windows.size()) { break; }
+        auto& w = windows[index];
+        bool ok = false;
+        {
+          MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io);
+          ok = pread_full_plain(fd_, w.data.get(), w.size, w.file_offset);
+        }
+        if (!ok) {
+          fail_and_notify(std::make_exception_ptr(std::runtime_error(
+            "Failed to pread LZ4 resident window: " + std::string(std::strerror(errno)))));
+          return;
+        }
+        {
+          MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic);
+          std::lock_guard<std::mutex> lock(window_mutex);
+          window_done[index] = 1;
+        }
+        window_cv.notify_all();
+      }
+    });
+  }
+
+  std::atomic_size_t blocks_scanned{0};
+  // Owning copies of payloads that straddle two windows; descriptors point into these buffers.
+  std::vector<std::vector<char>> crossing_payloads;
+  std::thread scanner([&] {
+    try {
+      nvtx::name_current_thread("lz4-metadata-scan");
+      lz4_resident_windows_t resident(windows);
+      // Blocks until every window overlapping [begin, begin + size) has been loaded.
+      // Precondition: begin + size <= compressed_size_. window_done is indexed by window number,
+      // so a range past the end of the file would index outside the vector and wait for a window
+      // that no reader will ever publish. Callers must bounds-check before calling.
+      auto wait_range_ready = [&](std::size_t begin, std::size_t size) {
+        if (size == 0) return;
+        std::size_t first = begin / window_bytes;
+        std::size_t last  = (begin + size - 1) / window_bytes;
+        for (std::size_t wi = first; wi <= last; ++wi) {
+          MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io);
+          std::unique_lock<std::mutex> lock(window_mutex);
+          window_cv.wait(lock, [&] {
+            return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0;
+          });
+          if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) {
+            throw std::runtime_error(
+              "LZ4 metadata scanner stopped before required window was ready");
+          }
+        }
+      };
+      // Commits output pages for the batch, then hands it to the decode threads.
+      auto push_batch = [&](std::vector<resident_block_desc_t>& batch) {
+        if (batch.empty()) return;
+        {
+          MPS_NVTX_RANGE("lz4_metadata_commit_batch", nvtx::colors::alloc);
+          commit_up_to(batch.back().decompressed_offset + batch.back().decompressed_size);
+        }
+        {
+          MPS_NVTX_RANGE("lz4_metadata_enqueue_batch", nvtx::colors::generic);
+          std::lock_guard<std::mutex> lock(desc_mutex);
+          desc_queue.push_back(std::move(batch));
+        }
+        batch.clear();
+        desc_cv.notify_one();
+      };
+
+      std::vector<resident_block_desc_t> batch;
+      batch.reserve(1024);
+      std::size_t offset              = header_size_;
+      std::size_t decompressed_offset = 0;
+      while (true) {
+        MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic);
+        // Validate against the file size before waiting (see wait_range_ready).
+        if (offset + 4 > compressed_size_) {
+          throw std::runtime_error("truncated LZ4 frame while reading block header");
+        }
+        wait_range_ready(offset, 4);
+        uint32_t raw_block_size = resident.read_u32(offset);
+        offset += 4;
+        // A zero block size is the end mark of the frame.
+        if (raw_block_size == 0) { break; }
+
+        bool uncompressed              = (raw_block_size & lz4_uncompressed_block) != 0;
+        std::size_t block_payload_size = raw_block_size & lz4_block_size_mask;
+        if (block_payload_size == 0) {
+          throw std::runtime_error("invalid zero-sized LZ4 data block");
+        }
+        if (block_payload_size > block_max_size_ && uncompressed) {
+          throw std::runtime_error("LZ4 uncompressed block exceeds frame block maximum");
+        }
+        if (content_size_present_ && decompressed_offset >= content_size_) {
+          throw std::runtime_error("LZ4 frame contains more blocks than content size allows");
+        }
+        // A corrupt size field can point far beyond the file; reject it before waiting.
+        if (offset + block_payload_size > compressed_size_) {
+          throw std::runtime_error("truncated LZ4 frame while reading block payload");
+        }
+        wait_range_ready(offset, block_payload_size);
+
+        // Stored blocks occupy exactly their payload size. Compressed blocks are given a full
+        // block of output space, limited by the declared content size when there is one.
+        std::size_t decompressed_size = block_payload_size;
+        if (!uncompressed) {
+          if (content_size_present_) {
+            decompressed_size = std::min(block_max_size_, content_size_ - decompressed_offset);
+          } else {
+            decompressed_size = block_max_size_;
+          }
+        }
+        if (content_size_present_ && decompressed_size > content_size_ - decompressed_offset) {
+          throw std::runtime_error("LZ4 block exceeds declared content size");
+        }
+
+        // Use the payload in place when it lies inside one window, otherwise stitch a copy.
+        const char* src = resident.ptr_if_contiguous(offset, block_payload_size);
+        if (src == nullptr) {
+          crossing_payloads.emplace_back(block_payload_size);
+          resident.copy_to(offset, crossing_payloads.back().data(), block_payload_size);
+          src = crossing_payloads.back().data();
+        }
+        batch.push_back({src,
+                         block_payload_size,
+                         decompressed_offset,
+                         decompressed_size,
+                         blocks_scanned.load(std::memory_order_relaxed),
+                         uncompressed});
+        blocks_scanned.fetch_add(1, std::memory_order_relaxed);
+        decompressed_offset += decompressed_size;
+        offset += block_payload_size;
+        if (block_checksum_) {
+          // The per-block checksum is skipped, not verified.
+          if (offset + 4 > compressed_size_) {
+            throw std::runtime_error("truncated LZ4 frame while reading block checksum");
+          }
+          wait_range_ready(offset, 4);
+          offset += 4;
+        }
+        if (blocks_scanned.load(std::memory_order_relaxed) > block_done_.size()) {
+          throw std::runtime_error("LZ4 input block count exceeded reserved metadata slots");
+        }
+        if (batch.size() >= 1024) { push_batch(batch); }
+      }
+      if (content_checksum_) {
+        // The content checksum is skipped, not verified.
+        if (offset + 4 > compressed_size_) {
+          throw std::runtime_error("truncated LZ4 frame while reading content checksum");
+        }
+        wait_range_ready(offset, 4);
+        offset += 4;
+      }
+      if (content_size_present_ && decompressed_offset != content_size_) {
+        throw std::runtime_error("LZ4 frame ended before declared content size was reached");
+      }
+      if (offset != compressed_size_) {
+        throw std::runtime_error("LZ4 input contains trailing data after the first frame");
+      }
+      push_batch(batch);
+      {
+        std::lock_guard<std::mutex> lock(desc_mutex);
+        scanner_done = true;
+      }
+      desc_cv.notify_all();
+    } catch (...) {
+      {
+        std::lock_guard<std::mutex> lock(desc_mutex);
+        scanner_done = true;
+      }
+      fail_and_notify(std::current_exception());
+    }
+  });
+
+  // Decode threads; the same thread count as the readers is used.
+  std::vector<std::thread> io_workers;
+  io_workers.reserve(io_threads);
+  for (std::size_t t = 0; t < io_threads; ++t) {
+    io_workers.emplace_back(decode_worker, t);
+  }
+  for (auto& reader : readers) {
+    reader.join();
+  }
+  scanner.join();
+  for (auto& worker : io_workers) {
+    worker.join();
+  }
+  if (first_error) std::rethrow_exception(first_error);
+  output_view_size_ = ready_bytes_;
+  section_scanner_->publish_ready(output_view_size_);
+}
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
new file mode 100644
index 0000000000..c1f411111a
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
@@ -0,0 +1,141 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <stdexcept>
+#include <string>
+
+namespace mps_fast {
+
+// Move-only owner for a Linux mmap range. Fixed sub-maps inside a reserved range
+// are still released by unmapping the owning outer range.
+class mmap_region_t {
+ public:
+  // An empty region owns nothing.
+  mmap_region_t() = default;
+  // Takes ownership of an existing mapping of `size` bytes at `ptr`.
+  mmap_region_t(void* ptr, std::size_t size) noexcept : ptr_(ptr), size_(size) {}
+
+  mmap_region_t(const mmap_region_t&)            = delete;
+  mmap_region_t& operator=(const mmap_region_t&) = delete;
+
+  // Moving transfers ownership and leaves the source empty.
+  mmap_region_t(mmap_region_t&& other) noexcept { adopt(other); }
+
+  mmap_region_t& operator=(mmap_region_t&& other) noexcept
+  {
+    if (this == &other) { return *this; }
+    reset();
+    adopt(other);
+    return *this;
+  }
+
+  ~mmap_region_t() { reset(); }
+
+  // Thin wrapper around mmap() that returns an owning region.
+  // Throws std::runtime_error ("mmap failed for <context>: <strerror>") on failure.
+  static mmap_region_t map(
+    void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context)
+  {
+    void* mapped = ::mmap(address, size, prot, flags, fd, offset);
+    if (mapped == MAP_FAILED) { throw_mmap_failure(context); }
+    return mmap_region_t(mapped, size);
+  }
+
+  // Anonymous mapping at a kernel-chosen address; MAP_ANONYMOUS is added to `flags`.
+  static mmap_region_t anonymous(std::size_t size, int prot, int flags, const char* context)
+  {
+    return map(nullptr, size, prot, flags | MAP_ANONYMOUS, -1, 0, context);
+  }
+
+  // Anonymous mapping of `size` bytes whose start address is a multiple of `alignment`.
+  // `alignment` must be a non-zero power of two. The request is over-allocated by `alignment`
+  // bytes and the unused head and tail are unmapped again.
+  // Throws std::runtime_error on an invalid alignment, on size overflow, or when mmap() fails.
+  static mmap_region_t anonymous_aligned(
+    std::size_t size, std::size_t alignment, int prot, int flags, const char* context)
+  {
+    const bool power_of_two = alignment != 0 && (alignment & (alignment - 1)) == 0;
+    if (!power_of_two) {
+      throw std::runtime_error("mmap aligned allocation requires power-of-two alignment");
+    }
+    if (size > std::numeric_limits<std::size_t>::max() - alignment) {
+      throw std::runtime_error("mmap aligned allocation size overflow");
+    }
+
+    const std::size_t padded_size = size + alignment;
+    void* base = ::mmap(nullptr, padded_size, prot, flags | MAP_ANONYMOUS, -1, 0);
+    if (base == MAP_FAILED) { throw_mmap_failure(context); }
+
+    const uintptr_t mask       = static_cast<uintptr_t>(alignment - 1);
+    const uintptr_t base_addr  = reinterpret_cast<uintptr_t>(base);
+    const uintptr_t start_addr = (base_addr + alignment - 1) & ~mask;
+    const std::size_t head     = static_cast<std::size_t>(start_addr - base_addr);
+    const std::size_t tail     = padded_size - head - size;
+    // Give back the slack on both sides of the aligned range.
+    if (head > 0) { ::munmap(base, head); }
+    if (tail > 0) { ::munmap(reinterpret_cast<void*>(start_addr + size), tail); }
+    return mmap_region_t(reinterpret_cast<void*>(start_addr), size);
+  }
+
+  // Maps at exactly `address` (MAP_FIXED is added to `flags`) without creating an owner object.
+  // Throws std::runtime_error ("mmap failed for <context>: <strerror>") on failure.
+  static void map_fixed_or_throw(
+    void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context)
+  {
+    if (::mmap(address, size, prot, flags | MAP_FIXED, fd, offset) == MAP_FAILED) {
+      throw_mmap_failure(context);
+    }
+  }
+
+  // Unmaps the owned range, if any, and becomes empty.
+  void reset() noexcept
+  {
+    const bool owns_mapping = ptr_ != nullptr && size_ != 0;
+    if (owns_mapping) { ::munmap(ptr_, size_); }
+    ptr_  = nullptr;
+    size_ = 0;
+  }
+
+  // Unmaps the owned range, if any, then takes ownership of [ptr, ptr + size).
+  void reset(void* ptr, std::size_t size) noexcept
+  {
+    reset();
+    ptr_  = ptr;
+    size_ = size;
+  }
+
+  // Gives up ownership without unmapping and returns the start address. The size is forgotten.
+  void* release() noexcept
+  {
+    void* released = ptr_;
+    ptr_           = nullptr;
+    size_          = 0;
+    return released;
+  }
+
+  // Applies madvise(advice) to the whole region; a no-op when empty. The result is ignored.
+  void advise(int advice) const noexcept
+  {
+    if (empty()) { return; }
+    ::madvise(ptr_, size_, advice);
+  }
+
+  void* data() noexcept { return ptr_; }
+  const void* data() const noexcept { return ptr_; }
+  char* char_data() noexcept { return static_cast<char*>(ptr_); }
+  const char* char_data() const noexcept { return static_cast<const char*>(ptr_); }
+  std::size_t size() const noexcept { return size_; }
+  bool empty() const noexcept { return ptr_ == nullptr || size_ == 0; }
+  explicit operator bool() const noexcept { return !empty(); }
+
+ private:
+  // Steals the mapping from `other`, leaving it empty. Does not release what *this owns.
+  void adopt(mmap_region_t& other) noexcept
+  {
+    ptr_        = other.ptr_;
+    size_       = other.size_;
+    other.ptr_  = nullptr;
+    other.size_ = 0;
+  }
+
+  // Builds and throws the common mmap error, using the current errno.
+  [[noreturn]] static void throw_mmap_failure(const char* context)
+  {
+    throw std::runtime_error(std::string("mmap failed for ") + context + ": " +
+                             std::strerror(errno));
+  }
+
+  void* ptr_        = nullptr;
+  std::size_t size_ = 0;
+};
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
new file mode 100644
index 0000000000..3ed8763428
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -0,0 +1,413 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "mps_section_scanner.hpp"
+#include "simd_compat.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <initializer_list>
+#include <stdexcept>
+
+namespace mps_fast {
+
+namespace {
+
+bool is_nonblank_column1(unsigned char c) noexcept { return c > ' '; }  // true for any byte value above the space character
+
+simde__m256i nonblank_column1_mask(simde__m256i bytes)  // per-byte mask: all ones where the byte compares greater than ' '
+{
+  return simde_mm256_cmpgt_epi8(bytes, simde_mm256_set1_epi8(' '));  // NOTE(review): epi8 compare is signed, so bytes >= 0x80 yield 0 here, unlike is_nonblank_column1 - confirm intended
+}
+
+// Returns the upper-case MPS section title for `kind`.
+// A value that matches none of the listed enumerators yields an empty string.
+const char* section_name(mps_section_kind kind)
+{
+  const char* title = "";
+  switch (kind) {
+    case mps_section_kind::rows: title = "ROWS"; break;
+    case mps_section_kind::columns: title = "COLUMNS"; break;
+    case mps_section_kind::rhs: title = "RHS"; break;
+    case mps_section_kind::bounds: title = "BOUNDS"; break;
+    case mps_section_kind::ranges: title = "RANGES"; break;
+    case mps_section_kind::quadobj: title = "QUADOBJ"; break;
+    case mps_section_kind::qmatrix: title = "QMATRIX"; break;
+    case mps_section_kind::qcmatrix: title = "QCMATRIX"; break;
+    case mps_section_kind::endata: title = "ENDATA"; break;
+  }
+  return title;
+}
+
+std::size_t section_name_len(mps_section_kind kind) { return std::strlen(section_name(kind)); }  // length of the title text for `kind`
+
+}  // namespace
+
+// Maps a phase to its slot (0..6) in the registry's per-phase arrays.
+// Throws std::runtime_error("invalid MPS phase kind") for a value that matches no enumerator.
+std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase)
+{
+  constexpr std::size_t unassigned = static_cast<std::size_t>(-1);
+  std::size_t slot                 = unassigned;
+  switch (phase) {
+    case mps_phase_kind::header: slot = 0; break;
+    case mps_phase_kind::rows: slot = 1; break;
+    case mps_phase_kind::columns: slot = 2; break;
+    case mps_phase_kind::rhs: slot = 3; break;
+    case mps_phase_kind::bounds: slot = 4; break;
+    case mps_phase_kind::ranges: slot = 5; break;
+    case mps_phase_kind::quadratic: slot = 6; break;
+  }
+  if (slot == unassigned) { throw std::runtime_error("invalid MPS phase kind"); }
+  return slot;
+}
+
+void mps_phase_registry_t::publish(mps_phase_kind phase, mps_phase_range_t range)  // records `range` for `phase`; only the first call per phase has an effect
+{
+  std::size_t idx = phase_index(phase);
+  omp_event_handle_t event{};
+  bool fulfill = false;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (ready_[idx].load(std::memory_order_acquire)) { return; }  // already published: keep the first range
+    ranges_[idx] = range;
+    ready_[idx].store(true, std::memory_order_release);  // the range is written before the flag is set
+    if (has_event_[idx] && !event_fulfilled_[idx]) {  // an event was attached earlier and is still pending
+      event                 = events_[idx];
+      event_fulfilled_[idx] = true;
+      fulfill               = true;
+    }
+  }
+  if (fulfill) { omp_fulfill_event(event); }  // fulfilled after mutex_ has been released
+}
+
+void mps_phase_registry_t::attach_event(mps_phase_kind phase, omp_event_handle_t event)  // registers `event` to be fulfilled when `phase` is published
+{
+  std::size_t idx = phase_index(phase);
+  bool fulfill    = false;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    events_[idx]    = event;  // replaces any handle stored earlier for this phase
+    has_event_[idx] = true;
+    if (ready_[idx].load(std::memory_order_acquire) && !event_fulfilled_[idx]) {  // phase was published before the event was attached
+      event_fulfilled_[idx] = true;
+      fulfill               = true;
+    }
+  }
+  if (fulfill) { omp_fulfill_event(event); }  // fulfilled after mutex_ has been released
+}
+
+bool mps_phase_registry_t::ready(mps_phase_kind phase) const  // true once publish() has stored a range for `phase`
+{
+  return ready_[phase_index(phase)].load(std::memory_order_acquire);
+}
+
+mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const  // returns the stored range for `phase`
+{
+  return ranges_[phase_index(phase)];  // NOTE(review): read without mutex_ or a ready() check - presumably callers wait for ready(phase) first; confirm
+}
+
+// Tests whether the line [line_start, line_end) consists of a section title and nothing else.
+// The title must start at the first byte of the line and may be followed only by spaces, tabs
+// or carriage returns. On a match the section is stored in `*kind` and true is returned;
+// otherwise `*kind` is left untouched and false is returned.
+bool line_is_section(const char* line_start, const char* line_end, mps_section_kind* kind)
+{
+  if (line_start >= line_end) { return false; }
+
+  struct title_t {
+    const char* text;
+    std::size_t length;
+    mps_section_kind kind;
+  };
+  // Titles that share a first letter are listed in the order in which they are tried.
+  static const title_t titles[] = {
+    {"RHS", 3, mps_section_kind::rhs},
+    {"ROWS", 4, mps_section_kind::rows},
+    {"RANGES", 6, mps_section_kind::ranges},
+    {"COLUMNS", 7, mps_section_kind::columns},
+    {"BOUNDS", 6, mps_section_kind::bounds},
+    {"ENDATA", 6, mps_section_kind::endata},
+    {"QUADOBJ", 7, mps_section_kind::quadobj},
+    {"QMATRIX", 7, mps_section_kind::qmatrix},
+    {"QCMATRIX", 8, mps_section_kind::qcmatrix},
+  };
+
+  // The first title that is a prefix of the line becomes the only candidate.
+  const std::size_t line_length = static_cast<std::size_t>(line_end - line_start);
+  const title_t* match          = nullptr;
+  for (const title_t& title : titles) {
+    if (line_length >= title.length && std::memcmp(line_start, title.text, title.length) == 0) {
+      match = &title;
+      break;
+    }
+  }
+  if (match == nullptr) { return false; }
+
+  // Anything other than trailing blanks after the title disqualifies the line.
+  for (const char* tail = line_start + match->length; tail < line_end; ++tail) {
+    const bool blank = *tail == ' ' || *tail == '\t' || *tail == '\r';
+    if (!blank) { return false; }
+  }
+  *kind = match->kind;
+  return true;
+}
+
+mps_section_block_scanner_t::mps_section_block_scanner_t(const char* data,  // start of the buffer that will be scanned
+                                                         std::size_t block_count,  // number of per-block slots to allocate
+                                                         mps_phase_registry_t& registry)  // stored by reference
+  : data_(data),
+    block_count_(block_count),
+    registry_(registry),
+    block_decoded_(std::make_unique<std::atomic<unsigned char>[]>(block_count)),
+    block_begin_offsets_(std::make_unique<std::atomic_size_t[]>(block_count)),
+    block_end_offsets_(std::make_unique<std::atomic_size_t[]>(block_count))
+{
+  for (std::size_t i = 0; i < block_count_; ++i) {  // explicitly zero every per-block slot
+    block_decoded_[i].store(0, std::memory_order_relaxed);
+    block_begin_offsets_[i].store(0, std::memory_order_relaxed);
+    block_end_offsets_[i].store(0, std::memory_order_relaxed);
+  }
+}
+
+std::size_t mps_section_block_scanner_t::section_hit_index(mps_section_kind kind)
+{
+  switch (kind) {  // maps each section kind to its slot in section_hits_
+    case mps_section_kind::rows: return 0;
+    case mps_section_kind::columns: return 1;
+    case mps_section_kind::rhs: return 2;
+    case mps_section_kind::bounds: return 3;
+    case mps_section_kind::ranges: return 4;
+    case mps_section_kind::quadobj: return 5;
+    case mps_section_kind::qmatrix: return 6;
+    case mps_section_kind::qcmatrix: return 7;
+    case mps_section_kind::endata: return 8;
+  }
+  return 0;  // reached only for a value that is not one of the enumerators above
+}
+
+void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, const char* ptr)
+{
+  // First writer wins: only the thread that fills the still-empty slot republishes the ranges.
+  const char* empty = nullptr;
+  auto& hit         = section_hits_[section_hit_index(kind)];
+  const bool stored =
+    hit.compare_exchange_strong(empty, ptr, std::memory_order_release, std::memory_order_acquire);
+  if (stored) { publish_section_ranges(); }
+}
+
+void mps_section_block_scanner_t::scan_section_range(const char* begin,
+                                                     const char* end,
+                                                     bool boundary_scan)
+{
+  (void)boundary_scan;  // not used by the current implementation
+  if (begin >= end) return;
+  const char* p = begin;
+
+  // Interior scans that start inside a decoded block skip the leading partial
+  // line. A separate boundary scan covers section titles whose newline/title
+  // bytes straddle adjacent LZ4 blocks.
+  if (p != data_) {
+    const void* nl = __builtin_memchr(p, '\n', static_cast<std::size_t>(end - p));
+    if (nl == nullptr) { return; }  // no line starts inside this range
+    p = static_cast<const char*>(nl) + 1;
+  }
+  // Runs the exact matcher on one line; a line with no '\n' before `end` is clamped to `end`.
+  auto try_candidate = [&](const char* line_start) {
+    const void* nl = __builtin_memchr(line_start, '\n', static_cast<std::size_t>(end - line_start));
+    const char* line_end = nl == nullptr ? end : static_cast<const char*>(nl);
+    mps_section_kind kind;
+    if (line_is_section(line_start, line_end, &kind)) { record_section_hit(kind, line_start); }
+  };
+
+  // Handle the very first line of a file (NAME indicator, usually)
+  if (p == data_) {
+    if (p < end && is_nonblank_column1(static_cast<unsigned char>(*p))) { try_candidate(p); }
+    ++p;  // keeps the p - 1 reads below inside the buffer
+  }
+
+  // In compliant MPS, indicator records begin in column 1 while data records
+  // begin in column 2+. Treat start-of-file or "\n[nonblank]" as the cheap
+  // candidate signal, then run the exact section matcher only for candidates.
+  const simde__m256i newline = simde_mm256_set1_epi8('\n');
+  while (static_cast<std::size_t>(end - p) >= 32) {
+    simde__m256i current  = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p));
+    simde__m256i previous = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p - 1));
+    std::uint32_t mask = static_cast<std::uint32_t>(simde_mm256_movemask_epi8(simde_mm256_and_si256(
+      simde_mm256_cmpeq_epi8(previous, newline), nonblank_column1_mask(current))));
+    while (mask != 0) {
+      int bit = __builtin_ctz(mask);  // index of the lowest set bit = next candidate offset
+      try_candidate(p + bit);
+      mask &= mask - 1;  // clear the lowest set bit
+    }
+    p += 32;
+  }
+
+  // Scalar tail: the same previous-byte/current-byte test for the remaining < 32 bytes.
+  while (p < end) {
+    if (*(p - 1) == '\n' && is_nonblank_column1(static_cast<unsigned char>(*p))) {
+      try_candidate(p);
+    }
+    ++p;
+  }
+}
+
+void mps_section_block_scanner_t::scan_boundary(std::size_t left_index, std::size_t right_index)
+{
+  // Rescan a window of at most boundary_overlap bytes on each side of the seam between the two
+  // blocks, clamped to the start of the left block and the end of the right block.
+  const std::size_t lo   = block_begin_offsets_[left_index].load(std::memory_order_acquire);
+  const std::size_t seam = block_begin_offsets_[right_index].load(std::memory_order_acquire);
+  const std::size_t hi   = block_end_offsets_[right_index].load(std::memory_order_acquire);
+  const char* first = data_ + (seam - lo > boundary_overlap ? seam - boundary_overlap : lo);
+  const char* last  = data_ + (hi - seam > boundary_overlap ? seam + boundary_overlap : hi);
+  scan_section_range(first, last, true);
+}
+
+void mps_section_block_scanner_t::observe_block(std::size_t block_index,
+                                                const char* begin,
+                                                const char* end)
+{
+  if (block_index >= block_count_) {
+    throw std::runtime_error("MPS section scanner observed invalid LZ4 block index");
+  }
+  // Interior scan first, then publish this block's extent and raise its decoded flag.
+  scan_section_range(begin, end, false);
+  block_begin_offsets_[block_index].store(static_cast<std::size_t>(begin - data_),
+                                          std::memory_order_relaxed);
+  block_end_offsets_[block_index].store(static_cast<std::size_t>(end - data_),
+                                        std::memory_order_relaxed);
+  block_decoded_[block_index].store(1, std::memory_order_seq_cst);
+  // seq_cst, not release/acquire: otherwise two neighbours could both miss each other's flag.
+  if (block_index > 0 && block_decoded_[block_index - 1].load(std::memory_order_seq_cst)) {
+    scan_boundary(block_index - 1, block_index);
+  }
+  if (block_index + 1 < block_count_ &&
+      block_decoded_[block_index + 1].load(std::memory_order_seq_cst)) {
+    scan_boundary(block_index, block_index + 1);
+  }
+}
+
+void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes)
+{
+  ready_bytes_.store(ready_bytes, std::memory_order_release);  // byte offset from data_
+  publish_section_ranges();  // hits at or before the new offset may now be publishable
+}
+
+void mps_section_block_scanner_t::publish_section_ranges()
+{
+  std::lock_guard<std::mutex> lock(publish_mutex_);  // one publisher at a time
+  std::size_t ready     = ready_bytes_.load(std::memory_order_acquire);
+  const char* ready_ptr = data_ + ready;
+  const char* rows =
+    section_hits_[section_hit_index(mps_section_kind::rows)].load(std::memory_order_acquire);
+  const char* columns =
+    section_hits_[section_hit_index(mps_section_kind::columns)].load(std::memory_order_acquire);
+  const char* rhs =
+    section_hits_[section_hit_index(mps_section_kind::rhs)].load(std::memory_order_acquire);
+  const char* bounds =
+    section_hits_[section_hit_index(mps_section_kind::bounds)].load(std::memory_order_acquire);
+  const char* ranges =
+    section_hits_[section_hit_index(mps_section_kind::ranges)].load(std::memory_order_acquire);
+  const char* quadobj =
+    section_hits_[section_hit_index(mps_section_kind::quadobj)].load(std::memory_order_acquire);
+  const char* qmatrix =
+    section_hits_[section_hit_index(mps_section_kind::qmatrix)].load(std::memory_order_acquire);
+  const char* qcmatrix =
+    section_hits_[section_hit_index(mps_section_kind::qcmatrix)].load(std::memory_order_acquire);
+  const char* endata =
+    section_hits_[section_hit_index(mps_section_kind::endata)].load(std::memory_order_acquire);
+  auto available = [&](const char* p) { return p != nullptr && p <= ready_ptr; };
+  bool final_ready =
+    block_count_ == 0 ||
+    (block_decoded_[block_count_ - 1].load(std::memory_order_acquire) &&
+     ready == block_end_offsets_[block_count_ - 1].load(std::memory_order_acquire));
+  const char* final_boundary    = available(endata) ? endata : (final_ready ? ready_ptr : nullptr);
+  auto earliest_available_after = [&](const char* after,
+                                      std::initializer_list<const char*> candidates) {
+    const char* best = nullptr;
+    for (const char* p : candidates) {
+      if (!available(p) || (after != nullptr && p <= after)) { continue; }
+      if (best == nullptr || p < best) { best = p; }
+    }
+    return best;
+  };
+  // header = [data_, ROWS); rows = [ROWS, COLUMNS); columns ends at the next available title.
+  if (available(rows) && !registry_.ready(mps_phase_kind::header)) {
+    registry_.publish(mps_phase_kind::header, {data_, rows, true});
+  }
+  if (available(rows) && available(columns) && !registry_.ready(mps_phase_kind::rows)) {
+    registry_.publish(mps_phase_kind::rows, {rows, columns, true});
+  }
+  if (available(columns) && !registry_.ready(mps_phase_kind::columns)) {
+    const char* columns_end = earliest_available_after(
+      columns, {rhs, ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+    if (columns_end != nullptr) {
+      registry_.publish(mps_phase_kind::columns, {columns, columns_end, true});
+    }
+  }
+  // RHS: published as present once its end is known, or as absent once a later title is seen.
+  if (!registry_.ready(mps_phase_kind::rhs)) {
+    if (available(rhs)) {
+      const char* rhs_end =
+        earliest_available_after(rhs, {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+      if (rhs_end != nullptr) { registry_.publish(mps_phase_kind::rhs, {rhs, rhs_end, true}); }
+    } else {
+      const char* after_columns = earliest_available_after(
+        columns, {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+      if (after_columns != nullptr) {
+        registry_.publish(mps_phase_kind::rhs, {nullptr, nullptr, false});
+      }
+    }
+  }
+  // RANGES: same present/absent rule, looking past RHS (or COLUMNS when no RHS hit exists).
+  if (!registry_.ready(mps_phase_kind::ranges)) {
+    const char* ranges_end =
+      earliest_available_after(ranges, {bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+    const char* after_rhs = earliest_available_after(
+      rhs ? rhs : columns, {bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+    if (available(ranges) && ranges_end != nullptr) {
+      registry_.publish(mps_phase_kind::ranges, {ranges, ranges_end, true});
+    } else if (!ranges && after_rhs != nullptr) {
+      registry_.publish(mps_phase_kind::ranges, {nullptr, nullptr, false});
+    }
+  }
+  // BOUNDS: same rule, looking past RANGES, else RHS, else COLUMNS.
+  if (!registry_.ready(mps_phase_kind::bounds)) {
+    const char* bounds_end =
+      earliest_available_after(bounds, {quadobj, qmatrix, qcmatrix, final_boundary});
+    const char* after_ranges = earliest_available_after(
+      ranges ? ranges : (rhs ? rhs : columns), {quadobj, qmatrix, qcmatrix, final_boundary});
+    if (available(bounds) && bounds_end != nullptr) {
+      registry_.publish(mps_phase_kind::bounds, {bounds, bounds_end, true});
+    } else if (!bounds && after_ranges != nullptr) {
+      registry_.publish(mps_phase_kind::bounds, {nullptr, nullptr, false});
+    }
+  }
+  // Quadratic phase: from the earliest QUADOBJ/QMATRIX/QCMATRIX hit up to the final boundary.
+  if (!registry_.ready(mps_phase_kind::quadratic)) {
+    const char* quadratic_begin = nullptr;
+    if (available(quadobj)) { quadratic_begin = quadobj; }
+    if (available(qmatrix) && (quadratic_begin == nullptr || qmatrix < quadratic_begin)) {
+      quadratic_begin = qmatrix;
+    }
+    if (available(qcmatrix) && (quadratic_begin == nullptr || qcmatrix < quadratic_begin)) {
+      quadratic_begin = qcmatrix;
+    }
+    if (quadratic_begin != nullptr && final_boundary != nullptr) {
+      registry_.publish(mps_phase_kind::quadratic, {quadratic_begin, final_boundary, true});
+    } else if (quadratic_begin == nullptr && final_boundary != nullptr) {
+      registry_.publish(mps_phase_kind::quadratic, {nullptr, nullptr, false});
+    }
+  }
+}
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
new file mode 100644
index 0000000000..0c492b0074
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
@@ -0,0 +1,98 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+
+#include <omp.h>
+
+namespace mps_fast {
+
+enum class mps_section_kind {  // declaration order matches the slots used by section_hit_index()
+  rows,
+  columns,
+  rhs,
+  bounds,
+  ranges,
+  quadobj,
+  qmatrix,
+  qcmatrix,
+  endata,
+};
+
+enum class mps_phase_kind {  // one entry per range published through mps_phase_registry_t
+  header,  // published as [start of data, ROWS title)
+  rows,
+  columns,
+  rhs,
+  bounds,
+  ranges,
+  quadratic,  // QUADOBJ/QMATRIX/QCMATRIX are published together as this single phase
+};
+
+struct mps_phase_range_t {
+  const char* begin = nullptr;  // start of the phase (the scanner passes the section title)
+  const char* end   = nullptr;  // start of whatever follows the phase; nullptr when absent
+  bool present      = false;    // false when the scanner publishes the phase as absent
+};
+
+class mps_phase_registry_t {  // per-phase range and ready flag; members are defined elsewhere
+ public:
+  void publish(mps_phase_kind phase, mps_phase_range_t range);
+  void attach_event(mps_phase_kind phase, omp_event_handle_t event);
+
+  bool ready(mps_phase_kind phase) const;
+  mps_phase_range_t range(mps_phase_kind phase) const;
+
+ private:
+  static constexpr std::size_t phase_count = 7;  // one slot per mps_phase_kind enumerator
+
+  static std::size_t phase_index(mps_phase_kind phase);
+
+  mps_phase_range_t ranges_[phase_count]{};
+  std::atomic<bool> ready_[phase_count]{};
+  omp_event_handle_t events_[phase_count]{};
+  bool has_event_[phase_count]{};
+  bool event_fulfilled_[phase_count]{};
+  mutable std::mutex mutex_;  // NOTE(review): presumably guards the arrays above - confirm
+};
+
+bool line_is_section(const char* line_start, const char* line_end, mps_section_kind* kind);
+
+class mps_section_block_scanner_t {  // finds section titles per block and publishes phases
+ public:
+  mps_section_block_scanner_t(const char* data,
+                              std::size_t block_count,
+                              mps_phase_registry_t& registry);
+
+  void observe_block(std::size_t block_index, const char* begin, const char* end);
+  void publish_ready(std::size_t ready_bytes);
+
+ private:
+  static constexpr std::size_t section_count    = 9;    // one slot per mps_section_kind
+  static constexpr std::size_t boundary_overlap = 128;  // bytes rescanned on each side of a seam
+
+  static std::size_t section_hit_index(mps_section_kind kind);
+
+  void scan_section_range(const char* begin, const char* end, bool boundary_scan);
+  void scan_boundary(std::size_t left_index, std::size_t right_index);
+  void record_section_hit(mps_section_kind kind, const char* ptr);
+  void publish_section_ranges();
+
+  const char* data_        = nullptr;  // base pointer for all stored offsets
+  std::size_t block_count_ = 0;
+  mps_phase_registry_t& registry_;
+  std::mutex publish_mutex_;  // held for the whole of publish_section_ranges()
+  std::unique_ptr<std::atomic<unsigned char>[]> block_decoded_;  // 1 once observe_block() ran
+  std::unique_ptr<std::atomic_size_t[]> block_begin_offsets_;
+  std::unique_ptr<std::atomic_size_t[]> block_end_offsets_;
+  std::atomic_size_t ready_bytes_{0};  // set by publish_ready(); later hits are not published
+  std::atomic<const char*> section_hits_[section_count]{};  // first recorded title per section
+};
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
new file mode 100644
index 0000000000..650d28dbc2
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
@@ -0,0 +1,135 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#ifdef MPS_FAST_NVTX
+#include <nvtx3/nvToolsExt.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+namespace mps_fast::nvtx {
+
+namespace colors {
+constexpr std::uint32_t generic  = 0xff8b949e;
+constexpr std::uint32_t io       = 0xff58a6ff;
+constexpr std::uint32_t decode   = 0xff3fb950;
+constexpr std::uint32_t rows     = 0xffd29922;
+constexpr std::uint32_t columns  = 0xffff7b72;
+constexpr std::uint32_t rhs      = 0xffa371f7;
+constexpr std::uint32_t bounds   = 0xfff0883e;
+constexpr std::uint32_t ranges   = 0xff79c0ff;
+constexpr std::uint32_t names    = 0xff56d364;
+constexpr std::uint32_t alloc    = 0xffdb61a2;
+constexpr std::uint32_t finalize = 0xffc9d1d9;
+}  // namespace colors
+
+inline std::uint32_t color_for_name(std::string_view name) noexcept
+{
+  // Chooses an NVTX colour from substrings of `name`.
+  // The rules below are tried top to bottom, left to right; the first keyword that occurs
+  // anywhere in `name` decides the colour, so earlier rules shadow later ones (for example
+  // "read" is tested before "row"). A name that contains none of the keywords gets
+  // colors::generic.
+  struct keyword_rule_t {
+    std::string_view keyword;
+    std::uint32_t color;
+  };
+  static constexpr keyword_rule_t rules[] = {
+    {"lz4", colors::io}, {"read", colors::io},
+    {"decode", colors::decode}, {"decompress", colors::decode},
+    {"row", colors::rows},
+    {"column", colors::columns}, {"csr", colors::columns},
+    {"rhs", colors::rhs},
+    {"bound", colors::bounds},
+    {"range", colors::ranges},
+    {"name", colors::names}, {"materialize", colors::names},
+    {"alloc", colors::alloc}, {"resize", colors::alloc}, {"mmap", colors::alloc},
+    {"finalize", colors::finalize},
+  };
+  for (const keyword_rule_t& rule : rules) {
+    if (name.find(rule.keyword) != std::string_view::npos) { return rule.color; }
+  }
+  return colors::generic;
+}
+
+class scoped_range {  // pushes an NVTX range on construction and pops it on end()/destruction
+ public:
+  explicit scoped_range(const char* name,
+                        std::uint32_t color    = colors::generic,
+                        std::uint32_t category = 0)
+  {
+    push(name, color, category);
+  }
+
+  explicit scoped_range(std::string name,
+                        std::uint32_t color    = colors::generic,
+                        std::uint32_t category = 0)
+    : owned_name_(std::move(name))  // the object keeps its own copy of the name
+  {
+    push(owned_name_.c_str(), color, category);
+  }
+
+  ~scoped_range() { end(); }
+
+  void end()  // pops the range at most once; does nothing without MPS_FAST_NVTX
+  {
+#ifdef MPS_FAST_NVTX
+    if (active_) {
+      nvtxRangePop();
+      active_ = false;
+    }
+#endif
+  }
+
+  scoped_range(const scoped_range&)            = delete;
+  scoped_range& operator=(const scoped_range&) = delete;
+
+ private:
+  void push(const char* name, std::uint32_t color, std::uint32_t category)
+  {
+#ifdef MPS_FAST_NVTX
+    nvtxEventAttributes_t event{};
+    event.version       = NVTX_VERSION;
+    event.size          = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+    event.colorType     = NVTX_COLOR_ARGB;
+    event.color         = color;
+    event.messageType   = NVTX_MESSAGE_TYPE_ASCII;
+    event.message.ascii = name;
+    event.category      = category;
+    nvtxRangePushEx(&event);
+    active_ = true;  // a matching pop is now owed
+#else
+    (void)name;  // silence unused-parameter warnings in builds without NVTX
+    (void)color;
+    (void)category;
+#endif
+  }
+
+  std::string owned_name_;  // empty when constructed from a const char*
+#ifdef MPS_FAST_NVTX
+  bool active_ = false;
+#endif
+};
+
+inline void name_current_thread(const char* name)  // labels the calling OS thread for NVTX
+{
+#ifdef MPS_FAST_NVTX
+  nvtxNameOsThreadA(static_cast<std::uint32_t>(::syscall(SYS_gettid)), name);
+#else
+  (void)name;  // no-op without MPS_FAST_NVTX
+#endif
+}
+
+}  // namespace mps_fast::nvtx
+
+#define MPS_FAST_NVTX_CONCAT_INNER(a, b) a##b
+#define MPS_FAST_NVTX_CONCAT(a, b)       MPS_FAST_NVTX_CONCAT_INNER(a, b)
+#define MPS_NVTX_RANGE(name, color) \
+  ::mps_fast::nvtx::scoped_range MPS_FAST_NVTX_CONCAT(_mps_nvtx_range_, __LINE__)(name, color)
diff --git a/cpp/src/io/experimental_mps_fast/simd_compat.hpp b/cpp/src/io/experimental_mps_fast/simd_compat.hpp
new file mode 100644
index 0000000000..d81af7a2eb
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/simd_compat.hpp
@@ -0,0 +1,10 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// Use SIMDe's explicit simde_* API. On x86 it can still lower to native
+// intrinsics; on other targets it provides the portable implementation.
+#include <simde/x86/aes.h>
+#include <simde/x86/avx2.h>
+#include <simde/x86/sse4.2.h>
diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp
index af1368865d..12f9ed488a 100644
--- a/cpp/tests/linear_programming/parser_test.cpp
+++ b/cpp/tests/linear_programming/parser_test.cpp
@@ -56,6 +56,21 @@ bool file_exists(const std::string& file)
 
 namespace {
 
+struct mps_reader_param_t {
+  const char* name;
+  mps_reader_type_t reader;
+};
+
+constexpr mps_reader_param_t default_mps_reader_param{"default_reader",
+                                                      mps_reader_type_t::default_reader};
+constexpr mps_reader_param_t fast_mps_reader_param{"fast_experimental",
+                                                   mps_reader_type_t::fast_experimental};
+
+std::string mps_reader_param_name(const ::testing::TestParamInfo<mps_reader_param_t>& info)
+{
+  return info.param.name;
+}
+
 // Non-template forwarding wrapper around read_lp_from_string<int, double>.
 // Exists only so EXPECT_THROW(read_lp_string(R"LP(...)LP"), exc) is parsed
 // correctly — gtest's macro splits its args on top-level commas, and the
@@ -115,14 +130,14 @@ double q_entry(const mps_data_model_t<int, double>& m, int row, int col)
 // ===========================================================================
 // Per-fixture test classes. Each class describes one named problem fixture
 // and owns the checker for that problem's expected parsed data model. The
-// MPS and LP TEST_F cases within a fixture share the same `check_model`
+// MPS TEST_P and LP TEST_F cases within a fixture share the same `check_model`
 // method, so the expected values live in exactly one place per fixture.
 //
 // All fixtures inherit a common base that supplies read_mps_file and
 // read_lp_file helpers.
 // ===========================================================================
 
-class parser_fixture_base : public ::testing::Test {
+class parser_fixture_base : public ::testing::TestWithParam<mps_reader_param_t> {
  protected:
   static mps_data_model_t<int, double> read_mps_file(const std::string& file,
                                                      bool fixed_format = true)
@@ -131,6 +146,18 @@ class parser_fixture_base : public ::testing::Test {
     return read_mps<int, double>(root + "/" + file, fixed_format);
   }
 
+  mps_data_model_t<int, double> read_param_mps_file(const std::string& file,
+                                                    bool fixed_format = true) const
+  {
+    const std::string& root = cuopt::test::get_rapids_dataset_root_dir();
+    const auto reader       = GetParam().reader;
+    // The experimental reader has no fixed/free parser mode. Use the same file but do not force
+    // fixed-format dispatch for that reader.
+    const bool reader_fixed_format =
+      reader == mps_reader_type_t::default_reader ? fixed_format : false;
+    return read<int, double>(root + "/" + file, reader, reader_fixed_format);
+  }
+
   static mps_data_model_t<int, double> read_lp_file(const std::string& file)
   {
     const std::string& root = cuopt::test::get_rapids_dataset_root_dir();
@@ -357,9 +384,13 @@ TEST(mps_parser, bad_mps_files)
   }
 }
 
-TEST_F(good_mps_1_test, mps)
+TEST_P(good_mps_1_test, mps)
+{
+  check_model(read_param_mps_file("linear_programming/good-mps-1.mps", false));
+}
+
+TEST_F(good_mps_1_test, mps_parser_internals)
 {
-  check_model(read_mps_file("linear_programming/good-mps-1.mps"));
   // Parser-struct fields that are MPS-only (not exposed via the data model).
   auto mps = read_from_mps("linear_programming/good-mps-1.mps");
   EXPECT_EQ("good-1", mps.problem_name);
@@ -592,9 +623,13 @@ TEST(mps_parser_free_format, bad_mps_files_free_format)
   }
 }
 
-TEST_F(up_low_bounds_test, mps)
+TEST_P(up_low_bounds_test, mps)
+{
+  check_model(read_param_mps_file("linear_programming/lp_model_with_var_bounds.mps", false));
+}
+
+TEST_F(up_low_bounds_test, mps_parser_internals)
 {
-  check_model(read_mps_file("linear_programming/lp_model_with_var_bounds.mps", false));
   auto mps = read_from_mps("linear_programming/lp_model_with_var_bounds.mps", false);
   EXPECT_EQ("lp_model_with_var_bounds", mps.problem_name);
   EXPECT_EQ("OBJ", mps.objective_name);
@@ -607,16 +642,16 @@ TEST_F(up_low_bounds_test, lp)
   check_model(read_lp_file("linear_programming/lp_model_with_var_bounds.lp"));
 }
 
-TEST_F(good_mps_1_test, mps_free_format)
+TEST_P(good_mps_1_test, mps_free_format)
 {
   // free-format-mps-1.mps encodes the same problem as good-mps-1 with default
   // [0, +inf) bounds (no BOUNDS section), so it satisfies the same checker.
-  check_model(read_mps_file("linear_programming/free-format-mps-1.mps", false));
+  check_model(read_param_mps_file("linear_programming/free-format-mps-1.mps", false));
 }
 
-TEST_F(some_var_bounds_test, mps)
+TEST_P(some_var_bounds_test, mps)
 {
-  check_model(read_mps_file("linear_programming/good-mps-some-var-bounds.mps"));
+  check_model(read_param_mps_file("linear_programming/good-mps-some-var-bounds.mps"));
 }
 
 TEST_F(some_var_bounds_test, lp)
@@ -624,9 +659,9 @@ TEST_F(some_var_bounds_test, lp)
   check_model(read_lp_file("linear_programming/good-mps-some-var-bounds.lp"));
 }
 
-TEST_F(fixed_var_bound_test, mps)
+TEST_P(fixed_var_bound_test, mps)
 {
-  check_model(read_mps_file("linear_programming/good-mps-fixed-var.mps"));
+  check_model(read_param_mps_file("linear_programming/good-mps-fixed-var.mps"));
 }
 
 TEST_F(fixed_var_bound_test, lp)
@@ -634,9 +669,9 @@ TEST_F(fixed_var_bound_test, lp)
   check_model(read_lp_file("linear_programming/good-mps-fixed-var.lp"));
 }
 
-TEST_F(free_var_bound_test, mps)
+TEST_P(free_var_bound_test, mps)
 {
-  check_model(read_mps_file("linear_programming/good-mps-free-var.mps"));
+  check_model(read_param_mps_file("linear_programming/good-mps-free-var.mps"));
 }
 
 TEST_F(free_var_bound_test, lp)
@@ -644,9 +679,9 @@ TEST_F(free_var_bound_test, lp)
   check_model(read_lp_file("linear_programming/good-mps-free-var.lp"));
 }
 
-TEST_F(lower_inf_var_bound_test, mps)
+TEST_P(lower_inf_var_bound_test, mps)
 {
-  check_model(read_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps"));
+  check_model(read_param_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps"));
 }
 
 TEST_F(lower_inf_var_bound_test, lp)
@@ -662,9 +697,9 @@ TEST(mps_bounds, rhs_cost)
   EXPECT_EQ(int(-5), mps.objective_offset_value);
 }
 
-TEST_F(upper_inf_var_bound_test, mps)
+TEST_P(upper_inf_var_bound_test, mps)
 {
-  check_model(read_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps"));
+  check_model(read_param_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps"));
 }
 
 TEST_F(upper_inf_var_bound_test, lp)
@@ -817,9 +852,13 @@ TEST(mps_bounds, unsupported_or_invalid_mps_types)
   };
 }
 
-TEST_F(mip_with_bounds_test, mps)
+TEST_P(mip_with_bounds_test, mps)
+{
+  check_model(read_param_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false));
+}
+
+TEST_F(mip_with_bounds_test, mps_parser_internals)
 {
-  check_model(read_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false));
   auto mps = read_from_mps("mixed_integer_programming/good-mip-mps-1.mps", false);
   EXPECT_EQ("COST", mps.objective_name);
   ASSERT_EQ(int(2), mps.row_types.size());
@@ -877,9 +916,9 @@ TEST(mps_parser, good_mps_file_mip_no_marker)
   EXPECT_EQ(10., mps.variable_upper_bounds[1]);
 }
 
-TEST_F(mip_no_bounds_test, mps)
+TEST_P(mip_no_bounds_test, mps)
 {
-  check_model(read_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false));
+  check_model(read_param_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false));
 }
 
 TEST_F(mip_no_bounds_test, lp)
@@ -887,9 +926,10 @@ TEST_F(mip_no_bounds_test, lp)
   check_model(read_lp_file("mixed_integer_programming/good-mip-mps-no-bounds.lp"));
 }
 
-TEST_F(mip_partial_bounds_test, mps)
+TEST_P(mip_partial_bounds_test, mps)
 {
-  check_model(read_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false));
+  check_model(
+    read_param_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false));
 }
 
 TEST_F(mip_partial_bounds_test, lp)
@@ -897,6 +937,25 @@ TEST_F(mip_partial_bounds_test, lp)
   check_model(read_lp_file("mixed_integer_programming/good-mip-mps-partial-bounds.lp"));
 }
 
+#define INSTANTIATE_MPS_READER_TEST(Fixture)                                                   \
+  INSTANTIATE_TEST_SUITE_P(mps_readers,                                                        \
+                           Fixture,                                                            \
+                           ::testing::Values(default_mps_reader_param, fast_mps_reader_param), \
+                           mps_reader_param_name)
+
+INSTANTIATE_MPS_READER_TEST(good_mps_1_test);
+INSTANTIATE_MPS_READER_TEST(up_low_bounds_test);
+INSTANTIATE_MPS_READER_TEST(some_var_bounds_test);
+INSTANTIATE_MPS_READER_TEST(fixed_var_bound_test);
+INSTANTIATE_MPS_READER_TEST(free_var_bound_test);
+INSTANTIATE_MPS_READER_TEST(lower_inf_var_bound_test);
+INSTANTIATE_MPS_READER_TEST(upper_inf_var_bound_test);
+INSTANTIATE_MPS_READER_TEST(mip_with_bounds_test);
+INSTANTIATE_MPS_READER_TEST(mip_no_bounds_test);
+INSTANTIATE_MPS_READER_TEST(mip_partial_bounds_test);
+
+#undef INSTANTIATE_MPS_READER_TEST
+
 #ifdef MPS_PARSER_WITH_BZIP2
 TEST(mps_parser, good_mps_file_bzip2_compressed)
 {

From 68daf3d31af007cb36fb44fe7a45da67b962ee23 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Wed, 3 Jun 2026 04:17:59 -0700
Subject: [PATCH 02/22] thread count cap

---
 cpp/src/io/experimental_mps_fast/fast_parser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index bce17a435f..ae881bebe2 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -2591,7 +2591,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
   int header_done = 0, rows_done = 0, columns_done = 0;
   int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0;
 
-#pragma omp parallel num_threads(omp_get_max_threads())
+#pragma omp parallel num_threads(std::min(32, omp_get_max_threads()))
   {
     std::string thread_name = "omp-parser-" + std::to_string(omp_get_thread_num());
     nvtx::name_current_thread(thread_name.c_str());

From eb0e285da92169aff0cf94f9978d1fd8b96bc7e2 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Fri, 5 Jun 2026 02:42:06 -0700
Subject: [PATCH 03/22] fix crashes, more opti

---
 .../fast_parse_primitives.hpp                 | 231 ++----
 .../io/experimental_mps_fast/fast_parser.cpp  | 705 +++++++++++++-----
 .../fast_parser_adapter.cpp                   |   6 +
 .../io/experimental_mps_fast/file_reader.cpp  |  52 +-
 .../experimental_mps_fast/lz4_file_reader.cpp |  33 +-
 .../experimental_mps_fast/perf_counters.hpp   | 163 ++++
 6 files changed, 810 insertions(+), 380 deletions(-)
 create mode 100644 cpp/src/io/experimental_mps_fast/perf_counters.hpp

diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
index 9da59e7b44..453687df01 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -26,124 +26,6 @@
 
 namespace mps_fast {
 
-// double values in MPS data rarely need more than this many fractional digits.
-inline constexpr double decimals[16][10] = {
-  {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9},
-  {0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09},
-  {0.000, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009},
-  {0.0000, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009},
-  {0.00000, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009},
-  {0.000000,
-   0.000001,
-   0.000002,
-   0.000003,
-   0.000004,
-   0.000005,
-   0.000006,
-   0.000007,
-   0.000008,
-   0.000009},
-  {0.0000000,
-   0.0000001,
-   0.0000002,
-   0.0000003,
-   0.0000004,
-   0.0000005,
-   0.0000006,
-   0.0000007,
-   0.0000008,
-   0.0000009},
-  {0.00000000,
-   0.00000001,
-   0.00000002,
-   0.00000003,
-   0.00000004,
-   0.00000005,
-   0.00000006,
-   0.00000007,
-   0.00000008,
-   0.00000009},
-  {0.000000000,
-   0.000000001,
-   0.000000002,
-   0.000000003,
-   0.000000004,
-   0.000000005,
-   0.000000006,
-   0.000000007,
-   0.000000008,
-   0.000000009},
-  {0.0000000000,
-   0.0000000001,
-   0.0000000002,
-   0.0000000003,
-   0.0000000004,
-   0.0000000005,
-   0.0000000006,
-   0.0000000007,
-   0.0000000008,
-   0.0000000009},
-  {0.00000000000,
-   0.00000000001,
-   0.00000000002,
-   0.00000000003,
-   0.00000000004,
-   0.00000000005,
-   0.00000000006,
-   0.00000000007,
-   0.00000000008,
-   0.00000000009},
-  {0.000000000000,
-   0.000000000001,
-   0.000000000002,
-   0.000000000003,
-   0.000000000004,
-   0.000000000005,
-   0.000000000006,
-   0.000000000007,
-   0.000000000008,
-   0.000000000009},
-  {0.0000000000000,
-   0.0000000000001,
-   0.0000000000002,
-   0.0000000000003,
-   0.0000000000004,
-   0.0000000000005,
-   0.0000000000006,
-   0.0000000000007,
-   0.0000000000008,
-   0.0000000000009},
-  {0.00000000000000,
-   0.00000000000001,
-   0.00000000000002,
-   0.00000000000003,
-   0.00000000000004,
-   0.00000000000005,
-   0.00000000000006,
-   0.00000000000007,
-   0.00000000000008,
-   0.00000000000009},
-  {0.000000000000000,
-   0.000000000000001,
-   0.000000000000002,
-   0.000000000000003,
-   0.000000000000004,
-   0.000000000000005,
-   0.000000000000006,
-   0.000000000000007,
-   0.000000000000008,
-   0.000000000000009},
-  {0.0000000000000000,
-   0.0000000000000001,
-   0.0000000000000002,
-   0.0000000000000003,
-   0.0000000000000004,
-   0.0000000000000005,
-   0.0000000000000006,
-   0.0000000000000007,
-   0.0000000000000008,
-   0.0000000000000009}};
-
 inline constexpr int EXP10_TABLE_MAX = 308;
 
 constexpr double constexpr_pow10(int exp)
@@ -173,42 +55,9 @@ inline constexpr auto table_exp10 = make_exp10_table();
 
 static inline bool is_digit_byte(char c) noexcept { return c >= '0' && c <= '9'; }
 
-static inline double fast_frac_atoi(const char*& data, const char* end)
-{
-  double val = 0.0;
-
-#define MPS_FAST_FRAC_DIGIT(i)                                   \
-  do {                                                           \
-    if (data >= end || !is_digit_byte(*data)) return val;        \
-    val += decimals[i][static_cast<unsigned char>(*data) & 0xF]; \
-    ++data;                                                      \
-  } while (0)
-
-  MPS_FAST_FRAC_DIGIT(0);
-  MPS_FAST_FRAC_DIGIT(1);
-  MPS_FAST_FRAC_DIGIT(2);
-  MPS_FAST_FRAC_DIGIT(3);
-  MPS_FAST_FRAC_DIGIT(4);
-  MPS_FAST_FRAC_DIGIT(5);
-  MPS_FAST_FRAC_DIGIT(6);
-  MPS_FAST_FRAC_DIGIT(7);
-  MPS_FAST_FRAC_DIGIT(8);
-  MPS_FAST_FRAC_DIGIT(9);
-  MPS_FAST_FRAC_DIGIT(10);
-  MPS_FAST_FRAC_DIGIT(11);
-  MPS_FAST_FRAC_DIGIT(12);
-  MPS_FAST_FRAC_DIGIT(13);
-  MPS_FAST_FRAC_DIGIT(14);
-  MPS_FAST_FRAC_DIGIT(15);
-
-#undef MPS_FAST_FRAC_DIGIT
-
-  while (data < end && is_digit_byte(*data)) {
-    ++data;
-  }
-  return val;
-}
-
+// Deliberately minimal scalar parser. It could be sped up with SIMD/SWAR digit
+// scanning or the Eisel-Lemire algorithm, but that would have to be validated through
+// benchmarking; MPS files usually use simple enough coefficients.
 static inline double fast_atof_core(const char*& data, const char* end)
 {
   double sign = 1.0;
@@ -219,17 +68,32 @@ static inline double fast_atof_core(const char*& data, const char* end)
     ++data;
   }
 
-  uint64_t int_part = 0;
-  while (data < end && is_digit_byte(*data)) {
-    int_part = int_part * 10 + (*data - '0');
-    ++data;
-  }
-
-  double result = static_cast<double>(int_part);
-
-  if (data < end && *data == '.') {
-    ++data;
-    result += fast_frac_atoi(data, end);
+  uint64_t significand   = 0;
+  int decimal_exponent   = 0;
+  int significant_digits = 0;
+  bool seen_dot          = false;
+
+  while (data < end) {  // accumulate mantissa digits and track the power-of-ten scale
+    char c = *data;
+    if (is_digit_byte(c)) {
+      int digit = c - '0';
+      if (seen_dot) { --decimal_exponent; }  // each fractional digit scales by 10^-1
+      if (significand != 0 || digit != 0) {  // leading zeros are not significant
+        // uint64_t holds 19 decimal digits without overflow (FP64 needs at most 17).
+        if (significant_digits < 19) {
+          significand = significand * 10 + static_cast<uint64_t>(digit);
+          ++significant_digits;
+        } else {
+          ++decimal_exponent;  // digit dropped: keep the scale (undoes the decrement above)
+        }
+      }
+      ++data;
+    } else if (c == '.' && !seen_dot) {
+      seen_dot = true;
+      ++data;
+    } else {
+      break;
+    }
+  }
 
   if (data < end && (*data == 'e' || *data == 'E' || *data == 'd' || *data == 'D')) {
@@ -249,11 +113,14 @@ static inline double fast_atof_core(const char*& data, const char* end)
     }
 
     exponent *= exp_sign;
-    if (exponent >= -EXP10_TABLE_MAX && exponent <= EXP10_TABLE_MAX) {
-      result *= table_exp10[static_cast<size_t>(exponent + EXP10_TABLE_MAX)];
-    } else {
-      result *= std::pow(10.0, exponent);
-    }
+    decimal_exponent += exponent;
+  }
+
+  double result = static_cast<double>(significand);
+  if (decimal_exponent >= -EXP10_TABLE_MAX && decimal_exponent <= EXP10_TABLE_MAX) {
+    result *= table_exp10[static_cast<size_t>(decimal_exponent + EXP10_TABLE_MAX)];
+  } else {
+    result *= std::pow(10.0, decimal_exponent);
   }
 
   return sign * result;
@@ -352,17 +219,29 @@ struct cursor_t {
 
   void skip_ws() { ptr = simd_scan<true>(ptr, end); }
 
+  // True when ptr sits on a line terminator byte ('\n' or '\r').
+  bool eol() const { return ptr < end && (*ptr == '\n' || *ptr == '\r'); }
+
+  // Consumes one line terminator at ptr: "\r\n", a lone '\r', or a lone '\n'.
+  void consume_eol()
+  {
+    if (ptr >= end) { return; }
+    const bool was_cr = *ptr == '\r';
+    if (was_cr || *ptr == '\n') { ptr++; }
+    if (was_cr && ptr < end && *ptr == '\n') { ptr++; }
+  }
+
   void skip_comment_line()
   {
-    while (!done() && *ptr != '\n') {
+    while (!done() && *ptr != '\n' && *ptr != '\r') {
       ptr++;
     }
-    if (!done()) ptr++;
+    consume_eol();
   }
 
   void skip_to_eol()
   {
-    while (!done() && *ptr != '\n') {
+    while (!done() && *ptr != '\n' && *ptr != '\r') {
       ptr++;
     }
   }
@@ -480,8 +359,6 @@ struct cursor_t {
     return {std::string_view(field1_start, field1_end_off),
             std::string_view(field1_start + field2_start_off, field2_end_off - field2_start_off)};
   }
-
-  bool eol() const { return ptr < end && *ptr == '\n'; }
 };
 
 static inline void expect(cursor_t& cursor, const char* field)
@@ -494,7 +371,7 @@ static inline void accept_comment_line(cursor_t& cursor)
 {
   for (;;) {
     while (!cursor.done() && cursor.eol()) {
-      cursor.advance(1);
+      cursor.consume_eol();
     }
     if (cursor.done() || (cursor.ptr[0] != '*' && cursor.ptr[0] != '$')) { return; }
     cursor.skip_comment_line();
@@ -507,7 +384,7 @@ static inline void expect_eol(cursor_t& cursor)
 
   for (;;) {
     while (cursor.eol()) {
-      cursor.advance(1);
+      cursor.consume_eol();
     }
     if (__unlikely(cursor.done())) { return; }
 
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index ae881bebe2..35a67346c3 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -28,6 +28,7 @@
 #include <cstring>
 #include <exception>
 #include <limits>
+#include <map>
 #include <memory>
 #include <mutex>
 #include <stdexcept>
@@ -44,18 +45,39 @@
 
 namespace mps_fast {
 
-static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS       = 4096;
-static constexpr int MPS_ROWS_THREAD_CAP                  = 16;
-static constexpr int MPS_COLUMNS_THREAD_CAP               = 32;
-static constexpr int MPS_BOUNDS_THREAD_CAP                = 32;
-static constexpr int MPS_NAMES_THREAD_CAP                 = 16;
-static constexpr size_t MPS_BOUNDS_PARALLEL_INIT_MIN_VARS = 16 * 1024 * 1024;
-static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES     = 256ull * 1024ull * 1024ull;
-static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES       = 1 * 1024 * 1024;
+static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS                = 4096;
+static constexpr int MPS_ROWS_THREAD_CAP                           = 16;
+static constexpr int MPS_COLUMNS_THREAD_CAP                        = 32;
+static constexpr int MPS_BOUNDS_THREAD_CAP                         = 32;
+static constexpr int MPS_NAMES_THREAD_CAP                          = 16;
+static constexpr size_t MPS_BOUNDS_PARALLEL_INIT_MIN_VARS          = 16 * 1024 * 1024;
+static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES              = 256ull * 1024ull * 1024ull;
+static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8ull * 1024ull * 1024ull;
+static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES                = 1 * 1024 * 1024;
+static constexpr size_t MPS_SMALL_RAW_FILE_BYTES                   = 4ull * 1024ull * 1024ull;
+static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES     = 100ull * 1000ull * 1000ull;
+static constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS          = 64ull * 1024ull;
+static constexpr size_t MPS_ROW_HASH_PARTITIONS                    = 32;
+static constexpr int MPS_ROW_HASH_PARTITION_BITS                   = 5;
+static constexpr int MPS_SMALL_FILE_THREAD_CAP                     = 16;
+static constexpr int MPS_LARGE_FILE_THREAD_CAP                     = 32;
+
+static int parser_thread_cap_for_size(size_t bytes)
+{
+  int size_cap = bytes < MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES ? MPS_SMALL_FILE_THREAD_CAP
+                                                                : MPS_LARGE_FILE_THREAD_CAP;
+  return std::max(1, std::min(size_cap, omp_get_max_threads()));
+}
 
 static int phase_thread_count(int phase_cap)
 {
-  return std::max(1, std::min(phase_cap, omp_get_max_threads()));
+  const int available_threads = omp_in_parallel() ? omp_get_num_threads() : omp_get_max_threads();
+  return std::max(1, std::min(phase_cap, available_threads));
+}
+
+static inline size_t row_hash_partition_for(uint32_t hash)
+{
+  return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS));
 }
 
 // =============================================================================
@@ -82,12 +104,14 @@ static std::mutex& get_timer_mutex()
 
 static void flush_timers()
 {
+#ifdef MPS_FAST_TIMERS
   std::lock_guard<std::mutex> lock(get_timer_mutex());
   auto& buffer = get_timer_buffer();
   for (const auto& entry : buffer) {
     std::fprintf(stderr, "[TIMER] %s: %.3f ms\n", entry.name, entry.elapsed_ms);
   }
   buffer.clear();
+#endif
 }
 
 static size_t system_page_size()
@@ -144,60 +168,44 @@ static void materialize_vector_hugepages(const char* label,
 class scoped_timer_t {
  public:
   scoped_timer_t(const char* name, double* accumulator = nullptr)
+#ifdef MPS_FAST_TIMERS
     : name_(name),
       accumulator_(accumulator),
       nvtx_(name, nvtx::color_for_name(name)),
-      start_(std::chrono::high_resolution_clock::now())
+      start_(std::chrono::high_resolution_clock::now()){}
+#else
+    : accumulator_(accumulator)
   {
+    (void)name;
   }
+#endif
 
-  ~scoped_timer_t()
+      ~scoped_timer_t()
   {
+#ifdef MPS_FAST_TIMERS
     auto end          = std::chrono::high_resolution_clock::now();
     double elapsed_ms = std::chrono::duration<double, std::milli>(end - start_).count();
     nvtx_.end();
     if (accumulator_) { *accumulator_ += elapsed_ms; }
     std::lock_guard<std::mutex> lock(get_timer_mutex());
     get_timer_buffer().push_back({name_, elapsed_ms});
+#endif
   }
 
   scoped_timer_t(const scoped_timer_t&)            = delete;
   scoped_timer_t& operator=(const scoped_timer_t&) = delete;
 
  private:
+#ifdef MPS_FAST_TIMERS
   const char* name_;
+#endif
   double* accumulator_;
+#ifdef MPS_FAST_TIMERS
   nvtx::scoped_range nvtx_;
   std::chrono::high_resolution_clock::time_point start_;
+#endif
 };
 
-static inline bool section_token_matches(const char* p,
-                                         const char* end,
-                                         const char* token,
-                                         size_t len)
-{
-  return (size_t)(end - p) >= len && std::memcmp(p, token, len) == 0 &&
-         ((size_t)(end - p) == len || p[len] <= ' ');
-}
-
-static inline bool is_quadratic_section_start(const char* p, const char* end)
-{
-  return section_token_matches(p, end, "QUADOBJ", 7) ||
-         section_token_matches(p, end, "QMATRIX", 7) ||
-         section_token_matches(p, end, "QCMATRIX", 8);
-}
-
-static inline bool is_rhs_section_end(const char* p, const char* end)
-{
-  switch (p[0]) {
-    case 'B': return std::memcmp(p, "BOUNDS", 6) == 0 && p[6] <= ' ';
-    case 'Q': return is_quadratic_section_start(p, end);
-    case 'R': return std::memcmp(p, "RANGES", 6) == 0 && p[6] <= ' ';
-    case 'E': return std::memcmp(p, "ENDATA", 6) == 0 && p[6] <= ' ';
-    default: return false;
-  }
-}
-
 static inline void error_unknown_row(cursor_t& cursor, const char* row_start, const char* section)
 {
   const char* row_end = row_start;
@@ -287,6 +295,12 @@ static inline bool dense_suffix_width_ok(uint64_t value,
 
 template <typename i_t, typename f_t>
 struct parse_state_t {
+  struct row_hash_partition_t {
+    hash_slot_var_t* slots = nullptr;
+    size_t buckets         = 0;
+    size_t mask            = 0;
+  };
+
   cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& problem;
   cursor_t& cursor;
 
@@ -309,7 +323,9 @@ struct parse_state_t {
   size_t row_hash_buckets = 0;
   size_t row_hash_mask    = 0;  // buckets - 1, for fast modulo via &
   mmap_region_t row_hash_region;
-  hash_slot_var_t* row_names_ht = nullptr;
+  hash_slot_var_t* row_names_ht                                                 = nullptr;
+  size_t row_hash_partition_count                                               = 0;
+  std::array<row_hash_partition_t, MPS_ROW_HASH_PARTITIONS> row_hash_partitions = {};
   // Overflow map for row names longer than HASH_KEY_BYTES
   std::unordered_map<std::string_view, size_t, string_view_hash> row_names_long;
 
@@ -326,6 +342,15 @@ struct parse_state_t {
   // var_names still uses STL (only used in parse_bounds, not as hot)
   std::unordered_map<std::string_view, size_t, string_view_hash> var_names_map;
 
+  struct bounds_only_var_t {
+    f_t lb    = f_t{0};
+    f_t ub    = std::numeric_limits<f_t>::infinity();
+    char type = 'C';
+  };
+
+  // Some writers introduce zero-column variables only in BOUNDS.
+  std::map<std::string_view, bounds_only_var_t> bounds_only_vars;
+
   parse_state_t(cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& p, cursor_t& c)
     : problem(p), cursor(c)
   {
@@ -423,13 +448,73 @@ struct parse_state_t {
     return true;
   }
 
+  size_t row_hash_bucket_count_for(size_t n_rows) const
+  {
+#ifdef MPS_FAST_COMPACT_ROW_HASH
+    // Keep the row hash compact. Probe counts are usually low, and a smaller
+    // table reduces cache/TLB footprint on medium instances.
+    return next_power_of_2(std::max(n_rows + n_rows / 2, (size_t)64));
+#else
+    // Original conservative sizing policy.
+    return next_power_of_2(std::max((size_t)(n_rows * 2), (size_t)64));
+#endif
+  }
+
   void init_row_hash_table_impl()
   {
     scoped_timer_t timer("row_hash_init_total");
-    size_t n_rows = row_names_sv.size();
-    // load factor 50%
-    row_hash_buckets          = next_power_of_2(std::max((size_t)(n_rows * 2), (size_t)64));
-    row_hash_mask             = row_hash_buckets - 1;
+    size_t n_rows              = row_names_sv.size();
+    const int num_threads      = phase_thread_count(MPS_ROWS_THREAD_CAP);
+    const bool use_partitioned = n_rows >= MPS_ROW_HASH_PARTITIONED_MIN_ROWS && num_threads > 1;
+    std::vector<uint32_t> row_hashes;
+    std::vector<size_t> row_order;
+    std::array<size_t, MPS_ROW_HASH_PARTITIONS> partition_counts      = {};
+    std::array<size_t, MPS_ROW_HASH_PARTITIONS + 1> partition_offsets = {};
+
+    if (use_partitioned) {
+      scoped_timer_t timer("row_hash_partition_metadata");
+      row_hashes.resize(n_rows);
+      size_t inline_rows = 0;
+      for (size_t idx = 0; idx < n_rows; ++idx) {
+        std::string_view name = row_names_sv[idx];
+        if (__unlikely(name.size() > HASH_KEY_BYTES)) {
+          row_names_long[name] = idx;
+          continue;
+        }
+        uint32_t hash   = fnv1a_hash(name.data(), name.size());
+        row_hashes[idx] = hash;
+        ++partition_counts[row_hash_partition_for(hash)];
+        ++inline_rows;
+      }
+
+      for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
+        partition_offsets[p + 1] = partition_offsets[p] + partition_counts[p];
+      }
+
+      row_order.resize(inline_rows);
+      auto next_offsets = partition_offsets;
+      for (size_t idx = 0; idx < n_rows; ++idx) {
+        if (__unlikely(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; }
+        size_t part                     = row_hash_partition_for(row_hashes[idx]);
+        row_order[next_offsets[part]++] = idx;
+      }
+    }
+
+    if (use_partitioned) {
+      row_hash_partition_count = MPS_ROW_HASH_PARTITIONS;
+      size_t total_buckets     = 0;
+      for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
+        row_hash_partitions[p].buckets = row_hash_bucket_count_for(partition_counts[p]);
+        row_hash_partitions[p].mask    = row_hash_partitions[p].buckets - 1;
+        total_buckets += row_hash_partitions[p].buckets;
+      }
+      row_hash_buckets = total_buckets;
+      row_hash_mask    = row_hash_buckets - 1;
+    } else {
+      row_hash_partition_count = 0;
+      row_hash_buckets         = row_hash_bucket_count_for(n_rows);
+      row_hash_mask            = row_hash_buckets - 1;
+    }
     size_t row_hash_mmap_size = row_hash_buckets * sizeof(hash_slot_var_t);
 
     {
@@ -438,6 +523,13 @@ struct parse_state_t {
       row_hash_region = mmap_region_t::anonymous(
         row_hash_mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, "row hash table");
       row_names_ht = static_cast<hash_slot_var_t*>(row_hash_region.data());
+      if (use_partitioned) {
+        hash_slot_var_t* next_slots = row_names_ht;
+        for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
+          row_hash_partitions[p].slots = next_slots;
+          next_slots += row_hash_partitions[p].buckets;
+        }
+      }
       // Request huge pages to reduce TLB misses
       row_hash_region.advise(MADV_HUGEPAGE);
     }
@@ -453,9 +545,86 @@ struct parse_state_t {
 
     {
       scoped_timer_t timer("row_hash_insert_all");
-      for (size_t idx = 0; idx < n_rows; ++idx) {
-        row_insert(row_names_sv[idx], idx);
+#ifdef MPS_FAST_PERF_COUNTERS
+      size_t total_probes = 0;
+      size_t max_probes   = 0;
+      size_t long_names   = row_names_long.size();
+#endif
+      if (use_partitioned) {
+        scoped_timer_t timer("row_hash_insert_partitioned");
+#ifdef MPS_FAST_PERF_COUNTERS
+        std::vector<perf_counter_snapshot_t> perf_snapshots(MPS_ROW_HASH_PARTITIONS);
+        std::vector<size_t> partition_total_probes(MPS_ROW_HASH_PARTITIONS, 0);
+        std::vector<size_t> partition_max_probes(MPS_ROW_HASH_PARTITIONS, 0);
+#endif
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+        for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) {
+          size_t p = (size_t)part_id;
+#ifdef MPS_FAST_PERF_COUNTERS
+          thread_perf_counters_t perf_counters;
+          size_t local_total_probes = 0;
+          size_t local_max_probes   = 0;
+#endif
+          const auto& part = row_hash_partitions[p];
+          for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) {
+            size_t idx = row_order[pos];
+#ifdef MPS_FAST_PERF_COUNTERS
+            size_t probes = row_insert_into(
+              part.slots, part.buckets, part.mask, row_names_sv[idx], row_hashes[idx], idx);
+            local_total_probes += probes;
+            local_max_probes = std::max(local_max_probes, probes);
+#else
+            row_insert_into(
+              part.slots, part.buckets, part.mask, row_names_sv[idx], row_hashes[idx], idx);
+#endif
+          }
+#ifdef MPS_FAST_PERF_COUNTERS
+          partition_total_probes[p] = local_total_probes;
+          partition_max_probes[p]   = local_max_probes;
+          perf_snapshots[p]         = perf_counters.stop();
+#endif
+        }
+#ifdef MPS_FAST_PERF_COUNTERS
+        for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
+          total_probes += partition_total_probes[p];
+          max_probes = std::max(max_probes, partition_max_probes[p]);
+        }
+        print_perf_totals("row_hash_insert_partitioned", perf_snapshots);
+#endif
+      } else {
+#ifdef MPS_FAST_PERF_COUNTERS
+        thread_perf_counters_t perf_counters;
+#endif
+        for (size_t idx = 0; idx < n_rows; ++idx) {
+#ifdef MPS_FAST_PERF_COUNTERS
+          size_t probes = row_insert(row_names_sv[idx], idx);
+          if (probes == 0) {
+            ++long_names;
+          } else {
+            total_probes += probes;
+            max_probes = std::max(max_probes, probes);
+          }
+#else
+          row_insert(row_names_sv[idx], idx);
+#endif
+        }
+#ifdef MPS_FAST_PERF_COUNTERS
+        print_perf_totals("row_hash_insert_all", {perf_counters.stop()});
+#endif
       }
+#ifdef MPS_FAST_PERF_COUNTERS
+      size_t probed_rows = n_rows - long_names;
+      double mean_probes = probed_rows == 0 ? 0.0 : (double)total_probes / (double)probed_rows;
+      double load_factor = row_hash_buckets == 0 ? 0.0 : (double)n_rows / (double)row_hash_buckets;
+      std::fprintf(stderr,
+                   "[ROW_HASH_PROBES] rows=%zu buckets=%zu load=%.3f long=%zu mean=%.3f max=%zu\n",
+                   n_rows,
+                   row_hash_buckets,
+                   load_factor,
+                   long_names,
+                   mean_probes,
+                   max_probes);
+#endif
     }
 
     // Force the kernel to please please collapse the page range into THP pages
@@ -546,13 +715,21 @@ struct parse_state_t {
       auto it = row_names_long.find(name);
       return it != row_names_long.end() ? it->second : SIZE_MAX;
     }
-    hash_key_t key               = make_key(name.data(), name.size());
-    uint32_t hash                = fnv1a_hash(name.data(), name.size()) & (uint32_t)row_hash_mask;
-    const hash_slot_var_t* slots = row_names_ht;
-    const hash_slot_var_t* slot  = &slots[hash];
+    hash_key_t key = make_key(name.data(), name.size());
+    uint32_t hash  = fnv1a_hash(name.data(), name.size());
+    if (__likely(row_hash_partition_count != 0)) {
+      const auto& part = row_hash_partitions[row_hash_partition_for(hash)];
+      return row_lookup_in(part.slots, part.buckets, part.mask, key, hash);
+    }
+    return row_lookup_in(row_names_ht, row_hash_buckets, row_hash_mask, key, hash);
+  }
 
-    for (size_t i = 0; i < row_hash_buckets; ++i, ++slot) {
-      if (slot >= &slots[row_hash_buckets]) { slot = &slots[0]; }
+  size_t row_lookup_in(
+    const hash_slot_var_t* slots, size_t buckets, size_t mask, hash_key_t key, uint32_t hash) const
+  {
+    const hash_slot_var_t* slot = &slots[hash & (uint32_t)mask];
+    for (size_t i = 0; i < buckets; ++i, ++slot) {
+      if (slot >= &slots[buckets]) { slot = &slots[0]; }
       if (slot->count == 0) { return SIZE_MAX; }
       if (key_cmpeq(slot->key, key)) { return slot->count - 1; }
     }
@@ -593,27 +770,39 @@ struct parse_state_t {
     std::memcpy(suffix, digits_buf, digits_len);
   }
 
-  void row_insert(std::string_view name, size_t index)
+  size_t row_insert(std::string_view name, size_t index)
   {
     if (__unlikely(name.size() > HASH_KEY_BYTES)) {
       row_names_long[name] = index;
-      return;
-    }
-    hash_key_t key         = make_key(name.data(), name.size());
-    uint32_t hash          = fnv1a_hash(name.data(), name.size()) & (uint32_t)row_hash_mask;
-    hash_slot_var_t* slots = row_names_ht;
-    hash_slot_var_t* slot  = &slots[hash];
-
-    for (size_t i = 0; i < row_hash_buckets; ++i, ++slot) {
-      if (slot >= &slots[row_hash_buckets]) { slot = &slots[0]; }
+      return 0;
+    }
+    return row_insert_into(row_names_ht,
+                           row_hash_buckets,
+                           row_hash_mask,
+                           name,
+                           fnv1a_hash(name.data(), name.size()),
+                           index);
+  }
+
+  size_t row_insert_into(hash_slot_var_t* slots,
+                         size_t buckets,
+                         size_t mask,
+                         std::string_view name,
+                         uint32_t hash,
+                         size_t index)
+  {
+    hash_key_t key        = make_key(name.data(), name.size());
+    hash_slot_var_t* slot = &slots[hash & (uint32_t)mask];
+    for (size_t i = 0; i < buckets; ++i, ++slot) {
+      if (slot >= &slots[buckets]) { slot = &slots[0]; }
       if (slot->count == 0) {
         key_store(slot->key, key);            // Writes 32 bytes, including garbage in last 4
         slot->count = (uint32_t)(index + 1);  // Overwrite last 4 bytes with actual count
-        return;
+        return i + 1;
       }
       if (key_cmpeq(slot->key, key)) {
         slot->count = (uint32_t)(index + 1);
-        return;
+        return i + 1;
       }
     }
     __builtin_trap();
@@ -624,16 +813,31 @@ struct parse_state_t {
 // Section parsers
 // =============================================================================
 
+static std::string_view read_rest_of_line_trimmed(cursor_t& cursor)
+{
+  const char* begin = cursor.ptr;
+  const char* end   = begin;
+  while (end < cursor.end && *end != '\n' && *end != '\r') {
+    ++end;
+  }
+
+  while (begin < end && (*begin == ' ' || *begin == '\t')) {
+    ++begin;
+  }
+  while (end > begin && (end[-1] == ' ' || end[-1] == '\t')) {
+    --end;
+  }
+  cursor.ptr = end;
+  return std::string_view(begin, (size_t)(end - begin));
+}
+
 template <typename i_t, typename f_t>
 static void parse_name_section(parse_state_t<i_t, f_t>& state)
 {
   scoped_timer_t timer("parse_name");
   if (peek(state.cursor) == "ROWS") { return; }
   expect(state.cursor, "NAME");
-  if (!state.cursor.eol()) {
-    state.problem_name_sv = state.cursor.read_field();
-    accept_comment(state.cursor);
-  }
+  if (!state.cursor.eol()) { state.problem_name_sv = read_rest_of_line_trimmed(state.cursor); }
   expect_eol(state.cursor);
 }
 
@@ -643,12 +847,13 @@ static void parse_objsense_section(parse_state_t<i_t, f_t>& state)
   scoped_timer_t timer("parse_objsense");
   if (accept(state.cursor, "OBJSENSE")) {
     if (state.cursor.eol()) { expect_eol(state.cursor); }
-    if (accept(state.cursor, "MIN")) {
+    auto sense = state.cursor.read_field();  // view into the file buffer, not NUL-terminated
+    if (sense == "MIN" || sense == "MINIMIZE") {
       state.problem.maximize_ = false;
-    } else if (accept(state.cursor, "MAX")) {
+    } else if (sense == "MAX" || sense == "MAXIMIZE") {
       state.problem.maximize_ = true;
     } else {
-      state.cursor.error("expected MIN or MAX, got '%s'", state.cursor.read_field().data());
+      state.cursor.error("expected MIN/MAX or MINIMIZE/MAXIMIZE, got '%.*s'", (int)sense.size(), sense.data());
     }
     accept_comment(state.cursor);
     expect_eol(state.cursor);
@@ -693,8 +898,7 @@ static bool parse_rows_line_fast(const char*& p,
                                  char& row_type,
                                  std::string_view& row_name)
 {
-  while (p < end && *p <= ' ' && *p != '\n')
-    p++;
+  p = cursor_t::simd_scan<true>(p, end);
   if (p >= end) { return false; }
   if (*p == '\n') {
     p++;
@@ -706,12 +910,10 @@ static bool parse_rows_line_fast(const char*& p,
   }
 
   row_type = *p++;
-  while (p < end && *p <= ' ' && *p != '\n')
-    p++;
+  p        = cursor_t::simd_scan<true>(p, end);
 
   const char* name_start = p;
-  while (p < end && *p > ' ')
-    p++;
+  p                      = cursor_t::simd_scan<false>(p, end);
   if (name_start == p) { return false; }
   row_name = std::string_view(name_start, (size_t)(p - name_start));
 
@@ -1135,20 +1337,6 @@ static const char* find_next_line(const char* p, const char* end)
   return p;
 }
 
-static const char* find_bounds_body_end(const char* bounds_body_start, const char* parse_end)
-{
-  const char* p = bounds_body_start;
-  while (p < parse_end) {
-    if ((*p == 'E' && parse_end - p >= 6 && std::memcmp(p, "ENDATA", 6) == 0 && p[6] <= ' ') ||
-        (*p == 'Q' && is_quadratic_section_start(p, parse_end)) ||
-        (*p == 'R' && parse_end - p >= 6 && std::memcmp(p, "RANGES", 6) == 0 && p[6] <= ' ')) {
-      return p;
-    }
-    p = find_next_line(p, parse_end);
-  }
-  return parse_end;
-}
-
 static std::vector<BoundsChunkBoundary> compute_line_chunk_boundaries(const char* section_start,
                                                                       const char* section_end,
                                                                       int num_threads)
@@ -1306,7 +1494,8 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
       sign = -1.0;
       cursor.advance(1);
     }
-    if (cursor.ptr + 1 < cursor.end && is_digit_byte(cursor.ptr[0]) && cursor.ptr[1] == '\n') {
+    if (cursor.ptr + 1 < cursor.end && is_digit_byte(cursor.ptr[0]) &&
+        (cursor.ptr[1] == '\n' || cursor.ptr[1] == '\r')) {
       value = sign * (cursor.ptr[0] - '0');
       cursor.advance(1);
     } else {
@@ -1720,7 +1909,8 @@ static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
   scoped_timer_t timer("parse_rhs");
   expect_section(cursor, "RHS");
 
-  auto field_from_start = [](const char* start, const char* end) {
+  // necessary on the cold path since we directly read and lookup on the hot path
+  auto reread_field_name = [](const char* start, const char* end) {
     const char* p = start;
     while (p < end && *p > ' ') {
       p++;
@@ -1729,20 +1919,24 @@ static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
   };
 
   auto apply_rhs = [&](const char* row_start, size_t row_idx, f_t value) {
+    // This is a regular non-obj row.
     if (row_idx != SIZE_MAX) {
       state.problem.b_[row_idx] = value;
       return;
     }
-    std::string_view row_name = field_from_start(row_start, cursor.end);
+    // This is the objective row.
+    std::string_view row_name = reread_field_name(row_start, cursor.end);
     if (row_name == state.objective_name_sv) {
       state.problem.objective_offset_ = -value;
       return;
     }
+    // Other objectives, ignored currently. cold path
     if (state.is_ignored_objective_name(row_name)) { return; }
+    // Unexpected!
     error_unknown_row(cursor, row_start, "RHS");
   };
 
-  while (cursor.ptr < cursor.end && !is_rhs_section_end(cursor.ptr, cursor.end)) {
+  while (cursor.ptr < cursor.end) {
     auto rhs_name = cursor.read_field();
     (void)rhs_name;
     if (accept_comment(cursor)) {
@@ -1755,6 +1949,7 @@ static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
     apply_rhs(row_start, row_idx, (f_t)value);
 
     accept_comment(cursor);
+    // Optional second entry
     if (!cursor.eol()) {
       const char* row_start2 = cursor.ptr;
       size_t row_idx2        = state.read_row_lookup(cursor);
@@ -1773,13 +1968,16 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
                                                 const char* bounds_body_end,
                                                 size_t n_vars)
 {
-  const size_t bounds_bytes = (size_t)(bounds_body_end - bounds_body_start);
-  const int num_threads     = phase_thread_count(MPS_BOUNDS_THREAD_CAP);
-  if (!state.col_dense_ordered || bounds_bytes < MPS_BOUNDS_PARALLEL_MIN_BYTES || num_threads < 2) {
-    return false;
-  }
+  const size_t bounds_bytes   = (size_t)(bounds_body_end - bounds_body_start);
+  const int num_threads       = phase_thread_count(MPS_BOUNDS_THREAD_CAP);
+  const bool use_dense_lookup = state.col_dense_ordered;
+  const size_t min_parallel_bytes =
+    use_dense_lookup ? MPS_BOUNDS_PARALLEL_MIN_BYTES : MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES;
+  if (bounds_bytes < min_parallel_bytes || num_threads < 2) { return false; }
 
-  MPS_NVTX_RANGE("parse_bounds_parallel_dense", nvtx::colors::bounds);
+  MPS_NVTX_RANGE(
+    use_dense_lookup ? "parse_bounds_parallel_dense" : "parse_bounds_parallel_ordered_hint",
+    nvtx::colors::bounds);
 
   struct BoundsParallelStats {
     size_t lines            = 0;
@@ -1805,7 +2003,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
   }
 
   {
-    scoped_timer_t timer("parse_bounds_parallel_dense");
+    scoped_timer_t timer(use_dense_lookup ? "parse_bounds_parallel_dense"
+                                          : "parse_bounds_parallel_ordered_hint");
     // Duplicate or non-monotone BOUNDS updates are file-order dependent. Parse
     // optimistically, then accept only if chunk summaries prove strict order.
 #pragma omp parallel for schedule(static) num_threads(num_threads)
@@ -1815,6 +2014,27 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
                       (size_t)(boundaries[(size_t)t].end - boundaries[(size_t)t].start));
       cursor.skip_ws();
       size_t prev_var = SIZE_MAX;
+      // Index of the last variable matched in this chunk. lookup_var probes
+      // hint_idx + 1 and hint_idx before scanning, which is cheap when BOUNDS
+      // lists names in var_names_sv order.
+      size_t hint_idx = 0;
+      auto lookup_var = [&](std::string_view var_name) -> size_t {
+        if (use_dense_lookup) { return state.col_lookup_dense_ordered(var_name); }
+        if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) {
+          return hint_idx + 1;
+        }
+        if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) { return hint_idx; }
+
+        // Hint missed: scan the names after the hint, then wrap around to the names
+        // before it. Yields SIZE_MAX when var_name is not in var_names_sv.
+        for (size_t i = hint_idx + 2; i < n_vars; ++i) {
+          if (state.var_names_sv[i] == var_name) { return i; }
+        }
+        for (size_t i = 0; i < hint_idx; ++i) {
+          if (state.var_names_sv[i] == var_name) { return i; }
+        }
+        return SIZE_MAX;
+      };
       try {
         while (cursor.ptr < cursor.end) {
           if (__unlikely(*cursor.ptr == '$')) {
@@ -1843,17 +2063,12 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
             continue;
           }
 
-          size_t var_idx = state.col_lookup_dense_ordered(var_name);
+          size_t var_idx = lookup_var(var_name);
           if (__unlikely(var_idx == SIZE_MAX)) {
             local.dense_misses++;
-            std::snprintf(local.error_msg,
-                          sizeof(local.error_msg),
-                          "unknown variable name in BOUNDS: %.*s",
-                          (int)var_name.size(),
-                          var_name.data());
-            local.error_ptr = cursor.ptr;
             break;
           }
+          hint_idx = var_idx;
           local.dense_hits++;
           local.lines++;
           local.min_var = std::min(local.min_var, var_idx);
@@ -1864,10 +2079,12 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
           bool first_bound_for_var = bound_seen[var_idx] == 0;
           bound_seen[var_idx]      = 1;
 
-          f_t value = 0;
+          f_t value      = 0;
+          bool has_value = false;
           accept_comment(cursor);
           if (!cursor.eol()) {
-            value = (f_t)expect_number_fast_pm_one(cursor);
+            value     = (f_t)expect_number_fast_pm_one(cursor);
+            has_value = true;
             accept_comment(cursor);
           }
 
@@ -1906,6 +2123,15 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
             }
             state.problem.var_types_[var_idx] = 'I';
             local.saw_integer_type            = true;
+          } else if (bound_type == "SC") {
+            if (__unlikely(!has_value)) {
+              std::snprintf(
+                local.error_msg, sizeof(local.error_msg), "SC bound requires an upper bound value");
+              local.error_ptr = cursor.ptr;
+              break;
+            }
+            state.problem.variable_upper_bounds_[var_idx] = value;
+            state.problem.var_types_[var_idx]             = 'S';
           } else {
             std::snprintf(local.error_msg,
                           sizeof(local.error_msg),
@@ -1946,6 +2172,12 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
   const bool order_safe = dense_misses == 0 && non_strict_order == 0 && overlap_chunks == 0;
 
   if (!order_safe) {
+    std::fprintf(stderr,
+                 "[WARN] parallel BOUNDS fallback to serial: lookup_misses=%zu "
+                 "non_strict_order=%zu overlap_chunks=%zu\n",
+                 dense_misses,
+                 non_strict_order,
+                 overlap_chunks);
     cursor.ptr = bounds_body_start;
     return false;
   }
@@ -2028,8 +2260,7 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
   }
 
   const char* bounds_body_start = cursor.ptr;
-  const char* bounds_body_end =
-    allow_parallel_dense ? find_bounds_body_end(bounds_body_start, cursor.end) : cursor.end;
+  const char* bounds_body_end   = cursor.end;
   if (allow_parallel_dense) {
     if (parse_bounds_section_parallel_dense(
           state, cursor, bounds_body_start, bounds_body_end, n_vars)) {
@@ -2049,11 +2280,7 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
   size_t hint_idx = 0;
   {
     scoped_timer_t timer("parse_bounds");
-    for (;;) {
-      bool done = cursor.done() || peek(cursor) == "RANGES" || peek(cursor) == "ENDATA" ||
-                  is_quadratic_section_start(cursor.ptr, cursor.end);
-      if (done) break;
-
+    while (!cursor.done()) {
       auto bound_type = cursor.read_field();
       auto bound_name = cursor.read_field();
       (void)bound_name;
@@ -2065,13 +2292,11 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
       }
 
       // optimized lookup using hint (bounds often in same order as columns)
-      size_t var_idx = SIZE_MAX;
+      size_t var_idx                                               = SIZE_MAX;
+      typename parse_state_t<i_t, f_t>::bounds_only_var_t* aux_var = nullptr;
       if (__likely(state.col_dense_ordered)) {
         var_idx = state.col_lookup_dense_ordered(var_name);
-        if (var_idx == SIZE_MAX) {
-          cursor.error(
-            "unknown variable name in BOUNDS: %.*s", (int)var_name.size(), var_name.data());
-        }
+        if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; }
       } else if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) {
         var_idx = hint_idx + 1;
       } else if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) {
@@ -2092,60 +2317,88 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
           search_start = 0;
           goto search_loop;
         }
-        cursor.error(
-          "unknown variable name in BOUNDS: %.*s", (int)var_name.size(), var_name.data());
+        aux_var = &state.bounds_only_vars[var_name];
       }
     found:
-      hint_idx                 = var_idx;
-      bool first_bound_for_var = !has_bound(var_idx);
+      if (var_idx != SIZE_MAX) { hint_idx = var_idx; }
+      bool first_bound_for_var = aux_var == nullptr && !has_bound(var_idx);
 
-      f_t value = 0;
+      f_t value      = 0;
+      bool has_value = false;
       accept_comment(cursor);
       if (!cursor.eol()) {
         // bounds are often just set to 0 or 1
         if (false && isdigit(cursor.ptr[0]) && cursor.ptr[1] == '\n' && cursor.ptr[2] == ' ') {
           value = cursor.ptr[0] - '0';
           cursor.ptr += 1;
+          has_value = true;
         } else {
-          value = (f_t)expect_number(cursor);
+          value     = (f_t)expect_number(cursor);
+          has_value = true;
         }
         accept_comment(cursor);
       }
 
+      auto set_lb = [&](f_t x) {
+        if (aux_var) {
+          aux_var->lb = x;
+        } else {
+          state.problem.variable_lower_bounds_[var_idx] = x;
+        }
+      };
+      auto set_ub = [&](f_t x) {
+        if (aux_var) {
+          aux_var->ub = x;
+        } else {
+          state.problem.variable_upper_bounds_[var_idx] = x;
+        }
+      };
+      auto set_type = [&](char t) {
+        if (aux_var) {
+          aux_var->type = t;
+        } else {
+          state.problem.var_types_[var_idx] = t;
+        }
+      };
+
       if (bound_type == "LO") {
-        state.problem.variable_lower_bounds_[var_idx] = value;
+        set_lb(value);
       } else if (bound_type == "UP") {
-        state.problem.variable_upper_bounds_[var_idx] = value;
+        set_ub(value);
         if (first_bound_for_var && value < f_t{0}) {
-          state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+          set_lb(-std::numeric_limits<f_t>::infinity());
         }
       } else if (bound_type == "FX") {
-        state.problem.variable_lower_bounds_[var_idx] = value;
-        state.problem.variable_upper_bounds_[var_idx] = value;
+        set_lb(value);
+        set_ub(value);
       } else if (bound_type == "FR") {
-        state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
-        state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits<f_t>::infinity();
+        set_lb(-std::numeric_limits<f_t>::infinity());
+        set_ub(std::numeric_limits<f_t>::infinity());
       } else if (bound_type == "MI") {
-        state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+        set_lb(-std::numeric_limits<f_t>::infinity());
       } else if (bound_type == "PL") {
-        state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits<f_t>::infinity();
+        set_ub(std::numeric_limits<f_t>::infinity());
       } else if (bound_type == "BV") {
-        state.problem.variable_lower_bounds_[var_idx] = 0;
-        state.problem.variable_upper_bounds_[var_idx] = 1;
-        state.problem.var_types_[var_idx]             = 'I';
+        set_lb(0);
+        set_ub(1);
+        set_type('I');
       } else if (bound_type == "LI") {
-        state.problem.variable_lower_bounds_[var_idx] = value;
-        state.problem.var_types_[var_idx]             = 'I';
+        set_lb(value);
+        set_type('I');
       } else if (bound_type == "UI") {
-        state.problem.variable_upper_bounds_[var_idx] = value;
+        set_ub(value);
         if (first_bound_for_var && value < f_t{0}) {
-          state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
+          set_lb(-std::numeric_limits<f_t>::infinity());
         }
-        state.problem.var_types_[var_idx] = 'I';
+        set_type('I');
+      } else if (bound_type == "SC") {
+        if (__unlikely(!has_value)) { cursor.error("SC bound requires an upper bound value"); }
+        set_ub(value);
+        set_type('S');
       } else {
         cursor.error("unknown bound type: %.*s", (int)bound_type.size(), bound_type.data());
       }
-      mark_bound(var_idx);
+      if (aux_var == nullptr) { mark_bound(var_idx); }
 
       expect_eol(cursor);
     }
@@ -2204,8 +2457,7 @@ static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& curso
     }
   };
 
-  while (cursor.ptr < cursor.end && peek(cursor) != "BOUNDS" && peek(cursor) != "ENDATA" &&
-         !is_quadratic_section_start(cursor.ptr, cursor.end)) {
+  while (cursor.ptr < cursor.end) {
     auto range_name = cursor.read_field();
     (void)range_name;
     if (accept_comment(cursor)) {
@@ -2307,12 +2559,10 @@ static void build_quadratic_csr(parse_state_t<i_t, f_t>& state,
 }
 
 template <typename i_t, typename f_t>
-[[maybe_unused]] static void parse_quadratic_sections(parse_state_t<i_t, f_t>& state,
-                                                      cursor_t& cursor)
+static void parse_quadratic_sections(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
 {
   scoped_timer_t timer("parse_quadratic_sections");
-  if (cursor.done() || peek(cursor) == "ENDATA") { return; }
-  if (!is_quadratic_section_start(cursor.ptr, cursor.end)) { return; }
+  if (cursor.done()) { return; }
 
   build_var_name_map_if_needed(state);
   std::vector<std::tuple<i_t, i_t, f_t>> quadobj_entries;
@@ -2332,7 +2582,6 @@ template <typename i_t, typename f_t>
   };
 
   while (cursor.ptr < cursor.end) {
-    if (peek(cursor) == "ENDATA") { break; }
     if (accept_section(cursor, "QUADOBJ")) {
       active_entries = &quadobj_entries;
       continue;
@@ -2341,6 +2590,9 @@ template <typename i_t, typename f_t>
       active_entries = &qmatrix_entries;
       continue;
     }
+    if (accept_section(cursor, "QCMATRIX")) {
+      cursor.error("QCMATRIX sections are not supported by the experimental fast MPS parser");
+    }
     if (active_entries == nullptr) { break; }
 
     auto var1 = cursor.read_field();
@@ -2442,24 +2694,11 @@ static void parse_ranges_range(parse_state_t<i_t, f_t>& state,
 template <typename i_t, typename f_t>
 static void parse_quadratic_range(parse_state_t<i_t, f_t>& state,
                                   mps_phase_range_t range,
-                                  const char* fallback_ptr)
+                                  const char*)
 {
-  (void)state;
-  if (range.present) {
-    cursor_t cursor(range.begin, (size_t)(range.end - range.begin));
-    if (!cursor.done() && is_quadratic_section_start(cursor.ptr, cursor.end)) {
-      throw std::logic_error(
-        "experimental fast MPS reader currently supports LP/MIP MPS files only; "
-        "quadratic MPS sections are not supported");
-    }
-  } else {
-    cursor_t cursor(fallback_ptr, 16);
-    if (!cursor.done() && is_quadratic_section_start(cursor.ptr, cursor.end)) {
-      throw std::logic_error(
-        "experimental fast MPS reader currently supports LP/MIP MPS files only; "
-        "quadratic MPS sections are not supported");
-    }
-  }
+  if (!range.present) { return; }
+  cursor_t cursor(range.begin, (size_t)(range.end - range.begin));
+  parse_quadratic_sections(state, cursor);
 }
 
 template <typename i_t, typename f_t>
@@ -2519,6 +2758,23 @@ static void materialize_problem_names(parse_state_t<i_t, f_t>& state)
   }
 }
 
+template <typename i_t, typename f_t>
+static void append_bounds_only_variables(parse_state_t<i_t, f_t>& state)
+{
+  if (state.bounds_only_vars.empty()) { return; }
+  scoped_timer_t timer("append_bounds_only_variables");
+
+  // BOUNDS-only variables have no matrix entries; append after COLUMNS vars.
+  for (const auto& [name, aux] : state.bounds_only_vars) {
+    state.problem.var_names_.emplace_back(name);
+    state.problem.var_types_.push_back(aux.type);
+    state.problem.c_.push_back(f_t{0});
+    state.problem.variable_lower_bounds_.push_back(aux.lb);
+    state.problem.variable_upper_bounds_.push_back(aux.ub);
+  }
+  state.problem.n_vars_ = (i_t)state.problem.var_names_.size();
+}
+
 template <typename Stream, typename i_t, typename f_t>
 static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_stream(
   Stream& stream, const char* total_timer_name, const char* producer_task_name)
@@ -2591,7 +2847,10 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
   int header_done = 0, rows_done = 0, columns_done = 0;
   int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0;
 
-#pragma omp parallel num_threads(std::min(32, omp_get_max_threads()))
+  const std::size_t parser_size = std::max(stream.reserve_size_hint(), input.compressed_size);
+  const int parser_threads      = parser_thread_cap_for_size(parser_size);
+
+#pragma omp parallel num_threads(parser_threads)
   {
     std::string thread_name = "omp-parser-" + std::to_string(omp_get_thread_num());
     nvtx::name_current_thread(thread_name.c_str());
@@ -2724,6 +2983,8 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
 
   if (first_task_error) { std::rethrow_exception(first_task_error); }
 
+  append_bounds_only_variables(state);
+
   input.size = stream.size();
   cursor.ptr = input.registry->range(mps_phase_kind::quadratic).present
                  ? input.registry->range(mps_phase_kind::quadratic).end
@@ -2740,6 +3001,102 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
   return problem;
 }
 
+struct small_raw_read_t {
+  bool use_small_path = false;
+  std::vector<char> buffer;
+};
+
+static small_raw_read_t try_read_small_raw_file(const std::string& path)
+{
+  FILE* file = std::fopen(path.c_str(), "rb");
+  if (file == nullptr) {
+    throw std::runtime_error("Failed to open raw MPS file '" + path + "': " + std::strerror(errno));
+  }
+  std::unique_ptr<FILE, decltype(&std::fclose)> file_guard(file, &std::fclose);
+
+  if (std::fseek(file, 0, SEEK_END) != 0) {
+    throw std::runtime_error("Failed to seek raw MPS file '" + path + "'");
+  }
+  long file_size_long = std::ftell(file);
+  if (file_size_long < 0) {
+    throw std::runtime_error("Failed to determine raw MPS file size '" + path + "'");
+  }
+  std::size_t file_size = static_cast<std::size_t>(file_size_long);
+  if (file_size > MPS_SMALL_RAW_FILE_BYTES) { return {}; }
+  if (std::fseek(file, 0, SEEK_SET) != 0) {
+    throw std::runtime_error("Failed to rewind raw MPS file '" + path + "'");
+  }
+
+  std::vector<char> buffer(file_size);
+  if (file_size != 0 && std::fread(buffer.data(), 1, file_size, file) != file_size) {
+    throw std::runtime_error("Failed to read raw MPS file '" + path + "'");
+  }
+  return {true, std::move(buffer)};
+}
+
+template <typename i_t, typename f_t>
+static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_small_raw_file(
+  std::vector<char> buffer)
+{
+  auto total_timer = std::make_unique<scoped_timer_t>("parse_mps_fast_file_raw_small (total)");
+  const char* data = buffer.data();
+  const char* end  = data + buffer.size();
+
+  mps_phase_registry_t registry;
+  mps_section_block_scanner_t scanner(data, 1, registry);
+  scanner.observe_block(0, data, end);
+  scanner.publish_ready(buffer.size());
+
+  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> problem;
+  problem.n_vars_                   = 0;
+  problem.n_constraints_            = 0;
+  problem.nnz_                      = 0;
+  problem.maximize_                 = false;
+  problem.objective_scaling_factor_ = f_t{1};
+  problem.objective_offset_         = f_t{0};
+
+  std::size_t reserve_size = std::max<std::size_t>(buffer.size(), 1024 * 1024);
+  std::size_t reserve_dim  = std::max((size_t)1000, reserve_size / 1000);
+  problem.A_offsets_.reserve(reserve_dim);
+  problem.b_.reserve(reserve_dim);
+  problem.variable_lower_bounds_.reserve(reserve_dim);
+  problem.variable_upper_bounds_.reserve(reserve_dim);
+  problem.var_types_.reserve(reserve_dim);
+  problem.row_types_.reserve(reserve_dim);
+  problem.row_names_.reserve(reserve_dim);
+  problem.var_names_.reserve(reserve_dim);
+  problem.constraint_lower_bounds_.reserve(reserve_dim);
+  problem.constraint_upper_bounds_.reserve(reserve_dim);
+
+  cursor_t cursor(data, buffer.size());
+  parse_state_t<i_t, f_t> state(problem, cursor);
+  state.row_names_sv.reserve(reserve_dim);
+
+  parse_header_range(state, registry.range(mps_phase_kind::header));
+  parse_rows_range(state, registry.range(mps_phase_kind::rows));
+  parse_columns_range(state, registry.range(mps_phase_kind::columns), 1);
+  materialize_problem_names(state);
+  parse_rhs_range(state, registry.range(mps_phase_kind::rhs));
+  parse_ranges_range(state, registry.range(mps_phase_kind::ranges), data);
+  parse_bounds_range(state, registry.range(mps_phase_kind::bounds), data);
+  parse_quadratic_range(state, registry.range(mps_phase_kind::quadratic), data);
+  append_bounds_only_variables(state);
+
+  cursor.ptr = registry.range(mps_phase_kind::quadratic).present
+                 ? registry.range(mps_phase_kind::quadratic).end
+                 : (registry.range(mps_phase_kind::bounds).present
+                      ? registry.range(mps_phase_kind::bounds).end
+                      : (registry.range(mps_phase_kind::ranges).present
+                           ? registry.range(mps_phase_kind::ranges).end
+                           : registry.range(mps_phase_kind::rhs).end));
+  cursor.end = end;
+  if (!cursor.done()) { expect(cursor, "ENDATA"); }
+
+  total_timer.reset();
+  flush_timers();
+  return problem;
+}
+
 template <typename i_t, typename f_t>
 cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
   const std::string& path, FileReadMethod read_method)
@@ -2751,11 +3108,15 @@ cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
       stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode");
   }
   if (effective_method == FileReadMethod::Read) {
+    small_raw_read_t small_raw = try_read_small_raw_file(path);
+    if (small_raw.use_small_path) {
+      return parse_mps_fast_small_raw_file<i_t, f_t>(std::move(small_raw.buffer));
+    }
     RawInputStream stream(path);
     return parse_mps_fast_stream<RawInputStream, i_t, f_t>(
       stream, "parse_mps_fast_file_raw (total)", "task_raw_read");
   }
-  throw std::runtime_error("experimental fast MPS reader supports raw and LZ4 inputs only");
+  throw std::runtime_error("single-path parser supports raw read and LZ4 inputs only");
 }
 
 template cuopt::linear_programming::io::mps_data_model_t<int, float> parse_mps_fast_file(
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
index 9e5777efc2..49a7602739 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
@@ -9,6 +9,8 @@
 
 #include "fast_parser.hpp"
 
+#include <cstdint>
+
 namespace cuopt::linear_programming::io {
 
 template <typename i_t, typename f_t>
@@ -19,5 +21,9 @@ mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_fil
 
 template mps_data_model_t<int, float> read_mps_fast_experimental(const std::string& mps_file_path);
 template mps_data_model_t<int, double> read_mps_fast_experimental(const std::string& mps_file_path);
+template mps_data_model_t<int64_t, float> read_mps_fast_experimental(
+  const std::string& mps_file_path);
+template mps_data_model_t<int64_t, double> read_mps_fast_experimental(
+  const std::string& mps_file_path);
 
 }  // namespace cuopt::linear_programming::io
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index 819b1948bf..97ef5c5cc4 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -162,35 +162,42 @@ void RawInputStream::run_decode_tasks()
   };
 
   auto read_window = [&](std::size_t index) {
+    MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io);
     std::size_t offset = index * window_bytes_;
     std::size_t size   = std::min(window_bytes_, file_size_ - offset);
     std::size_t done   = 0;
-    while (done < size) {
-      ssize_t got =
-        ::pread(fd_, output_data_ + offset + done, size - done, static_cast<off_t>(offset + done));
-      if (got < 0) {
-        if (errno == EINTR) { continue; }
-        throw std::runtime_error("Failed to pread raw MPS file '" + path_ +
-                                 "': " + std::strerror(errno));
-      }
-      if (got == 0) {
-        throw std::runtime_error("Unexpected EOF while reading raw MPS file '" + path_ + "'");
+    {
+      MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io);
+      while (done < size) {
+        ssize_t got = ::pread(
+          fd_, output_data_ + offset + done, size - done, static_cast<off_t>(offset + done));
+        if (got < 0) {
+          if (errno == EINTR) { continue; }
+          throw std::runtime_error("Failed to pread raw MPS file '" + path_ +
+                                   "': " + std::strerror(errno));
+        }
+        if (got == 0) {
+          throw std::runtime_error("Unexpected EOF while reading raw MPS file '" + path_ + "'");
+        }
+        done += static_cast<std::size_t>(got);
       }
-      done += static_cast<std::size_t>(got);
     }
 
-    section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size);
-    frontier_mutex_.lock();
-    block_done_[index] = 1;
-    block_end_[index]  = offset + size;
-    std::size_t before = ready_bytes_;
-    while (next_block_ < block_done_.size() && block_done_[next_block_]) {
-      ready_bytes_ = block_end_[next_block_];
-      ++next_block_;
+    {
+      MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io);
+      section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size);
+      frontier_mutex_.lock();
+      block_done_[index] = 1;
+      block_end_[index]  = offset + size;
+      std::size_t before = ready_bytes_;
+      while (next_block_ < block_done_.size() && block_done_[next_block_]) {
+        ready_bytes_ = block_end_[next_block_];
+        ++next_block_;
+      }
+      std::size_t after = ready_bytes_;
+      frontier_mutex_.unlock();
+      if (after > before) { section_scanner_->publish_ready(after); }
     }
-    std::size_t after = ready_bytes_;
-    frontier_mutex_.unlock();
-    if (after > before) { section_scanner_->publish_ready(after); }
   };
 
   std::vector<std::thread> workers;
@@ -199,6 +206,7 @@ void RawInputStream::run_decode_tasks()
     workers.emplace_back([&, t] {
       std::string thread_name = "raw-input-read-" + std::to_string(t);
       nvtx::name_current_thread(thread_name.c_str());
+      MPS_NVTX_RANGE("raw_worker_loop", nvtx::colors::io);
       while (!stop.load(std::memory_order_acquire)) {
         std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed);
         if (index >= window_count_) { break; }
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index fbe18768af..36c42ba79a 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -9,6 +9,7 @@
 #include <omp.h>
 #endif
 
+#ifndef _WIN32
 #include <dlfcn.h>
 #include <fcntl.h>
 #include <sys/mman.h>
@@ -16,6 +17,7 @@
 #include <unistd.h>
 #include <cerrno>
 #include <cstring>
+#endif
 
 #include <algorithm>
 #include <atomic>
@@ -44,9 +46,9 @@ constexpr std::size_t lz4_pipeline_batch_bytes          = 64ull * 1024ull * 1024
 constexpr std::size_t lz4_input_max_io_threads          = 8;
 constexpr std::size_t lz4_no_content_size_reserve_ratio = 16;
 
-#if defined(MPS_PARSER_WITH_LZ4)
 using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int);
 
+#if defined(MPS_PARSER_WITH_LZ4)
 struct lz4_runtime_t {
   void* handle                          = nullptr;
   LZ4_decompress_safe_t decompress_safe = nullptr;
@@ -54,28 +56,28 @@ struct lz4_runtime_t {
   lz4_runtime_t()
   {
     for (const char* soname : {"liblz4.so.1", "liblz4.so"}) {
-      handle = dlopen(soname, RTLD_LAZY);
+      handle = ::dlopen(soname, RTLD_LAZY);
       if (handle != nullptr) { break; }
     }
     if (handle == nullptr) {
       throw std::logic_error(
         "Could not open .mps.lz4 file since liblz4 was not found "
-        "(tried liblz4.so.1, liblz4.so). In order to open .mps.lz4 files "
-        "directly, please ensure liblz4 is installed. Alternatively, decompress "
-        "the .lz4 file manually and open the uncompressed .mps file.");
+        "(tried liblz4.so.1, liblz4.so). Decompress the .lz4 file manually "
+        "or install liblz4.");
     }
 
-    decompress_safe = reinterpret_cast<LZ4_decompress_safe_t>(dlsym(handle, "LZ4_decompress_safe"));
+    decompress_safe =
+      reinterpret_cast<LZ4_decompress_safe_t>(::dlsym(handle, "LZ4_decompress_safe"));
     if (decompress_safe == nullptr) {
       throw std::logic_error(
-        "Error loading liblz4! Library version might be incompatible. Please decompress "
-        "the .lz4 file manually and open the uncompressed .mps file.");
+        "Error loading LZ4_decompress_safe from liblz4. Decompress the .lz4 file manually "
+        "or install a compatible liblz4.");
     }
   }
 
   ~lz4_runtime_t()
   {
-    if (handle != nullptr) { dlclose(handle); }
+    if (handle != nullptr) { ::dlclose(handle); }
   }
 
   lz4_runtime_t(const lz4_runtime_t&)            = delete;
@@ -124,9 +126,12 @@ int open_lz4_fd(const std::string& path)
   return fd;
 }
 
+#ifndef _WIN32
 std::size_t system_page_size();
+#endif
 std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment);
 
+#ifndef _WIN32
 class FileDescriptor {
  public:
   explicit FileDescriptor(int fd) : fd_(fd) {}
@@ -145,6 +150,8 @@ class FileDescriptor {
   int fd_;
 };
 
+#endif
+
 uint32_t read_le32(const char* ptr)
 {
   const auto* p = reinterpret_cast<const unsigned char*>(ptr);
@@ -181,6 +188,7 @@ std::size_t checked_size(uint64_t value, const char* label)
   return static_cast<std::size_t>(value);
 }
 
+#ifndef _WIN32
 std::size_t get_file_size(int fd, const std::string& path)
 {
   struct stat st;
@@ -191,6 +199,9 @@ std::size_t get_file_size(int fd, const std::string& path)
   return static_cast<std::size_t>(st.st_size);
 }
 
+#endif
+
+#ifndef _WIN32
 std::size_t system_page_size()
 {
   static std::size_t page_size = [] {
@@ -199,6 +210,7 @@ std::size_t system_page_size()
   }();
   return page_size;
 }
+#endif
 
 std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
 {
@@ -212,6 +224,7 @@ std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
   return value + increment;
 }
 
+#ifndef _WIN32
 std::size_t checked_mul(std::size_t a, std::size_t b, const char* label)
 {
   if (a != 0 && b > std::numeric_limits<std::size_t>::max() / a) {
@@ -320,12 +333,14 @@ class lz4_resident_windows_t {
 
   std::vector<lz4_resident_window_t>& windows_;
 };
+#endif
 
 }  // namespace
 
 Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path)
 {
   MPS_NVTX_RANGE("lz4_input_construct", nvtx::colors::io);
+
   ensure_lz4_runtime_available();
 
   fd_ = open_lz4_fd(path);
diff --git a/cpp/src/io/experimental_mps_fast/perf_counters.hpp b/cpp/src/io/experimental_mps_fast/perf_counters.hpp
new file mode 100644
index 0000000000..147a7ae7bb
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/perf_counters.hpp
@@ -0,0 +1,163 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <array>
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+namespace mps_fast {
+
+struct perf_counter_spec_t {
+  const char* name;
+  uint32_t type;
+  uint64_t config;
+};
+
+static constexpr uint64_t perf_cache_config(uint64_t cache, uint64_t op, uint64_t result)
+{
+  return cache | (op << 8) | (result << 16);
+}
+
+// Small scoped Linux perf_event_open wrapper for coarse phase diagnostics.
+//
+// Important limitations:
+// - Counters are per-thread: construct one instance inside each worker whose
+//   work should be measured, then aggregate snapshots.
+// - These are generic perf events; exact mappings vary by CPU. Some events may
+//   be unavailable or unhelpful, e.g. store-side DTLB misses on this node.
+// - This deliberately does not use event groups or time_enabled/time_running
+//   scaling, so counts are approximate if the kernel multiplexes counters.
+static constexpr std::array<perf_counter_spec_t, 8> PERF_COUNTER_SPECS = {{
+  {"cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES},
+  {"instructions", PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS},
+  {"cache_refs", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES},
+  {"cache_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES},
+  {"branch_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES},
+  {"backend_stall_cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND},
+  {"dtlb_load_misses",
+   PERF_TYPE_HW_CACHE,
+   perf_cache_config(
+     PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_RESULT_MISS)},
+  {"dtlb_store_misses",
+   PERF_TYPE_HW_CACHE,
+   perf_cache_config(
+     PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_WRITE, PERF_COUNT_HW_CACHE_RESULT_MISS)},
+}};
+
+struct perf_counter_snapshot_t {
+  bool active                                            = false;
+  int open_errno                                         = 0;
+  std::array<uint64_t, PERF_COUNTER_SPECS.size()> values = {};
+};
+
+class thread_perf_counters_t {
+ public:
+  thread_perf_counters_t()
+  {
+    fds_.fill(-1);
+    for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) {
+      perf_event_attr attr = {};
+      attr.type            = PERF_COUNTER_SPECS[i].type;
+      attr.size            = sizeof(attr);
+      attr.config          = PERF_COUNTER_SPECS[i].config;
+      attr.disabled        = 1;
+      attr.exclude_kernel  = 1;
+      attr.exclude_hv      = 1;
+
+      int fd = (int)syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
+      if (fd < 0) {
+        if (first_errno_ == 0) { first_errno_ = errno; }
+        continue;
+      }
+      fds_[i] = fd;
+      active_ = true;
+    }
+
+    if (active_) {
+      for (int fd : fds_) {
+        if (fd >= 0) {
+          ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+          ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+        }
+      }
+    }
+  }
+
+  thread_perf_counters_t(const thread_perf_counters_t&)            = delete;
+  thread_perf_counters_t& operator=(const thread_perf_counters_t&) = delete;
+
+  ~thread_perf_counters_t() { close_all(); }
+
+  perf_counter_snapshot_t stop()
+  {
+    perf_counter_snapshot_t snapshot;
+    snapshot.active     = active_;
+    snapshot.open_errno = first_errno_;
+
+    for (size_t i = 0; i < fds_.size(); ++i) {
+      int fd = fds_[i];
+      if (fd < 0) continue;
+      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+      uint64_t value = 0;
+      if (read(fd, &value, sizeof(value)) == (ssize_t)sizeof(value)) { snapshot.values[i] = value; }
+    }
+    close_all();
+    active_ = false;
+    return snapshot;
+  }
+
+ private:
+  void close_all()
+  {
+    for (int& fd : fds_) {
+      if (fd >= 0) {
+        close(fd);
+        fd = -1;
+      }
+    }
+  }
+
+  bool active_     = false;
+  int first_errno_ = 0;
+  std::array<int, PERF_COUNTER_SPECS.size()> fds_;
+};
+
+static inline void print_perf_totals(const char* label,
+                                     const std::vector<perf_counter_snapshot_t>& snapshots)
+{
+  std::array<unsigned long long, PERF_COUNTER_SPECS.size()> totals = {};
+  bool any_active                                                  = false;
+  int first_errno                                                  = 0;
+  for (const auto& snapshot : snapshots) {
+    if (snapshot.open_errno != 0 && first_errno == 0) { first_errno = snapshot.open_errno; }
+    if (!snapshot.active) continue;
+    any_active = true;
+    for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) {
+      totals[i] += snapshot.values[i];
+    }
+  }
+
+  if (!any_active) {
+    std::fprintf(stderr, "[PERF] %s unavailable errno=%d\n", label, first_errno);
+    return;
+  }
+
+  double ipc       = totals[0] == 0 ? 0.0 : (double)totals[1] / (double)totals[0];
+  double miss_rate = totals[2] == 0 ? 0.0 : (double)totals[3] / (double)totals[2];
+  std::fprintf(stderr, "[PERF] %s", label);
+  for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) {
+    std::fprintf(stderr, " %s=%llu", PERF_COUNTER_SPECS[i].name, totals[i]);
+  }
+  std::fprintf(stderr, " ipc=%.3f cache_miss_rate=%.6f\n", ipc, miss_rate);
+}
+
+}  // namespace mps_fast

From 91742cd0d5a1b01d72f49e5a65adc40ded0b50f5 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Fri, 5 Jun 2026 09:39:48 -0700
Subject: [PATCH 04/22] improved IEEE 754 compliant float parsing, warn on nnz >
 INT_MAX

---
 cpp/CMakeLists.txt                            |   2 +-
 .../fast_fp64_parser.hpp                      | 400 ++++++++++++++++++
 .../fast_parse_primitives.hpp                 | 116 +----
 .../io/experimental_mps_fast/fast_parser.cpp  | 111 +++--
 .../io/experimental_mps_fast/fast_parser.hpp  |  11 +-
 .../fast_parser_adapter.cpp                   |   3 +
 .../io/experimental_mps_fast/file_reader.cpp  |  33 +-
 .../io/experimental_mps_fast/file_reader.hpp  |   2 +-
 .../hash_table_smallstr.hpp                   |  27 +-
 .../experimental_mps_fast/lz4_file_reader.cpp | 168 ++++----
 .../io/experimental_mps_fast/mmap_region.hpp  |  26 +-
 .../mps_section_scanner.cpp                   |  17 +-
 .../mps_section_scanner.hpp                   |   2 +-
 .../io/experimental_mps_fast/nvtx_ranges.hpp  |   2 +-
 .../experimental_mps_fast/perf_counters.hpp   |   2 +-
 .../io/experimental_mps_fast/simd_compat.hpp  |   2 +-
 cpp/src/io/utilities/error.hpp                |  28 +-
 17 files changed, 685 insertions(+), 267 deletions(-)
 create mode 100644 cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 60227547b4..712a132fc0 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -465,7 +465,7 @@ endif ()
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND
         CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$")
     set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
-            APPEND PROPERTY COMPILE_OPTIONS "-mavx2;-maes;-msse4.2")
+            APPEND PROPERTY COMPILE_OPTIONS "-mbmi2;-mavx2;-msse4.2")  # per-source ISA flags
 endif ()
 
 # Apply -UNDEBUG only to solver source files (not gRPC infrastructure).
diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
new file mode 100644
index 0000000000..605c6adc5b
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
@@ -0,0 +1,400 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <utilities/error.hpp>
+
+#include <array>
+#include <bit>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <stdexcept>
+#include <string_view>
+
+namespace mps_fast {
+
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_expects;
+using cuopt::linear_programming::io::mps_parser_fail;
+
+namespace fp64 {
+
+#define FASTP64_MIN_EXP_10    (-307)  // smallest decimal exponent present in the LUT
+#define FASTP64_MAX_EXP_10    288  // largest decimal exponent present in the LUT
+#define FASTP64_POWER_COUNT   (FASTP64_MAX_EXP_10 - FASTP64_MIN_EXP_10 + 1)  // LUT entry count
+#define FASTP64_MANTISSA_MASK ((uint64_t{1} << 52) - 1)  // low 52 bits of the result word
+
+// Fast FP64 parser optimized for inputs with <= 19 significant digits, based on the Eisel-Lemire
+// algorithm. See Daniel Lemire, "Number Parsing at a Gigabyte per Second", Software: Practice and
+// Experience 51(8), 2021.
+
+struct power_10_lut_entry_t {  // one power of ten: 128-bit significand plus a binary exponent
+  uint64_t high;  // upper 64 bits of the significand (make_power stores limb[3] here)
+  uint64_t low;  // next 64 bits of the significand (make_power stores limb[2] here)
+  int biased_e2;  // biased binary exponent; make_power stores 1150 + (exp2 + 192)
+};
+
+struct cuopt_uint256_t {  // 256-bit unsigned integer; limb[0] is the least-significant word
+  std::array<uint64_t, 4> limb{};
+
+  constexpr uint32_t mul_u32(uint32_t m)  // in-place *= m; returns the carry out of limb[3]
+  {
+    unsigned __int128 carry = 0;
+    for (uint64_t& v : limb) {
+      unsigned __int128 x = static_cast<unsigned __int128>(v) * m + carry;
+      v                   = static_cast<uint64_t>(x);  // low 64 bits stay in this limb
+      carry               = x >> 64;  // high bits feed the next limb
+    }
+    return static_cast<uint32_t>(carry);  // carry < m, so it fits in 32 bits
+  }
+
+  constexpr cuopt_uint256_t shl_small(int bits) const  // *this << bits for 0 <= bits < 64
+  {
+    cuopt_uint256_t out;
+    if (bits == 0) return *this;  // (64 - bits) below would otherwise be a shift by 64
+    for (int i = 3; i >= 0; --i) {
+      uint64_t v = limb[i] << bits;
+      if (i > 0) v |= limb[i - 1] >> (64 - bits);  // bits shifted up from the limb below
+      out.limb[i] = v;
+    }
+    return out;  // bits shifted out of limb[3] are dropped
+  }
+};
+
+struct cuopt_normalized_uint256_t {  // value = sig * 2^exp2; sig keeps its top bit set
+  cuopt_uint256_t sig;
+  int exp2 = 0;
+
+  static constexpr cuopt_normalized_uint256_t one()  // 1.0 as sig = 2^255, exp2 = -255
+  {
+    cuopt_normalized_uint256_t x;
+    x.sig.limb[3] = uint64_t{1} << 63;
+    x.exp2        = -255;
+    return x;
+  }
+
+  constexpr void mul10()  // multiply by 10, then renormalize; bits shifted out are dropped
+  {
+    uint32_t carry = sig.mul_u32(10);
+    int shift      = 32 - std::countl_zero(carry);  // bit width of the carry out of limb[3]
+    cuopt_uint256_t out;
+    for (int i = 0; i < 4; ++i) {
+      uint64_t lower = sig.limb[i] >> shift;
+      uint64_t upper = 0;
+      if (i + 1 < 4) {
+        upper = sig.limb[i + 1] << (64 - shift);  // requires shift > 0, i.e. carry != 0
+      } else {
+        upper = static_cast<uint64_t>(carry) << (64 - shift);  // top limb takes the carry bits
+      }
+      out.limb[i] = lower | upper;
+    }
+    sig = out;
+    exp2 += shift;  // compensates the right shift so sig * 2^exp2 tracks value * 10
+  }
+
+  constexpr void div10()  // divide by 10, then renormalize; the division remainder is dropped
+  {
+    constexpr uint64_t div10_shift_4_threshold = 0xA000000000000000ULL;  // 10 * 2^60
+    int shift                                  = sig.limb[3] < div10_shift_4_threshold ? 4 : 3;
+    uint64_t extra                             = sig.limb[3] >> (64 - shift);
+    cuopt_uint256_t shifted                    = sig.shl_small(shift);
+    // Long division by 10, top limb first; extra < 10, so every quotient limb fits in 64 bits.
+    cuopt_uint256_t quotient;
+    unsigned __int128 rem = extra;
+    for (int i = 3; i >= 0; --i) {
+      unsigned __int128 cur = (rem << 64) | shifted.limb[i];
+      quotient.limb[i]      = static_cast<uint64_t>(cur / 10);
+      rem                   = cur % 10;
+    }
+    sig = quotient;
+    exp2 -= shift;  // compensates the pre-shift so sig * 2^exp2 tracks value / 10
+  }
+};
+
+constexpr power_10_lut_entry_t make_power(const cuopt_normalized_uint256_t& p)
+{  // packs the top 128 bits of p.sig and a biased exponent into one LUT entry
+  int e2 = p.exp2 + 192;  // exponent relative to limb[3] (three 64-bit limbs sit below it)
+  return {p.sig.limb[3], p.sig.limb[2], 1150 + e2};  // 10^0 gets biased_e2 == 1087
+}
+
+// Builds the power-of-10 table at compile time; entry [e - FASTP64_MIN_EXP_10] approximates 10^e.
+constexpr std::array<power_10_lut_entry_t, FASTP64_POWER_COUNT> make_power_table()
+{
+  std::array<power_10_lut_entry_t, FASTP64_POWER_COUNT> table{};
+  cuopt_normalized_uint256_t p = cuopt_normalized_uint256_t::one();
+  table[-FASTP64_MIN_EXP_10]   = make_power(p);  // entry for 10^0
+
+  for (int e = 1; e <= FASTP64_MAX_EXP_10; ++e) {  // positive exponents via repeated mul10
+    p.mul10();
+    table[e - FASTP64_MIN_EXP_10] = make_power(p);
+  }
+
+  p = cuopt_normalized_uint256_t::one();  // restart from 1.0 for the negative side
+  for (int e = -1; e >= FASTP64_MIN_EXP_10; --e) {  // negative exponents via repeated div10
+    p.div10();
+    table[e - FASTP64_MIN_EXP_10] = make_power(p);
+  }
+  return table;
+}
+
+inline constexpr auto fast_fp64_parse_lut = make_power_table();  // 10^e at [e - FASTP64_MIN_EXP_10]
+
+inline constexpr std::array<double, 23> small_powers = {  // 10^0 .. 10^22 as doubles
+  1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10, 1e11,
+  1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
+
+inline constexpr std::array<uint64_t, 16> small_integer_powers = {1ULL,  // 10^0 .. 10^15
+                                                                  10ULL,
+                                                                  100ULL,
+                                                                  1000ULL,
+                                                                  10000ULL,
+                                                                  100000ULL,
+                                                                  1000000ULL,
+                                                                  10000000ULL,
+                                                                  100000000ULL,
+                                                                  1000000000ULL,
+                                                                  10000000000ULL,
+                                                                  100000000000ULL,
+                                                                  1000000000000ULL,
+                                                                  10000000000000ULL,
+                                                                  100000000000000ULL,
+                                                                  1000000000000000ULL};
+
+struct ParsedDecimal {  // scanned literal; when fast_eligible it equals +/- mantissa * 10^exp10
+  bool negative      = false;  // set when the literal starts with '-'
+  bool fast_eligible = false;  // false when some digit could not be folded into mantissa
+  uint64_t mantissa  = 0;  // accumulated decimal digits (at most 19 are folded in)
+  int exp10          = 0;  // explicit exponent minus the number of fractional digits
+};
+
+static inline bool is_digit(char c) noexcept { return c >= '0' && c <= '9'; }  // ASCII digit test
+
+// SWAR: parses 8 ASCII digits at p into out; returns false if any byte is not '0'..'9'.
+static inline bool parse_8_digits(const char* p, uint32_t& out)
+{
+  std::array<char, sizeof(uint64_t)> bytes{};
+  std::memcpy(bytes.data(), p, bytes.size());  // reads exactly 8 bytes starting at p
+  uint64_t raw       = std::bit_cast<uint64_t>(bytes);  // NOTE(review): assumes little-endian
+  uint64_t high      = raw & 0xF0F0F0F0F0F0F0F0ULL;  // high nibble of every byte
+  uint64_t low_check = (raw + 0x0606060606060606ULL) & 0xF0F0F0F0F0F0F0F0ULL;  // low nibble > 9?
+  if (high != 0x3030303030303030ULL || low_check != 0x3030303030303030ULL) { return false; }
+
+  uint64_t v     = raw - 0x3030303030303030ULL;  // each byte now holds a value 0..9
+  uint64_t pairs = (v * 10 + (v >> 8)) & 0x00FF00FF00FF00FFULL;  // 2-digit values
+  uint64_t quads = (pairs * 100 + (pairs >> 16)) & 0x0000FFFF0000FFFFULL;  // 4-digit values
+  out            = static_cast<uint32_t>((quads * 10000 + (quads >> 32)) & 0xFFFFFFFFULL);
+  return true;
+}
+
+static inline void scan_digit_run(const char*& p,  // in/out: advanced past the digit run
+                                  const char* end,  // one past the last readable byte
+                                  bool after_dot,  // true while scanning fractional digits
+                                  ParsedDecimal& out,  // out.mantissa accumulates the digits
+                                  bool& saw_digit,  // set once any digit is consumed
+                                  int& frac_digits,  // grows per digit when after_dot
+                                  int& sig_digits,  // digits counted against the 19-digit budget
+                                  bool& too_many_digits)  // set when a digit cannot be folded in
+{
+  while (p < end) {
+    uint32_t chunk = 0;
+    if (end - p >= 8 && parse_8_digits(p, chunk)) {  // fast path: eight digits at once
+      saw_digit = true;
+      if (after_dot) frac_digits += 8;
+
+      if (!too_many_digits) {
+        if (sig_digits == 0 && chunk == 0) {  // leading zeros: nothing to accumulate
+          p += 8;
+          continue;
+        }
+
+        if (sig_digits + 8 <= 19) {  // all 8 digits count, including the chunk's leading zeros
+          out.mantissa = out.mantissa * 100000000ULL + chunk;
+          sig_digits += 8;
+        } else {
+          too_many_digits = true;  // mantissa stops updating; digits are still consumed
+        }
+      }
+
+      p += 8;
+      continue;
+    }
+
+    if (!is_digit(*p)) return;  // the run ends at the first non-digit
+    saw_digit = true;
+    int digit = *p - '0';
+    if (after_dot) ++frac_digits;
+    if (!too_many_digits && (digit != 0 || sig_digits != 0)) {  // skip leading zeros
+      if (sig_digits < 19) {
+        out.mantissa = (out.mantissa * 10) + static_cast<uint64_t>(digit);
+        ++sig_digits;
+      } else {
+        too_many_digits = true;
+      }
+    }
+    ++p;
+  }
+}
+
+static inline bool parse_decimal_advance(const char*& p, const char* end, ParsedDecimal& out)
+{  // scans [sign] digits ['.' digits] [exponent] into out; returns false if no digit was seen
+  if (p < end && (*p == '-' || *p == '+')) {
+    out.negative = *p == '-';
+    ++p;
+  }
+
+  bool saw_digit       = false;
+  int frac_digits      = 0;
+  int sig_digits       = 0;
+  bool too_many_digits = false;
+
+  scan_digit_run(p, end, false, out, saw_digit, frac_digits, sig_digits, too_many_digits);
+  if (p < end && *p == '.') {
+    ++p;
+    scan_digit_run(p, end, true, out, saw_digit, frac_digits, sig_digits, too_many_digits);
+  }
+
+  if (!saw_digit) return false;  // p may already be past a sign and/or '.'
+
+  int explicit_exp = 0;
+  if (p < end && (*p == 'e' || *p == 'E' || *p == 'd' || *p == 'D')) {  // 'd'/'D' accepted too
+    const char* exp_start = p;
+    ++p;
+    bool exp_negative = false;
+    if (p < end && (*p == '-' || *p == '+')) {
+      exp_negative = *p == '-';
+      ++p;
+    }
+    if (p == end || !is_digit(*p)) {
+      p = exp_start;  // no exponent digits: leave the marker unconsumed
+    } else {
+      int exp_value = 0;
+      while (p < end && is_digit(*p)) {
+        if (exp_value < 1000000) exp_value = exp_value * 10 + (*p - '0');  // capped, no overflow
+        ++p;
+      }
+      explicit_exp = exp_negative ? -exp_value : exp_value;
+    }
+  }
+
+  out.exp10         = explicit_exp - frac_digits;  // every fractional digit lowers the exponent
+  out.fast_eligible = !too_many_digits;
+  return true;
+}
+
+// Falls back to std::strtod for tokens the fast paths decline (rare); accepts 'd'/'D' exponents.
+static inline double fallback_strtod(std::string_view s)
+{
+  char stack_buf[32];  // s plus the terminating NUL must fit
+  if (s.size() >= sizeof(stack_buf)) {  // assumes mps_parser_fail never returns - TODO confirm
+    mps_parser_fail(error_type_t::ValidationError, "MPS numeric token exceeds supported length");
+  }
+  std::memcpy(stack_buf, s.data(), s.size());
+  stack_buf[s.size()] = '\0';
+  for (size_t i = 0; i < s.size(); ++i) {  // rewrite 'd'/'D' exponent markers as 'e'
+    if (stack_buf[i] == 'd' || stack_buf[i] == 'D') stack_buf[i] = 'e';
+  }
+
+  char* parse_end = nullptr;  // NOTE(review): parse_end and errno are never inspected
+  errno           = 0;
+  return std::strtod(stack_buf, &parse_end);  // NOTE(review): strtod is locale-dependent
+}
+
+// Eisel-Lemire: converts man * 10^exp10 (man != 0) to binary64 bits; false means use the fallback.
+// See Daniel Lemire, "Number Parsing at a Gigabyte per Second", Softw. Pract. Exper. 51(8), 2021.
+static inline bool eisel_lemire(uint64_t man, int exp10, uint64_t& bits)
+{
+  if (exp10 < FASTP64_MIN_EXP_10 || exp10 > FASTP64_MAX_EXP_10) { return false; }  // outside LUT
+
+  const power_10_lut_entry_t p = fast_fp64_parse_lut[exp10 - FASTP64_MIN_EXP_10];
+  int lz                       = std::countl_zero(man);
+  uint64_t norm                = man << lz;  // mantissa shifted so its top bit is set
+  int adj_e2                   = p.biased_e2 - lz;
+
+  unsigned __int128 product = static_cast<unsigned __int128>(norm) * p.high;
+  uint64_t hi               = static_cast<uint64_t>(product >> 64);
+  uint64_t lo               = static_cast<uint64_t>(product);
+
+  if ((hi & 0x1FF) == 0x1FF && lo + norm < norm) {  // inconclusive: refine with p.low
+    unsigned __int128 low_product = static_cast<unsigned __int128>(norm) * p.low;
+    uint64_t low_hi               = static_cast<uint64_t>(low_product >> 64);
+    uint64_t low_lo               = static_cast<uint64_t>(low_product);
+    uint64_t old_lo               = lo;
+    lo += low_hi;
+    hi += lo < old_lo ? 1 : 0;  // propagate the carry out of lo
+    if ((hi & 0x1FF) == 0x1FF && lo == std::numeric_limits<uint64_t>::max() &&
+        low_lo + norm < low_lo) {
+      return false;  // still inconclusive after the wider product
+    }
+  }
+
+  uint64_t hi_msb = hi >> 63;
+  uint64_t x54    = hi >> (9 + hi_msb);  // keep 54 bits when the top bit of hi is set
+  adj_e2 -= static_cast<int>(1 - hi_msb);
+
+  // half-way ambiguity, fallback
+  if (lo == 0 && (hi & 0x1FF) == 0 && (x54 & 3) == 1) { return false; }
+
+  // round to 53 bits, renormalize on carry-out; an exponent outside (0, 0x7FF) falls back
+  uint64_t x53      = (x54 + (x54 & 1)) >> 1;
+  uint64_t overflow = x53 >> 53;
+  uint64_t ret_man  = (x53 >> overflow) & FASTP64_MANTISSA_MASK;
+  int ret_exp       = adj_e2 + static_cast<int>(overflow);
+  if (ret_exp <= 0 || ret_exp >= 0x7FF) { return false; }
+
+  bits = (static_cast<uint64_t>(ret_exp) << 52) | ret_man;  // sign bit is left clear
+  return true;
+}
+
+static inline double assemble_fp64(const ParsedDecimal& dec)  // NaN result = caller must fall back
+{
+  uint64_t bits = dec.negative ? (uint64_t{1} << 63) : 0;  // sign bit only
+  if (dec.mantissa == 0) { return std::bit_cast<double>(bits); }  // +0.0 or -0.0
+
+  if (dec.fast_eligible) {
+    double small    = 0.0;
+    bool used_small = false;
+    if (dec.exp10 >= 0 && dec.exp10 < static_cast<int>(small_integer_powers.size())) {
+      uint64_t limit = (uint64_t{1} << 53) / small_integer_powers[dec.exp10];
+      if (dec.mantissa <= limit) {  // mantissa * 10^exp10 is an integer <= 2^53
+        small      = static_cast<double>(dec.mantissa) * small_powers[dec.exp10];
+        used_small = true;
+      }
+    } else if (dec.exp10 < 0 && dec.exp10 >= -22 && dec.mantissa < (uint64_t{1} << 53)) {
+      small      = static_cast<double>(dec.mantissa) / small_powers[-dec.exp10];
+      used_small = true;
+    }
+    if (used_small) { return dec.negative ? -small : small; }  // apply the sign
+
+    uint64_t mag_bits = 0;
+    if (eisel_lemire(dec.mantissa, dec.exp10, mag_bits)) {
+      return std::bit_cast<double>(bits | mag_bits);  // sign bit combined with the magnitude
+    }
+  }
+
+  return std::numeric_limits<double>::quiet_NaN();  // sentinel: no fast path applied
+}
+
+static inline double parse_fp64_advance(const char*& p, const char* end)  // p is advanced
+{
+  const char* start = p;
+  ParsedDecimal dec;
+  if (!parse_decimal_advance(p, end, dec)) {  // no digit: only a sign and/or '.' was consumed
+    return fallback_strtod(std::string_view(start, static_cast<size_t>(p - start)));
+  }
+
+  double v = assemble_fp64(dec);
+  if (v == v) return v;  // a NaN (v != v) means assemble_fp64 declined
+  return fallback_strtod(std::string_view(start, static_cast<size_t>(p - start)));
+}
+
+static inline double parse_fp64_token(const char* p, const char* end)  // p is taken by value
+{
+  return parse_fp64_advance(p, end);  // the caller's pointer is not advanced
+}
+
+}  // namespace fp64
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
index 453687df01..bd4ee4669a 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -1,21 +1,21 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
-// reserved. SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 
-#include "simd_compat.hpp"
+#include "fast_fp64_parser.hpp"
 
-#include <array>
 #include <cctype>
-#include <cmath>
 #include <cstdarg>
 #include <cstddef>
 #include <cstdint>
-#include <cstdio>
 #include <stdexcept>
 #include <string_view>
 #include <utility>
 
+#include <simde/x86/avx2.h>
+#include <simde/x86/sse4.2.h>
+
 #ifndef __likely
 #define __likely(x) __builtin_expect(!!(x), 1)
 #endif
@@ -26,104 +26,14 @@
 
 namespace mps_fast {
 
-inline constexpr int EXP10_TABLE_MAX = 308;
-
-constexpr double constexpr_pow10(int exp)
-{
-  if (exp == 0) return 1.0;
-  double result = 1.0;
-  if (exp > 0) {
-    for (int i = 0; i < exp; ++i)
-      result *= 10.0;
-  } else {
-    for (int i = 0; i > exp; --i)
-      result /= 10.0;
-  }
-  return result;
-}
-
-constexpr auto make_exp10_table()
-{
-  std::array<double, EXP10_TABLE_MAX * 2 + 1> table{};
-  for (int i = -EXP10_TABLE_MAX; i <= EXP10_TABLE_MAX; ++i) {
-    table[(size_t)(i + EXP10_TABLE_MAX)] = constexpr_pow10(i);
-  }
-  return table;
-}
-
-inline constexpr auto table_exp10 = make_exp10_table();
+static inline void reset_number_parse_stats() {}  // no-op: empty body
+static inline void print_number_parse_stats() {}  // no-op: empty body, prints nothing
 
 static inline bool is_digit_byte(char c) noexcept { return c >= '0' && c <= '9'; }
 
-// Honestly, it's pretty bare bones as it is. It could take advantage of SIMD/SWAR
-// or use the Eisel-Lemire trick. Would have to be validated through benchmarking
-// but usually MPS files use simple enough coefficients
 static inline double fast_atof_core(const char*& data, const char* end)
 {
-  double sign = 1.0;
-  if (data < end && *data == '-') {
-    sign = -1.0;
-    ++data;
-  } else if (data < end && *data == '+') {
-    ++data;
-  }
-
-  uint64_t significand   = 0;
-  int decimal_exponent   = 0;
-  int significant_digits = 0;
-  bool seen_dot          = false;
-
-  while (data < end) {
-    char c = *data;
-    if (is_digit_byte(c)) {
-      int digit = c - '0';
-      if (seen_dot) { --decimal_exponent; }
-      if (significand != 0 || digit != 0) {
-        // FP64 can't represent more than that
-        if (significant_digits < 19) {
-          significand = significand * 10 + static_cast<uint64_t>(digit);
-          ++significant_digits;
-        } else if (!seen_dot) {
-          ++decimal_exponent;
-        }
-      }
-      ++data;
-    } else if (c == '.' && !seen_dot) {
-      seen_dot = true;
-      ++data;
-    } else {
-      break;
-    }
-  }
-
-  if (data < end && (*data == 'e' || *data == 'E' || *data == 'd' || *data == 'D')) {
-    ++data;
-    int exp_sign = 1;
-    if (data < end && *data == '-') {
-      exp_sign = -1;
-      ++data;
-    } else if (data < end && *data == '+') {
-      ++data;
-    }
-
-    int exponent = 0;
-    while (data < end && is_digit_byte(*data)) {
-      exponent = exponent * 10 + (*data - '0');
-      ++data;
-    }
-
-    exponent *= exp_sign;
-    decimal_exponent += exponent;
-  }
-
-  double result = static_cast<double>(significand);
-  if (decimal_exponent >= -EXP10_TABLE_MAX && decimal_exponent <= EXP10_TABLE_MAX) {
-    result *= table_exp10[static_cast<size_t>(decimal_exponent + EXP10_TABLE_MAX)];
-  } else {
-    result *= std::pow(10.0, decimal_exponent);
-  }
-
-  return sign * result;
+  return fp64::parse_fp64_advance(data, end);  // thin wrapper; data is advanced by the callee
 }
 
 static inline double fast_atof(const char* data, const char* end)
@@ -167,14 +77,14 @@ struct cursor_t {
     char msg_buf[512];
     std::vsnprintf(msg_buf, sizeof(msg_buf), msg, args);
     va_end(args);
-    char buf[1024];
-    std::snprintf(buf, sizeof(buf), "%zu:%zu: %s", line, col, msg_buf);
-    throw std::runtime_error(buf);
+    mps_parser_fail(error_type_t::ValidationError, "%zu:%zu: %s", line, col, msg_buf);
   }
 
   void advance(std::size_t n)
   {
-    if (ptr + n > end) { throw std::runtime_error("cursor advanced past end of file"); }
+    if (ptr > end || n > static_cast<std::size_t>(end - ptr)) {  // never forms ptr + n past end
+      mps_parser_fail(error_type_t::ValidationError, "cursor advanced past end of file");
+    }
     ptr += n;
   }
 
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index 35a67346c3..73f50c5341 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -1,6 +1,8 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
+#define MPS_FAST_TIMERS  // NOTE(review): unconditional define - confirm it is not a debug leftover
+
 #include "fast_parser.hpp"
 #include "fast_parse_primitives.hpp"
 #include "file_reader.hpp"
@@ -1337,14 +1339,38 @@ static const char* find_next_line(const char* p, const char* end)
   return p;
 }
 
-static std::vector<BoundsChunkBoundary> compute_line_chunk_boundaries(const char* section_start,
-                                                                      const char* section_end,
-                                                                      int num_threads)
+static std::string_view peek_bounds_line_var_name(const char* line_start, const char* end)
+{  // returns the 3rd whitespace-separated token on the line; empty when the line ends first
+  const char* p = line_start;
+  for (int field = 0; field < 2; ++field) {  // skip the first two fields
+    while (p < end && *p <= ' ' && *p != '\n')  // NOTE(review): if char is signed, bytes >= 0x80 match
+      p++;
+    while (p < end && *p > ' ')  // field body: bytes above ' '
+      p++;
+  }
+  while (p < end && *p <= ' ' && *p != '\n')  // blanks before the third field; never crosses '\n'
+    p++;
+  const char* var_start = p;
+  while (p < end && *p > ' ')
+    p++;
+  return std::string_view(var_start, (size_t)(p - var_start));  // a view into the input buffer
+}
+
+static const char* find_line_start(const char* section_start, const char* p)
+{  // walks back from p to the byte just after the preceding '\n', stopping at section_start
+  while (p > section_start && p[-1] != '\n')
+    --p;
+  return p;  // NOTE(review): when p < section_start on entry, p is returned unchanged
+}
+
+static std::vector<BoundsChunkBoundary> compute_bounds_chunk_boundaries(const char* section_start,
+                                                                        const char* section_end,
+                                                                        int num_threads)
 {
   scoped_timer_t timer("bounds_compute_chunk_boundaries");
 
-  size_t total_size = (size_t)(section_end - section_start);
-  size_t chunk_size = total_size / (size_t)num_threads;
+  const size_t total_size = (size_t)(section_end - section_start);
+  const size_t chunk_size = total_size / (size_t)num_threads;
 
   std::vector<BoundsChunkBoundary> boundaries((size_t)num_threads);
   boundaries[0].start = section_start;
@@ -1352,9 +1378,21 @@ static std::vector<BoundsChunkBoundary> compute_line_chunk_boundaries(const char
     if (t == num_threads - 1) {
       boundaries[(size_t)t].end = section_end;
     } else {
-      const char* boundary            = section_start + (size_t)(t + 1) * chunk_size;
-      boundaries[(size_t)t].end       = find_next_line(boundary, section_end);
-      boundaries[(size_t)t + 1].start = boundaries[(size_t)t].end;
+      const char* boundary =
+        find_next_line(section_start + (size_t)(t + 1) * chunk_size, section_end);
+
+      // Keep consecutive BOUNDS records for the same variable in one chunk.
+      // Then each thread owns full LO/UP-style groups and can apply file order locally.
+      while (boundary < section_end) {
+        const char* prev_line = find_line_start(section_start, boundary - 1);
+        const auto prev_var   = peek_bounds_line_var_name(prev_line, section_end);
+        const auto next_var   = peek_bounds_line_var_name(boundary, section_end);
+        if (prev_var.empty() || next_var.empty() || prev_var != next_var) { break; }
+        boundary = find_next_line(boundary, section_end);
+      }
+
+      boundaries[(size_t)t].end       = boundary;
+      boundaries[(size_t)t + 1].start = boundary;
     }
   }
   return boundaries;
@@ -1580,6 +1618,23 @@ static void merge_chunk_results_to_csr(parse_state_t<i_t, f_t>& state,
     }
   }
   size_t total_cols = global_col_offset[num_chunks];
+  if constexpr (std::numeric_limits<i_t>::max() < std::numeric_limits<int64_t>::max()) {
+    const size_t index_max = (size_t)std::numeric_limits<i_t>::max();
+    if (total_nnz > index_max) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "fast MPS parser requires 64-bit indices: nnz=%zu exceeds index max=%zu",
+                      total_nnz,
+                      index_max);
+    }
+    if (total_cols > index_max || (size_t)n_rows > index_max) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "fast MPS parser requires 64-bit indices: rows=%zu cols=%zu exceed index "
+                      "max=%zu",
+                      (size_t)n_rows,
+                      total_cols,
+                      index_max);
+    }
+  }
   {
     scoped_timer_t timer("columns_dense_metadata");
     bool dense_ok   = total_cols > 0;
@@ -1986,7 +2041,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
     size_t comments         = 0;
     size_t min_var          = SIZE_MAX;
     size_t max_var          = 0;
-    size_t non_strict_order = 0;
+    size_t decreasing_order = 0;
     bool saw_integer_type   = false;
     bool saw_negative_upper = false;
     const char* error_ptr   = nullptr;
@@ -1994,7 +2049,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
   };
 
   std::vector<BoundsParallelStats> stats((size_t)num_threads);
-  auto boundaries = compute_line_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads);
+  auto boundaries =
+    compute_bounds_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads);
 
   std::vector<uint8_t> bound_seen;
   {
@@ -2005,8 +2061,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
   {
     scoped_timer_t timer(use_dense_lookup ? "parse_bounds_parallel_dense"
                                           : "parse_bounds_parallel_ordered_hint");
-    // Duplicate or non-monotone BOUNDS updates are file-order dependent. Parse
-    // optimistically, then accept only if chunk summaries prove strict order.
+    // Repeated BOUNDS for the same variable are safe inside a group-owned chunk.
+    // Parse optimistically, then accept only if chunk summaries prove no backward jumps.
 #pragma omp parallel for schedule(static) num_threads(num_threads)
     for (int t = 0; t < num_threads; ++t) {
       auto& local = stats[(size_t)t];
@@ -2073,7 +2129,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
           local.lines++;
           local.min_var = std::min(local.min_var, var_idx);
           local.max_var = std::max(local.max_var, var_idx);
-          if (prev_var != SIZE_MAX && var_idx <= prev_var) { local.non_strict_order++; }
+          if (prev_var != SIZE_MAX && var_idx < prev_var) { local.decreasing_order++; }
           prev_var = var_idx;
 
           bool first_bound_for_var = bound_seen[var_idx] == 0;
@@ -2152,7 +2208,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
   }
 
   size_t dense_misses     = 0;
-  size_t non_strict_order = 0;
+  size_t decreasing_order = 0;
   size_t overlap_chunks   = 0;
   size_t prev_max         = SIZE_MAX;
   for (int t = 0; t < num_threads; ++t) {
@@ -2162,21 +2218,21 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
       cursor.error("%s", local.error_msg);
     }
     dense_misses += local.dense_misses;
-    non_strict_order += local.non_strict_order;
+    decreasing_order += local.decreasing_order;
     if (local.lines > 0) {
       if (prev_max != SIZE_MAX && local.min_var <= prev_max) { overlap_chunks++; }
       prev_max = local.max_var;
     }
   }
 
-  const bool order_safe = dense_misses == 0 && non_strict_order == 0 && overlap_chunks == 0;
+  const bool order_safe = dense_misses == 0 && decreasing_order == 0 && overlap_chunks == 0;
 
   if (!order_safe) {
     std::fprintf(stderr,
                  "[WARN] parallel BOUNDS fallback to serial: lookup_misses=%zu "
-                 "non_strict_order=%zu overlap_chunks=%zu\n",
+                 "decreasing_order=%zu overlap_chunks=%zu\n",
                  dense_misses,
-                 non_strict_order,
+                 decreasing_order,
                  overlap_chunks);
     cursor.ptr = bounds_body_start;
     return false;
@@ -3010,26 +3066,30 @@ static small_raw_read_t try_read_small_raw_file(const std::string& path)
 {
   FILE* file = std::fopen(path.c_str(), "rb");
   if (file == nullptr) {
-    throw std::runtime_error("Failed to open raw MPS file '" + path + "': " + std::strerror(errno));
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open raw MPS file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
   }
   std::unique_ptr<FILE, decltype(&std::fclose)> file_guard(file, &std::fclose);
 
   if (std::fseek(file, 0, SEEK_END) != 0) {
-    throw std::runtime_error("Failed to seek raw MPS file '" + path + "'");
+    mps_parser_fail(error_type_t::RuntimeError, "Failed to seek raw MPS file '%s'", path.c_str());
   }
   long file_size_long = std::ftell(file);
   if (file_size_long < 0) {
-    throw std::runtime_error("Failed to determine raw MPS file size '" + path + "'");
+    mps_parser_fail(
+      error_type_t::RuntimeError, "Failed to determine raw MPS file size '%s'", path.c_str());
   }
   std::size_t file_size = static_cast<std::size_t>(file_size_long);
   if (file_size > MPS_SMALL_RAW_FILE_BYTES) { return {}; }
   if (std::fseek(file, 0, SEEK_SET) != 0) {
-    throw std::runtime_error("Failed to rewind raw MPS file '" + path + "'");
+    mps_parser_fail(error_type_t::RuntimeError, "Failed to rewind raw MPS file '%s'", path.c_str());
   }
 
   std::vector<char> buffer(file_size);
   if (file_size != 0 && std::fread(buffer.data(), 1, file_size, file) != file_size) {
-    throw std::runtime_error("Failed to read raw MPS file '" + path + "'");
+    mps_parser_fail(error_type_t::RuntimeError, "Failed to read raw MPS file '%s'", path.c_str());
   }
   return {true, std::move(buffer)};
 }
@@ -3116,7 +3176,8 @@ cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
     return parse_mps_fast_stream<RawInputStream, i_t, f_t>(
       stream, "parse_mps_fast_file_raw (total)", "task_raw_read");
   }
-  throw std::runtime_error("single-path parser supports raw read and LZ4 inputs only");
+  mps_parser_fail(error_type_t::RuntimeError,
+                  "single-path parser supports raw read and LZ4 inputs only");
 }
 
 template cuopt::linear_programming::io::mps_data_model_t<int, float> parse_mps_fast_file(
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
index 20e9901024..9f6f0f107b 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
-// reserved. SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 
@@ -13,7 +13,10 @@
 namespace mps_fast {
 
 template <typename i_t, typename f_t>
-cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
-  const std::string& path, FileReadMethod read_method = FileReadMethod::Read);
+using parser_model_t = cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>;
+
+template <typename i_t, typename f_t>
+parser_model_t<i_t, f_t> parse_mps_fast_file(const std::string& path,
+                                             FileReadMethod read_method = FileReadMethod::Read);
 
 }  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
index 49a7602739..0d14f059bc 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
@@ -9,6 +9,8 @@
 
 #include "fast_parser.hpp"
 
+#include <utilities/logger.hpp>
+
 #include <cstdint>
 
 namespace cuopt::linear_programming::io {
@@ -16,6 +18,7 @@ namespace cuopt::linear_programming::io {
 template <typename i_t, typename f_t>
 mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path)
 {
+  CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str());
   return mps_fast::parse_mps_fast_file<i_t, f_t>(mps_file_path);
 }
 
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index 97ef5c5cc4..08521eafc0 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -1,9 +1,11 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #include "file_reader.hpp"
 #include "nvtx_ranges.hpp"
 
+#include <utilities/error.hpp>
+
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
@@ -24,6 +26,10 @@
 
 namespace mps_fast {
 
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_expects;
+using cuopt::linear_programming::io::mps_parser_fail;
+
 char* string_buffer;
 char* string_buffer_ptr;
 
@@ -65,7 +71,10 @@ std::size_t get_file_size(int fd, const std::string& path)
 {
   struct stat st;
   if (::fstat(fd, &st) != 0) {
-    throw std::runtime_error("Failed to stat file '" + path + "': " + std::strerror(errno));
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to stat file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
   }
   return static_cast<std::size_t>(st.st_size);
 }
@@ -86,7 +95,7 @@ std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
   if (remainder == 0) { return value; }
   std::size_t increment = alignment - remainder;
   if (value > std::numeric_limits<std::size_t>::max() - increment) {
-    throw std::runtime_error("allocation size overflow");
+    mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow");
   }
   return value + increment;
 }
@@ -98,7 +107,10 @@ RawInputStream::RawInputStream(const std::string& path) : path_(path)
   MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io);
   fd_ = ::open(path.c_str(), O_RDONLY);
   if (fd_ < 0) {
-    throw std::runtime_error("Failed to open raw MPS file '" + path + "': " + std::strerror(errno));
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open raw MPS file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
   }
 
   file_size_    = get_file_size(fd_, path);
@@ -173,11 +185,15 @@ void RawInputStream::run_decode_tasks()
           fd_, output_data_ + offset + done, size - done, static_cast<off_t>(offset + done));
         if (got < 0) {
           if (errno == EINTR) { continue; }
-          throw std::runtime_error("Failed to pread raw MPS file '" + path_ +
-                                   "': " + std::strerror(errno));
+          mps_parser_fail(error_type_t::RuntimeError,
+                          "Failed to pread raw MPS file '%s': %s",
+                          path_.c_str(),
+                          std::strerror(errno));
         }
         if (got == 0) {
-          throw std::runtime_error("Unexpected EOF while reading raw MPS file '" + path_ + "'");
+          mps_parser_fail(error_type_t::RuntimeError,
+                          "Unexpected EOF while reading raw MPS file '%s'",
+                          path_.c_str());
         }
         done += static_cast<std::size_t>(got);
       }
@@ -243,7 +259,8 @@ FileReadMethod effective_file_read_method(const std::string& path, FileReadMetho
 {
   if (has_lz4_extension(path)) { return FileReadMethod::Lz4; }
   if (method == FileReadMethod::Lz4) {
-    throw std::runtime_error("lz4 read method requires a .lz4 input: " + path);
+    mps_parser_fail(
+      error_type_t::ValidationError, "lz4 read method requires a .lz4 input: %s", path.c_str());
   }
   return method;
 }
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
index 3232a23e84..cc603e35d8 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.hpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #pragma once
diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
index 7aa302da23..ab0d4c2c78 100644
--- a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
+++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
@@ -5,7 +5,8 @@
 
 #pragma once
 
-#include "simd_compat.hpp"
+#include <simde/x86/avx2.h>
+#include <simde/x86/sse4.2.h>
 
 #include <cstdint>
 #include <cstring>
@@ -44,30 +45,6 @@ static inline uint32_t crcHash(const uint8_t* key, int64_t len)
   return crc;
 }
 
-static const simde__m128i aes_seed_128 =
-  simde_mm_set_epi64x(0x9E3779B97F4A7C15ULL, 0xBB67AE8584CAA73BULL);
-static const simde__m256i aes_seed_256 = simde_mm256_set_epi64x(
-  0x9E3779B97F4A7C15ULL, 0xBB67AE8584CAA73BULL, 0x3C6EF372FE94F82BULL, 0xA54FF53A5F1D36F1ULL);
-
-static inline uint32_t aes_hash(simde__m128i key)
-{
-  simde__m128i h      = simde_mm_aesenc_si128(key, aes_seed_128);
-  h                   = simde_mm_aesenc_si128(h, aes_seed_128);
-  simde__m128i folded = simde_mm_xor_si128(h, simde_mm_srli_si128(h, 8));
-  return (uint32_t)simde_mm_cvtsi128_si32(folded);
-}
-
-static inline uint32_t aes_hash(simde__m256i key)
-{
-  simde__m128i lo     = simde_mm256_castsi256_si128(key);
-  simde__m128i hi     = simde_mm256_extracti128_si256(key, 1);
-  simde__m128i h      = simde_mm_xor_si128(lo, hi);
-  h                   = simde_mm_aesenc_si128(h, aes_seed_128);
-  h                   = simde_mm_aesenc_si128(h, aes_seed_128);
-  simde__m128i folded = simde_mm_xor_si128(h, simde_mm_srli_si128(h, 8));
-  return (uint32_t)simde_mm_cvtsi128_si32(folded);
-}
-
 static inline uint32_t crcHash32B(uint64_t q0, uint64_t q1, uint64_t q2, uint64_t q3)
 {
   uint64_t crc = 0;
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index 36c42ba79a..010e890058 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -1,26 +1,25 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #include "file_reader.hpp"
 #include "mps_section_scanner.hpp"
 #include "nvtx_ranges.hpp"
 
+#include <utilities/error.hpp>
+
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 
-#ifndef _WIN32
 #include <dlfcn.h>
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
-#include <cerrno>
-#include <cstring>
-#endif
 
 #include <algorithm>
 #include <atomic>
+#include <cerrno>
 #include <condition_variable>
 #include <cstddef>
 #include <cstdint>
@@ -37,6 +36,10 @@
 
 namespace mps_fast {
 
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_expects;
+using cuopt::linear_programming::io::mps_parser_fail;
+
 namespace {
 
 constexpr uint32_t lz4_frame_magic                      = 0x184D2204u;
@@ -60,18 +63,18 @@ struct lz4_runtime_t {
       if (handle != nullptr) { break; }
     }
     if (handle == nullptr) {
-      throw std::logic_error(
-        "Could not open .mps.lz4 file since liblz4 was not found "
-        "(tried liblz4.so.1, liblz4.so). Decompress the .lz4 file manually "
-        "or install liblz4.");
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "Could not open .mps.lz4 file since liblz4 was not found "
+                      "(tried liblz4.so.1, liblz4.so). Decompress the .lz4 file manually "
+                      "or install liblz4.");
     }
 
     decompress_safe =
       reinterpret_cast<LZ4_decompress_safe_t>(::dlsym(handle, "LZ4_decompress_safe"));
     if (decompress_safe == nullptr) {
-      throw std::logic_error(
-        "Error loading LZ4_decompress_safe from liblz4. Decompress the .lz4 file manually "
-        "or install a compatible liblz4.");
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "Error loading LZ4_decompress_safe from liblz4. Decompress the .lz4 file "
+                      "manually or install a compatible liblz4.");
     }
   }
 
@@ -100,7 +103,8 @@ int lz4_decompress_safe_runtime(const char* src, char* dst, int compressed_size,
   (void)dst;
   (void)compressed_size;
   (void)dst_capacity;
-  throw std::logic_error(
+  mps_parser_fail(
+    error_type_t::RuntimeError,
     "Experimental fast MPS parser was built without LZ4 decompression support. "
     "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
 #endif
@@ -111,7 +115,8 @@ void ensure_lz4_runtime_available()
 #if defined(MPS_PARSER_WITH_LZ4)
   (void)lz4_runtime();
 #else
-  throw std::logic_error(
+  mps_parser_fail(
+    error_type_t::RuntimeError,
     "Experimental fast MPS parser was built without LZ4 decompression support. "
     "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
 #endif
@@ -121,37 +126,16 @@ int open_lz4_fd(const std::string& path)
 {
   int fd = ::open(path.c_str(), O_RDONLY);
   if (fd < 0) {
-    throw std::runtime_error("Failed to open LZ4 file '" + path + "': " + std::strerror(errno));
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open LZ4 file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
   }
   return fd;
 }
 
-#ifndef _WIN32
-std::size_t system_page_size();
-#endif
 std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment);
 
-#ifndef _WIN32
-class FileDescriptor {
- public:
-  explicit FileDescriptor(int fd) : fd_(fd) {}
-  ~FileDescriptor()
-  {
-    if (fd_ >= 0) { ::close(fd_); }
-  }
-
-  FileDescriptor(const FileDescriptor&)            = delete;
-  FileDescriptor& operator=(const FileDescriptor&) = delete;
-
-  int get() const noexcept { return fd_; }
-  bool valid() const noexcept { return fd_ >= 0; }
-
- private:
-  int fd_;
-};
-
-#endif
-
 uint32_t read_le32(const char* ptr)
 {
   const auto* p = reinterpret_cast<const unsigned char*>(ptr);
@@ -176,32 +160,34 @@ std::size_t block_max_size_from_bd(unsigned char bd)
     case 5: return 256ull * 1024ull;
     case 6: return 1024ull * 1024ull;
     case 7: return 4ull * 1024ull * 1024ull;
-    default: throw std::runtime_error("unsupported LZ4 frame block size ID");
+    default: mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame block size ID");
   }
 }
 
 std::size_t checked_size(uint64_t value, const char* label)
 {
   if (value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max())) {
-    throw std::runtime_error(std::string("LZ4 ") + label + " exceeds size_t");
+    mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 %s exceeds size_t", label);
   }
   return static_cast<std::size_t>(value);
 }
 
-#ifndef _WIN32
 std::size_t get_file_size(int fd, const std::string& path)
 {
   struct stat st;
   if (::fstat(fd, &st) != 0) {
-    throw std::runtime_error("Failed to stat file '" + path + "': " + std::strerror(errno));
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to stat file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+  if (st.st_size < 0) {
+    mps_parser_fail(
+      error_type_t::RuntimeError, "Invalid negative file size for '%s'", path.c_str());
   }
-  if (st.st_size < 0) { throw std::runtime_error("Invalid negative file size for '" + path + "'"); }
   return static_cast<std::size_t>(st.st_size);
 }
 
-#endif
-
-#ifndef _WIN32
 std::size_t system_page_size()
 {
   static std::size_t page_size = [] {
@@ -210,7 +196,6 @@ std::size_t system_page_size()
   }();
   return page_size;
 }
-#endif
 
 std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
 {
@@ -219,16 +204,15 @@ std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
   if (remainder == 0) { return value; }
   std::size_t increment = alignment - remainder;
   if (value > std::numeric_limits<std::size_t>::max() - increment) {
-    throw std::runtime_error("allocation size overflow");
+    mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow");
   }
   return value + increment;
 }
 
-#ifndef _WIN32
 std::size_t checked_mul(std::size_t a, std::size_t b, const char* label)
 {
   if (a != 0 && b > std::numeric_limits<std::size_t>::max() / a) {
-    throw std::runtime_error(std::string(label) + " size overflow");
+    mps_parser_fail(error_type_t::OutOfMemoryError, "%s size overflow", label);
   }
   return a * b;
 }
@@ -313,7 +297,7 @@ class lz4_resident_windows_t {
   const lz4_resident_window_t& window_for_offset(std::size_t offset) const
   {
     if (windows_.empty()) {
-      throw std::runtime_error("LZ4 resident window lookup with no windows");
+      mps_parser_fail(error_type_t::RuntimeError, "LZ4 resident window lookup with no windows");
     }
     std::size_t lo = 0;
     std::size_t hi = windows_.size();
@@ -328,12 +312,11 @@ class lz4_resident_windows_t {
         return w;
       }
     }
-    throw std::runtime_error("LZ4 offset outside resident windows");
+    mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows");
   }
 
   std::vector<lz4_resident_window_t>& windows_;
 };
-#endif
 
 }  // namespace
 
@@ -350,48 +333,58 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path)
 
   char header[32];
   if (compressed_size_ < 7) {
-    throw std::runtime_error("LZ4 input is too small to contain a frame header");
+    mps_parser_fail(error_type_t::ValidationError,
+                    "LZ4 input is too small to contain a frame header");
   }
   std::size_t header_bytes = std::min<std::size_t>(sizeof(header), compressed_size_);
   if (!pread_full_plain(fd_, header, header_bytes, 0)) {
-    throw std::runtime_error("Failed to read LZ4 frame header '" + path +
-                             "': " + std::strerror(errno));
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to read LZ4 frame header '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
   }
 
   std::size_t offset = 0;
   uint32_t magic     = read_le32(header + offset);
   if (magic != lz4_frame_magic) {
-    throw std::runtime_error("unsupported LZ4 input: expected standard LZ4 frame magic");
+    mps_parser_fail(error_type_t::ValidationError,
+                    "unsupported LZ4 input: expected standard LZ4 frame magic");
   }
   offset += 4;
   unsigned char flg = static_cast<unsigned char>(header[offset++]);
   unsigned char bd  = static_cast<unsigned char>(header[offset++]);
   unsigned version  = (flg >> 6) & 0x3u;
-  if (version != 1) { throw std::runtime_error("unsupported LZ4 frame version"); }
+  if (version != 1) {
+    mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame version");
+  }
   bool block_independent = (flg & 0x20u) != 0;
   block_checksum_        = (flg & 0x10u) != 0;
   content_size_present_  = (flg & 0x08u) != 0;
   content_checksum_      = (flg & 0x04u) != 0;
   dict_id_               = (flg & 0x01u) != 0;
   if (!block_independent) {
-    throw std::runtime_error("parallel LZ4 reader requires independent blocks; compress with -BI");
+    mps_parser_fail(error_type_t::ValidationError,
+                    "parallel LZ4 reader requires independent blocks; compress with -BI");
   }
   block_max_size_ = block_max_size_from_bd(bd);
   if (content_size_present_) {
     if (offset + 8 > header_bytes) {
-      throw std::runtime_error("truncated LZ4 frame while reading content size");
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading content size");
     }
     content_size_ = checked_size(read_le64(header + offset), "content size");
     offset += 8;
   }
   if (dict_id_) {
     if (offset + 4 > header_bytes) {
-      throw std::runtime_error("truncated LZ4 frame while reading dictionary id");
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading dictionary id");
     }
     offset += 4;
   }
   if (offset + 1 > header_bytes) {
-    throw std::runtime_error("truncated LZ4 frame while reading header checksum");
+    mps_parser_fail(error_type_t::ValidationError,
+                    "truncated LZ4 frame while reading header checksum");
   }
   offset += 1;
   header_size_ = offset;
@@ -447,7 +440,7 @@ void Lz4InputStream::commit_up_to(std::size_t bytes)
   std::lock_guard<std::mutex> lock(commit_mutex_);
   if (bytes <= output_committed_size_) return;
   if (bytes > output_mapped_size_) {
-    throw std::runtime_error("LZ4 output exceeded reserved virtual mapping");
+    mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 output exceeded reserved virtual mapping");
   }
   std::size_t new_committed = round_up_to_multiple(bytes, system_page_size());
   if (new_committed > output_mapped_size_) new_committed = output_mapped_size_;
@@ -556,7 +549,8 @@ void Lz4InputStream::run_decode_tasks()
             }
           }
           if (actual < 0 || static_cast<std::size_t>(actual) > block.decompressed_size) {
-            throw std::runtime_error("LZ4 input block decompressed to invalid size");
+            mps_parser_fail(error_type_t::ValidationError,
+                            "LZ4 input block decompressed to invalid size");
           }
 
           std::size_t actual_size = static_cast<std::size_t>(actual);
@@ -606,8 +600,13 @@ void Lz4InputStream::run_decode_tasks()
           ok = pread_full_plain(fd_, w.data.get(), w.size, w.file_offset);
         }
         if (!ok) {
-          fail_and_notify(std::make_exception_ptr(std::runtime_error(
-            "Failed to pread LZ4 resident window: " + std::string(std::strerror(errno)))));
+          try {
+            mps_parser_fail(error_type_t::RuntimeError,
+                            "Failed to pread LZ4 resident window: %s",
+                            std::strerror(errno));
+          } catch (...) {
+            fail_and_notify(std::current_exception());
+          }
           return;
         }
         {
@@ -637,8 +636,8 @@ void Lz4InputStream::run_decode_tasks()
             return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0;
           });
           if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) {
-            throw std::runtime_error(
-              "LZ4 metadata scanner stopped before required window was ready");
+            mps_parser_fail(error_type_t::RuntimeError,
+                            "LZ4 metadata scanner stopped before required window was ready");
           }
         }
       };
@@ -665,7 +664,8 @@ void Lz4InputStream::run_decode_tasks()
         MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic);
         wait_range_ready(offset, 4);
         if (offset + 4 > compressed_size_) {
-          throw std::runtime_error("truncated LZ4 frame while reading block header");
+          mps_parser_fail(error_type_t::ValidationError,
+                          "truncated LZ4 frame while reading block header");
         }
         uint32_t raw_block_size = resident.read_u32(offset);
         offset += 4;
@@ -674,17 +674,20 @@ void Lz4InputStream::run_decode_tasks()
         bool uncompressed              = (raw_block_size & lz4_uncompressed_block) != 0;
         std::size_t block_payload_size = raw_block_size & lz4_block_size_mask;
         if (block_payload_size == 0) {
-          throw std::runtime_error("invalid zero-sized LZ4 data block");
+          mps_parser_fail(error_type_t::ValidationError, "invalid zero-sized LZ4 data block");
         }
         if (block_payload_size > block_max_size_ && uncompressed) {
-          throw std::runtime_error("LZ4 uncompressed block exceeds frame block maximum");
+          mps_parser_fail(error_type_t::ValidationError,
+                          "LZ4 uncompressed block exceeds frame block maximum");
         }
         if (content_size_present_ && decompressed_offset >= content_size_) {
-          throw std::runtime_error("LZ4 frame contains more blocks than content size allows");
+          mps_parser_fail(error_type_t::ValidationError,
+                          "LZ4 frame contains more blocks than content size allows");
         }
         wait_range_ready(offset, block_payload_size);
         if (offset + block_payload_size > compressed_size_) {
-          throw std::runtime_error("truncated LZ4 frame while reading block payload");
+          mps_parser_fail(error_type_t::ValidationError,
+                          "truncated LZ4 frame while reading block payload");
         }
 
         std::size_t decompressed_size = block_payload_size;
@@ -696,7 +699,7 @@ void Lz4InputStream::run_decode_tasks()
           }
         }
         if (content_size_present_ && decompressed_size > content_size_ - decompressed_offset) {
-          throw std::runtime_error("LZ4 block exceeds declared content size");
+          mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size");
         }
 
         const char* src = resident.ptr_if_contiguous(offset, block_payload_size);
@@ -717,27 +720,32 @@ void Lz4InputStream::run_decode_tasks()
         if (block_checksum_) {
           wait_range_ready(offset, 4);
           if (offset + 4 > compressed_size_) {
-            throw std::runtime_error("truncated LZ4 frame while reading block checksum");
+            mps_parser_fail(error_type_t::ValidationError,
+                            "truncated LZ4 frame while reading block checksum");
           }
           offset += 4;
         }
         if (blocks_scanned.load(std::memory_order_relaxed) > block_done_.size()) {
-          throw std::runtime_error("LZ4 input block count exceeded reserved metadata slots");
+          mps_parser_fail(error_type_t::OutOfMemoryError,
+                          "LZ4 input block count exceeded reserved metadata slots");
         }
         if (batch.size() >= 1024) { push_batch(batch); }
       }
       if (content_checksum_) {
         wait_range_ready(offset, 4);
         if (offset + 4 > compressed_size_) {
-          throw std::runtime_error("truncated LZ4 frame while reading content checksum");
+          mps_parser_fail(error_type_t::ValidationError,
+                          "truncated LZ4 frame while reading content checksum");
         }
         offset += 4;
       }
       if (content_size_present_ && decompressed_offset != content_size_) {
-        throw std::runtime_error("LZ4 frame ended before declared content size was reached");
+        mps_parser_fail(error_type_t::ValidationError,
+                        "LZ4 frame ended before declared content size was reached");
       }
       if (offset != compressed_size_) {
-        throw std::runtime_error("LZ4 input contains trailing data after the first frame");
+        mps_parser_fail(error_type_t::ValidationError,
+                        "LZ4 input contains trailing data after the first frame");
       }
       push_batch(batch);
       {
diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
index c1f411111a..98c6e4885d 100644
--- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp
+++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #pragma once
@@ -10,12 +10,19 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
+
+#include <utilities/error.hpp>
+
 #include <limits>
 #include <stdexcept>
 #include <string>
 
 namespace mps_fast {
 
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_expects;
+using cuopt::linear_programming::io::mps_parser_fail;
+
 // Move-only owner for a Linux mmap range. Fixed sub-maps inside a reserved range
 // are still released by unmapping the owning outer range.
 class mmap_region_t {
@@ -51,8 +58,8 @@ class mmap_region_t {
   {
     void* ptr = ::mmap(address, size, prot, flags, fd, offset);
     if (ptr == MAP_FAILED) {
-      throw std::runtime_error(std::string("mmap failed for ") + context + ": " +
-                               std::strerror(errno));
+      mps_parser_fail(
+        error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno));
     }
     return mmap_region_t(ptr, size);
   }
@@ -66,17 +73,18 @@ class mmap_region_t {
     std::size_t size, std::size_t alignment, int prot, int flags, const char* context)
   {
     if (alignment == 0 || (alignment & (alignment - 1)) != 0) {
-      throw std::runtime_error("mmap aligned allocation requires power-of-two alignment");
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "mmap aligned allocation requires power-of-two alignment");
     }
     if (size > std::numeric_limits<std::size_t>::max() - alignment) {
-      throw std::runtime_error("mmap aligned allocation size overflow");
+      mps_parser_fail(error_type_t::OutOfMemoryError, "mmap aligned allocation size overflow");
     }
 
     std::size_t raw_size = size + alignment;
     void* raw            = ::mmap(nullptr, raw_size, prot, flags | MAP_ANONYMOUS, -1, 0);
     if (raw == MAP_FAILED) {
-      throw std::runtime_error(std::string("mmap failed for ") + context + ": " +
-                               std::strerror(errno));
+      mps_parser_fail(
+        error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno));
     }
 
     uintptr_t raw_addr     = reinterpret_cast<uintptr_t>(raw);
@@ -93,8 +101,8 @@ class mmap_region_t {
   {
     void* ptr = ::mmap(address, size, prot, flags | MAP_FIXED, fd, offset);
     if (ptr == MAP_FAILED) {
-      throw std::runtime_error(std::string("mmap failed for ") + context + ": " +
-                               std::strerror(errno));
+      mps_parser_fail(
+        error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno));
     }
   }
 
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
index 3ed8763428..8581921173 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -1,8 +1,9 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #include "mps_section_scanner.hpp"
-#include "simd_compat.hpp"
+
+#include <utilities/error.hpp>
 
 #include <algorithm>
 #include <cstdint>
@@ -10,8 +11,15 @@
 #include <initializer_list>
 #include <stdexcept>
 
+#include <simde/x86/avx2.h>
+#include <simde/x86/sse4.2.h>
+
 namespace mps_fast {
 
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_expects;
+using cuopt::linear_programming::io::mps_parser_fail;
+
 namespace {
 
 bool is_nonblank_column1(unsigned char c) noexcept { return c > ' '; }
@@ -52,7 +60,7 @@ std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase)
     case mps_phase_kind::ranges: return 5;
     case mps_phase_kind::quadratic: return 6;
   }
-  throw std::runtime_error("invalid MPS phase kind");
+  mps_parser_fail(error_type_t::RuntimeError, "invalid MPS phase kind");
 }
 
 void mps_phase_registry_t::publish(mps_phase_kind phase, mps_phase_range_t range)
@@ -277,7 +285,8 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index,
                                                 const char* end)
 {
   if (block_index >= block_count_) {
-    throw std::runtime_error("MPS section scanner observed invalid LZ4 block index");
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "MPS section scanner observed invalid LZ4 block index");
   }
 
   scan_section_range(begin, end, false);
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
index 0c492b0074..cc287368fb 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #pragma once
diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
index 650d28dbc2..23f4b4b8c1 100644
--- a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
+++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #pragma once
diff --git a/cpp/src/io/experimental_mps_fast/perf_counters.hpp b/cpp/src/io/experimental_mps_fast/perf_counters.hpp
index 147a7ae7bb..1baaf011e5 100644
--- a/cpp/src/io/experimental_mps_fast/perf_counters.hpp
+++ b/cpp/src/io/experimental_mps_fast/perf_counters.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #pragma once
diff --git a/cpp/src/io/experimental_mps_fast/simd_compat.hpp b/cpp/src/io/experimental_mps_fast/simd_compat.hpp
index d81af7a2eb..fb849fcff0 100644
--- a/cpp/src/io/experimental_mps_fast/simd_compat.hpp
+++ b/cpp/src/io/experimental_mps_fast/simd_compat.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #pragma once
diff --git a/cpp/src/io/utilities/error.hpp b/cpp/src/io/utilities/error.hpp
index 58ac3891e1..c1b28fc7ff 100644
--- a/cpp/src/io/utilities/error.hpp
+++ b/cpp/src/io/utilities/error.hpp
@@ -34,6 +34,30 @@ inline std::string error_to_string(error_type_t error)
   return std::string("UnAccountedError");
 }
 
+[[noreturn]] inline void mps_parser_throw(error_type_t error_type, const char* msg)
+{
+  throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + error_to_string(error_type) +
+                         "\", \"msg\": " + "\"" + std::string(msg) + "\"}");
+}
+
+/**
+ * @brief Report an unrecoverable parser error.
+ *
+ * @param[error_type_t] error enum error type
+ * @param[const char *] fmt String format for error message
+ * @param variable set of arguments used for fmt
+ * @throw std::logic_error always
+ */
+[[noreturn]] inline void mps_parser_fail(error_type_t error_type, const char* fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  char msg[2048];
+  vsnprintf(msg, sizeof(msg), fmt, args);
+  va_end(args);
+  mps_parser_throw(error_type, msg);
+}
+
 /**
  * @brief Function for checking (pre-)conditions that throws an exception when a
  * condition is false
@@ -52,9 +76,7 @@ inline void mps_parser_expects(bool cond, error_type_t error_type, const char* f
     char msg[2048];
     vsnprintf(msg, sizeof(msg), fmt, args);
     va_end(args);
-
-    throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + error_to_string(error_type) +
-                           "\", \"msg\": " + "\"" + std::string(msg) + "\"}");
+    mps_parser_throw(error_type, msg);
   }
 }
 

From be97a050f1fee09385f6ca60db03d6596aba5bee Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Mon, 8 Jun 2026 05:38:55 -0700
Subject: [PATCH 05/22] decode performance metrics

---
 .../experimental_mps_fast/lz4_file_reader.cpp | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index 010e890058..a0be7daaf0 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -20,9 +20,11 @@
 #include <algorithm>
 #include <atomic>
 #include <cerrno>
+#include <chrono>
 #include <condition_variable>
 #include <cstddef>
 #include <cstdint>
+#include <cstdio>
 #include <cstring>
 #include <deque>
 #include <limits>
@@ -217,6 +219,12 @@ std::size_t checked_mul(std::size_t a, std::size_t b, const char* label)
   return a * b;
 }
 
+double elapsed_ms_since(std::chrono::steady_clock::time_point start)
+{
+  return std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start)
+    .count();
+}
+
 bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset)
 {
   std::size_t done = 0;
@@ -479,6 +487,8 @@ void Lz4InputStream::run_decode_tasks()
   }
 
   const std::size_t io_threads = std::min(lz4_input_max_io_threads, window_count);
+  std::atomic<double> decoder_wait_batch_ms{0.0};
+  std::atomic<double> decoder_active_batch_ms{0.0};
 
   struct resident_block_desc_t {
     const char* src                 = nullptr;
@@ -514,10 +524,12 @@ void Lz4InputStream::run_decode_tasks()
         {
           MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io);
           std::unique_lock<std::mutex> lock(desc_mutex);
+          const auto wait_start = std::chrono::steady_clock::now();
           desc_cv.wait(lock, [&] {
             return stop_workers.load(std::memory_order_acquire) || scanner_done ||
                    !desc_queue.empty();
           });
+          decoder_wait_batch_ms.fetch_add(elapsed_ms_since(wait_start), std::memory_order_relaxed);
           if (stop_workers.load(std::memory_order_acquire)) { return; }
           if (desc_queue.empty()) {
             if (scanner_done) return;
@@ -527,6 +539,7 @@ void Lz4InputStream::run_decode_tasks()
           desc_queue.pop_front();
         }
 
+        const auto decode_start = std::chrono::steady_clock::now();
         MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode);
         for (const auto& block : batch) {
           char* dst  = output_data_ + block.decompressed_offset;
@@ -578,6 +591,8 @@ void Lz4InputStream::run_decode_tasks()
             section_scanner_->publish_ready(after);
           }
         }
+        decoder_active_batch_ms.fetch_add(elapsed_ms_since(decode_start),
+                                          std::memory_order_relaxed);
       }
     } catch (...) {
       fail_and_notify(std::current_exception());
@@ -621,6 +636,7 @@ void Lz4InputStream::run_decode_tasks()
 
   std::atomic_size_t blocks_scanned{0};
   std::vector<std::vector<char>> crossing_payloads;
+  const auto read_wall_start = std::chrono::steady_clock::now();
   std::thread scanner([&] {
     try {
       nvtx::name_current_thread("lz4-metadata-scan");
@@ -770,6 +786,7 @@ void Lz4InputStream::run_decode_tasks()
   for (auto& reader : readers) {
     reader.join();
   }
+  const double read_wall_ms = elapsed_ms_since(read_wall_start);
   scanner.join();
   for (auto& worker : io_workers) {
     worker.join();
@@ -777,6 +794,19 @@ void Lz4InputStream::run_decode_tasks()
   if (first_error) std::rethrow_exception(first_error);
   output_view_size_ = ready_bytes_;
   section_scanner_->publish_ready(output_view_size_);
+
+  const double compressed_mb = static_cast<double>(compressed_size_) / (1024.0 * 1024.0);
+  const double read_effective_mbps =
+    read_wall_ms > 0.0 ? compressed_mb / (read_wall_ms / 1000.0) : 0.0;
+  const double decoder_wait_ms   = decoder_wait_batch_ms.load(std::memory_order_relaxed);
+  const double decoder_active_ms = decoder_active_batch_ms.load(std::memory_order_relaxed);
+  const double decoder_total_ms  = decoder_wait_ms + decoder_active_ms;
+  const double decoder_wait_ratio =
+    decoder_total_ms > 0.0 ? decoder_wait_ms / decoder_total_ms : 0.0;
+  std::fprintf(stderr,
+               "[LZ4_IO] read_effective_MBps=%.3f decoder_wait_ratio=%.6f\n",
+               read_effective_mbps,
+               decoder_wait_ratio);
 }
 
 }  // namespace mps_fast

From 1e4d7c991c4392728db2d0a1e7fbde87d652ba0f Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Wed, 10 Jun 2026 07:51:22 -0700
Subject: [PATCH 06/22] lots of cleanup

---
 cpp/CMakeLists.txt                            |   6 +
 .../fast_fp64_parser.hpp                      |  72 +-
 .../fast_parse_primitives.hpp                 |  83 +-
 .../io/experimental_mps_fast/fast_parser.cpp  | 994 ++++++++++--------
 .../io/experimental_mps_fast/file_reader.cpp  |  93 +-
 .../io/experimental_mps_fast/file_reader.hpp  |  37 +-
 .../hash_table_smallstr.hpp                   | 293 +-----
 .../experimental_mps_fast/lz4_file_reader.cpp | 786 ++++++++------
 .../io/experimental_mps_fast/mmap_region.hpp  |  25 +-
 .../mps_section_scanner.cpp                   | 290 +++--
 .../mps_section_scanner.hpp                   |  10 +-
 .../io/experimental_mps_fast/nvtx_ranges.hpp  |   2 +-
 .../io/experimental_mps_fast/simd_compat.hpp  |  10 -
 cpp/tests/linear_programming/CMakeLists.txt   |  38 +
 .../fast_fp64_parser_test.cpp                 | 231 ++++
 .../fast_parser_edge_test.cpp                 | 871 +++++++++++++++
 16 files changed, 2463 insertions(+), 1378 deletions(-)
 delete mode 100644 cpp/src/io/experimental_mps_fast/simd_compat.hpp
 create mode 100644 cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
 create mode 100644 cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 712a132fc0..e134d49d02 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -54,6 +54,7 @@ option(SKIP_ROUTING_BUILD "Skip building routing components" OFF)
 option(SKIP_GRPC_BUILD "Skip building gRPC and protobuf components" OFF)
 option(WRITE_FATBIN "Enable fatbin writing" ON)
 option(HOST_LINEINFO "Build with debug line information for host code" OFF)
+option(MPS_FAST_TIMERS "Enable experimental fast MPS parser phase timer printouts" OFF)
 
 message(VERBOSE "cuOpt: Enable nvcc -lineinfo: ${CMAKE_CUDA_LINEINFO}")
 message(VERBOSE "cuOpt: Build cuOpt unit-tests: ${BUILD_TESTS}")
@@ -64,6 +65,7 @@ message(VERBOSE "cuOpt: Skip C/Python adapters: ${SKIP_C_PYTHON_ADAPTERS}")
 message(VERBOSE "cuOpt: Skip routing build: ${SKIP_ROUTING_BUILD}")
 message(VERBOSE "cuOpt: Build with debug line information for host code: ${HOST_LINEINFO}")
 message(VERBOSE "cuOpt: fatbin: ${WRITE_FATBIN}")
+message(VERBOSE "cuOpt: Fast MPS parser timers: ${MPS_FAST_TIMERS}")
 
 # ##################################################################################################
 # - compiler options ------------------------------------------------------------------------------
@@ -517,6 +519,10 @@ target_compile_definitions(cuopt
   PUBLIC CUSPARSE_ENABLE_EXPERIMENTAL_API
 )
 
+if (MPS_FAST_TIMERS)  # opt-in: define MPS_FAST_TIMERS=1 for cuopt's own sources only (PRIVATE)
+    target_compile_definitions(cuopt PRIVATE MPS_FAST_TIMERS=1)
+endif ()
+
 target_compile_options(cuopt
         PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUOPT_CXX_FLAGS}>"
         "$<$<COMPILE_LANGUAGE:CUDA>:${CUOPT_CUDA_FLAGS}>"
diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
index 605c6adc5b..905dcc9e7b 100644
--- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
@@ -27,6 +27,8 @@ namespace fp64 {
 #define FASTP64_MAX_EXP_10    288
 #define FASTP64_POWER_COUNT   (FASTP64_MAX_EXP_10 - FASTP64_MIN_EXP_10 + 1)
 #define FASTP64_MANTISSA_MASK ((uint64_t{1} << 52) - 1)
+#define FASTP64_EXPONENT_MASK 0x7FF
+#define FASTP64_HALF_MASK     0x1FF
 
 // Fast FP64 parser optimized for the <=19digits case, based on the Eisel-Lemire algorithm
 // see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51
@@ -45,11 +47,11 @@ struct cuopt_uint256_t {
   {
     unsigned __int128 carry = 0;
     for (uint64_t& v : limb) {
-      unsigned __int128 x = static_cast<unsigned __int128>(v) * m + carry;
-      v                   = static_cast<uint64_t>(x);
+      unsigned __int128 x = (unsigned __int128)v * m + carry;
+      v                   = (uint64_t)x;
       carry               = x >> 64;
     }
-    return static_cast<uint32_t>(carry);
+    return (uint32_t)carry;
   }
 
   constexpr cuopt_uint256_t shl_small(int bits) const
@@ -81,6 +83,9 @@ struct cuopt_normalized_uint256_t {
   {
     uint32_t carry = sig.mul_u32(10);
     int shift      = 32 - std::countl_zero(carry);
+    // The normalized 256-bit value always overflows into carry after *10; keep
+    // the guard explicit because the cross-limb path shifts by 64 - shift.
+    if (shift == 0) { return; }
     cuopt_uint256_t out;
     for (int i = 0; i < 4; ++i) {
       uint64_t lower = sig.limb[i] >> shift;
@@ -88,7 +93,7 @@ struct cuopt_normalized_uint256_t {
       if (i + 1 < 4) {
         upper = sig.limb[i + 1] << (64 - shift);
       } else {
-        upper = static_cast<uint64_t>(carry) << (64 - shift);
+        upper = (uint64_t)carry << (64 - shift);
       }
       out.limb[i] = lower | upper;
     }
@@ -107,7 +112,7 @@ struct cuopt_normalized_uint256_t {
     unsigned __int128 rem = extra;
     for (int i = 3; i >= 0; --i) {
       unsigned __int128 cur = (rem << 64) | shifted.limb[i];
-      quotient.limb[i]      = static_cast<uint64_t>(cur / 10);
+      quotient.limb[i]      = (uint64_t)(cur / 10);
       rem                   = cur % 10;
     }
     sig = quotient;
@@ -186,7 +191,7 @@ static inline bool parse_8_digits(const char* p, uint32_t& out)
   uint64_t v     = raw - 0x3030303030303030ULL;
   uint64_t pairs = (v * 10 + (v >> 8)) & 0x00FF00FF00FF00FFULL;
   uint64_t quads = (pairs * 100 + (pairs >> 16)) & 0x0000FFFF0000FFFFULL;
-  out            = static_cast<uint32_t>((quads * 10000 + (quads >> 32)) & 0xFFFFFFFFULL);
+  out            = (uint32_t)((quads * 10000 + (quads >> 32)) & 0xFFFFFFFFULL);
   return true;
 }
 
@@ -229,7 +234,7 @@ static inline void scan_digit_run(const char*& p,
     if (after_dot) ++frac_digits;
     if (!too_many_digits && (digit != 0 || sig_digits != 0)) {
       if (sig_digits < 19) {
-        out.mantissa = (out.mantissa * 10) + static_cast<uint64_t>(digit);
+        out.mantissa = (out.mantissa * 10) + (uint64_t)digit;
         ++sig_digits;
       } else {
         too_many_digits = true;
@@ -314,38 +319,42 @@ static inline bool eisel_lemire(uint64_t man, int exp10, uint64_t& bits)
   uint64_t norm                = man << lz;
   int adj_e2                   = p.biased_e2 - lz;
 
-  unsigned __int128 product = static_cast<unsigned __int128>(norm) * p.high;
-  uint64_t hi               = static_cast<uint64_t>(product >> 64);
-  uint64_t lo               = static_cast<uint64_t>(product);
+  unsigned __int128 product = (unsigned __int128)norm * p.high;
+  uint64_t hi               = (uint64_t)(product >> 64);
+  uint64_t lo               = (uint64_t)product;
 
-  if ((hi & 0x1FF) == 0x1FF && lo + norm < norm) {
-    unsigned __int128 low_product = static_cast<unsigned __int128>(norm) * p.low;
-    uint64_t low_hi               = static_cast<uint64_t>(low_product >> 64);
-    uint64_t low_lo               = static_cast<uint64_t>(low_product);
+  // If the high product lands near the 9-bit halfway window, include the low
+  // 64x64 product to disambiguate rounding before deciding whether to fall back.
+  if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK && lo + norm < norm) {
+    unsigned __int128 low_product = (unsigned __int128)norm * p.low;
+    uint64_t low_hi               = (uint64_t)(low_product >> 64);
+    uint64_t low_lo               = (uint64_t)low_product;
     uint64_t old_lo               = lo;
     lo += low_hi;
     hi += lo < old_lo ? 1 : 0;
-    if ((hi & 0x1FF) == 0x1FF && lo == std::numeric_limits<uint64_t>::max() &&
-        low_lo + norm < low_lo) {
+    if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK &&
+        lo == std::numeric_limits<uint64_t>::max() && low_lo + norm < low_lo) {
       return false;
     }
   }
 
   uint64_t hi_msb = hi >> 63;
-  uint64_t x54    = hi >> (9 + hi_msb);
-  adj_e2 -= static_cast<int>(1 - hi_msb);
+  // Extract 54 bits: 53 significand bits plus one rounding bit. The product
+  // may be shifted by one depending on whether hi already has its top bit set.
+  uint64_t x54 = hi >> (9 + hi_msb);
+  adj_e2 -= (int)(1 - hi_msb);
 
-  // half-way ambiguity, fallback
-  if (lo == 0 && (hi & 0x1FF) == 0 && (x54 & 3) == 1) { return false; }
+  // Exact halfway with round-to-even ambiguity; let strtod handle the rare tie.
+  if (lo == 0 && (hi & FASTP64_HALF_MASK) == 0 && (x54 & 3) == 1) { return false; }
 
-  // exponent overflow, fallback
+  // Round 54 -> 53 bits, carry into the exponent if rounding overflows.
   uint64_t x53      = (x54 + (x54 & 1)) >> 1;
   uint64_t overflow = x53 >> 53;
   uint64_t ret_man  = (x53 >> overflow) & FASTP64_MANTISSA_MASK;
-  int ret_exp       = adj_e2 + static_cast<int>(overflow);
-  if (ret_exp <= 0 || ret_exp >= 0x7FF) { return false; }
+  int ret_exp       = adj_e2 + (int)overflow;
+  if (ret_exp <= 0 || ret_exp >= FASTP64_EXPONENT_MASK) { return false; }
 
-  bits = (static_cast<uint64_t>(ret_exp) << 52) | ret_man;
+  bits = ((uint64_t)ret_exp << 52) | ret_man;
   return true;
 }
 
@@ -357,14 +366,14 @@ static inline double assemble_fp64(const ParsedDecimal& dec)
   if (dec.fast_eligible) {
     double small    = 0.0;
     bool used_small = false;
-    if (dec.exp10 >= 0 && dec.exp10 < static_cast<int>(small_integer_powers.size())) {
+    if (dec.exp10 >= 0 && dec.exp10 < (int)small_integer_powers.size()) {
       uint64_t limit = (uint64_t{1} << 53) / small_integer_powers[dec.exp10];
       if (dec.mantissa <= limit) {
-        small      = static_cast<double>(dec.mantissa) * small_powers[dec.exp10];
+        small      = (double)dec.mantissa * small_powers[dec.exp10];
         used_small = true;
       }
     } else if (dec.exp10 < 0 && dec.exp10 >= -22 && dec.mantissa < (uint64_t{1} << 53)) {
-      small      = static_cast<double>(dec.mantissa) / small_powers[-dec.exp10];
+      small      = (double)dec.mantissa / small_powers[-dec.exp10];
       used_small = true;
     }
     if (used_small) { return dec.negative ? -small : small; }
@@ -383,17 +392,12 @@ static inline double parse_fp64_advance(const char*& p, const char* end)
   const char* start = p;
   ParsedDecimal dec;
   if (!parse_decimal_advance(p, end, dec)) {
-    return fallback_strtod(std::string_view(start, static_cast<size_t>(p - start)));
+    return fallback_strtod(std::string_view(start, (size_t)(p - start)));
   }
 
   double v = assemble_fp64(dec);
   if (v == v) return v;
-  return fallback_strtod(std::string_view(start, static_cast<size_t>(p - start)));
-}
-
-static inline double parse_fp64_token(const char* p, const char* end)
-{
-  return parse_fp64_advance(p, end);
+  return fallback_strtod(std::string_view(start, (size_t)(p - start)));
 }
 
 }  // namespace fp64
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
index bd4ee4669a..70ed3283c3 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -5,7 +5,6 @@
 
 #include "fast_fp64_parser.hpp"
 
-#include <cctype>
 #include <cstdarg>
 #include <cstddef>
 #include <cstdint>
@@ -26,26 +25,6 @@
 
 namespace mps_fast {
 
-static inline void reset_number_parse_stats() {}
-static inline void print_number_parse_stats() {}
-
-static inline bool is_digit_byte(char c) noexcept { return c >= '0' && c <= '9'; }
-
-static inline double fast_atof_core(const char*& data, const char* end)
-{
-  return fp64::parse_fp64_advance(data, end);
-}
-
-static inline double fast_atof(const char* data, const char* end)
-{
-  return fast_atof_core(data, end);
-}
-
-static inline double fast_atof_advance(const char*& ptr, const char* end)
-{
-  return fast_atof_core(ptr, end);
-}
-
 struct cursor_t {
   const char* start;
   const char* ptr;
@@ -65,7 +44,7 @@ struct cursor_t {
         line_start = p + 1;
       }
     }
-    std::size_t column = static_cast<std::size_t>(ptr - line_start) + 1;
+    std::size_t column = (std::size_t)(ptr - line_start) + 1;
     return {line, column};
   }
 
@@ -92,7 +71,7 @@ struct cursor_t {
   static const char* scalar_scan(const char* p, const char* end)
   {
     while (p < end) {
-      unsigned char c = static_cast<unsigned char>(*p);
+      unsigned char c = (unsigned char)*p;
       if constexpr (skip_ws_mode) {
         if (c > 32 || c == '\n') return p;
       } else {
@@ -171,6 +150,8 @@ struct cursor_t {
     const simde__m256i v32 = simde_mm256_set1_epi8(32);
     const simde__m256i vnl = simde_mm256_set1_epi8('\n');
 
+    // Input buffers are padded by file_reader/lz4_file_reader/small_raw_read,
+    // so this unaligned 32-byte load is valid whenever end - ptr >= 32.
     simde__m256i data    = simde_mm256_loadu_si256((const simde__m256i*)ptr);
     simde__m256i gt32    = simde_mm256_cmpgt_epi8(data, v32);
     unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32);
@@ -210,16 +191,19 @@ struct cursor_t {
   inline __attribute__((always_inline)) std::pair<std::string_view, std::string_view>
   read_two_fields()
   {
-    if (__unlikely(end - ptr < 32)) {
+    auto slow = [&] {
       auto f1 = read_field();
       auto f2 = read_field();
-      return {f1, f2};
-    }
+      return std::pair<std::string_view, std::string_view>{f1, f2};
+    };
+
+    if (__unlikely(end - ptr < 32)) { return slow(); }
 
     const char* field1_start = ptr;
     const simde__m256i v32   = simde_mm256_set1_epi8(32);
     const simde__m256i vnl   = simde_mm256_set1_epi8('\n');
 
+    // Same padded-buffer contract as read_field().
     simde__m256i data  = simde_mm256_loadu_si256((const simde__m256i*)ptr);
     simde__m256i gt32  = simde_mm256_cmpgt_epi8(data, v32);
     simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
@@ -229,33 +213,17 @@ struct cursor_t {
     unsigned int nl_mask        = (unsigned int)simde_mm256_movemask_epi8(is_nl);
     unsigned int stop_mask      = printable_mask | nl_mask;
 
-    if (__unlikely(ws_mask == 0)) {
-      auto f1 = read_field();
-      auto f2 = read_field();
-      return {f1, f2};
-    }
+    if (__unlikely(ws_mask == 0)) { return slow(); }
     int field1_end_off = __builtin_ctz(ws_mask);
 
     unsigned int after_field1 = stop_mask & ~((1u << field1_end_off) - 1);
-    if (__unlikely(after_field1 == 0)) {
-      auto f1 = read_field();
-      auto f2 = read_field();
-      return {f1, f2};
-    }
+    if (__unlikely(after_field1 == 0)) { return slow(); }
     int field2_start_off = __builtin_ctz(after_field1);
 
-    if (__unlikely(ptr[field2_start_off] == '\n')) {
-      auto f1 = read_field();
-      auto f2 = read_field();
-      return {f1, f2};
-    }
+    if (__unlikely(ptr[field2_start_off] == '\n')) { return slow(); }
 
     unsigned int ws_after_field2_start = ws_mask & ~((1u << field2_start_off) - 1);
-    if (__unlikely(ws_after_field2_start == 0)) {
-      auto f1 = read_field();
-      auto f2 = read_field();
-      return {f1, f2};
-    }
+    if (__unlikely(ws_after_field2_start == 0)) { return slow(); }
     int field2_end_off = __builtin_ctz(ws_after_field2_start);
 
     unsigned int after_field2 = stop_mask & ~((1u << field2_end_off) - 1);
@@ -274,7 +242,9 @@ struct cursor_t {
 static inline void expect(cursor_t& cursor, const char* field)
 {
   auto id = cursor.read_field();
-  if (__unlikely(id != field)) { cursor.error("expected '%s', got '%s'", field, id.data()); }
+  if (__unlikely(id != field)) {
+    cursor.error("expected '%s', got '%.*s'", field, (int)id.size(), id.data());
+  }
 }
 
 static inline void accept_comment_line(cursor_t& cursor)
@@ -290,7 +260,10 @@ static inline void accept_comment_line(cursor_t& cursor)
 
 static inline void expect_eol(cursor_t& cursor)
 {
-  if (__unlikely(!cursor.eol())) { cursor.error("expected end of line, got '%s'", cursor.ptr); }
+  if (__unlikely(!cursor.eol())) {
+    auto got = cursor.peek_field();
+    cursor.error("expected end of line, got '%.*s'", (int)got.size(), got.data());
+  }
 
   for (;;) {
     while (cursor.eol()) {
@@ -308,7 +281,8 @@ static inline void expect_eol(cursor_t& cursor)
     }
 
     if (__unlikely(cursor.done())) { return; }
-    if (__unlikely(!std::isalpha(static_cast<unsigned char>(cursor.ptr[0])))) {
+    char c = cursor.ptr[0];
+    if (__unlikely(!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))) {
       cursor.skip_ws();
       if (cursor.eol()) { continue; }
     }
@@ -336,19 +310,22 @@ static inline void expect_section(cursor_t& cursor, const char* section)
 static inline double expect_number(cursor_t& cursor)
 {
   auto num = cursor.read_field();
-  if (num.empty()) { cursor.error("expected number, got '%s'", num.data()); }
-  return fast_atof(num.data(), num.data() + num.size());
+  if (num.empty()) { cursor.error("expected number, got empty field"); }
+  const char* p = num.data();
+  return fp64::parse_fp64_advance(p, p + num.size());
 }
 
 static inline double expect_number_fast_pm_one(cursor_t& cursor)
 {
   const char* p = cursor.ptr;
-  if (p[0] == '-' && p[1] == '1' && p[2] <= ' ') {
+  // Kept bounded despite the global padding invariant: this path is also used
+  // on section-local cursors whose logical end may precede the physical buffer.
+  if (cursor.end - p >= 3 && p[0] == '-' && p[1] == '1' && p[2] <= ' ') {
     cursor.ptr = p + 2;
     cursor.skip_ws();
     return -1.0;
   }
-  if (p[0] == '1' && p[1] <= ' ') {
+  if (cursor.end - p >= 2 && p[0] == '1' && p[1] <= ' ') {
     cursor.ptr = p + 1;
     cursor.skip_ws();
     return 1.0;
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index 73f50c5341..de1b3ea84c 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -1,8 +1,6 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
-#define MPS_FAST_TIMERS
-
 #include "fast_parser.hpp"
 #include "fast_parse_primitives.hpp"
 #include "file_reader.hpp"
@@ -77,6 +75,48 @@ static int phase_thread_count(int phase_cap)
   return std::max(1, std::min(phase_cap, available_threads));
 }
 
+class chunk_name_arena_t {  // Append-only slab arena that owns copies of names.
+ public:
+  void reserve(size_t bytes)  // Raises the size of the next slab; allocates nothing itself.
+  {
+    if (bytes > next_slab_size_) { next_slab_size_ = bytes; }
+  }
+
+  std::string_view copy(std::string_view name)  // Returns a view of an arena-owned copy.
+  {
+    char* dst = allocate(name.size() + 1);  // one extra byte for the NUL written below
+    std::memcpy(dst, name.data(), name.size());
+    dst[name.size()] = '\0';
+    return std::string_view(dst, name.size());  // the NUL is outside the returned view
+  }
+
+ private:
+  struct slab_t {
+    std::unique_ptr<char[]> data;
+    size_t capacity = 0;  // bytes allocated in data
+    size_t used     = 0;  // bytes already handed out
+  };
+
+  char* allocate(size_t bytes)  // Bump-allocates from the newest slab; older slabs are not reused.
+  {
+    if (slabs_.empty() || slabs_.back().used + bytes > slabs_.back().capacity) {
+      size_t capacity = std::max(bytes, next_slab_size_);  // oversized requests get an exact fit
+      slab_t slab;
+      slab.data     = std::make_unique<char[]>(capacity);
+      slab.capacity = capacity;
+      slabs_.push_back(std::move(slab));
+      next_slab_size_ = std::max(next_slab_size_ * 2, capacity);  // later slabs double in size
+    }
+    slab_t& slab = slabs_.back();
+    char* ptr    = slab.data.get() + slab.used;
+    slab.used += bytes;
+    return ptr;
+  }
+
+  std::vector<slab_t> slabs_;  // slabs are only appended, so returned pointers stay valid
+  size_t next_slab_size_ = 64 * 1024;  // minimum capacity of the next slab
+};
+
 static inline size_t row_hash_partition_for(uint32_t hash)
 {
   return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS));
@@ -89,8 +129,70 @@ static inline size_t row_hash_partition_for(uint32_t hash)
 struct TimerEntry {
   const char* name;
   double elapsed_ms;
+  size_t rss_kb;            // VmRSS value sampled when the entry was recorded; 0 if unavailable
+  size_t hwm_kb;            // VmHWM value sampled at the same time; 0 if unavailable
+  size_t compressed_bytes;  // timer compressed-byte counter as read when the entry was recorded
 };
 
+static std::atomic_size_t& get_timer_compressed_bytes()
+{  // Accessor for a function-local static counter that starts at 0.
+  static std::atomic_size_t compressed_bytes{0};
+  return compressed_bytes;
+}
+
+class timer_io_context_t {  // Scope guard for the compressed-byte count reported by timers.
+ public:
+  explicit timer_io_context_t(size_t compressed_bytes)
+    : old_compressed_bytes_(  // exchange() installs the new count and returns the previous one
+        get_timer_compressed_bytes().exchange(compressed_bytes, std::memory_order_acq_rel))
+  {
+  }
+
+  ~timer_io_context_t()
+  {  // Restore the count that was active before this scope.
+    get_timer_compressed_bytes().store(old_compressed_bytes_, std::memory_order_release);
+  }
+
+  timer_io_context_t(const timer_io_context_t&)            = delete;  // one restore per scope
+  timer_io_context_t& operator=(const timer_io_context_t&) = delete;
+
+ private:
+  size_t old_compressed_bytes_ = 0;  // count to put back in the destructor
+};
+
+static size_t parse_status_kb_line(const char* line, const char* key)
+{  // Returns the decimal number that follows key at the start of line; 0 if key does not match.
+  size_t key_len = std::strlen(key);
+  if (std::strncmp(line, key, key_len) != 0) { return 0; }
+  const char* p = line + key_len;
+  while (*p == ' ' || *p == '\t') {  // skip the padding between the key and the number
+    ++p;
+  }
+  size_t value = 0;
+  while (*p >= '0' && *p <= '9') {  // accumulate digits; stops at the first non-digit
+    value = value * 10 + (size_t)(*p - '0');
+    ++p;
+  }
+  return value;  // also 0 when no digits follow the key
+}
+
+static std::pair<size_t, size_t> current_process_rss_kb()
+{  // Returns {VmRSS, VmHWM} as read from /proc/self/status; {0, 0} if the file cannot be opened.
+  FILE* file = std::fopen("/proc/self/status", "r");
+  if (file == nullptr) { return {0, 0}; }
+
+  size_t rss_kb = 0;
+  size_t hwm_kb = 0;
+  char line[256];
+  while (std::fgets(line, sizeof(line), file) != nullptr) {
+    if (rss_kb == 0) { rss_kb = parse_status_kb_line(line, "VmRSS:"); }
+    if (hwm_kb == 0) { hwm_kb = parse_status_kb_line(line, "VmHWM:"); }
+    if (rss_kb != 0 && hwm_kb != 0) { break; }  // both found; skip the rest of the file
+  }
+  std::fclose(file);
+  return {rss_kb, hwm_kb};  // a field that was never found stays 0
+}
+
 static std::vector<TimerEntry>& get_timer_buffer()
 {
   static std::vector<TimerEntry> buffer;
@@ -110,7 +212,13 @@ static void flush_timers()
   std::lock_guard<std::mutex> lock(get_timer_mutex());
   auto& buffer = get_timer_buffer();
   for (const auto& entry : buffer) {
-    std::fprintf(stderr, "[TIMER] %s: %.3f ms\n", entry.name, entry.elapsed_ms);
+    std::fprintf(stderr,
+                 "[TIMER] %s: %.3f ms rss_GB=%.3f hwm_GB=%.3f compressed_GB=%.3f\n",
+                 entry.name,
+                 entry.elapsed_ms,
+                 (double)entry.rss_kb / (1024.0 * 1024.0),
+                 (double)entry.hwm_kb / (1024.0 * 1024.0),
+                 (double)entry.compressed_bytes / (1024.0 * 1024.0 * 1024.0));
   }
   buffer.clear();
 #endif
@@ -189,8 +297,10 @@ class scoped_timer_t {
     double elapsed_ms = std::chrono::duration<double, std::milli>(end - start_).count();
     nvtx_.end();
     if (accumulator_) { *accumulator_ += elapsed_ms; }
+    auto [rss_kb, hwm_kb]   = current_process_rss_kb();
+    size_t compressed_bytes = get_timer_compressed_bytes().load(std::memory_order_acquire);
     std::lock_guard<std::mutex> lock(get_timer_mutex());
-    get_timer_buffer().push_back({name_, elapsed_ms});
+    get_timer_buffer().push_back({name_, elapsed_ms, rss_kb, hwm_kb, compressed_bytes});
 #endif
   }
 
@@ -221,11 +331,6 @@ static inline void error_unknown_row(cursor_t& cursor, const char* row_start, co
 // Parsing state shared across section parsers
 // =============================================================================
 
-// Hash and equality for string_view keys in unordered_map
-struct string_view_hash {
-  size_t operator()(std::string_view sv) const { return std::hash<std::string_view>{}(sv); }
-};
-
 static inline size_t next_power_of_2(size_t n)
 {
   if (n == 0) return 1;
@@ -309,12 +414,14 @@ struct parse_state_t {
   // Temporary string_view storage (points into input buffer, no allocation)
   std::vector<std::string_view> row_names_sv;
   std::vector<std::string_view> var_names_sv;
+  std::vector<chunk_name_arena_t> var_name_arenas;
   std::string_view problem_name_sv;
   std::string_view objective_name_sv;
   std::vector<std::string_view> ignored_objective_names_sv;
 
   // Optional dense ordered column index for labels like V0, V1, ...
   bool col_dense_ordered = false;
+  std::string col_dense_prefix_storage;
   std::string_view col_dense_prefix;
   uint64_t col_dense_min_id  = 0;
   uint64_t col_dense_max_id  = 0;
@@ -329,7 +436,7 @@ struct parse_state_t {
   size_t row_hash_partition_count                                               = 0;
   std::array<row_hash_partition_t, MPS_ROW_HASH_PARTITIONS> row_hash_partitions = {};
   // Overflow map for row names longer than HASH_KEY_BYTES
-  std::unordered_map<std::string_view, size_t, string_view_hash> row_names_long;
+  std::unordered_map<std::string_view, size_t> row_names_long;
 
   // Optional dense ordered row index for labels like R0001, R0002, ...
   row_index_mode_t row_index_mode = row_index_mode_t::hash;
@@ -342,7 +449,7 @@ struct parse_state_t {
   bool row_dense_zero_padded = false;
 
   // var_names still uses STL (only used in parse_bounds, not as hot)
-  std::unordered_map<std::string_view, size_t, string_view_hash> var_names_map;
+  std::unordered_map<std::string_view, size_t> var_names_map;
 
   struct bounds_only_var_t {
     f_t lb    = f_t{0};
@@ -524,7 +631,7 @@ struct parse_state_t {
       // Use mmap for allocation - the OS provides zero'd pages
       row_hash_region = mmap_region_t::anonymous(
         row_hash_mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, "row hash table");
-      row_names_ht = static_cast<hash_slot_var_t*>(row_hash_region.data());
+      row_names_ht = (hash_slot_var_t*)row_hash_region.data();
       if (use_partitioned) {
         hash_slot_var_t* next_slots = row_names_ht;
         for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
@@ -1226,6 +1333,7 @@ struct ChunkResult {
   std::vector<uint32_t> row_indices;
   std::vector<size_t> col_offsets;
   std::vector<std::string_view> var_names;
+  chunk_name_arena_t var_name_arena;
   std::vector<MarkerInfo> markers;
   std::vector<std::pair<size_t, double>> objective_entries;  // local_col_idx -> coefficient
   // Sparse per-row scratch: each touched 4096-row block stores counts after parsing,
@@ -1475,6 +1583,7 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
   result.row_indices.reserve(estimated_nnz);
   result.col_offsets.reserve(estimated_cols + 1);
   result.var_names.reserve(estimated_cols);
+  result.var_name_arena.reserve(std::max<size_t>(4096, estimated_cols * 16));
   result.objective_entries.reserve(estimated_cols);
   size_t n_row_blocks = ((size_t)state.problem.n_constraints_ + COLUMN_ROW_COUNT_BLOCK_ROWS - 1) /
                         COLUMN_ROW_COUNT_BLOCK_ROWS;
@@ -1532,25 +1641,25 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
       sign = -1.0;
       cursor.advance(1);
     }
-    if (cursor.ptr + 1 < cursor.end && is_digit_byte(cursor.ptr[0]) &&
+    if (cursor.ptr + 1 < cursor.end && fp64::is_digit(cursor.ptr[0]) &&
         (cursor.ptr[1] == '\n' || cursor.ptr[1] == '\r')) {
       value = sign * (cursor.ptr[0] - '0');
       cursor.advance(1);
     } else {
-      value = sign * fast_atof_advance(cursor.ptr, cursor.end);
+      value = sign * fp64::parse_fp64_advance(cursor.ptr, cursor.end);
     }
     // usually EOL directly follows
     if (__unlikely(!cursor.eol())) { cursor.skip_ws(); }
     accept_comment(cursor);
 
-    if (result.first_var_name.empty()) { result.first_var_name = var_name; }
-    result.last_var_name = var_name;
-
     if (prev_var_name != var_name) {
-      result.var_names.push_back(var_name);
-      observe_dense_col_name(result.dense_col_stats, var_name);
+      std::string_view owned_var_name = result.var_name_arena.copy(var_name);
+      result.var_names.push_back(owned_var_name);
+      observe_dense_col_name(result.dense_col_stats, owned_var_name);
       result.col_offsets.push_back(result.values.size());
-      prev_var_name = var_name;
+      prev_var_name = owned_var_name;
+      if (result.first_var_name.empty()) { result.first_var_name = owned_var_name; }
+      result.last_var_name = owned_var_name;
     }
 
     auto add_entry = [&](std::string_view rn, double val) {
@@ -1579,7 +1688,7 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
         expect_eol(cursor);
         continue;
       }
-      double value2 = fast_atof_advance(cursor.ptr, cursor.end);
+      double value2 = fp64::parse_fp64_advance(cursor.ptr, cursor.end);
       cursor.skip_ws();
       accept_comment(cursor);
 
@@ -1595,114 +1704,129 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
 }
 
 // Fused merge + CSR construction: directly builds CSR from chunks without intermediate global CSC
-template <typename i_t, typename f_t>
-static void merge_chunk_results_to_csr(parse_state_t<i_t, f_t>& state,
-                                       std::vector<ChunkResult>& chunks,
-                                       int num_threads)
-{
-  scoped_timer_t timer("merge_chunks_to_csr");
-
-  int num_chunks = (int)chunks.size();
-  if (num_chunks == 0) return;
-
-  i_t n_rows = state.problem.n_constraints_;
+template <typename i_t>
+struct column_merge_shape_t {  // sizes and per-chunk column offsets shared by the merge stages
+  int num_chunks = 0;  // number of ChunkResult entries being merged
+  i_t n_rows     = 0;  // row count passed in by the caller
+  std::vector<size_t> global_col_offset;  // prefix sums of per-chunk var_names.size()
+  size_t total_cols = 0;  // last prefix sum, i.e. columns over all chunks
+  size_t total_nnz  = 0;  // sum of per-chunk values.size()
+};
 
-  std::vector<size_t> global_col_offset(num_chunks + 1);
-  global_col_offset[0] = 0;
-  size_t total_nnz     = 0;
+template <typename i_t>  // Computes per-chunk column offsets and totals; fails if they overflow i_t.
+static column_merge_shape_t<i_t> compute_column_merge_shape(const std::vector<ChunkResult>& chunks,
+                                                            i_t n_rows)
+{
+  column_merge_shape_t<i_t> shape;
+  shape.num_chunks = (int)chunks.size();
+  shape.n_rows     = n_rows;
+  shape.global_col_offset.resize((size_t)shape.num_chunks + 1);  // value-initialized: entry 0 is 0
   {
     scoped_timer_t timer("columns_global_offsets");
-    for (int t = 0; t < num_chunks; t++) {
-      global_col_offset[t + 1] = global_col_offset[t] + chunks[t].var_names.size();
-      total_nnz += chunks[t].values.size();
+    for (int t = 0; t < shape.num_chunks; t++) {
+      shape.global_col_offset[(size_t)t + 1] =
+        shape.global_col_offset[(size_t)t] + chunks[(size_t)t].var_names.size();
+      shape.total_nnz += chunks[(size_t)t].values.size();
     }
   }
-  size_t total_cols = global_col_offset[num_chunks];
+  shape.total_cols = shape.global_col_offset[(size_t)shape.num_chunks];  // last prefix sum
   if constexpr (std::numeric_limits<i_t>::max() < std::numeric_limits<int64_t>::max()) {
     const size_t index_max = (size_t)std::numeric_limits<i_t>::max();
-    if (total_nnz > index_max) {
+    if (shape.total_nnz > index_max) {  // too many nonzeros for i_t
       mps_parser_fail(error_type_t::RuntimeError,
                       "fast MPS parser requires 64-bit indices: nnz=%zu exceeds index max=%zu",
-                      total_nnz,
+                      shape.total_nnz,
                       index_max);
     }
-    if (total_cols > index_max || (size_t)n_rows > index_max) {
+    if (shape.total_cols > index_max || (size_t)n_rows > index_max) {  // dimensions must fit too
       mps_parser_fail(error_type_t::RuntimeError,
                       "fast MPS parser requires 64-bit indices: rows=%zu cols=%zu exceed index "
                       "max=%zu",
                       (size_t)n_rows,
-                      total_cols,
+                      shape.total_cols,
                       index_max);
     }
   }
-  {
-    scoped_timer_t timer("columns_dense_metadata");
-    bool dense_ok   = total_cols > 0;
-    bool have_first = false;
-    std::string_view dense_prefix;
-    uint64_t expected_next_id = 0;
-    uint64_t dense_min_id     = 0;
-    uint64_t dense_max_id     = 0;
-    size_t dense_pad_width    = 0;
-    bool dense_zero_padded    = false;
-
-    for (int t = 0; t < num_chunks && dense_ok; ++t) {
-      const auto& stats = chunks[t].dense_col_stats;
-      if (stats.count == 0) { continue; }
-      if (!stats.candidate || stats.count != chunks[t].var_names.size()) {
-        dense_ok = false;
-        break;
-      }
-      if (!have_first) {
-        have_first        = true;
-        dense_prefix      = stats.prefix;
-        expected_next_id  = stats.first_id;
-        dense_min_id      = stats.first_id;
-        dense_pad_width   = stats.pad_width;
-        dense_zero_padded = stats.zero_padded;
-      }
-      if (stats.prefix != dense_prefix || stats.first_id != expected_next_id ||
-          !dense_col_chunk_padding_compatible(stats, dense_zero_padded, dense_pad_width)) {
-        dense_ok = false;
-        break;
-      }
-      if (stats.last_id < stats.first_id || stats.last_id - stats.first_id + 1 != stats.count) {
-        dense_ok = false;
-        break;
-      }
-      dense_max_id = stats.last_id;
-      if (stats.last_id == std::numeric_limits<uint64_t>::max()) {
-        expected_next_id = stats.last_id;
-        dense_ok         = false;
-        break;
-      }
-      expected_next_id = stats.last_id + 1;
-    }
+  return shape;
+}
 
-    if (!have_first || dense_max_id < dense_min_id ||
-        dense_max_id - dense_min_id + 1 != total_cols) {
+template <typename i_t, typename f_t>  // Sets state.col_dense_* from the per-chunk dense stats.
+static void detect_dense_column_metadata(parse_state_t<i_t, f_t>& state,
+                                         const std::vector<ChunkResult>& chunks,
+                                         const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("columns_dense_metadata");
+  bool dense_ok   = shape.total_cols > 0;
+  bool have_first = false;
+  std::string_view dense_prefix;
+  uint64_t expected_next_id = 0;
+  uint64_t dense_min_id     = 0;
+  uint64_t dense_max_id     = 0;
+  size_t dense_pad_width    = 0;
+  bool dense_zero_padded    = false;
+  // Chunks are visited in order; each non-empty chunk must continue the prefix and id sequence.
+  for (int t = 0; t < shape.num_chunks && dense_ok; ++t) {
+    const auto& stats = chunks[(size_t)t].dense_col_stats;
+    if (stats.count == 0) { continue; }
+    if (!stats.candidate || stats.count != chunks[(size_t)t].var_names.size()) {
+      dense_ok = false;
+      break;
+    }
+    if (!have_first) {
+      have_first        = true;
+      dense_prefix      = stats.prefix;
+      expected_next_id  = stats.first_id;
+      dense_min_id      = stats.first_id;
+      dense_pad_width   = stats.pad_width;
+      dense_zero_padded = stats.zero_padded;
+    }
+    if (stats.prefix != dense_prefix || stats.first_id != expected_next_id ||
+        !dense_col_chunk_padding_compatible(stats, dense_zero_padded, dense_pad_width)) {
       dense_ok = false;
+      break;
     }
-
-    state.col_dense_ordered = dense_ok;
-    if (dense_ok) {
-      state.col_dense_prefix      = dense_prefix;
-      state.col_dense_min_id      = dense_min_id;
-      state.col_dense_max_id      = dense_max_id;
-      state.col_dense_pad_width   = dense_pad_width;
-      state.col_dense_zero_padded = dense_zero_padded;
+    if (stats.last_id < stats.first_id || stats.last_id - stats.first_id + 1 != stats.count) {
+      dense_ok = false;
+      break;
     }
+    dense_max_id = stats.last_id;
+    if (stats.last_id == std::numeric_limits<uint64_t>::max()) {  // last_id + 1 below would wrap
+      dense_ok = false;
+      break;
+    }
+    expected_next_id = stats.last_id + 1;
+  }
+  // The accepted id range must contain exactly total_cols ids.
+  if (!have_first || dense_max_id < dense_min_id ||
+      dense_max_id - dense_min_id + 1 != shape.total_cols) {
+    dense_ok = false;
   }
 
-  // Step 2: Sum row counts (already computed during parsing) and build CSR row_offsets
-  std::vector<i_t> global_row_counts((size_t)n_rows, 0);
+  state.col_dense_ordered = dense_ok;
+  if (dense_ok) {
+    state.col_dense_prefix_storage.assign(dense_prefix);  // copy the prefix bytes into state
+    state.col_dense_prefix      = state.col_dense_prefix_storage;
+    state.col_dense_min_id      = dense_min_id;
+    state.col_dense_max_id      = dense_max_id;
+    state.col_dense_pad_width   = dense_pad_width;
+    state.col_dense_zero_padded = dense_zero_padded;
+  }
+}
+
+template <typename i_t, typename f_t>
+static std::vector<i_t> build_csr_row_offsets(parse_state_t<i_t, f_t>& state,
+                                              const std::vector<ChunkResult>& chunks,
+                                              const column_merge_shape_t<i_t>& shape)
+{
+  std::vector<i_t> global_row_counts((size_t)shape.n_rows, 0);
   {
     scoped_timer_t timer("columns_sum_row_counts");
-    for (int t = 0; t < num_chunks; t++) {
-      for (const auto& block : chunks[t].row_count_blocks) {
-        const int64_t* block_counts = chunks[t].row_count_storage.data() + block.storage_offset;
-        size_t row_base             = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
-        size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)n_rows - row_base);
+    for (int t = 0; t < shape.num_chunks; t++) {
+      for (const auto& block : chunks[(size_t)t].row_count_blocks) {
+        const int64_t* block_counts =
+          chunks[(size_t)t].row_count_storage.data() + block.storage_offset;
+        size_t row_base    = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+        size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)shape.n_rows - row_base);
         for (size_t local = 0; local < block_limit; ++local) {
           global_row_counts[row_base + local] += (i_t)block_counts[local];
         }
@@ -1711,196 +1835,223 @@ static void merge_chunk_results_to_csr(parse_state_t<i_t, f_t>& state,
   }
   {
     scoped_timer_t timer("columns_build_row_offsets");
-    state.problem.A_offsets_.resize((size_t)n_rows + 1);
+    state.problem.A_offsets_.resize((size_t)shape.n_rows + 1);
     state.problem.A_offsets_[0] = 0;
-    for (i_t r = 0; r < n_rows; r++) {
+    for (i_t r = 0; r < shape.n_rows; r++) {
       state.problem.A_offsets_[(size_t)r + 1] =
         state.problem.A_offsets_[(size_t)r] + global_row_counts[(size_t)r];
     }
   }
+  return global_row_counts;
+}
 
-  {
-    scoped_timer_t timer("columns_counts_to_write_positions");
-    std::fill(global_row_counts.begin(), global_row_counts.end(), i_t{0});
-    for (int t = 0; t < num_chunks; t++) {
-      for (auto& block : chunks[t].row_count_blocks) {
-        int64_t* block_counts = chunks[t].row_count_storage.data() + block.storage_offset;
-        size_t row_base       = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
-        size_t block_limit    = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)n_rows - row_base);
-        for (size_t local = 0; local < block_limit; ++local) {
-          int64_t count = block_counts[local];
-          if (count == 0) continue;
-          size_t row          = row_base + local;
-          i_t pos             = state.problem.A_offsets_[row] + global_row_counts[row];
-          block_counts[local] = (int64_t)pos;
-          global_row_counts[row] += (i_t)count;
-        }
+template <typename i_t>  // Rewrites each chunk's per-row counts into starting write positions.
+static void convert_counts_to_write_positions(std::vector<ChunkResult>& chunks,
+                                              const column_merge_shape_t<i_t>& shape,
+                                              const std::vector<i_t>& row_offsets,
+                                              std::vector<i_t>& global_row_counts)
+{
+  scoped_timer_t timer("columns_counts_to_write_positions");
+  std::fill(global_row_counts.begin(), global_row_counts.end(), i_t{0});  // reused as running totals
+  for (int t = 0; t < shape.num_chunks; t++) {
+    for (auto& block : chunks[(size_t)t].row_count_blocks) {
+      int64_t* block_counts = chunks[(size_t)t].row_count_storage.data() + block.storage_offset;
+      size_t row_base       = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+      size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)shape.n_rows - row_base);
+      for (size_t local = 0; local < block_limit; ++local) {
+        int64_t count = block_counts[local];
+        if (count == 0) continue;  // zero slots are left untouched
+        size_t row          = row_base + local;
+        i_t pos             = row_offsets[row] + global_row_counts[row];
+        block_counts[local] = (int64_t)pos;  // slot now holds this chunk's first write index
+        global_row_counts[row] += (i_t)count;  // the next chunk starts after these entries
       }
     }
   }
+}
 
-  {
-    scoped_timer_t timer("columns_row_count_storage_hugepages");
+static void materialize_chunk_row_count_storage(std::vector<ChunkResult>& chunks, int num_threads)
+{
+  scoped_timer_t timer("columns_row_count_storage_hugepages");
 #pragma omp parallel for num_threads(num_threads)
-    for (int t = 0; t < num_chunks; ++t) {
-      materialize_vector_hugepages(
-        "column_row_count_storage", chunks[t].row_count_storage, materialize_touch_t::write_2mb);
-    }
+  for (int t = 0; t < (int)chunks.size(); ++t) {  // one call per chunk's row_count_storage
+    materialize_vector_hugepages("column_row_count_storage",
+                                 chunks[(size_t)t].row_count_storage,
+                                 materialize_touch_t::write_2mb);
   }
+}
 
-  // Step 6: Allocate CSR arrays
-  {
-    scoped_timer_t timer("allocate_csr_arrays");
-
-    // May be unexpectedly slow, even if already reserved() to good fit.
-    // I assume the cause is probably that the pages aren't actually backed when reserve() is called
-    // and the actual physical allocation only happens now
-
-    // evil tweak until we can refactior problem_t
-    // run the zero-init resize() calls in parallel
+template <typename i_t, typename f_t>  // Sizes the CSR, name and type arrays from shape.
+static void allocate_column_outputs(parse_state_t<i_t, f_t>& state,
+                                    const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("allocate_csr_arrays");
 
+  // problem_t uses std::vector, so resize() zero-initializes; parallel sections hide part of the
+  // page-fault cost. NOTE(review): resize() can throw, and nothing here catches it in-section.
 #pragma omp parallel sections num_threads(4)
-    {
+  {
 #pragma omp section
-      {
-        state.problem.A_.resize(total_nnz);
-      }
+    {
+      state.problem.A_.resize(shape.total_nnz);
+    }
 #pragma omp section
-      {
-        state.problem.A_indices_.resize(total_nnz);
-      }
+    {
+      state.problem.A_indices_.resize(shape.total_nnz);
+    }
 #pragma omp section
-      {
-        if (!state.col_dense_ordered) { state.var_names_sv.resize(total_cols); }
+    {
+      if (!state.col_dense_ordered) {  // name storage is only sized when not dense-ordered
+        state.var_name_arenas.clear();  // discard existing arenas before sizing one per chunk
+        state.var_name_arenas.resize((size_t)shape.num_chunks);
+        state.var_names_sv.resize(shape.total_cols);
       }
+    }
 #pragma omp section
-      {
-        state.problem.var_types_.resize(total_cols);
-      }
+    {
+      state.problem.var_types_.resize(shape.total_cols);
     }
   }
+}
 
-  // Step 6: Parallel scatter into CSR + copy var_names
+template <typename i_t, typename f_t>  // Writes chunk entries into A_/A_indices_ and copies names.
+static void scatter_column_chunks_to_csr(parse_state_t<i_t, f_t>& state,
+                                         std::vector<ChunkResult>& chunks,
+                                         const column_merge_shape_t<i_t>& shape,
+                                         int num_threads)
+{
+  scoped_timer_t timer("scatter_into_csr");
   {
-    scoped_timer_t timer("scatter_into_csr");
-    {
-      scoped_timer_t matrix_timer("scatter_matrix_entries");
+    scoped_timer_t matrix_timer("scatter_matrix_entries");
 #ifdef MPS_FAST_PERF_COUNTERS
-      std::vector<perf_counter_snapshot_t> perf_snapshots((size_t)num_chunks);
+    std::vector<perf_counter_snapshot_t> perf_snapshots((size_t)shape.num_chunks);
 #endif
 #pragma omp parallel for num_threads(num_threads)
-      for (int t = 0; t < num_chunks; t++) {
+    for (int t = 0; t < shape.num_chunks; t++) {
 #ifdef MPS_FAST_PERF_COUNTERS
-        thread_perf_counters_t perf_counters;
+      thread_perf_counters_t perf_counters;
 #endif
-        auto& chunk = chunks[t];
-
-        for (size_t local_col = 0; local_col < chunks[t].var_names.size(); local_col++) {
-          i_t global_col = (i_t)(global_col_offset[t] + local_col);
-
-          size_t col_start = chunks[t].col_offsets[local_col];
-          size_t col_end   = chunks[t].col_offsets[local_col + 1];
-          for (size_t idx = col_start; idx < col_end; idx++) {
-            i_t row                        = (i_t)chunks[t].row_indices[idx];
-            size_t row_idx                 = (size_t)row;
-            size_t block_id                = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
-            size_t local                   = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
-            int32_t block_pos              = chunk.row_count_block_dir[block_id];
-            RowCountBlock& block           = chunk.row_count_blocks[(size_t)block_pos];
-            int64_t& write_pos             = chunk.row_count_storage[block.storage_offset + local];
-            i_t dest                       = (i_t)write_pos++;
-            state.problem.A_[dest]         = (f_t)chunks[t].values[idx];
-            state.problem.A_indices_[dest] = global_col;
-          }
+      auto& chunk = chunks[(size_t)t];
+      for (size_t local_col = 0; local_col < chunk.var_names.size(); local_col++) {
+        i_t global_col   = (i_t)(shape.global_col_offset[(size_t)t] + local_col);
+        size_t col_start = chunk.col_offsets[local_col];
+        size_t col_end   = chunk.col_offsets[local_col + 1];
+        for (size_t idx = col_start; idx < col_end; idx++) {
+          i_t row                        = (i_t)chunk.row_indices[idx];
+          size_t row_idx                 = (size_t)row;
+          size_t block_id                = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
+          size_t local                   = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+          int32_t block_pos              = chunk.row_count_block_dir[block_id];
+          RowCountBlock& block           = chunk.row_count_blocks[(size_t)block_pos];
+          int64_t& write_pos             = chunk.row_count_storage[block.storage_offset + local];
+          i_t dest                       = (i_t)write_pos++;  // take the slot, then advance it
+          state.problem.A_[dest]         = (f_t)chunk.values[idx];
+          state.problem.A_indices_[dest] = global_col;
         }
-#ifdef MPS_FAST_PERF_COUNTERS
-        perf_snapshots[(size_t)t] = perf_counters.stop();
-#endif
       }
 #ifdef MPS_FAST_PERF_COUNTERS
-      print_perf_totals("scatter_matrix_entries", perf_snapshots);
+      perf_snapshots[(size_t)t] = perf_counters.stop();
 #endif
     }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("scatter_matrix_entries", perf_snapshots);
+#endif
+  }
 
-    if (!state.col_dense_ordered) {
-      {
-        scoped_timer_t names_timer("scatter_var_names");
+  if (!state.col_dense_ordered) {
+    scoped_timer_t names_timer("scatter_var_names");
 #pragma omp parallel for num_threads(num_threads)
-        for (int t = 0; t < num_chunks; t++) {
-          for (size_t i = 0; i < chunks[t].var_names.size(); i++) {
-            state.var_names_sv[global_col_offset[t] + i] = chunks[t].var_names[i];
-          }
-        }
+    for (int t = 0; t < shape.num_chunks; t++) {
+      chunk_name_arena_t& arena = state.var_name_arenas[(size_t)t];  // state-side arena for chunk t
+      arena.reserve(std::max<size_t>(4096, chunks[(size_t)t].var_names.size() * 16));
+      for (size_t i = 0; i < chunks[(size_t)t].var_names.size(); i++) {
+        state.var_names_sv[shape.global_col_offset[(size_t)t] + i] =
+          arena.copy(chunks[(size_t)t].var_names[i]);
       }
-    } else {
-      scoped_timer_t names_timer("scatter_var_names");
     }
+  } else {
+    scoped_timer_t names_timer("scatter_var_names");  // nothing is copied; only the timer is created
   }
+}
 
-  // Step 7: Apply integer markers
-  struct GlobalMarker {
-    MarkerInfo::Type type;
-    size_t global_var_idx;
-  };
-  {
-    scoped_timer_t timer("columns_apply_markers");
-    std::vector<GlobalMarker> all_markers;
-
-    for (int t = 0; t < num_chunks; t++) {
-      for (const auto& m : chunks[t].markers) {
-        GlobalMarker gm;
-        gm.type = m.type;
-
-        if (m.after_local_var_idx == SIZE_MAX) {
-          // Marker before any variable in this chunk
-          gm.global_var_idx = (global_col_offset[t] > 0) ? global_col_offset[t] - 1 : SIZE_MAX;
-        } else {
-          gm.global_var_idx = global_col_offset[t] + m.after_local_var_idx;
-        }
-        all_markers.push_back(gm);
-      }
-    }
-
-    std::sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) {
-      // SIZE_MAX means "before all variables" - should sort first
-      if (a.global_var_idx == SIZE_MAX && b.global_var_idx != SIZE_MAX) return true;
-      if (b.global_var_idx == SIZE_MAX && a.global_var_idx != SIZE_MAX) return false;
-      return a.global_var_idx < b.global_var_idx;
-    });
+struct global_marker_t {  // a chunk-local marker translated to global variable numbering
+  MarkerInfo::Type type;  // INTORG switches integer mode on; any other type switches it off
+  size_t global_var_idx;  // variable the marker follows; SIZE_MAX means before every variable
+};
 
-    bool is_integer   = false;
-    size_t marker_idx = 0;
+template <typename i_t, typename f_t>  // Fills var_types_ with 'I' or 'C' from the chunk markers.
+static void apply_column_integer_markers(parse_state_t<i_t, f_t>& state,
+                                         const std::vector<ChunkResult>& chunks,
+                                         const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("columns_apply_markers");
+  std::vector<global_marker_t> all_markers;
+  for (int t = 0; t < shape.num_chunks; t++) {
+    for (const auto& m : chunks[(size_t)t].markers) {
+      global_marker_t gm;  // after_local_var_idx == SIZE_MAX: marker precedes the chunk's variables
+      gm.type = m.type;
+      gm.global_var_idx =
+        m.after_local_var_idx == SIZE_MAX
+          ? (shape.global_col_offset[(size_t)t] > 0 ? shape.global_col_offset[(size_t)t] - 1
+                                                    : SIZE_MAX)
+          : shape.global_col_offset[(size_t)t] + m.after_local_var_idx;
+      all_markers.push_back(gm);
+    }
+  }
+  // Order markers by position; SIZE_MAX ("before all variables") sorts first.
+  std::sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) {
+    if (a.global_var_idx == SIZE_MAX && b.global_var_idx != SIZE_MAX) return true;
+    if (b.global_var_idx == SIZE_MAX && a.global_var_idx != SIZE_MAX) return false;
+    return a.global_var_idx < b.global_var_idx;
+  });
 
-    bool is_integer   = false;
-    size_t marker_idx = 0;
+  bool is_integer   = false;
+  size_t marker_idx = 0;
 
-    for (size_t v = 0; v < total_cols; v++) {
-      while (marker_idx < all_markers.size() &&
-             (all_markers[marker_idx].global_var_idx == SIZE_MAX ||
-              all_markers[marker_idx].global_var_idx < v)) {
-        if (all_markers[marker_idx].type == MarkerInfo::INTORG) {
-          is_integer = true;
-        } else {
-          is_integer = false;
-        }
-        marker_idx++;
-      }
-      state.problem.var_types_[v] = is_integer ? 'I' : 'C';
+  bool is_integer   = false;
+  size_t marker_idx = 0;
+  for (size_t v = 0; v < shape.total_cols; v++) {
+    while (marker_idx < all_markers.size() && (all_markers[marker_idx].global_var_idx == SIZE_MAX ||
+                                               all_markers[marker_idx].global_var_idx < v)) {
+      is_integer = all_markers[marker_idx].type == MarkerInfo::INTORG;  // last marker before v wins
+      marker_idx++;
     }
+    state.problem.var_types_[v] = is_integer ? 'I' : 'C';
   }
+}
 
-  // Step 8: Handle objective entries
-  {
-    scoped_timer_t timer("columns_objective_entries");
-    state.problem.c_.resize(total_cols, f_t{0});
-    for (int t = 0; t < num_chunks; t++) {
-      for (const auto& [local_col, coeff] : chunks[t].objective_entries) {
-        size_t global_col = global_col_offset[t] + local_col;
-        if (global_col < total_cols) { state.problem.c_[global_col] = (f_t)coeff; }
-      }
+template <typename i_t, typename f_t>  // Copies per-chunk objective coefficients into problem.c_.
+static void assign_column_objective_entries(parse_state_t<i_t, f_t>& state,
+                                            const std::vector<ChunkResult>& chunks,
+                                            const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("columns_objective_entries");
+  state.problem.c_.resize(shape.total_cols, f_t{0});  // elements added by the resize are 0
+  for (int t = 0; t < shape.num_chunks; t++) {
+    for (const auto& [local_col, coeff] : chunks[(size_t)t].objective_entries) {
+      size_t global_col = shape.global_col_offset[(size_t)t] + local_col;  // chunk-local -> global
+      if (global_col < shape.total_cols) { state.problem.c_[global_col] = (f_t)coeff; }
     }
   }
+}
 
-  // Store final dimensions; CSR and objective coefficients are already complete.
-  state.problem.n_vars_ = (i_t)total_cols;
-  state.problem.nnz_    = (i_t)total_nnz;
+template <typename i_t, typename f_t>
+static void merge_chunk_results_to_csr(parse_state_t<i_t, f_t>& state,
+                                       std::vector<ChunkResult>& chunks,
+                                       int num_threads)
+{
+  scoped_timer_t timer("merge_chunks_to_csr");
+  if (chunks.empty()) return;  // nothing to merge; state is not modified
+  // Each stage below consumes what the earlier ones produced, so the call order matters.
+  auto shape = compute_column_merge_shape<i_t>(chunks, state.problem.n_constraints_);
+  detect_dense_column_metadata(state, chunks, shape);
+  auto global_row_counts = build_csr_row_offsets(state, chunks, shape);
+  convert_counts_to_write_positions(chunks, shape, state.problem.A_offsets_, global_row_counts);
+  materialize_chunk_row_count_storage(chunks, num_threads);
+  allocate_column_outputs(state, shape);
+  scatter_column_chunks_to_csr(state, chunks, shape, num_threads);
+  apply_column_integer_markers(state, chunks, shape);
+  assign_column_objective_entries(state, chunks, shape);
+  // Store the final dimensions taken from shape.
+  state.problem.n_vars_ = (i_t)shape.total_cols;
+  state.problem.nnz_    = (i_t)shape.total_nnz;
 }
 
 template <typename i_t, typename f_t>
@@ -1931,20 +2082,28 @@ static void parse_columns_section_parallel(parse_state_t<i_t, f_t>& state,
 #ifdef MPS_FAST_PERF_COUNTERS
     std::vector<perf_counter_snapshot_t> perf_snapshots((size_t)num_threads);
 #endif
+    std::exception_ptr first_error = nullptr;
+    std::mutex error_mutex;
     {
 #pragma omp parallel for num_threads(num_threads)
       for (int t = 0; t < num_threads; t++) {
-        MPS_NVTX_RANGE(std::string("columns_chunk ") + std::to_string(t), nvtx::colors::columns);
+        try {
+          MPS_NVTX_RANGE(std::string("columns_chunk ") + std::to_string(t), nvtx::colors::columns);
 #ifdef MPS_FAST_PERF_COUNTERS
-        thread_perf_counters_t perf_counters;
+          thread_perf_counters_t perf_counters;
 #endif
-        results[t] =
-          parse_columns_chunk<i_t, f_t>(chunk_bounds[t].start, chunk_bounds[t].end, state);
+          results[t] =
+            parse_columns_chunk<i_t, f_t>(chunk_bounds[t].start, chunk_bounds[t].end, state);
 #ifdef MPS_FAST_PERF_COUNTERS
-        perf_snapshots[(size_t)t] = perf_counters.stop();
+          perf_snapshots[(size_t)t] = perf_counters.stop();
 #endif
+        } catch (...) {
+          std::lock_guard<std::mutex> lock(error_mutex);
+          if (!first_error) { first_error = std::current_exception(); }
+        }
       }
     }
+    if (first_error) { std::rethrow_exception(first_error); }
 #ifdef MPS_FAST_PERF_COUNTERS
     print_perf_totals("parse_columns_chunk_parallel", perf_snapshots);
 #endif
@@ -2016,6 +2175,74 @@ static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
   }
 }
 
+// Index of var_name in var_names, or SIZE_MAX. Probes hint_idx + 1, then hint_idx, then scans.
+static size_t find_var_after_hint(const std::vector<std::string_view>& var_names,
+                                  std::string_view var_name,
+                                  size_t hint_idx)
+{
+  const size_t count = var_names.size();
+  const auto hit     = [&](size_t pos) { return pos < count && var_names[pos] == var_name; };
+  if (hit(hint_idx + 1)) { return hint_idx + 1; }
+  if (hit(hint_idx)) { return hint_idx; }
+  for (size_t pos = std::min(hint_idx + 2, count); pos < count; ++pos) {  // entries after the hint
+    if (var_names[pos] == var_name) { return pos; }
+  }
+  for (size_t pos = 0, stop = std::min(hint_idx, count); pos < stop; ++pos) {  // entries before it
+    if (var_names[pos] == var_name) { return pos; }
+  }
+  return SIZE_MAX;
+}
+
+// Applies one BOUNDS record through the caller-supplied setters.
+// Returns true on success; for an unknown type, or an SC record without a value, it calls
+// error(message, bound_type) and returns false.
+template <typename f_t, typename SetLb, typename SetUb, typename SetType, typename Error>
+static bool apply_bound_record(std::string_view bound_type,
+                               f_t value,
+                               bool has_value,
+                               bool first_bound_for_var,
+                               SetLb&& set_lb,
+                               SetUb&& set_ub,
+                               SetType&& set_type,
+                               Error&& error)
+{
+  const f_t inf = std::numeric_limits<f_t>::infinity();
+  if (bound_type == "LO" || bound_type == "LI") {
+    // LI is LO plus the integer type tag.
+    set_lb(value);
+    if (bound_type == "LI") { set_type('I'); }
+  } else if (bound_type == "UP" || bound_type == "UI") {
+    // A negative upper bound on a variable's first record also opens its lower bound.
+    set_ub(value);
+    if (first_bound_for_var && value < f_t{0}) { set_lb(-inf); }
+    if (bound_type == "UI") { set_type('I'); }
+  } else if (bound_type == "FX") {
+    set_lb(value);
+    set_ub(value);
+  } else if (bound_type == "FR" || bound_type == "MI" || bound_type == "PL") {
+    // FR opens both sides, MI only the lower bound, PL only the upper bound.
+    if (bound_type != "PL") { set_lb(-inf); }
+    if (bound_type != "MI") { set_ub(inf); }
+  } else if (bound_type == "BV") {
+    // BV: bounds [0, 1] with type 'I'.
+    set_lb(f_t{0});
+    set_ub(f_t{1});
+    set_type('I');
+  } else if (bound_type == "SC") {
+    // SC uses value as the upper bound, so a record without one is rejected.
+    if (__unlikely(!has_value)) {
+      error("SC bound requires an upper bound value", bound_type);
+      return false;
+    }
+    set_ub(value);
+    set_type('S');
+  } else {
+    error("unknown bound type", bound_type);
+    return false;
+  }
+  return true;
+}
+
 template <typename i_t, typename f_t>
 static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
                                                 cursor_t& cursor,
@@ -2042,8 +2269,6 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
     size_t min_var          = SIZE_MAX;
     size_t max_var          = 0;
     size_t decreasing_order = 0;
-    bool saw_integer_type   = false;
-    bool saw_negative_upper = false;
     const char* error_ptr   = nullptr;
     char error_msg[192]     = {};
   };
@@ -2073,23 +2298,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
       size_t hint_idx = 0;
       auto lookup_var = [&](std::string_view var_name) {
         if (use_dense_lookup) { return state.col_lookup_dense_ordered(var_name); }
-        if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) {
-          return hint_idx + 1;
-        }
-        if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) { return hint_idx; }
-
-        size_t search_start = hint_idx + 2;
-        size_t search_end   = n_vars;
-      search_loop:
-        for (size_t i = search_start; i < search_end; ++i) {
-          if (state.var_names_sv[i] == var_name) { return i; }
-        }
-        if (search_start != 0) {
-          search_end   = hint_idx;
-          search_start = 0;
-          goto search_loop;
-        }
-        return SIZE_MAX;
+        return find_var_after_hint(state.var_names_sv, var_name, hint_idx);
       };
       try {
         while (cursor.ptr < cursor.end) {
@@ -2144,57 +2353,30 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
             accept_comment(cursor);
           }
 
-          if (bound_type == "LO") {
-            state.problem.variable_lower_bounds_[var_idx] = value;
-          } else if (bound_type == "UP") {
-            state.problem.variable_upper_bounds_[var_idx] = value;
-            if (first_bound_for_var && value < f_t{0}) {
-              state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
-              local.saw_negative_upper                      = true;
-            }
-          } else if (bound_type == "FX") {
-            state.problem.variable_lower_bounds_[var_idx] = value;
-            state.problem.variable_upper_bounds_[var_idx] = value;
-          } else if (bound_type == "FR") {
-            state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
-            state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits<f_t>::infinity();
-          } else if (bound_type == "MI") {
-            state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
-          } else if (bound_type == "PL") {
-            state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits<f_t>::infinity();
-          } else if (bound_type == "BV") {
-            state.problem.variable_lower_bounds_[var_idx] = 0;
-            state.problem.variable_upper_bounds_[var_idx] = 1;
-            state.problem.var_types_[var_idx]             = 'I';
-            local.saw_integer_type                        = true;
-          } else if (bound_type == "LI") {
-            state.problem.variable_lower_bounds_[var_idx] = value;
-            state.problem.var_types_[var_idx]             = 'I';
-            local.saw_integer_type                        = true;
-          } else if (bound_type == "UI") {
-            state.problem.variable_upper_bounds_[var_idx] = value;
-            if (first_bound_for_var && value < f_t{0}) {
-              state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits<f_t>::infinity();
-              local.saw_negative_upper                      = true;
-            }
-            state.problem.var_types_[var_idx] = 'I';
-            local.saw_integer_type            = true;
-          } else if (bound_type == "SC") {
-            if (__unlikely(!has_value)) {
-              std::snprintf(
-                local.error_msg, sizeof(local.error_msg), "SC bound requires an upper bound value");
-              local.error_ptr = cursor.ptr;
-              break;
+          auto set_lb    = [&](f_t x) { state.problem.variable_lower_bounds_[var_idx] = x; };
+          auto set_ub    = [&](f_t x) { state.problem.variable_upper_bounds_[var_idx] = x; };
+          auto set_type  = [&](char t) { state.problem.var_types_[var_idx] = t; };
+          auto set_error = [&](const char* msg, std::string_view type) {
+            if (type.empty() || std::strcmp(msg, "unknown bound type") != 0) {
+              std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", msg);
+            } else {
+              std::snprintf(local.error_msg,
+                            sizeof(local.error_msg),
+                            "%s: %.*s",
+                            msg,
+                            (int)type.size(),
+                            type.data());
             }
-            state.problem.variable_upper_bounds_[var_idx] = value;
-            state.problem.var_types_[var_idx]             = 'S';
-          } else {
-            std::snprintf(local.error_msg,
-                          sizeof(local.error_msg),
-                          "unknown bound type: %.*s",
-                          (int)bound_type.size(),
-                          bound_type.data());
             local.error_ptr = cursor.ptr;
+          };
+          if (!apply_bound_record(bound_type,
+                                  value,
+                                  has_value,
+                                  first_bound_for_var,
+                                  set_lb,
+                                  set_ub,
+                                  set_type,
+                                  set_error)) {
             break;
           }
 
@@ -2353,29 +2535,10 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
       if (__likely(state.col_dense_ordered)) {
         var_idx = state.col_lookup_dense_ordered(var_name);
         if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; }
-      } else if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) {
-        var_idx = hint_idx + 1;
-      } else if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) {
-        var_idx = hint_idx;
       } else {
-        size_t search_start = hint_idx + 2;
-        size_t search_end   = n_vars;
-
-      search_loop:
-        for (size_t i = search_start; i < search_end; ++i) {
-          if (state.var_names_sv[i] == var_name) {
-            var_idx = i;
-            goto found;
-          }
-        }
-        if (search_start != 0) {
-          search_end   = hint_idx;
-          search_start = 0;
-          goto search_loop;
-        }
-        aux_var = &state.bounds_only_vars[var_name];
+        var_idx = find_var_after_hint(state.var_names_sv, var_name, hint_idx);
+        if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; }
       }
-    found:
       if (var_idx != SIZE_MAX) { hint_idx = var_idx; }
       bool first_bound_for_var = aux_var == nullptr && !has_bound(var_idx);
 
@@ -2383,15 +2546,8 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
       bool has_value = false;
       accept_comment(cursor);
       if (!cursor.eol()) {
-        // bounds are often just set to 0 or 1
-        if (false && isdigit(cursor.ptr[0]) && cursor.ptr[1] == '\n' && cursor.ptr[2] == ' ') {
-          value = cursor.ptr[0] - '0';
-          cursor.ptr += 1;
-          has_value = true;
-        } else {
-          value     = (f_t)expect_number(cursor);
-          has_value = true;
-        }
+        value     = (f_t)expect_number(cursor);
+        has_value = true;
         accept_comment(cursor);
       }
 
@@ -2417,43 +2573,14 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
         }
       };
 
-      if (bound_type == "LO") {
-        set_lb(value);
-      } else if (bound_type == "UP") {
-        set_ub(value);
-        if (first_bound_for_var && value < f_t{0}) {
-          set_lb(-std::numeric_limits<f_t>::infinity());
-        }
-      } else if (bound_type == "FX") {
-        set_lb(value);
-        set_ub(value);
-      } else if (bound_type == "FR") {
-        set_lb(-std::numeric_limits<f_t>::infinity());
-        set_ub(std::numeric_limits<f_t>::infinity());
-      } else if (bound_type == "MI") {
-        set_lb(-std::numeric_limits<f_t>::infinity());
-      } else if (bound_type == "PL") {
-        set_ub(std::numeric_limits<f_t>::infinity());
-      } else if (bound_type == "BV") {
-        set_lb(0);
-        set_ub(1);
-        set_type('I');
-      } else if (bound_type == "LI") {
-        set_lb(value);
-        set_type('I');
-      } else if (bound_type == "UI") {
-        set_ub(value);
-        if (first_bound_for_var && value < f_t{0}) {
-          set_lb(-std::numeric_limits<f_t>::infinity());
+      auto set_error = [&](const char* msg, std::string_view type) {
+        if (std::strcmp(msg, "unknown bound type") == 0) {
+          cursor.error("%s: %.*s", msg, (int)type.size(), type.data());
         }
-        set_type('I');
-      } else if (bound_type == "SC") {
-        if (__unlikely(!has_value)) { cursor.error("SC bound requires an upper bound value"); }
-        set_ub(value);
-        set_type('S');
-      } else {
-        cursor.error("unknown bound type: %.*s", (int)bound_type.size(), bound_type.data());
-      }
+        cursor.error("%s", msg);
+      };
+      (void)apply_bound_record(
+        bound_type, value, has_value, first_bound_for_var, set_lb, set_ub, set_type, set_error);
       if (aux_var == nullptr) { mark_bound(var_idx); }
 
       expect_eol(cursor);
@@ -2831,15 +2958,10 @@ static void append_bounds_only_variables(parse_state_t<i_t, f_t>& state)
   state.problem.n_vars_ = (i_t)state.problem.var_names_.size();
 }
 
-template <typename Stream, typename i_t, typename f_t>
-static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_stream(
-  Stream& stream, const char* total_timer_name, const char* producer_task_name)
+template <typename i_t, typename f_t>
+static std::size_t init_problem_storage(
+  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& problem, std::size_t reserve_hint)
 {
-  auto total_timer = std::make_unique<scoped_timer_t>(total_timer_name);
-  omp_set_max_active_levels(2);
-
-  input_stream_view_t input = stream.view();
-  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> problem;
   problem.n_vars_                   = 0;
   problem.n_constraints_            = 0;
   problem.nnz_                      = 0;
@@ -2847,7 +2969,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
   problem.objective_scaling_factor_ = f_t{1};
   problem.objective_offset_         = f_t{0};
 
-  std::size_t reserve_size = std::max<std::size_t>(stream.reserve_size_hint(), 1024 * 1024);
+  std::size_t reserve_size = std::max<std::size_t>(reserve_hint, 1024 * 1024);
   std::size_t reserve_dim  = std::max((size_t)1000, reserve_size / 1000);
   problem.A_offsets_.reserve(reserve_dim);
   problem.b_.reserve(reserve_dim);
@@ -2859,6 +2981,31 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
   problem.var_names_.reserve(reserve_dim);
   problem.constraint_lower_bounds_.reserve(reserve_dim);
   problem.constraint_upper_bounds_.reserve(reserve_dim);
+  return reserve_dim;
+}
+
+static const char* trailing_endata_cursor_end(mps_phase_registry_t& registry)
+{
+  mps_phase_range_t quadratic = registry.range(mps_phase_kind::quadratic);
+  if (quadratic.present) { return quadratic.end; }
+  mps_phase_range_t bounds = registry.range(mps_phase_kind::bounds);
+  if (bounds.present) { return bounds.end; }
+  mps_phase_range_t ranges = registry.range(mps_phase_kind::ranges);
+  if (ranges.present) { return ranges.end; }
+  return registry.range(mps_phase_kind::rhs).end;
+}
+
+template <typename Stream, typename i_t, typename f_t>
+static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_stream(
+  Stream& stream, const char* total_timer_name, const char* producer_task_name)
+{
+  omp_set_max_active_levels(2);
+
+  input_stream_view_t input = stream.view();
+  timer_io_context_t timer_io_context(input.compressed_size);
+  auto total_timer = std::make_unique<scoped_timer_t>(total_timer_name);
+  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> problem;
+  std::size_t reserve_dim = init_problem_storage(problem, stream.reserve_size_hint());
 
   cursor_t cursor(input.data, 0);
   parse_state_t<i_t, f_t> state(problem, cursor);
@@ -2949,6 +3096,9 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
         input.registry->attach_event(mps_phase_kind::quadratic, ev_quadratic);
       }
 
+      // We intentionally keep LZ4/raw input as a stable full-buffer producer here. The
+      // progressive decoded-page lifetime prototype saved RSS, but made COLUMNS/merge slower
+      // and really wants a separate memory-limited parser pipeline instead of this fast path.
 #pragma omp task
       {
         MPS_NVTX_RANGE(producer_task_name, nvtx::colors::io);
@@ -2978,7 +3128,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
         });
       }
 
-#pragma omp task depend(in : columns_ready, rows_done) depend(out : columns_done)
+#pragma omp task depend(in : rows_done, columns_ready) depend(out : columns_done)
       {
         run_parser_task([&] {
           MPS_NVTX_RANGE("task_columns", nvtx::colors::columns);
@@ -3042,13 +3192,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
   append_bounds_only_variables(state);
 
   input.size = stream.size();
-  cursor.ptr = input.registry->range(mps_phase_kind::quadratic).present
-                 ? input.registry->range(mps_phase_kind::quadratic).end
-                 : (input.registry->range(mps_phase_kind::bounds).present
-                      ? input.registry->range(mps_phase_kind::bounds).end
-                      : (input.registry->range(mps_phase_kind::ranges).present
-                           ? input.registry->range(mps_phase_kind::ranges).end
-                           : input.registry->range(mps_phase_kind::rhs).end));
+  cursor.ptr = trailing_endata_cursor_end(*input.registry);
   cursor.end = input.data + input.size;
   if (!cursor.done()) { expect(cursor, "ENDATA"); }
 
@@ -3060,6 +3204,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
 struct small_raw_read_t {
   bool use_small_path = false;
   std::vector<char> buffer;
+  std::size_t size = 0;
 };
 
 static small_raw_read_t try_read_small_raw_file(const std::string& path)
@@ -3081,54 +3226,39 @@ static small_raw_read_t try_read_small_raw_file(const std::string& path)
     mps_parser_fail(
       error_type_t::RuntimeError, "Failed to determine raw MPS file size '%s'", path.c_str());
   }
-  std::size_t file_size = static_cast<std::size_t>(file_size_long);
+  std::size_t file_size = (std::size_t)file_size_long;
   if (file_size > MPS_SMALL_RAW_FILE_BYTES) { return {}; }
   if (std::fseek(file, 0, SEEK_SET) != 0) {
     mps_parser_fail(error_type_t::RuntimeError, "Failed to rewind raw MPS file '%s'", path.c_str());
   }
 
-  std::vector<char> buffer(file_size);
+  if (file_size > std::numeric_limits<std::size_t>::max() - input_buffer_padding_bytes) {
+    mps_parser_fail(error_type_t::OutOfMemoryError, "small raw input padding size overflow");
+  }
+  std::vector<char> buffer(file_size + input_buffer_padding_bytes);
   if (file_size != 0 && std::fread(buffer.data(), 1, file_size, file) != file_size) {
     mps_parser_fail(error_type_t::RuntimeError, "Failed to read raw MPS file '%s'", path.c_str());
   }
-  return {true, std::move(buffer)};
+  return {true, std::move(buffer), file_size};
 }
 
 template <typename i_t, typename f_t>
 static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_small_raw_file(
-  std::vector<char> buffer)
+  std::vector<char> buffer, std::size_t input_size)
 {
   auto total_timer = std::make_unique<scoped_timer_t>("parse_mps_fast_file_raw_small (total)");
   const char* data = buffer.data();
-  const char* end  = data + buffer.size();
+  const char* end  = data + input_size;
 
   mps_phase_registry_t registry;
   mps_section_block_scanner_t scanner(data, 1, registry);
   scanner.observe_block(0, data, end);
-  scanner.publish_ready(buffer.size());
+  scanner.publish_ready(input_size);
 
   cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> problem;
-  problem.n_vars_                   = 0;
-  problem.n_constraints_            = 0;
-  problem.nnz_                      = 0;
-  problem.maximize_                 = false;
-  problem.objective_scaling_factor_ = f_t{1};
-  problem.objective_offset_         = f_t{0};
-
-  std::size_t reserve_size = std::max<std::size_t>(buffer.size(), 1024 * 1024);
-  std::size_t reserve_dim  = std::max((size_t)1000, reserve_size / 1000);
-  problem.A_offsets_.reserve(reserve_dim);
-  problem.b_.reserve(reserve_dim);
-  problem.variable_lower_bounds_.reserve(reserve_dim);
-  problem.variable_upper_bounds_.reserve(reserve_dim);
-  problem.var_types_.reserve(reserve_dim);
-  problem.row_types_.reserve(reserve_dim);
-  problem.row_names_.reserve(reserve_dim);
-  problem.var_names_.reserve(reserve_dim);
-  problem.constraint_lower_bounds_.reserve(reserve_dim);
-  problem.constraint_upper_bounds_.reserve(reserve_dim);
+  std::size_t reserve_dim = init_problem_storage(problem, input_size);
 
-  cursor_t cursor(data, buffer.size());
+  cursor_t cursor(data, input_size);
   parse_state_t<i_t, f_t> state(problem, cursor);
   state.row_names_sv.reserve(reserve_dim);
 
@@ -3142,13 +3272,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
   parse_quadratic_range(state, registry.range(mps_phase_kind::quadratic), data);
   append_bounds_only_variables(state);
 
-  cursor.ptr = registry.range(mps_phase_kind::quadratic).present
-                 ? registry.range(mps_phase_kind::quadratic).end
-                 : (registry.range(mps_phase_kind::bounds).present
-                      ? registry.range(mps_phase_kind::bounds).end
-                      : (registry.range(mps_phase_kind::ranges).present
-                           ? registry.range(mps_phase_kind::ranges).end
-                           : registry.range(mps_phase_kind::rhs).end));
+  cursor.ptr = trailing_endata_cursor_end(registry);
   cursor.end = end;
   if (!cursor.done()) { expect(cursor, "ENDATA"); }
 
@@ -3170,7 +3294,7 @@ cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
   if (effective_method == FileReadMethod::Read) {
     small_raw_read_t small_raw = try_read_small_raw_file(path);
     if (small_raw.use_small_path) {
-      return parse_mps_fast_small_raw_file<i_t, f_t>(std::move(small_raw.buffer));
+      return parse_mps_fast_small_raw_file<i_t, f_t>(std::move(small_raw.buffer), small_raw.size);
     }
     RawInputStream stream(path);
     return parse_mps_fast_stream<RawInputStream, i_t, f_t>(
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index 08521eafc0..dc9ae86abc 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -15,6 +15,8 @@
 #include <atomic>
 #include <cerrno>
 #include <cstdint>
+#include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <limits>
 #include <mutex>
@@ -30,13 +32,11 @@ using cuopt::linear_programming::io::error_type_t;
 using cuopt::linear_programming::io::mps_parser_expects;
 using cuopt::linear_programming::io::mps_parser_fail;
 
-char* string_buffer;
-char* string_buffer_ptr;
-
 namespace {
 
-constexpr std::size_t raw_input_window_bytes     = 64ull * 1024ull * 1024ull;
-constexpr std::size_t raw_input_max_read_threads = 8;
+constexpr std::size_t raw_input_window_bytes              = 64ull * 1024ull * 1024ull;
+constexpr std::size_t raw_input_max_read_threads          = 8;
+constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull;
 
 bool path_has_suffix(const std::string& path, const char* suffix) noexcept
 {
@@ -45,28 +45,6 @@ bool path_has_suffix(const std::string& path, const char* suffix) noexcept
          path.compare(path.size() - suffix_len, suffix_len, suffix) == 0;
 }
 
-}  // namespace
-
-namespace {
-
-class FileDescriptor {
- public:
-  explicit FileDescriptor(int fd) : fd_(fd) {}
-  ~FileDescriptor()
-  {
-    if (fd_ >= 0) { ::close(fd_); }
-  }
-
-  FileDescriptor(const FileDescriptor&)            = delete;
-  FileDescriptor& operator=(const FileDescriptor&) = delete;
-
-  int get() const noexcept { return fd_; }
-  bool valid() const noexcept { return fd_ >= 0; }
-
- private:
-  int fd_;
-};
-
 std::size_t get_file_size(int fd, const std::string& path)
 {
   struct stat st;
@@ -76,14 +54,14 @@ std::size_t get_file_size(int fd, const std::string& path)
                     path.c_str(),
                     std::strerror(errno));
   }
-  return static_cast<std::size_t>(st.st_size);
+  return (std::size_t)st.st_size;
 }
 
 std::size_t system_page_size()
 {
   static std::size_t page_size = [] {
     long value = ::sysconf(_SC_PAGESIZE);
-    return value > 0 ? static_cast<std::size_t>(value) : static_cast<std::size_t>(4096);
+    return value > 0 ? (std::size_t)value : (std::size_t)4096;
   }();
   return page_size;
 }
@@ -100,25 +78,47 @@ std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
   return value + increment;
 }
 
+std::size_t add_input_padding(std::size_t size)
+{
+  if (size > std::numeric_limits<std::size_t>::max() - input_buffer_padding_bytes) {
+    mps_parser_fail(error_type_t::OutOfMemoryError, "input padding size overflow");
+  }
+  return size + input_buffer_padding_bytes;
+}
+
 }  // namespace
 
 RawInputStream::RawInputStream(const std::string& path) : path_(path)
 {
   MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io);
-  fd_ = ::open(path.c_str(), O_RDONLY);
-  if (fd_ < 0) {
+  buffered_fd_ = ::open(path.c_str(), O_RDONLY);
+  if (buffered_fd_ < 0) {
     mps_parser_fail(error_type_t::RuntimeError,
                     "Failed to open raw MPS file '%s': %s",
                     path.c_str(),
                     std::strerror(errno));
   }
 
-  file_size_    = get_file_size(fd_, path);
+  file_size_         = get_file_size(buffered_fd_, path);
+  fd_                = buffered_fd_;
+  bool use_direct_io = file_size_ > raw_input_direct_io_threshold_bytes;
+  if (const char* raw_direct = std::getenv("MPS_FAST_RAW_DIRECT_IO")) {
+    use_direct_io = raw_direct[0] != '0';
+  }
+  if (use_direct_io) {
+#ifdef O_DIRECT
+    int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT);
+    if (direct_fd >= 0) {
+      fd_        = direct_fd;
+      direct_io_ = true;
+    }
+#endif
+  }
   window_bytes_ = raw_input_window_bytes;
   window_count_ = std::max<std::size_t>(1, (file_size_ + window_bytes_ - 1) / window_bytes_);
 
-  output_mapped_size_ =
-    round_up_to_multiple(std::max<std::size_t>(file_size_, 1), system_page_size());
+  output_mapped_size_ = round_up_to_multiple(
+    std::max<std::size_t>(add_input_padding(file_size_), 1), system_page_size());
   output_region_ = mmap_region_t::anonymous(
     output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer");
   output_data_ = output_region_.char_data();
@@ -133,6 +133,7 @@ RawInputStream::RawInputStream(const std::string& path) : path_(path)
 RawInputStream::~RawInputStream()
 {
   if (fd_ >= 0) { ::close(fd_); }
+  if (buffered_fd_ >= 0 && buffered_fd_ != fd_) { ::close(buffered_fd_); }
 }
 
 const char* RawInputStream::data() const noexcept { return output_data_; }
@@ -156,7 +157,7 @@ void RawInputStream::run_decode_tasks()
   }
 
   std::size_t hw_threads =
-    std::max<std::size_t>(1, static_cast<std::size_t>(std::thread::hardware_concurrency()));
+    std::max<std::size_t>(1, (std::size_t)std::thread::hardware_concurrency());
   std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads);
   thread_count             = std::max<std::size_t>(1, std::min(thread_count, window_count_));
 
@@ -181,10 +182,19 @@ void RawInputStream::run_decode_tasks()
     {
       MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io);
       while (done < size) {
-        ssize_t got = ::pread(
-          fd_, output_data_ + offset + done, size - done, static_cast<off_t>(offset + done));
+        ssize_t got =
+          ::pread(fd_, output_data_ + offset + done, size - done, (off_t)(offset + done));
         if (got < 0) {
           if (errno == EINTR) { continue; }
+          if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0) {
+            got = ::pread(
+              buffered_fd_, output_data_ + offset + done, size - done, (off_t)(offset + done));
+            if (got > 0) {  // EOF (0) must fall through to the failure instead of retrying
+              done += (std::size_t)got;
+              continue;
+            }
+            if (got < 0 && errno == EINTR) { continue; }  // errno is only valid on failure
+          }
           mps_parser_fail(error_type_t::RuntimeError,
                           "Failed to pread raw MPS file '%s': %s",
                           path_.c_str(),
@@ -195,7 +205,7 @@ void RawInputStream::run_decode_tasks()
                           "Unexpected EOF while reading raw MPS file '%s'",
                           path_.c_str());
         }
-        done += static_cast<std::size_t>(got);
+        done += (std::size_t)got;
       }
     }
 
@@ -249,10 +259,11 @@ bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffi
 void drop_file_cache(const std::string& path)
 {
   MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io);
-  FileDescriptor fd(::open(path.c_str(), O_RDONLY));
-  if (!fd.valid()) { return; }
+  int fd = ::open(path.c_str(), O_RDONLY);
+  if (fd < 0) { return; }
 
-  ::posix_fadvise(fd.get(), 0, 0, POSIX_FADV_DONTNEED);
+  ::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
+  ::close(fd);
 }
 
 FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method)
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
index cc603e35d8..bab63c76cf 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.hpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -16,6 +16,10 @@
 
 namespace mps_fast {
 
+inline constexpr std::size_t input_buffer_padding_bytes = 64;
+
+struct lz4_pipeline_t;
+
 /**
  * @brief File reading method selection
  */
@@ -72,15 +76,7 @@ class Lz4InputStream {
   void run_decode_tasks();
 
  private:
-  struct Block {
-    std::size_t compressed_offset   = 0;
-    std::size_t compressed_size     = 0;
-    std::size_t read_end_offset     = 0;
-    std::size_t decompressed_offset = 0;
-    std::size_t decompressed_size   = 0;
-    std::size_t index               = 0;
-    bool uncompressed               = false;
-  };
+  friend struct lz4_pipeline_t;
 
   void commit_up_to(std::size_t bytes);
 
@@ -99,7 +95,6 @@ class Lz4InputStream {
   bool block_checksum_               = false;
   bool content_checksum_             = false;
   bool dict_id_                      = false;
-  std::vector<Block> blocks_;
   mps_phase_registry_t registry_;
   std::mutex commit_mutex_;
   std::mutex frontier_mutex_;
@@ -108,24 +103,6 @@ class Lz4InputStream {
   std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
   std::size_t next_block_  = 0;
   std::size_t ready_bytes_ = 0;
-
-  struct BatchMetric {
-    std::size_t index                    = 0;
-    std::size_t first_block              = 0;
-    std::size_t blocks                   = 0;
-    std::size_t file_bytes               = 0;
-    std::size_t decompressed_bytes       = 0;
-    double read_ms                       = 0.0;
-    double decode_ms                     = 0.0;
-    double commit_ms                     = 0.0;
-    double frontier_lock_wait_ms         = 0.0;
-    double frontier_update_ms            = 0.0;
-    double section_scan_ms               = 0.0;
-    std::size_t ready_bytes_delta        = 0;
-    std::size_t frontier_blocks_advanced = 0;
-    double total_ms                      = 0.0;
-  };
-  std::vector<BatchMetric> batch_metrics_;
 };
 
 class RawInputStream {
@@ -148,7 +125,9 @@ class RawInputStream {
 
  private:
   std::string path_;
-  int fd_ = -1;
+  int fd_          = -1;
+  int buffered_fd_ = -1;
+  bool direct_io_  = false;
   mmap_region_t output_region_;
   char* output_data_              = nullptr;
   std::size_t output_mapped_size_ = 0;
diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
index ab0d4c2c78..7d367db941 100644
--- a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
+++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
@@ -6,302 +6,63 @@
 #pragma once
 
 #include <simde/x86/avx2.h>
-#include <simde/x86/sse4.2.h>
 
+#include <cstddef>
 #include <cstdint>
 #include <cstring>
 
-#define __assume(cond)                    \
-  do {                                    \
-    if (!(cond)) __builtin_unreachable(); \
-  } while (0)
-
-#define BUCKET_COUNT (4194304 * 2 * 2 * 4)  // 2^22
-
-// Set to 1 for 32-byte keys, 0 for 16-byte keys
-#ifndef USE_32B_HASH_KEYS
-#define USE_32B_HASH_KEYS 1
-#endif
-
 namespace mps_fast {
 
-static inline uint32_t crcHash(const uint8_t* key, int64_t len)
-{
-  __assume(len < 256);
-
-  uint64_t crc = 0;
-  while (len > 8) {
-    uint64_t val = *(const uint64_t*)key;
-    crc          = simde_mm_crc32_u64(crc, val);
-    len -= 8;
-    key += 8;
-  }
-
-  // CRC the final 1-7 bytes
-  uint64_t val = *(const uint64_t*)key;
-  val &= ~(~0ULL << len * 8);  // Compiles to a bzhi instruction (also UB)
-  crc = simde_mm_crc32_u64(crc, val);
-
-  return crc;
-}
-
-static inline uint32_t crcHash32B(uint64_t q0, uint64_t q1, uint64_t q2, uint64_t q3)
+// FNV-1a over bytes in reverse order; row names commonly share long prefixes.
+static inline uint32_t fnv1a_hash(const char* ptr, std::size_t len)
 {
-  uint64_t crc = 0;
-  crc          = simde_mm_crc32_u64(crc, q0);
-  crc          = simde_mm_crc32_u64(crc, q1);
-  crc          = simde_mm_crc32_u64(crc, q2);
-  crc          = simde_mm_crc32_u64(crc, q3);
+  constexpr uint32_t fnv_offset = 2166136261u;
+  constexpr uint32_t fnv_prime  = 16777619u;
 
-  return crc;
-}
-
-// FNV-1a hash, processes bytes in reverse to better handle common-prefix strings
-static inline uint32_t fnv1a_hash(const char* ptr, size_t len)
-{
-  constexpr uint32_t FNV_OFFSET = 2166136261u;
-  constexpr uint32_t FNV_PRIME  = 16777619u;
-
-  uint32_t h    = FNV_OFFSET;
+  uint32_t h    = fnv_offset;
   const char* p = ptr + len;
   while (p > ptr) {
     --p;
     h ^= (uint8_t)*p;
-    h *= FNV_PRIME;
+    h *= fnv_prime;
   }
   return h;
 }
 
-struct __attribute__((packed)) hash_slot_32_t {
-  uint32_t count;
-  simde__m256i node;
-};
-
-struct alignas(16) hash_slot_16_t {
-  char key[16];
-  uint32_t count;
-};
-
-static inline bool key_cmpeq_16(const char* slot_key, simde__m128i key)
-{
-  simde__m128i slot_vec = simde_mm_loadu_si128((const simde__m128i*)slot_key);
-  int mask              = simde_mm_movemask_epi8(simde_mm_cmpeq_epi8(slot_vec, key));
-  return mask == 0xFFFF;
-}
-
-// 32-byte aligned slot: 28-byte key + 4-byte count = 32 bytes total (one cache line half)
+// 28-byte inline key + uint32 payload: two slots per 64-byte cache line.
+// key_store writes a full 32-byte vector starting at key[0] and therefore also
+// overwrites `count`, so callers must set `count` after storing the key. key_cmpeq
+// masks off those last four lanes, leaving `count` free for the row index + 1 sentinel.
 struct alignas(32) hash_slot_28_t {
   char key[28];
   uint32_t count;
 };
 
-static inline simde__m256i make_key_28(const char* ptr, size_t len)
-{
-  alignas(32) char buf[32] = {0};
-  size_t copy_len          = len < 28 ? len : 28;
-  std::memcpy(buf, ptr, copy_len);
-  return simde_mm256_load_si256((const simde__m256i*)buf);
-}
-
-// Compare 28-byte keys stored in simde__m256i (ignore last 4 bytes)
-static inline bool key_cmpeq_28(const char* slot_key, simde__m256i key)
-{
-  simde__m256i slot_vec = simde_mm256_loadu_si256((const simde__m256i*)slot_key);
-  int mask              = simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot_vec, key));
-  return (mask & 0x0FFFFFFF) == 0x0FFFFFFF;  // Only check first 28 bytes
-}
-
-#if USE_32B_HASH_KEYS
-using hash_key_t                = simde__m256i;
-using hash_slot_var_t           = hash_slot_28_t;
-constexpr size_t HASH_KEY_BYTES = 28;
-constexpr int HASH_KEY_CMP_MASK = 0x0FFFFFFF;
-#define make_key                 make_key_28
-#define key_cmpeq(slot_key, key) key_cmpeq_28(slot_key, key)
-#define key_store(slot_key, key) simde_mm256_store_si256((simde__m256i*)(slot_key), key)
-#else
-using hash_key_t                = simde__m128i;
-using hash_slot_var_t           = hash_slot_16_t;
-constexpr size_t HASH_KEY_BYTES = 16;
-constexpr int HASH_KEY_CMP_MASK = 0xFFFF;
-#define make_key                 make_key_16
-#define key_cmpeq(slot_key, key) key_cmpeq_16(slot_key, key)
-#define key_store(slot_key, key) simde_mm_store_si128((simde__m128i*)(slot_key), key)
-#endif
-
-// Legacy alias
-using hash_slot_t = hash_slot_32_t;
-
-struct hash_table_t {
-  hash_slot_t slots[BUCKET_COUNT];
-};
-
-static inline void hash_table_push(
-  hash_table_t* table, uint32_t hash, simde__m256i val, int len, const uint8_t* ptr)
-{
-  hash %= BUCKET_COUNT;
-
-  hash_slot_t* slot = &table->slots[hash];
-
-  if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == 0xFFFFFFFF) {
-    ++slot->count;
-    return;
-  }
-
-  bool relooped = false;
-
-loop:
-  for (; slot < &table->slots[BUCKET_COUNT]; ++slot) {
-    if (slot->count == 0) {
-      slot->count = 1;
-      slot->node  = val;
-      return;
-    }
-
-    if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == 0xFFFFFFFF) {
-      ++slot->count;
-      return;
-    }
-  }
+using hash_key_t                     = simde__m256i;
+using hash_slot_var_t                = hash_slot_28_t;
+constexpr std::size_t HASH_KEY_BYTES = 28;
 
-  if (!relooped) {
-    relooped = true;
-    slot     = &table->slots[0];
-    goto loop;
-  } else {
-    __builtin_trap();
-  }
-}
-
-extern char* string_buffer;
-extern char* string_buffer_ptr;
+static_assert(sizeof(hash_slot_28_t) == 32);
+static_assert(alignof(hash_slot_28_t) == 32);
+static_assert(offsetof(hash_slot_28_t, count) == HASH_KEY_BYTES);
 
-// Lookup: returns the stored value (count-1) or SIZE_MAX if not found
-// For small strings <= 32 bytes stored inline in node
-static inline size_t hash_table_lookup(const hash_table_t* table, uint32_t hash, simde__m256i val)
+static inline hash_key_t make_key(const char* ptr, std::size_t len)
 {
-  hash %= BUCKET_COUNT;
-  const hash_slot_t* slot = &table->slots[hash];
-
-  for (size_t i = 0; i < BUCKET_COUNT; ++i, ++slot) {
-    if (slot >= &table->slots[BUCKET_COUNT]) { slot = &table->slots[0]; }
-
-    if (slot->count == 0) {
-      return SIZE_MAX;  // Not found
-    }
-
-    if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == (int)0xFFFFFFFF) {
-      return slot->count - 1;  // Found, return index
-    }
-  }
-
-  return SIZE_MAX;  // Not found
+  alignas(32) char buf[32] = {};
+  std::memcpy(buf, ptr, len < HASH_KEY_BYTES ? len : HASH_KEY_BYTES);
+  return simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(buf));
 }
 
-// Insert with index: stores index+1 in count field (0 means empty)
-static inline void hash_table_insert(hash_table_t* table,
-                                     uint32_t hash,
-                                     simde__m256i val,
-                                     size_t index)
+static inline bool key_cmpeq(const char* slot_key, hash_key_t key)
 {
-  hash %= BUCKET_COUNT;
-  hash_slot_t* slot = &table->slots[hash];
-
-  for (size_t i = 0; i < BUCKET_COUNT; ++i, ++slot) {
-    if (slot >= &table->slots[BUCKET_COUNT]) { slot = &table->slots[0]; }
-
-    if (slot->count == 0) {
-      slot->count = (uint32_t)(index + 1);
-      slot->node  = val;
-      return;
-    }
-
-    if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == (int)0xFFFFFFFF) {
-      // Already exists, update index
-      slot->count = (uint32_t)(index + 1);
-      return;
-    }
-  }
-
-  __builtin_trap();
-}
-
-// Create simde__m256i key from string_view (zero-padded)
-static inline simde__m256i make_key_32(const char* ptr, size_t len)
-{
-  alignas(32) char buf[32] = {0};
-  if (len > 32) len = 32;
-  memcpy(buf, ptr, len);
-  return simde_mm256_load_si256((const simde__m256i*)buf);
-}
-
-// Create simde__m128i key from string_view (zero-padded, for strings <= 16 bytes)
-static inline simde__m128i make_key_16(const char* ptr, size_t len)
-{
-  alignas(16) char buf[16] = {0};
-  if (len > 16) len = 16;
-  memcpy(buf, ptr, len);
-  return simde_mm_load_si128((const simde__m128i*)buf);
-}
-
-static inline uint64_t m256_u64_lane(simde__m256i value, size_t lane)
-{
-  simde__m256i_private private_value = simde__m256i_to_private(value);
-  return private_value.u64[lane];
+  simde__m256i slot_vec = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(slot_key));
+  int mask              = simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot_vec, key));
+  return (mask & 0x0fffffff) == 0x0fffffff;
 }
 
-static inline void hash_table_push_ptr(hash_table_t* table,
-                                       uint32_t hash,
-                                       int len,
-                                       const uint8_t* ptr)
+static inline void key_store(char* slot_key, hash_key_t key)
 {
-  hash %= BUCKET_COUNT;
-
-  hash_slot_t* slot = &table->slots[hash];
-  bool relooped     = false;
-
-  uint32_t len_in_qwords = (len / 8) + (len % 8 ? 1 : 0);
-
-loop:
-  do {
-    uint64_t node_len = m256_u64_lane(slot->node, 3);
-    uint64_t node_tag = m256_u64_lane(slot->node, 0);
-    // nonzero, it's not a pointer of the same length, skip
-    if (__builtin_expect(node_len != (uint64_t)len, 0)) {
-      if (__builtin_expect(node_tag == 0, 1)) {
-        slot->count = 1;
-        slot->node  = simde_mm256_set_epi64x(len,
-                                            ((uint64_t*)ptr)[0],
-                                            (uint64_t)string_buffer_ptr,
-                                            0u | ((uint64_t)len_in_qwords << 32u));
-
-        memcpy(string_buffer_ptr, ptr, len);
-        string_buffer_ptr += len;
-        // Pad
-        string_buffer_ptr += (8 - len % 8) + 8;
-
-        return;
-      } else
-        continue;
-    }
-    if (m256_u64_lane(slot->node, 2) != ((uint64_t*)ptr)[0])  // First 8 bytes differ
-      continue;
-
-    uint8_t* other_ptr = reinterpret_cast<uint8_t*>(m256_u64_lane(slot->node, 1));
-    if (__builtin_expect(memcmp(ptr + 16, other_ptr + 16, len - 16) == 0, 1)) {
-      ++slot->count;
-
-      return;
-    }
-  } while (++slot < &table->slots[BUCKET_COUNT]);
-
-  if (!relooped) {
-    relooped = true;
-    slot     = &table->slots[0];
-    goto loop;
-  } else {
-    __builtin_trap();
-  }
+  simde_mm256_store_si256(reinterpret_cast<simde__m256i*>(slot_key), key);
 }
 
 }  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index a0be7daaf0..bb6657e303 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -19,12 +19,13 @@
 
 #include <algorithm>
 #include <atomic>
+#include <cassert>
 #include <cerrno>
-#include <chrono>
 #include <condition_variable>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <deque>
 #include <limits>
@@ -44,12 +45,13 @@ using cuopt::linear_programming::io::mps_parser_fail;
 
 namespace {
 
-constexpr uint32_t lz4_frame_magic                      = 0x184D2204u;
-constexpr uint32_t lz4_uncompressed_block               = 0x80000000u;
-constexpr uint32_t lz4_block_size_mask                  = 0x7FFFFFFFu;
-constexpr std::size_t lz4_pipeline_batch_bytes          = 64ull * 1024ull * 1024ull;
-constexpr std::size_t lz4_input_max_io_threads          = 8;
-constexpr std::size_t lz4_no_content_size_reserve_ratio = 16;
+constexpr uint32_t lz4_frame_magic                        = 0x184D2204u;
+constexpr uint32_t lz4_uncompressed_block                 = 0x80000000u;
+constexpr uint32_t lz4_block_size_mask                    = 0x7FFFFFFFu;
+constexpr std::size_t lz4_pipeline_batch_bytes            = 64ull * 1024ull * 1024ull;
+constexpr std::size_t lz4_decode_batch_decompressed_bytes = 256ull * 1024ull * 1024ull;
+constexpr std::size_t lz4_input_max_io_threads            = 8;
+constexpr std::size_t lz4_no_content_size_reserve_ratio   = 16;
 
 using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int);
 
@@ -168,10 +170,10 @@ std::size_t block_max_size_from_bd(unsigned char bd)
 
 std::size_t checked_size(uint64_t value, const char* label)
 {
-  if (value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max())) {
+  if (value > (uint64_t)std::numeric_limits<std::size_t>::max()) {
     mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 %s exceeds size_t", label);
   }
-  return static_cast<std::size_t>(value);
+  return (std::size_t)value;
 }
 
 std::size_t get_file_size(int fd, const std::string& path)
@@ -187,14 +189,14 @@ std::size_t get_file_size(int fd, const std::string& path)
     mps_parser_fail(
       error_type_t::RuntimeError, "Invalid negative file size for '%s'", path.c_str());
   }
-  return static_cast<std::size_t>(st.st_size);
+  return (std::size_t)st.st_size;
 }
 
 std::size_t system_page_size()
 {
   static std::size_t page_size = [] {
     long value = ::sysconf(_SC_PAGESIZE);
-    return value > 0 ? static_cast<std::size_t>(value) : static_cast<std::size_t>(4096);
+    return value > 0 ? (std::size_t)value : (std::size_t)4096;
   }();
   return page_size;
 }
@@ -219,10 +221,12 @@ std::size_t checked_mul(std::size_t a, std::size_t b, const char* label)
   return a * b;
 }
 
-double elapsed_ms_since(std::chrono::steady_clock::time_point start)
+std::size_t checked_add(std::size_t a, std::size_t b, const char* label)
 {
-  return std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start)
-    .count();
+  if (a > std::numeric_limits<std::size_t>::max() - b) {  // a + b would wrap around size_t
+    mps_parser_fail(error_type_t::OutOfMemoryError, "%s size overflow", label);
+  }
+  return a + b;  // sum of the two sizes; `label` only names the quantity in the error text
 }
 
 bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset)
@@ -230,9 +234,9 @@ bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset)
   std::size_t done = 0;
   while (done < bytes) {
     std::size_t remaining = bytes - done;
-    std::size_t chunk     = std::min<std::size_t>(
-      remaining, static_cast<std::size_t>(std::numeric_limits<ssize_t>::max()));
-    ssize_t got = ::pread(fd, dst + done, chunk, static_cast<off_t>(offset + done));
+    std::size_t chunk =
+      std::min<std::size_t>(remaining, (std::size_t)std::numeric_limits<ssize_t>::max());
+    ssize_t got = ::pread(fd, dst + done, chunk, (off_t)(offset + done));
     if (got < 0) {
       if (errno == EINTR) { continue; }
       return false;
@@ -241,7 +245,7 @@ bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset)
       errno = EIO;
       return false;
     }
-    done += static_cast<std::size_t>(got);
+    done += (std::size_t)got;
   }
   return true;
 }
@@ -359,8 +363,8 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path)
                     "unsupported LZ4 input: expected standard LZ4 frame magic");
   }
   offset += 4;
-  unsigned char flg = static_cast<unsigned char>(header[offset++]);
-  unsigned char bd  = static_cast<unsigned char>(header[offset++]);
+  unsigned char flg = (unsigned char)header[offset++];
+  unsigned char bd  = (unsigned char)header[offset++];
   unsigned version  = (flg >> 6) & 0x3u;
   if (version != 1) {
     mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame version");
@@ -403,6 +407,7 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path)
       checked_mul(compressed_size_, lz4_no_content_size_reserve_ratio, "LZ4 output reserve");
     reserve_size = std::max(reserve_size, block_max_size_);
   }
+  reserve_size = checked_add(reserve_size, input_buffer_padding_bytes, "LZ4 output padding");
 
   constexpr std::size_t huge_alignment = 2 * 1024 * 1024;
   output_mapped_size_                  = round_up_to_multiple(reserve_size, system_page_size());
@@ -460,310 +465,293 @@ void Lz4InputStream::commit_up_to(std::size_t bytes)
   output_committed_size_ = new_committed;
 }
 
-void Lz4InputStream::run_decode_tasks()
-{
-  MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io);
-  std::exception_ptr first_error = nullptr;
-  std::mutex error_mutex;
-  std::atomic_bool stop_workers{false};
-  auto mark_error = [&](std::exception_ptr eptr) {
+struct resident_block_desc_t {  // LZ4 block decode task (window_index use: TODO confirm)
+  const char* src                 = nullptr;  // presumably the block's payload bytes - confirm
+  std::size_t compressed_size     = 0;        // presumably payload length in bytes - confirm
+  std::size_t decompressed_offset = 0;        // presumably offset into the output - confirm
+  std::size_t decompressed_size   = 0;        // presumably output size or bound - confirm
+  std::size_t index               = 0;        // presumably the block ordinal - confirm
+  std::size_t window_index        = std::numeric_limits<std::size_t>::max();
+  bool uncompressed               = false;  // presumably "stored, not compressed" - confirm
+};
+
+struct lz4_pipeline_t {
+  explicit lz4_pipeline_t(Lz4InputStream& input_)  // split the file into read windows
+    : input(input_),  // NOTE(review): relies on member declaration order - confirm
+      window_count((input.compressed_size_ + window_bytes - 1) / window_bytes),  // ceil div
+      windows(window_count),
+      io_threads(std::min(lz4_input_max_io_threads, window_count)),  // never more than windows
+      window_done(window_count, 0),
+      window_refs(window_count),
+      window_scanned(window_count),
+      window_released(window_count)
+  {
+    for (std::size_t i = 0; i < window_count; ++i) {
+      std::size_t offset     = i * window_bytes;  // windows are laid out back to back
+      std::size_t size       = std::min(window_bytes, input.compressed_size_ - offset);
+      windows[i].index       = i;
+      windows[i].file_offset = offset;
+      windows[i].size        = size;  // the buffer itself is allocated in run_reader_stage
+      window_refs[i].store(0, std::memory_order_relaxed);
+      window_scanned[i].store(0, std::memory_order_relaxed);
+      window_released[i].store(0, std::memory_order_relaxed);
+    }
+  }
+
+  void run()  // start every stage, join them all, then rethrow the first recorded error
+  {
+    start_readers();
+    std::thread scanner(&lz4_pipeline_t::run_scanner_stage, this);
+    start_decoders();  // NOTE(review): if this throws, scanner is destroyed while joinable
+
+    for (auto& reader : readers) {
+      reader.join();
+    }
+    scanner.join();
+    for (auto& worker : decoders) {
+      worker.join();
+    }
+    if (first_error) { std::rethrow_exception(first_error); }  // all threads are joined here
+  }
+
+  void finalize()  // fix the final output size, commit it plus padding, publish it
+  {
+    input.output_view_size_ = input.ready_bytes_;  // view size = bytes recorded as ready
+    input.commit_up_to(  // commit through the padding that follows the view
+      checked_add(input.output_view_size_, input_buffer_padding_bytes, "LZ4 output padding"));
+    input.section_scanner_->publish_ready(input.output_view_size_);
+  }
+
+  void mark_error(std::exception_ptr eptr)
+  {
     std::lock_guard<std::mutex> lock(error_mutex);
     if (!first_error) {
       first_error = eptr;
       stop_workers.store(true, std::memory_order_release);
     }
-  };
-
-  const std::size_t window_bytes = lz4_pipeline_batch_bytes;
-  const std::size_t window_count = (compressed_size_ + window_bytes - 1) / window_bytes;
-  std::vector<lz4_resident_window_t> windows(window_count);
-  for (std::size_t i = 0; i < window_count; ++i) {
-    std::size_t offset     = i * window_bytes;
-    std::size_t size       = std::min(window_bytes, compressed_size_ - offset);
-    windows[i].index       = i;
-    windows[i].file_offset = offset;
-    windows[i].size        = size;
-    windows[i].data.reset(new char[size]);
-  }
-
-  const std::size_t io_threads = std::min(lz4_input_max_io_threads, window_count);
-  std::atomic<double> decoder_wait_batch_ms{0.0};
-  std::atomic<double> decoder_active_batch_ms{0.0};
-
-  struct resident_block_desc_t {
-    const char* src                 = nullptr;
-    std::size_t compressed_size     = 0;
-    std::size_t decompressed_offset = 0;
-    std::size_t decompressed_size   = 0;
-    std::size_t index               = 0;
-    bool uncompressed               = false;
-  };
-
-  std::atomic_size_t next_window{0};
-  std::vector<unsigned char> window_done(window_count, 0);
-  std::mutex window_mutex;
-  std::condition_variable window_cv;
-
-  std::deque<std::vector<resident_block_desc_t>> desc_queue;
-  bool scanner_done = false;
-  std::mutex desc_mutex;
-  std::condition_variable desc_cv;
+  }
 
-  auto fail_and_notify = [&](std::exception_ptr eptr) {
+  void fail_and_notify(std::exception_ptr eptr)  // record the error, then wake both cv's
+  {
     mark_error(eptr);
     window_cv.notify_all();
     desc_cv.notify_all();
-  };
+  }
+
+  void add_compressed_resident(std::size_t bytes)  // add to the resident-bytes counter
+  {
+    compressed_resident_bytes.fetch_add(bytes, std::memory_order_relaxed);
+  }
+
+  void try_release_window(std::size_t index)  // free a window buffer if all guards pass
+  {
+    if (index >= window_count) { return; }  // out-of-range index is ignored
+    if (window_scanned[index].load(std::memory_order_acquire) == 0) { return; }  // not scanned
+    if (window_refs[index].load(std::memory_order_acquire) != 0) { return; }  // still referenced
+    uint8_t expected = 0;  // only the caller that flips released 0 -> 1 goes on to free
+    if (!window_released[index].compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) {
+      return;
+    }
+    std::lock_guard<std::mutex> lock(window_release_mutex);
+    if (windows[index].data) {  // skip windows whose buffer was never allocated
+      windows[index].data.reset();
+      compressed_resident_bytes.fetch_sub(windows[index].size, std::memory_order_relaxed);
+    }
+  }
+
+  void mark_windows_scanned_before(std::size_t offset)  // flag windows wholly below offset
+  {
+    std::size_t last_excl = std::min(window_count, offset / window_bytes);
+    for (std::size_t wi = 0; wi < last_excl; ++wi) {  // NOTE(review): restarts at 0 every call
+      window_scanned[wi].store(1, std::memory_order_release);
+      try_release_window(wi);  // frees the buffer only if its other guards also pass
+    }
+  }
 
-  auto decode_worker = [&](std::size_t tid) {
+  void start_readers()  // spawn io_threads threads, each running run_reader_stage(t)
+  {
+    readers.reserve(io_threads);
+    for (std::size_t t = 0; t < io_threads; ++t) {
+      readers.emplace_back(&lz4_pipeline_t::run_reader_stage, this, t);
+    }
+  }
+
+  void run_reader_stage(std::size_t tid)  // reader thread: pread windows, publish each as done
+  {
+    try {  // nothing may escape a std::thread entry point (that would call std::terminate)
+      std::string thread_name = "lz4-window-read-" + std::to_string(tid);
+      nvtx::name_current_thread(thread_name.c_str());
+      while (!stop_workers.load(std::memory_order_acquire)) {
+        std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed);
+        if (index >= windows.size()) { break; }  // every window has been claimed
+        auto& w = windows[index];
+        w.data.reset(new char[w.size]);  // may throw std::bad_alloc; handled by the catch below
+        add_compressed_resident(w.size);
+        bool ok = false;
+        {
+          MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io);
+          ok = pread_full_plain(input.fd_, w.data.get(), w.size, w.file_offset);
+        }
+        if (!ok) {
+          mps_parser_fail(error_type_t::RuntimeError,
+                          "Failed to pread LZ4 resident window: %s",
+                          std::strerror(errno));
+          return;  // only reached if mps_parser_fail returns instead of throwing
+        }
+        {
+          MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic);
+          std::lock_guard<std::mutex> lock(window_mutex);
+          window_done[index] = 1;
+        }
+        window_cv.notify_all();  // wake whoever is waiting on window_cv for this window
+      }
+    } catch (...) {  // same policy as run_decoder_stage: record the error and wake waiters
+      fail_and_notify(std::current_exception());
+    }
+  }
+
+  void start_decoders()  // spawn the decoder threads, each running run_decoder_stage(t)
+  {
+    decoders.reserve(io_threads);  // decoder count reuses io_threads
+    for (std::size_t t = 0; t < io_threads; ++t) {
+      decoders.emplace_back(&lz4_pipeline_t::run_decoder_stage, this, t);
+    }
+  }
+
+  void run_decoder_stage(std::size_t tid)
+  {
     try {
       std::string thread_name = "lz4-window-decode-" + std::to_string(tid);
       nvtx::name_current_thread(thread_name.c_str());
       while (true) {
-        std::vector<resident_block_desc_t> batch;
-        {
-          MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io);
-          std::unique_lock<std::mutex> lock(desc_mutex);
-          const auto wait_start = std::chrono::steady_clock::now();
-          desc_cv.wait(lock, [&] {
-            return stop_workers.load(std::memory_order_acquire) || scanner_done ||
-                   !desc_queue.empty();
-          });
-          decoder_wait_batch_ms.fetch_add(elapsed_ms_since(wait_start), std::memory_order_relaxed);
-          if (stop_workers.load(std::memory_order_acquire)) { return; }
-          if (desc_queue.empty()) {
-            if (scanner_done) return;
-            continue;
-          }
-          batch = std::move(desc_queue.front());
-          desc_queue.pop_front();
-        }
-
-        const auto decode_start = std::chrono::steady_clock::now();
-        MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode);
-        for (const auto& block : batch) {
-          char* dst  = output_data_ + block.decompressed_offset;
-          int actual = 0;
-          {
-            MPS_NVTX_RANGE("lz4_decode_block_payload", nvtx::colors::decode);
-            if (block.uncompressed) {
-              std::memcpy(dst, block.src, block.decompressed_size);
-              actual = static_cast<int>(block.decompressed_size);
-            } else if (block.compressed_size >
-                         static_cast<std::size_t>(std::numeric_limits<int>::max()) ||
-                       block.decompressed_size >
-                         static_cast<std::size_t>(std::numeric_limits<int>::max())) {
-              actual = -1;
-            } else {
-              actual = lz4_decompress_safe_runtime(block.src,
-                                                   dst,
-                                                   static_cast<int>(block.compressed_size),
-                                                   static_cast<int>(block.decompressed_size));
-            }
-          }
-          if (actual < 0 || static_cast<std::size_t>(actual) > block.decompressed_size) {
-            mps_parser_fail(error_type_t::ValidationError,
-                            "LZ4 input block decompressed to invalid size");
-          }
-
-          std::size_t actual_size = static_cast<std::size_t>(actual);
-          {
-            MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic);
-            section_scanner_->observe_block(block.index, dst, dst + actual_size);
-          }
-          std::size_t before = 0;
-          std::size_t after  = 0;
-          {
-            MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic);
-            frontier_mutex_.lock();
-            block_done_[block.index] = 1;
-            block_end_[block.index]  = block.decompressed_offset + actual_size;
-            before                   = ready_bytes_;
-            while (next_block_ < block_done_.size() && block_done_[next_block_]) {
-              ready_bytes_ = block_end_[next_block_];
-              ++next_block_;
-            }
-            after = ready_bytes_;
-            frontier_mutex_.unlock();
-          }
-          if (after > before) {
-            MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic);
-            section_scanner_->publish_ready(after);
-          }
-        }
-        decoder_active_batch_ms.fetch_add(elapsed_ms_since(decode_start),
-                                          std::memory_order_relaxed);
+        std::vector<resident_block_desc_t> batch = wait_for_decode_batch();
+        if (batch.empty()) { return; }
+        decode_batch(batch);
       }
     } catch (...) {
       fail_and_notify(std::current_exception());
     }
-  };
+  }
 
-  std::vector<std::thread> readers;
-  readers.reserve(io_threads);
-  for (std::size_t t = 0; t < io_threads; ++t) {
-    readers.emplace_back([&, t] {
-      std::string thread_name = "lz4-window-read-" + std::to_string(t);
-      nvtx::name_current_thread(thread_name.c_str());
-      while (!stop_workers.load(std::memory_order_acquire)) {
-        std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed);
-        if (index >= windows.size()) { break; }
-        auto& w = windows[index];
-        bool ok = false;
-        {
-          MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io);
-          ok = pread_full_plain(fd_, w.data.get(), w.size, w.file_offset);
-        }
-        if (!ok) {
-          try {
-            mps_parser_fail(error_type_t::RuntimeError,
-                            "Failed to pread LZ4 resident window: %s",
-                            std::strerror(errno));
-          } catch (...) {
-            fail_and_notify(std::current_exception());
-          }
-          return;
-        }
-        {
-          MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic);
-          std::lock_guard<std::mutex> lock(window_mutex);
-          window_done[index] = 1;
-        }
-        window_cv.notify_all();
-      }
+  std::vector<resident_block_desc_t> wait_for_decode_batch()  // empty result = stop decoding
+  {
+    MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io);
+    std::unique_lock<std::mutex> lock(desc_mutex);
+    desc_cv.wait(lock, [&] {  // sleep until stopped, the scanner is done, or work is queued
+      return stop_workers.load(std::memory_order_acquire) || scanner_done || !desc_queue.empty();
     });
+    if (stop_workers.load(std::memory_order_acquire) || desc_queue.empty()) { return {}; }
+    std::vector<resident_block_desc_t> batch = std::move(desc_queue.front());
+    desc_queue.pop_front();  // take ownership of the oldest queued batch
+    return batch;
   }
 
-  std::atomic_size_t blocks_scanned{0};
-  std::vector<std::vector<char>> crossing_payloads;
-  const auto read_wall_start = std::chrono::steady_clock::now();
-  std::thread scanner([&] {
-    try {
-      nvtx::name_current_thread("lz4-metadata-scan");
-      lz4_resident_windows_t resident(windows);
-      auto wait_range_ready = [&](std::size_t begin, std::size_t size) {
-        if (size == 0) return;
-        std::size_t first = begin / window_bytes;
-        std::size_t last  = (begin + size - 1) / window_bytes;
-        for (std::size_t wi = first; wi <= last; ++wi) {
-          MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io);
-          std::unique_lock<std::mutex> lock(window_mutex);
-          window_cv.wait(lock, [&] {
-            return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0;
-          });
-          if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) {
-            mps_parser_fail(error_type_t::RuntimeError,
-                            "LZ4 metadata scanner stopped before required window was ready");
-          }
-        }
-      };
-      auto push_batch = [&](std::vector<resident_block_desc_t>& batch) {
-        if (batch.empty()) return;
-        {
-          MPS_NVTX_RANGE("lz4_metadata_commit_batch", nvtx::colors::alloc);
-          commit_up_to(batch.back().decompressed_offset + batch.back().decompressed_size);
-        }
-        {
-          MPS_NVTX_RANGE("lz4_metadata_enqueue_batch", nvtx::colors::generic);
-          std::lock_guard<std::mutex> lock(desc_mutex);
-          desc_queue.push_back(std::move(batch));
-        }
-        batch.clear();
-        desc_cv.notify_one();
-      };
-
-      std::vector<resident_block_desc_t> batch;
-      batch.reserve(1024);
-      std::size_t offset              = header_size_;
-      std::size_t decompressed_offset = 0;
-      while (true) {
-        MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic);
-        wait_range_ready(offset, 4);
-        if (offset + 4 > compressed_size_) {
-          mps_parser_fail(error_type_t::ValidationError,
-                          "truncated LZ4 frame while reading block header");
-        }
-        uint32_t raw_block_size = resident.read_u32(offset);
-        offset += 4;
-        if (raw_block_size == 0) { break; }
-
-        bool uncompressed              = (raw_block_size & lz4_uncompressed_block) != 0;
-        std::size_t block_payload_size = raw_block_size & lz4_block_size_mask;
-        if (block_payload_size == 0) {
-          mps_parser_fail(error_type_t::ValidationError, "invalid zero-sized LZ4 data block");
-        }
-        if (block_payload_size > block_max_size_ && uncompressed) {
-          mps_parser_fail(error_type_t::ValidationError,
-                          "LZ4 uncompressed block exceeds frame block maximum");
-        }
-        if (content_size_present_ && decompressed_offset >= content_size_) {
-          mps_parser_fail(error_type_t::ValidationError,
-                          "LZ4 frame contains more blocks than content size allows");
-        }
-        wait_range_ready(offset, block_payload_size);
-        if (offset + block_payload_size > compressed_size_) {
-          mps_parser_fail(error_type_t::ValidationError,
-                          "truncated LZ4 frame while reading block payload");
-        }
-
-        std::size_t decompressed_size = block_payload_size;
-        if (!uncompressed) {
-          if (content_size_present_) {
-            decompressed_size = std::min(block_max_size_, content_size_ - decompressed_offset);
-          } else {
-            decompressed_size = block_max_size_;
-          }
-        }
-        if (content_size_present_ && decompressed_size > content_size_ - decompressed_offset) {
-          mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size");
-        }
+  void decode_batch(const std::vector<resident_block_desc_t>& batch)
+  {
+    MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode);
+    for (const auto& block : batch) {  // descriptors are decoded one by one, in batch order
+      decode_block(block);
+    }
+  }
 
-        const char* src = resident.ptr_if_contiguous(offset, block_payload_size);
-        if (src == nullptr) {
-          crossing_payloads.emplace_back(block_payload_size);
-          resident.copy_to(offset, crossing_payloads.back().data(), block_payload_size);
-          src = crossing_payloads.back().data();
-        }
-        batch.push_back({src,
-                         block_payload_size,
-                         decompressed_offset,
-                         decompressed_size,
-                         blocks_scanned.load(std::memory_order_relaxed),
-                         uncompressed});
-        blocks_scanned.fetch_add(1, std::memory_order_relaxed);
-        decompressed_offset += decompressed_size;
-        offset += block_payload_size;
-        if (block_checksum_) {
-          wait_range_ready(offset, 4);
-          if (offset + 4 > compressed_size_) {
-            mps_parser_fail(error_type_t::ValidationError,
-                            "truncated LZ4 frame while reading block checksum");
-          }
-          offset += 4;
-        }
-        if (blocks_scanned.load(std::memory_order_relaxed) > block_done_.size()) {
-          mps_parser_fail(error_type_t::OutOfMemoryError,
-                          "LZ4 input block count exceeded reserved metadata slots");
-        }
-        if (batch.size() >= 1024) { push_batch(batch); }
-      }
-      if (content_checksum_) {
-        wait_range_ready(offset, 4);
-        if (offset + 4 > compressed_size_) {
-          mps_parser_fail(error_type_t::ValidationError,
-                          "truncated LZ4 frame while reading content checksum");
-        }
-        offset += 4;
+  void decode_block(const resident_block_desc_t& block)
+  {  // Decodes one block into its output slice, then drops its window ref and publishes it.
+    char* dst  = input.output_data_ + block.decompressed_offset;
+    int actual = 0;
+    {
+      MPS_NVTX_RANGE("lz4_decode_block_payload", nvtx::colors::decode);
+      if (block.uncompressed) {
+        std::memcpy(dst, block.src, block.decompressed_size);
+        actual = (int)block.decompressed_size;
+      } else if (block.compressed_size > (std::size_t)std::numeric_limits<int>::max() ||
+                 block.decompressed_size > (std::size_t)std::numeric_limits<int>::max()) {
+        actual = -1;  // sizes do not fit the int parameters below; reported as invalid size
+      } else {
+        actual = lz4_decompress_safe_runtime(
+          block.src, dst, (int)block.compressed_size, (int)block.decompressed_size);
       }
-      if (content_size_present_ && decompressed_offset != content_size_) {
-        mps_parser_fail(error_type_t::ValidationError,
-                        "LZ4 frame ended before declared content size was reached");
+    }
+    if (actual < 0 || (std::size_t)actual > block.decompressed_size) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 input block decompressed to invalid size");
+    }
+    release_block_window_ref(block);  // block.src is not read after this point
+    publish_decoded_block(block, dst, (std::size_t)actual);
+  }
+
+  void release_block_window_ref(const resident_block_desc_t& block)
+  {  // A window_index of max() marks a block that holds no window reference.
+    if (block.window_index == std::numeric_limits<std::size_t>::max()) { return; }
+    uint32_t old = window_refs[block.window_index].fetch_sub(1, std::memory_order_acq_rel);
+    (void)old;  // keeps `old` referenced when assert() compiles to nothing
+    assert(old > 0);
+    if (old == 1) { try_release_window(block.window_index); }  // this was the last reference
+  }
+
+  void publish_decoded_block(const resident_block_desc_t& block, char* dst, std::size_t actual_size)
+  {
+    {  // the section scanner sees this block's bytes before the block is marked done
+      MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic);
+      input.section_scanner_->observe_block(block.index, dst, dst + actual_size);
+    }
+    std::size_t before = 0;
+    std::size_t after  = 0;
+    {  // under frontier_mutex_: mark this block done, then advance ready_bytes_ over done blocks
+      MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic);
+      std::lock_guard<std::mutex> lock(input.frontier_mutex_);
+      input.block_done_[block.index] = 1;
+      input.block_end_[block.index]  = block.decompressed_offset + actual_size;
+      before                         = input.ready_bytes_;
+      while (input.next_block_ < input.block_done_.size() && input.block_done_[input.next_block_]) {
+        input.ready_bytes_ = input.block_end_[input.next_block_];
+        ++input.next_block_;
       }
-      if (offset != compressed_size_) {
-        mps_parser_fail(error_type_t::ValidationError,
-                        "LZ4 input contains trailing data after the first frame");
+      after = input.ready_bytes_;
+    }
+    if (after > before) {  // NOTE(review): runs outside frontier_mutex_ - confirm call ordering
+      MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic);
+      input.section_scanner_->publish_ready(after);
+    }
+  }
+
+  void wait_range_ready(std::size_t begin, std::size_t size)
+  {  // Blocks until every window overlapping [begin, begin + size) is marked in window_done.
+    if (size == 0) return;
+    std::size_t first = begin / window_bytes;
+    std::size_t last  = (begin + size - 1) / window_bytes;  // window holding the last byte
+    for (std::size_t wi = first; wi <= last; ++wi) {  // NOTE(review): wi is not range-checked
+      MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io);
+      std::unique_lock<std::mutex> lock(window_mutex);
+      window_cv.wait(
+        lock, [&] { return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0; });
+      if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) {
+        mps_parser_fail(error_type_t::RuntimeError,
+                        "LZ4 metadata scanner stopped before required window was ready");
       }
-      push_batch(batch);
+    }
+  }
+
+  void push_batch(std::vector<resident_block_desc_t>& batch)
+  {  // Hands `batch` to the descriptor queue and leaves it empty; no-op for an empty batch.
+    if (batch.empty()) return;
+    {  // commit_up_to() receives the output offset just past the batch's last block
+      MPS_NVTX_RANGE("lz4_metadata_commit_batch", nvtx::colors::alloc);
+      input.commit_up_to(batch.back().decompressed_offset + batch.back().decompressed_size);
+    }
+    {
+      MPS_NVTX_RANGE("lz4_metadata_enqueue_batch", nvtx::colors::generic);
+      std::lock_guard<std::mutex> lock(desc_mutex);
+      desc_queue.push_back(std::move(batch));
+    }
+    batch.clear();  // moved-from vector: force a defined empty state for the caller's reuse
+    desc_cv.notify_one();  // notified after desc_mutex has been released
+  }
+
+  void run_scanner_stage()
+  {
+    try {
+      nvtx::name_current_thread("lz4-metadata-scan");
+      scan_lz4_metadata();
       {
         std::lock_guard<std::mutex> lock(desc_mutex);
         scanner_done = true;
@@ -776,37 +764,177 @@ void Lz4InputStream::run_decode_tasks()
       }
       fail_and_notify(std::current_exception());
     }
-  });
-
-  std::vector<std::thread> io_workers;
-  io_workers.reserve(io_threads);
-  for (std::size_t t = 0; t < io_threads; ++t) {
-    io_workers.emplace_back(decode_worker, t);
-  }
-  for (auto& reader : readers) {
-    reader.join();
-  }
-  const double read_wall_ms = elapsed_ms_since(read_wall_start);
-  scanner.join();
-  for (auto& worker : io_workers) {
-    worker.join();
-  }
-  if (first_error) std::rethrow_exception(first_error);
-  output_view_size_ = ready_bytes_;
-  section_scanner_->publish_ready(output_view_size_);
-
-  const double compressed_mb = static_cast<double>(compressed_size_) / (1024.0 * 1024.0);
-  const double read_effective_mbps =
-    read_wall_ms > 0.0 ? compressed_mb / (read_wall_ms / 1000.0) : 0.0;
-  const double decoder_wait_ms   = decoder_wait_batch_ms.load(std::memory_order_relaxed);
-  const double decoder_active_ms = decoder_active_batch_ms.load(std::memory_order_relaxed);
-  const double decoder_total_ms  = decoder_wait_ms + decoder_active_ms;
-  const double decoder_wait_ratio =
-    decoder_total_ms > 0.0 ? decoder_wait_ms / decoder_total_ms : 0.0;
-  std::fprintf(stderr,
-               "[LZ4_IO] read_effective_MBps=%.3f decoder_wait_ratio=%.6f\n",
-               read_effective_mbps,
-               decoder_wait_ratio);
+  }
+
+  // Walks the frame's block size words in file order, describes each data block through
+  // scan_one_block() and queues the descriptors with push_batch(); fails via mps_parser_fail().
+  void scan_lz4_metadata()
+  {
+    lz4_resident_windows_t resident(windows);
+    std::vector<resident_block_desc_t> batch;
+    batch.reserve(lz4_decode_batch_decompressed_bytes / input.block_max_size_ + 1);
+    std::size_t batch_decoded_bytes = 0;
+    std::size_t offset              = input.header_size_;
+    std::size_t decompressed_offset = 0;
+    blocks_scanned.store(0, std::memory_order_relaxed);
+    while (true) {
+      MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic);
+      // Validate before waiting: wait_range_ready() indexes window_done per window number.
+      if (offset + 4 > input.compressed_size_) {
+        mps_parser_fail(error_type_t::ValidationError,
+                        "truncated LZ4 frame while reading block header");
+      }
+      wait_range_ready(offset, 4);
+      uint32_t raw_block_size = resident.read_u32(offset);
+      offset += 4;
+      if (raw_block_size == 0) { break; }  // a zero size word ends the block sequence
+      resident_block_desc_t block =
+        scan_one_block(resident, raw_block_size, offset, decompressed_offset);
+      batch_decoded_bytes += block.decompressed_size;
+      batch.push_back(block);
+      blocks_scanned.fetch_add(1, std::memory_order_relaxed);
+      if (blocks_scanned.load(std::memory_order_relaxed) > input.block_done_.size()) {
+        mps_parser_fail(error_type_t::OutOfMemoryError,
+                        "LZ4 input block count exceeded reserved metadata slots");
+      }
+      if (batch_decoded_bytes >= lz4_decode_batch_decompressed_bytes) {
+        push_batch(batch);
+        batch_decoded_bytes = 0;
+      }
+    }
+    scan_frame_footer(offset, decompressed_offset);
+    push_batch(batch);
+    mark_windows_scanned_before(input.compressed_size_);
+  }
+
+  // Validates the data block whose size word the caller already consumed and returns its
+  // descriptor. Advances `offset` past payload and block checksum, and `decompressed_offset`.
+  resident_block_desc_t scan_one_block(lz4_resident_windows_t& resident,
+                                       uint32_t raw_block_size,
+                                       std::size_t& offset,
+                                       std::size_t& decompressed_offset)
+  {
+    bool uncompressed              = (raw_block_size & lz4_uncompressed_block) != 0;
+    std::size_t block_payload_size = raw_block_size & lz4_block_size_mask;
+    if (block_payload_size == 0) {
+      mps_parser_fail(error_type_t::ValidationError, "invalid zero-sized LZ4 data block");
+    }
+    if (block_payload_size > input.block_max_size_ && uncompressed) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 uncompressed block exceeds frame block maximum");
+    }
+    if (input.content_size_present_ && decompressed_offset >= input.content_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 frame contains more blocks than content size allows");
+    }
+    // The size word is untrusted: reject an out-of-file payload before waiting on its windows.
+    if (offset + block_payload_size > input.compressed_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading block payload");
+    }
+    wait_range_ready(offset, block_payload_size);
+    std::size_t decompressed_size = block_payload_size;
+    if (!uncompressed) {  // compressed: reserve the largest size the block may decode to
+      decompressed_size =
+        input.content_size_present_
+          ? std::min(input.block_max_size_, input.content_size_ - decompressed_offset)
+          : input.block_max_size_;
+    }
+    if (input.content_size_present_ &&
+        decompressed_size > input.content_size_ - decompressed_offset) {
+      mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size");
+    }
+    // Payload not contiguous in the resident windows: decode from a private copy (no ref).
+    const char* src          = resident.ptr_if_contiguous(offset, block_payload_size);
+    std::size_t window_index = std::numeric_limits<std::size_t>::max();
+    if (src == nullptr) {
+      crossing_payloads.emplace_back(block_payload_size);
+      resident.copy_to(offset, crossing_payloads.back().data(), block_payload_size);
+      src = crossing_payloads.back().data();
+    } else {
+      window_index = offset / window_bytes;
+      window_refs[window_index].fetch_add(1, std::memory_order_acq_rel);
+    }
+    resident_block_desc_t block{src,
+                                block_payload_size,
+                                decompressed_offset,
+                                decompressed_size,
+                                blocks_scanned.load(std::memory_order_relaxed),
+                                window_index,
+                                uncompressed};
+    decompressed_offset += decompressed_size;
+    offset += block_payload_size;
+    mark_windows_scanned_before(offset);
+    if (input.block_checksum_) {  // the 4 checksum bytes are stepped over, not verified
+      if (offset + 4 > input.compressed_size_) {
+        mps_parser_fail(error_type_t::ValidationError,
+                        "truncated LZ4 frame while reading block checksum");
+      }
+      wait_range_ready(offset, 4);
+      offset += 4;
+      mark_windows_scanned_before(offset);
+    }
+    return block;
+  }
+
+  void scan_frame_footer(std::size_t& offset, std::size_t decompressed_offset)
+  {  // Steps over the optional content checksum, then checks the frame ended where declared.
+    if (input.content_checksum_) {  // the 4 checksum bytes are stepped over, not verified
+      if (offset + 4 > input.compressed_size_) {
+        mps_parser_fail(error_type_t::ValidationError,
+                        "truncated LZ4 frame while reading content checksum");
+      }
+      wait_range_ready(offset, 4);  // only once the range is known to lie inside the input
+      offset += 4;
+      mark_windows_scanned_before(offset);
+    }
+    if (input.content_size_present_ && decompressed_offset != input.content_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 frame ended before declared content size was reached");
+    }
+    if (offset != input.compressed_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 input contains trailing data after the first frame");
+    }
+  }
+
+  Lz4InputStream& input;
+  const std::size_t window_bytes = lz4_pipeline_batch_bytes;
+  const std::size_t window_count;
+  std::vector<lz4_resident_window_t> windows;
+  const std::size_t io_threads;
+
+  std::exception_ptr first_error = nullptr;
+  std::mutex error_mutex;
+  std::atomic_bool stop_workers{false};
+
+  std::atomic_size_t next_window{0};
+  std::vector<unsigned char> window_done;
+  std::vector<std::atomic<uint32_t>> window_refs;
+  std::vector<std::atomic<uint8_t>> window_scanned;
+  std::vector<std::atomic<uint8_t>> window_released;
+  std::mutex window_mutex;
+  std::condition_variable window_cv;
+  std::mutex window_release_mutex;
+  std::atomic_size_t compressed_resident_bytes{0};
+
+  std::deque<std::vector<resident_block_desc_t>> desc_queue;
+  bool scanner_done = false;
+  std::mutex desc_mutex;
+  std::condition_variable desc_cv;
+
+  std::atomic_size_t blocks_scanned{0};
+  std::vector<std::vector<char>> crossing_payloads;
+  std::vector<std::thread> readers;
+  std::vector<std::thread> decoders;
+};
+
+void Lz4InputStream::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io);
+  lz4_pipeline_t pipeline(*this);
+  pipeline.run();
+  pipeline.finalize();
 }
 
 }  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
index 98c6e4885d..d7b299917b 100644
--- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp
+++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
@@ -53,6 +53,7 @@ class mmap_region_t {
 
   ~mmap_region_t() { reset(); }
 
+ private:
   static mmap_region_t map(
     void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context)
   {
@@ -64,6 +65,7 @@ class mmap_region_t {
     return mmap_region_t(ptr, size);
   }
 
+ public:
   static mmap_region_t anonymous(std::size_t size, int prot, int flags, const char* context)
   {
     return map(nullptr, size, prot, flags | MAP_ANONYMOUS, -1, 0, context);
@@ -89,7 +91,7 @@ class mmap_region_t {
 
     uintptr_t raw_addr     = reinterpret_cast<uintptr_t>(raw);
     uintptr_t aligned_addr = (raw_addr + alignment - 1) & ~(uintptr_t)(alignment - 1);
-    std::size_t prefix     = static_cast<std::size_t>(aligned_addr - raw_addr);
+    std::size_t prefix     = (std::size_t)(aligned_addr - raw_addr);
     std::size_t suffix     = raw_size - prefix - size;
     if (prefix > 0) { ::munmap(raw, prefix); }
     if (suffix > 0) { ::munmap(reinterpret_cast<void*>(aligned_addr + size), suffix); }
@@ -113,33 +115,14 @@ class mmap_region_t {
     size_ = 0;
   }
 
-  void reset(void* ptr, std::size_t size) noexcept
-  {
-    reset();
-    ptr_  = ptr;
-    size_ = size;
-  }
-
-  void* release() noexcept
-  {
-    void* ptr = ptr_;
-    ptr_      = nullptr;
-    size_     = 0;
-    return ptr;
-  }
-
   void advise(int advice) const noexcept
   {
     if (ptr_ != nullptr && size_ != 0) { ::madvise(ptr_, size_, advice); }
   }
 
   void* data() noexcept { return ptr_; }
-  const void* data() const noexcept { return ptr_; }
-  char* char_data() noexcept { return static_cast<char*>(ptr_); }
-  const char* char_data() const noexcept { return static_cast<const char*>(ptr_); }
+  char* char_data() noexcept { return (char*)ptr_; }
   std::size_t size() const noexcept { return size_; }
-  bool empty() const noexcept { return ptr_ == nullptr || size_ == 0; }
-  explicit operator bool() const noexcept { return !empty(); }
 
  private:
   void* ptr_        = nullptr;
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
index 8581921173..498b106955 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -6,6 +6,8 @@
 #include <utilities/error.hpp>
 
 #include <algorithm>
+#include <array>
+#include <cassert>
 #include <cstdint>
 #include <cstring>
 #include <initializer_list>
@@ -22,6 +24,33 @@ using cuopt::linear_programming::io::mps_parser_fail;
 
 namespace {
 
+struct section_record_t {
+  mps_section_kind kind;
+  const char* name;
+  std::size_t len;
+};
+
+constexpr section_record_t section_records[] = {
+  {mps_section_kind::rows, "ROWS", 4},
+  {mps_section_kind::columns, "COLUMNS", 7},
+  {mps_section_kind::rhs, "RHS", 3},
+  {mps_section_kind::bounds, "BOUNDS", 6},
+  {mps_section_kind::ranges, "RANGES", 6},
+  {mps_section_kind::quadobj, "QUADOBJ", 7},
+  {mps_section_kind::qmatrix, "QMATRIX", 7},
+  {mps_section_kind::qcmatrix, "QCMATRIX", 8},
+  {mps_section_kind::endata, "ENDATA", 6},
+};
+
+constexpr const char* header_records[] = {"NAME", "OBJSENSE", "OBJNAME"};
+
+constexpr std::size_t kSimdWidth = sizeof(simde__m256i);
+static_assert(kSimdWidth == 32);
+static_assert((std::size_t)mps_section_kind::rows == 0);
+static_assert((std::size_t)mps_section_kind::endata + 1 == std::size(section_records));
+static_assert((std::size_t)mps_phase_kind::header == 0);
+static_assert((std::size_t)mps_phase_kind::quadratic + 1 == 7);
+
 bool is_nonblank_column1(unsigned char c) noexcept { return c > ' '; }
 
 simde__m256i nonblank_column1_mask(simde__m256i bytes)
@@ -29,39 +58,21 @@ simde__m256i nonblank_column1_mask(simde__m256i bytes)
   return simde_mm256_cmpgt_epi8(bytes, simde_mm256_set1_epi8(' '));
 }
 
-const char* section_name(mps_section_kind kind)
+enum class section_record_match_t { invalid, header, section };
+
+bool line_has_record_prefix(const char* line_start, const char* line_end, const char* name)
 {
-  switch (kind) {
-    case mps_section_kind::rows: return "ROWS";
-    case mps_section_kind::columns: return "COLUMNS";
-    case mps_section_kind::rhs: return "RHS";
-    case mps_section_kind::bounds: return "BOUNDS";
-    case mps_section_kind::ranges: return "RANGES";
-    case mps_section_kind::quadobj: return "QUADOBJ";
-    case mps_section_kind::qmatrix: return "QMATRIX";
-    case mps_section_kind::qcmatrix: return "QCMATRIX";
-    case mps_section_kind::endata: return "ENDATA";
+  std::size_t len = std::strlen(name);
+  if ((std::size_t)(line_end - line_start) < len || std::memcmp(line_start, name, len) != 0) {
+    return false;  // line is shorter than `name` or does not start with it
   }
-  return "";
+  // The cast keeps bytes >= 0x80 from comparing as <= ' ' where plain char is signed.
+  return line_start + len == line_end || (unsigned char)line_start[len] <= ' ';
 }
 
-std::size_t section_name_len(mps_section_kind kind) { return std::strlen(section_name(kind)); }
-
 }  // namespace
 
-std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase)
-{
-  switch (phase) {
-    case mps_phase_kind::header: return 0;
-    case mps_phase_kind::rows: return 1;
-    case mps_phase_kind::columns: return 2;
-    case mps_phase_kind::rhs: return 3;
-    case mps_phase_kind::bounds: return 4;
-    case mps_phase_kind::ranges: return 5;
-    case mps_phase_kind::quadratic: return 6;
-  }
-  mps_parser_fail(error_type_t::RuntimeError, "invalid MPS phase kind");
-}
+std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase) { return (std::size_t)phase; }
 
 void mps_phase_registry_t::publish(mps_phase_kind phase, mps_phase_range_t range)
 {
@@ -105,68 +116,37 @@ bool mps_phase_registry_t::ready(mps_phase_kind phase) const
 
 mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const
 {
-  return ranges_[phase_index(phase)];
+  std::size_t idx = phase_index(phase);
+  assert(ready_[idx].load(std::memory_order_acquire));
+  return ranges_[idx];
 }
 
-bool line_is_section(const char* line_start, const char* line_end, mps_section_kind* kind)
+static section_record_match_t is_section_record(const char* line_start,
+                                                const char* line_end,
+                                                mps_section_kind* kind)
 {
-  if (line_start >= line_end) { return false; }
-
-  mps_section_kind candidate;
-  switch (*line_start) {
-    case 'R':
-      if (line_end - line_start >= 3 && std::memcmp(line_start, "RHS", 3) == 0) {
-        candidate = mps_section_kind::rhs;
-      } else if (line_end - line_start >= 4 && std::memcmp(line_start, "ROWS", 4) == 0) {
-        candidate = mps_section_kind::rows;
-      } else if (line_end - line_start >= 6 && std::memcmp(line_start, "RANGES", 6) == 0) {
-        candidate = mps_section_kind::ranges;
-      } else {
-        return false;
-      }
-      break;
-    case 'C':
-      if (line_end - line_start >= 7 && std::memcmp(line_start, "COLUMNS", 7) == 0) {
-        candidate = mps_section_kind::columns;
-      } else {
-        return false;
-      }
-      break;
-    case 'B':
-      if (line_end - line_start >= 6 && std::memcmp(line_start, "BOUNDS", 6) == 0) {
-        candidate = mps_section_kind::bounds;
-      } else {
-        return false;
-      }
-      break;
-    case 'E':
-      if (line_end - line_start >= 6 && std::memcmp(line_start, "ENDATA", 6) == 0) {
-        candidate = mps_section_kind::endata;
-      } else {
-        return false;
-      }
-      break;
-    case 'Q':
-      if (line_end - line_start >= 7 && std::memcmp(line_start, "QUADOBJ", 7) == 0) {
-        candidate = mps_section_kind::quadobj;
-      } else if (line_end - line_start >= 7 && std::memcmp(line_start, "QMATRIX", 7) == 0) {
-        candidate = mps_section_kind::qmatrix;
-      } else if (line_end - line_start >= 8 && std::memcmp(line_start, "QCMATRIX", 8) == 0) {
-        candidate = mps_section_kind::qcmatrix;
-      } else {
-        return false;
-      }
-      break;
-    default: return false;
+  if (line_start >= line_end) { return section_record_match_t::invalid; }
+
+  for (const char* name : header_records) {
+    if (line_has_record_prefix(line_start, line_end, name)) {
+      return section_record_match_t::header;
+    }
   }
 
-  const char* after = line_start + section_name_len(candidate);
-  while (after < line_end && (*after == ' ' || *after == '\t' || *after == '\r')) {
-    ++after;
+  for (const section_record_t& record : section_records) {
+    if ((std::size_t)(line_end - line_start) < record.len ||
+        std::memcmp(line_start, record.name, record.len) != 0) {
+      continue;
+    }
+    const char* after = line_start + record.len;
+    while (after < line_end && (*after == ' ' || *after == '\t' || *after == '\r')) {
+      ++after;
+    }
+    if (after != line_end) { return section_record_match_t::invalid; }
+    *kind = record.kind;
+    return section_record_match_t::section;
   }
-  if (after != line_end) { return false; }
-  *kind = candidate;
-  return true;
+  return section_record_match_t::invalid;
 }
 
 mps_section_block_scanner_t::mps_section_block_scanner_t(const char* data,
@@ -188,18 +168,7 @@ mps_section_block_scanner_t::mps_section_block_scanner_t(const char* data,
 
 std::size_t mps_section_block_scanner_t::section_hit_index(mps_section_kind kind)
 {
-  switch (kind) {
-    case mps_section_kind::rows: return 0;
-    case mps_section_kind::columns: return 1;
-    case mps_section_kind::rhs: return 2;
-    case mps_section_kind::bounds: return 3;
-    case mps_section_kind::ranges: return 4;
-    case mps_section_kind::quadobj: return 5;
-    case mps_section_kind::qmatrix: return 6;
-    case mps_section_kind::qcmatrix: return 7;
-    case mps_section_kind::endata: return 8;
-  }
-  return 0;
+  return (std::size_t)kind;
 }
 
 void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, const char* ptr)
@@ -212,11 +181,8 @@ void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, cons
   }
 }
 
-void mps_section_block_scanner_t::scan_section_range(const char* begin,
-                                                     const char* end,
-                                                     bool boundary_scan)
+void mps_section_block_scanner_t::scan_section_range(const char* begin, const char* end)
 {
-  (void)boundary_scan;
   if (begin >= end) return;
   const char* p = begin;
 
@@ -224,21 +190,39 @@ void mps_section_block_scanner_t::scan_section_range(const char* begin,
   // line. A separate boundary scan covers section titles whose newline/title
   // bytes straddle adjacent LZ4 blocks.
   if (p != data_) {
-    const void* nl = __builtin_memchr(p, '\n', static_cast<std::size_t>(end - p));
+    const void* nl = __builtin_memchr(p, '\n', (std::size_t)(end - p));
     if (nl == nullptr) { return; }
-    p = static_cast<const char*>(nl) + 1;
+    p = (const char*)nl + 1;
   }
 
   auto try_candidate = [&](const char* line_start) {
-    const void* nl = __builtin_memchr(line_start, '\n', static_cast<std::size_t>(end - line_start));
-    const char* line_end = nl == nullptr ? end : static_cast<const char*>(nl);
+    const void* nl       = __builtin_memchr(line_start, '\n', (std::size_t)(end - line_start));
+    const char* line_end = nl == nullptr ? end : (const char*)nl;
+    // A line without a newline is only looked at when it stops exactly at the published ready
+    // frontier. publish_ready() scans up to that frontier after every advance, not just at the
+    // end of the input, so such a line may be a prefix whose remaining bytes are not decoded
+    // yet. It may still match a section header, but it must not be rejected as invalid here.
+    const bool terminated = nl != nullptr;
+    if (!terminated && end != data_ + ready_bytes_.load(std::memory_order_acquire)) { return; }
+    // Lines starting with '*' or '$' are skipped without being matched.
+    if (*line_start == '*' || *line_start == '$') { return; }
     mps_section_kind kind;
-    if (line_is_section(line_start, line_end, &kind)) { record_section_hit(kind, line_start); }
+    section_record_match_t match = is_section_record(line_start, line_end, &kind);
+    if (match == section_record_match_t::section) {
+      record_section_hit(kind, line_start);
+    } else if (terminated && match == section_record_match_t::invalid) {
+      // Only a complete line proves an unknown record: a frontier-cut "RO" (of "ROWS") would
+      // otherwise abort the parse depending on where a decode block happens to end.
+      mps_parser_fail(error_type_t::ValidationError,
+                      "unknown section record: %.*s",
+                      (int)(line_end - line_start),
+                      line_start);
+    }
   };
 
   // Handle the very first line of a file (NAME indicator, usually)
   if (p == data_) {
-    if (p < end && is_nonblank_column1(static_cast<unsigned char>(*p))) { try_candidate(p); }
+    if (p < end && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); }
     ++p;
   }
 
@@ -246,24 +230,25 @@ void mps_section_block_scanner_t::scan_section_range(const char* begin,
   // begin in column 2+. Treat start-of-file or "\n[nonblank]" as the cheap
   // candidate signal, then run the exact section matcher only for candidates.
   const simde__m256i newline = simde_mm256_set1_epi8('\n');
-  while (static_cast<std::size_t>(end - p) >= 32) {
+  while ((std::size_t)(end - p) >= kSimdWidth) {
+    // The first-line path above increments p when p == data_, so p - 1 is
+    // in-bounds here. Loading the previous vector lets us test "\nX" for all
+    // 32 candidate column-1 bytes with one AVX2 mask.
     simde__m256i current  = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p));
     simde__m256i previous = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p - 1));
-    std::uint32_t mask = static_cast<std::uint32_t>(simde_mm256_movemask_epi8(simde_mm256_and_si256(
-      simde_mm256_cmpeq_epi8(previous, newline), nonblank_column1_mask(current))));
+    std::uint32_t mask    = (std::uint32_t)simde_mm256_movemask_epi8(simde_mm256_and_si256(
+      simde_mm256_cmpeq_epi8(previous, newline), nonblank_column1_mask(current)));
     while (mask != 0) {
       int bit = __builtin_ctz(mask);
       try_candidate(p + bit);
       mask &= mask - 1;
     }
-    p += 32;
+    p += kSimdWidth;
   }
 
   // scalar tail
   while (p < end) {
-    if (*(p - 1) == '\n' && is_nonblank_column1(static_cast<unsigned char>(*p))) {
-      try_candidate(p);
-    }
+    if (*(p - 1) == '\n' && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); }
     ++p;
   }
 }
@@ -277,7 +262,7 @@ void mps_section_block_scanner_t::scan_boundary(std::size_t left_index, std::siz
     boundary - left_begin > boundary_overlap ? boundary - boundary_overlap : left_begin;
   std::size_t end =
     right_end - boundary > boundary_overlap ? boundary + boundary_overlap : right_end;
-  scan_section_range(data_ + begin, data_ + end, true);
+  scan_section_range(data_ + begin, data_ + end);
 }
 
 void mps_section_block_scanner_t::observe_block(std::size_t block_index,
@@ -289,11 +274,9 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index,
                     "MPS section scanner observed invalid LZ4 block index");
   }
 
-  scan_section_range(begin, end, false);
-  block_begin_offsets_[block_index].store(static_cast<std::size_t>(begin - data_),
-                                          std::memory_order_relaxed);
-  block_end_offsets_[block_index].store(static_cast<std::size_t>(end - data_),
-                                        std::memory_order_relaxed);
+  scan_section_range(begin, end);
+  block_begin_offsets_[block_index].store((std::size_t)(begin - data_), std::memory_order_relaxed);
+  block_end_offsets_[block_index].store((std::size_t)(end - data_), std::memory_order_relaxed);
   block_decoded_[block_index].store(1, std::memory_order_release);
 
   if (block_index > 0 && block_decoded_[block_index - 1].load(std::memory_order_acquire)) {
@@ -308,11 +291,18 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index,
 void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes)
 {
   ready_bytes_.store(ready_bytes, std::memory_order_release);
+  std::size_t begin = ready_bytes > boundary_overlap ? ready_bytes - boundary_overlap : 0;
+  scan_section_range(data_ + begin, data_ + ready_bytes);
   publish_section_ranges();
 }
 
 void mps_section_block_scanner_t::publish_section_ranges()
 {
+  // Publication model: each present phase runs from its own section header to
+  // the first later section header that has been discovered. Optional sections
+  // publish present=false once a later boundary proves they cannot still appear.
+  // ENDATA, or final ready bytes for truncated/non-newline files, is the final
+  // boundary for the trailing optional/quadratic phases.
   std::lock_guard<std::mutex> lock(publish_mutex_);
   std::size_t ready     = ready_bytes_.load(std::memory_order_acquire);
   const char* ready_ptr = data_ + ready;
@@ -349,6 +339,21 @@ void mps_section_block_scanner_t::publish_section_ranges()
     }
     return best;
   };
+  auto publish_optional = [&](mps_phase_kind phase,
+                              const char* self,
+                              const char* predecessor,
+                              std::initializer_list<const char*> later_candidates) {
+    if (registry_.ready(phase)) { return; }
+    if (available(self)) {
+      const char* end = earliest_available_after(self, later_candidates);
+      if (end != nullptr) { registry_.publish(phase, {self, end, true}); }
+      return;
+    }
+    if (predecessor != nullptr &&
+        earliest_available_after(predecessor, later_candidates) != nullptr) {
+      registry_.publish(phase, {nullptr, nullptr, false});
+    }
+  };
 
   if (available(rows) && !registry_.ready(mps_phase_kind::header)) {
     registry_.publish(mps_phase_kind::header, {data_, rows, true});
@@ -364,43 +369,18 @@ void mps_section_block_scanner_t::publish_section_ranges()
     }
   }
 
-  if (!registry_.ready(mps_phase_kind::rhs)) {
-    if (available(rhs)) {
-      const char* rhs_end =
-        earliest_available_after(rhs, {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
-      if (rhs_end != nullptr) { registry_.publish(mps_phase_kind::rhs, {rhs, rhs_end, true}); }
-    } else {
-      const char* after_columns = earliest_available_after(
-        columns, {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
-      if (after_columns != nullptr) {
-        registry_.publish(mps_phase_kind::rhs, {nullptr, nullptr, false});
-      }
-    }
-  }
-
-  if (!registry_.ready(mps_phase_kind::ranges)) {
-    const char* ranges_end =
-      earliest_available_after(ranges, {bounds, quadobj, qmatrix, qcmatrix, final_boundary});
-    const char* after_rhs = earliest_available_after(
-      rhs ? rhs : columns, {bounds, quadobj, qmatrix, qcmatrix, final_boundary});
-    if (available(ranges) && ranges_end != nullptr) {
-      registry_.publish(mps_phase_kind::ranges, {ranges, ranges_end, true});
-    } else if (!ranges && after_rhs != nullptr) {
-      registry_.publish(mps_phase_kind::ranges, {nullptr, nullptr, false});
-    }
-  }
-
-  if (!registry_.ready(mps_phase_kind::bounds)) {
-    const char* bounds_end =
-      earliest_available_after(bounds, {quadobj, qmatrix, qcmatrix, final_boundary});
-    const char* after_ranges = earliest_available_after(
-      ranges ? ranges : (rhs ? rhs : columns), {quadobj, qmatrix, qcmatrix, final_boundary});
-    if (available(bounds) && bounds_end != nullptr) {
-      registry_.publish(mps_phase_kind::bounds, {bounds, bounds_end, true});
-    } else if (!bounds && after_ranges != nullptr) {
-      registry_.publish(mps_phase_kind::bounds, {nullptr, nullptr, false});
-    }
-  }
+  publish_optional(mps_phase_kind::rhs,
+                   rhs,
+                   columns,
+                   {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+  publish_optional(mps_phase_kind::ranges,
+                   ranges,
+                   rhs ? rhs : columns,
+                   {bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+  publish_optional(mps_phase_kind::bounds,
+                   bounds,
+                   ranges ? ranges : (rhs ? rhs : columns),
+                   {quadobj, qmatrix, qcmatrix, final_boundary});
 
   if (!registry_.ready(mps_phase_kind::quadratic)) {
     const char* quadratic_begin = nullptr;
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
index cc287368fb..74bf89da7f 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
@@ -47,6 +47,8 @@ class mps_phase_registry_t {
   void attach_event(mps_phase_kind phase, omp_event_handle_t event);
 
   bool ready(mps_phase_kind phase) const;
+  // range() is lock-free: callers must observe ready(phase)==true first. The
+  // acquire load in ready() pairs with publish()'s release store before ranges_.
   mps_phase_range_t range(mps_phase_kind phase) const;
 
  private:
@@ -62,8 +64,6 @@ class mps_phase_registry_t {
   mutable std::mutex mutex_;
 };
 
-bool line_is_section(const char* line_start, const char* line_end, mps_section_kind* kind);
-
 class mps_section_block_scanner_t {
  public:
   mps_section_block_scanner_t(const char* data,
@@ -74,12 +74,14 @@ class mps_section_block_scanner_t {
   void publish_ready(std::size_t ready_bytes);
 
  private:
-  static constexpr std::size_t section_count    = 9;
+  static constexpr std::size_t section_count = 9;
+  // Section titles are short; 128 bytes is enough to rescan around a decoded
+  // block boundary and catch a newline/title pair split across adjacent blocks.
   static constexpr std::size_t boundary_overlap = 128;
 
   static std::size_t section_hit_index(mps_section_kind kind);
 
-  void scan_section_range(const char* begin, const char* end, bool boundary_scan);
+  void scan_section_range(const char* begin, const char* end);
   void scan_boundary(std::size_t left_index, std::size_t right_index);
   void record_section_hit(mps_section_kind kind, const char* ptr);
   void publish_section_ranges();
diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
index 23f4b4b8c1..f8a6d04d1e 100644
--- a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
+++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
@@ -121,7 +121,7 @@ class scoped_range {
 inline void name_current_thread(const char* name)
 {
 #ifdef MPS_FAST_NVTX
-  nvtxNameOsThreadA(static_cast<std::uint32_t>(::syscall(SYS_gettid)), name);
+  nvtxNameOsThreadA((std::uint32_t)::syscall(SYS_gettid), name);
 #else
   (void)name;
 #endif
diff --git a/cpp/src/io/experimental_mps_fast/simd_compat.hpp b/cpp/src/io/experimental_mps_fast/simd_compat.hpp
deleted file mode 100644
index fb849fcff0..0000000000
--- a/cpp/src/io/experimental_mps_fast/simd_compat.hpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
-// reserved. SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-// Use SIMDe's explicit simde_* API. On x86 it can still lower to native
-// intrinsics; on other targets it provides the portable implementation.
-#include <simde/x86/aes.h>
-#include <simde/x86/avx2.h>
-#include <simde/x86/sse4.2.h>
diff --git a/cpp/tests/linear_programming/CMakeLists.txt b/cpp/tests/linear_programming/CMakeLists.txt
index bc057db1e2..fcceb4af56 100644
--- a/cpp/tests/linear_programming/CMakeLists.txt
+++ b/cpp/tests/linear_programming/CMakeLists.txt
@@ -21,6 +21,44 @@ ConfigureTest(MPS_PARSER_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/parser_test.cpp
     LABELS numopt)
 
+# Builds TEST_SOURCE into an executable named CMAKE_TEST_NAME with the experimental_mps_fast
+# sources on its include path, registers it with CTest under the "numopt" label, and installs
+# it with the "testing" component into bin/gtests/libcuopt.
+function(ConfigureStandaloneMpsFastTest CMAKE_TEST_NAME TEST_SOURCE)
+    add_executable(${CMAKE_TEST_NAME} ${TEST_SOURCE})
+    target_compile_features(${CMAKE_TEST_NAME} PRIVATE cxx_std_20)
+    target_compile_options(${CMAKE_TEST_NAME}
+        PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUOPT_CXX_FLAGS}>")
+    target_include_directories(${CMAKE_TEST_NAME} PRIVATE
+        "${CUOPT_TEST_DIR}/../src"
+        "${CUOPT_TEST_DIR}/../src/io"
+        "${CUOPT_TEST_DIR}/../src/io/experimental_mps_fast"
+    )
+    target_link_libraries(${CMAKE_TEST_NAME} PRIVATE
+        cuopt
+        simde::simde
+        ${CUOPT_PRIVATE_CUDA_LIBS}
+    )
+    # The new-dtags linker flag is only requested when INSTALL_TARGET is unset or empty.
+    if(NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
+      target_link_options(${CMAKE_TEST_NAME} PRIVATE -Wl,--enable-new-dtags)
+    endif()
+
+    add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME})
+    set_tests_properties(${CMAKE_TEST_NAME} PROPERTIES LABELS "numopt")
+
+    install(TARGETS ${CMAKE_TEST_NAME}
+        COMPONENT testing
+        DESTINATION bin/gtests/libcuopt
+        EXCLUDE_FROM_ALL
+    )
+endfunction()
+
+ConfigureStandaloneMpsFastTest(MPS_FAST_FP64_PARSER_TEST
+    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_fp64_parser_test.cpp)
+ConfigureStandaloneMpsFastTest(MPS_FAST_PARSER_EDGE_TEST
+    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_edge_test.cpp)
+
 # ##################################################################################################
 # - C API Tests----------------------------------------------------------------------
 # The C API tests require a separate library to be linked against. So we don't use the ConfigureTest macro.
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
new file mode 100644
index 0000000000..36171267cf
--- /dev/null
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
@@ -0,0 +1,231 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "fast_fp64_parser.hpp"
+
+#include <algorithm>
+#include <bit>
+#include <cerrno>
+#include <clocale>
+#include <cstdint>
+#include <cstdlib>
+#include <exception>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace {
+
+uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }
+
+[[noreturn]] void fail(const std::string& message) { throw std::runtime_error(message); }
+
+void expect_true(bool condition, const std::string& message)
+{
+  if (!condition) { fail(message); }
+}
+
+// Fails, reporting `got - expected`, unless both cursors address the same byte.
+void expect_eq_ptr(const char* got, const char* expected, std::string_view context)
+{
+  if (got == expected) { return; }
+  std::ostringstream message;
+  message << context << ": pointer mismatch got_delta=" << (got - expected);
+  fail(message.str());
+}
+
+// strtod reference for `token`; every 'd'/'D' is rewritten to 'e' before parsing.
+double reference_strtod(std::string_view token)
+{
+  std::string normalized(token);
+  std::replace_if(
+    normalized.begin(), normalized.end(), [](char c) { return c == 'd' || c == 'D'; }, 'e');
+  char* stop          = nullptr;
+  errno               = 0;
+  const double parsed = std::strtod(normalized.c_str(), &stop);
+  expect_eq_ptr(stop, normalized.c_str() + normalized.size(), token);
+  return parsed;
+}
+
+double parse_token(std::string_view token)
+{
+  const char* p = token.data();
+  return mps_fast::fp64::parse_fp64_advance(p, token.data() + token.size());
+}
+
+// Parses `token` followed by 40 spaces and checks the cursor stops right after the token.
+double parse_padded_token(std::string_view token)
+{
+  const std::string padded = std::string(token) + std::string(40, ' ');
+  const char* cursor       = padded.data();
+  const double parsed = mps_fast::fp64::parse_fp64_advance(cursor, padded.data() + padded.size());
+  expect_eq_ptr(cursor, padded.data() + token.size(), token);
+  return parsed;
+}
+
+// Requires the fast parser to reproduce strtod bit-for-bit, both for the exact-length token
+// and for the same token followed by padding.
+void expect_bitwise_strtod(std::string_view token)
+{
+  const uint64_t expected = bits(reference_strtod(token));
+  const uint64_t exact    = bits(parse_token(token));
+  const uint64_t padded   = bits(parse_padded_token(token));
+  if (exact == expected && padded == expected) { return; }
+  std::ostringstream message;
+  message << "bitwise mismatch for '" << token << "' ref=0x" << std::hex << expected
+          << " token=0x" << exact << " padded=0x" << padded;
+  fail(message.str());
+}
+
+std::string random_token(std::mt19937_64& rng)
+{
+  std::uniform_int_distribution<int> sign_dist(0, 4);  // 0: '-', 1: '+', 2..4: no sign
+  std::uniform_int_distribution<int> digit_dist(0, 9);
+  std::uniform_int_distribution<int> shape_dist(0, 5);  // 0: "0.frac", 1: int, 2..5: int.frac
+  std::uniform_int_distribution<int> len_dist(1, 19);
+  std::uniform_int_distribution<int> exp_dist(-30, 30);
+
+  std::string token;
+  int sign = sign_dist(rng);
+  if (sign == 0) {
+    token.push_back('-');
+  } else if (sign == 1) {
+    token.push_back('+');
+  }
+
+  int shape = shape_dist(rng);
+  if (shape == 0) {
+    token.append("0.");
+    int frac_len = std::uniform_int_distribution<int>(1, 19)(rng);
+    for (int i = 0; i < frac_len; ++i) {
+      token.push_back(static_cast<char>('0' + digit_dist(rng)));
+    }
+  } else {
+    int int_len = len_dist(rng);
+    token.push_back(static_cast<char>('1' + std::uniform_int_distribution<int>(0, 8)(rng)));
+    for (int i = 1; i < int_len; ++i) {
+      token.push_back(static_cast<char>('0' + digit_dist(rng)));
+    }
+    if (shape >= 2) {
+      token.push_back('.');
+      int remaining = 24 - static_cast<int>(token.size());  // room left before 24 characters
+      int max_frac  = std::max(0, std::min(19, remaining));
+      int frac_len  = max_frac == 0 ? 0 : std::uniform_int_distribution<int>(0, max_frac)(rng);
+      for (int i = 0; i < frac_len; ++i) {
+        token.push_back(static_cast<char>('0' + digit_dist(rng)));
+      }
+    }
+  }
+
+  if (shape == 5) {  // only shape 5 may receive an exponent suffix
+    int exp            = exp_dist(rng);
+    std::string suffix = "e" + std::to_string(exp);
+    if (token.size() + suffix.size() <= 25) { token += suffix; }  // skipped if it would not fit
+  }
+
+  if (token.size() > 25) { token.resize(25); }
+  return token;
+}
+
+void common_table_matches_strtod_bitwise()
+{
+  std::setlocale(LC_NUMERIC, "C");
+  const std::vector<std::string_view> cases = {
+    "0",
+    "-0",
+    "1",
+    "-1",
+    "+1",
+    "2",
+    "42",
+    "123456789",
+    "57.",
+    "-57.",
+    "0.1",
+    "0.01",
+    "0.12345678901234",
+    "0.1234567890123456",
+    "0.3333333333333333",
+    "0.6508282938248958",
+    "3.14159",
+    "3130000",
+    "8594600.16",
+    "2344.55",
+    "0.000000000000001",
+    "9999999999999999",
+    "1844674407370955161",
+    "1e0",
+    "1e-9",
+    "1E12",
+    "-2.5e3",
+    "3.125D-2",
+  };
+
+  for (std::string_view token : cases) {
+    expect_bitwise_strtod(token);
+  }
+}
+
+void cursor_advances_to_token_end()
+{
+  std::setlocale(LC_NUMERIC, "C");
+  std::string text = "123.45  ABC";
+  const char* p    = text.data();
+  double value     = mps_fast::fp64::parse_fp64_advance(p, text.data() + text.size());
+
+  expect_true(bits(value) == bits(reference_strtod("123.45")), "parsed value mismatch");
+  expect_eq_ptr(p, text.data() + 6, "cursor_advances_to_token_end");
+  expect_true(std::string_view(p, 5) == "  ABC", "cursor did not stop before trailing field");
+}
+
+void fixed_seed_random_differential()
+{
+  std::setlocale(LC_NUMERIC, "C");
+  std::mt19937_64 rng(0x4d50535f46415354ULL);
+  for (int i = 0; i < 100000; ++i) {
+    std::string token = random_token(rng);
+    expect_true(token.size() <= 25U, "generated token exceeds MPS numeric token length");
+    expect_bitwise_strtod(token);
+  }
+}
+
+}  // namespace
+
+// Runs each case in order, printing a status line per case; exit code is 1 if any case threw.
+int main()
+{
+  struct TestCase {
+    const char* name;
+    void (*fn)();
+  };
+  const TestCase tests[] = {
+    {"CommonTableMatchesStrtodBitwise", common_table_matches_strtod_bitwise},
+    {"CursorAdvancesToTokenEnd", cursor_advances_to_token_end},
+    {"FixedSeedRandomDifferential", fixed_seed_random_differential},
+  };
+
+  int failures = 0;
+  for (const auto& [name, run] : tests) {
+    std::cout << "[ RUN      ] " << name << '\n';
+    try {
+      run();
+      std::cout << "[       OK ] " << name << '\n';
+    } catch (const std::exception& error) {
+      ++failures;
+      std::cerr << "[  FAILED  ] " << name << ": " << error.what() << '\n';
+    }
+  }
+
+  if (failures == 0) {
+    std::cout << "[  PASSED  ] " << std::size(tests) << " test(s)\n";
+    return 0;
+  }
+  std::cerr << failures << " test(s) failed\n";
+  return 1;
+}
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
new file mode 100644
index 0000000000..2e087ec4ee
--- /dev/null
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
@@ -0,0 +1,871 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "fast_parser.hpp"
+#include "mps_section_scanner.hpp"
+
+#include <cuopt/linear_programming/io/parser.hpp>
+
+#include <algorithm>
+#include <bit>
+#include <cerrno>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include <unistd.h>
+
+namespace {
+
+struct skip_test : std::runtime_error {
+  using std::runtime_error::runtime_error;
+};
+
+[[noreturn]] void fail(const std::string& message) { throw std::runtime_error(message); }
+
+void expect_true(bool condition, const std::string& message)
+{
+  if (!condition) { fail(message); }
+}
+
+template <typename A, typename B>
+void expect_eq(const A& got, const B& expected, std::string_view context)
+{
+  if (!(got == expected)) {
+    std::ostringstream out;
+    out << context << ": got=" << got << " expected=" << expected;
+    fail(out.str());
+  }
+}
+
+// Fails unless sizes match and every element compares equal; reports the first mismatch index.
+template <typename VecA, typename VecB>
+void expect_vector_eq(const VecA& got, const VecB& expected, std::string_view context)
+{
+  std::ostringstream message;
+  message << context;
+  if (got.size() != expected.size()) {
+    message << ": size got=" << got.size() << " expected=" << expected.size();
+    fail(message.str());
+  }
+  for (size_t index = 0; index < got.size(); ++index) {
+    if (got[index] == expected[index]) { continue; }
+    message << ": first mismatch at " << index;
+    fail(message.str());
+  }
+}
+
+void expect_near_inf(double value, int sign, std::string_view context)
+{
+  expect_true(std::isinf(value), std::string(context) + ": expected infinity");
+  expect_true(std::signbit(value) == (sign < 0), std::string(context) + ": wrong infinity sign");
+}
+
+// Temporary on-disk MPS fixture: the constructor writes `contents` to a unique /tmp/*.mps file
+// (throwing via fail() on any I/O error) and the destructor unlinks it.
+struct TempMpsFile {
+  explicit TempMpsFile(std::string contents)
+    : path("/tmp/mps_fast_parser_edge_" + std::to_string(getpid()) + "_XXXXXX.mps")
+  {
+    int fd = mkstemps(path.data(), 4);  // fills in the XXXXXX part of `path` in place
+    if (fd < 0) { fail(std::string("mkstemps failed: ") + std::strerror(errno)); }
+    // ~TempMpsFile() is skipped when this constructor throws, so every failure below unlinks
+    // the file itself; errno is captured before close()/fclose() can overwrite it.
+    auto abort_with = [&](const char* what, int err) {
+      std::remove(path.c_str());
+      fail(std::string(what) + std::strerror(err));
+    };
+    FILE* file = fdopen(fd, "wb");
+    if (file == nullptr) {
+      const int err = errno;
+      close(fd);
+      abort_with("fdopen failed: ", err);
+    }
+    if (!contents.empty() &&
+        std::fwrite(contents.data(), 1, contents.size(), file) != contents.size()) {
+      const int err = errno;
+      std::fclose(file);
+      abort_with("failed to write temporary MPS file: ", err);
+    }
+    if (std::fclose(file) != 0) { abort_with("failed to close temporary MPS file: ", errno); }
+  }
+
+  TempMpsFile(const TempMpsFile&)            = delete;
+  TempMpsFile& operator=(const TempMpsFile&) = delete;
+
+  ~TempMpsFile() { std::remove(path.c_str()); }
+
+  std::string path;
+};
+
+struct TempOwnedPath {
+  explicit TempOwnedPath(std::string p) : path(std::move(p)) {}  // adopts an existing path
+  TempOwnedPath(const TempOwnedPath&)            = delete;
+  TempOwnedPath& operator=(const TempOwnedPath&) = delete;
+
+  ~TempOwnedPath()
+  {
+    if (!path.empty()) { std::remove(path.c_str()); }
+  }
+
+  std::string path;  // removed from disk by the destructor when non-empty
+};
+
+template <typename Fn>
+void expect_throws(Fn&& fn, std::string_view context)
+{
+  try {
+    fn();
+  } catch (const std::exception&) {
+    return;
+  }
+  fail(std::string(context) + ": expected exception");
+}
+
+void expect_fast_parse_error(std::string_view fixture_name, std::string contents)
+{
+  TempMpsFile file(std::move(contents));
+  expect_throws(
+    [&] {
+      (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+    },
+    fixture_name);
+}
+
+std::string_view range_text(const mps_fast::mps_phase_range_t& range)
+{
+  if (!range.present) { return {}; }
+  return std::string_view(range.begin, static_cast<size_t>(range.end - range.begin));
+}
+
+void scanner_finds_section_split_across_blocks()
+{
+  const std::string mps =
+    "NAME EDGE\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " x1 OBJ 1\n"
+    " x1 rowA 2\n"
+    "RHS\n"
+    " rhs rowA 3\n"
+    "ENDATA\n";
+
+  const size_t columns_pos = mps.find("COLUMNS");
+  expect_true(columns_pos != std::string::npos, "failed to place COLUMNS split");
+  const size_t split = columns_pos + 3;
+
+  mps_fast::mps_phase_registry_t registry;
+  mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+
+  scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());
+  scanner.publish_ready(0);
+  scanner.observe_block(0, mps.data(), mps.data() + split);
+  scanner.publish_ready(mps.size());
+
+  expect_true(registry.ready(mps_fast::mps_phase_kind::header), "header not ready");
+  expect_true(registry.ready(mps_fast::mps_phase_kind::rows), "rows not ready");
+  expect_true(registry.ready(mps_fast::mps_phase_kind::columns), "columns not ready");
+  expect_true(registry.ready(mps_fast::mps_phase_kind::rhs), "rhs not ready");
+  expect_true(registry.ready(mps_fast::mps_phase_kind::quadratic), "quadratic sentinel not ready");
+
+  expect_true(range_text(registry.range(mps_fast::mps_phase_kind::columns)).starts_with("COLUMNS"),
+              "columns range begins at wrong boundary");
+  expect_true(range_text(registry.range(mps_fast::mps_phase_kind::rhs)).starts_with("RHS"),
+              "rhs range begins at wrong boundary");
+}
+
+void scanner_rejects_unknown_column_one_records_after_rows()
+{
+  const std::string mps =
+    "NAME BAD\n"
+    "ROWS\n"
+    " N OBJ\n"
+    "FOO\n"
+    "COLUMNS\n"
+    " x OBJ 1\n"
+    "ENDATA\n";
+
+  expect_throws(
+    [&] {
+      mps_fast::mps_phase_registry_t registry;
+      mps_fast::mps_section_block_scanner_t scanner(mps.data(), 1, registry);
+      scanner.observe_block(0, mps.data(), mps.data() + mps.size());
+      scanner.publish_ready(mps.size());
+    },
+    "unknown column-1 record after ROWS");
+}
+
+uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }
+
+void expect_double_bitwise_eq(double got, double expected, std::string_view context)
+{
+  if (bits(got) != bits(expected)) {
+    std::ostringstream out;
+    out << context << ": got=0x" << std::hex << bits(got) << " expected=0x" << bits(expected);
+    fail(out.str());
+  }
+}
+
+// Fails unless sizes match and each pair of doubles has identical bit patterns.
+template <typename VecA, typename VecB>
+void expect_double_vector_bitwise_eq(const VecA& got,
+                                     const VecB& expected,
+                                     std::string_view context)
+{
+  std::ostringstream message;
+  message << context;
+  if (got.size() != expected.size()) {
+    message << ": size got=" << got.size() << " expected=" << expected.size();
+    fail(message.str());
+  }
+  for (size_t i = 0; i < got.size(); ++i) {
+    if (bits(got[i]) == bits(expected[i])) { continue; }
+    message << ": first bitwise mismatch at " << i << " got=0x" << std::hex << bits(got[i])
+            << " expected=0x" << bits(expected[i]);
+    fail(message.str());
+  }
+}
+
+void expect_models_match_reference_bitwise(
+  const mps_fast::parser_model_t<int, double>& fast,
+  const cuopt::linear_programming::io::mps_data_model_t<int, double>& reference,
+  std::string_view context)
+{
+  expect_eq(fast.n_vars_, reference.n_vars_, std::string(context) + " n_vars");
+  expect_eq(fast.n_constraints_, reference.n_constraints_, std::string(context) + " n_constraints");
+  expect_eq(fast.nnz_, reference.nnz_, std::string(context) + " nnz");
+  expect_eq(fast.maximize_, reference.maximize_, std::string(context) + " maximize");
+  expect_eq(fast.problem_name_, reference.problem_name_, std::string(context) + " problem_name");
+  expect_eq(
+    fast.objective_name_, reference.objective_name_, std::string(context) + " objective_name");
+
+  expect_double_bitwise_eq(fast.objective_scaling_factor_,
+                           reference.objective_scaling_factor_,
+                           std::string(context) + " objective_scaling_factor");
+  expect_double_bitwise_eq(fast.objective_offset_,
+                           reference.objective_offset_,
+                           std::string(context) + " objective_offset");
+
+  expect_double_vector_bitwise_eq(fast.A_, reference.A_, std::string(context) + " A");
+  expect_vector_eq(fast.A_indices_, reference.A_indices_, std::string(context) + " A_indices");
+  expect_vector_eq(fast.A_offsets_, reference.A_offsets_, std::string(context) + " A_offsets");
+  expect_double_vector_bitwise_eq(fast.b_, reference.b_, std::string(context) + " b");
+  expect_double_vector_bitwise_eq(fast.c_, reference.c_, std::string(context) + " c");
+  expect_double_vector_bitwise_eq(fast.variable_lower_bounds_,
+                                  reference.variable_lower_bounds_,
+                                  std::string(context) + " variable_lower_bounds");
+  expect_double_vector_bitwise_eq(fast.variable_upper_bounds_,
+                                  reference.variable_upper_bounds_,
+                                  std::string(context) + " variable_upper_bounds");
+  expect_double_vector_bitwise_eq(fast.constraint_lower_bounds_,
+                                  reference.constraint_lower_bounds_,
+                                  std::string(context) + " constraint_lower_bounds");
+  expect_double_vector_bitwise_eq(fast.constraint_upper_bounds_,
+                                  reference.constraint_upper_bounds_,
+                                  std::string(context) + " constraint_upper_bounds");
+  expect_vector_eq(fast.var_types_, reference.var_types_, std::string(context) + " var_types");
+  expect_vector_eq(fast.row_types_, reference.row_types_, std::string(context) + " row_types");
+  expect_vector_eq(fast.var_names_, reference.var_names_, std::string(context) + " var_names");
+  expect_vector_eq(fast.row_names_, reference.row_names_, std::string(context) + " row_names");
+}
+
+void verify_fixture_bitwise(std::string_view fixture_name, std::string contents)
+{
+  TempMpsFile file(std::move(contents));
+  auto fast = mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+  auto reference = cuopt::linear_programming::io::read_mps<int, double>(file.path, false);
+  expect_models_match_reference_bitwise(fast, reference, fixture_name);
+}
+
+std::string row_name(size_t i)
+{
+  std::ostringstream out;
+  out << 'R' << std::setw(6) << std::setfill('0') << i;
+  return out.str();
+}
+
+size_t find_var(const mps_fast::parser_model_t<int, double>& model, std::string_view name)
+{
+  for (size_t i = 0; i < model.var_names_.size(); ++i) {
+    if (model.var_names_[i] == name) { return i; }
+  }
+  fail("variable not found: " + std::string(name));
+}
+
+void expect_model_shapes(const mps_fast::parser_model_t<int, double>& model,
+                         int rows,
+                         int vars,
+                         int nnz,
+                         std::string_view context)
+{
+  expect_eq(model.n_constraints_, rows, std::string(context) + " rows");
+  expect_eq(model.n_vars_, vars, std::string(context) + " vars");
+  expect_eq(model.nnz_, nnz, std::string(context) + " nnz");
+  expect_eq(
+    model.A_offsets_.size(), static_cast<size_t>(rows + 1), std::string(context) + " offsets");
+  expect_eq(model.A_.size(), static_cast<size_t>(nnz), std::string(context) + " values");
+  expect_eq(model.A_indices_.size(), static_cast<size_t>(nnz), std::string(context) + " indices");
+}
+
+std::string section_split_fixture()
+{
+  return "NAME SPLITS\n"
+         "ROWS\n"
+         " N OBJ\n"
+         " L R1\n"
+         "COLUMNS\n"
+         " X1 OBJ 1 R1 2\n"
+         "RHS\n"
+         " RHS1 R1 3\n"
+         "BOUNDS\n"
+         " UP BND X1 4\n"
+         "ENDATA\n";
+}
+
+// Splits each section title at every interior byte; failures name the (title, offset) pair.
+void scanner_finds_headers_split_at_every_byte()
+{
+  const std::string mps                       = section_split_fixture();
+  const std::vector<std::string_view> headers = {"ROWS", "COLUMNS", "RHS", "BOUNDS", "ENDATA"};
+
+  for (std::string_view header : headers) {
+    const size_t pos = mps.find(header);
+    expect_true(pos != std::string::npos, "missing header in split fixture");
+    for (size_t offset = 1; offset < header.size(); ++offset) {
+      const std::string where =
+        " after split of " + std::string(header) + " at +" + std::to_string(offset);
+      mps_fast::mps_phase_registry_t registry;
+      mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+      scanner.observe_block(1, mps.data() + pos + offset, mps.data() + mps.size());
+      scanner.observe_block(0, mps.data(), mps.data() + pos + offset);
+      scanner.publish_ready(mps.size());
+
+      expect_true(registry.ready(mps_fast::mps_phase_kind::rows), "rows not ready" + where);
+      expect_true(registry.ready(mps_fast::mps_phase_kind::columns), "columns not ready" + where);
+      expect_true(registry.ready(mps_fast::mps_phase_kind::rhs), "rhs not ready" + where);
+      expect_true(registry.ready(mps_fast::mps_phase_kind::bounds), "bounds not ready" + where);
+      expect_true(registry.ready(mps_fast::mps_phase_kind::quadratic),
+                  "quadratic sentinel not ready" + where);
+    }
+  }
+}
+
+void bounds_defaults_and_types_match_reference()
+{
+  verify_fixture_bitwise("bounds_defaults_and_types",
+                         "NAME BOUNDS_EDGE\n"
+                         "ROWS\n"
+                         " N OBJ\n"
+                         " L rowA\n"
+                         "COLUMNS\n"
+                         " XFREE rowA 1\n"
+                         " XUP0 rowA 1\n"
+                         " XNEG rowA 1\n"
+                         " XBV rowA 1\n"
+                         " XFX rowA 1\n"
+                         " XLI rowA 1\n"
+                         "RHS\n"
+                         " RHS1 rowA 10\n"
+                         "BOUNDS\n"
+                         " FR BND XFREE\n"
+                         " UP BND XUP0 0\n"
+                         " UP BND XNEG -1\n"
+                         " BV BND XBV\n"
+                         " FX BND XFX 7\n"
+                         " LI BND XLI 2\n"
+                         " UI BND XLI 9\n"
+                         "ENDATA\n");
+}
+
+void duplicate_bounds_last_statement_wins()
+{
+  const std::string contents =
+    "NAME BOUNDS_DUP\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " X1 rowA 1\n"
+    "RHS\n"
+    " RHS1 rowA 10\n"
+    "BOUNDS\n"
+    " LO BND X1 0\n"
+    " UP BND X1 5\n"
+    " UP BND X1 3\n"
+    " LO BND X1 2\n"
+    "ENDATA\n";
+
+  verify_fixture_bitwise("duplicate_bounds_last_statement_wins", contents);
+  TempMpsFile file(contents);
+  auto model =
+    mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+  expect_eq(model.n_vars_, 1, "n_vars");
+  expect_eq(model.variable_lower_bounds_.at(0), 2.0, "duplicate lower bound");
+  expect_eq(model.variable_upper_bounds_.at(0), 3.0, "duplicate upper bound");
+}
+
+void nondense_row_and_column_names_use_hash_path()
+{
+  verify_fixture_bitwise("nondense_row_and_column_names",
+                         "NAME HASH_NAMES\n"
+                         "ROWS\n"
+                         " N obj.row\n"
+                         " G demand-east\n"
+                         " L capacity-west\n"
+                         " E balance.17\n"
+                         "COLUMNS\n"
+                         " alpha obj.row 4.5 demand-east 1\n"
+                         " beta_two capacity-west -2 balance.17 3\n"
+                         " z-last demand-east 7 balance.17 -1\n"
+                         "RHS\n"
+                         " rhs demand-east 2 capacity-west 9\n"
+                         " rhs balance.17 0\n"
+                         "BOUNDS\n"
+                         " LO b alpha -5\n"
+                         " UP b beta_two 6\n"
+                         " FR b z-last\n"
+                         "ENDATA\n");
+}
+
+void missing_optional_bounds_fast_path()
+{
+  TempMpsFile fixture(
+    "NAME OPTIONALS\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 rowA 2\n"
+    "RHS\n"
+    " RHS1 rowA 0\n"
+    "ENDATA\n");
+  // The fixture has no BOUNDS section, so X1 must come back with the default bounds.
+  auto parsed =
+    mps_fast::parse_mps_fast_file<int, double>(fixture.path, mps_fast::FileReadMethod::Read);
+  expect_eq(parsed.n_vars_, 1, "missing optional n_vars");
+  expect_eq(parsed.n_constraints_, 1, "missing optional n_constraints");
+  expect_eq(parsed.variable_lower_bounds_.at(0), 0.0, "missing BOUNDS lower default");
+  expect_near_inf(parsed.variable_upper_bounds_.at(0), 1, "missing BOUNDS upper default");
+}
+
+void bounds_only_variables_are_appended_deterministically()
+{
+  TempMpsFile fixture(
+    "NAME BOUNDS_ONLY\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " XMAIN OBJ 1 R1 2\n"
+    "RHS\n"
+    " RHS1 R1 0\n"
+    "BOUNDS\n"
+    " UP B AUX_Z 9\n"
+    " LO B AUX_Z -3\n"
+    " BV B AUX_A\n"
+    " SC B AUX_S 5\n"
+    "ENDATA\n");
+  // Only XMAIN appears in COLUMNS; AUX_Z, AUX_A and AUX_S are introduced by BOUNDS records.
+  auto parsed =
+    mps_fast::parse_mps_fast_file<int, double>(fixture.path, mps_fast::FileReadMethod::Read);
+  expect_model_shapes(parsed, 1, 4, 1, "bounds-only");
+  expect_eq(parsed.var_names_.at(0), std::string("XMAIN"), "main var name");
+  expect_eq(parsed.var_names_.at(1), std::string("AUX_A"), "bounds-only sorted name 1");
+  expect_eq(parsed.var_names_.at(2), std::string("AUX_S"), "bounds-only sorted name 2");
+  expect_eq(parsed.var_names_.at(3), std::string("AUX_Z"), "bounds-only sorted name 3");
+  // Resolve indices by name, then check the type and bounds each record should have produced.
+  const size_t idx_a = find_var(parsed, "AUX_A");
+  const size_t idx_s = find_var(parsed, "AUX_S");
+  const size_t idx_z = find_var(parsed, "AUX_Z");
+  expect_eq(parsed.var_types_.at(idx_a), 'I', "bounds-only BV type");
+  expect_eq(parsed.variable_lower_bounds_.at(idx_a), 0.0, "bounds-only BV lb");
+  expect_eq(parsed.variable_upper_bounds_.at(idx_a), 1.0, "bounds-only BV ub");
+  expect_eq(parsed.var_types_.at(idx_s), 'S', "bounds-only SC type");
+  expect_eq(parsed.variable_upper_bounds_.at(idx_s), 5.0, "bounds-only SC ub");
+  expect_eq(parsed.variable_lower_bounds_.at(idx_z), -3.0, "bounds-only duplicate lb");
+  expect_eq(parsed.variable_upper_bounds_.at(idx_z), 9.0, "bounds-only duplicate ub");
+}
+
+void integer_markers_assign_types_and_default_bounds()
+{
+  TempMpsFile fixture(
+    "NAME MARKERS\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " MARK000 'MARKER' 'INTORG'\n"
+    " XINT OBJ 1 R1 1\n"
+    " MARK001 'MARKER' 'INTEND'\n"
+    " XCONT OBJ 2 R1 2\n"
+    " MARK002 'MARKER' 'INTORG'\n"
+    " XBIN OBJ 3 R1 3\n"
+    " MARK003 'MARKER' 'INTEND'\n"
+    "RHS\n"
+    " RHS1 R1 10\n"
+    "ENDATA\n");
+  // XINT and XBIN sit between INTORG/INTEND markers; XCONT is declared after an INTEND.
+  auto parsed =
+    mps_fast::parse_mps_fast_file<int, double>(fixture.path, mps_fast::FileReadMethod::Read);
+  expect_model_shapes(parsed, 1, 3, 3, "integer markers");
+  const size_t idx_int  = find_var(parsed, "XINT");
+  const size_t idx_cont = find_var(parsed, "XCONT");
+  const size_t idx_bin  = find_var(parsed, "XBIN");
+  expect_eq(parsed.var_types_.at(idx_int), 'I', "XINT type");
+  expect_eq(parsed.var_types_.at(idx_cont), 'C', "XCONT type");
+  expect_eq(parsed.var_types_.at(idx_bin), 'I', "XBIN type");
+  expect_eq(parsed.variable_lower_bounds_.at(idx_int), 0.0, "XINT default lb");
+  expect_eq(parsed.variable_upper_bounds_.at(idx_int), 1.0, "XINT default ub");
+  expect_eq(parsed.variable_lower_bounds_.at(idx_bin), 0.0, "XBIN default lb");
+  expect_eq(parsed.variable_upper_bounds_.at(idx_bin), 1.0, "XBIN default ub");
+}
+
+void numeric_parsing_integration_matches_reference_bitwise()
+{  // Mixed numeric spellings: plain decimals, e/E exponents, long digit strings, negatives.
+  verify_fixture_bitwise("numeric_parsing_integration",  // helper defined outside this view
+                         "NAME NUMBERS\n"
+                         "ROWS\n"
+                         " N OBJ\n"
+                         " L R1\n"
+                         " G R2\n"
+                         " E R3\n"
+                         "COLUMNS\n"
+                         " X0 OBJ 0.12345678901234 R1 1e-9\n"  // lowercase exponent
+                         " X1 OBJ -2.5E3 R2 0.12345678901234567890123\n"  // long fraction
+                         " X2 R3 9999999999999999\n"  // long integer literal
+                         "RHS\n"
+                         " RHS1 R1 3.14159 R2 -0.000000000000001\n"
+                         " RHS1 R3 42\n"
+                         "RANGES\n"
+                         " RNG R1 0.25 R2 1E2\n"
+                         "BOUNDS\n"
+                         " LO B X0 -123456789\n"
+                         " UP B X0 123456789\n"
+                         " FX B X1 0.3333333333333333\n"
+                         " FR B X2\n"
+                         "ENDATA\n");
+}
+
+std::string to_crlf(std::string text)
+{
+  // Produce a copy of `text` in which every LF is preceded by a CR; other bytes are kept.
+  // Input that already contains CRLF therefore ends up with CR CR LF.
+  std::string out;
+  out.reserve(text.size() + text.size() / 8);
+  for (size_t pos = 0; pos < text.size(); ++pos) {
+    const char ch = text[pos];
+    if (ch == '\n') { out.push_back('\r'); }
+    out.push_back(ch);
+  }
+  return out;
+}
+
+void crlf_line_endings_match_reference_bitwise()
+{  // The fixture text is passed through to_crlf, so the parser sees CRLF line endings.
+  verify_fixture_bitwise("crlf_line_endings",  // helper defined outside this view
+                         to_crlf("NAME CRLF_EDGE\n"
+                                 "OBJSENSE\n"
+                                 " MAX\n"
+                                 "ROWS\n"
+                                 " N OBJ\n"
+                                 " L R1\n"
+                                 "COLUMNS\n"
+                                 " X1 OBJ 1 R1 2\n"
+                                 "RHS\n"
+                                 " RHS1 R1 3\n"
+                                 "BOUNDS\n"
+                                 " UP B X1 4\n"
+                                 "ENDATA\n"));
+}
+
+void comment_placement_supported_cases_match_reference_bitwise()
+{  // '*' and '$' comment lines between records, plus inline '$' comments after data fields.
+  verify_fixture_bitwise("comment_placement_supported_cases",  // helper defined elsewhere
+                         "* leading star comment\n"
+                         "$ leading dollar comment\n"
+                         "NAME COMMENTS\n"
+                         "$ comment between NAME and ROWS\n"
+                         "ROWS\n"
+                         "* comment after ROWS header\n"
+                         " N OBJ $ row objective comment\n"
+                         "$ comment between ROW records\n"
+                         " L R1 $ row constraint comment\n"
+                         "COLUMNS\n"
+                         "* comment after COLUMNS header\n"
+                         " X1 OBJ 1 R1 2 $ inline column comment\n"
+                         "$ comment before next column\n"
+                         " X2 OBJ -1 R1 3\n"
+                         "RHS\n"
+                         "$ comment after RHS header\n"
+                         " RHS1 R1 5 $ inline rhs comment\n"
+                         "BOUNDS\n"
+                         "* comment after BOUNDS header\n"
+                         " LO B X1 0 $ inline bound comment\n"
+                         "$ comment before ENDATA\n"
+                         "ENDATA\n");
+}
+
+void objective_metadata_selects_named_objective()
+{
+  TempMpsFile fixture(
+    "NAME OBJMETA\n"
+    "OBJSENSE\n"
+    " MAX\n"
+    "OBJNAME\n"
+    " COST\n"
+    "ROWS\n"
+    " N ALT\n"
+    " N COST\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " X1 ALT 100 COST 5\n"
+    " X1 R1 1\n"
+    " X2 COST -2 R1 3\n"
+    "RHS\n"
+    " RHS1 COST 7 R1 11\n"
+    "ENDATA\n");
+  // OBJNAME names COST although ALT is the first N row; the RHS record puts 7 on COST.
+  auto parsed =
+    mps_fast::parse_mps_fast_file<int, double>(fixture.path, mps_fast::FileReadMethod::Read);
+  expect_true(parsed.maximize_, "OBJSENSE MAX not applied");
+  expect_eq(parsed.problem_name_, std::string("OBJMETA"), "problem name");
+  expect_eq(parsed.objective_name_, std::string("COST"), "objective name");
+  expect_eq(parsed.objective_offset_, -7.0, "objective RHS offset");
+  const size_t idx_x1 = find_var(parsed, "X1");
+  const size_t idx_x2 = find_var(parsed, "X2");
+  expect_eq(parsed.c_.at(idx_x1), 5.0, "named objective coefficient X1");
+  expect_eq(parsed.c_.at(idx_x2), -2.0, "named objective coefficient X2");
+}
+
+void malformed_inputs_report_errors()
+{  // One malformed record per fixture; expect_fast_parse_error is defined outside this view.
+  expect_fast_parse_error("bad objsense",
+                          "NAME BADOBJ\n"
+                          "OBJSENSE\n"
+                          " SIDEWAYS\n"  // OBJSENSE value under test
+                          "ROWS\n"
+                          " N OBJ\n"
+                          " L R1\n"
+                          "COLUMNS\n"
+                          " X1 OBJ 1 R1 2\n"
+                          "RHS\n"
+                          " RHS1 R1 0\n"
+                          "ENDATA\n");
+
+  expect_fast_parse_error("unknown row in columns",
+                          "NAME BADCOLROW\n"
+                          "ROWS\n"
+                          " N OBJ\n"
+                          " L R1\n"
+                          "COLUMNS\n"
+                          " X1 MISSING 1\n"  // row MISSING is not declared in ROWS
+                          "RHS\n"
+                          " RHS1 R1 0\n"
+                          "ENDATA\n");
+
+  expect_fast_parse_error("unknown row in rhs",
+                          "NAME BADRHSROW\n"
+                          "ROWS\n"
+                          " N OBJ\n"
+                          " L R1\n"
+                          "COLUMNS\n"
+                          " X1 OBJ 1 R1 2\n"
+                          "RHS\n"
+                          " RHS1 MISSING 1\n"  // row MISSING is not declared in ROWS
+                          "ENDATA\n");
+
+  expect_fast_parse_error("unknown bound type",
+                          "NAME BADBOUND\n"
+                          "ROWS\n"
+                          " N OBJ\n"
+                          " L R1\n"
+                          "COLUMNS\n"
+                          " X1 OBJ 1 R1 2\n"
+                          "RHS\n"
+                          " RHS1 R1 0\n"
+                          "BOUNDS\n"
+                          " XX B X1 1\n"  // XX is the bound type under test
+                          "ENDATA\n");
+
+  expect_fast_parse_error("semi-continuous bound without value",
+                          "NAME BADSC\n"
+                          "ROWS\n"
+                          " N OBJ\n"
+                          " L R1\n"
+                          "COLUMNS\n"
+                          " X1 OBJ 1 R1 2\n"
+                          "RHS\n"
+                          " RHS1 R1 0\n"
+                          "BOUNDS\n"
+                          " SC B X1\n"  // SC record with no numeric value
+                          "ENDATA\n");
+}
+
+void large_columns_repeated_column_chunk_boundary()
+{
+  // One column (XBIG) has an entry in every one of `row_count` rows and is followed by a
+  // short second column (XTAIL); the parsed model must list both columns in that order.
+  constexpr size_t row_count = 180000;
+  std::string text;
+  text.reserve(8 * 1024 * 1024);
+  text.append("NAME BIGCOLS\nROWS\n N OBJ\n");
+  // ROWS: one L row per index.
+  for (size_t row = 1; row <= row_count; ++row) {
+    text.append(" L ").append(row_name(row)).append("\n");
+  }
+  text.append("COLUMNS\n");
+  // COLUMNS: XBIG touches every row, XTAIL touches only the first one.
+  for (size_t row = 1; row <= row_count; ++row) {
+    text.append(" XBIG ").append(row_name(row)).append(" 1\n");
+  }
+  text.append(" XTAIL ").append(row_name(1));
+  text.append(" 2\nRHS\n RHS1 ").append(row_name(1));
+  text.append(" 0\nENDATA\n");
+
+  // The text is moved into the temp file; `text` is not used afterwards.
+  TempMpsFile fixture(std::move(text));
+  auto parsed =
+    mps_fast::parse_mps_fast_file<int, double>(fixture.path, mps_fast::FileReadMethod::Read);
+  const int expected_rows = static_cast<int>(row_count);
+  const int expected_nnz  = static_cast<int>(row_count + 1);
+  expect_model_shapes(parsed, expected_rows, 2, expected_nnz, "large columns");
+  expect_eq(parsed.var_names_.at(0), std::string("XBIG"), "large repeated column name");
+  expect_eq(parsed.var_names_.at(1), std::string("XTAIL"), "large tail column name");
+}
+
+void large_bounds_repeated_var_stays_ordered()
+{
+  // Emits `repeat_count` UP records for the same variable with values cycling through
+  // 0..999; the upper bound that survives must be the one from the final record.
+  constexpr size_t repeat_count = 700000;
+  std::string text;
+  text.reserve(12 * 1024 * 1024);
+  text.append(
+    "NAME BIGBOUNDS\nROWS\n N OBJ\n L R1\nCOLUMNS\n alpha OBJ 1 R1 1\nRHS\n RHS1 R1 0\nBOUNDS\n");
+  for (size_t n = 0; n < repeat_count; ++n) {
+    text.append(" UP B alpha ").append(std::to_string(n % 1000)).append("\n");
+  }
+  text.append("ENDATA\n");
+
+  TempMpsFile fixture(std::move(text));
+  auto parsed =
+    mps_fast::parse_mps_fast_file<int, double>(fixture.path, mps_fast::FileReadMethod::Read);
+  expect_model_shapes(parsed, 1, 1, 1, "large bounds");
+  // Value written by the last loop iteration (index repeat_count - 1).
+  const double last_value = static_cast<double>((repeat_count - 1) % 1000);
+  expect_eq(parsed.variable_upper_bounds_.at(0), last_value, "large repeated bounds last value");
+}
+
+void lz4_and_raw_paths_match_on_multiblock_input()
+{
+  // Parses the same generated model from a raw file and from its lz4-compressed copy and
+  // requires identical results. Skipped when the `lz4` command returns a non-zero status.
+  constexpr size_t row_count = 70000;
+  std::string mps;
+  mps.reserve(4 * 1024 * 1024);
+  mps += "NAME LZ4PARITY\nROWS\n N OBJ\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps.append(" L ").append(row_name(i)).append("\n");
+  }
+  mps += "COLUMNS\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps += " X";
+    mps += std::to_string(i);
+    mps += ' ';
+    mps += row_name(i);
+    mps += " 0.125\n";
+  }
+  mps += "RHS\n RHS1 ";
+  mps += row_name(1);
+  mps += " 1\nENDATA\n";
+
+  TempMpsFile raw_file(std::move(mps));
+  TempOwnedPath lz4_file(raw_file.path + ".lz4");
+  // Single-quote both paths so spaces or shell metacharacters (except ' itself) stay literal.
+  const std::string cmd = "lz4 -f -q '" + raw_file.path + "' '" + lz4_file.path + "'";
+  if (std::system(cmd.c_str()) != 0) { throw skip_test("lz4 CLI unavailable"); }
+
+  auto raw =
+    mps_fast::parse_mps_fast_file<int, double>(raw_file.path, mps_fast::FileReadMethod::Read);
+  auto lz4 =
+    mps_fast::parse_mps_fast_file<int, double>(lz4_file.path, mps_fast::FileReadMethod::Read);
+  expect_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity");
+  expect_eq(lz4.var_names_.size(), raw.var_names_.size(), "lz4 var name count");
+  expect_eq(lz4.row_names_.size(), raw.row_names_.size(), "lz4 row name count");
+  expect_vector_eq(lz4.A_, raw.A_, "lz4 A values");
+  expect_vector_eq(lz4.A_indices_, raw.A_indices_, "lz4 A indices");
+  expect_vector_eq(lz4.A_offsets_, raw.A_offsets_, "lz4 A offsets");
+  expect_vector_eq(lz4.c_, raw.c_, "lz4 objective");
+  expect_vector_eq(lz4.b_, raw.b_, "lz4 rhs");
+  expect_vector_eq(lz4.var_types_, raw.var_types_, "lz4 var types");
+  expect_vector_eq(lz4.variable_lower_bounds_, raw.variable_lower_bounds_, "lz4 lower bounds");
+  expect_vector_eq(lz4.variable_upper_bounds_, raw.variable_upper_bounds_, "lz4 upper bounds");
+}
+
+}  // namespace
+
+int main()
+{
+  struct TestCase {
+    const char* name;
+    void (*fn)();
+  };
+  // Tests run sequentially in the order listed here; each prints a RUN line and an outcome.
+  const TestCase tests[] = {
+    {"ScannerFindsSectionSplitAcrossBlocks", scanner_finds_section_split_across_blocks},
+    {"ScannerFindsHeadersSplitAtEveryByte", scanner_finds_headers_split_at_every_byte},
+    {"ScannerRejectsUnknownColumnOneRecordsAfterRows",
+     scanner_rejects_unknown_column_one_records_after_rows},
+    {"BoundsDefaultsAndTypesMatchReference", bounds_defaults_and_types_match_reference},
+    {"DuplicateBoundsLastStatementWins", duplicate_bounds_last_statement_wins},
+    {"NondenseRowAndColumnNamesUseHashPath", nondense_row_and_column_names_use_hash_path},
+    {"MissingOptionalBoundsFastPath", missing_optional_bounds_fast_path},
+    {"BoundsOnlyVariablesAreAppendedDeterministically",
+     bounds_only_variables_are_appended_deterministically},
+    {"IntegerMarkersAssignTypesAndDefaultBounds", integer_markers_assign_types_and_default_bounds},
+    {"NumericParsingIntegrationMatchesReferenceBitwise",
+     numeric_parsing_integration_matches_reference_bitwise},
+    {"CrlfLineEndingsMatchReferenceBitwise", crlf_line_endings_match_reference_bitwise},
+    {"CommentPlacementSupportedCasesMatchReferenceBitwise",
+     comment_placement_supported_cases_match_reference_bitwise},
+    {"ObjectiveMetadataSelectsNamedObjective", objective_metadata_selects_named_objective},
+    {"MalformedInputsReportErrors", malformed_inputs_report_errors},
+    {"LargeColumnsRepeatedColumnChunkBoundary", large_columns_repeated_column_chunk_boundary},
+    {"LargeBoundsRepeatedVarStaysOrdered", large_bounds_repeated_var_stays_ordered},
+    {"Lz4AndRawPathsMatchOnMultiblockInput", lz4_and_raw_paths_match_on_multiblock_input},
+  };
+
+  int failed = 0;
+  for (const TestCase& test : tests) {
+    std::cout << "[ RUN      ] " << test.name << '\n';
+    try {
+      test.fn();
+      std::cout << "[       OK ] " << test.name << '\n';
+    } catch (const skip_test& e) {
+      std::cout << "[  SKIPPED ] " << test.name << ": " << e.what() << '\n';
+    } catch (const std::exception& e) {
+      ++failed;
+      std::cerr << "[  FAILED  ] " << test.name << ": " << e.what() << '\n';
+    } catch (...) {
+      // Anything not derived from std::exception would otherwise terminate the whole runner.
+      ++failed;
+      std::cerr << "[  FAILED  ] " << test.name << ": unknown exception\n";
+    }
+  }
+
+  if (failed != 0) { std::cerr << failed << " test(s) failed\n"; }
+  if (failed == 0) { std::cout << "[  PASSED  ] " << std::size(tests) << " test(s)\n"; }
+  return failed == 0 ? 0 : 1;  // skipped tests do not affect the exit status
+}

From 8e01e28b61d928a89776f9f3032a3057c2acd60f Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Wed, 10 Jun 2026 09:40:33 -0700
Subject: [PATCH 07/22] moved perf counters

---
 cpp/src/io/experimental_mps_fast/fast_parser.cpp                | 2 +-
 .../{io/experimental_mps_fast => utilities}/perf_counters.hpp   | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename cpp/src/{io/experimental_mps_fast => utilities}/perf_counters.hpp (100%)

diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index de1b3ea84c..bd83ef2088 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -9,7 +9,7 @@
 #include "mps_section_scanner.hpp"
 #include "nvtx_ranges.hpp"
 #ifdef MPS_FAST_PERF_COUNTERS
-#include "perf_counters.hpp"
+#include <utilities/perf_counters.hpp>
 #endif
 
 #include <sys/mman.h>
diff --git a/cpp/src/io/experimental_mps_fast/perf_counters.hpp b/cpp/src/utilities/perf_counters.hpp
similarity index 100%
rename from cpp/src/io/experimental_mps_fast/perf_counters.hpp
rename to cpp/src/utilities/perf_counters.hpp

From 94bfbc78448ddae9cb85f12aee19f296871fc2a6 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Thu, 11 Jun 2026 06:17:57 -0700
Subject: [PATCH 08/22] extend the lz4 decompression to the regular parser,
 more cleanup and refactor

---
 cpp/cuopt_cli.cpp                             |    5 +-
 .../cuopt/linear_programming/io/parser.hpp    |   47 +-
 .../fast_fp64_parser.hpp                      |   30 +-
 .../fast_parse_primitives.hpp                 |  100 +-
 .../io/experimental_mps_fast/fast_parser.cpp  | 1274 +++++++----------
 .../io/experimental_mps_fast/file_reader.cpp  |  288 ----
 .../io/experimental_mps_fast/file_reader.hpp  |   70 +-
 .../experimental_mps_fast/lz4_file_reader.cpp |  162 +--
 .../io/experimental_mps_fast/mmap_region.hpp  |    4 +-
 .../mps_section_scanner.cpp                   |   31 +
 .../mps_section_scanner.hpp                   |    8 +
 .../io/experimental_mps_fast/nvtx_ranges.hpp  |   22 +-
 cpp/src/io/file_to_string.cpp                 |  167 ++-
 cpp/src/io/file_to_string.hpp                 |    1 +
 cpp/src/utilities/perf_counters.hpp           |   31 +
 .../fast_parser_edge_test.cpp                 |   44 +
 16 files changed, 1052 insertions(+), 1232 deletions(-)

diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index e99462091e..714d76dbf5 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -286,9 +286,8 @@ int main(int argc, char* argv[])
   program.add_argument("filename")
     .help(
       "input problem file; format dispatched by extension (case-insensitive). "
-      "Supported: .lp, .mps, .qps and their .gz / .bz2 compressed variants "
-      "(e.g. .lp.gz, .mps.bz2, .qps.gz). Experimental .mps.lz4 inputs require "
-      "--mps-reader fast")
+      "Supported: .lp, .mps, .qps and their .gz / .bz2 / .lz4 compressed variants "
+      "(e.g. .lp.gz, .mps.bz2, .qps.lz4).")
     .nargs(1)
     .required();
 
diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp
index 1d47590287..08254f84b3 100644
--- a/cpp/include/cuopt/linear_programming/io/parser.hpp
+++ b/cpp/include/cuopt/linear_programming/io/parser.hpp
@@ -21,7 +21,7 @@ namespace cuopt::linear_programming::io {
  * @brief Selects which MPS reader implementation should be used by dispatching entry points.
  *
  * The experimental fast reader is intentionally opt-in. It currently supports LP/MIP problems
- * from raw .mps and .mps.lz4 files only.
+ * from raw .mps, .mps.lz4, .mps.gz, and .mps.bz2 files.
  */
 enum class mps_reader_type_t { default_reader, fast_experimental };
 
@@ -29,7 +29,7 @@ enum class mps_reader_type_t { default_reader, fast_experimental };
  * @brief Reads the equation from an MPS or QPS file.
  *
  * The input file can be a plain text file in MPS-/QPS-format or a compressed MPS/QPS
- * file (.mps.gz or .mps.bz2).
+ * file (.mps.gz, .mps.bz2, or .mps.lz4).
  *
  * Read this link http://lpsolve.sourceforge.net/5.5/mps-format.htm for more
  * details on both free and fixed MPS format.
@@ -40,8 +40,8 @@ enum class mps_reader_type_t { default_reader, fast_experimental };
  * - QMATRIX: Full symmetric quadratic objective matrix (alternative to QUADOBJ)
  * - QCMATRIX: Symmetric quadratic terms for a named constraint row (QCQP)
  *
- * Note: Compressed MPS files .mps.gz, .mps.bz2 can only be read if the compression
- * libraries zlib or libbzip2 are installed, respectively.
+ * Note: Compressed MPS files .mps.gz, .mps.bz2, and .mps.lz4 can only be read if
+ * zlib, libbzip2, or liblz4 are installed, respectively.
  *
  * @param[in] mps_file_path Path to MPS/QPSfile.
  * @param[in] fixed_mps_format If MPS/QPS file should be parsed as fixed, false by default
@@ -54,10 +54,10 @@ mps_data_model_t<i_t, f_t> read_mps(const std::string& mps_file_path,
 /**
  * @brief Reads a raw LP/MIP MPS problem with the experimental SIMD-optimized reader.
  *
- * This prototype reader supports raw .mps and .mps.lz4 files only. It does not support LP, QPS,
- * quadratic MPS sections, fixed-format forcing, or .gz/.bz2 compressed inputs.
+ * This prototype reader supports raw .mps plus .mps.lz4/.mps.gz/.mps.bz2 files. It does not
+ * support LP, QPS, quadratic constraint sections, or fixed-format forcing.
  *
- * @param[in] mps_file_path Path to a raw .mps or .mps.lz4 file.
+ * @param[in] mps_file_path Path to a raw or compressed .mps file.
  * @return mps_data_model_t A fully formed LP/MIP problem which represents the given file.
  */
 template <typename i_t, typename f_t>
@@ -137,9 +137,9 @@ inline mps_data_model_t<i_t, f_t> read(const std::string& path,
  *        extension. Extension matching is case-insensitive.
  *
  * Routing:
- *   - .mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2 → read_mps()
- *   - .mps.lz4 → experimental fast MPS reader only
- *   - .lp,  .lp.gz,  .lp.bz2 → read_lp()
+ *   - .mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4
+ *     → read_mps()
+ *   - .lp,  .lp.gz,  .lp.bz2, .lp.lz4 → read_lp()
  *   - anything else → std::logic_error
  *
  * This is the entry point of choice for user-facing tools (CLI, C API) that
@@ -165,33 +165,36 @@ inline mps_data_model_t<i_t, f_t> read(const std::string& path,
   std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
     return static_cast<char>(std::tolower(c));
   });
-  const bool is_mps_lz4 = lower.ends_with(".mps.lz4");
-  if (lower.ends_with(".mps") || is_mps_lz4 || lower.ends_with(".mps.gz") ||
-      lower.ends_with(".mps.bz2") || lower.ends_with(".qps") || lower.ends_with(".qps.gz") ||
-      lower.ends_with(".qps.bz2")) {
+  const bool is_mps_lz4  = lower.ends_with(".mps.lz4");
+  const bool is_mps_gzip = lower.ends_with(".mps.gz");
+  const bool is_mps_bzip = lower.ends_with(".mps.bz2");
+  const bool is_qps_lz4  = lower.ends_with(".qps.lz4");
+  const bool is_lp_lz4   = lower.ends_with(".lp.lz4");
+  if (lower.ends_with(".mps") || is_mps_lz4 || is_mps_gzip || is_mps_bzip ||
+      lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2") ||
+      is_qps_lz4) {
     if (mps_reader == mps_reader_type_t::fast_experimental) {
       if (fixed_mps_format) {
         throw std::logic_error(
           "experimental fast MPS reader does not support fixed MPS format forcing");
       }
-      if (!lower.ends_with(".mps") && !is_mps_lz4) {
+      if (!lower.ends_with(".mps") && !is_mps_lz4 && !is_mps_gzip && !is_mps_bzip) {
         throw std::logic_error(
-          "experimental fast MPS reader supports raw .mps and .mps.lz4 LP/MIP files only");
+          "experimental fast MPS reader supports .mps, .mps.lz4, .mps.gz, and .mps.bz2 "
+          "LP/MIP files only");
       }
       return read_mps_fast_experimental<i_t, f_t>(path);
     }
-    if (is_mps_lz4) {
-      throw std::logic_error(".mps.lz4 inputs require the experimental fast MPS reader");
-    }
     return read_mps<i_t, f_t>(path, fixed_mps_format);
   }
-  if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2")) {
+  if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2") ||
+      is_lp_lz4) {
     return read_lp<i_t, f_t>(path);
   }
   throw std::logic_error(
     "read: unrecognized input file extension. Supported (case-insensitive): "
-    ".mps, .mps.lz4, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2, .lp, .lp.gz, "
-    ".lp.bz2. "
+    ".mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4, "
+    ".lp, .lp.gz, .lp.bz2, .lp.lz4. "
     "Given path: " +
     path);
 }
diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
index 905dcc9e7b..0f947aa644 100644
--- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
@@ -40,6 +40,8 @@ struct power_10_lut_entry_t {
   int biased_e2;
 };
 
+// util class to perform 256bit precision arithmetic in constexpr to build the eisel-lemire lookup
+// table
 struct cuopt_uint256_t {
   std::array<uint64_t, 4> limb{};
 
@@ -169,7 +171,7 @@ inline constexpr std::array<uint64_t, 16> small_integer_powers = {1ULL,
                                                                   100000000000000ULL,
                                                                   1000000000000000ULL};
 
-struct ParsedDecimal {
+struct parsed_decimal_t {
   bool negative      = false;
   bool fast_eligible = false;
   uint64_t mantissa  = 0;
@@ -181,6 +183,7 @@ static inline bool is_digit(char c) noexcept { return c >= '0' && c <= '9'; }
 // SWAR 8char run of digits -> integer representation
 static inline bool parse_8_digits(const char* p, uint32_t& out)
 {
+  // comply with strict aliasing rules
   std::array<char, sizeof(uint64_t)> bytes{};
   std::memcpy(bytes.data(), p, bytes.size());
   uint64_t raw       = std::bit_cast<uint64_t>(bytes);
@@ -195,10 +198,26 @@ static inline bool parse_8_digits(const char* p, uint32_t& out)
   return true;
 }
 
+static inline void parse_u64_digits_advance(const char*& p, const char* end, uint64_t& out)
+{
+  // Folds the digit run at [p, end) into `out`, eight digits at a time when parse_8_digits
+  // accepts the next 8 bytes, and leaves `p` on the first non-digit. No overflow check here.
+  while (p < end && is_digit(*p)) {
+    uint32_t block = 0;
+    if (end - p >= 8 && parse_8_digits(p, block)) {
+      out = out * 100000000ULL + (uint64_t)block;
+      p += 8;
+    } else {
+      out = out * 10 + (uint64_t)(*p - '0');
+      ++p;
+    }
+  }
+}
+
 static inline void scan_digit_run(const char*& p,
                                   const char* end,
                                   bool after_dot,
-                                  ParsedDecimal& out,
+                                  parsed_decimal_t& out,
                                   bool& saw_digit,
                                   int& frac_digits,
                                   int& sig_digits,
@@ -244,7 +263,7 @@ static inline void scan_digit_run(const char*& p,
   }
 }
 
-static inline bool parse_decimal_advance(const char*& p, const char* end, ParsedDecimal& out)
+static inline bool parse_decimal_advance(const char*& p, const char* end, parsed_decimal_t& out)
 {
   if (p < end && (*p == '-' || *p == '+')) {
     out.negative = *p == '-';
@@ -294,6 +313,7 @@ static inline bool parse_decimal_advance(const char*& p, const char* end, Parsed
 static inline double fallback_strtod(std::string_view s)
 {
   char stack_buf[32];
+  // The MPS specs mandate that numeric tokens are no longer than 25 characters
   if (s.size() >= sizeof(stack_buf)) {
     mps_parser_fail(error_type_t::ValidationError, "MPS numeric token exceeds supported length");
   }
@@ -358,7 +378,7 @@ static inline bool eisel_lemire(uint64_t man, int exp10, uint64_t& bits)
   return true;
 }
 
-static inline double assemble_fp64(const ParsedDecimal& dec)
+static inline double assemble_fp64(const parsed_decimal_t& dec)
 {
   uint64_t bits = dec.negative ? (uint64_t{1} << 63) : 0;
   if (dec.mantissa == 0) { return std::bit_cast<double>(bits); }
@@ -390,7 +410,7 @@ static inline double assemble_fp64(const ParsedDecimal& dec)
 static inline double parse_fp64_advance(const char*& p, const char* end)
 {
   const char* start = p;
-  ParsedDecimal dec;
+  parsed_decimal_t dec;
   if (!parse_decimal_advance(p, end, dec)) {
     return fallback_strtod(std::string_view(start, (size_t)(p - start)));
   }
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
index 70ed3283c3..d3317c50e1 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -15,16 +15,21 @@
 #include <simde/x86/avx2.h>
 #include <simde/x86/sse4.2.h>
 
-#ifndef __likely
-#define __likely(x) __builtin_expect(!!(x), 1)
+#ifndef LIKELY
+#define LIKELY(x) __builtin_expect(!!(x), 1)
 #endif
 
-#ifndef __unlikely
-#define __unlikely(x) __builtin_expect(!!(x), 0)
+#ifndef UNLIKELY
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
 #endif
 
 namespace mps_fast {
 
+enum scan_mode {
+  skip_whitespace,   // scalar_scan stops at the first byte > 32 or at '\n'
+  until_whitespace,  // scalar_scan stops at the first byte <= 32
+};
+
 struct cursor_t {
   const char* start;
   const char* ptr;
@@ -67,12 +72,12 @@ struct cursor_t {
     ptr += n;
   }
 
-  template <bool skip_ws_mode>
+  template <scan_mode mode>
   static const char* scalar_scan(const char* p, const char* end)
   {
     while (p < end) {
       unsigned char c = (unsigned char)*p;
-      if constexpr (skip_ws_mode) {
+      if constexpr (mode == skip_whitespace) {
         if (c > 32 || c == '\n') return p;
       } else {
         if (c <= 32) return p;
@@ -82,7 +87,7 @@ struct cursor_t {
     return end;
   }
 
-  template <bool skip_ws_mode>
+  template <scan_mode mode>
   static const char* simd_scan(const char* p, const char* end)
   {
     const simde__m256i v32 = simde_mm256_set1_epi8(32);
@@ -93,7 +98,7 @@ struct cursor_t {
       simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32);
 
       unsigned int mask;
-      if (skip_ws_mode) {
+      if constexpr (mode == skip_whitespace) {
         simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
         mask = (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl));
       } else {
@@ -103,10 +108,10 @@ struct cursor_t {
       if (mask != 0) { return p + __builtin_ctz(mask); }
       p += 32;
     }
-    return scalar_scan<skip_ws_mode>(p, end);
+    return scalar_scan<mode>(p, end);
   }
 
-  void skip_ws() { ptr = simd_scan<true>(ptr, end); }
+  void skip_ws() { ptr = simd_scan<skip_whitespace>(ptr, end); }
 
   bool eol() const { return ptr < end && (*ptr == '\n' || *ptr == '\r'); }
 
@@ -135,13 +140,31 @@ struct cursor_t {
     }
   }
 
+  std::string_view read_rest_of_line_trimmed()
+  {
+    // Returns the remainder of the current line with leading and trailing spaces/tabs
+    // removed. The cursor ends up just past the returned text, i.e. before any trailing
+    // blanks and before the '\n' / '\r' terminator.
+    const auto is_blank = [](char c) { return c == ' ' || c == '\t'; };
+
+    const char* first = ptr;
+    const char* last  = ptr;
+    while (last < end && *last != '\n' && *last != '\r') {
+      ++last;
+    }
+    while (first < last && is_blank(*first)) { ++first; }
+    while (last > first && is_blank(last[-1])) { --last; }
+    ptr = last;
+    return std::string_view(first, (std::size_t)(last - first));
+  }
+
   inline __attribute__((always_inline)) std::string_view read_field()
   {
-    if (__unlikely(done())) { return {}; }
+    if (UNLIKELY(done())) { return {}; }
 
     const char* field_start = ptr;
-    if (__unlikely(end - ptr < 32)) {
-      ptr                   = scalar_scan<false>(ptr, end);
+    if (UNLIKELY(end - ptr < 32)) {
+      ptr                   = scalar_scan<until_whitespace>(ptr, end);
       const char* field_end = ptr;
       if (ptr < end) { skip_ws(); }
       return std::string_view(field_start, field_end - field_start);
@@ -150,14 +173,14 @@ struct cursor_t {
     const simde__m256i v32 = simde_mm256_set1_epi8(32);
     const simde__m256i vnl = simde_mm256_set1_epi8('\n');
 
-    // Input buffers are padded by file_reader/lz4_file_reader/small_raw_read,
-    // so this unaligned 32-byte load is valid whenever end - ptr >= 32.
+    // All input streams provide trailing padding, so this unaligned 32-byte load is valid
+    // whenever end - ptr >= 32.
     simde__m256i data    = simde_mm256_loadu_si256((const simde__m256i*)ptr);
     simde__m256i gt32    = simde_mm256_cmpgt_epi8(data, v32);
     unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32);
 
-    if (__unlikely(ws_mask == 0)) {
-      ptr                   = simd_scan<false>(ptr + 32, end);
+    if (UNLIKELY(ws_mask == 0)) {
+      ptr                   = simd_scan<until_whitespace>(ptr + 32, end);
       const char* field_end = ptr;
       if (ptr < end) { skip_ws(); }
       return std::string_view(field_start, field_end - field_start);
@@ -171,7 +194,7 @@ struct cursor_t {
       (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl));
     unsigned int after_field = stop_mask & ~((1u << field_end_off) - 1);
 
-    if (__likely(after_field != 0)) {
+    if (LIKELY(after_field != 0)) {
       ptr = ptr + __builtin_ctz(after_field);
     } else {
       ptr = field_end;
@@ -183,11 +206,18 @@ struct cursor_t {
 
   inline __attribute__((always_inline)) std::string_view peek_field()
   {
-    if (__unlikely(done())) { return {}; }
-    const char* field_end = simd_scan<false>(ptr, end);
+    if (UNLIKELY(done())) { return {}; }
+    const char* field_end = simd_scan<until_whitespace>(ptr, end);
     return std::string_view(ptr, field_end - ptr);
   }
 
+  static inline std::string_view peek_field_at(const char* line_start, const char* section_end)
+  {
+    cursor_t probe(line_start, static_cast<std::size_t>(section_end - line_start));
+    probe.skip_ws();  // skip leading whitespace on a scratch cursor; no caller state is touched
+    return probe.peek_field();
+  }
+
   inline __attribute__((always_inline)) std::pair<std::string_view, std::string_view>
   read_two_fields()
   {
@@ -197,7 +227,7 @@ struct cursor_t {
       return std::pair<std::string_view, std::string_view>{f1, f2};
     };
 
-    if (__unlikely(end - ptr < 32)) { return slow(); }
+    if (UNLIKELY(end - ptr < 32)) { return slow(); }
 
     const char* field1_start = ptr;
     const simde__m256i v32   = simde_mm256_set1_epi8(32);
@@ -213,21 +243,21 @@ struct cursor_t {
     unsigned int nl_mask        = (unsigned int)simde_mm256_movemask_epi8(is_nl);
     unsigned int stop_mask      = printable_mask | nl_mask;
 
-    if (__unlikely(ws_mask == 0)) { return slow(); }
+    if (UNLIKELY(ws_mask == 0)) { return slow(); }
     int field1_end_off = __builtin_ctz(ws_mask);
 
     unsigned int after_field1 = stop_mask & ~((1u << field1_end_off) - 1);
-    if (__unlikely(after_field1 == 0)) { return slow(); }
+    if (UNLIKELY(after_field1 == 0)) { return slow(); }
     int field2_start_off = __builtin_ctz(after_field1);
 
-    if (__unlikely(ptr[field2_start_off] == '\n')) { return slow(); }
+    if (UNLIKELY(ptr[field2_start_off] == '\n')) { return slow(); }
 
     unsigned int ws_after_field2_start = ws_mask & ~((1u << field2_start_off) - 1);
-    if (__unlikely(ws_after_field2_start == 0)) { return slow(); }
+    if (UNLIKELY(ws_after_field2_start == 0)) { return slow(); }
     int field2_end_off = __builtin_ctz(ws_after_field2_start);
 
     unsigned int after_field2 = stop_mask & ~((1u << field2_end_off) - 1);
-    if (__likely(after_field2 != 0)) {
+    if (LIKELY(after_field2 != 0)) {
       ptr = ptr + __builtin_ctz(after_field2);
     } else {
       ptr = ptr + field2_end_off;
@@ -242,7 +272,7 @@ struct cursor_t {
 static inline void expect(cursor_t& cursor, const char* field)
 {
   auto id = cursor.read_field();
-  if (__unlikely(id != field)) {
+  if (UNLIKELY(id != field)) {
     cursor.error("expected '%s', got '%.*s'", field, (int)id.size(), id.data());
   }
 }
@@ -260,7 +290,7 @@ static inline void accept_comment_line(cursor_t& cursor)
 
 static inline void expect_eol(cursor_t& cursor)
 {
-  if (__unlikely(!cursor.eol())) {
+  if (UNLIKELY(!cursor.eol())) {
     auto got = cursor.peek_field();
     cursor.error("expected end of line, got '%.*s'", (int)got.size(), got.data());
   }
@@ -269,20 +299,18 @@ static inline void expect_eol(cursor_t& cursor)
     while (cursor.eol()) {
       cursor.consume_eol();
     }
-    if (__unlikely(cursor.done())) { return; }
+    if (UNLIKELY(cursor.done())) { return; }
 
-    if (__unlikely(cursor.ptr[0] == '*' || cursor.ptr[0] == '$')) {
+    if (UNLIKELY(cursor.ptr[0] == '*' || cursor.ptr[0] == '$')) {
       cursor.skip_comment_line();
       continue;
     }
 
-    if (__likely(cursor.ptr[0] == ' ') && __likely(cursor.ptr + 1 < cursor.end)) {
-      cursor.ptr += 1;
-    }
+    if (LIKELY(cursor.ptr[0] == ' ') && LIKELY(cursor.ptr + 1 < cursor.end)) { cursor.ptr += 1; }
 
-    if (__unlikely(cursor.done())) { return; }
+    if (UNLIKELY(cursor.done())) { return; }
     char c = cursor.ptr[0];
-    if (__unlikely(!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))) {
+    if (UNLIKELY(!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))) {
       cursor.skip_ws();
       if (cursor.eol()) { continue; }
     }
@@ -344,7 +372,7 @@ static inline bool accept_section(cursor_t& cursor, const char* section)
 
 static inline bool accept_comment(cursor_t& cursor)
 {
-  if (__unlikely(!cursor.done() && cursor.ptr[0] == '$')) {
+  if (UNLIKELY(!cursor.done() && cursor.ptr[0] == '$')) {
     cursor.skip_to_eol();
     return true;
   }
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index bd83ef2088..33bf916e05 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -2,12 +2,15 @@
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #include "fast_parser.hpp"
+#include <file_to_string.hpp>
 #include "fast_parse_primitives.hpp"
 #include "file_reader.hpp"
 #include "hash_table_smallstr.hpp"
 #include "mmap_region.hpp"
 #include "mps_section_scanner.hpp"
 #include "nvtx_ranges.hpp"
+
+#include <cuda/cmath>
 #ifdef MPS_FAST_PERF_COUNTERS
 #include <utilities/perf_counters.hpp>
 #endif
@@ -36,31 +39,43 @@
 #include <string_view>
 #include <tuple>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
-#ifndef MADV_COLLAPSE
-#define MADV_COLLAPSE 25
-#endif
+#define MPS_FAST_COMPACT_ROW_HASH
+#define MPS_FAST_THP_PREFAULT
 
 namespace mps_fast {
 
-static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS                = 4096;
-static constexpr int MPS_ROWS_THREAD_CAP                           = 16;
-static constexpr int MPS_COLUMNS_THREAD_CAP                        = 32;
-static constexpr int MPS_BOUNDS_THREAD_CAP                         = 32;
-static constexpr int MPS_NAMES_THREAD_CAP                          = 16;
-static constexpr size_t MPS_BOUNDS_PARALLEL_INIT_MIN_VARS          = 16 * 1024 * 1024;
-static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES              = 256ull * 1024ull * 1024ull;
-static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8ull * 1024ull * 1024ull;
-static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES                = 1 * 1024 * 1024;
-static constexpr size_t MPS_SMALL_RAW_FILE_BYTES                   = 4ull * 1024ull * 1024ull;
-static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES     = 100ull * 1000ull * 1000ull;
-static constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS          = 64ull * 1024ull;
-static constexpr size_t MPS_ROW_HASH_PARTITIONS                    = 32;
-static constexpr int MPS_ROW_HASH_PARTITION_BITS                   = 5;
-static constexpr int MPS_SMALL_FILE_THREAD_CAP                     = 16;
-static constexpr int MPS_LARGE_FILE_THREAD_CAP                     = 32;
+static constexpr size_t KiB = 1024;
+static constexpr size_t MiB = 1024 * KiB;
+static constexpr size_t GiB = 1024 * MiB;
+
+// per-chunk row-count scratch tile for the column parsing workers
+// small enough to remain warm in L1
+static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS = 4096;
+static constexpr int MPS_ROWS_THREAD_CAP            = 16;
+static constexpr int MPS_COLUMNS_THREAD_CAP         = 32;
+static constexpr int MPS_BOUNDS_THREAD_CAP          = 32;
+static constexpr int MPS_NAMES_THREAD_CAP           = 16;
+// avoid openmp setup for small bounds sections
+static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES = 256 * MiB;
+// ordered-name fallback is cheap enough to parallelize on smaller bounds sections
+static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8 * MiB;
+// lower bound on columns chunk size to avoid tiny parser tasks
+static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * MiB;
+// parser-wide thread cap switch; very small files lose to scheduling overhead
+static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES = 100ull * 1000ull * 1000ull;
+// below this, the serial row-hash build is usually cheaper than partition setup
+static constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * KiB;
+// number of partitions for the row hash table, used to avoid races and atomics during row hash
+// table initialization
+static constexpr int MPS_ROW_HASH_PARTITION_BITS = 5;
+static constexpr size_t MPS_ROW_HASH_PARTITIONS  = (size_t{1} << MPS_ROW_HASH_PARTITION_BITS);
+// thread caps for small and large files
+static constexpr int MPS_SMALL_FILE_THREAD_CAP = 16;
+static constexpr int MPS_LARGE_FILE_THREAD_CAP = 32;
 
 static int parser_thread_cap_for_size(size_t bytes)
 {
@@ -75,6 +90,8 @@ static int phase_thread_count(int phase_cap)
   return std::max(1, std::min(phase_cap, available_threads));
 }
 
+// Arena allocator for the strings (row names, column names) to avoid the dreadful overheads of
+// glibc's malloc and std::vector<std::string>
 class chunk_name_arena_t {
  public:
   void reserve(size_t bytes)
@@ -92,110 +109,45 @@ class chunk_name_arena_t {
 
  private:
   struct slab_t {
-    std::unique_ptr<char[]> data;
-    size_t capacity = 0;
-    size_t used     = 0;
+    std::vector<char> data;
+    size_t used = 0;
   };
 
   char* allocate(size_t bytes)
   {
-    if (slabs_.empty() || slabs_.back().used + bytes > slabs_.back().capacity) {
+    if (slabs_.empty() || slabs_.back().used + bytes > slabs_.back().data.size()) {
       size_t capacity = std::max(bytes, next_slab_size_);
       slab_t slab;
-      slab.data     = std::make_unique<char[]>(capacity);
-      slab.capacity = capacity;
+      slab.data.resize(capacity);
       slabs_.push_back(std::move(slab));
       next_slab_size_ = std::max(next_slab_size_ * 2, capacity);
     }
     slab_t& slab = slabs_.back();
-    char* ptr    = slab.data.get() + slab.used;
+    char* ptr    = slab.data.data() + slab.used;
     slab.used += bytes;
     return ptr;
   }
 
   std::vector<slab_t> slabs_;
-  size_t next_slab_size_ = 64 * 1024;
+  size_t next_slab_size_ = 64 * KiB;
 };
 
+// returns the hash table partition to use for a given hash
 static inline size_t row_hash_partition_for(uint32_t hash)
 {
   return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS));
 }
 
-// =============================================================================
-// RAII Timer for profiling with deferred output
-// =============================================================================
-
-struct TimerEntry {
+struct timer_entry_t {
   const char* name;
   double elapsed_ms;
   size_t rss_kb;
   size_t hwm_kb;
-  size_t compressed_bytes;
-};
-
-static std::atomic_size_t& get_timer_compressed_bytes()
-{
-  static std::atomic_size_t compressed_bytes{0};
-  return compressed_bytes;
-}
-
-class timer_io_context_t {
- public:
-  explicit timer_io_context_t(size_t compressed_bytes)
-    : old_compressed_bytes_(
-        get_timer_compressed_bytes().exchange(compressed_bytes, std::memory_order_acq_rel))
-  {
-  }
-
-  ~timer_io_context_t()
-  {
-    get_timer_compressed_bytes().store(old_compressed_bytes_, std::memory_order_release);
-  }
-
-  timer_io_context_t(const timer_io_context_t&)            = delete;
-  timer_io_context_t& operator=(const timer_io_context_t&) = delete;
-
- private:
-  size_t old_compressed_bytes_ = 0;
 };
 
-static size_t parse_status_kb_line(const char* line, const char* key)
+static std::vector<timer_entry_t>& get_timer_buffer()
 {
-  size_t key_len = std::strlen(key);
-  if (std::strncmp(line, key, key_len) != 0) { return 0; }
-  const char* p = line + key_len;
-  while (*p == ' ' || *p == '\t') {
-    ++p;
-  }
-  size_t value = 0;
-  while (*p >= '0' && *p <= '9') {
-    value = value * 10 + (size_t)(*p - '0');
-    ++p;
-  }
-  return value;
-}
-
-static std::pair<size_t, size_t> current_process_rss_kb()
-{
-  FILE* file = std::fopen("/proc/self/status", "r");
-  if (file == nullptr) { return {0, 0}; }
-
-  size_t rss_kb = 0;
-  size_t hwm_kb = 0;
-  char line[256];
-  while (std::fgets(line, sizeof(line), file) != nullptr) {
-    if (rss_kb == 0) { rss_kb = parse_status_kb_line(line, "VmRSS:"); }
-    if (hwm_kb == 0) { hwm_kb = parse_status_kb_line(line, "VmHWM:"); }
-    if (rss_kb != 0 && hwm_kb != 0) { break; }
-  }
-  std::fclose(file);
-  return {rss_kb, hwm_kb};
-}
-
-static std::vector<TimerEntry>& get_timer_buffer()
-{
-  static std::vector<TimerEntry> buffer;
+  static std::vector<timer_entry_t> buffer;
   buffer.reserve(100);
   return buffer;
 }
@@ -213,26 +165,16 @@ static void flush_timers()
   auto& buffer = get_timer_buffer();
   for (const auto& entry : buffer) {
     std::fprintf(stderr,
-                 "[TIMER] %s: %.3f ms rss_GB=%.3f hwm_GB=%.3f compressed_GB=%.3f\n",
+                 "[TIMER] %s: %.3f ms rss_GB=%.3f hwm_GB=%.3f\n",
                  entry.name,
                  entry.elapsed_ms,
-                 (double)entry.rss_kb / (1024.0 * 1024.0),
-                 (double)entry.hwm_kb / (1024.0 * 1024.0),
-                 (double)entry.compressed_bytes / (1024.0 * 1024.0 * 1024.0));
+                 (double)entry.rss_kb / (double)(GiB / KiB),
+                 (double)entry.hwm_kb / (double)(GiB / KiB));
   }
   buffer.clear();
 #endif
 }
 
-static size_t system_page_size()
-{
-  static size_t page_size = [] {
-    long value = sysconf(_SC_PAGESIZE);
-    return value > 0 ? (size_t)value : (size_t)4096;
-  }();
-  return page_size;
-}
-
 enum class materialize_touch_t {
   write_2mb,
   write_4kb,
@@ -248,7 +190,7 @@ static void materialize_hugepages(const char* label,
   (void)label;
   if (data == nullptr || bytes == 0) return;
 
-  constexpr size_t two_mb = 2 * 1024 * 1024;
+  constexpr size_t two_mb = 2 * MiB;
   size_t page_size        = system_page_size();
   uintptr_t start         = reinterpret_cast<uintptr_t>(data);
   uintptr_t end           = start + bytes;
@@ -257,10 +199,10 @@ static void materialize_hugepages(const char* label,
   size_t aligned_bytes    = (size_t)(aligned_end - aligned_start);
 
   errno = 0;
-  madvise(reinterpret_cast<void*>(aligned_start), aligned_bytes, MADV_HUGEPAGE);
+  madvise((void*)(aligned_start), aligned_bytes, MADV_HUGEPAGE);
 
   size_t step        = touch == materialize_touch_t::write_2mb ? two_mb : page_size;
-  volatile char* ptr = reinterpret_cast<volatile char*>(data);
+  volatile char* ptr = (volatile char*)(data);
   for (size_t offset = 0; offset < bytes; offset += step) {
     ptr[offset] = ptr[offset];
   }
@@ -297,10 +239,9 @@ class scoped_timer_t {
     double elapsed_ms = std::chrono::duration<double, std::milli>(end - start_).count();
     nvtx_.end();
     if (accumulator_) { *accumulator_ += elapsed_ms; }
-    auto [rss_kb, hwm_kb]   = current_process_rss_kb();
-    size_t compressed_bytes = get_timer_compressed_bytes().load(std::memory_order_acquire);
+    auto [rss_kb, hwm_kb] = current_process_rss_kb();
     std::lock_guard<std::mutex> lock(get_timer_mutex());
-    get_timer_buffer().push_back({name_, elapsed_ms, rss_kb, hwm_kb, compressed_bytes});
+    get_timer_buffer().push_back({name_, elapsed_ms, rss_kb, hwm_kb});
 #endif
   }
 
@@ -313,11 +254,27 @@ class scoped_timer_t {
 #endif
   double* accumulator_;
 #ifdef MPS_FAST_TIMERS
-  nvtx::scoped_range nvtx_;
+  nvtx::scoped_range_t nvtx_;
   std::chrono::high_resolution_clock::time_point start_;
 #endif
 };
 
+// Scoped override of OpenMP's max-active-levels value; the previous value is restored on exit.
+class omp_max_active_levels_guard_t {
+ public:
+  explicit omp_max_active_levels_guard_t(int value) : previous_(omp_get_max_active_levels())
+  {
+    omp_set_max_active_levels(value);
+  }
+  ~omp_max_active_levels_guard_t() { omp_set_max_active_levels(previous_); }
+  // non-copyable: each guard restores the value it captured exactly once
+  omp_max_active_levels_guard_t& operator=(const omp_max_active_levels_guard_t&) = delete;
+  omp_max_active_levels_guard_t(const omp_max_active_levels_guard_t&)            = delete;
+
+ private:
+  int previous_ = 0;
+};
+
 static inline void error_unknown_row(cursor_t& cursor, const char* row_start, const char* section)
 {
   const char* row_end = row_start;
@@ -327,29 +284,17 @@ static inline void error_unknown_row(cursor_t& cursor, const char* row_start, co
   cursor.error("unknown row name in %s: %.*s", section, (int)(row_end - row_start), row_start);
 }
 
-// =============================================================================
-// Parsing state shared across section parsers
-// =============================================================================
-
-static inline size_t next_power_of_2(size_t n)
-{
-  if (n == 0) return 1;
-  n--;
-  n |= n >> 1;
-  n |= n >> 2;
-  n |= n >> 4;
-  n |= n >> 8;
-  n |= n >> 16;
-  n |= n >> 32;
-  return n + 1;
-}
-
-enum class row_index_mode_t {
+// Two modes for row/column name lookup:
+// - hash: arbitrary names via hash table (rows) or var_names_map (columns)
+// - dense_ordered: sequential numeric suffixes like R0001/R0002 or V0/V1
+enum class index_mode_t {
   hash,
   dense_ordered,
 };
 
-static inline bool is_decimal_digit(char c) { return (unsigned)(c - '0') <= 9; }
+// Every 19-digit decimal string fits in uint64_t; 20+ digits may not and are wildly unlikely in the
+// context of dense MPS rows/cols
+static constexpr size_t dense_suffix_max_digits = 19;
 
 static inline size_t decimal_digits_u64(uint64_t value)
 {
@@ -367,39 +312,144 @@ static inline bool parse_trailing_u64(std::string_view name,
                                       size_t& suffix_width)
 {
   size_t pos = name.size();
-  while (pos > 0 && is_decimal_digit(name[pos - 1])) {
+  while (pos > 0 && fp64::is_digit(name[pos - 1])) {
     pos--;
   }
   if (pos == name.size()) { return false; }
 
+  suffix_width = name.size() - pos;
+  if (suffix_width > dense_suffix_max_digits) { return false; }
+
   uint64_t parsed = 0;
   for (size_t i = pos; i < name.size(); ++i) {
-    uint64_t digit = (uint64_t)(name[i] - '0');
-    if (parsed > (std::numeric_limits<uint64_t>::max() - digit) / 10) { return false; }
-    parsed = parsed * 10 + digit;
+    parsed = parsed * 10 + (uint64_t)(name[i] - '0');
   }
 
-  prefix       = std::string_view(name.data(), pos);
-  value        = parsed;
-  suffix_width = name.size() - pos;
+  prefix = std::string_view(name.data(), pos);
+  value  = parsed;
   return true;
 }
 
+// true when a multi-digit suffix starts with '0'; needed for padded runs like R0001, ..., R2000
 static inline bool dense_suffix_is_zero_padded(std::string_view name, size_t suffix_width)
 {
   return suffix_width > 1 && name[name.size() - suffix_width] == '0';
 }
 
-static inline bool dense_suffix_width_ok(uint64_t value,
-                                         size_t suffix_width,
-                                         bool zero_padded,
-                                         size_t pad_width)
+static inline size_t dense_initial_pad_width(std::string_view name, size_t suffix_width)
+{  // a multi-digit suffix starting with '0' fixes the pad width; anything else means 0
+  return (suffix_width > 1 && name[name.size() - suffix_width] == '0') ? suffix_width : 0;
+}
+
+static inline bool dense_suffix_width_ok(uint64_t value, size_t suffix_width, size_t pad_width)
 {
   size_t digits         = decimal_digits_u64(value);
-  size_t expected_width = zero_padded ? std::max(pad_width, digits) : digits;
+  size_t expected_width = std::max(pad_width, digits);
   return suffix_width == expected_width;
 }
 
+// Dense run of names of the form <prefix><id> with min_id <= id <= max_id. A non-zero
+// pad_width means the id is left-padded with '0' to at least that many digits.
+struct dense_name_index_t {
+  std::string prefix;
+  uint64_t min_id  = 0;
+  uint64_t max_id  = 0;
+  size_t pad_width = 0;
+
+  // puts every field back to its default-constructed value
+  void reset()
+  {
+    pad_width = 0;
+    max_id    = 0;
+    min_id    = 0;
+    prefix.clear();
+  }
+
+  // forwards to dense_suffix_width_ok() with this index's pad_width
+  bool suffix_width_ok(uint64_t value, size_t suffix_width) const
+  {
+    return dense_suffix_width_ok(value, suffix_width, pad_width);
+  }
+
+  // offset of `name` from min_id, or SIZE_MAX when the prefix, suffix width or id range
+  // does not match this run
+  size_t lookup(std::string_view name) const
+  {
+    std::string_view head;
+    uint64_t id   = 0;
+    size_t digits = 0;
+    const bool hit = parse_trailing_u64(name, head, id, digits) && head == prefix &&
+                     suffix_width_ok(id, digits) && min_id <= id && id <= max_id;
+    return hit ? (size_t)(id - min_id) : SIZE_MAX;
+  }
+
+  // writes the name for offset `idx` (id = min_id + idx) into `out`, replacing its contents
+  void format_name(size_t idx, std::string& out) const
+  {
+    const uint64_t id = min_id + idx;
+    char digits_buf[32];
+    const auto conv = std::to_chars(digits_buf, digits_buf + sizeof(digits_buf), id);
+
+    out.assign(prefix);
+    if (conv.ec != std::errc()) { return; }  // conversion failed: emit the bare prefix
+
+    const size_t digits_len = (size_t)(conv.ptr - digits_buf);
+    // zero-fill up to pad_width, then the digits themselves
+    if (pad_width > digits_len) { out.append(pad_width - digits_len, '0'); }
+    out.append(digits_buf, digits_len);
+  }
+};
+
+struct dense_observe_state_t {  // presumably the state observe_dense_name() updates — confirm
+  bool candidate = true;  // NOTE(review): looks like "still possibly dense"; verify at call sites
+  dense_name_index_t index;
+  size_t count = 0;  // NOTE(review): likely the number of names observed so far — confirm
+};
+
+// Feeds one more name into a dense-naming check. `candidate` is cleared, and never set back
+// here, once a name lacks a numeric suffix, changes prefix, misses `expected_id` (when given)
+// or has an unexpected suffix width. The first name seeds prefix/min_id/max_id/pad_width.
+// NOTE(review): with `expected_id` left at its default, ids are not checked for being
+// consecutive in this function; max_id just tracks the last id seen. Confirm callers validate.
+static inline void observe_dense_name(bool& candidate,
+                                      dense_name_index_t& index,
+                                      size_t& observed_count,
+                                      std::string_view name,
+                                      uint64_t expected_id = std::numeric_limits<uint64_t>::max())
+{
+  if (!candidate) { return; }
+
+  std::string_view head;
+  uint64_t id   = 0;
+  size_t digits = 0;
+  if (!parse_trailing_u64(name, head, id, digits)) {
+    candidate = false;
+    return;
+  }
+
+  const bool is_first = observed_count == 0;
+  if (is_first) {
+    // seed the index from the first name; no expected_id check is applied to it
+    index.prefix.assign(head);
+    index.pad_width = dense_initial_pad_width(name, digits);
+    index.min_id    = id;
+    index.max_id    = id;
+    observed_count  = 1;
+    return;
+  }
+
+  // the largest uint64_t value doubles as "no expectation"
+  const bool has_expectation = expected_id != std::numeric_limits<uint64_t>::max();
+  const bool id_mismatch     = has_expectation && id != expected_id;
+  if (head != index.prefix || id_mismatch || !index.suffix_width_ok(id, digits)) {
+    candidate = false;
+    return;
+  }
+
+  index.max_id = id;
+  ++observed_count;
+}
+
 template <typename i_t, typename f_t>
 struct parse_state_t {
   struct row_hash_partition_t {
@@ -411,42 +461,36 @@ struct parse_state_t {
   cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& problem;
   cursor_t& cursor;
 
-  // Temporary string_view storage (points into input buffer, no allocation)
+  // backed by the input buffer
   std::vector<std::string_view> row_names_sv;
+  // backed by the arena allocator
   std::vector<std::string_view> var_names_sv;
   std::vector<chunk_name_arena_t> var_name_arenas;
   std::string_view problem_name_sv;
   std::string_view objective_name_sv;
-  std::vector<std::string_view> ignored_objective_names_sv;
+  // secondary 'N' rows in ROWS — rare; membership distinguishes them from unknown row names
+  std::unordered_set<std::string_view> ignored_objective_names;
 
-  // Optional dense ordered column index for labels like V0, V1, ...
-  bool col_dense_ordered = false;
-  std::string col_dense_prefix_storage;
-  std::string_view col_dense_prefix;
-  uint64_t col_dense_min_id  = 0;
-  uint64_t col_dense_max_id  = 0;
-  size_t col_dense_pad_width = 0;
-  bool col_dense_zero_padded = false;
+  // Column name lookup for labels like V0, V1, ...
+  index_mode_t col_index_mode = index_mode_t::hash;
+  dense_name_index_t col_dense;
 
   // Row name hash table - sized at runtime based on row count
   size_t row_hash_buckets = 0;
   size_t row_hash_mask    = 0;  // buckets - 1, for fast modulo via &
   mmap_region_t row_hash_region;
-  hash_slot_var_t* row_names_ht                                                 = nullptr;
+  hash_slot_var_t* row_names_ht = nullptr;
+  // compute hash, select the subtable from high hash bits,
+  // then run the same open-addressing probe loop inside that subtable.
   size_t row_hash_partition_count                                               = 0;
   std::array<row_hash_partition_t, MPS_ROW_HASH_PARTITIONS> row_hash_partitions = {};
-  // Overflow map for row names longer than HASH_KEY_BYTES
+  // Overflow map for row names longer than HASH_KEY_BYTES (usually very rare)
   std::unordered_map<std::string_view, size_t> row_names_long;
 
-  // Optional dense ordered row index for labels like R0001, R0002, ...
-  row_index_mode_t row_index_mode = row_index_mode_t::hash;
-  bool row_dense_candidate        = true;
-  std::string_view row_dense_prefix;
-  uint64_t row_dense_min_id  = 0;
-  uint64_t row_dense_max_id  = 0;
-  uint64_t row_dense_base_id = 0;
-  size_t row_dense_pad_width = 0;
-  bool row_dense_zero_padded = false;
+  // Row name lookup for labels like R0001, R0002, ...
+  index_mode_t row_index_mode = index_mode_t::hash;
+  bool row_dense_candidate    = true;
+  dense_name_index_t row_dense;
 
   // var_names still uses STL (only used in parse_bounds, not as hot)
   std::unordered_map<std::string_view, size_t> var_names_map;
@@ -457,7 +501,7 @@ struct parse_state_t {
     char type = 'C';
   };
 
-  // Some writers introduce zero-column variables only in BOUNDS.
+  // some writers introduce zero-column variables only in BOUNDS.
   std::map<std::string_view, bounds_only_var_t> bounds_only_vars;
 
   parse_state_t(cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& p, cursor_t& c)
@@ -471,77 +515,13 @@ struct parse_state_t {
     init_row_hash_table_impl();
   }
 
-  bool row_dense_has_expected_width(uint64_t value, size_t suffix_width) const
-  {
-    return dense_suffix_width_ok(value, suffix_width, row_dense_zero_padded, row_dense_pad_width);
-  }
-
-  bool col_dense_has_expected_width(uint64_t value, size_t suffix_width) const
-  {
-    return dense_suffix_width_ok(value, suffix_width, col_dense_zero_padded, col_dense_pad_width);
-  }
-
-  bool is_ignored_objective_name(std::string_view name) const
-  {
-    return std::find(ignored_objective_names_sv.begin(), ignored_objective_names_sv.end(), name) !=
-           ignored_objective_names_sv.end();
-  }
-
-  void add_ignored_objective_name(std::string_view name)
-  {
-    if (name == objective_name_sv || is_ignored_objective_name(name)) { return; }
-    ignored_objective_names_sv.push_back(name);
-  }
-
   void observe_objective_row_name(std::string_view name)
   {
     if (objective_name_sv.empty()) {
       objective_name_sv = name;
-    } else {
-      add_ignored_objective_name(name);
-    }
-  }
-
-  void observe_row_name_for_dense_index(std::string_view name, size_t row_index)
-  {
-    if (!row_dense_candidate) { return; }
-
-    std::string_view prefix;
-    uint64_t value      = 0;
-    size_t suffix_width = 0;
-    if (!parse_trailing_u64(name, prefix, value, suffix_width)) {
-      row_dense_candidate = false;
-      return;
-    }
-
-    if (row_index == 0) {
-      row_dense_prefix      = prefix;
-      row_dense_min_id      = value;
-      row_dense_max_id      = value;
-      row_dense_base_id     = value;
-      row_dense_pad_width   = suffix_width;
-      row_dense_zero_padded = dense_suffix_is_zero_padded(name, suffix_width);
-      return;
-    }
-
-    if (prefix != row_dense_prefix) {
-      row_dense_candidate = false;
-      return;
+    } else if (name != objective_name_sv) {
+      ignored_objective_names.insert(name);
     }
-
-    if (row_dense_base_id > std::numeric_limits<uint64_t>::max() - row_index) {
-      row_dense_candidate = false;
-      return;
-    }
-
-    uint64_t expected = row_dense_base_id + row_index;
-    if (value != expected || !row_dense_has_expected_width(value, suffix_width)) {
-      row_dense_candidate = false;
-      return;
-    }
-
-    row_dense_min_id = std::min(row_dense_min_id, value);
-    row_dense_max_id = std::max(row_dense_max_id, value);
   }
 
   bool init_row_dense_ordered_table()
@@ -549,23 +529,22 @@ struct parse_state_t {
     scoped_timer_t timer("row_dense_finalize");
     size_t n_rows = row_names_sv.size();
     if (!row_dense_candidate || n_rows == 0) { return false; }
-    if (row_dense_max_id < row_dense_min_id) { return false; }
-    uint64_t dense_count = row_dense_max_id - row_dense_min_id + 1;
+    if (row_dense.max_id < row_dense.min_id) { return false; }
+    uint64_t dense_count = row_dense.max_id - row_dense.min_id + 1;
     if (dense_count != n_rows) { return false; }
 
-    row_index_mode = row_index_mode_t::dense_ordered;
+    row_index_mode = index_mode_t::dense_ordered;
     return true;
   }
 
   size_t row_hash_bucket_count_for(size_t n_rows) const
   {
 #ifdef MPS_FAST_COMPACT_ROW_HASH
-    // Keep the row hash compact. Probe counts are usually low, and a smaller
+    // probe counts are usually low, and a smaller
     // table reduces cache/TLB footprint on medium instances.
-    return next_power_of_2(std::max(n_rows + n_rows / 2, (size_t)64));
+    return cuda::next_power_of_two(std::max(n_rows + n_rows / 2, (size_t)64));
 #else
-    // Original conservative sizing policy.
-    return next_power_of_2(std::max((size_t)(n_rows * 2), (size_t)64));
+    return cuda::next_power_of_two(std::max((size_t)(n_rows * 2), (size_t)64));
 #endif
   }
 
@@ -582,11 +561,13 @@ struct parse_state_t {
 
     if (use_partitioned) {
       scoped_timer_t timer("row_hash_partition_metadata");
+      // Pre-hash once, count rows per partition, then pack row indices by partition.
+      // This turns the build into disjoint single-writer table fills.
       row_hashes.resize(n_rows);
       size_t inline_rows = 0;
       for (size_t idx = 0; idx < n_rows; ++idx) {
         std::string_view name = row_names_sv[idx];
-        if (__unlikely(name.size() > HASH_KEY_BYTES)) {
+        if (UNLIKELY(name.size() > HASH_KEY_BYTES)) {
           row_names_long[name] = idx;
           continue;
         }
@@ -603,7 +584,7 @@ struct parse_state_t {
       row_order.resize(inline_rows);
       auto next_offsets = partition_offsets;
       for (size_t idx = 0; idx < n_rows; ++idx) {
-        if (__unlikely(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; }
+        if (UNLIKELY(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; }
         size_t part                     = row_hash_partition_for(row_hashes[idx]);
         row_order[next_offsets[part]++] = idx;
       }
@@ -639,7 +620,7 @@ struct parse_state_t {
           next_slots += row_hash_partitions[p].buckets;
         }
       }
-      // Request huge pages to reduce TLB misses
+      // request huge pages to reduce TLB misses
       row_hash_region.advise(MADV_HUGEPAGE);
     }
 
@@ -666,6 +647,7 @@ struct parse_state_t {
         std::vector<size_t> partition_total_probes(MPS_ROW_HASH_PARTITIONS, 0);
         std::vector<size_t> partition_max_probes(MPS_ROW_HASH_PARTITIONS, 0);
 #endif
+// initialize the row hash tables in parallel
 #pragma omp parallel for schedule(static) num_threads(num_threads)
         for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) {
           size_t p = (size_t)part_id;
@@ -675,6 +657,7 @@ struct parse_state_t {
           size_t local_max_probes   = 0;
 #endif
           const auto& part = row_hash_partitions[p];
+          // Each worker owns its subtable, so row_insert_into remains the plain serial probe loop.
           for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) {
             size_t idx = row_order[pos];
 #ifdef MPS_FAST_PERF_COUNTERS
@@ -745,24 +728,9 @@ struct parse_state_t {
 #endif
   }
 
-  size_t row_lookup_dense_ordered(std::string_view name) const
-  {
-    std::string_view prefix;
-    uint64_t value      = 0;
-    size_t suffix_width = 0;
-    if (!parse_trailing_u64(name, prefix, value, suffix_width)) { return SIZE_MAX; }
-    if (prefix != row_dense_prefix || !row_dense_has_expected_width(value, suffix_width)) {
-      return SIZE_MAX;
-    }
-    if (value < row_dense_min_id || value > row_dense_max_id) { return SIZE_MAX; }
-    return (size_t)(value - row_dense_min_id);
-  }
-
   size_t row_lookup(std::string_view name) const
   {
-    if (__likely(row_index_mode == row_index_mode_t::dense_ordered)) {
-      return row_lookup_dense_ordered(name);
-    }
+    if (LIKELY(row_index_mode == index_mode_t::dense_ordered)) { return row_dense.lookup(name); }
     return row_lookup_hash(name);
   }
 
@@ -771,10 +739,10 @@ struct parse_state_t {
     const char* start = cursor.ptr;
     const char* p     = start;
 
-    size_t prefix_len = row_dense_prefix.size();
+    size_t prefix_len = row_dense.prefix.size();
     if (prefix_len > 0) {
       if ((size_t)(cursor.end - p) < prefix_len ||
-          std::memcmp(p, row_dense_prefix.data(), prefix_len) != 0) {
+          std::memcmp(p, row_dense.prefix.data(), prefix_len) != 0) {
         cursor.read_field();
         return SIZE_MAX;
       }
@@ -783,21 +751,12 @@ struct parse_state_t {
 
     const char* digits_start = p;
     uint64_t value           = 0;
-    while (p < cursor.end && is_decimal_digit(*p)) {
-      uint64_t digit = (uint64_t)(*p - '0');
-      if (value > (std::numeric_limits<uint64_t>::max() - digit) / 10) {
-        cursor.ptr = start;
-        cursor.read_field();
-        return SIZE_MAX;
-      }
-      value = value * 10 + digit;
-      p++;
-    }
+    fp64::parse_u64_digits_advance(p, cursor.end, value);
 
     size_t suffix_width = (size_t)(p - digits_start);
-    if (suffix_width == 0 || p >= cursor.end || *p > ' ' ||
-        !row_dense_has_expected_width(value, suffix_width) || value < row_dense_min_id ||
-        value > row_dense_max_id) {
+    if (suffix_width == 0 || suffix_width > dense_suffix_max_digits || p >= cursor.end ||
+        *p > ' ' || !row_dense.suffix_width_ok(value, suffix_width) || value < row_dense.min_id ||
+        value > row_dense.max_id) {
       cursor.ptr = start;
       cursor.read_field();
       return SIZE_MAX;
@@ -805,12 +764,12 @@ struct parse_state_t {
 
     cursor.ptr = p;
     cursor.skip_ws();
-    return (size_t)(value - row_dense_min_id);
+    return (size_t)(value - row_dense.min_id);
   }
 
   size_t read_row_lookup(cursor_t& cursor) const
   {
-    if (__likely(row_index_mode == row_index_mode_t::dense_ordered)) {
+    if (LIKELY(row_index_mode == index_mode_t::dense_ordered)) {
       return read_row_lookup_dense_ordered(cursor);
     }
 
@@ -820,13 +779,14 @@ struct parse_state_t {
 
   size_t row_lookup_hash(std::string_view name) const
   {
-    if (__unlikely(name.size() > HASH_KEY_BYTES)) {
+    if (UNLIKELY(name.size() > HASH_KEY_BYTES)) {
       auto it = row_names_long.find(name);
       return it != row_names_long.end() ? it->second : SIZE_MAX;
     }
     hash_key_t key = make_key(name.data(), name.size());
     uint32_t hash  = fnv1a_hash(name.data(), name.size());
-    if (__likely(row_hash_partition_count != 0)) {
+    if (LIKELY(row_hash_partition_count != 0)) {
+      // Lookups mirror the build routing and probe only the selected subtable.
       const auto& part = row_hash_partitions[row_hash_partition_for(hash)];
       return row_lookup_in(part.slots, part.buckets, part.mask, key, hash);
     }
@@ -845,43 +805,9 @@ struct parse_state_t {
     return SIZE_MAX;
   }
 
-  size_t col_lookup_dense_ordered(std::string_view name) const
-  {
-    std::string_view prefix;
-    uint64_t value      = 0;
-    size_t suffix_width = 0;
-    if (!parse_trailing_u64(name, prefix, value, suffix_width)) { return SIZE_MAX; }
-    if (prefix != col_dense_prefix || !col_dense_has_expected_width(value, suffix_width)) {
-      return SIZE_MAX;
-    }
-    if (value < col_dense_min_id || value > col_dense_max_id) { return SIZE_MAX; }
-    return (size_t)(value - col_dense_min_id);
-  }
-
-  void dense_col_name(size_t idx, std::string& out) const
-  {
-    uint64_t value = col_dense_min_id + idx;
-    char digits_buf[32];
-    auto [digits_end, ec] = std::to_chars(digits_buf, digits_buf + sizeof(digits_buf), value);
-    if (ec != std::errc()) {
-      out.assign(col_dense_prefix);
-      return;
-    }
-    size_t digits_len = (size_t)(digits_end - digits_buf);
-    size_t width = col_dense_zero_padded ? std::max(col_dense_pad_width, digits_len) : digits_len;
-    out.resize(col_dense_prefix.size() + width);
-    std::memcpy(out.data(), col_dense_prefix.data(), col_dense_prefix.size());
-    char* suffix = out.data() + col_dense_prefix.size();
-    if (width > digits_len) {
-      std::memset(suffix, '0', width - digits_len);
-      suffix += width - digits_len;
-    }
-    std::memcpy(suffix, digits_buf, digits_len);
-  }
-
   size_t row_insert(std::string_view name, size_t index)
   {
-    if (__unlikely(name.size() > HASH_KEY_BYTES)) {
+    if (UNLIKELY(name.size() > HASH_KEY_BYTES)) {
       row_names_long[name] = index;
       return 0;
     }
@@ -906,7 +832,8 @@ struct parse_state_t {
       if (slot >= &slots[buckets]) { slot = &slots[0]; }
       if (slot->count == 0) {
         key_store(slot->key, key);            // Writes 32 bytes, including garbage in last 4
-        slot->count = (uint32_t)(index + 1);  // Overwrite last 4 bytes with actual count
+        slot->count = (uint32_t)(index + 1);  // Overwrite last 4 bytes with actual count. i trust
+                                              // the compiler to optimize this
         return i + 1;
       }
       if (key_cmpeq(slot->key, key)) {
@@ -914,7 +841,8 @@ struct parse_state_t {
         return i + 1;
       }
     }
-    __builtin_trap();
+    // can't happen, the table is properly sized to fit all rows
+    __builtin_unreachable();
   }
 };
 
@@ -922,31 +850,13 @@ struct parse_state_t {
 // Section parsers
 // =============================================================================
 
-static std::string_view read_rest_of_line_trimmed(cursor_t& cursor)
-{
-  const char* begin = cursor.ptr;
-  const char* end   = begin;
-  while (end < cursor.end && *end != '\n' && *end != '\r') {
-    ++end;
-  }
-
-  while (begin < end && (*begin == ' ' || *begin == '\t')) {
-    ++begin;
-  }
-  while (end > begin && (end[-1] == ' ' || end[-1] == '\t')) {
-    --end;
-  }
-  cursor.ptr = end;
-  return std::string_view(begin, (size_t)(end - begin));
-}
-
 template <typename i_t, typename f_t>
 static void parse_name_section(parse_state_t<i_t, f_t>& state)
 {
   scoped_timer_t timer("parse_name");
   if (peek(state.cursor) == "ROWS") { return; }
   expect(state.cursor, "NAME");
-  if (!state.cursor.eol()) { state.problem_name_sv = read_rest_of_line_trimmed(state.cursor); }
+  if (!state.cursor.eol()) { state.problem_name_sv = state.cursor.read_rest_of_line_trimmed(); }
   expect_eol(state.cursor);
 }
 
@@ -974,19 +884,18 @@ static void parse_objname_section(parse_state_t<i_t, f_t>& state)
 {
   scoped_timer_t timer("parse_objname");
   if (accept(state.cursor, "OBJNAME")) {
-    if (state.cursor.eol()) { expect_eol(state.cursor); }
-    state.objective_name_sv = state.cursor.read_field();
+    if (!state.cursor.eol()) { state.objective_name_sv = state.cursor.read_rest_of_line_trimmed(); }
     accept_comment(state.cursor);
     expect_eol(state.cursor);
   }
 }
 
-struct RowChunkBoundary {
+struct row_chunk_boundary_t {
   const char* start;
   const char* end;
 };
 
-struct RowChunkInfo {
+struct row_chunk_info_t {
   size_t constraints = 0;
   bool malformed     = false;
   std::vector<std::string_view> objective_names;
@@ -1007,7 +916,7 @@ static bool parse_rows_line_fast(const char*& p,
                                  char& row_type,
                                  std::string_view& row_name)
 {
-  p = cursor_t::simd_scan<true>(p, end);
+  p = cursor_t::simd_scan<skip_whitespace>(p, end);
   if (p >= end) { return false; }
   if (*p == '\n') {
     p++;
@@ -1019,26 +928,29 @@ static bool parse_rows_line_fast(const char*& p,
   }
 
   row_type = *p++;
-  p        = cursor_t::simd_scan<true>(p, end);
+  p        = cursor_t::simd_scan<skip_whitespace>(p, end);
 
   const char* name_start = p;
-  p                      = cursor_t::simd_scan<false>(p, end);
+  p                      = cursor_t::simd_scan<until_whitespace>(p, end);
   if (name_start == p) { return false; }
   row_name = std::string_view(name_start, (size_t)(p - name_start));
 
   // ROWS only uses fields 1-2. Fields 3-6 are ignored by the MPS spec, and
   // field 3 may start with '$' to comment the rest of the record.
+  // could be SIMD'd, but in practice the newline is right after the row name
   p = rows_find_next_line(p, end);
   return true;
 }
 
-static std::vector<RowChunkBoundary> compute_row_chunk_boundaries(const char* rows_start,
-                                                                  const char* rows_end,
-                                                                  int num_threads)
+// row chunks are established based on byte count, so a boundary can land in the middle of a row;
+// this snaps every chunk boundary onto a row-line boundary
+static std::vector<row_chunk_boundary_t> compute_row_chunk_boundaries(const char* rows_start,
+                                                                      const char* rows_end,
+                                                                      int num_threads)
 {
   scoped_timer_t timer("rows_compute_chunk_boundaries");
 
-  std::vector<RowChunkBoundary> boundaries((size_t)num_threads);
+  std::vector<row_chunk_boundary_t> boundaries((size_t)num_threads);
   size_t total_size = (size_t)(rows_end - rows_start);
   size_t chunk_size = total_size / (size_t)num_threads;
 
@@ -1057,6 +969,7 @@ static std::vector<RowChunkBoundary> compute_row_chunk_boundaries(const char* ro
   return boundaries;
 }
 
+// parses the ROWS section in per-thread chunks: counts rows, then scatters them in parallel
 template <typename i_t, typename f_t>
 static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
                                              const char* rows_start,
@@ -1066,7 +979,7 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
   scoped_timer_t timer("parse_rows_parallel");
 
   auto boundaries = compute_row_chunk_boundaries(rows_start, rows_end, num_threads);
-  std::vector<RowChunkInfo> infos((size_t)num_threads);
+  std::vector<row_chunk_info_t> infos((size_t)num_threads);
 
   {
     scoped_timer_t timer("rows_count_parallel");
@@ -1075,7 +988,7 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
       MPS_NVTX_RANGE(std::string("rows_count_chunk ") + std::to_string(t), nvtx::colors::rows);
       const char* p   = boundaries[(size_t)t].start;
       const char* end = boundaries[(size_t)t].end;
-      RowChunkInfo info;
+      row_chunk_info_t info;
 
       while (p < end) {
         char row_type = 0;
@@ -1104,10 +1017,12 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
     }
   }
 
-  for (const auto& info : infos) {
-    if (info.malformed) { return false; }
+  if (std::any_of(
+        infos.begin(), infos.end(), [](const row_chunk_info_t& info) { return info.malformed; })) {
+    return false;
   }
 
+  // prefix sum to do a parallel scatter of every row entry into the global output arrays
   std::vector<size_t> offsets((size_t)num_threads + 1, 0);
   {
     scoped_timer_t timer("rows_prefix_sum");
@@ -1133,7 +1048,7 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
   }
   for (const auto& info : infos) {
     for (std::string_view name : info.objective_names) {
-      state.add_ignored_objective_name(name);
+      if (name != state.objective_name_sv) { state.ignored_objective_names.insert(name); }
     }
   }
 
@@ -1141,7 +1056,6 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
   std::string_view dense_prefix;
   uint64_t dense_base_id = 0;
   size_t dense_pad_width = 0;
-  bool dense_zero_padded = false;
 
   if (dense_candidate) {
     std::string_view first_name;
@@ -1157,9 +1071,8 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
     if (!parse_trailing_u64(first_name, dense_prefix, first_value, first_suffix_width)) {
       dense_candidate = false;
     } else {
-      dense_base_id     = first_value;
-      dense_pad_width   = first_suffix_width;
-      dense_zero_padded = dense_suffix_is_zero_padded(first_name, first_suffix_width);
+      dense_base_id   = first_value;
+      dense_pad_width = dense_initial_pad_width(first_name, first_suffix_width);
     }
   }
 
@@ -1175,6 +1088,13 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
       size_t out      = offsets[(size_t)t];
 
       bool local_dense_ok = dense_candidate;
+      dense_name_index_t dense_index;
+      if (local_dense_ok) {
+        dense_index.prefix.assign(dense_prefix);
+        dense_index.min_id    = dense_base_id;
+        dense_index.max_id    = dense_base_id;
+        dense_index.pad_width = dense_pad_width;
+      }
 
       while (p < end) {
         char row_type = 0;
@@ -1194,14 +1114,9 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
         state.problem.row_types_[out] = row_type;
 
         if (local_dense_ok) {
-          std::string_view prefix;
-          uint64_t value      = 0;
-          size_t suffix_width = 0;
-          uint64_t expected   = dense_base_id + out;
-          local_dense_ok =
-            parse_trailing_u64(row_name, prefix, value, suffix_width) && prefix == dense_prefix &&
-            value == expected &&
-            dense_suffix_width_ok(value, suffix_width, dense_zero_padded, dense_pad_width);
+          size_t observed_count = out;
+          observe_dense_name(
+            local_dense_ok, dense_index, observed_count, row_name, dense_base_id + out);
         }
         out++;
       }
@@ -1217,12 +1132,10 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
     }
     state.row_dense_candidate = dense_candidate;
     if (dense_candidate) {
-      state.row_dense_prefix      = dense_prefix;
-      state.row_dense_min_id      = dense_base_id;
-      state.row_dense_max_id      = dense_base_id + total_rows - 1;
-      state.row_dense_base_id     = dense_base_id;
-      state.row_dense_pad_width   = dense_pad_width;
-      state.row_dense_zero_padded = dense_zero_padded;
+      state.row_dense.prefix.assign(dense_prefix);
+      state.row_dense.min_id    = dense_base_id;
+      state.row_dense.max_id    = dense_base_id + total_rows - 1;
+      state.row_dense.pad_width = dense_pad_width;
     }
   }
 
@@ -1238,9 +1151,6 @@ static void parse_rows_section_serial_impl(parse_state_t<i_t, f_t>& state, const
     auto row_type = state.cursor.ptr[0];
     state.cursor.advance(1);
     state.cursor.skip_ws();
-    // if (row_type != "E" && row_type != "L" && row_type != "G" && row_type != "N") {
-    //   state.cursor.error("expected E, L, G, or N, got '%s'", row_type.data());
-    // }
 
     auto row_name = state.cursor.read_field();
     // ROWS fields after the row name are unused; tolerate annotations/comments there.
@@ -1252,7 +1162,12 @@ static void parse_rows_section_serial_impl(parse_state_t<i_t, f_t>& state, const
     } else {
       size_t row_idx = state.row_names_sv.size();
       state.row_names_sv.push_back(row_name);
-      state.observe_row_name_for_dense_index(row_name, row_idx);
+      observe_dense_name(
+        state.row_dense_candidate,
+        state.row_dense,
+        row_idx,
+        row_name,
+        row_idx == 0 ? std::numeric_limits<uint64_t>::max() : state.row_dense.min_id + row_idx);
       state.problem.row_types_.push_back(row_type);
     }
     expect_eol(state.cursor);
@@ -1272,20 +1187,17 @@ static void parse_rows_section(parse_state_t<i_t, f_t>& state, const char* rows_
     size_t rows_bytes    = (size_t)(rows_end - state.cursor.ptr);
     int num_threads      = phase_thread_count(MPS_ROWS_THREAD_CAP);
     bool parsed_parallel = false;
-    if (rows_bytes >= 512ull * 1024ull * 1024ull && num_threads > 1) {
+    if (rows_bytes >= 512 * MiB && num_threads > 1) {
       parsed_parallel =
         parse_rows_section_parallel_impl<i_t, f_t>(state, state.cursor.ptr, rows_end, num_threads);
+      // serial fallback in case a likely malformed chunk has been encountered;
+      // re-parsing serially makes error reporting much easier
       if (!parsed_parallel) {
         state.row_names_sv.clear();
         state.problem.row_types_.clear();
-        state.row_dense_candidate   = true;
-        state.row_dense_prefix      = {};
-        state.row_dense_min_id      = 0;
-        state.row_dense_max_id      = 0;
-        state.row_dense_base_id     = 0;
-        state.row_dense_pad_width   = 0;
-        state.row_dense_zero_padded = false;
-        state.cursor.ptr            = rows_start;
+        state.row_dense_candidate = true;
+        state.row_dense.reset();
+        state.cursor.ptr = rows_start;
         parse_rows_section_serial_impl(state, rows_end);
       }
     } else {
@@ -1303,71 +1215,61 @@ static void parse_rows_section(parse_state_t<i_t, f_t>& state, const char* rows_
   }
 }
 
-// =============================================================================
-// Parallel COLUMNS parser
-// =============================================================================
+// Columns parser
 
-struct MarkerInfo {
+// integer variable markers
+struct marker_info_t {
   enum Type { INTORG, INTEND };
   Type type;
   size_t after_local_var_idx;  // SIZE_MAX means "before first variable"
 };
 
-struct RowCountBlock {
+struct row_count_block_t {
   size_t block_id       = 0;
   size_t storage_offset = 0;
 };
 
-struct DenseColChunkStats {
-  bool candidate = true;
-  std::string_view prefix;
-  uint64_t first_id = 0;
-  uint64_t last_id  = 0;
-  size_t pad_width  = 0;
-  bool zero_padded  = false;
-  size_t count      = 0;
-};
-
-struct ChunkResult {
+// Each column parsing worker owns chunks of the global CSC which are parsed in parallel and then
+// later scattered into the final CSR
+struct chunk_result_t {
   std::vector<double> values;
   std::vector<uint32_t> row_indices;
   std::vector<size_t> col_offsets;
   std::vector<std::string_view> var_names;
   chunk_name_arena_t var_name_arena;
-  std::vector<MarkerInfo> markers;
+  std::vector<marker_info_t> markers;
   std::vector<std::pair<size_t, double>> objective_entries;  // local_col_idx -> coefficient
-  // Sparse per-row scratch: each touched 4096-row block stores counts after parsing,
-  // then the same slots become CSR write cursors. This avoids scanning/allocating
-  // chunks*n_rows entries when a chunk only touches clustered row ranges. The
-  // block payloads live in one arena per chunk so scatter has hugepage-friendly
-  // write-position metadata instead of many independent 32 KiB allocations.
+  // COLUMNS is parsed as chunk-local CSC. To build the global CSR, each chunk needs row counts
+  // first, then row-local write cursors for scatter. Store those counts only for touched
+  // 4096-row blocks instead of allocating a dense chunks*n_rows matrix.
+  // The same slots are rewritten as write cursors after the global CSR row offsets are known.
   std::vector<int64_t> row_count_storage;
-  std::vector<RowCountBlock> row_count_blocks;
+  std::vector<row_count_block_t> row_count_blocks;
   std::vector<int32_t> row_count_block_dir;
-  std::string_view first_var_name;
-  std::string_view last_var_name;
-  DenseColChunkStats dense_col_stats;
+  dense_observe_state_t dense_col_stats;
 };
 
-struct ChunkBoundary {
+struct chunk_boundary_t {
   const char* start;
   const char* end;
 };
 
-struct BoundsChunkBoundary {
+struct bounds_chunk_boundary_t {
   const char* start;
   const char* end;
 };
 
-static inline int64_t& column_row_count_slot(ChunkResult& result, size_t row_idx)
+// returns the chunk's count slot for row_idx; counts are kept sparsely in 4096-row blocks
+// (allocated on first touch), which works well since nnzs are often clustered in the same blocks
+static inline int64_t& column_row_count_slot(chunk_result_t& result, size_t row_idx)
 {
   size_t block_id   = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
   size_t local      = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
   int32_t block_pos = result.row_count_block_dir[block_id];
-  if (__unlikely(block_pos < 0)) {
+  if (UNLIKELY(block_pos < 0)) {
     block_pos                            = (int32_t)result.row_count_blocks.size();
     result.row_count_block_dir[block_id] = block_pos;
-    RowCountBlock block;
+    row_count_block_t block;
     block.block_id       = block_id;
     block.storage_offset = result.row_count_storage.size();
     result.row_count_storage.resize(block.storage_offset + COLUMN_ROW_COUNT_BLOCK_ROWS, 0);
@@ -1377,68 +1279,17 @@ static inline int64_t& column_row_count_slot(ChunkResult& result, size_t row_idx
     .row_count_storage[result.row_count_blocks[(size_t)block_pos].storage_offset + local];
 }
 
-static void observe_dense_col_name(DenseColChunkStats& stats, std::string_view name)
-{
-  if (!stats.candidate) { return; }
-
-  std::string_view prefix;
-  uint64_t value      = 0;
-  size_t suffix_width = 0;
-  if (!parse_trailing_u64(name, prefix, value, suffix_width)) {
-    stats.candidate = false;
-    return;
-  }
-
-  if (stats.count == 0) {
-    stats.prefix      = prefix;
-    stats.first_id    = value;
-    stats.last_id     = value;
-    stats.pad_width   = suffix_width;
-    stats.zero_padded = dense_suffix_is_zero_padded(name, suffix_width);
-    stats.count       = 1;
-    return;
-  }
-
-  if (prefix != stats.prefix) {
-    stats.candidate = false;
-    return;
-  }
-  if (stats.last_id == std::numeric_limits<uint64_t>::max() || value != stats.last_id + 1) {
-    stats.candidate = false;
-    return;
-  }
-  if (!dense_suffix_width_ok(value, suffix_width, stats.zero_padded, stats.pad_width)) {
-    stats.candidate = false;
-    return;
-  }
-  stats.last_id = value;
-  stats.count++;
-}
-
-static bool dense_col_chunk_padding_compatible(const DenseColChunkStats& stats,
-                                               bool global_zero_padded,
+static bool dense_col_chunk_padding_compatible(const dense_observe_state_t& stats,
                                                size_t global_pad_width)
 {
-  if (global_zero_padded) {
-    return stats.pad_width == global_pad_width ||
-           (!stats.zero_padded && decimal_digits_u64(stats.first_id) >= global_pad_width);
+  if (global_pad_width > 0) {
+    return stats.index.pad_width == global_pad_width ||
+           (stats.index.pad_width == 0 &&
+            decimal_digits_u64(stats.index.min_id) >= global_pad_width);
   }
-  return !stats.zero_padded;
-}
-
-// Read first field (column name) from a line without modifying any state
-static std::string_view peek_line_column_name(const char* line_start, const char* end)
-{
-  const char* p = line_start;
-  while (p < end && *p <= ' ' && *p != '\n')
-    p++;
-  const char* field_start = p;
-  while (p < end && *p > ' ')
-    p++;
-  return std::string_view(field_start, (size_t)(p - field_start));
+  return stats.index.pad_width == 0;
 }
 
-// Find the start of the next line
 static const char* find_next_line(const char* p, const char* end)
 {
   while (p < end && *p != '\n')
@@ -1471,16 +1322,15 @@ static const char* find_line_start(const char* section_start, const char* p)
   return p;
 }
 
-static std::vector<BoundsChunkBoundary> compute_bounds_chunk_boundaries(const char* section_start,
-                                                                        const char* section_end,
-                                                                        int num_threads)
+static std::vector<bounds_chunk_boundary_t> compute_bounds_chunk_boundaries(
+  const char* section_start, const char* section_end, int num_threads)
 {
   scoped_timer_t timer("bounds_compute_chunk_boundaries");
 
   const size_t total_size = (size_t)(section_end - section_start);
   const size_t chunk_size = total_size / (size_t)num_threads;
 
-  std::vector<BoundsChunkBoundary> boundaries((size_t)num_threads);
+  std::vector<bounds_chunk_boundary_t> boundaries((size_t)num_threads);
   boundaries[0].start = section_start;
   for (int t = 0; t < num_threads; ++t) {
     if (t == num_threads - 1) {
@@ -1506,19 +1356,17 @@ static std::vector<BoundsChunkBoundary> compute_bounds_chunk_boundaries(const ch
   return boundaries;
 }
 
-static std::vector<ChunkBoundary> compute_chunk_boundaries(const char* columns_start,
-                                                           const char* columns_end,
-                                                           int num_threads)
+static std::vector<chunk_boundary_t> compute_chunk_boundaries(const char* columns_start,
+                                                              const char* columns_end,
+                                                              int num_threads)
 {
   scoped_timer_t timer("compute_chunk_boundaries");
 
   size_t total_size = (size_t)(columns_end - columns_start);
   size_t chunk_size = total_size / (size_t)num_threads;
 
-  std::vector<ChunkBoundary> boundaries(num_threads);
+  std::vector<chunk_boundary_t> boundaries(num_threads);
 
-  // Parallel boundary finding - each thread finds its own end at a column transition
-  // #pragma omp parallel for
   for (int t = 0; t < num_threads; t++) {
     if (t == 0) { boundaries[t].start = columns_start; }
 
@@ -1533,7 +1381,7 @@ static std::vector<ChunkBoundary> compute_chunk_boundaries(const char* columns_s
       if (line_start < columns_end) line_start++;
 
       // Read column name at this line
-      std::string_view col_name = peek_line_column_name(line_start, columns_end);
+      std::string_view col_name = cursor_t::peek_field_at(line_start, columns_end);
 
       // Scan forward until column name changes (to avoid splitting a column)
       const char* boundary = line_start;
@@ -1541,9 +1389,9 @@ static std::vector<ChunkBoundary> compute_chunk_boundaries(const char* columns_s
         const char* next_line = find_next_line(boundary, columns_end);
         if (next_line >= columns_end) break;
 
-        std::string_view next_col = peek_line_column_name(next_line, columns_end);
+        std::string_view next_col = cursor_t::peek_field_at(next_line, columns_end);
         if (next_col != col_name && !next_col.empty() && next_col[0] != '\'') {
-          // Found a column transition (and it's not a MARKER line)
+          // Found a column transition. Marker-state fixup later handles any split near markers.
           boundary = next_line;
           break;
         }
@@ -1562,11 +1410,11 @@ static std::vector<ChunkBoundary> compute_chunk_boundaries(const char* columns_s
 }
 
 template <typename i_t, typename f_t>
-static ChunkResult parse_columns_chunk(const char* chunk_start,
-                                       const char* chunk_end,
-                                       const parse_state_t<i_t, f_t>& state)
+static chunk_result_t parse_columns_chunk(const char* chunk_start,
+                                          const char* chunk_end,
+                                          const parse_state_t<i_t, f_t>& state)
 {
-  ChunkResult result;
+  chunk_result_t result;
 
   if (chunk_start >= chunk_end) {
     result.col_offsets.push_back(0);
@@ -1576,7 +1424,7 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
   size_t chunk_size     = (size_t)(chunk_end - chunk_start);
   size_t estimated_nnz  = chunk_size / 100;
   size_t estimated_cols = estimated_nnz / 10;
-  if (__unlikely(state.problem.n_constraints_ > (i_t)std::numeric_limits<int32_t>::max())) {
+  if (UNLIKELY(state.problem.n_constraints_ > (i_t)std::numeric_limits<int32_t>::max())) {
     state.cursor.error("fast COLUMNS path requires <= INT32_MAX rows for chunk row indices");
   }
   result.values.reserve(estimated_nnz);
@@ -1585,8 +1433,8 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
   result.var_names.reserve(estimated_cols);
   result.var_name_arena.reserve(std::max<size_t>(4096, estimated_cols * 16));
   result.objective_entries.reserve(estimated_cols);
-  size_t n_row_blocks = ((size_t)state.problem.n_constraints_ + COLUMN_ROW_COUNT_BLOCK_ROWS - 1) /
-                        COLUMN_ROW_COUNT_BLOCK_ROWS;
+  size_t n_row_blocks =
+    cuda::ceil_div((size_t)state.problem.n_constraints_, COLUMN_ROW_COUNT_BLOCK_ROWS);
   result.row_count_block_dir.resize(n_row_blocks, -1);
   size_t estimated_touched_blocks = std::min(n_row_blocks, std::max<size_t>(16, estimated_nnz));
   result.row_count_blocks.reserve(estimated_touched_blocks);
@@ -1598,31 +1446,35 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
   cursor.skip_ws();
 
   while (!cursor.done()) {
-    if (__unlikely(*cursor.ptr == 'R')) {
+    if (UNLIKELY(*cursor.ptr == 'R')) {
       auto next = cursor.peek_field();
       // RHS section is mandatory right after COLUMNS section
       if (next == "RHS") { break; }
     }
 
     auto [var_name, field2] = cursor.read_two_fields();
-    if (__unlikely(!field2.empty() && field2[0] == '$')) {
+    if (UNLIKELY(!field2.empty() && field2[0] == '$')) {
       cursor.skip_to_eol();
       expect_eol(cursor);
       continue;
     }
 
     // Check for integer marker
-    if (__unlikely(field2[0] == '\'' && field2 == "'MARKER'")) {
+    if (UNLIKELY(field2[0] == '\'' && field2 == "'MARKER'")) {
       auto marker_type = cursor.read_field();
 
-      MarkerInfo marker;
+      marker_info_t marker;
       marker.after_local_var_idx =
         result.var_names.empty() ? SIZE_MAX : result.var_names.size() - 1;
 
       if (marker_type == "'INTORG'") {
-        marker.type = MarkerInfo::INTORG;
+        marker.type = marker_info_t::INTORG;
+      } else if (marker_type == "'INTEND'") {
+        marker.type = marker_info_t::INTEND;
       } else {
-        marker.type = MarkerInfo::INTEND;
+        cursor.error("unknown integer marker type in COLUMNS: %.*s",
+                     (int)marker_type.size(),
+                     marker_type.data());
       }
       result.markers.push_back(marker);
 
@@ -1649,32 +1501,33 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
       value = sign * fp64::parse_fp64_advance(cursor.ptr, cursor.end);
     }
     // usually EOL directly follows
-    if (__unlikely(!cursor.eol())) { cursor.skip_ws(); }
+    if (UNLIKELY(!cursor.eol())) { cursor.skip_ws(); }
     accept_comment(cursor);
 
     if (prev_var_name != var_name) {
       std::string_view owned_var_name = result.var_name_arena.copy(var_name);
       result.var_names.push_back(owned_var_name);
-      observe_dense_col_name(result.dense_col_stats, owned_var_name);
+      observe_dense_name(result.dense_col_stats.candidate,
+                         result.dense_col_stats.index,
+                         result.dense_col_stats.count,
+                         owned_var_name);
       result.col_offsets.push_back(result.values.size());
       prev_var_name = owned_var_name;
-      if (result.first_var_name.empty()) { result.first_var_name = owned_var_name; }
-      result.last_var_name = owned_var_name;
     }
 
     auto add_entry = [&](std::string_view rn, double val) {
       size_t row_idx = state.row_lookup(rn);
-      if (__likely(row_idx != SIZE_MAX)) {
+      if (LIKELY(row_idx != SIZE_MAX)) {
         assert(row_idx <= (size_t)std::numeric_limits<int32_t>::max());
         result.values.push_back(val);
         result.row_indices.push_back((uint32_t)row_idx);
         column_row_count_slot(result, row_idx)++;
-      } else if (__likely(rn == state.objective_name_sv)) {
+      } else if (LIKELY(rn == state.objective_name_sv)) {
         result.objective_entries.push_back({result.var_names.size() - 1, val});
-      } else if (state.is_ignored_objective_name(rn)) {
+      } else if (state.ignored_objective_names.count(rn)) {
         return;
       } else {
-        state.cursor.error("unknown row name in COLUMNS: %.*s", (int)rn.size(), rn.data());
+        cursor.error("unknown row name in COLUMNS: %.*s", (int)rn.size(), rn.data());
       }
     };
 
@@ -1683,7 +1536,7 @@ static ChunkResult parse_columns_chunk(const char* chunk_start,
     // Optional second entry on same line
     if (!cursor.eol()) {
       auto row_name2 = cursor.read_field();
-      if (__unlikely(!row_name2.empty() && row_name2[0] == '$')) {
+      if (UNLIKELY(!row_name2.empty() && row_name2[0] == '$')) {
         cursor.skip_to_eol();
         expect_eol(cursor);
         continue;
@@ -1714,8 +1567,8 @@ struct column_merge_shape_t {
 };
 
 template <typename i_t>
-static column_merge_shape_t<i_t> compute_column_merge_shape(const std::vector<ChunkResult>& chunks,
-                                                            i_t n_rows)
+static column_merge_shape_t<i_t> compute_column_merge_shape(
+  const std::vector<chunk_result_t>& chunks, i_t n_rows)
 {
   column_merge_shape_t<i_t> shape;
   shape.num_chunks = (int)chunks.size();
@@ -1752,7 +1605,7 @@ static column_merge_shape_t<i_t> compute_column_merge_shape(const std::vector<Ch
 
 template <typename i_t, typename f_t>
 static void detect_dense_column_metadata(parse_state_t<i_t, f_t>& state,
-                                         const std::vector<ChunkResult>& chunks,
+                                         const std::vector<chunk_result_t>& chunks,
                                          const column_merge_shape_t<i_t>& shape)
 {
   scoped_timer_t timer("columns_dense_metadata");
@@ -1763,7 +1616,6 @@ static void detect_dense_column_metadata(parse_state_t<i_t, f_t>& state,
   uint64_t dense_min_id     = 0;
   uint64_t dense_max_id     = 0;
   size_t dense_pad_width    = 0;
-  bool dense_zero_padded    = false;
 
   for (int t = 0; t < shape.num_chunks && dense_ok; ++t) {
     const auto& stats = chunks[(size_t)t].dense_col_stats;
@@ -1773,28 +1625,28 @@ static void detect_dense_column_metadata(parse_state_t<i_t, f_t>& state,
       break;
     }
     if (!have_first) {
-      have_first        = true;
-      dense_prefix      = stats.prefix;
-      expected_next_id  = stats.first_id;
-      dense_min_id      = stats.first_id;
-      dense_pad_width   = stats.pad_width;
-      dense_zero_padded = stats.zero_padded;
-    }
-    if (stats.prefix != dense_prefix || stats.first_id != expected_next_id ||
-        !dense_col_chunk_padding_compatible(stats, dense_zero_padded, dense_pad_width)) {
+      have_first       = true;
+      dense_prefix     = stats.index.prefix;
+      expected_next_id = stats.index.min_id;
+      dense_min_id     = stats.index.min_id;
+      dense_pad_width  = stats.index.pad_width;
+    }
+    if (stats.index.prefix != dense_prefix || stats.index.min_id != expected_next_id ||
+        !dense_col_chunk_padding_compatible(stats, dense_pad_width)) {
       dense_ok = false;
       break;
     }
-    if (stats.last_id < stats.first_id || stats.last_id - stats.first_id + 1 != stats.count) {
+    if (stats.index.max_id < stats.index.min_id ||
+        stats.index.max_id - stats.index.min_id + 1 != stats.count) {
       dense_ok = false;
       break;
     }
-    dense_max_id = stats.last_id;
-    if (stats.last_id == std::numeric_limits<uint64_t>::max()) {
+    dense_max_id = stats.index.max_id;
+    if (stats.index.max_id == std::numeric_limits<uint64_t>::max()) {
       dense_ok = false;
       break;
     }
-    expected_next_id = stats.last_id + 1;
+    expected_next_id = stats.index.max_id + 1;
   }
 
   if (!have_first || dense_max_id < dense_min_id ||
@@ -1802,20 +1654,18 @@ static void detect_dense_column_metadata(parse_state_t<i_t, f_t>& state,
     dense_ok = false;
   }
 
-  state.col_dense_ordered = dense_ok;
+  state.col_index_mode = dense_ok ? index_mode_t::dense_ordered : index_mode_t::hash;
   if (dense_ok) {
-    state.col_dense_prefix_storage.assign(dense_prefix);
-    state.col_dense_prefix      = state.col_dense_prefix_storage;
-    state.col_dense_min_id      = dense_min_id;
-    state.col_dense_max_id      = dense_max_id;
-    state.col_dense_pad_width   = dense_pad_width;
-    state.col_dense_zero_padded = dense_zero_padded;
+    state.col_dense.prefix.assign(dense_prefix);
+    state.col_dense.min_id    = dense_min_id;
+    state.col_dense.max_id    = dense_max_id;
+    state.col_dense.pad_width = dense_pad_width;
   }
 }
 
 template <typename i_t, typename f_t>
 static std::vector<i_t> build_csr_row_offsets(parse_state_t<i_t, f_t>& state,
-                                              const std::vector<ChunkResult>& chunks,
+                                              const std::vector<chunk_result_t>& chunks,
                                               const column_merge_shape_t<i_t>& shape)
 {
   std::vector<i_t> global_row_counts((size_t)shape.n_rows, 0);
@@ -1846,7 +1696,7 @@ static std::vector<i_t> build_csr_row_offsets(parse_state_t<i_t, f_t>& state,
 }
 
 template <typename i_t>
-static void convert_counts_to_write_positions(std::vector<ChunkResult>& chunks,
+static void convert_counts_to_write_positions(std::vector<chunk_result_t>& chunks,
                                               const column_merge_shape_t<i_t>& shape,
                                               const std::vector<i_t>& row_offsets,
                                               std::vector<i_t>& global_row_counts)
@@ -1870,7 +1720,8 @@ static void convert_counts_to_write_positions(std::vector<ChunkResult>& chunks,
   }
 }
 
-static void materialize_chunk_row_count_storage(std::vector<ChunkResult>& chunks, int num_threads)
+static void materialize_chunk_row_count_storage(std::vector<chunk_result_t>& chunks,
+                                                int num_threads)
 {
   scoped_timer_t timer("columns_row_count_storage_hugepages");
 #pragma omp parallel for num_threads(num_threads)
@@ -1901,7 +1752,7 @@ static void allocate_column_outputs(parse_state_t<i_t, f_t>& state,
     }
 #pragma omp section
     {
-      if (!state.col_dense_ordered) {
+      if (state.col_index_mode != index_mode_t::dense_ordered) {
         state.var_name_arenas.clear();
         state.var_name_arenas.resize((size_t)shape.num_chunks);
         state.var_names_sv.resize(shape.total_cols);
@@ -1916,7 +1767,7 @@ static void allocate_column_outputs(parse_state_t<i_t, f_t>& state,
 
 template <typename i_t, typename f_t>
 static void scatter_column_chunks_to_csr(parse_state_t<i_t, f_t>& state,
-                                         std::vector<ChunkResult>& chunks,
+                                         std::vector<chunk_result_t>& chunks,
                                          const column_merge_shape_t<i_t>& shape,
                                          int num_threads)
 {
@@ -1942,7 +1793,7 @@ static void scatter_column_chunks_to_csr(parse_state_t<i_t, f_t>& state,
           size_t block_id                = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
           size_t local                   = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
           int32_t block_pos              = chunk.row_count_block_dir[block_id];
-          RowCountBlock& block           = chunk.row_count_blocks[(size_t)block_pos];
+          row_count_block_t& block       = chunk.row_count_blocks[(size_t)block_pos];
           int64_t& write_pos             = chunk.row_count_storage[block.storage_offset + local];
           i_t dest                       = (i_t)write_pos++;
           state.problem.A_[dest]         = (f_t)chunk.values[idx];
@@ -1958,7 +1809,7 @@ static void scatter_column_chunks_to_csr(parse_state_t<i_t, f_t>& state,
 #endif
   }
 
-  if (!state.col_dense_ordered) {
+  if (state.col_index_mode != index_mode_t::dense_ordered) {
     scoped_timer_t names_timer("scatter_var_names");
 #pragma omp parallel for num_threads(num_threads)
     for (int t = 0; t < shape.num_chunks; t++) {
@@ -1975,13 +1826,13 @@ static void scatter_column_chunks_to_csr(parse_state_t<i_t, f_t>& state,
 }
 
 struct global_marker_t {
-  MarkerInfo::Type type;
+  marker_info_t::Type type;
   size_t global_var_idx;
 };
 
 template <typename i_t, typename f_t>
 static void apply_column_integer_markers(parse_state_t<i_t, f_t>& state,
-                                         const std::vector<ChunkResult>& chunks,
+                                         const std::vector<chunk_result_t>& chunks,
                                          const column_merge_shape_t<i_t>& shape)
 {
   scoped_timer_t timer("columns_apply_markers");
@@ -1999,7 +1850,7 @@ static void apply_column_integer_markers(parse_state_t<i_t, f_t>& state,
     }
   }
 
-  std::sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) {
+  std::stable_sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) {
     if (a.global_var_idx == SIZE_MAX && b.global_var_idx != SIZE_MAX) return true;
     if (b.global_var_idx == SIZE_MAX && a.global_var_idx != SIZE_MAX) return false;
     return a.global_var_idx < b.global_var_idx;
@@ -2010,7 +1861,7 @@ static void apply_column_integer_markers(parse_state_t<i_t, f_t>& state,
   for (size_t v = 0; v < shape.total_cols; v++) {
     while (marker_idx < all_markers.size() && (all_markers[marker_idx].global_var_idx == SIZE_MAX ||
                                                all_markers[marker_idx].global_var_idx < v)) {
-      is_integer = all_markers[marker_idx].type == MarkerInfo::INTORG;
+      is_integer = all_markers[marker_idx].type == marker_info_t::INTORG;
       marker_idx++;
     }
     state.problem.var_types_[v] = is_integer ? 'I' : 'C';
@@ -2019,7 +1870,7 @@ static void apply_column_integer_markers(parse_state_t<i_t, f_t>& state,
 
 template <typename i_t, typename f_t>
 static void assign_column_objective_entries(parse_state_t<i_t, f_t>& state,
-                                            const std::vector<ChunkResult>& chunks,
+                                            const std::vector<chunk_result_t>& chunks,
                                             const column_merge_shape_t<i_t>& shape)
 {
   scoped_timer_t timer("columns_objective_entries");
@@ -2034,7 +1885,7 @@ static void assign_column_objective_entries(parse_state_t<i_t, f_t>& state,
 
 template <typename i_t, typename f_t>
 static void merge_chunk_results_to_csr(parse_state_t<i_t, f_t>& state,
-                                       std::vector<ChunkResult>& chunks,
+                                       std::vector<chunk_result_t>& chunks,
                                        int num_threads)
 {
   scoped_timer_t timer("merge_chunks_to_csr");
@@ -2071,11 +1922,10 @@ static void parse_columns_section_parallel(parse_state_t<i_t, f_t>& state,
   size_t chunk_limited_threads = std::max<size_t>(1, columns_bytes / MPS_COLUMNS_MIN_CHUNK_BYTES);
   num_threads = std::max(1, std::min<int>(num_threads, (int)chunk_limited_threads));
 
-  // Compute chunk boundaries
   auto chunk_bounds = compute_chunk_boundaries(columns_start, columns_end, num_threads);
 
   // Parse chunks in parallel
-  std::vector<ChunkResult> results(num_threads);
+  std::vector<chunk_result_t> results(num_threads);
 
   {
     scoped_timer_t timer("parse_columns_chunk_parallel");
@@ -2145,7 +1995,7 @@ static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
       return;
     }
     // Other objectives, ignored currently. cold path
-    if (state.is_ignored_objective_name(row_name)) { return; }
+    if (state.ignored_objective_names.count(row_name)) { return; }
     // Unexpected!
     error_unknown_row(cursor, row_start, "RHS");
   };
@@ -2175,6 +2025,8 @@ static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
   }
 }
 
+// does the job on 99% of instances: in the vast majority of cases the variables named in BOUNDS
+// follow COLUMNS order, with occasional gaps
 static size_t find_var_after_hint(const std::vector<std::string_view>& var_names,
                                   std::string_view var_name,
                                   size_t hint_idx)
@@ -2230,7 +2082,7 @@ static bool apply_bound_record(std::string_view bound_type,
     if (first_bound_for_var && value < f_t{0}) { set_lb(-std::numeric_limits<f_t>::infinity()); }
     set_type('I');
   } else if (bound_type == "SC") {
-    if (__unlikely(!has_value)) {
+    if (UNLIKELY(!has_value)) {
       error("SC bound requires an upper bound value", bound_type);
       return false;
     }
@@ -2252,7 +2104,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
 {
   const size_t bounds_bytes   = (size_t)(bounds_body_end - bounds_body_start);
   const int num_threads       = phase_thread_count(MPS_BOUNDS_THREAD_CAP);
-  const bool use_dense_lookup = state.col_dense_ordered;
+  const bool use_dense_lookup = state.col_index_mode == index_mode_t::dense_ordered;
   const size_t min_parallel_bytes =
     use_dense_lookup ? MPS_BOUNDS_PARALLEL_MIN_BYTES : MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES;
   if (bounds_bytes < min_parallel_bytes || num_threads < 2) { return false; }
@@ -2261,7 +2113,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
     use_dense_lookup ? "parse_bounds_parallel_dense" : "parse_bounds_parallel_ordered_hint",
     nvtx::colors::bounds);
 
-  struct BoundsParallelStats {
+  struct bounds_parallel_stats_t {
     size_t lines            = 0;
     size_t dense_hits       = 0;
     size_t dense_misses     = 0;
@@ -2273,7 +2125,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
     char error_msg[192]     = {};
   };
 
-  std::vector<BoundsParallelStats> stats((size_t)num_threads);
+  std::vector<bounds_parallel_stats_t> stats((size_t)num_threads);
   auto boundaries =
     compute_bounds_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads);
 
@@ -2297,12 +2149,14 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
       size_t prev_var = SIZE_MAX;
       size_t hint_idx = 0;
       auto lookup_var = [&](std::string_view var_name) {
-        if (use_dense_lookup) { return state.col_lookup_dense_ordered(var_name); }
+        if (use_dense_lookup) { return state.col_dense.lookup(var_name); }
+        // quite often variables are in order, so a cheap lookup trick is to look for the variable
+        // right after this one
         return find_var_after_hint(state.var_names_sv, var_name, hint_idx);
       };
       try {
         while (cursor.ptr < cursor.end) {
-          if (__unlikely(*cursor.ptr == '$')) {
+          if (UNLIKELY(*cursor.ptr == '$')) {
             cursor.skip_to_eol();
             expect_eol(cursor);
             local.comments++;
@@ -2310,8 +2164,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
           }
 
           auto bound_type = cursor.read_field();
-          if (__unlikely(bound_type.empty())) { break; }
-          if (__unlikely(bound_type[0] == '$')) {
+          if (UNLIKELY(bound_type.empty())) { break; }
+          if (UNLIKELY(bound_type[0] == '$')) {
             cursor.skip_to_eol();
             expect_eol(cursor);
             local.comments++;
@@ -2321,7 +2175,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
           auto bound_name = cursor.read_field();
           (void)bound_name;
           auto var_name = cursor.read_field();
-          if (__unlikely(!var_name.empty() && var_name[0] == '$')) {
+          if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) {
             cursor.skip_to_eol();
             expect_eol(cursor);
             local.comments++;
@@ -2329,7 +2183,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
           }
 
           size_t var_idx = lookup_var(var_name);
-          if (__unlikely(var_idx == SIZE_MAX)) {
+          if (UNLIKELY(var_idx == SIZE_MAX)) {
             local.dense_misses++;
             break;
           }
@@ -2435,36 +2289,14 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
 }
 
 template <typename i_t, typename f_t>
-static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
-                                 cursor_t& cursor,
-                                 bool allow_parallel_dense = false)
+static void init_variable_bounds_defaults(parse_state_t<i_t, f_t>& state)
 {
   size_t n_vars = (size_t)state.problem.n_vars_;
-
-  // Initialize bounds with defaults
   {
     scoped_timer_t timer("bounds_init_defaults");
-    const bool parallel_init =
-      n_vars >= MPS_BOUNDS_PARALLEL_INIT_MIN_VARS && omp_get_max_threads() >= 2;
-
-    if (parallel_init) {
-#pragma omp parallel sections num_threads(2)
-      {
-#pragma omp section
-        {
-          state.problem.variable_lower_bounds_.resize(n_vars, f_t{0});
-        }
-#pragma omp section
-        {
-          state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits<f_t>::infinity());
-        }
-      }
-    } else {
-      state.problem.variable_lower_bounds_.resize(n_vars, f_t{0});
-      state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits<f_t>::infinity());
-    }
+    state.problem.variable_lower_bounds_.resize(n_vars, f_t{0});
+    state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits<f_t>::infinity());
   }
-
   {
     scoped_timer_t timer("bounds_madvise_pretouch");
     materialize_vector_hugepages("variable_lower_bounds",
@@ -2474,6 +2306,35 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
                                  state.problem.variable_upper_bounds_,
                                  materialize_touch_t::write_4kb);
   }
+}
+
+template <typename i_t, typename f_t, typename HasBound>
+static void apply_unspecified_integer_bounds(parse_state_t<i_t, f_t>& state, HasBound&& has_bound)
+{
+  scoped_timer_t timer("bounds_integer_defaults");
+  size_t n_vars = (size_t)state.problem.n_vars_;
+  for (size_t i = 0; i < n_vars; ++i) {
+    if (!has_bound(i) && state.problem.var_types_[i] == 'I') {
+      state.problem.variable_lower_bounds_[i] = f_t{0};
+      state.problem.variable_upper_bounds_[i] = f_t{1};
+    }
+  }
+}
+
+template <typename i_t, typename f_t>
+static void init_variable_bounds_without_bounds_section(parse_state_t<i_t, f_t>& state)
+{
+  init_variable_bounds_defaults(state);
+  apply_unspecified_integer_bounds(state, [](size_t) { return false; });
+}
+
+template <typename i_t, typename f_t>
+static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
+                                 cursor_t& cursor,
+                                 bool allow_parallel_dense = false)
+{
+  size_t n_vars = (size_t)state.problem.n_vars_;
+  init_variable_bounds_defaults(state);
 
   std::vector<uint64_t> bound_seen((n_vars + 63) / 64, 0);
   auto has_bound = [&](size_t var_idx) {
@@ -2482,18 +2343,9 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
   auto mark_bound = [&](size_t var_idx) {
     bound_seen[var_idx >> 6] |= uint64_t{1} << (var_idx & 63);
   };
-  auto apply_unspecified_integer_bounds = [&]() {
-    scoped_timer_t timer("bounds_integer_defaults");
-    for (size_t i = 0; i < n_vars; ++i) {
-      if (!has_bound(i) && state.problem.var_types_[i] == 'I') {
-        state.problem.variable_lower_bounds_[i] = f_t{0};
-        state.problem.variable_upper_bounds_[i] = f_t{1};
-      }
-    }
-  };
 
   if (!accept_section(cursor, "BOUNDS")) {
-    apply_unspecified_integer_bounds();
+    apply_unspecified_integer_bounds(state, has_bound);
     return;
   }
 
@@ -2523,17 +2375,18 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
       auto bound_name = cursor.read_field();
       (void)bound_name;
       auto var_name = cursor.read_field();
-      if (__unlikely(!var_name.empty() && var_name[0] == '$')) {
+      if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) {
         cursor.skip_to_eol();
         expect_eol(cursor);
         continue;
       }
 
       // optimized lookup using hint (bounds often in same order as columns)
-      size_t var_idx                                               = SIZE_MAX;
+      size_t var_idx = SIZE_MAX;
+      // handle annoying bounds-only vars that weren't declared in COLUMNS
       typename parse_state_t<i_t, f_t>::bounds_only_var_t* aux_var = nullptr;
-      if (__likely(state.col_dense_ordered)) {
-        var_idx = state.col_lookup_dense_ordered(var_name);
+      if (LIKELY(state.col_index_mode == index_mode_t::dense_ordered)) {
+        var_idx = state.col_dense.lookup(var_name);
         if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; }
       } else {
         var_idx = find_var_after_hint(state.var_names_sv, var_name, hint_idx);
@@ -2586,15 +2439,12 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
       expect_eol(cursor);
     }
   }
-  apply_unspecified_integer_bounds();
+  apply_unspecified_integer_bounds(state, has_bound);
 }
 
 template <typename i_t, typename f_t>
-static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+static void init_constraint_bounds_from_rows(parse_state_t<i_t, f_t>& state)
 {
-  scoped_timer_t timer("parse_ranges");
-
-  // Initialize constraint bounds from row_types and b_
   state.problem.constraint_lower_bounds_.resize((size_t)state.problem.n_constraints_);
   state.problem.constraint_upper_bounds_.resize((size_t)state.problem.n_constraints_);
 
@@ -2612,6 +2462,13 @@ static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& curso
       state.problem.constraint_upper_bounds_[i] = std::numeric_limits<f_t>::infinity();
     }
   }
+}
+
+template <typename i_t, typename f_t>
+static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+{
+  scoped_timer_t timer("parse_ranges");
+  init_constraint_bounds_from_rows(state);
 
   if (!accept_section(cursor, "RANGES")) { return; }
 
@@ -2654,7 +2511,7 @@ static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& curso
     accept_comment(cursor);
     if (!cursor.eol()) {
       auto row_name2 = cursor.read_field();
-      if (__unlikely(!row_name2.empty() && row_name2[0] == '$')) {
+      if (UNLIKELY(!row_name2.empty() && row_name2[0] == '$')) {
         cursor.skip_to_eol();
         expect_eol(cursor);
         continue;
@@ -2667,10 +2524,14 @@ static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& curso
   }
 }
 
+// quadratic stuff is bare bones for now, optimize if needed
+
 template <typename i_t, typename f_t>
 static void build_var_name_map_if_needed(parse_state_t<i_t, f_t>& state)
 {
-  if (state.col_dense_ordered || !state.var_names_map.empty()) { return; }
+  if (state.col_index_mode == index_mode_t::dense_ordered || !state.var_names_map.empty()) {
+    return;
+  }
   scoped_timer_t timer("quadratic_build_var_name_map");
   state.var_names_map.reserve((size_t)state.problem.n_vars_ * 2);
   for (size_t i = 0; i < state.var_names_sv.size(); ++i) {
@@ -2681,7 +2542,7 @@ static void build_var_name_map_if_needed(parse_state_t<i_t, f_t>& state)
 template <typename i_t, typename f_t>
 static size_t lookup_quadratic_var(parse_state_t<i_t, f_t>& state, std::string_view name)
 {
-  if (state.col_dense_ordered) { return state.col_lookup_dense_ordered(name); }
+  if (state.col_index_mode == index_mode_t::dense_ordered) { return state.col_dense.lookup(name); }
   auto it = state.var_names_map.find(name);
   return it == state.var_names_map.end() ? SIZE_MAX : it->second;
 }
@@ -2695,14 +2556,14 @@ static void build_quadratic_csr(parse_state_t<i_t, f_t>& state,
   const size_t n_vars = (size_t)state.problem.n_vars_;
   if (entries.empty()) { return; }
 
-  struct ExpandedEntry {
+  struct expanded_entry_t {
     size_t row;
     size_t col;
     size_t seq;
     f_t value;
   };
 
-  std::vector<ExpandedEntry> expanded;
+  std::vector<expanded_entry_t> expanded;
   expanded.reserve(symmetric_upper_triangular ? entries.size() * 2 : entries.size());
   size_t seq = 0;
   for (const auto& [row_i, col_i, value] : entries) {
@@ -2779,14 +2640,14 @@ static void parse_quadratic_sections(parse_state_t<i_t, f_t>& state, cursor_t& c
     if (active_entries == nullptr) { break; }
 
     auto var1 = cursor.read_field();
-    if (__unlikely(var1.empty())) { break; }
-    if (__unlikely(var1[0] == '$')) {
+    if (UNLIKELY(var1.empty())) { break; }
+    if (UNLIKELY(var1[0] == '$')) {
       cursor.skip_to_eol();
       expect_eol(cursor);
       continue;
     }
     auto var2 = cursor.read_field();
-    if (__unlikely(!var2.empty() && var2[0] == '$')) {
+    if (UNLIKELY(!var2.empty() && var2[0] == '$')) {
       cursor.skip_to_eol();
       expect_eol(cursor);
       continue;
@@ -2847,37 +2708,29 @@ static void parse_rhs_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t ra
 }
 
 template <typename i_t, typename f_t>
-static void parse_bounds_range(parse_state_t<i_t, f_t>& state,
-                               mps_phase_range_t range,
-                               const char* fallback_ptr)
+static void parse_bounds_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
 {
-  if (range.present) {
-    cursor_t cursor(range.begin, (size_t)(range.end - range.begin));
-    parse_bounds_section(state, cursor, range.present);
-  } else {
-    cursor_t cursor(fallback_ptr, 16);
-    parse_bounds_section(state, cursor, range.present);
+  if (!range.present) {
+    init_variable_bounds_without_bounds_section(state);
+    return;
   }
+  cursor_t cursor(range.begin, (size_t)(range.end - range.begin));
+  parse_bounds_section(state, cursor, true);
 }
 
 template <typename i_t, typename f_t>
-static void parse_ranges_range(parse_state_t<i_t, f_t>& state,
-                               mps_phase_range_t range,
-                               const char* fallback_ptr)
+static void parse_ranges_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
 {
-  if (range.present) {
-    cursor_t cursor(range.begin, (size_t)(range.end - range.begin));
-    parse_ranges_section(state, cursor);
-  } else {
-    cursor_t cursor(fallback_ptr, 16);
-    parse_ranges_section(state, cursor);
+  if (!range.present) {
+    init_constraint_bounds_from_rows(state);
+    return;
   }
+  cursor_t cursor(range.begin, (size_t)(range.end - range.begin));
+  parse_ranges_section(state, cursor);
 }
 
 template <typename i_t, typename f_t>
-static void parse_quadratic_range(parse_state_t<i_t, f_t>& state,
-                                  mps_phase_range_t range,
-                                  const char*)
+static void parse_quadratic_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
 {
   if (!range.present) { return; }
   cursor_t cursor(range.begin, (size_t)(range.end - range.begin));
@@ -2917,16 +2770,17 @@ static void materialize_problem_names(parse_state_t<i_t, f_t>& state)
 
   {
     scoped_timer_t timer("materialize_problem_var_names");
-    size_t n = state.col_dense_ordered ? (size_t)state.problem.n_vars_ : state.var_names_sv.size();
+    const bool col_dense_ordered = state.col_index_mode == index_mode_t::dense_ordered;
+    size_t n = col_dense_ordered ? (size_t)state.problem.n_vars_ : state.var_names_sv.size();
     state.problem.var_names_.resize(n);
-    if (state.col_dense_ordered && n >= 1'000'000 && num_threads > 1) {
+    if (col_dense_ordered && n >= 1'000'000 && num_threads > 1) {
 #pragma omp parallel for schedule(static) num_threads(num_threads)
       for (size_t i = 0; i < n; ++i) {
-        state.dense_col_name(i, state.problem.var_names_[i]);
+        state.col_dense.format_name(i, state.problem.var_names_[i]);
       }
-    } else if (state.col_dense_ordered) {
+    } else if (col_dense_ordered) {
       for (size_t i = 0; i < n; ++i) {
-        state.dense_col_name(i, state.problem.var_names_[i]);
+        state.col_dense.format_name(i, state.problem.var_names_[i]);
       }
     } else if (n >= 1'000'000 && num_threads > 1) {
 #pragma omp parallel for schedule(static) num_threads(num_threads)
@@ -2969,7 +2823,7 @@ static std::size_t init_problem_storage(
   problem.objective_scaling_factor_ = f_t{1};
   problem.objective_offset_         = f_t{0};
 
-  std::size_t reserve_size = std::max<std::size_t>(reserve_hint, 1024 * 1024);
+  std::size_t reserve_size = std::max<std::size_t>(reserve_hint, 1 * MiB);
   std::size_t reserve_dim  = std::max((size_t)1000, reserve_size / 1000);
   problem.A_offsets_.reserve(reserve_dim);
   problem.b_.reserve(reserve_dim);
@@ -2984,26 +2838,14 @@ static std::size_t init_problem_storage(
   return reserve_dim;
 }
 
-static const char* trailing_endata_cursor_end(mps_phase_registry_t& registry)
-{
-  mps_phase_range_t quadratic = registry.range(mps_phase_kind::quadratic);
-  if (quadratic.present) { return quadratic.end; }
-  mps_phase_range_t bounds = registry.range(mps_phase_kind::bounds);
-  if (bounds.present) { return bounds.end; }
-  mps_phase_range_t ranges = registry.range(mps_phase_kind::ranges);
-  if (ranges.present) { return ranges.end; }
-  return registry.range(mps_phase_kind::rhs).end;
-}
-
 template <typename Stream, typename i_t, typename f_t>
 static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_stream(
   Stream& stream, const char* total_timer_name, const char* producer_task_name)
 {
-  omp_set_max_active_levels(2);
+  omp_max_active_levels_guard_t omp_active_levels(2);
 
   input_stream_view_t input = stream.view();
-  timer_io_context_t timer_io_context(input.compressed_size);
-  auto total_timer = std::make_unique<scoped_timer_t>(total_timer_name);
+  auto total_timer          = std::make_unique<scoped_timer_t>(total_timer_name);
   cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> problem;
   std::size_t reserve_dim = init_problem_storage(problem, stream.reserve_size_hint());
 
@@ -3159,7 +3001,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
       {
         run_parser_task([&] {
           MPS_NVTX_RANGE("task_ranges", nvtx::colors::ranges);
-          parse_ranges_range(state, input.registry->range(mps_phase_kind::ranges), input.data);
+          parse_ranges_range(state, input.registry->range(mps_phase_kind::ranges));
           phase_end("ranges");
         });
       }
@@ -3168,7 +3010,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
       {
         run_parser_task([&] {
           MPS_NVTX_RANGE("task_bounds", nvtx::colors::bounds);
-          parse_bounds_range(state, input.registry->range(mps_phase_kind::bounds), input.data);
+          parse_bounds_range(state, input.registry->range(mps_phase_kind::bounds));
           phase_end("bounds");
         });
       }
@@ -3177,8 +3019,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
       {
         run_parser_task([&] {
           MPS_NVTX_RANGE("task_quadratic", nvtx::colors::generic);
-          parse_quadratic_range(
-            state, input.registry->range(mps_phase_kind::quadratic), input.data);
+          parse_quadratic_range(state, input.registry->range(mps_phase_kind::quadratic));
           phase_end("quadratic");
         });
       }
@@ -3192,93 +3033,34 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
   append_bounds_only_variables(state);
 
   input.size = stream.size();
-  cursor.ptr = trailing_endata_cursor_end(*input.registry);
   cursor.end = input.data + input.size;
-  if (!cursor.done()) { expect(cursor, "ENDATA"); }
+  if (!input.registry->endata_ready() || !input.registry->endata_present()) {
+    cursor.ptr =
+      input.registry->endata_ready() ? input.registry->endata_begin() : input.data + input.size;
+    cursor.error("missing ENDATA");
+  }
+  cursor.ptr = input.registry->endata_begin();
+  expect(cursor, "ENDATA");
 
   total_timer.reset();
   flush_timers();
   return problem;
 }
 
-struct small_raw_read_t {
-  bool use_small_path = false;
+struct padded_memory_input_t {
   std::vector<char> buffer;
-  std::size_t size = 0;
+  std::size_t input_size      = 0;
+  std::size_t compressed_size = 0;
 };
 
-static small_raw_read_t try_read_small_raw_file(const std::string& path)
-{
-  FILE* file = std::fopen(path.c_str(), "rb");
-  if (file == nullptr) {
-    mps_parser_fail(error_type_t::RuntimeError,
-                    "Failed to open raw MPS file '%s': %s",
-                    path.c_str(),
-                    std::strerror(errno));
-  }
-  std::unique_ptr<FILE, decltype(&std::fclose)> file_guard(file, &std::fclose);
-
-  if (std::fseek(file, 0, SEEK_END) != 0) {
-    mps_parser_fail(error_type_t::RuntimeError, "Failed to seek raw MPS file '%s'", path.c_str());
-  }
-  long file_size_long = std::ftell(file);
-  if (file_size_long < 0) {
-    mps_parser_fail(
-      error_type_t::RuntimeError, "Failed to determine raw MPS file size '%s'", path.c_str());
-  }
-  std::size_t file_size = (std::size_t)file_size_long;
-  if (file_size > MPS_SMALL_RAW_FILE_BYTES) { return {}; }
-  if (std::fseek(file, 0, SEEK_SET) != 0) {
-    mps_parser_fail(error_type_t::RuntimeError, "Failed to rewind raw MPS file '%s'", path.c_str());
-  }
-
-  if (file_size > std::numeric_limits<std::size_t>::max() - input_buffer_padding_bytes) {
-    mps_parser_fail(error_type_t::OutOfMemoryError, "small raw input padding size overflow");
-  }
-  std::vector<char> buffer(file_size + input_buffer_padding_bytes);
-  if (file_size != 0 && std::fread(buffer.data(), 1, file_size, file) != file_size) {
-    mps_parser_fail(error_type_t::RuntimeError, "Failed to read raw MPS file '%s'", path.c_str());
-  }
-  return {true, std::move(buffer), file_size};
-}
-
-template <typename i_t, typename f_t>
-static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_small_raw_file(
-  std::vector<char> buffer, std::size_t input_size)
+static padded_memory_input_t read_compressed_mps_file(const std::string& path)
 {
-  auto total_timer = std::make_unique<scoped_timer_t>("parse_mps_fast_file_raw_small (total)");
-  const char* data = buffer.data();
-  const char* end  = data + input_size;
-
-  mps_phase_registry_t registry;
-  mps_section_block_scanner_t scanner(data, 1, registry);
-  scanner.observe_block(0, data, end);
-  scanner.publish_ready(input_size);
-
-  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> problem;
-  std::size_t reserve_dim = init_problem_storage(problem, input_size);
+  std::vector<char> buffer = cuopt::linear_programming::io::detail::file_to_string(path);
+  if (buffer.empty()) { buffer.push_back('\0'); }
 
-  cursor_t cursor(data, input_size);
-  parse_state_t<i_t, f_t> state(problem, cursor);
-  state.row_names_sv.reserve(reserve_dim);
-
-  parse_header_range(state, registry.range(mps_phase_kind::header));
-  parse_rows_range(state, registry.range(mps_phase_kind::rows));
-  parse_columns_range(state, registry.range(mps_phase_kind::columns), 1);
-  materialize_problem_names(state);
-  parse_rhs_range(state, registry.range(mps_phase_kind::rhs));
-  parse_ranges_range(state, registry.range(mps_phase_kind::ranges), data);
-  parse_bounds_range(state, registry.range(mps_phase_kind::bounds), data);
-  parse_quadratic_range(state, registry.range(mps_phase_kind::quadratic), data);
-  append_bounds_only_variables(state);
-
-  cursor.ptr = trailing_endata_cursor_end(registry);
-  cursor.end = end;
-  if (!cursor.done()) { expect(cursor, "ENDATA"); }
-
-  total_timer.reset();
-  flush_timers();
-  return problem;
+  std::size_t input_size = buffer.size() - 1;
+  buffer.resize(input_size + input_buffer_padding_bytes, '\0');
+  return {std::move(buffer), input_size, get_file_size(path)};
 }
 
 template <typename i_t, typename f_t>
@@ -3286,22 +3068,30 @@ cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
   const std::string& path, FileReadMethod read_method)
 {
   FileReadMethod effective_method = effective_file_read_method(path, read_method);
-  if (effective_method == FileReadMethod::Lz4) {
-    Lz4InputStream stream(path);
-    return parse_mps_fast_stream<Lz4InputStream, i_t, f_t>(
-      stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode");
-  }
-  if (effective_method == FileReadMethod::Read) {
-    small_raw_read_t small_raw = try_read_small_raw_file(path);
-    if (small_raw.use_small_path) {
-      return parse_mps_fast_small_raw_file<i_t, f_t>(std::move(small_raw.buffer), small_raw.size);
-    }
-    RawInputStream stream(path);
-    return parse_mps_fast_stream<RawInputStream, i_t, f_t>(
-      stream, "parse_mps_fast_file_raw (total)", "task_raw_read");
-  }
-  mps_parser_fail(error_type_t::RuntimeError,
-                  "single-path parser supports raw read and LZ4 inputs only");
+  switch (effective_method) {
+    case FileReadMethod::Lz4: {  // .lz4 input handled by the LZ4 stream
+      lz4_input_stream_t stream(path);
+      return parse_mps_fast_stream<lz4_input_stream_t, i_t, f_t>(
+        stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode");
+    }
+    case FileReadMethod::Gzip:
+    case FileReadMethod::Bzip2: {  // whole input is loaded into a padded memory buffer first
+      padded_memory_input_t input = read_compressed_mps_file(path);
+      memory_input_stream_t stream(
+        std::move(input.buffer), input.input_size, input.compressed_size);
+      const char* timer_name = effective_method == FileReadMethod::Gzip
+                                 ? "parse_mps_fast_file_gzip (total)"
+                                 : "parse_mps_fast_file_bzip2 (total)";
+      return parse_mps_fast_stream<memory_input_stream_t, i_t, f_t>(
+        stream, timer_name, "task_memory_scan");
+    }
+    case FileReadMethod::Read: {  // uncompressed file read
+      raw_input_stream_t stream(path);
+      return parse_mps_fast_stream<raw_input_stream_t, i_t, f_t>(
+        stream, "parse_mps_fast_file_raw (total)", "task_raw_read");
+    }
+  }
+  mps_parser_fail(error_type_t::RuntimeError, "unsupported MPS file read method");
 }
 
 template cuopt::linear_programming::io::mps_data_model_t<int, float> parse_mps_fast_file(
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index dc9ae86abc..e69de29bb2 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -1,288 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
-// reserved. SPDX-License-Identifier: Apache-2.0
-
-#include "file_reader.hpp"
-#include "nvtx_ranges.hpp"
-
-#include <utilities/error.hpp>
-
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include <algorithm>
-#include <atomic>
-#include <cerrno>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <limits>
-#include <mutex>
-#include <stdexcept>
-#include <string>
-#include <thread>
-#include <utility>
-#include <vector>
-
-namespace mps_fast {
-
-using cuopt::linear_programming::io::error_type_t;
-using cuopt::linear_programming::io::mps_parser_expects;
-using cuopt::linear_programming::io::mps_parser_fail;
-
-namespace {
-
-constexpr std::size_t raw_input_window_bytes              = 64ull * 1024ull * 1024ull;
-constexpr std::size_t raw_input_max_read_threads          = 8;
-constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull;
-
-bool path_has_suffix(const std::string& path, const char* suffix) noexcept
-{
-  std::size_t suffix_len = std::strlen(suffix);
-  return path.size() >= suffix_len &&
-         path.compare(path.size() - suffix_len, suffix_len, suffix) == 0;
-}
-
-std::size_t get_file_size(int fd, const std::string& path)
-{
-  struct stat st;
-  if (::fstat(fd, &st) != 0) {
-    mps_parser_fail(error_type_t::RuntimeError,
-                    "Failed to stat file '%s': %s",
-                    path.c_str(),
-                    std::strerror(errno));
-  }
-  return (std::size_t)st.st_size;
-}
-
-std::size_t system_page_size()
-{
-  static std::size_t page_size = [] {
-    long value = ::sysconf(_SC_PAGESIZE);
-    return value > 0 ? (std::size_t)value : (std::size_t)4096;
-  }();
-  return page_size;
-}
-
-std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
-{
-  if (alignment == 0) { return value; }
-  std::size_t remainder = value % alignment;
-  if (remainder == 0) { return value; }
-  std::size_t increment = alignment - remainder;
-  if (value > std::numeric_limits<std::size_t>::max() - increment) {
-    mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow");
-  }
-  return value + increment;
-}
-
-std::size_t add_input_padding(std::size_t size)
-{
-  if (size > std::numeric_limits<std::size_t>::max() - input_buffer_padding_bytes) {
-    mps_parser_fail(error_type_t::OutOfMemoryError, "input padding size overflow");
-  }
-  return size + input_buffer_padding_bytes;
-}
-
-}  // namespace
-
-RawInputStream::RawInputStream(const std::string& path) : path_(path)
-{
-  MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io);
-  buffered_fd_ = ::open(path.c_str(), O_RDONLY);
-  if (buffered_fd_ < 0) {
-    mps_parser_fail(error_type_t::RuntimeError,
-                    "Failed to open raw MPS file '%s': %s",
-                    path.c_str(),
-                    std::strerror(errno));
-  }
-
-  file_size_         = get_file_size(buffered_fd_, path);
-  fd_                = buffered_fd_;
-  bool use_direct_io = file_size_ > raw_input_direct_io_threshold_bytes;
-  if (const char* raw_direct = std::getenv("MPS_FAST_RAW_DIRECT_IO")) {
-    use_direct_io = raw_direct[0] != '0';
-  }
-  if (use_direct_io) {
-#ifdef O_DIRECT
-    int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT);
-    if (direct_fd >= 0) {
-      fd_        = direct_fd;
-      direct_io_ = true;
-    }
-#endif
-  }
-  window_bytes_ = raw_input_window_bytes;
-  window_count_ = std::max<std::size_t>(1, (file_size_ + window_bytes_ - 1) / window_bytes_);
-
-  output_mapped_size_ = round_up_to_multiple(
-    std::max<std::size_t>(add_input_padding(file_size_), 1), system_page_size());
-  output_region_ = mmap_region_t::anonymous(
-    output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer");
-  output_data_ = output_region_.char_data();
-  output_region_.advise(MADV_HUGEPAGE);
-
-  block_done_.resize(window_count_, 0);
-  block_end_.resize(window_count_, 0);
-  section_scanner_ =
-    std::make_unique<mps_section_block_scanner_t>(output_data_, window_count_, registry_);
-}
-
-RawInputStream::~RawInputStream()
-{
-  if (fd_ >= 0) { ::close(fd_); }
-  if (buffered_fd_ >= 0 && buffered_fd_ != fd_) { ::close(buffered_fd_); }
-}
-
-const char* RawInputStream::data() const noexcept { return output_data_; }
-char* RawInputStream::mutable_data() noexcept { return output_data_; }
-std::size_t RawInputStream::size() const noexcept { return output_view_size_; }
-std::size_t RawInputStream::compressed_size() const noexcept { return file_size_; }
-std::size_t RawInputStream::reserve_size_hint() const noexcept { return file_size_; }
-mps_phase_registry_t& RawInputStream::registry() noexcept { return registry_; }
-input_stream_view_t RawInputStream::view() noexcept
-{
-  return {output_data_, output_data_, output_view_size_, file_size_, &registry_};
-}
-
-void RawInputStream::run_decode_tasks()
-{
-  MPS_NVTX_RANGE("raw_input_run_read_tasks", nvtx::colors::io);
-  if (file_size_ == 0) {
-    output_view_size_ = 0;
-    section_scanner_->publish_ready(0);
-    return;
-  }
-
-  std::size_t hw_threads =
-    std::max<std::size_t>(1, (std::size_t)std::thread::hardware_concurrency());
-  std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads);
-  thread_count             = std::max<std::size_t>(1, std::min(thread_count, window_count_));
-
-  std::atomic_size_t next_window{0};
-  std::exception_ptr first_error = nullptr;
-  std::mutex error_mutex;
-  std::atomic_bool stop{false};
-
-  auto mark_error = [&](std::exception_ptr eptr) {
-    std::lock_guard<std::mutex> lock(error_mutex);
-    if (!first_error) {
-      first_error = eptr;
-      stop.store(true, std::memory_order_release);
-    }
-  };
-
-  auto read_window = [&](std::size_t index) {
-    MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io);
-    std::size_t offset = index * window_bytes_;
-    std::size_t size   = std::min(window_bytes_, file_size_ - offset);
-    std::size_t done   = 0;
-    {
-      MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io);
-      while (done < size) {
-        ssize_t got =
-          ::pread(fd_, output_data_ + offset + done, size - done, (off_t)(offset + done));
-        if (got < 0) {
-          if (errno == EINTR) { continue; }
-          if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0) {
-            got = ::pread(
-              buffered_fd_, output_data_ + offset + done, size - done, (off_t)(offset + done));
-            if (got >= 0) {
-              done += (std::size_t)got;
-              continue;
-            }
-            if (errno == EINTR) { continue; }
-          }
-          mps_parser_fail(error_type_t::RuntimeError,
-                          "Failed to pread raw MPS file '%s': %s",
-                          path_.c_str(),
-                          std::strerror(errno));
-        }
-        if (got == 0) {
-          mps_parser_fail(error_type_t::RuntimeError,
-                          "Unexpected EOF while reading raw MPS file '%s'",
-                          path_.c_str());
-        }
-        done += (std::size_t)got;
-      }
-    }
-
-    {
-      MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io);
-      section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size);
-      frontier_mutex_.lock();
-      block_done_[index] = 1;
-      block_end_[index]  = offset + size;
-      std::size_t before = ready_bytes_;
-      while (next_block_ < block_done_.size() && block_done_[next_block_]) {
-        ready_bytes_ = block_end_[next_block_];
-        ++next_block_;
-      }
-      std::size_t after = ready_bytes_;
-      frontier_mutex_.unlock();
-      if (after > before) { section_scanner_->publish_ready(after); }
-    }
-  };
-
-  std::vector<std::thread> workers;
-  workers.reserve(thread_count);
-  for (std::size_t t = 0; t < thread_count; ++t) {
-    workers.emplace_back([&, t] {
-      std::string thread_name = "raw-input-read-" + std::to_string(t);
-      nvtx::name_current_thread(thread_name.c_str());
-      MPS_NVTX_RANGE("raw_worker_loop", nvtx::colors::io);
-      while (!stop.load(std::memory_order_acquire)) {
-        std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed);
-        if (index >= window_count_) { break; }
-        try {
-          read_window(index);
-        } catch (...) {
-          mark_error(std::current_exception());
-          return;
-        }
-      }
-    });
-  }
-  for (auto& worker : workers) {
-    worker.join();
-  }
-  if (first_error) { std::rethrow_exception(first_error); }
-
-  output_view_size_ = ready_bytes_;
-  section_scanner_->publish_ready(output_view_size_);
-}
-
-bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); }
-
-void drop_file_cache(const std::string& path)
-{
-  MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io);
-  int fd = ::open(path.c_str(), O_RDONLY);
-  if (fd < 0) { return; }
-
-  ::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
-  ::close(fd);
-}
-
-FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method)
-{
-  if (has_lz4_extension(path)) { return FileReadMethod::Lz4; }
-  if (method == FileReadMethod::Lz4) {
-    mps_parser_fail(
-      error_type_t::ValidationError, "lz4 read method requires a .lz4 input: %s", path.c_str());
-  }
-  return method;
-}
-
-const char* file_read_method_name(FileReadMethod method) noexcept
-{
-  switch (method) {
-    case FileReadMethod::Read: return "read";
-    case FileReadMethod::Lz4: return "lz4";
-    default: return "unknown";
-  }
-}
-
-}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
index bab63c76cf..b0089be257 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.hpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -23,12 +23,12 @@ struct lz4_pipeline_t;
 /**
  * @brief File reading method selection
  */
-enum class FileReadMethod { Read, Lz4 };
+enum class FileReadMethod { Read, Lz4, Gzip, Bzip2 };
 
 /**
  * @brief Return the effective method for a path.
  *
- * .lz4 inputs are decompressed; all other inputs use raw input reads.
+ * Compressed inputs are auto-detected by extension; all other inputs use raw input reads.
  */
 FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method);
 
@@ -41,6 +41,8 @@ const char* file_read_method_name(FileReadMethod method) noexcept;
  * @brief True when the file name has an lz4 extension.
  */
 bool has_lz4_extension(const std::string& path) noexcept;
+bool has_gzip_extension(const std::string& path) noexcept;
+bool has_bzip2_extension(const std::string& path) noexcept;
 
 /**
  * @brief Ask the OS to evict clean cached pages for this file.
@@ -49,6 +51,17 @@ bool has_lz4_extension(const std::string& path) noexcept;
  */
 void drop_file_cache(const std::string& path);
 
+/**
+ * @brief OS memory page size, queried once and cached.
+ */
+std::size_t system_page_size();
+
+/**
+ * @brief File size in bytes; fails with a parser error if it cannot be determined.
+ */
+std::size_t get_file_size(int fd, const std::string& path);
+std::size_t get_file_size(const std::string& path);
+
 struct input_stream_view_t {
   const char* data               = nullptr;
   char* mutable_data             = nullptr;
@@ -57,13 +70,13 @@ struct input_stream_view_t {
   mps_phase_registry_t* registry = nullptr;
 };
 
-class Lz4InputStream {
+class lz4_input_stream_t {
  public:
-  explicit Lz4InputStream(const std::string& path);
-  ~Lz4InputStream();
+  explicit lz4_input_stream_t(const std::string& path);
+  ~lz4_input_stream_t();
 
-  Lz4InputStream(const Lz4InputStream&)            = delete;
-  Lz4InputStream& operator=(const Lz4InputStream&) = delete;
+  lz4_input_stream_t(const lz4_input_stream_t&)            = delete;
+  lz4_input_stream_t& operator=(const lz4_input_stream_t&) = delete;
 
   const char* data() const noexcept;
   char* mutable_data() noexcept;
@@ -97,21 +110,17 @@ class Lz4InputStream {
   bool dict_id_                      = false;
   mps_phase_registry_t registry_;
   std::mutex commit_mutex_;
-  std::mutex frontier_mutex_;
-  std::vector<unsigned char> block_done_;
-  std::vector<std::size_t> block_end_;
   std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
-  std::size_t next_block_  = 0;
-  std::size_t ready_bytes_ = 0;
+  std::size_t block_slot_count_ = 0;
 };
 
-class RawInputStream {
+class raw_input_stream_t {
  public:
-  explicit RawInputStream(const std::string& path);
-  ~RawInputStream();
+  explicit raw_input_stream_t(const std::string& path);
+  ~raw_input_stream_t();
 
-  RawInputStream(const RawInputStream&)            = delete;
-  RawInputStream& operator=(const RawInputStream&) = delete;
+  raw_input_stream_t(const raw_input_stream_t&)            = delete;
+  raw_input_stream_t& operator=(const raw_input_stream_t&) = delete;
 
   const char* data() const noexcept;
   char* mutable_data() noexcept;
@@ -144,4 +153,31 @@ class RawInputStream {
   std::size_t ready_bytes_ = 0;
 };
 
+class memory_input_stream_t {
+ public:
+  memory_input_stream_t(std::vector<char> buffer,
+                        std::size_t input_size,
+                        std::size_t compressed_size);
+
+  memory_input_stream_t(const memory_input_stream_t&)            = delete;
+  memory_input_stream_t& operator=(const memory_input_stream_t&) = delete;
+
+  const char* data() const noexcept;
+  char* mutable_data() noexcept;
+  std::size_t size() const noexcept;
+  std::size_t compressed_size() const noexcept;
+  std::size_t reserve_size_hint() const noexcept;
+  mps_phase_registry_t& registry() noexcept;
+  input_stream_view_t view() noexcept;
+
+  void run_decode_tasks();
+
+ private:
+  std::vector<char> buffer_;
+  std::size_t input_size_      = 0;
+  std::size_t compressed_size_ = 0;
+  mps_phase_registry_t registry_;
+  std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
+};
+
 }  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index bb6657e303..b25e330999 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -7,6 +7,8 @@
 
 #include <utilities/error.hpp>
 
+#include <cuda/cmath>
+
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -14,7 +16,6 @@
 #include <dlfcn.h>
 #include <fcntl.h>
 #include <sys/mman.h>
-#include <sys/stat.h>
 #include <unistd.h>
 
 #include <algorithm>
@@ -51,10 +52,19 @@ constexpr uint32_t lz4_block_size_mask                    = 0x7FFFFFFFu;
 constexpr std::size_t lz4_pipeline_batch_bytes            = 64ull * 1024ull * 1024ull;
 constexpr std::size_t lz4_decode_batch_decompressed_bytes = 256ull * 1024ull * 1024ull;
 constexpr std::size_t lz4_input_max_io_threads            = 8;
-constexpr std::size_t lz4_no_content_size_reserve_ratio   = 16;
+constexpr std::size_t lz4_no_content_size_reserve_ratio   = 128;
 
 using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int);
 
+// Decoded-size guess for frames without a content size; clamps rather than overflowing.
+std::size_t estimate_lz4_no_content_size(std::size_t compressed_size)
+{
+  constexpr std::size_t size_limit = std::numeric_limits<std::size_t>::max();
+  const bool would_overflow = compressed_size > size_limit / lz4_no_content_size_reserve_ratio;
+  return would_overflow ? size_limit - input_buffer_padding_bytes
+                        : compressed_size * lz4_no_content_size_reserve_ratio;
+}
+
 #if defined(MPS_PARSER_WITH_LZ4)
 struct lz4_runtime_t {
   void* handle                          = nullptr;
@@ -138,8 +148,6 @@ int open_lz4_fd(const std::string& path)
   return fd;
 }
 
-std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment);
-
 uint32_t read_le32(const char* ptr)
 {
   const auto* p = reinterpret_cast<const unsigned char*>(ptr);
@@ -168,67 +176,6 @@ std::size_t block_max_size_from_bd(unsigned char bd)
   }
 }
 
-std::size_t checked_size(uint64_t value, const char* label)
-{
-  if (value > (uint64_t)std::numeric_limits<std::size_t>::max()) {
-    mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 %s exceeds size_t", label);
-  }
-  return (std::size_t)value;
-}
-
-std::size_t get_file_size(int fd, const std::string& path)
-{
-  struct stat st;
-  if (::fstat(fd, &st) != 0) {
-    mps_parser_fail(error_type_t::RuntimeError,
-                    "Failed to stat file '%s': %s",
-                    path.c_str(),
-                    std::strerror(errno));
-  }
-  if (st.st_size < 0) {
-    mps_parser_fail(
-      error_type_t::RuntimeError, "Invalid negative file size for '%s'", path.c_str());
-  }
-  return (std::size_t)st.st_size;
-}
-
-std::size_t system_page_size()
-{
-  static std::size_t page_size = [] {
-    long value = ::sysconf(_SC_PAGESIZE);
-    return value > 0 ? (std::size_t)value : (std::size_t)4096;
-  }();
-  return page_size;
-}
-
-std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
-{
-  if (alignment == 0) { return value; }
-  std::size_t remainder = value % alignment;
-  if (remainder == 0) { return value; }
-  std::size_t increment = alignment - remainder;
-  if (value > std::numeric_limits<std::size_t>::max() - increment) {
-    mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow");
-  }
-  return value + increment;
-}
-
-std::size_t checked_mul(std::size_t a, std::size_t b, const char* label)
-{
-  if (a != 0 && b > std::numeric_limits<std::size_t>::max() / a) {
-    mps_parser_fail(error_type_t::OutOfMemoryError, "%s size overflow", label);
-  }
-  return a * b;
-}
-
-std::size_t checked_add(std::size_t a, std::size_t b, const char* label)
-{
-  if (a > std::numeric_limits<std::size_t>::max() - b) {
-    mps_parser_fail(error_type_t::OutOfMemoryError, "%s size overflow", label);
-  }
-  return a + b;
-}
-
 bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset)
 {
   std::size_t done = 0;
@@ -332,7 +279,7 @@ class lz4_resident_windows_t {
 
 }  // namespace
 
-Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path)
+lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
 {
   MPS_NVTX_RANGE("lz4_input_construct", nvtx::colors::io);
 
@@ -384,7 +331,7 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path)
       mps_parser_fail(error_type_t::ValidationError,
                       "truncated LZ4 frame while reading content size");
     }
-    content_size_ = checked_size(read_le64(header + offset), "content size");
+    content_size_ = (std::size_t)read_le64(header + offset);
     offset += 8;
   }
   if (dict_id_) {
@@ -403,14 +350,17 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path)
 
   std::size_t reserve_size = content_size_;
   if (!content_size_present_) {
-    reserve_size =
-      checked_mul(compressed_size_, lz4_no_content_size_reserve_ratio, "LZ4 output reserve");
+    reserve_size = estimate_lz4_no_content_size(compressed_size_);
     reserve_size = std::max(reserve_size, block_max_size_);
   }
-  reserve_size = checked_add(reserve_size, input_buffer_padding_bytes, "LZ4 output padding");
+  // content_size_ is read from the frame header, so the padding add must not be allowed to wrap.
+  if (reserve_size > std::numeric_limits<std::size_t>::max() - input_buffer_padding_bytes) {
+    mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 output padding size overflow");
+  }
+  reserve_size += input_buffer_padding_bytes;
 
   constexpr std::size_t huge_alignment = 2 * 1024 * 1024;
-  output_mapped_size_                  = round_up_to_multiple(reserve_size, system_page_size());
+  output_mapped_size_                  = cuda::round_up(reserve_size, system_page_size());
   output_region_                       = mmap_region_t::anonymous_aligned(output_mapped_size_,
                                                     huge_alignment,
                                                     PROT_NONE,
@@ -418,36 +364,34 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path)
                                                     "LZ4 output buffer");
   output_data_                         = output_region_.char_data();
 
-  std::size_t block_slots =
-    std::max<std::size_t>(1, (reserve_size + block_max_size_ - 1) / block_max_size_ + 1);
-  block_done_.resize(block_slots, 0);
-  block_end_.resize(block_slots, 0);
+  block_slot_count_ = std::max<std::size_t>(1, cuda::ceil_div(reserve_size, block_max_size_) + 1);
 
   section_scanner_ =
-    std::make_unique<mps_section_block_scanner_t>(output_data_, block_slots, registry_);
+    std::make_unique<mps_section_block_scanner_t>(output_data_, block_slot_count_, registry_);
 }
 
-Lz4InputStream::~Lz4InputStream()
+lz4_input_stream_t::~lz4_input_stream_t()
 {
   if (fd_ >= 0) { ::close(fd_); }
 }
 
-const char* Lz4InputStream::data() const noexcept { return output_data_; }
-char* Lz4InputStream::mutable_data() noexcept { return output_data_; }
-std::size_t Lz4InputStream::size() const noexcept { return output_view_size_; }
-std::size_t Lz4InputStream::compressed_size() const noexcept { return compressed_size_; }
-std::size_t Lz4InputStream::reserve_size_hint() const noexcept
+const char* lz4_input_stream_t::data() const noexcept { return output_data_; }
+char* lz4_input_stream_t::mutable_data() noexcept { return output_data_; }
+std::size_t lz4_input_stream_t::size() const noexcept { return output_view_size_; }
+std::size_t lz4_input_stream_t::compressed_size() const noexcept { return compressed_size_; }
+std::size_t lz4_input_stream_t::reserve_size_hint() const noexcept
 {
-  return content_size_present_ ? content_size_
-                               : std::max<std::size_t>(compressed_size_ * 6, 1024 * 1024);
+  return content_size_present_
+           ? content_size_
+           : std::max<std::size_t>(estimate_lz4_no_content_size(compressed_size_), 1024 * 1024);
 }
-mps_phase_registry_t& Lz4InputStream::registry() noexcept { return registry_; }
-input_stream_view_t Lz4InputStream::view() noexcept
+mps_phase_registry_t& lz4_input_stream_t::registry() noexcept { return registry_; }
+input_stream_view_t lz4_input_stream_t::view() noexcept
 {
   return {output_data_, output_data_, output_view_size_, compressed_size_, &registry_};
 }
 
-void Lz4InputStream::commit_up_to(std::size_t bytes)
+void lz4_input_stream_t::commit_up_to(std::size_t bytes)
 {
   MPS_NVTX_RANGE("lz4_commit_output", nvtx::colors::alloc);
   std::lock_guard<std::mutex> lock(commit_mutex_);
@@ -455,7 +399,7 @@ void Lz4InputStream::commit_up_to(std::size_t bytes)
   if (bytes > output_mapped_size_) {
     mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 output exceeded reserved virtual mapping");
   }
-  std::size_t new_committed = round_up_to_multiple(bytes, system_page_size());
+  std::size_t new_committed = cuda::round_up(bytes, system_page_size());
   if (new_committed > output_mapped_size_) new_committed = output_mapped_size_;
   std::size_t add = new_committed - output_committed_size_;
   void* target    = output_data_ + output_committed_size_;
@@ -476,15 +420,17 @@ struct resident_block_desc_t {
 };
 
 struct lz4_pipeline_t {
-  explicit lz4_pipeline_t(Lz4InputStream& input_)
+  explicit lz4_pipeline_t(lz4_input_stream_t& input_)
     : input(input_),
-      window_count((input.compressed_size_ + window_bytes - 1) / window_bytes),
+      window_count(cuda::ceil_div(input.compressed_size_, window_bytes)),
       windows(window_count),
       io_threads(std::min(lz4_input_max_io_threads, window_count)),
       window_done(window_count, 0),
       window_refs(window_count),
       window_scanned(window_count),
-      window_released(window_count)
+      window_released(window_count),
+      block_done(input.block_slot_count_, 0),
+      block_end(input.block_slot_count_, 0)
   {
     for (std::size_t i = 0; i < window_count; ++i) {
       std::size_t offset     = i * window_bytes;
@@ -516,9 +462,8 @@ struct lz4_pipeline_t {
 
   void finalize()
   {
-    input.output_view_size_ = input.ready_bytes_;
-    input.commit_up_to(
-      checked_add(input.output_view_size_, input_buffer_padding_bytes, "LZ4 output padding"));
+    input.output_view_size_ = ready_bytes;
+    input.commit_up_to(input.output_view_size_ + input_buffer_padding_bytes);
     input.section_scanner_->publish_ready(input.output_view_size_);
   }
 
@@ -698,15 +643,15 @@ struct lz4_pipeline_t {
     std::size_t after  = 0;
     {
       MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic);
-      std::lock_guard<std::mutex> lock(input.frontier_mutex_);
-      input.block_done_[block.index] = 1;
-      input.block_end_[block.index]  = block.decompressed_offset + actual_size;
-      before                         = input.ready_bytes_;
-      while (input.next_block_ < input.block_done_.size() && input.block_done_[input.next_block_]) {
-        input.ready_bytes_ = input.block_end_[input.next_block_];
-        ++input.next_block_;
+      std::lock_guard<std::mutex> lock(frontier_mutex);
+      block_done[block.index] = 1;
+      block_end[block.index]  = block.decompressed_offset + actual_size;
+      before                  = ready_bytes;
+      while (next_block < block_done.size() && block_done[next_block]) {
+        ready_bytes = block_end[next_block];
+        ++next_block;
       }
-      after = input.ready_bytes_;
+      after = ready_bytes;
     }
     if (after > before) {
       MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic);
@@ -792,7 +737,7 @@ struct lz4_pipeline_t {
       batch_decoded_bytes += block.decompressed_size;
       batch.push_back(block);
       blocks_scanned.fetch_add(1, std::memory_order_relaxed);
-      if (blocks_scanned.load(std::memory_order_relaxed) > input.block_done_.size()) {
+      if (blocks_scanned.load(std::memory_order_relaxed) > block_done.size()) {
         mps_parser_fail(error_type_t::OutOfMemoryError,
                         "LZ4 input block count exceeded reserved metadata slots");
       }
@@ -898,7 +843,7 @@ struct lz4_pipeline_t {
     }
   }
 
-  Lz4InputStream& input;
+  lz4_input_stream_t& input;
   const std::size_t window_bytes = lz4_pipeline_batch_bytes;
   const std::size_t window_count;
   std::vector<lz4_resident_window_t> windows;
@@ -927,9 +872,16 @@ struct lz4_pipeline_t {
   std::vector<std::vector<char>> crossing_payloads;
   std::vector<std::thread> readers;
   std::vector<std::thread> decoders;
+
+  // Tracks the contiguous decoded-byte frontier across out-of-order block completions.
+  std::mutex frontier_mutex;
+  std::vector<unsigned char> block_done;
+  std::vector<std::size_t> block_end;
+  std::size_t next_block  = 0;
+  std::size_t ready_bytes = 0;
 };
 
-void Lz4InputStream::run_decode_tasks()
+void lz4_input_stream_t::run_decode_tasks()
 {
   MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io);
   lz4_pipeline_t pipeline(*this);
diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
index d7b299917b..389f563efa 100644
--- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp
+++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
@@ -6,6 +6,8 @@
 #include <sys/mman.h>
 #include <sys/types.h>
 
+#include <cuda/cmath>
+
 #include <cerrno>
 #include <cstddef>
 #include <cstdint>
@@ -74,7 +76,7 @@ class mmap_region_t {
   static mmap_region_t anonymous_aligned(
     std::size_t size, std::size_t alignment, int prot, int flags, const char* context)
   {
-    if (alignment == 0 || (alignment & (alignment - 1)) != 0) {
+    if (!cuda::is_power_of_two(alignment)) {
       mps_parser_fail(error_type_t::RuntimeError,
                       "mmap aligned allocation requires power-of-two alignment");
     }
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
index 498b106955..9eee8708e0 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -121,6 +121,31 @@ mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const
   return ranges_[idx];
 }
 
+void mps_phase_registry_t::publish_endata(const char* begin, bool present)
+{
+  std::lock_guard<std::mutex> lock(mutex_);
+  endata_begin_   = begin;
+  endata_present_ = present;
+  endata_ready_.store(true, std::memory_order_release);
+}
+
+bool mps_phase_registry_t::endata_ready() const
+{
+  return endata_ready_.load(std::memory_order_acquire);
+}
+
+const char* mps_phase_registry_t::endata_begin() const
+{
+  assert(endata_ready());
+  return endata_begin_;
+}
+
+bool mps_phase_registry_t::endata_present() const
+{
+  assert(endata_ready());
+  return endata_present_;
+}
+
 static section_record_match_t is_section_record(const char* line_start,
                                                 const char* line_end,
                                                 mps_section_kind* kind)
@@ -397,6 +422,12 @@ void mps_section_block_scanner_t::publish_section_ranges()
       registry_.publish(mps_phase_kind::quadratic, {nullptr, nullptr, false});
     }
   }
+
+  if (available(endata)) {
+    registry_.publish_endata(endata, true);
+  } else if (final_ready && final_boundary != nullptr) {
+    registry_.publish_endata(final_boundary, false);
+  }
 }
 
 }  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
index 74bf89da7f..9fcffa6ea7 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
@@ -51,6 +51,11 @@ class mps_phase_registry_t {
   // acquire load in ready() pairs with publish()'s release store before ranges_.
   mps_phase_range_t range(mps_phase_kind phase) const;
 
+  void publish_endata(const char* begin, bool present);
+  bool endata_ready() const;
+  const char* endata_begin() const;
+  bool endata_present() const;
+
  private:
   static constexpr std::size_t phase_count = 7;
 
@@ -61,6 +66,9 @@ class mps_phase_registry_t {
   omp_event_handle_t events_[phase_count]{};
   bool has_event_[phase_count]{};
   bool event_fulfilled_[phase_count]{};
+  const char* endata_begin_ = nullptr;
+  bool endata_present_      = false;
+  std::atomic<bool> endata_ready_{false};
   mutable std::mutex mutex_;
 };
 
diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
index f8a6d04d1e..fac9e64d78 100644
--- a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
+++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
@@ -59,24 +59,24 @@ inline std::uint32_t color_for_name(std::string_view name) noexcept
   return colors::generic;
 }
 
-class scoped_range {
+class scoped_range_t {
  public:
-  explicit scoped_range(const char* name,
-                        std::uint32_t color    = colors::generic,
-                        std::uint32_t category = 0)
+  explicit scoped_range_t(const char* name,
+                          std::uint32_t color    = colors::generic,
+                          std::uint32_t category = 0)
   {
     push(name, color, category);
   }
 
-  explicit scoped_range(std::string name,
-                        std::uint32_t color    = colors::generic,
-                        std::uint32_t category = 0)
+  explicit scoped_range_t(std::string name,
+                          std::uint32_t color    = colors::generic,
+                          std::uint32_t category = 0)
     : owned_name_(std::move(name))
   {
     push(owned_name_.c_str(), color, category);
   }
 
-  ~scoped_range() { end(); }
+  ~scoped_range_t() { end(); }
 
   void end()
   {
@@ -88,8 +88,8 @@ class scoped_range {
 #endif
   }
 
-  scoped_range(const scoped_range&)            = delete;
-  scoped_range& operator=(const scoped_range&) = delete;
+  scoped_range_t(const scoped_range_t&)            = delete;
+  scoped_range_t& operator=(const scoped_range_t&) = delete;
 
  private:
   void push(const char* name, std::uint32_t color, std::uint32_t category)
@@ -132,4 +132,4 @@ inline void name_current_thread(const char* name)
 #define MPS_FAST_NVTX_CONCAT_INNER(a, b) a##b
 #define MPS_FAST_NVTX_CONCAT(a, b)       MPS_FAST_NVTX_CONCAT_INNER(a, b)
 #define MPS_NVTX_RANGE(name, color) \
-  ::mps_fast::nvtx::scoped_range MPS_FAST_NVTX_CONCAT(_mps_nvtx_range_, __LINE__)(name, color)
+  ::mps_fast::nvtx::scoped_range_t MPS_FAST_NVTX_CONCAT(_mps_nvtx_range_, __LINE__)(name, color)
diff --git a/cpp/src/io/file_to_string.cpp b/cpp/src/io/file_to_string.cpp
index 77b92d90e9..5823381098 100644
--- a/cpp/src/io/file_to_string.cpp
+++ b/cpp/src/io/file_to_string.cpp
@@ -22,9 +22,9 @@
 #include <zlib.h>
 #endif  // MPS_PARSER_WITH_ZLIB
 
-#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB)
+#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB) || defined(MPS_PARSER_WITH_LZ4)
 #include <dlfcn.h>
-#endif  // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB
+#endif  // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB || MPS_PARSER_WITH_LZ4
 
 namespace {
 using cuopt::linear_programming::io::error_type_t;
@@ -207,6 +207,163 @@ std::vector<char> zlib_file_to_string(const std::string& file)
 }  // end namespace
 #endif  // MPS_PARSER_WITH_ZLIB
 
+#ifdef MPS_PARSER_WITH_LZ4
+namespace {
+// Minimal liblz4 frame ABI declarations; keep in sync with lz4frame.h.
+struct LZ4F_dctx;
+using LZ4F_errorCode_t = size_t;
+struct LZ4F_frameInfo_t {
+  int blockSizeID;
+  int blockMode;
+  int contentChecksumFlag;
+  int frameType;
+  unsigned long long contentSize;
+  unsigned dictID;
+  int blockChecksumFlag;
+};
+using LZ4F_createDecompressionContext_t = LZ4F_errorCode_t (*)(LZ4F_dctx**, unsigned);
+using LZ4F_freeDecompressionContext_t   = LZ4F_errorCode_t (*)(LZ4F_dctx*);
+using LZ4F_getFrameInfo_t               = LZ4F_errorCode_t (*)(LZ4F_dctx*,
+                                                 LZ4F_frameInfo_t*,
+                                                 const void*,
+                                                 size_t*);
+using LZ4F_decompress_t =
+  LZ4F_errorCode_t (*)(LZ4F_dctx*, void*, size_t*, const void*, size_t*, const void*);
+using LZ4F_isError_t      = unsigned (*)(LZ4F_errorCode_t);
+using LZ4F_getErrorName_t = const char* (*)(LZ4F_errorCode_t);
+
+std::vector<char> lz4_file_to_string(const std::string& file)
+{
+  struct DlCloseDeleter {
+    void operator()(void* fp)
+    {
+      mps_parser_expects_fatal(
+        dlclose(fp) == 0, error_type_t::ValidationError, "Error closing liblz4.so!");
+    }
+  };
+  struct Lz4DctxDeleter {
+    void operator()(LZ4F_dctx* f)
+    {
+      if (f != nullptr) {
+        const LZ4F_errorCode_t err = fptr(f);
+        mps_parser_expects_fatal(
+          !is_error(err), error_type_t::ValidationError, "Error closing lz4 file!");
+      }
+    }
+    LZ4F_freeDecompressionContext_t fptr = nullptr;
+    LZ4F_isError_t is_error              = nullptr;
+  };
+
+  void* raw_lz4handle = nullptr;
+  for (const char* soname : {"liblz4.so.1", "liblz4.so"}) {
+    raw_lz4handle = dlopen(soname, RTLD_LAZY);
+    if (raw_lz4handle != nullptr) break;
+  }
+  std::unique_ptr<void, DlCloseDeleter> lz4handle{raw_lz4handle};
+  mps_parser_expects(lz4handle != nullptr,
+                     error_type_t::ValidationError,
+                     "Could not open .lz4 file since liblz4 was not found "
+                     "(tried liblz4.so.1, liblz4.so). In order to open .lz4 files directly, "
+                     "please ensure liblz4 is installed. Alternatively, decompress the .lz4 file "
+                     "manually and open the uncompressed file. Given path: %s",
+                     file.c_str());
+
+  LZ4F_createDecompressionContext_t LZ4F_createDecompressionContext =
+    reinterpret_cast<LZ4F_createDecompressionContext_t>(
+      dlsym(lz4handle.get(), "LZ4F_createDecompressionContext"));
+  LZ4F_freeDecompressionContext_t LZ4F_freeDecompressionContext =
+    reinterpret_cast<LZ4F_freeDecompressionContext_t>(
+      dlsym(lz4handle.get(), "LZ4F_freeDecompressionContext"));
+  LZ4F_getFrameInfo_t LZ4F_getFrameInfo =
+    reinterpret_cast<LZ4F_getFrameInfo_t>(dlsym(lz4handle.get(), "LZ4F_getFrameInfo"));
+  LZ4F_decompress_t LZ4F_decompress =
+    reinterpret_cast<LZ4F_decompress_t>(dlsym(lz4handle.get(), "LZ4F_decompress"));
+  LZ4F_isError_t LZ4F_isError =
+    reinterpret_cast<LZ4F_isError_t>(dlsym(lz4handle.get(), "LZ4F_isError"));
+  LZ4F_getErrorName_t LZ4F_getErrorName =
+    reinterpret_cast<LZ4F_getErrorName_t>(dlsym(lz4handle.get(), "LZ4F_getErrorName"));
+  mps_parser_expects(
+    LZ4F_createDecompressionContext != nullptr && LZ4F_freeDecompressionContext != nullptr &&
+      LZ4F_getFrameInfo != nullptr && LZ4F_decompress != nullptr && LZ4F_isError != nullptr &&
+      LZ4F_getErrorName != nullptr,
+    error_type_t::ValidationError,
+    "Error loading liblz4! Library version might be incompatible. Please decompress the .lz4 "
+    "file manually and open the uncompressed file. Given path: %s",
+    file.c_str());
+
+  std::unique_ptr<FILE, FcloseDeleter> fp{fopen(file.c_str(), "rb")};
+  mps_parser_expects(fp != nullptr,
+                     error_type_t::ValidationError,
+                     "Error opening input file! Given path: %s",
+                     file.c_str());
+  mps_parser_expects(fseek(fp.get(), 0L, SEEK_END) == 0,
+                     error_type_t::ValidationError,
+                     "Error seeking input file! Given path: %s",
+                     file.c_str());
+  const long compressed_size = ftell(fp.get());
+  mps_parser_expects(compressed_size != -1L,
+                     error_type_t::ValidationError,
+                     "Error sizing input file! Given path: %s",
+                     file.c_str());
+  std::vector<char> compressed(compressed_size);
+  rewind(fp.get());
+  mps_parser_expects(fread(compressed.data(), sizeof(char), compressed_size, fp.get()) ==
+                       static_cast<size_t>(compressed_size),
+                     error_type_t::ValidationError,
+                     "Error reading input file! Given path: %s",
+                     file.c_str());
+
+  constexpr unsigned lz4f_version = 100;
+  LZ4F_dctx* raw_dctx             = nullptr;
+  LZ4F_errorCode_t lz4_status     = LZ4F_createDecompressionContext(&raw_dctx, lz4f_version);
+  mps_parser_expects(!LZ4F_isError(lz4_status),
+                     error_type_t::ValidationError,
+                     "Could not open lz4 compressed file '%s': %s",
+                     file.c_str(),
+                     LZ4F_getErrorName(lz4_status));
+  std::unique_ptr<LZ4F_dctx, Lz4DctxDeleter> dctx{raw_dctx,
+                                                  {LZ4F_freeDecompressionContext, LZ4F_isError}};
+
+  const char* src = compressed.data();
+  size_t src_size = compressed.size();
+  LZ4F_frameInfo_t frame_info{};
+  size_t src_used = src_size;
+  lz4_status      = LZ4F_getFrameInfo(dctx.get(), &frame_info, src, &src_used);
+  mps_parser_expects(!LZ4F_isError(lz4_status),
+                     error_type_t::ValidationError,
+                     "Error reading lz4 frame info for input file '%s': %s",
+                     file.c_str(),
+                     LZ4F_getErrorName(lz4_status));
+  src += src_used;
+  src_size -= src_used;
+
+  std::vector<char> buf;
+  if (frame_info.contentSize > 0) { buf.reserve((size_t)frame_info.contentSize + 1); }
+  const size_t readbufsize = 1ull << 24;  // 16MiB
+  std::vector<char> readbuf(readbufsize);
+  while (lz4_status != 0) {
+    size_t dst_size = readbuf.size();
+    src_used        = src_size;
+    lz4_status = LZ4F_decompress(dctx.get(), readbuf.data(), &dst_size, src, &src_used, nullptr);
+    mps_parser_expects(!LZ4F_isError(lz4_status),
+                       error_type_t::ValidationError,
+                       "Error in lz4 decompression of input file '%s': %s",
+                       file.c_str(),
+                       LZ4F_getErrorName(lz4_status));
+    if (dst_size > 0) { buf.insert(buf.end(), begin(readbuf), begin(readbuf) + dst_size); }
+    src += src_used;
+    src_size -= src_used;
+    mps_parser_expects(src_used != 0 || dst_size != 0 || lz4_status == 0,
+                       error_type_t::ValidationError,
+                       "Stalled lz4 decompression of input file! Given path: %s",
+                       file.c_str());
+  }
+  buf.push_back('\0');
+  return buf;
+}
+}  // end namespace
+#endif  // MPS_PARSER_WITH_LZ4
+
 namespace cuopt::linear_programming::io::detail {
 
 std::vector<char> file_to_string(const std::string& file)
@@ -223,6 +380,12 @@ std::vector<char> file_to_string(const std::string& file)
   }
 #endif  // MPS_PARSER_WITH_ZLIB
 
+#ifdef MPS_PARSER_WITH_LZ4
+  if (file.size() > 4 && file.substr(file.size() - 4, 4) == ".lz4") {
+    return lz4_file_to_string(file);
+  }
+#endif  // MPS_PARSER_WITH_LZ4
+
   // Faster than using C++ I/O
   std::unique_ptr<FILE, FcloseDeleter> fp{fopen(file.c_str(), "r")};
   mps_parser_expects(fp != nullptr,
diff --git a/cpp/src/io/file_to_string.hpp b/cpp/src/io/file_to_string.hpp
index 94b2df821d..3b1924e12c 100644
--- a/cpp/src/io/file_to_string.hpp
+++ b/cpp/src/io/file_to_string.hpp
@@ -17,6 +17,7 @@ namespace cuopt::linear_programming::io::detail {
 // The dispatcher looks at the extension:
 //   - ".bz2" → libbz2 (dlopen'd at runtime), if MPS_PARSER_WITH_BZIP2.
 //   - ".gz"  → libz   (dlopen'd at runtime), if MPS_PARSER_WITH_ZLIB.
+//   - ".lz4" → liblz4 (dlopen'd at runtime), if MPS_PARSER_WITH_LZ4.
 //   - otherwise → plain fopen.
 // The returned buffer's size includes the null terminator.
 std::vector<char> file_to_string(const std::string& file);
diff --git a/cpp/src/utilities/perf_counters.hpp b/cpp/src/utilities/perf_counters.hpp
index 1baaf011e5..96a881c880 100644
--- a/cpp/src/utilities/perf_counters.hpp
+++ b/cpp/src/utilities/perf_counters.hpp
@@ -16,6 +16,37 @@
 
 namespace mps_fast {
 
+// Utils to return the total resident set size (used physical pages)
+static size_t parse_status_kb_line(const char* line, const char* key)
+{
+  size_t key_len = std::strlen(key);
+  if (std::strncmp(line, key, key_len) != 0) { return 0; }
+  const char* p = line + key_len;
+  while (*p == ' ' || *p == '\t') {
+    ++p;
+  }
+  char* end_ptr = nullptr;
+  size_t value  = std::strtol(p, &end_ptr, 10);
+  return value;
+}
+
+static std::pair<size_t, size_t> current_process_rss_kb()
+{
+  FILE* file = std::fopen("/proc/self/status", "r");
+  if (file == nullptr) { return {0, 0}; }
+
+  size_t rss_kb = 0;
+  size_t hwm_kb = 0;
+  char line[256];
+  while (std::fgets(line, sizeof(line), file) != nullptr) {
+    if (rss_kb == 0) { rss_kb = parse_status_kb_line(line, "VmRSS:"); }
+    if (hwm_kb == 0) { hwm_kb = parse_status_kb_line(line, "VmHWM:"); }
+    if (rss_kb != 0 && hwm_kb != 0) { break; }
+  }
+  std::fclose(file);
+  return {rss_kb, hwm_kb};
+}
+
 struct perf_counter_spec_t {
   const char* name;
   uint32_t type;
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
index 2e087ec4ee..ad6fab51fc 100644
--- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
@@ -815,6 +815,49 @@ void lz4_and_raw_paths_match_on_multiblock_input()
   expect_vector_eq(lz4.variable_upper_bounds_, raw.variable_upper_bounds_, "lz4 upper bounds");
 }
 
+void gzip_bzip2_and_raw_paths_match()
+{
+  std::string mps;
+  mps += "NAME COMPRESSED\nROWS\n N OBJ\n L R1\n G R2\nCOLUMNS\n";
+  mps += " X1 OBJ 1 R1 2.5\n X2 R1 -3.25 R2 4\n";
+  mps += "RHS\n RHS1 R1 7 R2 8\nBOUNDS\n BV BND X1\n UP BND X2 10\nENDATA\n";
+
+  TempMpsFile raw_file(std::move(mps));
+  TempOwnedPath gzip_file(raw_file.path + ".gz");
+  TempOwnedPath bzip2_file(raw_file.path + ".bz2");
+
+  const std::string gzip_cmd  = "gzip -c " + raw_file.path + " > " + gzip_file.path;
+  const std::string bzip2_cmd = "bzip2 -c " + raw_file.path + " > " + bzip2_file.path;
+  if (std::system(gzip_cmd.c_str()) != 0) { throw skip_test("gzip CLI unavailable"); }
+  if (std::system(bzip2_cmd.c_str()) != 0) { throw skip_test("bzip2 CLI unavailable"); }
+
+  auto raw =
+    mps_fast::parse_mps_fast_file<int, double>(raw_file.path, mps_fast::FileReadMethod::Read);
+  auto gzip =
+    mps_fast::parse_mps_fast_file<int, double>(gzip_file.path, mps_fast::FileReadMethod::Read);
+  auto bzip2 =
+    mps_fast::parse_mps_fast_file<int, double>(bzip2_file.path, mps_fast::FileReadMethod::Read);
+
+  expect_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity");
+  expect_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity");
+  expect_vector_eq(gzip.A_, raw.A_, "gzip A values");
+  expect_vector_eq(bzip2.A_, raw.A_, "bzip2 A values");
+  expect_vector_eq(gzip.A_indices_, raw.A_indices_, "gzip A indices");
+  expect_vector_eq(bzip2.A_indices_, raw.A_indices_, "bzip2 A indices");
+  expect_vector_eq(gzip.A_offsets_, raw.A_offsets_, "gzip A offsets");
+  expect_vector_eq(bzip2.A_offsets_, raw.A_offsets_, "bzip2 A offsets");
+  expect_vector_eq(gzip.c_, raw.c_, "gzip objective");
+  expect_vector_eq(bzip2.c_, raw.c_, "bzip2 objective");
+  expect_vector_eq(gzip.b_, raw.b_, "gzip rhs");
+  expect_vector_eq(bzip2.b_, raw.b_, "bzip2 rhs");
+  expect_vector_eq(gzip.variable_lower_bounds_, raw.variable_lower_bounds_, "gzip lower bounds");
+  expect_vector_eq(bzip2.variable_lower_bounds_, raw.variable_lower_bounds_, "bzip2 lower bounds");
+  expect_vector_eq(gzip.variable_upper_bounds_, raw.variable_upper_bounds_, "gzip upper bounds");
+  expect_vector_eq(bzip2.variable_upper_bounds_, raw.variable_upper_bounds_, "bzip2 upper bounds");
+  expect_vector_eq(gzip.var_types_, raw.var_types_, "gzip var types");
+  expect_vector_eq(bzip2.var_types_, raw.var_types_, "bzip2 var types");
+}
+
 }  // namespace
 
 int main()
@@ -846,6 +889,7 @@ int main()
     {"LargeColumnsRepeatedColumnChunkBoundary", large_columns_repeated_column_chunk_boundary},
     {"LargeBoundsRepeatedVarStaysOrdered", large_bounds_repeated_var_stays_ordered},
     {"Lz4AndRawPathsMatchOnMultiblockInput", lz4_and_raw_paths_match_on_multiblock_input},
+    {"GzipBzip2AndRawPathsMatch", gzip_bzip2_and_raw_paths_match},
   };
 
   int failed = 0;

From 62c8dcda56b95d59e4495dd238c8ad051ec6257c Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Thu, 11 Jun 2026 08:52:39 -0700
Subject: [PATCH 09/22] further cleanup

---
 cpp/CMakeLists.txt                            |  13 +-
 cpp/cuopt_cli.cpp                             |   7 +-
 .../cuopt/linear_programming/io/parser.hpp    |  59 +-
 .../fast_fp64_parser.hpp                      |   6 +-
 .../io/experimental_mps_fast/fast_parser.cpp  |   2 +-
 .../io/experimental_mps_fast/file_reader.cpp  | 335 ++++++++
 .../experimental_mps_fast/lz4_file_reader.cpp |   8 +
 cpp/src/io/file_to_string.cpp                 |  19 +-
 cpp/tests/linear_programming/CMakeLists.txt   |  46 +-
 .../fast_fp64_parser_test.cpp                 |  99 +--
 .../fast_parser_edge_test.cpp                 | 794 ++++++++----------
 cpp/tests/linear_programming/parser_test.cpp  |  40 +-
 12 files changed, 784 insertions(+), 644 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e134d49d02..4ecb1e9a46 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -54,7 +54,6 @@ option(SKIP_ROUTING_BUILD "Skip building routing components" OFF)
 option(SKIP_GRPC_BUILD "Skip building gRPC and protobuf components" OFF)
 option(WRITE_FATBIN "Enable fatbin writing" ON)
 option(HOST_LINEINFO "Build with debug line information for host code" OFF)
-option(MPS_FAST_TIMERS "Enable experimental fast MPS parser phase timer printouts" OFF)
 
 message(VERBOSE "cuOpt: Enable nvcc -lineinfo: ${CMAKE_CUDA_LINEINFO}")
 message(VERBOSE "cuOpt: Build cuOpt unit-tests: ${BUILD_TESTS}")
@@ -65,7 +64,6 @@ message(VERBOSE "cuOpt: Skip C/Python adapters: ${SKIP_C_PYTHON_ADAPTERS}")
 message(VERBOSE "cuOpt: Skip routing build: ${SKIP_ROUTING_BUILD}")
 message(VERBOSE "cuOpt: Build with debug line information for host code: ${HOST_LINEINFO}")
 message(VERBOSE "cuOpt: fatbin: ${WRITE_FATBIN}")
-message(VERBOSE "cuOpt: Fast MPS parser timers: ${MPS_FAST_TIMERS}")
 
 # ##################################################################################################
 # - compiler options ------------------------------------------------------------------------------
@@ -204,8 +202,7 @@ endif ()
 find_package(OpenMP REQUIRED)
 message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}")
 
-# MPS/QPS parser supports compressed inputs via bzip2 and zlib; the experimental fast MPS parser
-# supports LZ4 via runtime-loaded liblz4.
+# MPS/QPS parser supports compressed inputs via bzip2, zlib and lz4
 option(CUOPT_PARSER_WITH_BZIP2 "Build MPS parser with bzip2 decompression" ON)
 option(CUOPT_PARSER_WITH_ZLIB "Build MPS parser with zlib decompression" ON)
 option(CUOPT_PARSER_WITH_LZ4 "Build experimental fast MPS parser with LZ4 decompression" ON)
@@ -464,12 +461,16 @@ if (HOST_LINEINFO)
     set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
 endif ()
 
+# Needed for the fast MPS parser
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND
         CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$")
     set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
             APPEND PROPERTY COMPILE_OPTIONS "-mbmi2;-mavx2;-msse4.2")
 endif ()
 
+# TODO: figure out a set of flags for ARM that fits the range of CPUs we wish to support (neoverse?)
+# NEON should be universal on aarch64 and enough for our purposes (parsing) though
+
 # Apply -UNDEBUG only to solver source files (not gRPC infrastructure).
 # Must happen before gRPC files are appended to CUOPT_SRC_FILES.
 # Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO).
@@ -519,10 +520,6 @@ target_compile_definitions(cuopt
   PUBLIC CUSPARSE_ENABLE_EXPERIMENTAL_API
 )
 
-if (MPS_FAST_TIMERS)
-    target_compile_definitions(cuopt PRIVATE MPS_FAST_TIMERS=1)
-endif ()
-
 target_compile_options(cuopt
         PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUOPT_CXX_FLAGS}>"
         "$<$<COMPILE_LANGUAGE:CUDA>:${CUOPT_CUDA_FLAGS}>"
diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index 714d76dbf5..55c506721a 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -307,10 +307,11 @@ int main(int argc, char* argv[])
 
   program.add_argument("--mps-reader")
     .help(
-      "MPS reader implementation: default uses the production parser; fast uses the experimental "
+      "MPS reader implementation: default uses the production parser; experimental-fast uses the "
+      "experimental "
       "SIMD parser for LP/MIP .mps and .mps.lz4 files")
     .default_value(std::string("default"))
-    .choices("default", "fast");
+    .choices("default", "experimental-fast");
 
   program.add_argument("--dump-hyper-params")
     .help("print hyper-parameters only in config file format and exit")
@@ -415,7 +416,7 @@ int main(int argc, char* argv[])
   const auto mps_reader_arg        = program.get<std::string>("--mps-reader");
 
   auto mps_reader = cuopt::linear_programming::io::mps_reader_type_t::default_reader;
-  if (mps_reader_arg == "fast") {
+  if (mps_reader_arg == "experimental-fast") {
     mps_reader = cuopt::linear_programming::io::mps_reader_type_t::fast_experimental;
   }
 
diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp
index 08254f84b3..4e46d43224 100644
--- a/cpp/include/cuopt/linear_programming/io/parser.hpp
+++ b/cpp/include/cuopt/linear_programming/io/parser.hpp
@@ -20,8 +20,7 @@ namespace cuopt::linear_programming::io {
 /**
  * @brief Selects which MPS reader implementation should be used by dispatching entry points.
  *
- * The experimental fast reader is intentionally opt-in. It currently supports LP/MIP problems
- * from raw .mps, .mps.lz4, .mps.gz, and .mps.bz2 files.
+ * The experimental fast reader is intentionally opt-in. It currently supports LP/MIP/QP problems.
  */
 enum class mps_reader_type_t { default_reader, fast_experimental };
 
@@ -52,10 +51,8 @@ mps_data_model_t<i_t, f_t> read_mps(const std::string& mps_file_path,
                                     bool fixed_mps_format = false);
 
 /**
- * @brief Reads a raw LP/MIP MPS problem with the experimental SIMD-optimized reader.
- *
- * This prototype reader supports raw .mps plus .mps.lz4/.mps.gz/.mps.bz2 files. It does not
- * support LP, QPS, quadratic constraint sections, or fixed-format forcing.
+ * @brief Reads an LP/MIP/QP MPS problem (raw or compressed) with the experimental SIMD-optimized
+ * reader. SOCP is unsupported for now.
  *
  * @param[in] mps_file_path Path to a raw or compressed .mps file.
  * @return mps_data_model_t A fully formed LP/MIP problem which represents the given file.
@@ -127,11 +124,6 @@ mps_data_model_t<i_t, f_t> read_lp(const std::string& lp_file_path);
 template <typename i_t, typename f_t>
 mps_data_model_t<i_t, f_t> read_lp_from_string(std::string_view lp_contents);
 
-template <typename i_t, typename f_t>
-inline mps_data_model_t<i_t, f_t> read(const std::string& path,
-                                       mps_reader_type_t mps_reader,
-                                       bool fixed_mps_format = false);
-
 /**
  * @brief Reads an optimization problem from a file, dispatching on the file
  *        extension. Extension matching is case-insensitive.
@@ -146,39 +138,30 @@ inline mps_data_model_t<i_t, f_t> read(const std::string& path,
  * want both formats to "just work" without an explicit format flag.
  *
  * @param[in] path Path to the input file.
+ * @param[in] mps_reader Selects the MPS reader implementation for MPS/QPS inputs.
  * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format;
  *             ignored for LP inputs. False by default.
  * @return mps_data_model_t The parsed problem.
  */
-template <typename i_t, typename f_t>
-inline mps_data_model_t<i_t, f_t> read(const std::string& path, bool fixed_mps_format = false)
-{
-  return read<i_t, f_t>(path, mps_reader_type_t::default_reader, fixed_mps_format);
-}
-
 template <typename i_t, typename f_t>
 inline mps_data_model_t<i_t, f_t> read(const std::string& path,
                                        mps_reader_type_t mps_reader,
-                                       bool fixed_mps_format)
+                                       bool fixed_mps_format = false)
 {
   std::string lower(path);
   std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
     return static_cast<char>(std::tolower(c));
   });
-  const bool is_mps_lz4  = lower.ends_with(".mps.lz4");
-  const bool is_mps_gzip = lower.ends_with(".mps.gz");
-  const bool is_mps_bzip = lower.ends_with(".mps.bz2");
-  const bool is_qps_lz4  = lower.ends_with(".qps.lz4");
-  const bool is_lp_lz4   = lower.ends_with(".lp.lz4");
-  if (lower.ends_with(".mps") || is_mps_lz4 || is_mps_gzip || is_mps_bzip ||
-      lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2") ||
-      is_qps_lz4) {
+  if (lower.ends_with(".mps.lz4") || lower.ends_with(".mps.bz2") || lower.ends_with(".mps.gz") ||
+      lower.ends_with(".mps") || lower.ends_with(".qps.lz4") || lower.ends_with(".qps.bz2") ||
+      lower.ends_with(".qps.gz") || lower.ends_with(".qps")) {
     if (mps_reader == mps_reader_type_t::fast_experimental) {
       if (fixed_mps_format) {
         throw std::logic_error(
           "experimental fast MPS reader does not support fixed MPS format forcing");
       }
-      if (!lower.ends_with(".mps") && !is_mps_lz4 && !is_mps_gzip && !is_mps_bzip) {
+      if (lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2") ||
+          lower.ends_with(".qps.lz4")) {
         throw std::logic_error(
           "experimental fast MPS reader supports .mps, .mps.lz4, .mps.gz, and .mps.bz2 "
           "LP/MIP files only");
@@ -187,8 +170,8 @@ inline mps_data_model_t<i_t, f_t> read(const std::string& path,
     }
     return read_mps<i_t, f_t>(path, fixed_mps_format);
   }
-  if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2") ||
-      is_lp_lz4) {
+  if (lower.ends_with(".lp.lz4") || lower.ends_with(".lp.bz2") || lower.ends_with(".lp.gz") ||
+      lower.ends_with(".lp")) {
     return read_lp<i_t, f_t>(path);
   }
   throw std::logic_error(
@@ -199,4 +182,22 @@ inline mps_data_model_t<i_t, f_t> read(const std::string& path,
     path);
 }
 
+/**
+ * @brief Reads an optimization problem from a file using the default MPS
+ *        reader. Extension matching is case-insensitive.
+ *
+ * Thin wrapper that forwards to the 3-argument read() overload with
+ * mps_reader_type_t::default_reader; see it for the supported extensions.
+ * @param[in] path Path to the input file.
+ * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format;
+ *             ignored for LP inputs. False by default.
+ * @return mps_data_model_t The parsed problem.
+ * @throws std::logic_error If the 3-argument overload rejects the extension.
+ */
+template <typename i_t, typename f_t>
+inline mps_data_model_t<i_t, f_t> read(const std::string& path, bool fixed_mps_format = false)
+{
+  return read<i_t, f_t>(path, mps_reader_type_t::default_reader, fixed_mps_format);
+}
+
 }  // namespace cuopt::linear_programming::io
diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
index 0f947aa644..e446494639 100644
--- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
@@ -325,7 +325,11 @@ static inline double fallback_strtod(std::string_view s)
 
   char* parse_end = nullptr;
   errno           = 0;
-  return std::strtod(stack_buf, &parse_end);
+  double value    = std::strtod(stack_buf, &parse_end);
+  if (parse_end != stack_buf + s.size() || errno == ERANGE) {
+    mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token");
+  }
+  return value;
 }
 
 // see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index 33bf916e05..bc9000f8f3 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -11,7 +11,7 @@
 #include "nvtx_ranges.hpp"
 
 #include <cuda/cmath>
-#ifdef MPS_FAST_PERF_COUNTERS
+#if defined(MPS_FAST_PERF_COUNTERS) || defined(MPS_FAST_TIMERS)
 #include <utilities/perf_counters.hpp>
 #endif
 
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index e69de29bb2..5eae15a46a 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -0,0 +1,335 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "file_reader.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <utilities/error.hpp>
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace mps_fast {
+
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_fail;
+
+namespace {
+// Raw-read tuning: bytes per pread window, reader-thread cap, default O_DIRECT size threshold.
+constexpr std::size_t raw_input_window_bytes              = 64ull * 1024ull * 1024ull;
+constexpr std::size_t raw_input_max_read_threads          = 8;
+constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull;
+
+// True if `path` ends with `suffix` ignoring ASCII case, like the lowercased match in io::read().
+bool path_has_suffix(const std::string& path, const char* suffix) noexcept
+{
+  auto fold = [](char c) { return (c >= 'A' && c <= 'Z') ? (char)(c - 'A' + 'a') : c; };
+  auto same = [&](char a, char b) { return fold(a) == fold(b); };
+  std::size_t n = std::strlen(suffix);
+  return path.size() >= n && std::equal(suffix, suffix + n, path.data() + (path.size() - n), same);
+}
+
+std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
+{
+  if (alignment == 0 || value % alignment == 0) { return value; }  // disabled / already aligned
+  std::size_t increment = alignment - value % alignment;
+  if (value > std::numeric_limits<std::size_t>::max() - increment) {  // sum would wrap around
+    mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow");
+  }
+  return value + increment;
+}
+
+std::size_t add_input_padding(std::size_t size)
+{
+  if (size > std::numeric_limits<std::size_t>::max() - input_buffer_padding_bytes) {
+    mps_parser_fail(error_type_t::OutOfMemoryError, "input padding size overflow");
+  }
+  return size + input_buffer_padding_bytes;
+}
+
+}  // namespace
+
+// Size in bytes that fstat() reports for `fd`; `path` is used only in error messages.
+std::size_t get_file_size(int fd, const std::string& path)
+{
+  struct stat info;
+  if (::fstat(fd, &info) != 0) {
+    const char* reason = std::strerror(errno);
+    mps_parser_fail(
+      error_type_t::RuntimeError, "Failed to stat file '%s': %s", path.c_str(), reason);
+  }
+  if (info.st_size < 0) {
+    mps_parser_fail(error_type_t::RuntimeError, "Negative file size for '%s'", path.c_str());
+  }
+  return static_cast<std::size_t>(info.st_size);
+}
+
+std::size_t get_file_size(const std::string& path)
+{
+  int fd = ::open(path.c_str(), O_RDONLY);
+  if (fd < 0) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open file '%s': %s", path.c_str(), std::strerror(errno));
+  }
+  struct fd_guard_t {  // closes fd on every exit, even if the fd-based overload fails by throwing
+    int fd;
+    ~fd_guard_t() { ::close(fd); }
+  } guard{fd};
+  return get_file_size(fd, path);
+}
+
+std::size_t system_page_size()
+{
+  static const std::size_t cached_page_size = [] {  // evaluated once, on the first call
+    const long reported = ::sysconf(_SC_PAGESIZE);
+    return reported > 0 ? static_cast<std::size_t>(reported) : static_cast<std::size_t>(4096);
+  }();
+  return cached_page_size;
+}
+
+raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
+{
+  MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io);
+  buffered_fd_ = ::open(path.c_str(), O_RDONLY);
+  if (buffered_fd_ < 0) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open raw MPS file '%s': %s", path.c_str(), std::strerror(errno));
+  }
+  fd_ = buffered_fd_;
+  try {  // ~raw_input_stream_t() never runs if construction throws, so the catch closes the fds
+    file_size_             = get_file_size(buffered_fd_, path);
+    const char* raw_direct = std::getenv("MPS_FAST_RAW_DIRECT_IO");  // optional override
+    bool use_direct_io =
+      raw_direct ? raw_direct[0] != '0' : file_size_ > raw_input_direct_io_threshold_bytes;
+    if (use_direct_io) {
+#ifdef O_DIRECT
+      int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT);
+      if (direct_fd >= 0) {  // otherwise keep reading through the buffered descriptor
+        fd_        = direct_fd;
+        direct_io_ = true;
+      }
+#endif
+    }
+    window_bytes_ = raw_input_window_bytes;
+    window_count_ = std::max<std::size_t>(1, (file_size_ + window_bytes_ - 1) / window_bytes_);
+    output_mapped_size_ = round_up_to_multiple(
+      std::max<std::size_t>(add_input_padding(file_size_), 1), system_page_size());
+    output_region_ = mmap_region_t::anonymous(
+      output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer");
+    output_data_ = output_region_.char_data();
+    output_region_.advise(MADV_HUGEPAGE);
+    block_done_.resize(window_count_, 0);
+    block_end_.resize(window_count_, 0);
+    section_scanner_ =
+      std::make_unique<mps_section_block_scanner_t>(output_data_, window_count_, registry_);
+  } catch (...) {
+    if (fd_ != buffered_fd_) { ::close(fd_); }
+    ::close(buffered_fd_);
+    throw;
+  }
+}
+
+raw_input_stream_t::~raw_input_stream_t()
+{
+  if (buffered_fd_ >= 0 && buffered_fd_ != fd_) { ::close(buffered_fd_); }  // only if distinct
+  if (fd_ >= 0) { ::close(fd_); }
+}
+
+const char* raw_input_stream_t::data() const noexcept { return output_data_; }
+char* raw_input_stream_t::mutable_data() noexcept { return output_data_; }
+std::size_t raw_input_stream_t::size() const noexcept { return output_view_size_; }
+std::size_t raw_input_stream_t::compressed_size() const noexcept { return file_size_; }
+std::size_t raw_input_stream_t::reserve_size_hint() const noexcept { return file_size_; }
+mps_phase_registry_t& raw_input_stream_t::registry() noexcept { return registry_; }
+input_stream_view_t raw_input_stream_t::view() noexcept
+{
+  return {output_data_, output_data_, output_view_size_, file_size_, &registry_};
+}
+
+void raw_input_stream_t::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("raw_input_run_read_tasks", nvtx::colors::io);
+  if (file_size_ == 0) {
+    output_view_size_ = 0;
+    section_scanner_->publish_ready(0);
+    return;
+  }
+
+  std::size_t hw_threads =
+    std::max<std::size_t>(1, (std::size_t)std::thread::hardware_concurrency());
+  std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads);
+  thread_count             = std::max<std::size_t>(1, std::min(thread_count, window_count_));
+
+  std::atomic_size_t next_window{0};
+  std::exception_ptr first_error = nullptr;
+  std::mutex error_mutex;
+  std::atomic_bool stop{false};
+
+  // Records only the first failure and tells the other workers to stop claiming windows.
+  auto mark_error = [&](std::exception_ptr eptr) {
+    std::lock_guard<std::mutex> lock(error_mutex);
+    if (!first_error) {
+      first_error = eptr;
+      stop.store(true, std::memory_order_release);
+    }
+  };
+
+  // Reads window `index` into its slot of the output buffer, scans it, then advances the
+  // contiguous ready prefix.
+  auto read_window = [&](std::size_t index) {
+    MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io);
+    std::size_t offset = index * window_bytes_;
+    std::size_t size   = std::min(window_bytes_, file_size_ - offset);
+    std::size_t done   = 0;
+    {
+      MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io);
+      while (done < size) {
+        char* dst   = output_data_ + offset + done;
+        off_t pos   = (off_t)(offset + done);
+        ssize_t got = ::pread(fd_, dst, size - done, pos);
+        if (got < 0 && errno == EINVAL && direct_io_ && buffered_fd_ >= 0) {
+          got = ::pread(buffered_fd_, dst, size - done, pos);  // retry through the page cache
+        }
+        if (got < 0) {
+          if (errno == EINTR) { continue; }
+          mps_parser_fail(error_type_t::RuntimeError,
+                          "Failed to pread raw MPS file '%s': %s",
+                          path_.c_str(),
+                          std::strerror(errno));
+        }
+        // Zero bytes from either descriptor means EOF. The buffered fallback used to count a
+        // zero-byte read as progress, so a file truncated mid-read could spin here forever.
+        if (got == 0) {
+          mps_parser_fail(error_type_t::RuntimeError,
+                          "Unexpected EOF while reading raw MPS file '%s'",
+                          path_.c_str());
+        }
+        done += (std::size_t)got;
+      }
+    }
+
+    {
+      MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io);
+      section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size);
+      std::unique_lock frontier_lock(frontier_mutex_);  // released on every exit path
+      block_done_[index] = 1;
+      block_end_[index]  = offset + size;
+      std::size_t before = ready_bytes_;
+      while (next_block_ < block_done_.size() && block_done_[next_block_]) {
+        ready_bytes_ = block_end_[next_block_];
+        ++next_block_;
+      }
+      std::size_t after = ready_bytes_;
+      frontier_lock.unlock();  // publish outside the lock, as before
+      if (after > before) { section_scanner_->publish_ready(after); }
+    }
+  };
+
+  std::vector<std::thread> workers;
+  workers.reserve(thread_count);
+  for (std::size_t t = 0; t < thread_count; ++t) {
+    workers.emplace_back([&, t] {
+      std::string thread_name = "raw-input-read-" + std::to_string(t);
+      nvtx::name_current_thread(thread_name.c_str());
+      MPS_NVTX_RANGE("raw_worker_loop", nvtx::colors::io);
+      while (!stop.load(std::memory_order_acquire)) {
+        std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed);
+        if (index >= window_count_) { break; }
+        try {
+          read_window(index);
+        } catch (...) {
+          mark_error(std::current_exception());
+          return;
+        }
+      }
+    });
+  }
+  for (auto& worker : workers) {
+    worker.join();
+  }
+  if (first_error) { std::rethrow_exception(first_error); }
+
+  output_view_size_ = ready_bytes_;
+  section_scanner_->publish_ready(output_view_size_);
+}
+
+memory_input_stream_t::memory_input_stream_t(std::vector<char> buffer,
+                                             std::size_t input_size,
+                                             std::size_t compressed_size)
+  : buffer_(std::move(buffer)), input_size_(input_size), compressed_size_(compressed_size)
+{
+  section_scanner_ = std::make_unique<mps_section_block_scanner_t>(buffer_.data(), 1, registry_);
+}
+
+const char* memory_input_stream_t::data() const noexcept { return buffer_.data(); }
+char* memory_input_stream_t::mutable_data() noexcept { return buffer_.data(); }
+std::size_t memory_input_stream_t::size() const noexcept { return input_size_; }
+std::size_t memory_input_stream_t::compressed_size() const noexcept { return compressed_size_; }
+std::size_t memory_input_stream_t::reserve_size_hint() const noexcept { return input_size_; }
+mps_phase_registry_t& memory_input_stream_t::registry() noexcept { return registry_; }
+input_stream_view_t memory_input_stream_t::view() noexcept
+{
+  return {buffer_.data(), buffer_.data(), input_size_, compressed_size_, &registry_};
+}
+
+void memory_input_stream_t::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("memory_input_scan", nvtx::colors::io);
+  section_scanner_->observe_block(0, buffer_.data(), buffer_.data() + input_size_);
+  section_scanner_->publish_ready(input_size_);
+}
+
+bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); }
+bool has_gzip_extension(const std::string& path) noexcept { return path_has_suffix(path, ".gz"); }
+bool has_bzip2_extension(const std::string& path) noexcept { return path_has_suffix(path, ".bz2"); }
+
+void drop_file_cache(const std::string& path)
+{
+  MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io);
+  if (const int fd = ::open(path.c_str(), O_RDONLY); fd >= 0) {  // best effort: ignore failures
+    ::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);  // offset 0 / len 0 covers the whole file
+    ::close(fd);
+  }
+}
+
+FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method)
+{
+  // A compression suffix on the path always wins over the caller's requested method.
+  if (has_lz4_extension(path)) { return FileReadMethod::Lz4; }
+  if (has_gzip_extension(path)) { return FileReadMethod::Gzip; }
+  if (has_bzip2_extension(path)) { return FileReadMethod::Bzip2; }
+  if (method != FileReadMethod::Lz4) { return method; }
+  mps_parser_fail(
+    error_type_t::ValidationError, "lz4 read method requires a .lz4 input: %s", path.c_str());
+  return method;
+}
+
+const char* file_read_method_name(FileReadMethod method) noexcept
+{
+  // Maps each read method to a short lowercase label.
+  // Any value not listed below yields "unknown".
+  if (method == FileReadMethod::Read) { return "read"; }
+  if (method == FileReadMethod::Lz4) { return "lz4"; }
+  if (method == FileReadMethod::Gzip) { return "gzip"; }
+  if (method == FileReadMethod::Bzip2) { return "bzip2"; }
+  return "unknown";
+}
+
+}  // namespace mps_fast
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index b25e330999..9c47ba63c7 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -662,8 +662,16 @@ struct lz4_pipeline_t {
   void wait_range_ready(std::size_t begin, std::size_t size)
   {
     if (size == 0) return;
+    if (begin > input.compressed_size_ || size > input.compressed_size_ - begin) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading resident window");
+    }
     std::size_t first = begin / window_bytes;
     std::size_t last  = (begin + size - 1) / window_bytes;
+    if (last >= window_done.size()) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading resident window");
+    }
     for (std::size_t wi = first; wi <= last; ++wi) {
       MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io);
       std::unique_lock<std::mutex> lock(window_mutex);
diff --git a/cpp/src/io/file_to_string.cpp b/cpp/src/io/file_to_string.cpp
index 5823381098..30d9c41f9f 100644
--- a/cpp/src/io/file_to_string.cpp
+++ b/cpp/src/io/file_to_string.cpp
@@ -9,6 +9,8 @@
 
 #include <utilities/error.hpp>
 
+#include <algorithm>
+#include <cctype>
 #include <cstdio>
 #include <memory>
 #include <string>
@@ -368,22 +370,21 @@ namespace cuopt::linear_programming::io::detail {
 
 std::vector<char> file_to_string(const std::string& file)
 {
+  std::string lower(file);
+  std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
+    return (char)std::tolower(c);
+  });
+
 #ifdef MPS_PARSER_WITH_BZIP2
-  if (file.size() > 4 && file.substr(file.size() - 4, 4) == ".bz2") {
-    return bz2_file_to_string(file);
-  }
+  if (lower.ends_with(".bz2")) { return bz2_file_to_string(file); }
 #endif  // MPS_PARSER_WITH_BZIP2
 
 #ifdef MPS_PARSER_WITH_ZLIB
-  if (file.size() > 3 && file.substr(file.size() - 3, 3) == ".gz") {
-    return zlib_file_to_string(file);
-  }
+  if (lower.ends_with(".gz")) { return zlib_file_to_string(file); }
 #endif  // MPS_PARSER_WITH_ZLIB
 
 #ifdef MPS_PARSER_WITH_LZ4
-  if (file.size() > 4 && file.substr(file.size() - 4, 4) == ".lz4") {
-    return lz4_file_to_string(file);
-  }
+  if (lower.ends_with(".lz4")) { return lz4_file_to_string(file); }
 #endif  // MPS_PARSER_WITH_LZ4
 
   // Faster than using C++ I/O
diff --git a/cpp/tests/linear_programming/CMakeLists.txt b/cpp/tests/linear_programming/CMakeLists.txt
index fcceb4af56..6db30755c3 100644
--- a/cpp/tests/linear_programming/CMakeLists.txt
+++ b/cpp/tests/linear_programming/CMakeLists.txt
@@ -21,43 +21,15 @@ ConfigureTest(MPS_PARSER_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/parser_test.cpp
     LABELS numopt)
 
-function(ConfigureStandaloneMpsFastTest CMAKE_TEST_NAME TEST_SOURCE)
-    add_executable(${CMAKE_TEST_NAME} ${TEST_SOURCE})
-    target_include_directories(${CMAKE_TEST_NAME}
-        PRIVATE
-        "${CUOPT_TEST_DIR}/../src"
-        "${CUOPT_TEST_DIR}/../src/io"
-        "${CUOPT_TEST_DIR}/../src/io/experimental_mps_fast"
-    )
-    target_compile_features(${CMAKE_TEST_NAME} PRIVATE cxx_std_20)
-    target_compile_options(${CMAKE_TEST_NAME}
-        PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUOPT_CXX_FLAGS}>"
-    )
-    target_link_libraries(${CMAKE_TEST_NAME}
-        PRIVATE
-        cuopt
-        simde::simde
-        ${CUOPT_PRIVATE_CUDA_LIBS}
-    )
-    if(NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
-      target_link_options(${CMAKE_TEST_NAME} PRIVATE -Wl,--enable-new-dtags)
-    endif()
-
-    add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME})
-    set_tests_properties(${CMAKE_TEST_NAME} PROPERTIES LABELS "numopt")
-
-    install(
-        TARGETS ${CMAKE_TEST_NAME}
-        COMPONENT testing
-        DESTINATION bin/gtests/libcuopt
-        EXCLUDE_FROM_ALL
-    )
-endfunction()
-
-ConfigureStandaloneMpsFastTest(MPS_FAST_FP64_PARSER_TEST
-    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_fp64_parser_test.cpp)
-ConfigureStandaloneMpsFastTest(MPS_FAST_PARSER_EDGE_TEST
-    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_edge_test.cpp)
+# Both fast-parser suites are built into one gtest binary.
+ConfigureTest(MPS_FAST_PARSER_TEST
+    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_fp64_parser_test.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_edge_test.cpp
+    LABELS numopt)
+# The tests include private parser headers (e.g. fast_fp64_parser.hpp) from this directory.
+target_include_directories(MPS_FAST_PARSER_TEST
+    PRIVATE "${CUOPT_TEST_DIR}/../src/io/experimental_mps_fast")
+target_link_libraries(MPS_FAST_PARSER_TEST PRIVATE simde::simde)
 
 # ##################################################################################################
 # - C API Tests----------------------------------------------------------------------
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
index 36171267cf..f07d84ebde 100644
--- a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
@@ -3,19 +3,16 @@
 
 #include "fast_fp64_parser.hpp"
 
+#include <gtest/gtest.h>
+
 #include <algorithm>
 #include <bit>
 #include <cerrno>
 #include <clocale>
 #include <cstdint>
 #include <cstdlib>
-#include <exception>
-#include <functional>
-#include <iostream>
 #include <limits>
 #include <random>
-#include <sstream>
-#include <stdexcept>
 #include <string>
 #include <string_view>
 #include <vector>
@@ -24,22 +21,6 @@ namespace {
 
 uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }
 
-[[noreturn]] void fail(const std::string& message) { throw std::runtime_error(message); }
-
-void expect_true(bool condition, const std::string& message)
-{
-  if (!condition) { fail(message); }
-}
-
-void expect_eq_ptr(const char* got, const char* expected, std::string_view context)
-{
-  if (got != expected) {
-    std::ostringstream out;
-    out << context << ": pointer mismatch got_delta=" << (got - expected);
-    fail(out.str());
-  }
-}
-
 double reference_strtod(std::string_view token)
 {
   std::string normalized(token);
@@ -49,7 +30,7 @@ double reference_strtod(std::string_view token)
   char* end    = nullptr;
   errno        = 0;
   double value = std::strtod(normalized.c_str(), &end);
-  expect_eq_ptr(end, normalized.c_str() + normalized.size(), token);
+  EXPECT_EQ(end, normalized.c_str() + normalized.size()) << "strtod stopped early: " << token;
   return value;
 }
 
@@ -65,22 +46,17 @@ double parse_padded_token(std::string_view token)
   padded.append(40, ' ');
   const char* p = padded.data();
   double value  = mps_fast::fp64::parse_fp64_advance(p, padded.data() + padded.size());
-  expect_eq_ptr(p, padded.data() + token.size(), token);
+  EXPECT_EQ(p, padded.data() + token.size()) << "cursor not at token end for '" << token << "'";
   return value;
 }
 
-void expect_bitwise_strtod(std::string_view token)
+void check_bitwise_strtod(std::string_view token)
 {
-  double ref           = reference_strtod(token);
-  uint64_t token_bits  = bits(parse_token(token));
-  uint64_t padded_bits = bits(parse_padded_token(token));
-  uint64_t ref_bits    = bits(ref);
-  if (token_bits != ref_bits || padded_bits != ref_bits) {
-    std::ostringstream out;
-    out << "bitwise mismatch for '" << token << "' ref=0x" << std::hex << ref_bits << " token=0x"
-        << token_bits << " padded=0x" << padded_bits;
-    fail(out.str());
-  }
+  const double ref        = reference_strtod(token);
+  const uint64_t ref_bits = bits(ref);
+  EXPECT_EQ(ref_bits, bits(parse_token(token))) << "token parse mismatch for '" << token << "'";
+  EXPECT_EQ(ref_bits, bits(parse_padded_token(token)))
+    << "padded parse mismatch for '" << token << "'";
 }
 
 std::string random_token(std::mt19937_64& rng)
@@ -133,7 +109,9 @@ std::string random_token(std::mt19937_64& rng)
   return token;
 }
 
-void common_table_matches_strtod_bitwise()
+}  // namespace
+
+TEST(FastFp64ParserTest, CommonTableMatchesStrtodBitwise)
 {
   std::setlocale(LC_NUMERIC, "C");
   const std::vector<std::string_view> cases = {
@@ -168,64 +146,29 @@ void common_table_matches_strtod_bitwise()
   };
 
   for (std::string_view token : cases) {
-    expect_bitwise_strtod(token);
+    check_bitwise_strtod(token);
   }
 }
 
-void cursor_advances_to_token_end()
+TEST(FastFp64ParserTest, CursorAdvancesToTokenEnd)
 {
   std::setlocale(LC_NUMERIC, "C");
   std::string text = "123.45  ABC";
   const char* p    = text.data();
   double value     = mps_fast::fp64::parse_fp64_advance(p, text.data() + text.size());
 
-  expect_true(bits(value) == bits(reference_strtod("123.45")), "parsed value mismatch");
-  expect_eq_ptr(p, text.data() + 6, "cursor_advances_to_token_end");
-  expect_true(std::string_view(p, 5) == "  ABC", "cursor did not stop before trailing field");
+  EXPECT_EQ(bits(reference_strtod("123.45")), bits(value));
+  EXPECT_EQ(text.data() + 6, p);
+  EXPECT_EQ(std::string_view("  ABC"), std::string_view(p, 5));
 }
 
-void fixed_seed_random_differential()
+TEST(FastFp64ParserTest, FixedSeedRandomDifferential)
 {
   std::setlocale(LC_NUMERIC, "C");
   std::mt19937_64 rng(0x4d50535f46415354ULL);
   for (int i = 0; i < 100000; ++i) {
     std::string token = random_token(rng);
-    expect_true(token.size() <= 25U, "generated token exceeds MPS numeric token length");
-    expect_bitwise_strtod(token);
-  }
-}
-
-}  // namespace
-
-int main()
-{
-  struct TestCase {
-    const char* name;
-    void (*fn)();
-  };
-
-  const TestCase tests[] = {
-    {"CommonTableMatchesStrtodBitwise", common_table_matches_strtod_bitwise},
-    {"CursorAdvancesToTokenEnd", cursor_advances_to_token_end},
-    {"FixedSeedRandomDifferential", fixed_seed_random_differential},
-  };
-
-  int failed = 0;
-  for (const TestCase& test : tests) {
-    std::cout << "[ RUN      ] " << test.name << '\n';
-    try {
-      test.fn();
-      std::cout << "[       OK ] " << test.name << '\n';
-    } catch (const std::exception& e) {
-      ++failed;
-      std::cerr << "[  FAILED  ] " << test.name << ": " << e.what() << '\n';
-    }
-  }
-
-  if (failed != 0) {
-    std::cerr << failed << " test(s) failed\n";
-    return 1;
+    ASSERT_LE(token.size(), 25U);
+    check_bitwise_strtod(token);
   }
-  std::cout << "[  PASSED  ] " << std::size(tests) << " test(s)\n";
-  return 0;
 }
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
index ad6fab51fc..aa05736616 100644
--- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
@@ -6,6 +6,8 @@
 
 #include <cuopt/linear_programming/io/parser.hpp>
 
+#include <gtest/gtest.h>
+
 #include <algorithm>
 #include <bit>
 #include <cerrno>
@@ -13,10 +15,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-#include <exception>
-#include <functional>
 #include <iomanip>
-#include <iostream>
 #include <limits>
 #include <sstream>
 #include <stdexcept>
@@ -28,50 +27,6 @@
 
 namespace {
 
-struct skip_test : std::runtime_error {
-  using std::runtime_error::runtime_error;
-};
-
-[[noreturn]] void fail(const std::string& message) { throw std::runtime_error(message); }
-
-void expect_true(bool condition, const std::string& message)
-{
-  if (!condition) { fail(message); }
-}
-
-template <typename A, typename B>
-void expect_eq(const A& got, const B& expected, std::string_view context)
-{
-  if (!(got == expected)) {
-    std::ostringstream out;
-    out << context << ": got=" << got << " expected=" << expected;
-    fail(out.str());
-  }
-}
-
-template <typename VecA, typename VecB>
-void expect_vector_eq(const VecA& got, const VecB& expected, std::string_view context)
-{
-  if (got.size() != expected.size()) {
-    std::ostringstream out;
-    out << context << ": size got=" << got.size() << " expected=" << expected.size();
-    fail(out.str());
-  }
-  for (size_t i = 0; i < got.size(); ++i) {
-    if (!(got[i] == expected[i])) {
-      std::ostringstream out;
-      out << context << ": first mismatch at " << i;
-      fail(out.str());
-    }
-  }
-}
-
-void expect_near_inf(double value, int sign, std::string_view context)
-{
-  expect_true(std::isinf(value), std::string(context) + ": expected infinity");
-  expect_true(std::signbit(value) == (sign < 0), std::string(context) + ": wrong infinity sign");
-}
-
 struct TempMpsFile {
   explicit TempMpsFile(std::string contents)
   {
@@ -81,20 +36,20 @@ struct TempMpsFile {
                   "/tmp/mps_fast_parser_edge_%ld_XXXXXX.mps",
                   static_cast<long>(getpid()));
     int fd = mkstemps(path_template, 4);
-    if (fd < 0) { fail(std::string("mkstemps failed: ") + std::strerror(errno)); }
+    if (fd < 0) { throw std::runtime_error(std::string("mkstemps: ") + std::strerror(errno)); }
     path       = path_template;
     FILE* file = fdopen(fd, "wb");
     if (file == nullptr) {
       close(fd);
-      fail(std::string("fdopen failed: ") + std::strerror(errno));
+      throw std::runtime_error(std::string("fdopen failed: ") + std::strerror(errno));
     }
     if (!contents.empty() &&
         std::fwrite(contents.data(), 1, contents.size(), file) != contents.size()) {
       std::fclose(file);
-      fail(std::string("failed to write temporary MPS file: ") + std::strerror(errno));
+      throw std::runtime_error(std::string("temp MPS file write failed: ") + std::strerror(errno));
     }
     if (std::fclose(file) != 0) {
-      fail(std::string("failed to close temporary MPS file: ") + std::strerror(errno));
+      throw std::runtime_error(std::string("temp MPS file close failed: ") + std::strerror(errno));
     }
   }
 
@@ -122,164 +77,50 @@ struct TempOwnedPath {
   std::string path;
 };
 
-template <typename Fn>
-void expect_throws(Fn&& fn, std::string_view context)
-{
-  try {
-    fn();
-  } catch (const std::exception&) {
-    return;
-  }
-  fail(std::string(context) + ": expected exception");
-}
-
-void expect_fast_parse_error(std::string_view fixture_name, std::string contents)
-{
-  TempMpsFile file(std::move(contents));
-  expect_throws(
-    [&] {
-      (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-    },
-    fixture_name);
-}
-
 std::string_view range_text(const mps_fast::mps_phase_range_t& range)
 {
   if (!range.present) { return {}; }
   return std::string_view(range.begin, static_cast<size_t>(range.end - range.begin));
 }
 
-void scanner_finds_section_split_across_blocks()
-{
-  const std::string mps =
-    "NAME EDGE\n"
-    "ROWS\n"
-    " N OBJ\n"
-    " L rowA\n"
-    "COLUMNS\n"
-    " x1 OBJ 1\n"
-    " x1 rowA 2\n"
-    "RHS\n"
-    " rhs rowA 3\n"
-    "ENDATA\n";
-
-  const size_t columns_pos = mps.find("COLUMNS");
-  expect_true(columns_pos != std::string::npos, "failed to place COLUMNS split");
-  const size_t split = columns_pos + 3;
-
-  mps_fast::mps_phase_registry_t registry;
-  mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry);
-
-  scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());
-  scanner.publish_ready(0);
-  scanner.observe_block(0, mps.data(), mps.data() + split);
-  scanner.publish_ready(mps.size());
-
-  expect_true(registry.ready(mps_fast::mps_phase_kind::header), "header not ready");
-  expect_true(registry.ready(mps_fast::mps_phase_kind::rows), "rows not ready");
-  expect_true(registry.ready(mps_fast::mps_phase_kind::columns), "columns not ready");
-  expect_true(registry.ready(mps_fast::mps_phase_kind::rhs), "rhs not ready");
-  expect_true(registry.ready(mps_fast::mps_phase_kind::quadratic), "quadratic sentinel not ready");
-
-  expect_true(range_text(registry.range(mps_fast::mps_phase_kind::columns)).starts_with("COLUMNS"),
-              "columns range begins at wrong boundary");
-  expect_true(range_text(registry.range(mps_fast::mps_phase_kind::rhs)).starts_with("RHS"),
-              "rhs range begins at wrong boundary");
-}
-
-void scanner_rejects_unknown_column_one_records_after_rows()
-{
-  const std::string mps =
-    "NAME BAD\n"
-    "ROWS\n"
-    " N OBJ\n"
-    "FOO\n"
-    "COLUMNS\n"
-    " x OBJ 1\n"
-    "ENDATA\n";
-
-  expect_throws(
-    [&] {
-      mps_fast::mps_phase_registry_t registry;
-      mps_fast::mps_section_block_scanner_t scanner(mps.data(), 1, registry);
-      scanner.observe_block(0, mps.data(), mps.data() + mps.size());
-      scanner.publish_ready(mps.size());
-    },
-    "unknown column-1 record after ROWS");
-}
-
 uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }
 
-void expect_double_bitwise_eq(double got, double expected, std::string_view context)
-{
-  if (bits(got) != bits(expected)) {
-    std::ostringstream out;
-    out << context << ": got=0x" << std::hex << bits(got) << " expected=0x" << bits(expected);
-    fail(out.str());
-  }
-}
-
-template <typename VecA, typename VecB>
-void expect_double_vector_bitwise_eq(const VecA& got,
-                                     const VecB& expected,
-                                     std::string_view context)
-{
-  if (got.size() != expected.size()) {
-    std::ostringstream out;
-    out << context << ": size got=" << got.size() << " expected=" << expected.size();
-    fail(out.str());
-  }
-  for (size_t i = 0; i < got.size(); ++i) {
-    if (bits(got[i]) != bits(expected[i])) {
-      std::ostringstream out;
-      out << context << ": first bitwise mismatch at " << i << " got=0x" << std::hex << bits(got[i])
-          << " expected=0x" << bits(expected[i]);
-      fail(out.str());
-    }
-  }
-}
-
-void expect_models_match_reference_bitwise(
+void check_models_match_reference_bitwise(
   const mps_fast::parser_model_t<int, double>& fast,
   const cuopt::linear_programming::io::mps_data_model_t<int, double>& reference,
   std::string_view context)
 {
-  expect_eq(fast.n_vars_, reference.n_vars_, std::string(context) + " n_vars");
-  expect_eq(fast.n_constraints_, reference.n_constraints_, std::string(context) + " n_constraints");
-  expect_eq(fast.nnz_, reference.nnz_, std::string(context) + " nnz");
-  expect_eq(fast.maximize_, reference.maximize_, std::string(context) + " maximize");
-  expect_eq(fast.problem_name_, reference.problem_name_, std::string(context) + " problem_name");
-  expect_eq(
-    fast.objective_name_, reference.objective_name_, std::string(context) + " objective_name");
-
-  expect_double_bitwise_eq(fast.objective_scaling_factor_,
-                           reference.objective_scaling_factor_,
-                           std::string(context) + " objective_scaling_factor");
-  expect_double_bitwise_eq(fast.objective_offset_,
-                           reference.objective_offset_,
-                           std::string(context) + " objective_offset");
-
-  expect_double_vector_bitwise_eq(fast.A_, reference.A_, std::string(context) + " A");
-  expect_vector_eq(fast.A_indices_, reference.A_indices_, std::string(context) + " A_indices");
-  expect_vector_eq(fast.A_offsets_, reference.A_offsets_, std::string(context) + " A_offsets");
-  expect_double_vector_bitwise_eq(fast.b_, reference.b_, std::string(context) + " b");
-  expect_double_vector_bitwise_eq(fast.c_, reference.c_, std::string(context) + " c");
-  expect_double_vector_bitwise_eq(fast.variable_lower_bounds_,
-                                  reference.variable_lower_bounds_,
-                                  std::string(context) + " variable_lower_bounds");
-  expect_double_vector_bitwise_eq(fast.variable_upper_bounds_,
-                                  reference.variable_upper_bounds_,
-                                  std::string(context) + " variable_upper_bounds");
-  expect_double_vector_bitwise_eq(fast.constraint_lower_bounds_,
-                                  reference.constraint_lower_bounds_,
-                                  std::string(context) + " constraint_lower_bounds");
-  expect_double_vector_bitwise_eq(fast.constraint_upper_bounds_,
-                                  reference.constraint_upper_bounds_,
-                                  std::string(context) + " constraint_upper_bounds");
-  expect_vector_eq(fast.var_types_, reference.var_types_, std::string(context) + " var_types");
-  expect_vector_eq(fast.row_types_, reference.row_types_, std::string(context) + " row_types");
-  expect_vector_eq(fast.var_names_, reference.var_names_, std::string(context) + " var_names");
-  expect_vector_eq(fast.row_names_, reference.row_names_, std::string(context) + " row_names");
+  EXPECT_EQ(reference.n_vars_, fast.n_vars_) << std::string(context) + " n_vars";
+  EXPECT_EQ(reference.n_constraints_, fast.n_constraints_)
+    << std::string(context) + " n_constraints";
+  EXPECT_EQ(reference.nnz_, fast.nnz_) << std::string(context) + " nnz";
+  EXPECT_EQ(reference.maximize_, fast.maximize_) << std::string(context) + " maximize";
+  EXPECT_EQ(reference.problem_name_, fast.problem_name_) << std::string(context) + " problem_name";
+  EXPECT_EQ(reference.objective_name_, fast.objective_name_)
+    << std::string(context) + " objective_name";
+
+  EXPECT_EQ(bits(reference.objective_scaling_factor_), bits(fast.objective_scaling_factor_))
+    << std::string(context) + " objective_scaling_factor";
+  EXPECT_EQ(bits(reference.objective_offset_), bits(fast.objective_offset_))
+    << std::string(context) + " objective_offset";
+
+  EXPECT_EQ(reference.A_, fast.A_) << std::string(context) + " A";
+  EXPECT_EQ(reference.A_indices_, fast.A_indices_) << std::string(context) + " A_indices";
+  EXPECT_EQ(reference.A_offsets_, fast.A_offsets_) << std::string(context) + " A_offsets";
+  EXPECT_EQ(reference.b_, fast.b_) << std::string(context) + " b";
+  EXPECT_EQ(reference.c_, fast.c_) << std::string(context) + " c";
+  EXPECT_EQ(reference.variable_lower_bounds_, fast.variable_lower_bounds_)
+    << std::string(context) + " variable_lower_bounds";
+  EXPECT_EQ(reference.variable_upper_bounds_, fast.variable_upper_bounds_)
+    << std::string(context) + " variable_upper_bounds";
+  EXPECT_EQ(reference.constraint_lower_bounds_, fast.constraint_lower_bounds_)
+    << std::string(context) + " constraint_lower_bounds";
+  EXPECT_EQ(reference.constraint_upper_bounds_, fast.constraint_upper_bounds_)
+    << std::string(context) + " constraint_upper_bounds";
+  EXPECT_EQ(reference.var_types_, fast.var_types_) << std::string(context) + " var_types";
+  EXPECT_EQ(reference.row_types_, fast.row_types_) << std::string(context) + " row_types";
+  EXPECT_EQ(reference.var_names_, fast.var_names_) << std::string(context) + " var_names";
+  EXPECT_EQ(reference.row_names_, fast.row_names_) << std::string(context) + " row_names";
 }
 
 void verify_fixture_bitwise(std::string_view fixture_name, std::string contents)
@@ -287,7 +128,7 @@ void verify_fixture_bitwise(std::string_view fixture_name, std::string contents)
   TempMpsFile file(std::move(contents));
   auto fast = mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
   auto reference = cuopt::linear_programming::io::read_mps<int, double>(file.path, false);
-  expect_models_match_reference_bitwise(fast, reference, fixture_name);
+  check_models_match_reference_bitwise(fast, reference, fixture_name);
 }
 
 std::string row_name(size_t i)
@@ -297,27 +138,27 @@ std::string row_name(size_t i)
   return out.str();
 }
 
-size_t find_var(const mps_fast::parser_model_t<int, double>& model, std::string_view name)
+int find_var_index(const mps_fast::parser_model_t<int, double>& model, std::string_view name)
 {
   for (size_t i = 0; i < model.var_names_.size(); ++i) {
-    if (model.var_names_[i] == name) { return i; }
+    if (model.var_names_[i] == name) { return static_cast<int>(i); }
   }
-  fail("variable not found: " + std::string(name));
+  return -1;
 }
 
-void expect_model_shapes(const mps_fast::parser_model_t<int, double>& model,
-                         int rows,
-                         int vars,
-                         int nnz,
-                         std::string_view context)
+void check_model_shapes(const mps_fast::parser_model_t<int, double>& model,
+                        int rows,
+                        int vars,
+                        int nnz,
+                        std::string_view context)
 {
-  expect_eq(model.n_constraints_, rows, std::string(context) + " rows");
-  expect_eq(model.n_vars_, vars, std::string(context) + " vars");
-  expect_eq(model.nnz_, nnz, std::string(context) + " nnz");
-  expect_eq(
-    model.A_offsets_.size(), static_cast<size_t>(rows + 1), std::string(context) + " offsets");
-  expect_eq(model.A_.size(), static_cast<size_t>(nnz), std::string(context) + " values");
-  expect_eq(model.A_indices_.size(), static_cast<size_t>(nnz), std::string(context) + " indices");
+  EXPECT_EQ(rows, model.n_constraints_) << std::string(context) + " rows";
+  EXPECT_EQ(vars, model.n_vars_) << std::string(context) + " vars";
+  EXPECT_EQ(nnz, model.nnz_) << std::string(context) + " nnz";
+  EXPECT_EQ(static_cast<size_t>(rows + 1), model.A_offsets_.size())
+    << std::string(context) + " offsets";
+  EXPECT_EQ(static_cast<size_t>(nnz), model.A_.size()) << std::string(context) + " values";
+  EXPECT_EQ(static_cast<size_t>(nnz), model.A_indices_.size()) << std::string(context) + " indices";
 }
 
 std::string section_split_fixture()
@@ -335,14 +176,69 @@ std::string section_split_fixture()
          "ENDATA\n";
 }
 
-void scanner_finds_headers_split_at_every_byte()
+// Returns a copy of `text` in which every LF has been expanded to CRLF.
+std::string to_crlf(std::string text)
+{
+  std::string out;
+  // Reserve 1/8 extra capacity up front as headroom for the inserted '\r' bytes.
+  out.reserve(text.size() + text.size() / 8);
+  for (const char ch : text) {
+    // Emit the carriage return first, then fall through to copy the '\n' itself.
+    if (ch == '\n') { out.push_back('\r'); }
+    out.push_back(ch);
+  }
+  return out;
+}
+
+}  // namespace
+
+TEST(FastMpsParserEdgeTest, ScannerFindsSectionSplitAcrossBlocks)
+{
+  const std::string mps =
+    "NAME EDGE\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " x1 OBJ 1\n"
+    " x1 rowA 2\n"
+    "RHS\n"
+    " rhs rowA 3\n"
+    "ENDATA\n";
+
+  const size_t columns_pos = mps.find("COLUMNS");
+  ASSERT_TRUE(columns_pos != std::string::npos) << "failed to place COLUMNS split";
+  const size_t split = columns_pos + 3;  // cut inside "COLUMNS"; needs the fatal check above
+
+  mps_fast::mps_phase_registry_t registry;
+  mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+  // Block 1 (the tail) is observed before block 0, i.e. out of file order.
+  scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());
+  scanner.publish_ready(0);
+  scanner.observe_block(0, mps.data(), mps.data() + split);
+  scanner.publish_ready(mps.size());
+
+  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::header)) << "header not ready";
+  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rows)) << "rows not ready";
+  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::columns)) << "columns not ready";
+  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rhs)) << "rhs not ready";
+  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::quadratic))
+    << "quadratic sentinel not ready";
+
+  EXPECT_TRUE(range_text(registry.range(mps_fast::mps_phase_kind::columns)).starts_with("COLUMNS"))
+    << "columns range begins at wrong boundary";
+  EXPECT_TRUE(range_text(registry.range(mps_fast::mps_phase_kind::rhs)).starts_with("RHS"))
+    << "rhs range begins at wrong boundary";
+}
+
+TEST(FastMpsParserEdgeTest, ScannerFindsHeadersSplitAtEveryByte)
 {
   const std::string mps                       = section_split_fixture();
   const std::vector<std::string_view> headers = {"ROWS", "COLUMNS", "RHS", "BOUNDS", "ENDATA"};
 
   for (std::string_view header : headers) {
     const size_t pos = mps.find(header);
-    expect_true(pos != std::string::npos, "missing header in split fixture");
+    ASSERT_TRUE(pos != std::string::npos) << "missing header in split fixture";
     for (size_t offset = 1; offset < header.size(); ++offset) {
       const size_t split = pos + offset;
       mps_fast::mps_phase_registry_t registry;
@@ -352,18 +248,40 @@ void scanner_finds_headers_split_at_every_byte()
       scanner.observe_block(0, mps.data(), mps.data() + split);
       scanner.publish_ready(mps.size());
 
-      expect_true(registry.ready(mps_fast::mps_phase_kind::rows), "rows not ready after split");
-      expect_true(registry.ready(mps_fast::mps_phase_kind::columns),
-                  "columns not ready after split");
-      expect_true(registry.ready(mps_fast::mps_phase_kind::rhs), "rhs not ready after split");
-      expect_true(registry.ready(mps_fast::mps_phase_kind::bounds), "bounds not ready after split");
-      expect_true(registry.ready(mps_fast::mps_phase_kind::quadratic),
-                  "quadratic sentinel not ready after split");
+      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rows)) << "rows not ready after split";
+      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::columns))
+        << "columns not ready after split";
+      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rhs)) << "rhs not ready after split";
+      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::bounds))
+        << "bounds not ready after split";
+      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::quadratic))
+        << "quadratic sentinel not ready after split";
     }
   }
 }
 
-void bounds_defaults_and_types_match_reference()
+TEST(FastMpsParserEdgeTest, ScannerRejectsUnknownColumnOneRecordsAfterRows)
+{
+  const std::string mps =
+    "NAME BAD\n"
+    "ROWS\n"
+    " N OBJ\n"
+    "FOO\n"
+    "COLUMNS\n"
+    " x OBJ 1\n"
+    "ENDATA\n";
+  // "FOO" begins in column one inside ROWS; scanning it is expected to throw.
+  EXPECT_THROW(
+    {
+      mps_fast::mps_phase_registry_t registry;
+      mps_fast::mps_section_block_scanner_t scanner(mps.data(), 1, registry);
+      scanner.observe_block(0, mps.data(), mps.data() + mps.size());
+      scanner.publish_ready(mps.size());
+    },
+    std::logic_error) << "unknown column-1 record after ROWS";
+}
+
+TEST(FastMpsParserEdgeTest, BoundsDefaultsAndTypesMatchReference)
 {
   verify_fixture_bitwise("bounds_defaults_and_types",
                          "NAME BOUNDS_EDGE\n"
@@ -390,7 +308,7 @@ void bounds_defaults_and_types_match_reference()
                          "ENDATA\n");
 }
 
-void duplicate_bounds_last_statement_wins()
+TEST(FastMpsParserEdgeTest, DuplicateBoundsLastStatementWins)
 {
   const std::string contents =
     "NAME BOUNDS_DUP\n"
@@ -412,12 +330,12 @@ void duplicate_bounds_last_statement_wins()
   TempMpsFile file(contents);
   auto model =
     mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-  expect_eq(model.n_vars_, 1, "n_vars");
-  expect_eq(model.variable_lower_bounds_.at(0), 2.0, "duplicate lower bound");
-  expect_eq(model.variable_upper_bounds_.at(0), 3.0, "duplicate upper bound");
+  EXPECT_EQ(1, model.n_vars_) << "n_vars";
+  EXPECT_EQ(2.0, model.variable_lower_bounds_.at(0)) << "duplicate lower bound";
+  EXPECT_EQ(3.0, model.variable_upper_bounds_.at(0)) << "duplicate upper bound";
 }
 
-void nondense_row_and_column_names_use_hash_path()
+TEST(FastMpsParserEdgeTest, NondenseRowAndColumnNamesUseHashPath)
 {
   verify_fixture_bitwise("nondense_row_and_column_names",
                          "NAME HASH_NAMES\n"
@@ -440,7 +358,7 @@ void nondense_row_and_column_names_use_hash_path()
                          "ENDATA\n");
 }
 
-void missing_optional_bounds_fast_path()
+TEST(FastMpsParserEdgeTest, MissingOptionalBoundsFastPath)
 {
   TempMpsFile file(
     "NAME OPTIONALS\n"
@@ -455,13 +373,13 @@ void missing_optional_bounds_fast_path()
 
   auto model =
     mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-  expect_eq(model.n_vars_, 1, "missing optional n_vars");
-  expect_eq(model.n_constraints_, 1, "missing optional n_constraints");
-  expect_eq(model.variable_lower_bounds_.at(0), 0.0, "missing BOUNDS lower default");
-  expect_near_inf(model.variable_upper_bounds_.at(0), 1, "missing BOUNDS upper default");
+  EXPECT_EQ(1, model.n_vars_) << "missing optional n_vars";
+  EXPECT_EQ(1, model.n_constraints_) << "missing optional n_constraints";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(0)) << "missing BOUNDS lower default";
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), model.variable_upper_bounds_.at(0));
 }
 
-void bounds_only_variables_are_appended_deterministically()
+TEST(FastMpsParserEdgeTest, BoundsOnlyVariablesAreAppendedDeterministically)
 {
   TempMpsFile file(
     "NAME BOUNDS_ONLY\n"
@@ -481,25 +399,28 @@ void bounds_only_variables_are_appended_deterministically()
 
   auto model =
     mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-  expect_model_shapes(model, 1, 4, 1, "bounds-only");
-  expect_eq(model.var_names_.at(0), std::string("XMAIN"), "main var name");
-  expect_eq(model.var_names_.at(1), std::string("AUX_A"), "bounds-only sorted name 1");
-  expect_eq(model.var_names_.at(2), std::string("AUX_S"), "bounds-only sorted name 2");
-  expect_eq(model.var_names_.at(3), std::string("AUX_Z"), "bounds-only sorted name 3");
-
-  size_t aux_a = find_var(model, "AUX_A");
-  size_t aux_s = find_var(model, "AUX_S");
-  size_t aux_z = find_var(model, "AUX_Z");
-  expect_eq(model.var_types_.at(aux_a), 'I', "bounds-only BV type");
-  expect_eq(model.variable_lower_bounds_.at(aux_a), 0.0, "bounds-only BV lb");
-  expect_eq(model.variable_upper_bounds_.at(aux_a), 1.0, "bounds-only BV ub");
-  expect_eq(model.var_types_.at(aux_s), 'S', "bounds-only SC type");
-  expect_eq(model.variable_upper_bounds_.at(aux_s), 5.0, "bounds-only SC ub");
-  expect_eq(model.variable_lower_bounds_.at(aux_z), -3.0, "bounds-only duplicate lb");
-  expect_eq(model.variable_upper_bounds_.at(aux_z), 9.0, "bounds-only duplicate ub");
-}
-
-void integer_markers_assign_types_and_default_bounds()
+  check_model_shapes(model, 1, 4, 1, "bounds-only");
+  EXPECT_EQ(std::string("XMAIN"), model.var_names_.at(0)) << "main var name";
+  EXPECT_EQ(std::string("AUX_A"), model.var_names_.at(1)) << "bounds-only sorted name 1";
+  EXPECT_EQ(std::string("AUX_S"), model.var_names_.at(2)) << "bounds-only sorted name 2";
+  EXPECT_EQ(std::string("AUX_Z"), model.var_names_.at(3)) << "bounds-only sorted name 3";
+
+  const int aux_a = find_var_index(model, "AUX_A");
+  const int aux_s = find_var_index(model, "AUX_S");
+  const int aux_z = find_var_index(model, "AUX_Z");
+  ASSERT_GE(aux_a, 0);
+  ASSERT_GE(aux_s, 0);
+  ASSERT_GE(aux_z, 0);
+  EXPECT_EQ('I', model.var_types_.at(aux_a)) << "bounds-only BV type";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(aux_a)) << "bounds-only BV lb";
+  EXPECT_EQ(1.0, model.variable_upper_bounds_.at(aux_a)) << "bounds-only BV ub";
+  EXPECT_EQ('S', model.var_types_.at(aux_s)) << "bounds-only SC type";
+  EXPECT_EQ(5.0, model.variable_upper_bounds_.at(aux_s)) << "bounds-only SC ub";
+  EXPECT_EQ(-3.0, model.variable_lower_bounds_.at(aux_z)) << "bounds-only duplicate lb";
+  EXPECT_EQ(9.0, model.variable_upper_bounds_.at(aux_z)) << "bounds-only duplicate ub";
+}
+
+TEST(FastMpsParserEdgeTest, IntegerMarkersAssignTypesAndDefaultBounds)
 {
   TempMpsFile file(
     "NAME MARKERS\n"
@@ -520,20 +441,23 @@ void integer_markers_assign_types_and_default_bounds()
 
   auto model =
     mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-  expect_model_shapes(model, 1, 3, 3, "integer markers");
-  size_t xint  = find_var(model, "XINT");
-  size_t xcont = find_var(model, "XCONT");
-  size_t xbin  = find_var(model, "XBIN");
-  expect_eq(model.var_types_.at(xint), 'I', "XINT type");
-  expect_eq(model.var_types_.at(xcont), 'C', "XCONT type");
-  expect_eq(model.var_types_.at(xbin), 'I', "XBIN type");
-  expect_eq(model.variable_lower_bounds_.at(xint), 0.0, "XINT default lb");
-  expect_eq(model.variable_upper_bounds_.at(xint), 1.0, "XINT default ub");
-  expect_eq(model.variable_lower_bounds_.at(xbin), 0.0, "XBIN default lb");
-  expect_eq(model.variable_upper_bounds_.at(xbin), 1.0, "XBIN default ub");
-}
-
-void numeric_parsing_integration_matches_reference_bitwise()
+  check_model_shapes(model, 1, 3, 3, "integer markers");
+  const int xint  = find_var_index(model, "XINT");
+  const int xcont = find_var_index(model, "XCONT");
+  const int xbin  = find_var_index(model, "XBIN");
+  ASSERT_GE(xint, 0);
+  ASSERT_GE(xcont, 0);
+  ASSERT_GE(xbin, 0);
+  EXPECT_EQ('I', model.var_types_.at(xint)) << "XINT type";
+  EXPECT_EQ('C', model.var_types_.at(xcont)) << "XCONT type";
+  EXPECT_EQ('I', model.var_types_.at(xbin)) << "XBIN type";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xint)) << "XINT default lb";
+  EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xint)) << "XINT default ub";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xbin)) << "XBIN default lb";
+  EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xbin)) << "XBIN default ub";
+}
+
+TEST(FastMpsParserEdgeTest, NumericParsingIntegrationMatchesReferenceBitwise)
 {
   verify_fixture_bitwise("numeric_parsing_integration",
                          "NAME NUMBERS\n"
@@ -559,21 +483,7 @@ void numeric_parsing_integration_matches_reference_bitwise()
                          "ENDATA\n");
 }
 
-std::string to_crlf(std::string text)
-{
-  std::string converted;
-  converted.reserve(text.size() + text.size() / 8);
-  for (char c : text) {
-    if (c == '\n') {
-      converted += "\r\n";
-    } else {
-      converted.push_back(c);
-    }
-  }
-  return converted;
-}
-
-void crlf_line_endings_match_reference_bitwise()
+TEST(FastMpsParserEdgeTest, CrlfLineEndingsMatchReferenceBitwise)
 {
   verify_fixture_bitwise("crlf_line_endings",
                          to_crlf("NAME CRLF_EDGE\n"
@@ -591,7 +501,7 @@ void crlf_line_endings_match_reference_bitwise()
                                  "ENDATA\n"));
 }
 
-void comment_placement_supported_cases_match_reference_bitwise()
+TEST(FastMpsParserEdgeTest, CommentPlacementSupportedCasesMatchReferenceBitwise)
 {
   verify_fixture_bitwise("comment_placement_supported_cases",
                          "* leading star comment\n"
@@ -618,7 +528,7 @@ void comment_placement_supported_cases_match_reference_bitwise()
                          "ENDATA\n");
 }
 
-void objective_metadata_selects_named_objective()
+TEST(FastMpsParserEdgeTest, ObjectiveMetadataSelectsNamedObjective)
 {
   TempMpsFile file(
     "NAME OBJMETA\n"
@@ -640,81 +550,118 @@ void objective_metadata_selects_named_objective()
 
   auto model =
     mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-  expect_true(model.maximize_, "OBJSENSE MAX not applied");
-  expect_eq(model.problem_name_, std::string("OBJMETA"), "problem name");
-  expect_eq(model.objective_name_, std::string("COST"), "objective name");
-  expect_eq(model.objective_offset_, -7.0, "objective RHS offset");
-  size_t x1 = find_var(model, "X1");
-  size_t x2 = find_var(model, "X2");
-  expect_eq(model.c_.at(x1), 5.0, "named objective coefficient X1");
-  expect_eq(model.c_.at(x2), -2.0, "named objective coefficient X2");
+  EXPECT_TRUE(model.maximize_) << "OBJSENSE MAX not applied";
+  EXPECT_EQ(std::string("OBJMETA"), model.problem_name_) << "problem name";
+  EXPECT_EQ(std::string("COST"), model.objective_name_) << "objective name";
+  EXPECT_EQ(-7.0, model.objective_offset_) << "objective RHS offset";
+  const int x1 = find_var_index(model, "X1");
+  const int x2 = find_var_index(model, "X2");
+  ASSERT_GE(x1, 0);
+  ASSERT_GE(x2, 0);
+  EXPECT_EQ(5.0, model.c_.at(x1)) << "named objective coefficient X1";
+  EXPECT_EQ(-2.0, model.c_.at(x2)) << "named objective coefficient X2";
 }
 
-void malformed_inputs_report_errors()
+TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors)
 {
-  expect_fast_parse_error("bad objsense",
-                          "NAME BADOBJ\n"
-                          "OBJSENSE\n"
-                          " SIDEWAYS\n"
-                          "ROWS\n"
-                          " N OBJ\n"
-                          " L R1\n"
-                          "COLUMNS\n"
-                          " X1 OBJ 1 R1 2\n"
-                          "RHS\n"
-                          " RHS1 R1 0\n"
-                          "ENDATA\n");
-
-  expect_fast_parse_error("unknown row in columns",
-                          "NAME BADCOLROW\n"
-                          "ROWS\n"
-                          " N OBJ\n"
-                          " L R1\n"
-                          "COLUMNS\n"
-                          " X1 MISSING 1\n"
-                          "RHS\n"
-                          " RHS1 R1 0\n"
-                          "ENDATA\n");
-
-  expect_fast_parse_error("unknown row in rhs",
-                          "NAME BADRHSROW\n"
-                          "ROWS\n"
-                          " N OBJ\n"
-                          " L R1\n"
-                          "COLUMNS\n"
-                          " X1 OBJ 1 R1 2\n"
-                          "RHS\n"
-                          " RHS1 MISSING 1\n"
-                          "ENDATA\n");
-
-  expect_fast_parse_error("unknown bound type",
-                          "NAME BADBOUND\n"
-                          "ROWS\n"
-                          " N OBJ\n"
-                          " L R1\n"
-                          "COLUMNS\n"
-                          " X1 OBJ 1 R1 2\n"
-                          "RHS\n"
-                          " RHS1 R1 0\n"
-                          "BOUNDS\n"
-                          " XX B X1 1\n"
-                          "ENDATA\n");
-
-  expect_fast_parse_error("semi-continuous bound without value",
-                          "NAME BADSC\n"
-                          "ROWS\n"
-                          " N OBJ\n"
-                          " L R1\n"
-                          "COLUMNS\n"
-                          " X1 OBJ 1 R1 2\n"
-                          "RHS\n"
-                          " RHS1 R1 0\n"
-                          "BOUNDS\n"
-                          " SC B X1\n"
-                          "ENDATA\n");
+  {
+    TempMpsFile file(
+      "NAME BADOBJ\n"
+      "OBJSENSE\n"
+      " SIDEWAYS\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "ENDATA\n");
+    EXPECT_THROW(
+      {
+        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+      },
+      std::logic_error) << "bad objsense";
+  }
+
+  {
+    TempMpsFile file(
+      "NAME BADCOLROW\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 MISSING 1\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "ENDATA\n");
+    EXPECT_THROW(
+      {
+        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+      },
+      std::logic_error) << "unknown row in columns";
+  }
+
+  {
+    TempMpsFile file(
+      "NAME BADRHSROW\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 MISSING 1\n"
+      "ENDATA\n");
+    EXPECT_THROW(
+      {
+        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+      },
+      std::logic_error) << "unknown row in rhs";
+  }
+
+  {
+    TempMpsFile file(
+      "NAME BADBOUND\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "BOUNDS\n"
+      " XX B X1 1\n"
+      "ENDATA\n");
+    EXPECT_THROW(
+      {
+        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+      },
+      std::logic_error) << "unknown bound type";
+  }
+
+  {
+    TempMpsFile file(
+      "NAME BADSC\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "BOUNDS\n"
+      " SC B X1\n"
+      "ENDATA\n");
+    EXPECT_THROW(
+      {
+        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+      },
+      std::logic_error) << "semi-continuous bound without value";
+  }
 }
 
-void large_columns_repeated_column_chunk_boundary()
+TEST(FastMpsParserEdgeTest, LargeColumnsRepeatedColumnChunkBoundary)
 {
   constexpr size_t row_count = 180000;
   std::string mps;
@@ -740,13 +687,13 @@ void large_columns_repeated_column_chunk_boundary()
   TempMpsFile file(std::move(mps));
   auto model =
     mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-  expect_model_shapes(
+  check_model_shapes(
     model, static_cast<int>(row_count), 2, static_cast<int>(row_count + 1), "large columns");
-  expect_eq(model.var_names_.at(0), std::string("XBIG"), "large repeated column name");
-  expect_eq(model.var_names_.at(1), std::string("XTAIL"), "large tail column name");
+  EXPECT_EQ(std::string("XBIG"), model.var_names_.at(0)) << "large repeated column name";
+  EXPECT_EQ(std::string("XTAIL"), model.var_names_.at(1)) << "large tail column name";
 }
 
-void large_bounds_repeated_var_stays_ordered()
+TEST(FastMpsParserEdgeTest, LargeBoundsRepeatedVarStaysOrdered)
 {
   constexpr size_t repeat_count = 700000;
   std::string mps;
@@ -763,13 +710,12 @@ void large_bounds_repeated_var_stays_ordered()
   TempMpsFile file(std::move(mps));
   auto model =
     mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-  expect_model_shapes(model, 1, 1, 1, "large bounds");
-  expect_eq(model.variable_upper_bounds_.at(0),
-            static_cast<double>((repeat_count - 1) % 1000),
-            "large repeated bounds last value");
+  check_model_shapes(model, 1, 1, 1, "large bounds");
+  EXPECT_EQ(static_cast<double>((repeat_count - 1) % 1000), model.variable_upper_bounds_.at(0))
+    << "large repeated bounds last value";
 }
 
-void lz4_and_raw_paths_match_on_multiblock_input()
+TEST(FastMpsParserEdgeTest, Lz4AndRawPathsMatchOnMultiblockInput)
 {
   constexpr size_t row_count = 70000;
   std::string mps;
@@ -795,27 +741,27 @@ void lz4_and_raw_paths_match_on_multiblock_input()
   TempMpsFile raw_file(std::move(mps));
   TempOwnedPath lz4_file(raw_file.path + ".lz4");
   const std::string cmd = "lz4 -f -q " + raw_file.path + " " + lz4_file.path;
-  if (std::system(cmd.c_str()) != 0) { throw skip_test("lz4 CLI unavailable"); }
+  if (std::system(cmd.c_str()) != 0) { GTEST_SKIP() << "lz4 CLI unavailable"; }
 
   auto raw =
     mps_fast::parse_mps_fast_file<int, double>(raw_file.path, mps_fast::FileReadMethod::Read);
   auto lz4 =
     mps_fast::parse_mps_fast_file<int, double>(lz4_file.path, mps_fast::FileReadMethod::Read);
 
-  expect_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity");
-  expect_eq(lz4.var_names_.size(), raw.var_names_.size(), "lz4 var name count");
-  expect_eq(lz4.row_names_.size(), raw.row_names_.size(), "lz4 row name count");
-  expect_vector_eq(lz4.A_, raw.A_, "lz4 A values");
-  expect_vector_eq(lz4.A_indices_, raw.A_indices_, "lz4 A indices");
-  expect_vector_eq(lz4.A_offsets_, raw.A_offsets_, "lz4 A offsets");
-  expect_vector_eq(lz4.c_, raw.c_, "lz4 objective");
-  expect_vector_eq(lz4.b_, raw.b_, "lz4 rhs");
-  expect_vector_eq(lz4.var_types_, raw.var_types_, "lz4 var types");
-  expect_vector_eq(lz4.variable_lower_bounds_, raw.variable_lower_bounds_, "lz4 lower bounds");
-  expect_vector_eq(lz4.variable_upper_bounds_, raw.variable_upper_bounds_, "lz4 upper bounds");
+  check_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity");
+  EXPECT_EQ(raw.var_names_.size(), lz4.var_names_.size()) << "lz4 var name count";
+  EXPECT_EQ(raw.row_names_.size(), lz4.row_names_.size()) << "lz4 row name count";
+  EXPECT_EQ(raw.A_, lz4.A_) << "lz4 A values";
+  EXPECT_EQ(raw.A_indices_, lz4.A_indices_) << "lz4 A indices";
+  EXPECT_EQ(raw.A_offsets_, lz4.A_offsets_) << "lz4 A offsets";
+  EXPECT_EQ(raw.c_, lz4.c_) << "lz4 objective";
+  EXPECT_EQ(raw.b_, lz4.b_) << "lz4 rhs";
+  EXPECT_EQ(raw.var_types_, lz4.var_types_) << "lz4 var types";
+  EXPECT_EQ(raw.variable_lower_bounds_, lz4.variable_lower_bounds_) << "lz4 lower bounds";
+  EXPECT_EQ(raw.variable_upper_bounds_, lz4.variable_upper_bounds_) << "lz4 upper bounds";
 }
 
-void gzip_bzip2_and_raw_paths_match()
+TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch)
 {
   std::string mps;
   mps += "NAME COMPRESSED\nROWS\n N OBJ\n L R1\n G R2\nCOLUMNS\n";
@@ -828,8 +774,8 @@ void gzip_bzip2_and_raw_paths_match()
 
   const std::string gzip_cmd  = "gzip -c " + raw_file.path + " > " + gzip_file.path;
   const std::string bzip2_cmd = "bzip2 -c " + raw_file.path + " > " + bzip2_file.path;
-  if (std::system(gzip_cmd.c_str()) != 0) { throw skip_test("gzip CLI unavailable"); }
-  if (std::system(bzip2_cmd.c_str()) != 0) { throw skip_test("bzip2 CLI unavailable"); }
+  if (std::system(gzip_cmd.c_str()) != 0) { GTEST_SKIP() << "gzip CLI unavailable"; }
+  if (std::system(bzip2_cmd.c_str()) != 0) { GTEST_SKIP() << "bzip2 CLI unavailable"; }
 
   auto raw =
     mps_fast::parse_mps_fast_file<int, double>(raw_file.path, mps_fast::FileReadMethod::Read);
@@ -838,78 +784,22 @@ void gzip_bzip2_and_raw_paths_match()
   auto bzip2 =
     mps_fast::parse_mps_fast_file<int, double>(bzip2_file.path, mps_fast::FileReadMethod::Read);
 
-  expect_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity");
-  expect_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity");
-  expect_vector_eq(gzip.A_, raw.A_, "gzip A values");
-  expect_vector_eq(bzip2.A_, raw.A_, "bzip2 A values");
-  expect_vector_eq(gzip.A_indices_, raw.A_indices_, "gzip A indices");
-  expect_vector_eq(bzip2.A_indices_, raw.A_indices_, "bzip2 A indices");
-  expect_vector_eq(gzip.A_offsets_, raw.A_offsets_, "gzip A offsets");
-  expect_vector_eq(bzip2.A_offsets_, raw.A_offsets_, "bzip2 A offsets");
-  expect_vector_eq(gzip.c_, raw.c_, "gzip objective");
-  expect_vector_eq(bzip2.c_, raw.c_, "bzip2 objective");
-  expect_vector_eq(gzip.b_, raw.b_, "gzip rhs");
-  expect_vector_eq(bzip2.b_, raw.b_, "bzip2 rhs");
-  expect_vector_eq(gzip.variable_lower_bounds_, raw.variable_lower_bounds_, "gzip lower bounds");
-  expect_vector_eq(bzip2.variable_lower_bounds_, raw.variable_lower_bounds_, "bzip2 lower bounds");
-  expect_vector_eq(gzip.variable_upper_bounds_, raw.variable_upper_bounds_, "gzip upper bounds");
-  expect_vector_eq(bzip2.variable_upper_bounds_, raw.variable_upper_bounds_, "bzip2 upper bounds");
-  expect_vector_eq(gzip.var_types_, raw.var_types_, "gzip var types");
-  expect_vector_eq(bzip2.var_types_, raw.var_types_, "bzip2 var types");
-}
-
-}  // namespace
-
-int main()
-{
-  struct TestCase {
-    const char* name;
-    void (*fn)();
-  };
-
-  const TestCase tests[] = {
-    {"ScannerFindsSectionSplitAcrossBlocks", scanner_finds_section_split_across_blocks},
-    {"ScannerFindsHeadersSplitAtEveryByte", scanner_finds_headers_split_at_every_byte},
-    {"ScannerRejectsUnknownColumnOneRecordsAfterRows",
-     scanner_rejects_unknown_column_one_records_after_rows},
-    {"BoundsDefaultsAndTypesMatchReference", bounds_defaults_and_types_match_reference},
-    {"DuplicateBoundsLastStatementWins", duplicate_bounds_last_statement_wins},
-    {"NondenseRowAndColumnNamesUseHashPath", nondense_row_and_column_names_use_hash_path},
-    {"MissingOptionalBoundsFastPath", missing_optional_bounds_fast_path},
-    {"BoundsOnlyVariablesAreAppendedDeterministically",
-     bounds_only_variables_are_appended_deterministically},
-    {"IntegerMarkersAssignTypesAndDefaultBounds", integer_markers_assign_types_and_default_bounds},
-    {"NumericParsingIntegrationMatchesReferenceBitwise",
-     numeric_parsing_integration_matches_reference_bitwise},
-    {"CrlfLineEndingsMatchReferenceBitwise", crlf_line_endings_match_reference_bitwise},
-    {"CommentPlacementSupportedCasesMatchReferenceBitwise",
-     comment_placement_supported_cases_match_reference_bitwise},
-    {"ObjectiveMetadataSelectsNamedObjective", objective_metadata_selects_named_objective},
-    {"MalformedInputsReportErrors", malformed_inputs_report_errors},
-    {"LargeColumnsRepeatedColumnChunkBoundary", large_columns_repeated_column_chunk_boundary},
-    {"LargeBoundsRepeatedVarStaysOrdered", large_bounds_repeated_var_stays_ordered},
-    {"Lz4AndRawPathsMatchOnMultiblockInput", lz4_and_raw_paths_match_on_multiblock_input},
-    {"GzipBzip2AndRawPathsMatch", gzip_bzip2_and_raw_paths_match},
-  };
-
-  int failed = 0;
-  for (const TestCase& test : tests) {
-    std::cout << "[ RUN      ] " << test.name << '\n';
-    try {
-      test.fn();
-      std::cout << "[       OK ] " << test.name << '\n';
-    } catch (const skip_test& e) {
-      std::cout << "[  SKIPPED ] " << test.name << ": " << e.what() << '\n';
-    } catch (const std::exception& e) {
-      ++failed;
-      std::cerr << "[  FAILED  ] " << test.name << ": " << e.what() << '\n';
-    }
-  }
-
-  if (failed != 0) {
-    std::cerr << failed << " test(s) failed\n";
-    return 1;
-  }
-  std::cout << "[  PASSED  ] " << std::size(tests) << " test(s)\n";
-  return 0;
+  check_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity");
+  check_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity");
+  EXPECT_EQ(raw.A_, gzip.A_) << "gzip A values";
+  EXPECT_EQ(raw.A_, bzip2.A_) << "bzip2 A values";
+  EXPECT_EQ(raw.A_indices_, gzip.A_indices_) << "gzip A indices";
+  EXPECT_EQ(raw.A_indices_, bzip2.A_indices_) << "bzip2 A indices";
+  EXPECT_EQ(raw.A_offsets_, gzip.A_offsets_) << "gzip A offsets";
+  EXPECT_EQ(raw.A_offsets_, bzip2.A_offsets_) << "bzip2 A offsets";
+  EXPECT_EQ(raw.c_, gzip.c_) << "gzip objective";
+  EXPECT_EQ(raw.c_, bzip2.c_) << "bzip2 objective";
+  EXPECT_EQ(raw.b_, gzip.b_) << "gzip rhs";
+  EXPECT_EQ(raw.b_, bzip2.b_) << "bzip2 rhs";
+  EXPECT_EQ(raw.variable_lower_bounds_, gzip.variable_lower_bounds_) << "gzip lower bounds";
+  EXPECT_EQ(raw.variable_lower_bounds_, bzip2.variable_lower_bounds_) << "bzip2 lower bounds";
+  EXPECT_EQ(raw.variable_upper_bounds_, gzip.variable_upper_bounds_) << "gzip upper bounds";
+  EXPECT_EQ(raw.variable_upper_bounds_, bzip2.variable_upper_bounds_) << "bzip2 upper bounds";
+  EXPECT_EQ(raw.var_types_, gzip.var_types_) << "gzip var types";
+  EXPECT_EQ(raw.var_types_, bzip2.var_types_) << "bzip2 var types";
 }
diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp
index 12f9ed488a..3b01f10227 100644
--- a/cpp/tests/linear_programming/parser_test.cpp
+++ b/cpp/tests/linear_programming/parser_test.cpp
@@ -139,23 +139,12 @@ double q_entry(const mps_data_model_t<int, double>& m, int row, int col)
 
 class parser_fixture_base : public ::testing::TestWithParam<mps_reader_param_t> {
  protected:
-  static mps_data_model_t<int, double> read_mps_file(const std::string& file,
-                                                     bool fixed_format = true)
-  {
-    const std::string& root = cuopt::test::get_rapids_dataset_root_dir();
-    return read_mps<int, double>(root + "/" + file, fixed_format);
-  }
-
-  mps_data_model_t<int, double> read_param_mps_file(const std::string& file,
-                                                    bool fixed_format = true) const
+  mps_data_model_t<int, double> read_mps_file(const std::string& file,
+                                              bool fixed_format = true) const
   {
     const std::string& root = cuopt::test::get_rapids_dataset_root_dir();
     const auto reader       = GetParam().reader;
-    // The experimental reader has no fixed/free parser mode. Use the same file but do not force
-    // fixed-format dispatch for that reader.
-    const bool reader_fixed_format =
-      reader == mps_reader_type_t::default_reader ? fixed_format : false;
-    return read<int, double>(root + "/" + file, reader, reader_fixed_format);
+    return read<int, double>(root + "/" + file, reader, fixed_format);
   }
 
   static mps_data_model_t<int, double> read_lp_file(const std::string& file)
@@ -386,7 +375,7 @@ TEST(mps_parser, bad_mps_files)
 
 TEST_P(good_mps_1_test, mps)
 {
-  check_model(read_param_mps_file("linear_programming/good-mps-1.mps", false));
+  check_model(read_mps_file("linear_programming/good-mps-1.mps", false));
 }
 
 TEST_F(good_mps_1_test, mps_parser_internals)
@@ -625,7 +614,7 @@ TEST(mps_parser_free_format, bad_mps_files_free_format)
 
 TEST_P(up_low_bounds_test, mps)
 {
-  check_model(read_param_mps_file("linear_programming/lp_model_with_var_bounds.mps", false));
+  check_model(read_mps_file("linear_programming/lp_model_with_var_bounds.mps", false));
 }
 
 TEST_F(up_low_bounds_test, mps_parser_internals)
@@ -646,12 +635,12 @@ TEST_P(good_mps_1_test, mps_free_format)
 {
   // free-format-mps-1.mps encodes the same problem as good-mps-1 with default
   // [0, +inf) bounds (no BOUNDS section), so it satisfies the same checker.
-  check_model(read_param_mps_file("linear_programming/free-format-mps-1.mps", false));
+  check_model(read_mps_file("linear_programming/free-format-mps-1.mps", false));
 }
 
 TEST_P(some_var_bounds_test, mps)
 {
-  check_model(read_param_mps_file("linear_programming/good-mps-some-var-bounds.mps"));
+  check_model(read_mps_file("linear_programming/good-mps-some-var-bounds.mps"));
 }
 
 TEST_F(some_var_bounds_test, lp)
@@ -661,7 +650,7 @@ TEST_F(some_var_bounds_test, lp)
 
 TEST_P(fixed_var_bound_test, mps)
 {
-  check_model(read_param_mps_file("linear_programming/good-mps-fixed-var.mps"));
+  check_model(read_mps_file("linear_programming/good-mps-fixed-var.mps"));
 }
 
 TEST_F(fixed_var_bound_test, lp)
@@ -671,7 +660,7 @@ TEST_F(fixed_var_bound_test, lp)
 
 TEST_P(free_var_bound_test, mps)
 {
-  check_model(read_param_mps_file("linear_programming/good-mps-free-var.mps"));
+  check_model(read_mps_file("linear_programming/good-mps-free-var.mps"));
 }
 
 TEST_F(free_var_bound_test, lp)
@@ -681,7 +670,7 @@ TEST_F(free_var_bound_test, lp)
 
 TEST_P(lower_inf_var_bound_test, mps)
 {
-  check_model(read_param_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps"));
+  check_model(read_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps"));
 }
 
 TEST_F(lower_inf_var_bound_test, lp)
@@ -699,7 +688,7 @@ TEST(mps_bounds, rhs_cost)
 
 TEST_P(upper_inf_var_bound_test, mps)
 {
-  check_model(read_param_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps"));
+  check_model(read_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps"));
 }
 
 TEST_F(upper_inf_var_bound_test, lp)
@@ -854,7 +843,7 @@ TEST(mps_bounds, unsupported_or_invalid_mps_types)
 
 TEST_P(mip_with_bounds_test, mps)
 {
-  check_model(read_param_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false));
+  check_model(read_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false));
 }
 
 TEST_F(mip_with_bounds_test, mps_parser_internals)
@@ -918,7 +907,7 @@ TEST(mps_parser, good_mps_file_mip_no_marker)
 
 TEST_P(mip_no_bounds_test, mps)
 {
-  check_model(read_param_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false));
+  check_model(read_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false));
 }
 
 TEST_F(mip_no_bounds_test, lp)
@@ -928,8 +917,7 @@ TEST_F(mip_no_bounds_test, lp)
 
 TEST_P(mip_partial_bounds_test, mps)
 {
-  check_model(
-    read_param_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false));
+  check_model(read_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false));
 }
 
 TEST_F(mip_partial_bounds_test, lp)

From 79e958ed4ce2e672780b9fc8ca81c6d31010fb1a Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Fri, 12 Jun 2026 02:02:35 -0700
Subject: [PATCH 10/22] cleanup for clarity

---
 cpp/src/io/CMakeLists.txt                     |   1 -
 .../fast_fp64_parser.hpp                      |   5 +-
 .../fast_parse_primitives.hpp                 |  52 +--
 .../io/experimental_mps_fast/fast_parser.cpp  | 338 +++++++-----------
 .../fast_parser_adapter.cpp                   |  32 --
 .../hash_table_smallstr.hpp                   | 242 ++++++++++++-
 cpp/src/io/parser.cpp                         |  19 +
 7 files changed, 413 insertions(+), 276 deletions(-)
 delete mode 100644 cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp

diff --git a/cpp/src/io/CMakeLists.txt b/cpp/src/io/CMakeLists.txt
index 4c99b1848b..cafcffb23f 100644
--- a/cpp/src/io/CMakeLists.txt
+++ b/cpp/src/io/CMakeLists.txt
@@ -5,7 +5,6 @@
 
 set(MPS_FAST_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_adapter.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/file_reader.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/lz4_file_reader.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/mps_section_scanner.cpp
diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
index e446494639..b7987738fc 100644
--- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
@@ -33,6 +33,7 @@ namespace fp64 {
 // Fast FP64 parser optimized for the <=19digits case, based on the Eisel-Lemire algorithm
 // see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51
 // (8), 2021.
+// verified on a large corpus of FP64 values: https://github.com/lemire/simple_fastfloat_benchmark
 
 struct power_10_lut_entry_t {
   uint64_t high;
@@ -181,6 +182,8 @@ struct parsed_decimal_t {
 static inline bool is_digit(char c) noexcept { return c >= '0' && c <= '9'; }
 
 // SWAR 8char run of digits -> integer representation
+// better and more portable than AVX2 stuff since AVX2 doesn't like swizzling across 16B lanes
+// saw no real difference w/ 16B SSE
 static inline bool parse_8_digits(const char* p, uint32_t& out)
 {
   // comply with strict aliasing rules
@@ -313,7 +316,7 @@ static inline bool parse_decimal_advance(const char*& p, const char* end, parsed
 static inline double fallback_strtod(std::string_view s)
 {
   char stack_buf[32];
-  // The MPS specs mandate that numeric tokens are no longer than 25 characters
+  // The MPS specs mandate that numeric tokens are not longer than 25 characters
   if (s.size() >= sizeof(stack_buf)) {
     mps_parser_fail(error_type_t::ValidationError, "MPS numeric token exceeds supported length");
   }
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
index d3317c50e1..f35726a118 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -7,9 +7,6 @@
 
 #include <cstdarg>
 #include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <string_view>
 #include <utility>
 
 #include <simde/x86/avx2.h>
@@ -30,6 +27,8 @@ enum scan_mode {
   until_whitespace,
 };
 
+// util to serially scan along an in-memory input buffer
+// contains optimized primitives for most parsing operations
 struct cursor_t {
   const char* start;
   const char* ptr;
@@ -39,7 +38,8 @@ struct cursor_t {
 
   bool done() const { return ptr >= end; }
 
-  std::pair<std::size_t, std::size_t> position() const
+  // used in error reporting
+  std::pair<std::size_t, std::size_t> linecol_position() const
   {
     std::size_t line       = 1;
     const char* line_start = start;
@@ -55,7 +55,7 @@ struct cursor_t {
 
   [[noreturn]] void error(const char* msg, ...)
   {
-    auto [line, col] = position();
+    auto [line, col] = linecol_position();
     va_list args;
     va_start(args, msg);
     char msg_buf[512];
@@ -66,9 +66,7 @@ struct cursor_t {
 
   void advance(std::size_t n)
   {
-    if (ptr + n > end) {
-      mps_parser_fail(error_type_t::ValidationError, "cursor advanced past end of file");
-    }
+    if (ptr + n > end) { mps_parser_fail(error_type_t::ValidationError, "Unexpected end of file"); }
     ptr += n;
   }
 
@@ -87,10 +85,11 @@ struct cursor_t {
     return end;
   }
 
+  // scans for the first non-whitespace byte, or the first whitespace byte, depending on mode
   template <scan_mode mode>
   static const char* simd_scan(const char* p, const char* end)
   {
-    const simde__m256i v32 = simde_mm256_set1_epi8(32);
+    const simde__m256i v32 = simde_mm256_set1_epi8(32);  // space/control characters
     const simde__m256i vnl = simde_mm256_set1_epi8('\n');
 
     while (p + 32 <= end) {
@@ -125,6 +124,7 @@ struct cursor_t {
     if (ptr < end && *ptr == '\n') { ptr++; }
   }
 
+  // could be SIMD but comments are usually rare
   void skip_comment_line()
   {
     while (!done() && *ptr != '\n' && *ptr != '\r') {
@@ -140,6 +140,7 @@ struct cursor_t {
     }
   }
 
+  // useful for parsing NAME/OBJNAME which may span multiple "fields" according to the MPS spec
   std::string_view read_rest_of_line_trimmed()
   {
     const char* begin    = ptr;
@@ -173,8 +174,8 @@ struct cursor_t {
     const simde__m256i v32 = simde_mm256_set1_epi8(32);
     const simde__m256i vnl = simde_mm256_set1_epi8('\n');
 
-    // All input streams provide trailing padding, so this unaligned 32-byte load is valid
-    // whenever end - ptr >= 32.
+    // all input streams provide trailing padding, so this 32B load is valid
+    // whenever end - ptr >= 32
     simde__m256i data    = simde_mm256_loadu_si256((const simde__m256i*)ptr);
     simde__m256i gt32    = simde_mm256_cmpgt_epi8(data, v32);
     unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32);
@@ -204,6 +205,7 @@ struct cursor_t {
     return std::string_view(field_start, field_end - field_start);
   }
 
+  // read but do not consume
   inline __attribute__((always_inline)) std::string_view peek_field()
   {
     if (UNLIKELY(done())) { return {}; }
@@ -218,6 +220,7 @@ struct cursor_t {
     return cursor.peek_field();
   }
 
+  // MPS fields usually come in pairs; both can usually be extracted from a single 32B load
   inline __attribute__((always_inline)) std::pair<std::string_view, std::string_view>
   read_two_fields()
   {
@@ -234,31 +237,30 @@ struct cursor_t {
     const simde__m256i vnl   = simde_mm256_set1_epi8('\n');
 
     // Same padded-buffer contract as read_field().
-    simde__m256i data  = simde_mm256_loadu_si256((const simde__m256i*)ptr);
-    simde__m256i gt32  = simde_mm256_cmpgt_epi8(data, v32);
-    simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
+    simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr);
+    simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32);
 
     unsigned int printable_mask = (unsigned int)simde_mm256_movemask_epi8(gt32);
     unsigned int ws_mask        = ~printable_mask;
-    unsigned int nl_mask        = (unsigned int)simde_mm256_movemask_epi8(is_nl);
-    unsigned int stop_mask      = printable_mask | nl_mask;
 
     if (UNLIKELY(ws_mask == 0)) { return slow(); }
     int field1_end_off = __builtin_ctz(ws_mask);
 
-    unsigned int after_field1 = stop_mask & ~((1u << field1_end_off) - 1);
-    if (UNLIKELY(after_field1 == 0)) { return slow(); }
-    int field2_start_off = __builtin_ctz(after_field1);
+    unsigned int printable_after_field1 = printable_mask >> field1_end_off;
+    if (UNLIKELY(printable_after_field1 == 0)) { return slow(); }
+    int field2_start_off = field1_end_off + __builtin_ctz(printable_after_field1);
 
     if (UNLIKELY(ptr[field2_start_off] == '\n')) { return slow(); }
 
-    unsigned int ws_after_field2_start = ws_mask & ~((1u << field2_start_off) - 1);
+    unsigned int ws_after_field2_start = ws_mask >> field2_start_off;
     if (UNLIKELY(ws_after_field2_start == 0)) { return slow(); }
-    int field2_end_off = __builtin_ctz(ws_after_field2_start);
+    int field2_end_off = field2_start_off + __builtin_ctz(ws_after_field2_start);
 
-    unsigned int after_field2 = stop_mask & ~((1u << field2_end_off) - 1);
-    if (LIKELY(after_field2 != 0)) {
-      ptr = ptr + __builtin_ctz(after_field2);
+    simde__m256i is_nl     = simde_mm256_cmpeq_epi8(data, vnl);
+    unsigned int stop_mask = printable_mask | (unsigned int)simde_mm256_movemask_epi8(is_nl);
+    unsigned int stop_after_field2 = stop_mask >> field2_end_off;
+    if (LIKELY(stop_after_field2 != 0)) {
+      ptr = ptr + field2_end_off + __builtin_ctz(stop_after_field2);
     } else {
       ptr = ptr + field2_end_off;
       skip_ws();
@@ -346,8 +348,6 @@ static inline double expect_number(cursor_t& cursor)
 static inline double expect_number_fast_pm_one(cursor_t& cursor)
 {
   const char* p = cursor.ptr;
-  // Kept bounded despite the global padding invariant: this path is also used
-  // on section-local cursors whose logical end may precede the physical buffer.
   if (cursor.end - p >= 3 && p[0] == '-' && p[1] == '1' && p[2] <= ' ') {
     cursor.ptr = p + 2;
     cursor.skip_ws();
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index bc9000f8f3..3e47c7ee8c 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -28,6 +28,7 @@
 #include <chrono>
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <exception>
 #include <limits>
@@ -67,12 +68,6 @@ static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8 * MiB;
 static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * MiB;
 // parser-wide thread cap switch; very small files lose to scheduling overhead
 static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES = 100ull * 1000ull * 1000ull;
-// below this, the serial row-hash build is usually cheaper than partition setup
-static constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * KiB;
-// number of partitions for the row hash table, used to avoid races and atomics during row hash
-// table initialization
-static constexpr int MPS_ROW_HASH_PARTITION_BITS = 5;
-static constexpr size_t MPS_ROW_HASH_PARTITIONS  = (size_t{1} << MPS_ROW_HASH_PARTITION_BITS);
 // thread caps for small and large files
 static constexpr int MPS_SMALL_FILE_THREAD_CAP = 16;
 static constexpr int MPS_LARGE_FILE_THREAD_CAP = 32;
@@ -132,12 +127,6 @@ class chunk_name_arena_t {
   size_t next_slab_size_ = 64 * KiB;
 };
 
-// returns the hash table partition to use for a given hash
-static inline size_t row_hash_partition_for(uint32_t hash)
-{
-  return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS));
-}
-
 struct timer_entry_t {
   const char* name;
   double elapsed_ms;
@@ -452,12 +441,6 @@ static inline void observe_dense_name(bool& candidate,
 
 template <typename i_t, typename f_t>
 struct parse_state_t {
-  struct row_hash_partition_t {
-    hash_slot_var_t* slots = nullptr;
-    size_t buckets         = 0;
-    size_t mask            = 0;
-  };
-
   cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& problem;
   cursor_t& cursor;
 
@@ -475,17 +458,7 @@ struct parse_state_t {
   index_mode_t col_index_mode = index_mode_t::hash;
   dense_name_index_t col_dense;
 
-  // Row name hash table - sized at runtime based on row count
-  size_t row_hash_buckets = 0;
-  size_t row_hash_mask    = 0;  // buckets - 1, for fast modulo via &
-  mmap_region_t row_hash_region;
-  hash_slot_var_t* row_names_ht = nullptr;
-  // compute hash, select the subtable from high hash bits,
-  // then run the same open-addressing probe loop inside that subtable.
-  size_t row_hash_partition_count                                               = 0;
-  std::array<row_hash_partition_t, MPS_ROW_HASH_PARTITIONS> row_hash_partitions = {};
-  // Overflow map for row names longer than HASH_KEY_BYTES (usually very rare)
-  std::unordered_map<std::string_view, size_t> row_names_long;
+  smallstr_hash_table_t row_hash_;
 
   // Row name lookup for labels like R0001, R0002, ...
   index_mode_t row_index_mode = index_mode_t::hash;
@@ -495,6 +468,13 @@ struct parse_state_t {
   // var_names still uses STL (only used in parse_bounds, not as hot)
   std::unordered_map<std::string_view, size_t> var_names_map;
 
+  mmap_region_t temp_A_region;
+  mmap_region_t temp_A_indices_region;
+  f_t* temp_A                = nullptr;
+  i_t* temp_A_indices        = nullptr;
+  size_t temp_csr_nnz        = 0;
+  bool temp_csr_materialized = false;
+
   struct bounds_only_var_t {
     f_t lb    = f_t{0};
     f_t ub    = std::numeric_limits<f_t>::infinity();
@@ -537,23 +517,17 @@ struct parse_state_t {
     return true;
   }
 
-  size_t row_hash_bucket_count_for(size_t n_rows) const
-  {
-#ifdef MPS_FAST_COMPACT_ROW_HASH
-    // probe counts are usually low, and a smaller
-    // table reduces cache/TLB footprint on medium instances.
-    return cuda::next_power_of_two(std::max(n_rows + n_rows / 2, (size_t)64));
-#else
-    return cuda::next_power_of_two(std::max((size_t)(n_rows * 2), (size_t)64));
-#endif
-  }
-
   void init_row_hash_table_impl()
   {
     scoped_timer_t timer("row_hash_init_total");
     size_t n_rows              = row_names_sv.size();
     const int num_threads      = phase_thread_count(MPS_ROWS_THREAD_CAP);
     const bool use_partitioned = n_rows >= MPS_ROW_HASH_PARTITIONED_MIN_ROWS && num_threads > 1;
+#ifdef MPS_FAST_COMPACT_ROW_HASH
+    constexpr bool compact_row_hash = true;
+#else
+    constexpr bool compact_row_hash = false;
+#endif
     std::vector<uint32_t> row_hashes;
     std::vector<size_t> row_order;
     std::array<size_t, MPS_ROW_HASH_PARTITIONS> partition_counts      = {};
@@ -561,19 +535,17 @@ struct parse_state_t {
 
     if (use_partitioned) {
       scoped_timer_t timer("row_hash_partition_metadata");
-      // Pre-hash once, count rows per partition, then pack row indices by partition.
-      // This turns the build into disjoint single-writer table fills.
       row_hashes.resize(n_rows);
       size_t inline_rows = 0;
       for (size_t idx = 0; idx < n_rows; ++idx) {
         std::string_view name = row_names_sv[idx];
         if (UNLIKELY(name.size() > HASH_KEY_BYTES)) {
-          row_names_long[name] = idx;
+          row_hash_.note_long_name(name, idx);
           continue;
         }
         uint32_t hash   = fnv1a_hash(name.data(), name.size());
         row_hashes[idx] = hash;
-        ++partition_counts[row_hash_partition_for(hash)];
+        ++partition_counts[hash_partition_for(hash)];
         ++inline_rows;
       }
 
@@ -585,102 +557,55 @@ struct parse_state_t {
       auto next_offsets = partition_offsets;
       for (size_t idx = 0; idx < n_rows; ++idx) {
         if (UNLIKELY(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; }
-        size_t part                     = row_hash_partition_for(row_hashes[idx]);
+        size_t part                     = hash_partition_for(row_hashes[idx]);
         row_order[next_offsets[part]++] = idx;
       }
     }
 
     if (use_partitioned) {
-      row_hash_partition_count = MPS_ROW_HASH_PARTITIONS;
-      size_t total_buckets     = 0;
-      for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
-        row_hash_partitions[p].buckets = row_hash_bucket_count_for(partition_counts[p]);
-        row_hash_partitions[p].mask    = row_hash_partitions[p].buckets - 1;
-        total_buckets += row_hash_partitions[p].buckets;
-      }
-      row_hash_buckets = total_buckets;
-      row_hash_mask    = row_hash_buckets - 1;
+      row_hash_.configure_partitioned_buckets(partition_counts, compact_row_hash);
     } else {
-      row_hash_partition_count = 0;
-      row_hash_buckets         = row_hash_bucket_count_for(n_rows);
-      row_hash_mask            = row_hash_buckets - 1;
+      row_hash_.configure_serial_buckets(n_rows, compact_row_hash);
     }
-    size_t row_hash_mmap_size = row_hash_buckets * sizeof(hash_slot_var_t);
 
     {
       scoped_timer_t timer("row_hash_mmap");
-      // Use mmap for allocation - the OS provides zero'd pages
-      row_hash_region = mmap_region_t::anonymous(
-        row_hash_mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, "row hash table");
-      row_names_ht = (hash_slot_var_t*)row_hash_region.data();
-      if (use_partitioned) {
-        hash_slot_var_t* next_slots = row_names_ht;
-        for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
-          row_hash_partitions[p].slots = next_slots;
-          next_slots += row_hash_partitions[p].buckets;
-        }
-      }
-      // request huge pages to reduce TLB misses
-      row_hash_region.advise(MADV_HUGEPAGE);
+      row_hash_.allocate_mmap("row hash table");
     }
 
-    // pre-touch the 2MB huge pages to nudge the kernel into allocating them
 #ifdef MPS_FAST_THP_PREFAULT
     {
       scoped_timer_t timer("row_hash_thp_prefault");
-      materialize_hugepages(
-        "row_names_ht", row_names_ht, row_hash_region.size(), materialize_touch_t::write_2mb);
+      materialize_hugepages("row_names_ht",
+                            row_hash_.slots(),
+                            row_hash_.region().size(),
+                            materialize_touch_t::write_2mb);
     }
 #endif
 
     {
       scoped_timer_t timer("row_hash_insert_all");
-#ifdef MPS_FAST_PERF_COUNTERS
-      size_t total_probes = 0;
-      size_t max_probes   = 0;
-      size_t long_names   = row_names_long.size();
-#endif
+      row_hash_.reset_build_probe_stats();
       if (use_partitioned) {
         scoped_timer_t timer("row_hash_insert_partitioned");
 #ifdef MPS_FAST_PERF_COUNTERS
         std::vector<perf_counter_snapshot_t> perf_snapshots(MPS_ROW_HASH_PARTITIONS);
-        std::vector<size_t> partition_total_probes(MPS_ROW_HASH_PARTITIONS, 0);
-        std::vector<size_t> partition_max_probes(MPS_ROW_HASH_PARTITIONS, 0);
 #endif
-// initialize the row hash tables in parallel
 #pragma omp parallel for schedule(static) num_threads(num_threads)
         for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) {
           size_t p = (size_t)part_id;
 #ifdef MPS_FAST_PERF_COUNTERS
           thread_perf_counters_t perf_counters;
-          size_t local_total_probes = 0;
-          size_t local_max_probes   = 0;
 #endif
-          const auto& part = row_hash_partitions[p];
-          // Each worker owns its subtable, so row_insert_into remains the plain serial probe loop.
           for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) {
             size_t idx = row_order[pos];
-#ifdef MPS_FAST_PERF_COUNTERS
-            size_t probes = row_insert_into(
-              part.slots, part.buckets, part.mask, row_names_sv[idx], row_hashes[idx], idx);
-            local_total_probes += probes;
-            local_max_probes = std::max(local_max_probes, probes);
-#else
-            row_insert_into(
-              part.slots, part.buckets, part.mask, row_names_sv[idx], row_hashes[idx], idx);
-#endif
+            row_hash_.insert_partition(p, row_names_sv[idx], row_hashes[idx], idx);
           }
 #ifdef MPS_FAST_PERF_COUNTERS
-          partition_total_probes[p] = local_total_probes;
-          partition_max_probes[p]   = local_max_probes;
-          perf_snapshots[p]         = perf_counters.stop();
+          perf_snapshots[p] = perf_counters.stop();
 #endif
         }
 #ifdef MPS_FAST_PERF_COUNTERS
-        for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
-          total_probes += partition_total_probes[p];
-          max_probes = std::max(max_probes, partition_max_probes[p]);
-        }
         print_perf_totals("row_hash_insert_partitioned", perf_snapshots);
 #endif
       } else {
@@ -688,42 +613,19 @@ struct parse_state_t {
         thread_perf_counters_t perf_counters;
 #endif
         for (size_t idx = 0; idx < n_rows; ++idx) {
-#ifdef MPS_FAST_PERF_COUNTERS
-          size_t probes = row_insert(row_names_sv[idx], idx);
-          if (probes == 0) {
-            ++long_names;
-          } else {
-            total_probes += probes;
-            max_probes = std::max(max_probes, probes);
-          }
-#else
-          row_insert(row_names_sv[idx], idx);
-#endif
+          row_hash_.insert_serial(row_names_sv[idx], idx);
         }
 #ifdef MPS_FAST_PERF_COUNTERS
         print_perf_totals("row_hash_insert_all", {perf_counters.stop()});
 #endif
       }
-#ifdef MPS_FAST_PERF_COUNTERS
-      size_t probed_rows = n_rows - long_names;
-      double mean_probes = probed_rows == 0 ? 0.0 : (double)total_probes / (double)probed_rows;
-      double load_factor = row_hash_buckets == 0 ? 0.0 : (double)n_rows / (double)row_hash_buckets;
-      std::fprintf(stderr,
-                   "[ROW_HASH_PROBES] rows=%zu buckets=%zu load=%.3f long=%zu mean=%.3f max=%zu\n",
-                   n_rows,
-                   row_hash_buckets,
-                   load_factor,
-                   long_names,
-                   mean_probes,
-                   max_probes);
-#endif
+      row_hash_.print_build_probe_report(n_rows);
     }
 
-    // Force the kernel to please please collapse the page range into THP pages
 #ifdef MPS_FAST_MADV_COLLAPSE
     {
       scoped_timer_t timer("row_hash_madv_collapse");
-      row_hash_region.advise(MADV_COLLAPSE);
+      row_hash_.region().advise(MADV_COLLAPSE);
     }
 #endif
   }
@@ -731,7 +633,7 @@ struct parse_state_t {
   size_t row_lookup(std::string_view name) const
   {
     if (LIKELY(row_index_mode == index_mode_t::dense_ordered)) { return row_dense.lookup(name); }
-    return row_lookup_hash(name);
+    return row_hash_.lookup(name);
   }
 
   size_t read_row_lookup_dense_ordered(cursor_t& cursor) const
@@ -774,75 +676,7 @@ struct parse_state_t {
     }
 
     auto row_name = cursor.read_field();
-    return row_lookup_hash(row_name);
-  }
-
-  size_t row_lookup_hash(std::string_view name) const
-  {
-    if (UNLIKELY(name.size() > HASH_KEY_BYTES)) {
-      auto it = row_names_long.find(name);
-      return it != row_names_long.end() ? it->second : SIZE_MAX;
-    }
-    hash_key_t key = make_key(name.data(), name.size());
-    uint32_t hash  = fnv1a_hash(name.data(), name.size());
-    if (LIKELY(row_hash_partition_count != 0)) {
-      // Lookups mirror the build routing and probe only the selected subtable.
-      const auto& part = row_hash_partitions[row_hash_partition_for(hash)];
-      return row_lookup_in(part.slots, part.buckets, part.mask, key, hash);
-    }
-    return row_lookup_in(row_names_ht, row_hash_buckets, row_hash_mask, key, hash);
-  }
-
-  size_t row_lookup_in(
-    const hash_slot_var_t* slots, size_t buckets, size_t mask, hash_key_t key, uint32_t hash) const
-  {
-    const hash_slot_var_t* slot = &slots[hash & (uint32_t)mask];
-    for (size_t i = 0; i < buckets; ++i, ++slot) {
-      if (slot >= &slots[buckets]) { slot = &slots[0]; }
-      if (slot->count == 0) { return SIZE_MAX; }
-      if (key_cmpeq(slot->key, key)) { return slot->count - 1; }
-    }
-    return SIZE_MAX;
-  }
-
-  size_t row_insert(std::string_view name, size_t index)
-  {
-    if (UNLIKELY(name.size() > HASH_KEY_BYTES)) {
-      row_names_long[name] = index;
-      return 0;
-    }
-    return row_insert_into(row_names_ht,
-                           row_hash_buckets,
-                           row_hash_mask,
-                           name,
-                           fnv1a_hash(name.data(), name.size()),
-                           index);
-  }
-
-  size_t row_insert_into(hash_slot_var_t* slots,
-                         size_t buckets,
-                         size_t mask,
-                         std::string_view name,
-                         uint32_t hash,
-                         size_t index)
-  {
-    hash_key_t key        = make_key(name.data(), name.size());
-    hash_slot_var_t* slot = &slots[hash & (uint32_t)mask];
-    for (size_t i = 0; i < buckets; ++i, ++slot) {
-      if (slot >= &slots[buckets]) { slot = &slots[0]; }
-      if (slot->count == 0) {
-        key_store(slot->key, key);            // Writes 32 bytes, including garbage in last 4
-        slot->count = (uint32_t)(index + 1);  // Overwrite last 4 bytes with actual count. i trust
-                                              // the compiler to optimize this
-        return i + 1;
-      }
-      if (key_cmpeq(slot->key, key)) {
-        slot->count = (uint32_t)(index + 1);
-        return i + 1;
-      }
-    }
-    // can't happen, the table is properly sized to fit all rows
-    __builtin_unreachable();
+    return row_hash_.lookup(row_name);
   }
 };
 
@@ -1736,19 +1570,28 @@ template <typename i_t, typename f_t>
 static void allocate_column_outputs(parse_state_t<i_t, f_t>& state,
                                     const column_merge_shape_t<i_t>& shape)
 {
-  scoped_timer_t timer("allocate_csr_arrays");
+  scoped_timer_t timer("allocate_temp_csr_arrays");
+  size_t values_bytes  = shape.total_nnz * sizeof(f_t);
+  size_t indices_bytes = shape.total_nnz * sizeof(i_t);
+  state.temp_csr_nnz   = shape.total_nnz;
 
-  // problem_t uses std::vector, so these resize() calls zero-initialize large arrays.
-  // Running them in parallel hides part of that page-fault and initialization cost.
 #pragma omp parallel sections num_threads(4)
   {
 #pragma omp section
     {
-      state.problem.A_.resize(shape.total_nnz);
+      state.temp_A_region = mmap_region_t::anonymous(
+        std::max<size_t>(values_bytes, 1), PROT_READ | PROT_WRITE, MAP_PRIVATE, "temp CSR values");
+      state.temp_A = (f_t*)state.temp_A_region.data();
+      state.temp_A_region.advise(MADV_HUGEPAGE);
     }
 #pragma omp section
     {
-      state.problem.A_indices_.resize(shape.total_nnz);
+      state.temp_A_indices_region = mmap_region_t::anonymous(std::max<size_t>(indices_bytes, 1),
+                                                             PROT_READ | PROT_WRITE,
+                                                             MAP_PRIVATE,
+                                                             "temp CSR column indices");
+      state.temp_A_indices        = (i_t*)state.temp_A_indices_region.data();
+      state.temp_A_indices_region.advise(MADV_HUGEPAGE);
     }
 #pragma omp section
     {
@@ -1788,16 +1631,16 @@ static void scatter_column_chunks_to_csr(parse_state_t<i_t, f_t>& state,
         size_t col_start = chunk.col_offsets[local_col];
         size_t col_end   = chunk.col_offsets[local_col + 1];
         for (size_t idx = col_start; idx < col_end; idx++) {
-          i_t row                        = (i_t)chunk.row_indices[idx];
-          size_t row_idx                 = (size_t)row;
-          size_t block_id                = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
-          size_t local                   = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
-          int32_t block_pos              = chunk.row_count_block_dir[block_id];
-          row_count_block_t& block       = chunk.row_count_blocks[(size_t)block_pos];
-          int64_t& write_pos             = chunk.row_count_storage[block.storage_offset + local];
-          i_t dest                       = (i_t)write_pos++;
-          state.problem.A_[dest]         = (f_t)chunk.values[idx];
-          state.problem.A_indices_[dest] = global_col;
+          i_t row                    = (i_t)chunk.row_indices[idx];
+          size_t row_idx             = (size_t)row;
+          size_t block_id            = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
+          size_t local               = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+          int32_t block_pos          = chunk.row_count_block_dir[block_id];
+          row_count_block_t& block   = chunk.row_count_blocks[(size_t)block_pos];
+          int64_t& write_pos         = chunk.row_count_storage[block.storage_offset + local];
+          i_t dest                   = (i_t)write_pos++;
+          state.temp_A[dest]         = (f_t)chunk.values[idx];
+          state.temp_A_indices[dest] = global_col;
         }
       }
 #ifdef MPS_FAST_PERF_COUNTERS
@@ -1905,6 +1748,66 @@ static void merge_chunk_results_to_csr(parse_state_t<i_t, f_t>& state,
   state.problem.nnz_    = (i_t)shape.total_nnz;
 }
 
+template <typename i_t, typename f_t>
+static void materialize_problem_csr(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("materialize_problem_csr");
+  size_t nnz              = state.temp_csr_nnz;
+  const char* env_threads = std::getenv("MPS_CSR_COPY_THREADS");
+  int copy_threads        = env_threads ? std::atoi(env_threads) : 2;
+  copy_threads            = std::max(1, std::min(copy_threads, MPS_LARGE_FILE_THREAD_CAP));
+
+  int resize_threads = copy_threads > 1 ? 2 : 1;
+#pragma omp parallel sections num_threads(resize_threads)
+  {
+#pragma omp section
+    {
+      state.problem.A_.resize(nnz);
+    }
+#pragma omp section
+    {
+      state.problem.A_indices_.resize(nnz);
+    }
+  }
+
+  size_t value_bytes = nnz * sizeof(f_t);
+  size_t index_bytes = nnz * sizeof(i_t);
+  size_t total_bytes = value_bytes + index_bytes;
+  // Copy A_ and A_indices overlapping with the other phases
+  // this hides the latency costs of heap alloc and default init with other parsing/IO
+  // instead of making it blocking for the column parse
+  // TODO: just have A_ and A_indices_ be mmap anon allocs directly in the mps_data_model_t
+  // but that'd require careful work around avoiding breaking changes and the API esp cython stuff
+  if (total_bytes != 0) {
+#pragma omp parallel for num_threads(copy_threads) schedule(static)
+    for (int t = 0; t < copy_threads; ++t) {
+      size_t begin = (total_bytes * (size_t)t) / (size_t)copy_threads;
+      size_t end   = (total_bytes * (size_t)(t + 1)) / (size_t)copy_threads;
+      if (begin < value_bytes) {
+        size_t value_end = std::min(end, value_bytes);
+        if (value_end > begin) {
+          std::memcpy((char*)state.problem.A_.data() + begin,
+                      (const char*)state.temp_A + begin,
+                      value_end - begin);
+        }
+      }
+      if (end > value_bytes) {
+        size_t index_begin = begin > value_bytes ? begin - value_bytes : 0;
+        size_t index_end   = end - value_bytes;
+        std::memcpy((char*)state.problem.A_indices_.data() + index_begin,
+                    (const char*)state.temp_A_indices + index_begin,
+                    index_end - index_begin);
+      }
+    }
+  }
+
+  state.temp_A                = nullptr;
+  state.temp_A_indices        = nullptr;
+  state.temp_csr_materialized = true;
+  state.temp_A_region.reset();
+  state.temp_A_indices_region.reset();
+}
+
 template <typename i_t, typename f_t>
 static void parse_columns_section_parallel(parse_state_t<i_t, f_t>& state,
                                            int num_threads,
@@ -2891,6 +2794,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
   int rhs_ready = 0, bounds_ready = 0, ranges_ready = 0, quadratic_ready = 0;
   int header_done = 0, rows_done = 0, columns_done = 0;
   int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0;
+  int csr_done = 0;
 
   const std::size_t parser_size = std::max(stream.reserve_size_hint(), input.compressed_size);
   const int parser_threads      = parser_thread_cap_for_size(parser_size);
@@ -2988,6 +2892,14 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
         });
       }
 
+#pragma omp task depend(in : columns_done) depend(out : csr_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_materialize_csr", nvtx::colors::alloc);
+          materialize_problem_csr(state);
+        });
+      }
+
 #pragma omp task depend(in : rhs_ready, columns_done) depend(out : rhs_done)
       {
         run_parser_task([&] {
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
deleted file mode 100644
index 0d14f059bc..0000000000
--- a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/* clang-format off */
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- */
-/* clang-format on */
-
-#include <cuopt/linear_programming/io/parser.hpp>
-
-#include "fast_parser.hpp"
-
-#include <utilities/logger.hpp>
-
-#include <cstdint>
-
-namespace cuopt::linear_programming::io {
-
-template <typename i_t, typename f_t>
-mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path)
-{
-  CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str());
-  return mps_fast::parse_mps_fast_file<i_t, f_t>(mps_file_path);
-}
-
-template mps_data_model_t<int, float> read_mps_fast_experimental(const std::string& mps_file_path);
-template mps_data_model_t<int, double> read_mps_fast_experimental(const std::string& mps_file_path);
-template mps_data_model_t<int64_t, float> read_mps_fast_experimental(
-  const std::string& mps_file_path);
-template mps_data_model_t<int64_t, double> read_mps_fast_experimental(
-  const std::string& mps_file_path);
-
-}  // namespace cuopt::linear_programming::io
diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
index 7d367db941..4d47b23c10 100644
--- a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
+++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
@@ -5,14 +5,33 @@
 
 #pragma once
 
+#include "mmap_region.hpp"
+
+#include <cuda/cmath>
+
 #include <simde/x86/avx2.h>
 
+#include <sys/mman.h>
+
+#include <algorithm>
+#include <array>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
+#ifdef MPS_FAST_PERF_COUNTERS
+#include <cstdio>
+#endif
+#include <limits>
+#include <string_view>
+#include <unordered_map>
 
 namespace mps_fast {
 
+// below this threshold, the serial row-hash build is usually cheaper than partition setup
+inline constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * 1024;
+inline constexpr int MPS_ROW_HASH_PARTITION_BITS          = 5;
+inline constexpr size_t MPS_ROW_HASH_PARTITIONS           = (1 << MPS_ROW_HASH_PARTITION_BITS);
+
 // FNV-1a over bytes in reverse order; row names commonly share long prefixes.
 static inline uint32_t fnv1a_hash(const char* ptr, std::size_t len)
 {
@@ -30,9 +49,6 @@ static inline uint32_t fnv1a_hash(const char* ptr, std::size_t len)
 }
 
 // 28-byte inline key + uint32 payload: two slots per 64-byte cache line.
-// key_store writes a full 32-byte vector starting at key[0], so callers must
-// publish the payload after storing the key. key_cmpeq masks those payload lanes
-// away, leaving the trailing uint32 free for the row index + 1 sentinel.
 struct alignas(32) hash_slot_28_t {
   char key[28];
   uint32_t count;
@@ -65,4 +81,224 @@ static inline void key_store(char* slot_key, hash_key_t key)
   simde_mm256_store_si256(reinterpret_cast<simde__m256i*>(slot_key), key);
 }
 
+struct hash_partition_t {
+  hash_slot_var_t* slots = nullptr;
+  size_t buckets         = 0;
+  size_t mask            = 0;
+};
+
+static inline size_t hash_partition_for(uint32_t hash)
+{
+  return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS));
+}
+
+static inline size_t hash_bucket_count_for(size_t n_rows, bool compact)
+{
+  if (compact) { return cuda::next_power_of_two(std::max(n_rows + n_rows / 2, (size_t)64)); }
+  return cuda::next_power_of_two(std::max(n_rows * 2, (size_t)64));
+}
+
+static inline size_t hash_lookup_in(
+  const hash_slot_var_t* slots, size_t buckets, size_t mask, hash_key_t key, uint32_t hash)
+{
+  const hash_slot_var_t* slot = &slots[hash & (uint32_t)mask];
+  for (size_t i = 0; i < buckets; ++i, ++slot) {
+    if (slot >= &slots[buckets]) { slot = &slots[0]; }
+    if (slot->count == 0) { return std::numeric_limits<size_t>::max(); }
+    if (key_cmpeq(slot->key, key)) { return slot->count - 1; }
+  }
+  return std::numeric_limits<size_t>::max();
+}
+
+static inline size_t hash_insert_into(hash_slot_var_t* slots,
+                                      size_t buckets,
+                                      size_t mask,
+                                      std::string_view name,
+                                      uint32_t hash,
+                                      size_t index)
+{
+  hash_key_t key        = make_key(name.data(), name.size());
+  hash_slot_var_t* slot = &slots[hash & (uint32_t)mask];
+  for (size_t i = 0; i < buckets; ++i, ++slot) {
+    if (slot >= &slots[buckets]) { slot = &slots[0]; }
+    if (slot->count == 0) {
+      key_store(slot->key, key);
+      slot->count = (uint32_t)(index + 1);
+      return i + 1;
+    }
+    if (key_cmpeq(slot->key, key)) {
+      slot->count = (uint32_t)(index + 1);
+      return i + 1;
+    }
+  }
+  __builtin_unreachable();
+}
+
+#ifdef MPS_FAST_PERF_COUNTERS
+struct hash_build_probe_stats_t {
+  size_t total_probes = 0;
+  size_t max_probes   = 0;
+  size_t long_names   = 0;
+
+  void seed_long_names(size_t n) { long_names = n; }
+
+  void record_insert(size_t probes)
+  {
+    if (probes == 0) {
+      ++long_names;
+    } else {
+      total_probes += probes;
+      max_probes = std::max(max_probes, probes);
+    }
+  }
+
+  void merge(const hash_build_probe_stats_t& other)
+  {
+    total_probes += other.total_probes;
+    max_probes = std::max(max_probes, other.max_probes);
+    long_names += other.long_names;
+  }
+};
+#endif
+
+class smallstr_hash_table_t {
+ public:
+  void note_long_name(std::string_view name, size_t index) { long_names_[name] = index; }
+
+  size_t long_name_count() const { return long_names_.size(); }
+
+  void reset_build_probe_stats()
+  {
+#ifdef MPS_FAST_PERF_COUNTERS
+    build_probe_stats_ = {};
+    build_probe_stats_.seed_long_names(long_names_.size());
+    partition_probe_stats_ = {};
+#endif
+  }
+
+  void print_build_probe_report(size_t n_rows) const
+  {
+#ifdef MPS_FAST_PERF_COUNTERS
+    hash_build_probe_stats_t stats = build_probe_stats_;
+    if (partition_count_ != 0) {
+      for (size_t p = 0; p < partition_count_; ++p) {
+        stats.merge(partition_probe_stats_[p]);
+      }
+    }
+    size_t probed_rows = n_rows - stats.long_names;
+    double mean_probes = probed_rows == 0 ? 0.0 : (double)stats.total_probes / (double)probed_rows;
+    double load_factor = buckets_ == 0 ? 0.0 : (double)n_rows / (double)buckets_;
+    std::fprintf(stderr,
+                 "[ROW_HASH_PROBES] rows=%zu buckets=%zu load=%.3f long=%zu mean=%.3f max=%zu\n",
+                 n_rows,
+                 buckets_,
+                 load_factor,
+                 stats.long_names,
+                 mean_probes,
+                 stats.max_probes);
+#endif
+  }
+
+  void configure_serial_buckets(size_t n_rows, bool compact)
+  {
+    partition_count_ = 0;
+    buckets_         = hash_bucket_count_for(n_rows, compact);
+    mask_            = buckets_ - 1;
+  }
+
+  void configure_partitioned_buckets(
+    const std::array<size_t, MPS_ROW_HASH_PARTITIONS>& partition_counts, bool compact)
+  {
+    partition_count_ = MPS_ROW_HASH_PARTITIONS;
+    buckets_         = 0;
+    for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
+      partitions_[p].buckets = hash_bucket_count_for(partition_counts[p], compact);
+      partitions_[p].mask    = partitions_[p].buckets - 1;
+      buckets_ += partitions_[p].buckets;
+    }
+    mask_ = buckets_ - 1;
+  }
+
+  void allocate_mmap(const char* label)
+  {
+    size_t mmap_size = buckets_ * sizeof(hash_slot_var_t);
+    region_ = mmap_region_t::anonymous(mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, label);
+    slots_  = (hash_slot_var_t*)region_.data();
+    if (partition_count_ != 0) {
+      hash_slot_var_t* next_slots = slots_;
+      for (size_t p = 0; p < partition_count_; ++p) {
+        partitions_[p].slots = next_slots;
+        next_slots += partitions_[p].buckets;
+      }
+    }
+    region_.advise(MADV_HUGEPAGE);
+  }
+
+  mmap_region_t& region() noexcept { return region_; }
+  const mmap_region_t& region() const noexcept { return region_; }
+
+  hash_slot_var_t* slots() noexcept { return slots_; }
+  const hash_slot_var_t* slots() const noexcept { return slots_; }
+
+  size_t buckets() const noexcept { return buckets_; }
+  size_t mask() const noexcept { return mask_; }
+  size_t partition_count() const noexcept { return partition_count_; }
+
+  const hash_partition_t& partition(size_t p) const noexcept { return partitions_[p]; }
+
+  size_t lookup(std::string_view name) const
+  {
+    if (name.size() > HASH_KEY_BYTES) {
+      auto it = long_names_.find(name);
+      return it != long_names_.end() ? it->second : std::numeric_limits<size_t>::max();
+    }
+    hash_key_t key = make_key(name.data(), name.size());
+    uint32_t hash  = fnv1a_hash(name.data(), name.size());
+    if (partition_count_ != 0) {
+      const auto& part = partitions_[hash_partition_for(hash)];
+      return hash_lookup_in(part.slots, part.buckets, part.mask, key, hash);
+    }
+    return hash_lookup_in(slots_, buckets_, mask_, key, hash);
+  }
+
+  size_t insert_serial(std::string_view name, size_t index)
+  {
+    size_t probes;
+    if (name.size() > HASH_KEY_BYTES) {
+      note_long_name(name, index);
+      probes = 0;
+    } else {
+      probes = hash_insert_into(
+        slots_, buckets_, mask_, name, fnv1a_hash(name.data(), name.size()), index);
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    build_probe_stats_.record_insert(probes);
+#endif
+    return probes;
+  }
+
+  size_t insert_partition(size_t partition, std::string_view name, uint32_t hash, size_t index)
+  {
+    const auto& part = partitions_[partition];
+    size_t probes    = hash_insert_into(part.slots, part.buckets, part.mask, name, hash, index);
+#ifdef MPS_FAST_PERF_COUNTERS
+    partition_probe_stats_[partition].record_insert(probes);
+#endif
+    return probes;
+  }
+
+ private:
+  mmap_region_t region_;
+  hash_slot_var_t* slots_ = nullptr;
+  size_t buckets_         = 0;
+  size_t mask_            = 0;
+  size_t partition_count_ = 0;
+  std::array<hash_partition_t, MPS_ROW_HASH_PARTITIONS> partitions_{};
+  std::unordered_map<std::string_view, size_t> long_names_{};
+#ifdef MPS_FAST_PERF_COUNTERS
+  hash_build_probe_stats_t build_probe_stats_{};
+  std::array<hash_build_probe_stats_t, MPS_ROW_HASH_PARTITIONS> partition_probe_stats_{};
+#endif
+};
+
 }  // namespace mps_fast
diff --git a/cpp/src/io/parser.cpp b/cpp/src/io/parser.cpp
index 93d9d9c73c..6392833ce3 100644
--- a/cpp/src/io/parser.cpp
+++ b/cpp/src/io/parser.cpp
@@ -7,8 +7,13 @@
 
 #include <cuopt/linear_programming/io/parser.hpp>
 
+#include <experimental_mps_fast/fast_parser.hpp>
 #include <mps_parser_internal.hpp>
 
+#include <utilities/logger.hpp>
+
+#include <cstdint>
+
 namespace cuopt::linear_programming::io {
 
 template <typename i_t, typename f_t>
@@ -35,4 +40,18 @@ template mps_data_model_t<int, float> read_mps_from_string(std::string_view mps_
 template mps_data_model_t<int, double> read_mps_from_string(std::string_view mps_contents,
                                                             bool fixed_mps_format);
 
+template <typename i_t, typename f_t>
+mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path)
+{
+  CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str());
+  return mps_fast::parse_mps_fast_file<i_t, f_t>(mps_file_path);
+}
+
+template mps_data_model_t<int, float> read_mps_fast_experimental(const std::string& mps_file_path);
+template mps_data_model_t<int, double> read_mps_fast_experimental(const std::string& mps_file_path);
+template mps_data_model_t<int64_t, float> read_mps_fast_experimental(
+  const std::string& mps_file_path);
+template mps_data_model_t<int64_t, double> read_mps_fast_experimental(
+  const std::string& mps_file_path);
+
 }  // namespace cuopt::linear_programming::io

From 26141370ec40acf7a80065f18c935a7269251a3e Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Fri, 12 Jun 2026 03:53:25 -0700
Subject: [PATCH 11/22] more cleanup, fix some edge case failures

---
 .../fast_fp64_parser.hpp                      |   4 +-
 .../fast_parse_primitives.hpp                 |   4 +-
 .../io/experimental_mps_fast/fast_parser.cpp  | 129 ++++-----
 .../io/experimental_mps_fast/fast_parser.hpp  |   6 +-
 .../io/experimental_mps_fast/file_reader.cpp  | 151 ++++------
 .../io/experimental_mps_fast/file_reader.hpp  | 148 ++++++++--
 .../hash_table_smallstr.hpp                   |   4 +-
 .../experimental_mps_fast/lz4_file_reader.cpp | 270 ++++++++----------
 .../io/experimental_mps_fast/mmap_region.hpp  |   4 +-
 .../mps_section_scanner.cpp                   |  42 ++-
 .../mps_section_scanner.hpp                   |  21 +-
 .../io/experimental_mps_fast/nvtx_ranges.hpp  |  21 +-
 cpp/src/io/mps_parser.cpp                     |   6 +-
 cpp/src/io/parser.cpp                         |   2 +-
 cpp/src/utilities/perf_counters.hpp           |   4 +-
 .../fast_fp64_parser_test.cpp                 |  41 +--
 .../fast_parser_edge_test.cpp                 | 194 +++++--------
 cpp/tests/linear_programming/parser_test.cpp  |  16 +-
 18 files changed, 548 insertions(+), 519 deletions(-)

diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
index b7987738fc..f007c0f707 100644
--- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
@@ -15,7 +15,7 @@
 #include <stdexcept>
 #include <string_view>
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 using cuopt::linear_programming::io::error_type_t;
 using cuopt::linear_programming::io::mps_parser_expects;
@@ -428,4 +428,4 @@ static inline double parse_fp64_advance(const char*& p, const char* end)
 }
 
 }  // namespace fp64
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
index f35726a118..f77e14a410 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -20,7 +20,7 @@
 #define UNLIKELY(x) __builtin_expect(!!(x), 0)
 #endif
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 enum scan_mode {
   skip_whitespace,
@@ -379,4 +379,4 @@ static inline bool accept_comment(cursor_t& cursor)
   return false;
 }
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index 3e47c7ee8c..35e83a01aa 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -26,6 +26,7 @@
 #include <cctype>
 #include <cerrno>
 #include <chrono>
+#include <concepts>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
@@ -47,7 +48,7 @@
 #define MPS_FAST_COMPACT_ROW_HASH
 #define MPS_FAST_THP_PREFAULT
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 static constexpr size_t KiB = 1024;
 static constexpr size_t MiB = 1024 * KiB;
@@ -171,12 +172,11 @@ enum class materialize_touch_t {
 
 // instanciate a range using mmap anon pages with hugepage hints, and materialize them
 // by touching each to nudge the kernel into invoking its THP mechanism
-static void materialize_hugepages(const char* label,
+static void materialize_hugepages([[maybe_unused]] const char* label,
                                   void* data,
                                   size_t bytes,
                                   materialize_touch_t touch)
 {
-  (void)label;
   if (data == nullptr || bytes == 0) return;
 
   constexpr size_t two_mb = 2 * MiB;
@@ -208,7 +208,7 @@ static void materialize_vector_hugepages(const char* label,
 
 class scoped_timer_t {
  public:
-  scoped_timer_t(const char* name, double* accumulator = nullptr)
+  scoped_timer_t([[maybe_unused]] const char* name, double* accumulator = nullptr)
 #ifdef MPS_FAST_TIMERS
     : name_(name),
       accumulator_(accumulator),
@@ -217,7 +217,6 @@ class scoped_timer_t {
 #else
     : accumulator_(accumulator)
   {
-    (void)name;
   }
 #endif
 
@@ -441,7 +440,7 @@ static inline void observe_dense_name(bool& candidate,
 
 template <typename i_t, typename f_t>
 struct parse_state_t {
-  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& problem;
+  mps_data_model_t<i_t, f_t>& problem;
   cursor_t& cursor;
 
   // backed by the input buffer
@@ -484,10 +483,7 @@ struct parse_state_t {
   // some writers introduce zero-column variables only in BOUNDS.
   std::map<std::string_view, bounds_only_var_t> bounds_only_vars;
 
-  parse_state_t(cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& p, cursor_t& c)
-    : problem(p), cursor(c)
-  {
-  }
+  parse_state_t(mps_data_model_t<i_t, f_t>& p, cursor_t& c) : problem(p), cursor(c) {}
 
   void init_row_hash_table()
   {
@@ -718,7 +714,8 @@ static void parse_objname_section(parse_state_t<i_t, f_t>& state)
 {
   scoped_timer_t timer("parse_objname");
   if (accept(state.cursor, "OBJNAME")) {
-    if (!state.cursor.eol()) { state.objective_name_sv = state.cursor.read_rest_of_line_trimmed(); }
+    if (state.cursor.eol()) { expect_eol(state.cursor); }
+    state.objective_name_sv = state.cursor.read_field();
     accept_comment(state.cursor);
     expect_eol(state.cursor);
   }
@@ -1752,10 +1749,9 @@ template <typename i_t, typename f_t>
 static void materialize_problem_csr(parse_state_t<i_t, f_t>& state)
 {
   scoped_timer_t timer("materialize_problem_csr");
-  size_t nnz              = state.temp_csr_nnz;
-  const char* env_threads = std::getenv("MPS_CSR_COPY_THREADS");
-  int copy_threads        = env_threads ? std::atoi(env_threads) : 2;
-  copy_threads            = std::max(1, std::min(copy_threads, MPS_LARGE_FILE_THREAD_CAP));
+  size_t nnz       = state.temp_csr_nnz;
+  int copy_threads = 2;
+  copy_threads     = std::max(1, std::min(copy_threads, MPS_LARGE_FILE_THREAD_CAP));
 
   int resize_threads = copy_threads > 1 ? 2 : 1;
 #pragma omp parallel sections num_threads(resize_threads)
@@ -1904,8 +1900,7 @@ static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
   };
 
   while (cursor.ptr < cursor.end) {
-    auto rhs_name = cursor.read_field();
-    (void)rhs_name;
+    [[maybe_unused]] auto rhs_name = cursor.read_field();
     if (accept_comment(cursor)) {
       expect_eol(cursor);
       continue;
@@ -2075,9 +2070,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
             continue;
           }
 
-          auto bound_name = cursor.read_field();
-          (void)bound_name;
-          auto var_name = cursor.read_field();
+          [[maybe_unused]] auto bound_name = cursor.read_field();
+          auto var_name                    = cursor.read_field();
           if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) {
             cursor.skip_to_eol();
             expect_eol(cursor);
@@ -2274,10 +2268,9 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
   {
     scoped_timer_t timer("parse_bounds");
     while (!cursor.done()) {
-      auto bound_type = cursor.read_field();
-      auto bound_name = cursor.read_field();
-      (void)bound_name;
-      auto var_name = cursor.read_field();
+      auto bound_type                  = cursor.read_field();
+      [[maybe_unused]] auto bound_name = cursor.read_field();
+      auto var_name                    = cursor.read_field();
       if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) {
         cursor.skip_to_eol();
         expect_eol(cursor);
@@ -2335,7 +2328,7 @@ static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
         }
         cursor.error("%s", msg);
       };
-      (void)apply_bound_record(
+      [[maybe_unused]] bool bound_applied = apply_bound_record(
         bound_type, value, has_value, first_bound_for_var, set_lb, set_ub, set_type, set_error);
       if (aux_var == nullptr) { mark_bound(var_idx); }
 
@@ -2401,8 +2394,7 @@ static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& curso
   };
 
   while (cursor.ptr < cursor.end) {
-    auto range_name = cursor.read_field();
-    (void)range_name;
+    [[maybe_unused]] auto range_name = cursor.read_field();
     if (accept_comment(cursor)) {
       expect_eol(cursor);
       continue;
@@ -2716,8 +2708,8 @@ static void append_bounds_only_variables(parse_state_t<i_t, f_t>& state)
 }
 
 template <typename i_t, typename f_t>
-static std::size_t init_problem_storage(
-  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& problem, std::size_t reserve_hint)
+static std::size_t init_problem_storage(mps_data_model_t<i_t, f_t>& problem,
+                                        std::size_t reserve_hint)
 {
   problem.n_vars_                   = 0;
   problem.n_constraints_            = 0;
@@ -2741,15 +2733,30 @@ static std::size_t init_problem_storage(
   return reserve_dim;
 }
 
-template <typename Stream, typename i_t, typename f_t>
-static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_stream(
-  Stream& stream, const char* total_timer_name, const char* producer_task_name)
+// Contract every input stream fed to parse_mps_fast_stream must satisfy.
+template <typename Stream>
+concept InputStream = requires(Stream stream)
+{
+  {stream.data()}->std::convertible_to<const char*>;
+  {stream.mutable_data()}->std::convertible_to<char*>;
+  {stream.size()}->std::convertible_to<std::size_t>;
+  {stream.compressed_size()}->std::convertible_to<std::size_t>;
+  {stream.reserve_size_hint()}->std::convertible_to<std::size_t>;
+  {stream.registry()}->std::same_as<mps_phase_registry_t&>;
+  {stream.view()}->std::same_as<input_stream_view_t>;
+  {stream.run_decode_tasks()}->std::same_as<void>;
+};
+
+template <InputStream Stream, typename i_t, typename f_t>
+static mps_data_model_t<i_t, f_t> parse_mps_fast_stream(Stream& stream,
+                                                        const char* total_timer_name,
+                                                        const char* producer_task_name)
 {
   omp_max_active_levels_guard_t omp_active_levels(2);
 
   input_stream_view_t input = stream.view();
   auto total_timer          = std::make_unique<scoped_timer_t>(total_timer_name);
-  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> problem;
+  mps_data_model_t<i_t, f_t> problem;
   std::size_t reserve_dim = init_problem_storage(problem, stream.reserve_size_hint());
 
   cursor_t cursor(input.data, 0);
@@ -2758,24 +2765,14 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
 
   auto phase_end = [](const char*) { flush_timers(); };
 
-  std::mutex task_error_mutex;
-  std::exception_ptr first_task_error = nullptr;
-  std::atomic<bool> task_failed{false};
-
-  auto mark_task_error = [&](std::exception_ptr eptr) {
-    {
-      std::lock_guard<std::mutex> lock(task_error_mutex);
-      if (!first_task_error) { first_task_error = eptr; }
-    }
-    task_failed.store(true, std::memory_order_release);
-  };
+  parallel_error_latch_t parser_tasks;
 
   auto run_parser_task = [&](auto&& fn) {
-    if (task_failed.load(std::memory_order_acquire)) { return; }
+    if (parser_tasks.stopped()) { return; }
     try {
       fn();
     } catch (...) {
-      mark_task_error(std::current_exception());
+      parser_tasks.capture(std::current_exception());
     }
   };
 
@@ -2851,7 +2848,7 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
         try {
           stream.run_decode_tasks();
         } catch (...) {
-          mark_task_error(std::current_exception());
+          parser_tasks.capture(std::current_exception());
           unblock_phase_waiters_after_error();
         }
       }
@@ -2940,19 +2937,20 @@ static cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_
     }
   }
 
-  if (first_task_error) { std::rethrow_exception(first_task_error); }
+  parser_tasks.rethrow_if_error();
 
   append_bounds_only_variables(state);
 
   input.size = stream.size();
   cursor.end = input.data + input.size;
-  if (!input.registry->endata_ready() || !input.registry->endata_present()) {
-    cursor.ptr =
-      input.registry->endata_ready() ? input.registry->endata_begin() : input.data + input.size;
-    cursor.error("missing ENDATA");
+  if (!input.registry->endata_ready()) {
+    cursor.ptr = input.data + input.size;
+    cursor.error("input ended before ENDATA boundary was resolved");
+  }
+  if (input.registry->endata_present()) {
+    cursor.ptr = input.registry->endata_begin();
+    expect(cursor, "ENDATA");
   }
-  cursor.ptr = input.registry->endata_begin();
-  expect(cursor, "ENDATA");
 
   total_timer.reset();
   flush_timers();
@@ -2967,7 +2965,7 @@ struct padded_memory_input_t {
 
 static padded_memory_input_t read_compressed_mps_file(const std::string& path)
 {
-  std::vector<char> buffer = cuopt::linear_programming::io::detail::file_to_string(path);
+  std::vector<char> buffer = file_to_string(path);
   if (buffer.empty()) { buffer.push_back('\0'); }
 
   std::size_t input_size = buffer.size() - 1;
@@ -2976,8 +2974,7 @@ static padded_memory_input_t read_compressed_mps_file(const std::string& path)
 }
 
 template <typename i_t, typename f_t>
-cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
-  const std::string& path, FileReadMethod read_method)
+mps_data_model_t<i_t, f_t> parse_mps_fast_file(const std::string& path, FileReadMethod read_method)
 {
   FileReadMethod effective_method = effective_file_read_method(path, read_method);
   switch (effective_method) {
@@ -3006,13 +3003,13 @@ cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> parse_mps_fast_file(
   __builtin_unreachable();
 }
 
-template cuopt::linear_programming::io::mps_data_model_t<int, float> parse_mps_fast_file(
-  const std::string& path, FileReadMethod read_method);
-template cuopt::linear_programming::io::mps_data_model_t<int, double> parse_mps_fast_file(
-  const std::string& path, FileReadMethod read_method);
-template cuopt::linear_programming::io::mps_data_model_t<int64_t, float> parse_mps_fast_file(
-  const std::string& path, FileReadMethod read_method);
-template cuopt::linear_programming::io::mps_data_model_t<int64_t, double> parse_mps_fast_file(
-  const std::string& path, FileReadMethod read_method);
+template mps_data_model_t<int, float> parse_mps_fast_file(const std::string& path,
+                                                          FileReadMethod read_method);
+template mps_data_model_t<int, double> parse_mps_fast_file(const std::string& path,
+                                                           FileReadMethod read_method);
+template mps_data_model_t<int64_t, float> parse_mps_fast_file(const std::string& path,
+                                                              FileReadMethod read_method);
+template mps_data_model_t<int64_t, double> parse_mps_fast_file(const std::string& path,
+                                                               FileReadMethod read_method);
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
index 9f6f0f107b..6047a55f05 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
@@ -10,13 +10,13 @@
 #include <cstddef>
 #include <string>
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 template <typename i_t, typename f_t>
-using parser_model_t = cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>;
+using parser_model_t = mps_data_model_t<i_t, f_t>;
 
 template <typename i_t, typename f_t>
 parser_model_t<i_t, f_t> parse_mps_fast_file(const std::string& path,
                                              FileReadMethod read_method = FileReadMethod::Read);
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index 5eae15a46a..e874011db8 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -26,7 +26,7 @@
 #include <utility>
 #include <vector>
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 using cuopt::linear_programming::io::error_type_t;
 using cuopt::linear_programming::io::mps_parser_fail;
@@ -104,6 +104,27 @@ std::size_t system_page_size()
   return page_size;
 }
 
+bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset)
+{
+  std::size_t done = 0;
+  while (done < bytes) {
+    std::size_t remaining = bytes - done;
+    std::size_t chunk =
+      std::min<std::size_t>(remaining, (std::size_t)std::numeric_limits<ssize_t>::max());
+    ssize_t got = ::pread(fd, dst + done, chunk, (off_t)(offset + done));
+    if (got < 0) {
+      if (errno == EINTR) { continue; }
+      return false;
+    }
+    if (got == 0) {
+      errno = EIO;
+      return false;
+    }
+    done += (std::size_t)got;
+  }
+  return true;
+}
+
 raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
 {
   MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io);
@@ -118,9 +139,6 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
   file_size_         = get_file_size(buffered_fd_, path);
   fd_                = buffered_fd_;
   bool use_direct_io = file_size_ > raw_input_direct_io_threshold_bytes;
-  if (const char* raw_direct = std::getenv("MPS_FAST_RAW_DIRECT_IO")) {
-    use_direct_io = raw_direct[0] != '0';
-  }
   if (use_direct_io) {
 #ifdef O_DIRECT
     int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT);
@@ -140,8 +158,6 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
   output_data_ = output_region_.char_data();
   output_region_.advise(MADV_HUGEPAGE);
 
-  block_done_.resize(window_count_, 0);
-  block_end_.resize(window_count_, 0);
   section_scanner_ =
     std::make_unique<mps_section_block_scanner_t>(output_data_, window_count_, registry_);
 }
@@ -157,10 +173,20 @@ char* raw_input_stream_t::mutable_data() noexcept { return output_data_; }
 std::size_t raw_input_stream_t::size() const noexcept { return output_view_size_; }
 std::size_t raw_input_stream_t::compressed_size() const noexcept { return file_size_; }
 std::size_t raw_input_stream_t::reserve_size_hint() const noexcept { return file_size_; }
-mps_phase_registry_t& raw_input_stream_t::registry() noexcept { return registry_; }
-input_stream_view_t raw_input_stream_t::view() noexcept
+
+void raw_input_stream_t::read_window_payload(std::size_t offset, std::size_t size)
 {
-  return {output_data_, output_data_, output_view_size_, file_size_, &registry_};
+  if (pread_full(fd_, output_data_ + offset, size, offset)) { return; }
+  // O_DIRECT can reject an unaligned request with EINVAL; fall back to the
+  // buffered descriptor for this window when that happens.
+  if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0 &&
+      pread_full(buffered_fd_, output_data_ + offset, size, offset)) {
+    return;
+  }
+  mps_parser_fail(error_type_t::RuntimeError,
+                  "Failed to pread raw MPS file '%s': %s",
+                  path_.c_str(),
+                  std::strerror(errno));
 }
 
 void raw_input_stream_t::run_decode_tasks()
@@ -177,96 +203,24 @@ void raw_input_stream_t::run_decode_tasks()
   std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads);
   thread_count             = std::max<std::size_t>(1, std::min(thread_count, window_count_));
 
-  std::atomic_size_t next_window{0};
-  std::exception_ptr first_error = nullptr;
-  std::mutex error_mutex;
-  std::atomic_bool stop{false};
-
-  auto mark_error = [&](std::exception_ptr eptr) {
-    std::lock_guard<std::mutex> lock(error_mutex);
-    if (!first_error) {
-      first_error = eptr;
-      stop.store(true, std::memory_order_release);
-    }
-  };
-
-  auto read_window = [&](std::size_t index) {
-    MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io);
-    std::size_t offset = index * window_bytes_;
-    std::size_t size   = std::min(window_bytes_, file_size_ - offset);
-    std::size_t done   = 0;
-    {
-      MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io);
-      while (done < size) {
-        ssize_t got =
-          ::pread(fd_, output_data_ + offset + done, size - done, (off_t)(offset + done));
-        if (got < 0) {
-          if (errno == EINTR) { continue; }
-          if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0) {
-            got = ::pread(
-              buffered_fd_, output_data_ + offset + done, size - done, (off_t)(offset + done));
-            if (got >= 0) {
-              done += (std::size_t)got;
-              continue;
-            }
-            if (errno == EINTR) { continue; }
-          }
-          mps_parser_fail(error_type_t::RuntimeError,
-                          "Failed to pread raw MPS file '%s': %s",
-                          path_.c_str(),
-                          std::strerror(errno));
-        }
-        if (got == 0) {
-          mps_parser_fail(error_type_t::RuntimeError,
-                          "Unexpected EOF while reading raw MPS file '%s'",
-                          path_.c_str());
-        }
-        done += (std::size_t)got;
+  // Each window is read independently and handed to the scanner, which owns the
+  // contiguous decoded-byte frontier and the parallel section publication.
+  parallel_error_latch_t latch;
+  parallel_for_indexed(
+    window_count_, thread_count, latch, "raw-input-read-", [&](std::size_t index) {
+      MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io);
+      std::size_t offset = index * window_bytes_;
+      std::size_t size   = std::min(window_bytes_, file_size_ - offset);
+      {
+        MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io);
+        read_window_payload(offset, size);
       }
-    }
-
-    {
       MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io);
       section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size);
-      frontier_mutex_.lock();
-      block_done_[index] = 1;
-      block_end_[index]  = offset + size;
-      std::size_t before = ready_bytes_;
-      while (next_block_ < block_done_.size() && block_done_[next_block_]) {
-        ready_bytes_ = block_end_[next_block_];
-        ++next_block_;
-      }
-      std::size_t after = ready_bytes_;
-      frontier_mutex_.unlock();
-      if (after > before) { section_scanner_->publish_ready(after); }
-    }
-  };
-
-  std::vector<std::thread> workers;
-  workers.reserve(thread_count);
-  for (std::size_t t = 0; t < thread_count; ++t) {
-    workers.emplace_back([&, t] {
-      std::string thread_name = "raw-input-read-" + std::to_string(t);
-      nvtx::name_current_thread(thread_name.c_str());
-      MPS_NVTX_RANGE("raw_worker_loop", nvtx::colors::io);
-      while (!stop.load(std::memory_order_acquire)) {
-        std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed);
-        if (index >= window_count_) { break; }
-        try {
-          read_window(index);
-        } catch (...) {
-          mark_error(std::current_exception());
-          return;
-        }
-      }
     });
-  }
-  for (auto& worker : workers) {
-    worker.join();
-  }
-  if (first_error) { std::rethrow_exception(first_error); }
+  latch.rethrow_if_error();
 
-  output_view_size_ = ready_bytes_;
+  output_view_size_ = section_scanner_->ready_bytes();
   section_scanner_->publish_ready(output_view_size_);
 }
 
@@ -283,17 +237,12 @@ char* memory_input_stream_t::mutable_data() noexcept { return buffer_.data(); }
 std::size_t memory_input_stream_t::size() const noexcept { return input_size_; }
 std::size_t memory_input_stream_t::compressed_size() const noexcept { return compressed_size_; }
 std::size_t memory_input_stream_t::reserve_size_hint() const noexcept { return input_size_; }
-mps_phase_registry_t& memory_input_stream_t::registry() noexcept { return registry_; }
-input_stream_view_t memory_input_stream_t::view() noexcept
-{
-  return {buffer_.data(), buffer_.data(), input_size_, compressed_size_, &registry_};
-}
 
 void memory_input_stream_t::run_decode_tasks()
 {
   MPS_NVTX_RANGE("memory_input_scan", nvtx::colors::io);
+  // Single block: observe_block advances the frontier and publishes.
   section_scanner_->observe_block(0, buffer_.data(), buffer_.data() + input_size_);
-  section_scanner_->publish_ready(input_size_);
 }
 
 bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); }
@@ -332,4 +281,4 @@ const char* file_read_method_name(FileReadMethod method) noexcept
   }
 }
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
index b0089be257..802d6fe191 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.hpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -1,20 +1,45 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
 // reserved. SPDX-License-Identifier: Apache-2.0
 
+// Input layer for the fast MPS parser: turns on-disk bytes (plain or .lz4) into one
+// contiguous parse buffer and publishes MPS section boundaries as data becomes available.
+//
+// Model:
+//   - Output is an anonymous mmap'd buffer (THP-hinted, tail-padded for SIMD/cursor safety).
+//     Raw inputs pread directly into fixed slots; LZ4 decodes into the same layout.
+//   - Work is split into windows (fixed spans of compressed/raw file bytes). Workers use
+//     parallel_for_indexed() — std::thread + shared-index dispatch, not OpenMP — because
+//     blocking pread()/decode does not compose cleanly with OMP team barriers.
+//   - Each completed window/block is handed to mps_section_block_scanner_t::observe_block().
+//     Blocks may finish out of order; the scanner advances a contiguous ready_bytes_
+//     frontier and publishes section ranges into mps_phase_registry_t only once the prefix
+//     up to a section title is contiguous and scannable.
+//   - The parser runs as OpenMP tasks on those published phases while run_decode_tasks()
+//     (raw parallel pread, or the LZ4 reader → metadata scanner → decoder pipeline) fills
+//     the buffer on separate threads. parallel_error_latch_t propagates the first worker
+//     failure and stops the rest.
+//
+// LZ4 adds a resident-window pool (parallel pread of compressed spans), block metadata
+// scanning with ptr_if_contiguous()/copy_to for window-boundary payloads, parallel decode
+// workers, window ref-counting/release, and lazy commit_up_to() of decoded output pages.
+
 #pragma once
 
 #include "mmap_region.hpp"
 #include "mps_section_scanner.hpp"
+#include "nvtx_ranges.hpp"
 
 #include <atomic>
 #include <cstddef>
 #include <cstdint>
+#include <exception>
 #include <memory>
 #include <mutex>
 #include <string>
+#include <thread>
 #include <vector>
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 inline constexpr std::size_t input_buffer_padding_bytes = 64;
 
@@ -62,6 +87,80 @@ std::size_t system_page_size();
 std::size_t get_file_size(int fd, const std::string& path);
 std::size_t get_file_size(const std::string& path);
 
+/**
+ * @brief Read exactly @p bytes at @p offset into @p dst, retrying on EINTR.
+ *
+ * Returns false with errno set on failure; an unexpected EOF is reported as errno == EIO.
+ */
+bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset);
+
+// First-error-wins latch shared by the parallel reader/decoder pipelines. The
+// first captured exception is retained and a stop flag is raised so cooperating
+// workers can unwind promptly. The retained exception is rethrown by the
+// orchestrating thread once all workers have joined.
+class parallel_error_latch_t {
+ public:
+  void capture(std::exception_ptr eptr)
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!first_error_) {
+      first_error_ = eptr;
+      stopped_.store(true, std::memory_order_release);
+    }
+  }
+
+  bool stopped() const noexcept { return stopped_.load(std::memory_order_acquire); }
+
+  void rethrow_if_error() const
+  {
+    if (first_error_) { std::rethrow_exception(first_error_); }
+  }
+
+ private:
+  std::mutex mutex_;
+  std::exception_ptr first_error_ = nullptr;
+  std::atomic_bool stopped_{false};
+};
+
+// Dynamically scheduled parallel loop over [0, count). Each of thread_count workers pulls
+// the next index from a shared counter and invokes body(index). An exception
+// escaping body is captured into the latch and stops the loop; the caller is
+// responsible for calling latch.rethrow_if_error() after this returns. Workers
+// are named "<thread_name_prefix><worker-id>" when a prefix is supplied.
+// std::thread is used instead of OpenMP, which does not compose well with blocking pread().
+template <typename Body>
+void parallel_for_indexed(std::size_t count,
+                          std::size_t thread_count,
+                          parallel_error_latch_t& latch,
+                          const char* thread_name_prefix,
+                          Body body)
+{
+  std::atomic_size_t next{0};
+  std::vector<std::thread> workers;
+  workers.reserve(thread_count);
+  for (std::size_t t = 0; t < thread_count; ++t) {
+    workers.emplace_back([&, t] {
+      if (thread_name_prefix != nullptr) {
+        std::string name = thread_name_prefix + std::to_string(t);
+        nvtx::name_current_thread(name.c_str());
+      }
+      while (!latch.stopped()) {
+        std::size_t index = next.fetch_add(1, std::memory_order_relaxed);
+        if (index >= count) { break; }
+        try {
+          body(index);
+        } catch (...) {
+          latch.capture(std::current_exception());
+          return;
+        }
+      }
+    });
+  }
+  for (auto& worker : workers) {
+    worker.join();
+  }
+}
+
 struct input_stream_view_t {
   const char* data               = nullptr;
   char* mutable_data             = nullptr;
@@ -70,7 +169,28 @@ struct input_stream_view_t {
   mps_phase_registry_t* registry = nullptr;
 };
 
-class lz4_input_stream_t {
+/**
+ * @brief CRTP base supplying the registry and view() shared by every input
+ * stream. Derived classes provide data()/mutable_data()/size()/compressed_size().
+ */
+template <typename Derived>
+class input_stream_base_t {
+ public:
+  mps_phase_registry_t& registry() noexcept { return registry_; }
+
+  input_stream_view_t view() noexcept
+  {
+    auto* self = static_cast<Derived*>(this);
+    return {self->data(), self->mutable_data(), self->size(), self->compressed_size(), &registry_};
+  }
+
+ protected:
+  mps_phase_registry_t registry_;
+};
+
+// Handles lz4 compressed files (useful since lz4 is very fast, works well for MPS, and makes
+// parallel decompression trivial)
+class lz4_input_stream_t : public input_stream_base_t<lz4_input_stream_t> {
  public:
   explicit lz4_input_stream_t(const std::string& path);
   ~lz4_input_stream_t();
@@ -83,8 +203,6 @@ class lz4_input_stream_t {
   std::size_t size() const noexcept;
   std::size_t compressed_size() const noexcept;
   std::size_t reserve_size_hint() const noexcept;
-  mps_phase_registry_t& registry() noexcept;
-  input_stream_view_t view() noexcept;
 
   void run_decode_tasks();
 
@@ -108,13 +226,13 @@ class lz4_input_stream_t {
   bool block_checksum_               = false;
   bool content_checksum_             = false;
   bool dict_id_                      = false;
-  mps_phase_registry_t registry_;
   std::mutex commit_mutex_;
   std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
   std::size_t block_slot_count_ = 0;
 };
 
-class raw_input_stream_t {
+// Reads an uncompressed file from a path using parallel, fixed-size pread() windows
+class raw_input_stream_t : public input_stream_base_t<raw_input_stream_t> {
  public:
   explicit raw_input_stream_t(const std::string& path);
   ~raw_input_stream_t();
@@ -127,12 +245,12 @@ class raw_input_stream_t {
   std::size_t size() const noexcept;
   std::size_t compressed_size() const noexcept;
   std::size_t reserve_size_hint() const noexcept;
-  mps_phase_registry_t& registry() noexcept;
-  input_stream_view_t view() noexcept;
 
   void run_decode_tasks();
 
  private:
+  void read_window_payload(std::size_t offset, std::size_t size);
+
   std::string path_;
   int fd_          = -1;
   int buffered_fd_ = -1;
@@ -144,16 +262,11 @@ class raw_input_stream_t {
   std::size_t file_size_          = 0;
   std::size_t window_bytes_       = 0;
   std::size_t window_count_       = 0;
-  mps_phase_registry_t registry_;
-  std::mutex frontier_mutex_;
-  std::vector<unsigned char> block_done_;
-  std::vector<std::size_t> block_end_;
   std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
-  std::size_t next_block_  = 0;
-  std::size_t ready_bytes_ = 0;
 };
 
-class memory_input_stream_t {
+// Wraps an already-loaded in-memory buffer and scans it as a single block
+class memory_input_stream_t : public input_stream_base_t<memory_input_stream_t> {
  public:
   memory_input_stream_t(std::vector<char> buffer,
                         std::size_t input_size,
@@ -167,8 +280,6 @@ class memory_input_stream_t {
   std::size_t size() const noexcept;
   std::size_t compressed_size() const noexcept;
   std::size_t reserve_size_hint() const noexcept;
-  mps_phase_registry_t& registry() noexcept;
-  input_stream_view_t view() noexcept;
 
   void run_decode_tasks();
 
@@ -176,8 +287,7 @@ class memory_input_stream_t {
   std::vector<char> buffer_;
   std::size_t input_size_      = 0;
   std::size_t compressed_size_ = 0;
-  mps_phase_registry_t registry_;
   std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
 };
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
index 4d47b23c10..b7138fedb6 100644
--- a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
+++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
@@ -25,7 +25,7 @@
 #include <string_view>
 #include <unordered_map>
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 // below this threshold, the serial row-hash build is usually cheaper than partition setup
 inline constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * 1024;
@@ -301,4 +301,4 @@ class smallstr_hash_table_t {
 #endif
 };
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index 9c47ba63c7..2c40d6745b 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -38,7 +38,7 @@
 #include <utility>
 #include <vector>
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 using cuopt::linear_programming::io::error_type_t;
 using cuopt::linear_programming::io::mps_parser_expects;
@@ -108,15 +108,14 @@ const lz4_runtime_t& lz4_runtime()
 }
 #endif
 
-int lz4_decompress_safe_runtime(const char* src, char* dst, int compressed_size, int dst_capacity)
+int lz4_decompress_safe_runtime([[maybe_unused]] const char* src,
+                                [[maybe_unused]] char* dst,
+                                [[maybe_unused]] int compressed_size,
+                                [[maybe_unused]] int dst_capacity)
 {
 #if defined(MPS_PARSER_WITH_LZ4)
   return lz4_runtime().decompress_safe(src, dst, compressed_size, dst_capacity);
 #else
-  (void)src;
-  (void)dst;
-  (void)compressed_size;
-  (void)dst_capacity;
   mps_parser_fail(
     error_type_t::RuntimeError,
     "Experimental fast MPS parser was built without LZ4 decompression support. "
@@ -127,7 +126,7 @@ int lz4_decompress_safe_runtime(const char* src, char* dst, int compressed_size,
 void ensure_lz4_runtime_available()
 {
 #if defined(MPS_PARSER_WITH_LZ4)
-  (void)lz4_runtime();
+  [[maybe_unused]] auto& runtime = lz4_runtime();
 #else
   mps_parser_fail(
     error_type_t::RuntimeError,
@@ -176,27 +175,6 @@ std::size_t block_max_size_from_bd(unsigned char bd)
   }
 }
 
-bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset)
-{
-  std::size_t done = 0;
-  while (done < bytes) {
-    std::size_t remaining = bytes - done;
-    std::size_t chunk =
-      std::min<std::size_t>(remaining, (std::size_t)std::numeric_limits<ssize_t>::max());
-    ssize_t got = ::pread(fd, dst + done, chunk, (off_t)(offset + done));
-    if (got < 0) {
-      if (errno == EINTR) { continue; }
-      return false;
-    }
-    if (got == 0) {
-      errno = EIO;
-      return false;
-    }
-    done += (std::size_t)got;
-  }
-  return true;
-}
-
 struct lz4_resident_window_t {
   std::size_t index       = 0;
   std::size_t file_offset = 0;
@@ -210,6 +188,9 @@ class lz4_resident_windows_t {
   {
   }
 
+  // Compressed file bytes arrive in fixed resident windows; block payloads may span a boundary.
+  // Return a direct pointer when the whole payload sits in one window (LZ4 decompress + pin);
+  // otherwise nullptr and the caller stages via copy_to.
   const char* ptr_if_contiguous(std::size_t offset, std::size_t size) const
   {
     if (size == 0) return nullptr;
@@ -277,26 +258,29 @@ class lz4_resident_windows_t {
   std::vector<lz4_resident_window_t>& windows_;
 };
 
-}  // namespace
+// Parsed fields of the leading LZ4 frame descriptor (LZ4 frame format: magic, FLG, BD,
+// optional content size and dictionary id, then the header checksum byte).
+struct lz4_frame_header_t {
+  std::size_t block_max_size = 0;
+  std::size_t content_size   = 0;
+  std::size_t header_size    = 0;
+  bool content_size_present  = false;
+  bool block_checksum        = false;
+  bool content_checksum      = false;
+  bool dict_id               = false;
+};
 
-lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
+lz4_frame_header_t parse_lz4_frame_header(int fd,
+                                          const std::string& path,
+                                          std::size_t compressed_size)
 {
-  MPS_NVTX_RANGE("lz4_input_construct", nvtx::colors::io);
-
-  ensure_lz4_runtime_available();
-
-  fd_ = open_lz4_fd(path);
-  ::posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
-
-  compressed_size_ = get_file_size(fd_, path);
-
-  char header[32];
-  if (compressed_size_ < 7) {
+  if (compressed_size < 7) {
     mps_parser_fail(error_type_t::ValidationError,
                     "LZ4 input is too small to contain a frame header");
   }
-  std::size_t header_bytes = std::min<std::size_t>(sizeof(header), compressed_size_);
-  if (!pread_full_plain(fd_, header, header_bytes, 0)) {
+  char header[32];
+  std::size_t header_bytes = std::min<std::size_t>(sizeof(header), compressed_size);
+  if (!pread_full(fd, header, header_bytes, 0)) {
     mps_parser_fail(error_type_t::RuntimeError,
                     "Failed to read LZ4 frame header '%s': %s",
                     path.c_str(),
@@ -317,24 +301,26 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
     mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame version");
   }
   bool block_independent = (flg & 0x20u) != 0;
-  block_checksum_        = (flg & 0x10u) != 0;
-  content_size_present_  = (flg & 0x08u) != 0;
-  content_checksum_      = (flg & 0x04u) != 0;
-  dict_id_               = (flg & 0x01u) != 0;
   if (!block_independent) {
     mps_parser_fail(error_type_t::ValidationError,
                     "parallel LZ4 reader requires independent blocks; compress with -BI");
   }
-  block_max_size_ = block_max_size_from_bd(bd);
-  if (content_size_present_) {
+
+  lz4_frame_header_t info;
+  info.block_checksum       = (flg & 0x10u) != 0;
+  info.content_size_present = (flg & 0x08u) != 0;
+  info.content_checksum     = (flg & 0x04u) != 0;
+  info.dict_id              = (flg & 0x01u) != 0;
+  info.block_max_size       = block_max_size_from_bd(bd);
+  if (info.content_size_present) {
     if (offset + 8 > header_bytes) {
       mps_parser_fail(error_type_t::ValidationError,
                       "truncated LZ4 frame while reading content size");
     }
-    content_size_ = (std::size_t)read_le64(header + offset);
+    info.content_size = (std::size_t)read_le64(header + offset);
     offset += 8;
   }
-  if (dict_id_) {
+  if (info.dict_id) {
     if (offset + 4 > header_bytes) {
       mps_parser_fail(error_type_t::ValidationError,
                       "truncated LZ4 frame while reading dictionary id");
@@ -346,7 +332,31 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
                     "truncated LZ4 frame while reading header checksum");
   }
   offset += 1;
-  header_size_ = offset;
+  info.header_size = offset;
+  return info;
+}
+
+}  // namespace
+
+lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
+{
+  MPS_NVTX_RANGE("lz4_input_constructor", nvtx::colors::io);
+
+  ensure_lz4_runtime_available();
+
+  fd_ = open_lz4_fd(path);
+  ::posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
+
+  compressed_size_ = get_file_size(fd_, path);
+
+  lz4_frame_header_t header = parse_lz4_frame_header(fd_, path, compressed_size_);
+  block_max_size_           = header.block_max_size;
+  content_size_             = header.content_size;
+  header_size_              = header.header_size;
+  content_size_present_     = header.content_size_present;
+  block_checksum_           = header.block_checksum;
+  content_checksum_         = header.content_checksum;
+  dict_id_                  = header.dict_id;
 
   std::size_t reserve_size = content_size_;
   if (!content_size_present_) {
@@ -355,7 +365,7 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
   }
   reserve_size += input_buffer_padding_bytes;
 
-  constexpr std::size_t huge_alignment = 2 * 1024 * 1024;
+  constexpr std::size_t huge_alignment = 2 * 1024 * 1024;  // 2MiB
   output_mapped_size_                  = cuda::round_up(reserve_size, system_page_size());
   output_region_                       = mmap_region_t::anonymous_aligned(output_mapped_size_,
                                                     huge_alignment,
@@ -385,11 +395,6 @@ std::size_t lz4_input_stream_t::reserve_size_hint() const noexcept
            ? content_size_
            : std::max<std::size_t>(estimate_lz4_no_content_size(compressed_size_), 1024 * 1024);
 }
-mps_phase_registry_t& lz4_input_stream_t::registry() noexcept { return registry_; }
-input_stream_view_t lz4_input_stream_t::view() noexcept
-{
-  return {output_data_, output_data_, output_view_size_, compressed_size_, &registry_};
-}
 
 void lz4_input_stream_t::commit_up_to(std::size_t bytes)
 {
@@ -419,6 +424,12 @@ struct resident_block_desc_t {
   bool uncompressed               = false;
 };
 
+// Two distinct units flow through this pipeline:
+//   * window  - a fixed-size span of the compressed file read by the I/O stage.
+//   * block   - a single independent LZ4 data block (decompressed unit) that the
+//               metadata scanner discovers inside the resident windows.
+// Windows feed blocks; the decoded blocks are handed to the section scanner,
+// which owns the contiguous decoded-byte frontier and section publication.
 struct lz4_pipeline_t {
   explicit lz4_pipeline_t(lz4_input_stream_t& input_)
     : input(input_),
@@ -428,9 +439,7 @@ struct lz4_pipeline_t {
       window_done(window_count, 0),
       window_refs(window_count),
       window_scanned(window_count),
-      window_released(window_count),
-      block_done(input.block_slot_count_, 0),
-      block_end(input.block_slot_count_, 0)
+      window_released(window_count)
   {
     for (std::size_t i = 0; i < window_count; ++i) {
       std::size_t offset     = i * window_bytes;
@@ -446,39 +455,27 @@ struct lz4_pipeline_t {
 
   void run()
   {
-    start_readers();
     std::thread scanner(&lz4_pipeline_t::run_scanner_stage, this);
     start_decoders();
+    run_readers();
 
-    for (auto& reader : readers) {
-      reader.join();
-    }
     scanner.join();
     for (auto& worker : decoders) {
       worker.join();
     }
-    if (first_error) { std::rethrow_exception(first_error); }
+    latch.rethrow_if_error();
   }
 
   void finalize()
   {
-    input.output_view_size_ = ready_bytes;
+    input.output_view_size_ = input.section_scanner_->ready_bytes();
     input.commit_up_to(input.output_view_size_ + input_buffer_padding_bytes);
     input.section_scanner_->publish_ready(input.output_view_size_);
   }
 
-  void mark_error(std::exception_ptr eptr)
-  {
-    std::lock_guard<std::mutex> lock(error_mutex);
-    if (!first_error) {
-      first_error = eptr;
-      stop_workers.store(true, std::memory_order_release);
-    }
-  }
-
   void fail_and_notify(std::exception_ptr eptr)
   {
-    mark_error(eptr);
+    latch.capture(eptr);
     window_cv.notify_all();
     desc_cv.notify_all();
   }
@@ -513,46 +510,42 @@ struct lz4_pipeline_t {
     }
   }
 
-  void start_readers()
+  void run_readers()
   {
-    readers.reserve(io_threads);
-    for (std::size_t t = 0; t < io_threads; ++t) {
-      readers.emplace_back(&lz4_pipeline_t::run_reader_stage, this, t);
-    }
+    parallel_for_indexed(
+      window_count, io_threads, latch, "lz4-window-read-", [this](std::size_t index) {
+        read_window(index);
+      });
   }
 
-  void run_reader_stage(std::size_t tid)
+  void read_window(std::size_t index)
   {
-    std::string thread_name = "lz4-window-read-" + std::to_string(tid);
-    nvtx::name_current_thread(thread_name.c_str());
-    while (!stop_workers.load(std::memory_order_acquire)) {
-      std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed);
-      if (index >= windows.size()) { break; }
-      auto& w = windows[index];
-      w.data.reset(new char[w.size]);
-      add_compressed_resident(w.size);
-      bool ok = false;
-      {
-        MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io);
-        ok = pread_full_plain(input.fd_, w.data.get(), w.size, w.file_offset);
-      }
-      if (!ok) {
-        try {
-          mps_parser_fail(error_type_t::RuntimeError,
-                          "Failed to pread LZ4 resident window: %s",
-                          std::strerror(errno));
-        } catch (...) {
-          fail_and_notify(std::current_exception());
-        }
-        return;
-      }
-      {
-        MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic);
-        std::lock_guard<std::mutex> lock(window_mutex);
-        window_done[index] = 1;
+    auto& w = windows[index];
+    w.data.reset(new char[w.size]);
+    add_compressed_resident(w.size);
+    bool ok = false;
+    {
+      MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io);
+      ok = pread_full(input.fd_, w.data.get(), w.size, w.file_offset);
+    }
+    if (!ok) {
+      // Capture-and-notify locally so scanner/decoder waiters wake; do not let
+      // the exception escape to parallel_for_indexed without the cv notify.
+      try {
+        mps_parser_fail(error_type_t::RuntimeError,
+                        "Failed to pread LZ4 resident window: %s",
+                        std::strerror(errno));
+      } catch (...) {
+        fail_and_notify(std::current_exception());
       }
-      window_cv.notify_all();
+      return;
     }
+    {
+      MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic);
+      std::lock_guard<std::mutex> lock(window_mutex);
+      window_done[index] = 1;
+    }
+    window_cv.notify_all();
   }
 
   void start_decoders()
@@ -582,10 +575,8 @@ struct lz4_pipeline_t {
   {
     MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io);
     std::unique_lock<std::mutex> lock(desc_mutex);
-    desc_cv.wait(lock, [&] {
-      return stop_workers.load(std::memory_order_acquire) || scanner_done || !desc_queue.empty();
-    });
-    if (stop_workers.load(std::memory_order_acquire) || desc_queue.empty()) { return {}; }
+    desc_cv.wait(lock, [&] { return latch.stopped() || scanner_done || !desc_queue.empty(); });
+    if (latch.stopped() || desc_queue.empty()) { return {}; }
     std::vector<resident_block_desc_t> batch = std::move(desc_queue.front());
     desc_queue.pop_front();
     return batch;
@@ -628,35 +619,16 @@ struct lz4_pipeline_t {
   {
     if (block.window_index == std::numeric_limits<std::size_t>::max()) { return; }
     uint32_t old = window_refs[block.window_index].fetch_sub(1, std::memory_order_acq_rel);
-    (void)old;
     assert(old > 0);
     if (old == 1) { try_release_window(block.window_index); }
   }
 
   void publish_decoded_block(const resident_block_desc_t& block, char* dst, std::size_t actual_size)
   {
-    {
-      MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic);
-      input.section_scanner_->observe_block(block.index, dst, dst + actual_size);
-    }
-    std::size_t before = 0;
-    std::size_t after  = 0;
-    {
-      MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic);
-      std::lock_guard<std::mutex> lock(frontier_mutex);
-      block_done[block.index] = 1;
-      block_end[block.index]  = block.decompressed_offset + actual_size;
-      before                  = ready_bytes;
-      while (next_block < block_done.size() && block_done[next_block]) {
-        ready_bytes = block_end[next_block];
-        ++next_block;
-      }
-      after = ready_bytes;
-    }
-    if (after > before) {
-      MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic);
-      input.section_scanner_->publish_ready(after);
-    }
+    MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic);
+    // The scanner advances the contiguous decoded-byte frontier and publishes
+    // section ranges as blocks complete, regardless of decode order.
+    input.section_scanner_->observe_block(block.index, dst, dst + actual_size);
   }
 
   void wait_range_ready(std::size_t begin, std::size_t size)
@@ -675,9 +647,8 @@ struct lz4_pipeline_t {
     for (std::size_t wi = first; wi <= last; ++wi) {
       MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io);
       std::unique_lock<std::mutex> lock(window_mutex);
-      window_cv.wait(
-        lock, [&] { return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0; });
-      if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) {
+      window_cv.wait(lock, [&] { return latch.stopped() || window_done[wi] != 0; });
+      if (latch.stopped() && window_done[wi] == 0) {
         mps_parser_fail(error_type_t::RuntimeError,
                         "LZ4 metadata scanner stopped before required window was ready");
       }
@@ -745,7 +716,7 @@ struct lz4_pipeline_t {
       batch_decoded_bytes += block.decompressed_size;
       batch.push_back(block);
       blocks_scanned.fetch_add(1, std::memory_order_relaxed);
-      if (blocks_scanned.load(std::memory_order_relaxed) > block_done.size()) {
+      if (blocks_scanned.load(std::memory_order_relaxed) > input.block_slot_count_) {
         mps_parser_fail(error_type_t::OutOfMemoryError,
                         "LZ4 input block count exceeded reserved metadata slots");
       }
@@ -857,11 +828,8 @@ struct lz4_pipeline_t {
   std::vector<lz4_resident_window_t> windows;
   const std::size_t io_threads;
 
-  std::exception_ptr first_error = nullptr;
-  std::mutex error_mutex;
-  std::atomic_bool stop_workers{false};
+  parallel_error_latch_t latch;
 
-  std::atomic_size_t next_window{0};
   std::vector<unsigned char> window_done;
   std::vector<std::atomic<uint32_t>> window_refs;
   std::vector<std::atomic<uint8_t>> window_scanned;
@@ -878,15 +846,7 @@ struct lz4_pipeline_t {
 
   std::atomic_size_t blocks_scanned{0};
   std::vector<std::vector<char>> crossing_payloads;
-  std::vector<std::thread> readers;
   std::vector<std::thread> decoders;
-
-  // Tracks the contiguous decoded-byte frontier across out-of-order block completions.
-  std::mutex frontier_mutex;
-  std::vector<unsigned char> block_done;
-  std::vector<std::size_t> block_end;
-  std::size_t next_block  = 0;
-  std::size_t ready_bytes = 0;
 };
 
 void lz4_input_stream_t::run_decode_tasks()
@@ -897,4 +857,4 @@ void lz4_input_stream_t::run_decode_tasks()
   pipeline.finalize();
 }
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
index 389f563efa..7727e0d2f7 100644
--- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp
+++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
@@ -19,7 +19,7 @@
 #include <stdexcept>
 #include <string>
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 using cuopt::linear_programming::io::error_type_t;
 using cuopt::linear_programming::io::mps_parser_expects;
@@ -131,4 +131,4 @@ class mmap_region_t {
   std::size_t size_ = 0;
 };
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
index 9eee8708e0..a3c9fe87a3 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -16,7 +16,7 @@
 #include <simde/x86/avx2.h>
 #include <simde/x86/sse4.2.h>
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 using cuopt::linear_programming::io::error_type_t;
 using cuopt::linear_programming::io::mps_parser_expects;
@@ -202,7 +202,7 @@ void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, cons
   const char* expected           = nullptr;
   if (slot.compare_exchange_strong(
         expected, ptr, std::memory_order_release, std::memory_order_acquire)) {
-    publish_section_ranges();
+    notify_ready_phases();
   }
 }
 
@@ -252,13 +252,14 @@ void mps_section_block_scanner_t::scan_section_range(const char* begin, const ch
   }
 
   // In compliant MPS, indicator records begin in column 1 while data records
-  // begin in column 2+. Treat start-of-file or "\n[nonblank]" as the cheap
-  // candidate signal, then run the exact section matcher only for candidates.
+  // begin in column 2+. Use "\n[nonblank]" as the needle for the SIMD scan.
   const simde__m256i newline = simde_mm256_set1_epi8('\n');
   while ((std::size_t)(end - p) >= kSimdWidth) {
     // The first-line path above increments p when p == data_, so p - 1 is
     // in-bounds here. Loading the previous vector lets us test "\nX" for all
     // 32 candidate column-1 bytes with one AVX2 mask.
+    // Unaligned loads (loadu) perform comparably to aligned loads on modern SSE/AVX hardware;
+    // this may warrant re-checking on ARM, though.
     simde__m256i current  = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p));
     simde__m256i previous = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p - 1));
     std::uint32_t mask    = (std::uint32_t)simde_mm256_movemask_epi8(simde_mm256_and_si256(
@@ -290,6 +291,8 @@ void mps_section_block_scanner_t::scan_boundary(std::size_t left_index, std::siz
   scan_section_range(data_ + begin, data_ + end);
 }
 
+// Scans a freshly decoded block for section titles, and also its start/end boundaries in case a
+// section title straddles two blocks.
 void mps_section_block_scanner_t::observe_block(std::size_t block_index,
                                                 const char* begin,
                                                 const char* end)
@@ -311,6 +314,26 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index,
       block_decoded_[block_index + 1].load(std::memory_order_acquire)) {
     scan_boundary(block_index, block_index + 1);
   }
+
+  advance_ready_frontier();
+}
+
+void mps_section_block_scanner_t::advance_ready_frontier()
+{
+  // Publish while still holding frontier_mutex_: if the lock were dropped first, a slower
+  // thread could store its older, smaller frontier after a newer one and shrink ready_bytes_.
+  // block_decoded_ is stored with release after the begin/end offsets, so an
+  // acquire load of a set flag makes the matching end offset visible here.
+  std::lock_guard<std::mutex> lock(frontier_mutex_);
+  std::size_t new_ready = 0;
+  bool grew             = false;
+  while (next_block_ < block_count_ &&
+         block_decoded_[next_block_].load(std::memory_order_acquire)) {
+    new_ready = block_end_offsets_[next_block_].load(std::memory_order_acquire);
+    ++next_block_;
+    grew = true;
+  }
+  if (grew) { publish_ready(new_ready); }
 }
 
 void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes)
@@ -318,10 +341,15 @@ void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes)
   ready_bytes_.store(ready_bytes, std::memory_order_release);
   std::size_t begin = ready_bytes > boundary_overlap ? ready_bytes - boundary_overlap : 0;
   scan_section_range(data_ + begin, data_ + ready_bytes);
-  publish_section_ranges();
+  notify_ready_phases();
+}
+
+std::size_t mps_section_block_scanner_t::ready_bytes() const noexcept
+{
+  return ready_bytes_.load(std::memory_order_acquire);
 }
 
-void mps_section_block_scanner_t::publish_section_ranges()
+void mps_section_block_scanner_t::notify_ready_phases()
 {
   // Publication model: each present phase runs from its own section header to
   // the first later section header that has been discovered. Optional sections
@@ -430,4 +458,4 @@ void mps_section_block_scanner_t::publish_section_ranges()
   }
 }
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
index 9fcffa6ea7..7fd249a7e8 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
@@ -11,7 +11,11 @@
 
 #include <omp.h>
 
-namespace mps_fast {
+// The section scanner handles freshly read/decoded blocks and scans them for section titles while
+// they're still warm in cache. It then publishes read/decoded input ranges to the parser workers,
+// which handle their respective sections in parallel.
+
+namespace cuopt::linear_programming::io::detail {
 
 enum class mps_section_kind {
   rows,
@@ -78,9 +82,17 @@ class mps_section_block_scanner_t {
                               std::size_t block_count,
                               mps_phase_registry_t& registry);
 
+  // Records a freshly decoded block, scans it for section titles, advances the
+  // contiguous decoded-byte frontier across out-of-order completions, and
+  // publishes any newly available section ranges. Producers only need to feed
+  // blocks in any order; the frontier and publication live entirely here.
   void observe_block(std::size_t block_index, const char* begin, const char* end);
   void publish_ready(std::size_t ready_bytes);
 
+  // Current contiguous decoded-byte frontier; producers use this as the final
+  // view size once all blocks have been observed.
+  std::size_t ready_bytes() const noexcept;
+
  private:
   static constexpr std::size_t section_count = 9;
   // Section titles are short; 128 bytes is enough to rescan around a decoded
@@ -92,7 +104,8 @@ class mps_section_block_scanner_t {
   void scan_section_range(const char* begin, const char* end);
   void scan_boundary(std::size_t left_index, std::size_t right_index);
   void record_section_hit(mps_section_kind kind, const char* ptr);
-  void publish_section_ranges();
+  void notify_ready_phases();
+  void advance_ready_frontier();
 
   const char* data_        = nullptr;
   std::size_t block_count_ = 0;
@@ -103,6 +116,8 @@ class mps_section_block_scanner_t {
   std::unique_ptr<std::atomic_size_t[]> block_end_offsets_;
   std::atomic_size_t ready_bytes_{0};
   std::atomic<const char*> section_hits_[section_count]{};
+  std::mutex frontier_mutex_;
+  std::size_t next_block_ = 0;
 };
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
index fac9e64d78..0f47b45f56 100644
--- a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
+++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
@@ -14,7 +14,7 @@
 #include <unistd.h>
 #endif
 
-namespace mps_fast::nvtx {
+namespace cuopt::linear_programming::io::detail::nvtx {
 
 namespace colors {
 constexpr std::uint32_t generic  = 0xff8b949e;
@@ -92,7 +92,9 @@ class scoped_range_t {
   scoped_range_t& operator=(const scoped_range_t&) = delete;
 
  private:
-  void push(const char* name, std::uint32_t color, std::uint32_t category)
+  void push([[maybe_unused]] const char* name,
+            [[maybe_unused]] std::uint32_t color,
+            [[maybe_unused]] std::uint32_t category)
   {
 #ifdef MPS_FAST_NVTX
     nvtxEventAttributes_t event{};
@@ -105,10 +107,6 @@ class scoped_range_t {
     event.category      = category;
     nvtxRangePushEx(&event);
     active_ = true;
-#else
-    (void)name;
-    (void)color;
-    (void)category;
 #endif
   }
 
@@ -118,18 +116,17 @@ class scoped_range_t {
 #endif
 };
 
-inline void name_current_thread(const char* name)
+inline void name_current_thread([[maybe_unused]] const char* name)
 {
 #ifdef MPS_FAST_NVTX
   nvtxNameOsThreadA((std::uint32_t)::syscall(SYS_gettid), name);
-#else
-  (void)name;
 #endif
 }
 
-}  // namespace mps_fast::nvtx
+}  // namespace cuopt::linear_programming::io::detail::nvtx
 
 #define MPS_FAST_NVTX_CONCAT_INNER(a, b) a##b
 #define MPS_FAST_NVTX_CONCAT(a, b)       MPS_FAST_NVTX_CONCAT_INNER(a, b)
-#define MPS_NVTX_RANGE(name, color) \
-  ::mps_fast::nvtx::scoped_range_t MPS_FAST_NVTX_CONCAT(_mps_nvtx_range_, __LINE__)(name, color)
+#define MPS_NVTX_RANGE(name, color)                                                   \
+  ::cuopt::linear_programming::io::detail::nvtx::scoped_range_t MPS_FAST_NVTX_CONCAT( \
+    _mps_nvtx_range_, __LINE__)(name, color)
diff --git a/cpp/src/io/mps_parser.cpp b/cpp/src/io/mps_parser.cpp
index 5f7cecda94..9d4dea2bbf 100644
--- a/cpp/src/io/mps_parser.cpp
+++ b/cpp/src/io/mps_parser.cpp
@@ -797,9 +797,9 @@ void mps_parser_t<i_t, f_t>::parse_rows(std::string_view line)
   }
   if (type == Objective) {
     // Keep only the first name or OBJNAME since it was set before
-    if (objective_name.empty())
-      objective_name = name;
-    else
+    if (objective_name.empty()) objective_name = name;
+    // Do not flag the chosen objective row itself as ignored; aligns with CPLEX/SCIP behavior.
+    else if (name != objective_name)
       ignored_objective_names.emplace(name);
     // If we wanted to strictly follow MPS definition: a new objective row ('N') should be treated
     // as an unbounded constraints, aka an extra contraints row with lower bound -infinity and upper
diff --git a/cpp/src/io/parser.cpp b/cpp/src/io/parser.cpp
index 6392833ce3..c9b3a351c6 100644
--- a/cpp/src/io/parser.cpp
+++ b/cpp/src/io/parser.cpp
@@ -44,7 +44,7 @@ template <typename i_t, typename f_t>
 mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path)
 {
   CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str());
-  return mps_fast::parse_mps_fast_file<i_t, f_t>(mps_file_path);
+  return detail::parse_mps_fast_file<i_t, f_t>(mps_file_path);
 }
 
 template mps_data_model_t<int, float> read_mps_fast_experimental(const std::string& mps_file_path);
diff --git a/cpp/src/utilities/perf_counters.hpp b/cpp/src/utilities/perf_counters.hpp
index 96a881c880..70658aa9b3 100644
--- a/cpp/src/utilities/perf_counters.hpp
+++ b/cpp/src/utilities/perf_counters.hpp
@@ -14,7 +14,7 @@
 #include <cstdio>
 #include <vector>
 
-namespace mps_fast {
+namespace cuopt::linear_programming::io::detail {
 
 // Utils to return to total resident set size (used physical pages)
 static size_t parse_status_kb_line(const char* line, const char* key)
@@ -191,4 +191,4 @@ static inline void print_perf_totals(const char* label,
   std::fprintf(stderr, " ipc=%.3f cache_miss_rate=%.6f\n", ipc, miss_rate);
 }
 
-}  // namespace mps_fast
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
index f07d84ebde..2ef8339da3 100644
--- a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
@@ -17,6 +17,8 @@
 #include <string_view>
 #include <vector>
 
+namespace cuopt::linear_programming::io::detail {
+
 namespace {
 
 uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }
@@ -27,36 +29,37 @@ double reference_strtod(std::string_view token)
   for (char& c : normalized) {
     if (c == 'd' || c == 'D') { c = 'e'; }
   }
-  char* end    = nullptr;
-  errno        = 0;
-  double value = std::strtod(normalized.c_str(), &end);
-  ASSERT_EQ(end, normalized.c_str() + normalized.size());
-  return value;
+  char* end = nullptr;
+  errno     = 0;
+  return std::strtod(normalized.c_str(), &end);
 }
 
 double parse_token(std::string_view token)
 {
   const char* p = token.data();
-  return mps_fast::fp64::parse_fp64_advance(p, token.data() + token.size());
+  return fp64::parse_fp64_advance(p, token.data() + token.size());
 }
 
-double parse_padded_token(std::string_view token)
+void check_bitwise_strtod(std::string_view token)
 {
+  std::string normalized(token);
+  for (char& c : normalized) {
+    if (c == 'd' || c == 'D') { c = 'e'; }
+  }
+  char* end        = nullptr;
+  errno            = 0;
+  const double ref = std::strtod(normalized.c_str(), &end);
+  EXPECT_EQ(end, normalized.c_str() + normalized.size());
+
   std::string padded(token);
   padded.append(40, ' ');
-  const char* p = padded.data();
-  double value  = mps_fast::fp64::parse_fp64_advance(p, padded.data() + padded.size());
-  ASSERT_EQ(p, padded.data() + token.size());
-  return value;
-}
+  const char* p             = padded.data();
+  const double padded_value = fp64::parse_fp64_advance(p, padded.data() + padded.size());
+  EXPECT_EQ(p, padded.data() + token.size());
 
-void check_bitwise_strtod(std::string_view token)
-{
-  const double ref        = reference_strtod(token);
   const uint64_t ref_bits = bits(ref);
   EXPECT_EQ(ref_bits, bits(parse_token(token))) << "token parse mismatch for '" << token << "'";
-  EXPECT_EQ(ref_bits, bits(parse_padded_token(token)))
-    << "padded parse mismatch for '" << token << "'";
+  EXPECT_EQ(ref_bits, bits(padded_value)) << "padded parse mismatch for '" << token << "'";
 }
 
 std::string random_token(std::mt19937_64& rng)
@@ -155,7 +158,7 @@ TEST(FastFp64ParserTest, CursorAdvancesToTokenEnd)
   std::setlocale(LC_NUMERIC, "C");
   std::string text = "123.45  ABC";
   const char* p    = text.data();
-  double value     = mps_fast::fp64::parse_fp64_advance(p, text.data() + text.size());
+  double value     = fp64::parse_fp64_advance(p, text.data() + text.size());
 
   EXPECT_EQ(bits(reference_strtod("123.45")), bits(value));
   EXPECT_EQ(text.data() + 6, p);
@@ -172,3 +175,5 @@ TEST(FastFp64ParserTest, FixedSeedRandomDifferential)
     check_bitwise_strtod(token);
   }
 }
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
index aa05736616..fe349b47e0 100644
--- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
@@ -25,6 +25,8 @@
 
 #include <unistd.h>
 
+namespace cuopt::linear_programming::io::detail {
+
 namespace {
 
 struct TempMpsFile {
@@ -36,20 +38,24 @@ struct TempMpsFile {
                   "/tmp/mps_fast_parser_edge_%ld_XXXXXX.mps",
                   static_cast<long>(getpid()));
     int fd = mkstemps(path_template, 4);
-    if (fd < 0) { FAIL() << "mkstemps failed: " << std::strerror(errno); }
+    if (fd < 0) {
+      throw std::runtime_error(std::string("mkstemps failed: ") + std::strerror(errno));
+    }
     path       = path_template;
     FILE* file = fdopen(fd, "wb");
     if (file == nullptr) {
       close(fd);
-      FAIL() << "fdopen failed: " << std::strerror(errno);
+      throw std::runtime_error(std::string("fdopen failed: ") + std::strerror(errno));
     }
     if (!contents.empty() &&
         std::fwrite(contents.data(), 1, contents.size(), file) != contents.size()) {
       std::fclose(file);
-      FAIL() << "failed to write temporary MPS file: " << std::strerror(errno);
+      throw std::runtime_error(std::string("failed to write temporary MPS file: ") +
+                               std::strerror(errno));
     }
     if (std::fclose(file) != 0) {
-      FAIL() << "failed to close temporary MPS file: " << std::strerror(errno);
+      throw std::runtime_error(std::string("failed to close temporary MPS file: ") +
+                               std::strerror(errno));
     }
   }
 
@@ -77,7 +83,7 @@ struct TempOwnedPath {
   std::string path;
 };
 
-std::string_view range_text(const mps_fast::mps_phase_range_t& range)
+std::string_view range_text(const mps_phase_range_t& range)
 {
   if (!range.present) { return {}; }
   return std::string_view(range.begin, static_cast<size_t>(range.end - range.begin));
@@ -85,15 +91,14 @@ std::string_view range_text(const mps_fast::mps_phase_range_t& range)
 
 uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }
 
-void check_models_match_reference_bitwise(
-  const mps_fast::parser_model_t<int, double>& fast,
-  const cuopt::linear_programming::io::mps_data_model_t<int, double>& reference,
-  std::string_view context)
+void check_models_match_reference_bitwise(const parser_model_t<int, double>& fast,
+                                          const mps_data_model_t<int, double>& reference,
+                                          std::string_view context)
 {
   EXPECT_EQ(reference.n_vars_, fast.n_vars_) << std::string(context) + " n_vars";
   EXPECT_EQ(reference.n_constraints_, fast.n_constraints_)
     << std::string(context) + " n_constraints";
-  EXPECT_EQ(reference.nnz_, fast.nnz_) << std::string(context) + " nnz";
+  EXPECT_EQ(reference.get_nnz(), fast.get_nnz()) << std::string(context) + " nnz";
   EXPECT_EQ(reference.maximize_, fast.maximize_) << std::string(context) + " maximize";
   EXPECT_EQ(reference.problem_name_, fast.problem_name_) << std::string(context) + " problem_name";
   EXPECT_EQ(reference.objective_name_, fast.objective_name_)
@@ -126,8 +131,8 @@ void check_models_match_reference_bitwise(
 void verify_fixture_bitwise(std::string_view fixture_name, std::string contents)
 {
   TempMpsFile file(std::move(contents));
-  auto fast = mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-  auto reference = cuopt::linear_programming::io::read_mps<int, double>(file.path, false);
+  auto fast      = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  auto reference = read_mps<int, double>(file.path, false);
   check_models_match_reference_bitwise(fast, reference, fixture_name);
 }
 
@@ -138,7 +143,7 @@ std::string row_name(size_t i)
   return out.str();
 }
 
-int find_var_index(const mps_fast::parser_model_t<int, double>& model, std::string_view name)
+int find_var_index(const parser_model_t<int, double>& model, std::string_view name)
 {
   for (size_t i = 0; i < model.var_names_.size(); ++i) {
     if (model.var_names_[i] == name) { return static_cast<int>(i); }
@@ -146,11 +151,8 @@ int find_var_index(const mps_fast::parser_model_t<int, double>& model, std::stri
   return -1;
 }
 
-void check_model_shapes(const mps_fast::parser_model_t<int, double>& model,
-                        int rows,
-                        int vars,
-                        int nnz,
-                        std::string_view context)
+void check_model_shapes(
+  const parser_model_t<int, double>& model, int rows, int vars, int nnz, std::string_view context)
 {
   EXPECT_EQ(rows, model.n_constraints_) << std::string(context) + " rows";
   EXPECT_EQ(vars, model.n_vars_) << std::string(context) + " vars";
@@ -210,24 +212,23 @@ TEST(FastMpsParserEdgeTest, ScannerFindsSectionSplitAcrossBlocks)
   EXPECT_TRUE(columns_pos != std::string::npos) << "failed to place COLUMNS split";
   const size_t split = columns_pos + 3;
 
-  mps_fast::mps_phase_registry_t registry;
-  mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+  mps_phase_registry_t registry;
+  mps_section_block_scanner_t scanner(mps.data(), 2, registry);
 
   scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());
   scanner.publish_ready(0);
   scanner.observe_block(0, mps.data(), mps.data() + split);
   scanner.publish_ready(mps.size());
 
-  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::header)) << "header not ready";
-  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rows)) << "rows not ready";
-  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::columns)) << "columns not ready";
-  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rhs)) << "rhs not ready";
-  EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::quadratic))
-    << "quadratic sentinel not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::header)) << "header not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic)) << "quadratic sentinel not ready";
 
-  EXPECT_TRUE(range_text(registry.range(mps_fast::mps_phase_kind::columns)).starts_with("COLUMNS"))
+  EXPECT_TRUE(range_text(registry.range(mps_phase_kind::columns)).starts_with("COLUMNS"))
     << "columns range begins at wrong boundary";
-  EXPECT_TRUE(range_text(registry.range(mps_fast::mps_phase_kind::rhs)).starts_with("RHS"))
+  EXPECT_TRUE(range_text(registry.range(mps_phase_kind::rhs)).starts_with("RHS"))
     << "rhs range begins at wrong boundary";
 }
 
@@ -241,20 +242,18 @@ TEST(FastMpsParserEdgeTest, ScannerFindsHeadersSplitAtEveryByte)
     EXPECT_TRUE(pos != std::string::npos) << "missing header in split fixture";
     for (size_t offset = 1; offset < header.size(); ++offset) {
       const size_t split = pos + offset;
-      mps_fast::mps_phase_registry_t registry;
-      mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+      mps_phase_registry_t registry;
+      mps_section_block_scanner_t scanner(mps.data(), 2, registry);
 
       scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());
       scanner.observe_block(0, mps.data(), mps.data() + split);
       scanner.publish_ready(mps.size());
 
-      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rows)) << "rows not ready after split";
-      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::columns))
-        << "columns not ready after split";
-      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rhs)) << "rhs not ready after split";
-      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::bounds))
-        << "bounds not ready after split";
-      EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::quadratic))
+      EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::bounds)) << "bounds not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic))
         << "quadratic sentinel not ready after split";
     }
   }
@@ -273,8 +272,8 @@ TEST(FastMpsParserEdgeTest, ScannerRejectsUnknownColumnOneRecordsAfterRows)
 
   EXPECT_THROW(
     {
-      mps_fast::mps_phase_registry_t registry;
-      mps_fast::mps_section_block_scanner_t scanner(mps.data(), 1, registry);
+      mps_phase_registry_t registry;
+      mps_section_block_scanner_t scanner(mps.data(), 1, registry);
       scanner.observe_block(0, mps.data(), mps.data() + mps.size());
       scanner.publish_ready(mps.size());
     },
@@ -328,8 +327,7 @@ TEST(FastMpsParserEdgeTest, DuplicateBoundsLastStatementWins)
 
   verify_fixture_bitwise("duplicate_bounds_last_statement_wins", contents);
   TempMpsFile file(contents);
-  auto model =
-    mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
   EXPECT_EQ(1, model.n_vars_) << "n_vars";
   EXPECT_EQ(2.0, model.variable_lower_bounds_.at(0)) << "duplicate lower bound";
   EXPECT_EQ(3.0, model.variable_upper_bounds_.at(0)) << "duplicate upper bound";
@@ -371,8 +369,7 @@ TEST(FastMpsParserEdgeTest, MissingOptionalBoundsFastPath)
     " RHS1 rowA 0\n"
     "ENDATA\n");
 
-  auto model =
-    mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
   EXPECT_EQ(1, model.n_vars_) << "missing optional n_vars";
   EXPECT_EQ(1, model.n_constraints_) << "missing optional n_constraints";
   EXPECT_EQ(0.0, model.variable_lower_bounds_.at(0)) << "missing BOUNDS lower default";
@@ -397,8 +394,7 @@ TEST(FastMpsParserEdgeTest, BoundsOnlyVariablesAreAppendedDeterministically)
     " SC B AUX_S 5\n"
     "ENDATA\n");
 
-  auto model =
-    mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
   check_model_shapes(model, 1, 4, 1, "bounds-only");
   EXPECT_EQ(std::string("XMAIN"), model.var_names_.at(0)) << "main var name";
   EXPECT_EQ(std::string("AUX_A"), model.var_names_.at(1)) << "bounds-only sorted name 1";
@@ -439,8 +435,7 @@ TEST(FastMpsParserEdgeTest, IntegerMarkersAssignTypesAndDefaultBounds)
     " RHS1 R1 10\n"
     "ENDATA\n");
 
-  auto model =
-    mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
   check_model_shapes(model, 1, 3, 3, "integer markers");
   const int xint  = find_var_index(model, "XINT");
   const int xcont = find_var_index(model, "XCONT");
@@ -530,36 +525,23 @@ TEST(FastMpsParserEdgeTest, CommentPlacementSupportedCasesMatchReferenceBitwise)
 
 TEST(FastMpsParserEdgeTest, ObjectiveMetadataSelectsNamedObjective)
 {
-  TempMpsFile file(
-    "NAME OBJMETA\n"
-    "OBJSENSE\n"
-    " MAX\n"
-    "OBJNAME\n"
-    " COST\n"
-    "ROWS\n"
-    " N ALT\n"
-    " N COST\n"
-    " L R1\n"
-    "COLUMNS\n"
-    " X1 ALT 100 COST 5\n"
-    " X1 R1 1\n"
-    " X2 COST -2 R1 3\n"
-    "RHS\n"
-    " RHS1 COST 7 R1 11\n"
-    "ENDATA\n");
-
-  auto model =
-    mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-  EXPECT_TRUE(model.maximize_) << "OBJSENSE MAX not applied";
-  EXPECT_EQ(std::string("OBJMETA"), model.problem_name_) << "problem name";
-  EXPECT_EQ(std::string("COST"), model.objective_name_) << "objective name";
-  EXPECT_EQ(-7.0, model.objective_offset_) << "objective RHS offset";
-  const int x1 = find_var_index(model, "X1");
-  const int x2 = find_var_index(model, "X2");
-  ASSERT_GE(x1, 0);
-  ASSERT_GE(x2, 0);
-  EXPECT_EQ(5.0, model.c_.at(x1)) << "named objective coefficient X1";
-  EXPECT_EQ(-2.0, model.c_.at(x2)) << "named objective coefficient X2";
+  verify_fixture_bitwise("objective_metadata",
+                         "NAME OBJMETA\n"
+                         "OBJSENSE\n"
+                         " MAX\n"
+                         "OBJNAME\n"
+                         " COST\n"
+                         "ROWS\n"
+                         " N ALT\n"
+                         " N COST\n"
+                         " L R1\n"
+                         "COLUMNS\n"
+                         " X1 ALT 100 COST 5\n"
+                         " X1 R1 1\n"
+                         " X2 COST -2 R1 3\n"
+                         "RHS\n"
+                         " RHS1 COST 7 R1 11\n"
+                         "ENDATA\n");
 }
 
 TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors)
@@ -577,11 +559,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors)
       "RHS\n"
       " RHS1 R1 0\n"
       "ENDATA\n");
-    EXPECT_THROW(
-      {
-        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-      },
-      std::logic_error);
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
   }
 
   {
@@ -595,11 +574,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors)
       "RHS\n"
       " RHS1 R1 0\n"
       "ENDATA\n");
-    EXPECT_THROW(
-      {
-        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-      },
-      std::logic_error);
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
   }
 
   {
@@ -613,11 +589,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors)
       "RHS\n"
       " RHS1 MISSING 1\n"
       "ENDATA\n");
-    EXPECT_THROW(
-      {
-        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-      },
-      std::logic_error);
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
   }
 
   {
@@ -633,11 +606,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors)
       "BOUNDS\n"
       " XX B X1 1\n"
       "ENDATA\n");
-    EXPECT_THROW(
-      {
-        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-      },
-      std::logic_error);
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
   }
 
   {
@@ -653,11 +623,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors)
       "BOUNDS\n"
       " SC B X1\n"
       "ENDATA\n");
-    EXPECT_THROW(
-      {
-        (void)mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
-      },
-      std::logic_error);
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
   }
 }
 
@@ -685,8 +652,7 @@ TEST(FastMpsParserEdgeTest, LargeColumnsRepeatedColumnChunkBoundary)
   mps += " 0\nENDATA\n";
 
   TempMpsFile file(std::move(mps));
-  auto model =
-    mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
   check_model_shapes(
     model, static_cast<int>(row_count), 2, static_cast<int>(row_count + 1), "large columns");
   EXPECT_EQ(std::string("XBIG"), model.var_names_.at(0)) << "large repeated column name";
@@ -708,8 +674,7 @@ TEST(FastMpsParserEdgeTest, LargeBoundsRepeatedVarStaysOrdered)
   mps += "ENDATA\n";
 
   TempMpsFile file(std::move(mps));
-  auto model =
-    mps_fast::parse_mps_fast_file<int, double>(file.path, mps_fast::FileReadMethod::Read);
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
   check_model_shapes(model, 1, 1, 1, "large bounds");
   EXPECT_EQ(static_cast<double>((repeat_count - 1) % 1000), model.variable_upper_bounds_.at(0))
     << "large repeated bounds last value";
@@ -743,10 +708,8 @@ TEST(FastMpsParserEdgeTest, Lz4AndRawPathsMatchOnMultiblockInput)
   const std::string cmd = "lz4 -f -q " + raw_file.path + " " + lz4_file.path;
   if (std::system(cmd.c_str()) != 0) { GTEST_SKIP() << "lz4 CLI unavailable"; }
 
-  auto raw =
-    mps_fast::parse_mps_fast_file<int, double>(raw_file.path, mps_fast::FileReadMethod::Read);
-  auto lz4 =
-    mps_fast::parse_mps_fast_file<int, double>(lz4_file.path, mps_fast::FileReadMethod::Read);
+  auto raw = parse_mps_fast_file<int, double>(raw_file.path, FileReadMethod::Read);
+  auto lz4 = parse_mps_fast_file<int, double>(lz4_file.path, FileReadMethod::Read);
 
   check_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity");
   EXPECT_EQ(raw.var_names_.size(), lz4.var_names_.size()) << "lz4 var name count";
@@ -777,12 +740,9 @@ TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch)
   if (std::system(gzip_cmd.c_str()) != 0) { GTEST_SKIP() << "gzip CLI unavailable"; }
   if (std::system(bzip2_cmd.c_str()) != 0) { GTEST_SKIP() << "bzip2 CLI unavailable"; }
 
-  auto raw =
-    mps_fast::parse_mps_fast_file<int, double>(raw_file.path, mps_fast::FileReadMethod::Read);
-  auto gzip =
-    mps_fast::parse_mps_fast_file<int, double>(gzip_file.path, mps_fast::FileReadMethod::Read);
-  auto bzip2 =
-    mps_fast::parse_mps_fast_file<int, double>(bzip2_file.path, mps_fast::FileReadMethod::Read);
+  auto raw   = parse_mps_fast_file<int, double>(raw_file.path, FileReadMethod::Read);
+  auto gzip  = parse_mps_fast_file<int, double>(gzip_file.path, FileReadMethod::Read);
+  auto bzip2 = parse_mps_fast_file<int, double>(bzip2_file.path, FileReadMethod::Read);
 
   check_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity");
   check_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity");
@@ -803,3 +763,5 @@ TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch)
   EXPECT_EQ(raw.var_types_, gzip.var_types_) << "gzip var types";
   EXPECT_EQ(raw.var_types_, bzip2.var_types_) << "bzip2 var types";
 }
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp
index 3b01f10227..6a47471c09 100644
--- a/cpp/tests/linear_programming/parser_test.cpp
+++ b/cpp/tests/linear_programming/parser_test.cpp
@@ -931,18 +931,24 @@ TEST_F(mip_partial_bounds_test, lp)
                            ::testing::Values(default_mps_reader_param, fast_mps_reader_param), \
                            mps_reader_param_name)
 
+#define INSTANTIATE_DEFAULT_MPS_READER_TEST(Fixture) \
+  INSTANTIATE_TEST_SUITE_P(                          \
+    mps_readers, Fixture, ::testing::Values(default_mps_reader_param), mps_reader_param_name)
+
 INSTANTIATE_MPS_READER_TEST(good_mps_1_test);
 INSTANTIATE_MPS_READER_TEST(up_low_bounds_test);
-INSTANTIATE_MPS_READER_TEST(some_var_bounds_test);
-INSTANTIATE_MPS_READER_TEST(fixed_var_bound_test);
-INSTANTIATE_MPS_READER_TEST(free_var_bound_test);
-INSTANTIATE_MPS_READER_TEST(lower_inf_var_bound_test);
-INSTANTIATE_MPS_READER_TEST(upper_inf_var_bound_test);
 INSTANTIATE_MPS_READER_TEST(mip_with_bounds_test);
 INSTANTIATE_MPS_READER_TEST(mip_no_bounds_test);
 INSTANTIATE_MPS_READER_TEST(mip_partial_bounds_test);
+// fast mps parser doesn't support fixed format
+INSTANTIATE_DEFAULT_MPS_READER_TEST(some_var_bounds_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(fixed_var_bound_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(free_var_bound_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(lower_inf_var_bound_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(upper_inf_var_bound_test);
 
 #undef INSTANTIATE_MPS_READER_TEST
+#undef INSTANTIATE_DEFAULT_MPS_READER_TEST
 
 #ifdef MPS_PARSER_WITH_BZIP2
 TEST(mps_parser, good_mps_file_bzip2_compressed)

From 9185d7aa0586fe35496d2976086ea4b6e73c2494 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Fri, 12 Jun 2026 06:42:54 -0700
Subject: [PATCH 12/22] ai review comments

---
 cpp/cuopt_cli.cpp                             |  2 +-
 .../fast_parse_primitives.hpp                 | 18 ++++----
 .../io/experimental_mps_fast/fast_parser.cpp  | 20 +++++----
 .../io/experimental_mps_fast/file_reader.cpp  | 42 +++++++++++++++----
 .../io/experimental_mps_fast/file_reader.hpp  | 31 +++++++++++---
 .../experimental_mps_fast/lz4_file_reader.cpp | 39 +++++++++--------
 6 files changed, 103 insertions(+), 49 deletions(-)

diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index 55c506721a..f06e568208 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -309,7 +309,7 @@ int main(int argc, char* argv[])
     .help(
       "MPS reader implementation: default uses the production parser; experimental-fast uses the "
       "experimental "
-      "SIMD parser for LP/MIP .mps and .mps.lz4 files")
+      "SIMD parser for LP/MIP .mps, .mps.lz4, .mps.gz, and .mps.bz2 files")
     .default_value(std::string("default"))
     .choices("default", "experimental-fast");
 
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
index f77e14a410..8897bfef1c 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -246,18 +246,22 @@ struct cursor_t {
     if (UNLIKELY(ws_mask == 0)) { return slow(); }
     int field1_end_off = __builtin_ctz(ws_mask);
 
-    unsigned int printable_after_field1 = printable_mask >> field1_end_off;
-    if (UNLIKELY(printable_after_field1 == 0)) { return slow(); }
-    int field2_start_off = field1_end_off + __builtin_ctz(printable_after_field1);
-
-    if (UNLIKELY(ptr[field2_start_off] == '\n')) { return slow(); }
+    simde__m256i is_nl                = simde_mm256_cmpeq_epi8(data, vnl);
+    unsigned int nl_mask              = (unsigned int)simde_mm256_movemask_epi8(is_nl);
+    unsigned int barrier_after_field1 = (printable_mask | nl_mask) >> field1_end_off;
+    if (UNLIKELY(barrier_after_field1 == 0)) { return slow(); }
+    int field2_rel_off = __builtin_ctz(barrier_after_field1);
+    if (UNLIKELY(ptr[field1_end_off + field2_rel_off] == '\n' ||
+                 ptr[field1_end_off + field2_rel_off] == '\r')) {
+      return slow();
+    }
+    int field2_start_off = field1_end_off + field2_rel_off;
 
     unsigned int ws_after_field2_start = ws_mask >> field2_start_off;
     if (UNLIKELY(ws_after_field2_start == 0)) { return slow(); }
     int field2_end_off = field2_start_off + __builtin_ctz(ws_after_field2_start);
 
-    simde__m256i is_nl     = simde_mm256_cmpeq_epi8(data, vnl);
-    unsigned int stop_mask = printable_mask | (unsigned int)simde_mm256_movemask_epi8(is_nl);
+    unsigned int stop_mask         = printable_mask | nl_mask;
     unsigned int stop_after_field2 = stop_mask >> field2_end_off;
     if (LIKELY(stop_after_field2 != 0)) {
       ptr = ptr + field2_end_off + __builtin_ctz(stop_after_field2);
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index 35e83a01aa..45eccce23d 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -2,7 +2,6 @@
 // reserved. SPDX-License-Identifier: Apache-2.0
 
 #include "fast_parser.hpp"
-#include <file_to_string.hpp>
 #include "fast_parse_primitives.hpp"
 #include "file_reader.hpp"
 #include "hash_table_smallstr.hpp"
@@ -20,12 +19,11 @@
 
 #include <omp.h>
 #include <algorithm>
-#include <array>
-#include <atomic>
 #include <cassert>
 #include <cctype>
 #include <cerrno>
-#include <chrono>
+#include <charconv>
+#include <climits>
 #include <concepts>
 #include <cstdint>
 #include <cstdio>
@@ -36,15 +34,14 @@
 #include <map>
 #include <memory>
 #include <mutex>
-#include <stdexcept>
 #include <string>
 #include <string_view>
-#include <tuple>
-#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
+#include <file_to_string.hpp>
+
 #define MPS_FAST_COMPACT_ROW_HASH
 #define MPS_FAST_THP_PREFAULT
 
@@ -863,6 +860,9 @@ static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
   }
 
   size_t total_rows = offsets[(size_t)num_threads];
+  if (UNLIKELY(total_rows > (size_t)INT_MAX)) {
+    state.cursor.error("fast MPS parser requires <= INT_MAX rows, got %zu", total_rows);
+  }
   {
     scoped_timer_t timer("rows_resize_outputs");
     state.row_names_sv.resize(total_rows);
@@ -1003,6 +1003,10 @@ static void parse_rows_section_serial_impl(parse_state_t<i_t, f_t>& state, const
     }
     expect_eol(state.cursor);
   }
+  if (UNLIKELY(state.row_names_sv.size() > (size_t)INT_MAX)) {
+    state.cursor.error("fast MPS parser requires <= INT_MAX rows, got %zu",
+                       state.row_names_sv.size());
+  }
 }
 
 template <typename i_t, typename f_t>
@@ -2969,7 +2973,7 @@ static padded_memory_input_t read_compressed_mps_file(const std::string& path)
   if (buffer.empty()) { buffer.push_back('\0'); }
 
   std::size_t input_size = buffer.size() - 1;
-  buffer.resize(input_size + input_buffer_padding_bytes, '\0');
+  ensure_input_buffer_padding(buffer, input_size);
   return {std::move(buffer), input_size, get_file_size(path)};
 }
 
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index e874011db8..76ee5b6b5b 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -5,6 +5,7 @@
 #include "nvtx_ranges.hpp"
 
 #include <utilities/error.hpp>
+#include <utilities/scope_guard.hpp>
 
 #include <fcntl.h>
 #include <sys/mman.h>
@@ -66,6 +67,18 @@ std::size_t add_input_padding(std::size_t size)
 
 }  // namespace
 
+void ensure_input_buffer_padding(std::vector<char>& buffer, std::size_t input_size)
+{
+  if (input_size > buffer.size()) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "input_size %zu exceeds buffer size %zu",
+                    input_size,
+                    buffer.size());
+  }
+  std::size_t required = add_input_padding(input_size);
+  if (buffer.size() < required) { buffer.resize(required, '\0'); }
+}
+
 std::size_t get_file_size(int fd, const std::string& path)
 {
   struct stat st;
@@ -128,22 +141,29 @@ bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset)
 raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
 {
   MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io);
-  buffered_fd_ = ::open(path.c_str(), O_RDONLY);
-  if (buffered_fd_ < 0) {
+  int buffered_fd = ::open(path.c_str(), O_RDONLY);
+  cuopt::scope_guard close_buffered([&] {
+    if (buffered_fd >= 0) { ::close(buffered_fd); }
+  });
+  if (buffered_fd < 0) {
     mps_parser_fail(error_type_t::RuntimeError,
                     "Failed to open raw MPS file '%s': %s",
                     path.c_str(),
                     std::strerror(errno));
   }
 
-  file_size_         = get_file_size(buffered_fd_, path);
-  fd_                = buffered_fd_;
-  bool use_direct_io = file_size_ > raw_input_direct_io_threshold_bytes;
-  if (use_direct_io) {
+  int direct_fd = -1;
+  cuopt::scope_guard close_direct([&] {
+    if (direct_fd >= 0) { ::close(direct_fd); }
+  });
+
+  file_size_  = get_file_size(buffered_fd, path);
+  int read_fd = buffered_fd;
+  if (file_size_ > raw_input_direct_io_threshold_bytes) {
 #ifdef O_DIRECT
-    int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT);
+    direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT);
     if (direct_fd >= 0) {
-      fd_        = direct_fd;
+      read_fd    = direct_fd;
       direct_io_ = true;
     }
 #endif
@@ -160,6 +180,11 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
 
   section_scanner_ =
     std::make_unique<mps_section_block_scanner_t>(output_data_, window_count_, registry_);
+
+  buffered_fd_ = buffered_fd;
+  buffered_fd  = -1;
+  fd_          = read_fd;
+  if (read_fd == direct_fd) { direct_fd = -1; }
 }
 
 raw_input_stream_t::~raw_input_stream_t()
@@ -229,6 +254,7 @@ memory_input_stream_t::memory_input_stream_t(std::vector<char> buffer,
                                              std::size_t compressed_size)
   : buffer_(std::move(buffer)), input_size_(input_size), compressed_size_(compressed_size)
 {
+  ensure_input_buffer_padding(buffer_, input_size_);
   section_scanner_ = std::make_unique<mps_section_block_scanner_t>(buffer_.data(), 1, registry_);
 }
 
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
index 802d6fe191..8c24a3d297 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.hpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -37,12 +37,15 @@
 #include <mutex>
 #include <string>
 #include <thread>
+#include <utility>
 #include <vector>
 
 namespace cuopt::linear_programming::io::detail {
 
 inline constexpr std::size_t input_buffer_padding_bytes = 64;
 
+void ensure_input_buffer_padding(std::vector<char>& buffer, std::size_t input_size);
+
 struct lz4_pipeline_t;
 
 /**
@@ -122,6 +125,27 @@ class parallel_error_latch_t {
   std::atomic_bool stopped_{false};
 };
 
+class scoped_thread_group {
+ public:
+  void reserve(std::size_t count) { threads_.reserve(count); }
+
+  template <typename F>
+  void emplace(F&& f)
+  {
+    threads_.emplace_back(std::forward<F>(f));
+  }
+
+  ~scoped_thread_group()
+  {
+    for (auto& thread : threads_) {
+      if (thread.joinable()) { thread.join(); }
+    }
+  }
+
+ private:
+  std::vector<std::thread> threads_;
+};
+
 // Work-stealing parallel loop over [0, count). Each of thread_count workers pulls
 // the next index from a shared counter and invokes body(index). An exception
 // escaping body is captured into the latch and stops the loop; the caller is
@@ -136,10 +160,10 @@ void parallel_for_indexed(std::size_t count,
                           Body body)
 {
   std::atomic_size_t next{0};
-  std::vector<std::thread> workers;
+  scoped_thread_group workers;
   workers.reserve(thread_count);
   for (std::size_t t = 0; t < thread_count; ++t) {
-    workers.emplace_back([&, t] {
+    workers.emplace([&, t] {
       if (thread_name_prefix != nullptr) {
         std::string name = thread_name_prefix + std::to_string(t);
         nvtx::name_current_thread(name.c_str());
@@ -156,9 +180,6 @@ void parallel_for_indexed(std::size_t count,
       }
     });
   }
-  for (auto& worker : workers) {
-    worker.join();
-  }
 }
 
 struct input_stream_view_t {
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index 2c40d6745b..d26109b011 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -6,6 +6,7 @@
 #include "nvtx_ranges.hpp"
 
 #include <utilities/error.hpp>
+#include <utilities/scope_guard.hpp>
 
 #include <cuda/cmath>
 
@@ -344,12 +345,15 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
 
   ensure_lz4_runtime_available();
 
-  fd_ = open_lz4_fd(path);
-  ::posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
+  int fd = open_lz4_fd(path);
+  cuopt::scope_guard close_fd([&] {
+    if (fd >= 0) { ::close(fd); }
+  });
+  ::posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
 
-  compressed_size_ = get_file_size(fd_, path);
+  compressed_size_ = get_file_size(fd, path);
 
-  lz4_frame_header_t header = parse_lz4_frame_header(fd_, path, compressed_size_);
+  lz4_frame_header_t header = parse_lz4_frame_header(fd, path, compressed_size_);
   block_max_size_           = header.block_max_size;
   content_size_             = header.content_size;
   header_size_              = header.header_size;
@@ -378,6 +382,9 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
 
   section_scanner_ =
     std::make_unique<mps_section_block_scanner_t>(output_data_, block_slot_count_, registry_);
+
+  fd_ = fd;
+  fd  = -1;
 }
 
 lz4_input_stream_t::~lz4_input_stream_t()
@@ -455,13 +462,14 @@ struct lz4_pipeline_t {
 
   void run()
   {
-    std::thread scanner(&lz4_pipeline_t::run_scanner_stage, this);
-    start_decoders();
-    run_readers();
-
-    scanner.join();
-    for (auto& worker : decoders) {
-      worker.join();
+    {
+      scoped_thread_group background;
+      background.reserve(io_threads + 1);
+      background.emplace([this] { run_scanner_stage(); });
+      for (std::size_t t = 0; t < io_threads; ++t) {
+        background.emplace([this, t] { run_decoder_stage(t); });
+      }
+      run_readers();
     }
     latch.rethrow_if_error();
   }
@@ -548,14 +556,6 @@ struct lz4_pipeline_t {
     window_cv.notify_all();
   }
 
-  void start_decoders()
-  {
-    decoders.reserve(io_threads);
-    for (std::size_t t = 0; t < io_threads; ++t) {
-      decoders.emplace_back(&lz4_pipeline_t::run_decoder_stage, this, t);
-    }
-  }
-
   void run_decoder_stage(std::size_t tid)
   {
     try {
@@ -846,7 +846,6 @@ struct lz4_pipeline_t {
 
   std::atomic_size_t blocks_scanned{0};
   std::vector<std::vector<char>> crossing_payloads;
-  std::vector<std::thread> decoders;
 };
 
 void lz4_input_stream_t::run_decode_tasks()

From a1e14d504425e54448a480c7e17d6b35553119bb Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Fri, 12 Jun 2026 07:11:03 -0700
Subject: [PATCH 13/22] ai review

---
 .../experimental_mps_fast/lz4_file_reader.cpp   | 17 ++++++++++++-----
 .../mps_section_scanner.cpp                     |  3 ++-
 .../mps_section_scanner.hpp                     |  4 ++--
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index d26109b011..85309efaa2 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -462,15 +462,22 @@ struct lz4_pipeline_t {
 
   void run()
   {
+    std::exception_ptr startup_error;
     {
       scoped_thread_group background;
-      background.reserve(io_threads + 1);
-      background.emplace([this] { run_scanner_stage(); });
-      for (std::size_t t = 0; t < io_threads; ++t) {
-        background.emplace([this, t] { run_decoder_stage(t); });
+      try {
+        background.reserve(io_threads + 1);
+        background.emplace([this] { run_scanner_stage(); });
+        for (std::size_t t = 0; t < io_threads; ++t) {
+          background.emplace([this, t] { run_decoder_stage(t); });
+        }
+        run_readers();
+      } catch (...) {
+        startup_error = std::current_exception();
+        fail_and_notify(startup_error);
       }
-      run_readers();
     }
+    if (startup_error) { std::rethrow_exception(startup_error); }
     latch.rethrow_if_error();
   }
 
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
index a3c9fe87a3..8d39233e4d 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -117,7 +117,9 @@ bool mps_phase_registry_t::ready(mps_phase_kind phase) const
 mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const
 {
   std::size_t idx = phase_index(phase);
-  assert(ready_[idx].load(std::memory_order_acquire));
+  // Load outside assert() so the acquire still happens when NDEBUG compiles the check away.
+  [[maybe_unused]] bool is_ready = ready_[idx].load(std::memory_order_acquire);
+  assert(is_ready);
   return ranges_[idx];
 }
 
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
index 7fd249a7e8..824e976c4f 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
@@ -51,8 +51,8 @@ class mps_phase_registry_t {
   void attach_event(mps_phase_kind phase, omp_event_handle_t event);
 
   bool ready(mps_phase_kind phase) const;
-  // range() is lock-free: callers must observe ready(phase)==true first. The
-  // acquire load in ready() pairs with publish()'s release store before ranges_.
+  // range() acquire-loads ready_[phase] (pairs with publish()'s release store) before
+  // reading ranges_[phase]. Callers must not invoke range() until the phase is published.
   mps_phase_range_t range(mps_phase_kind phase) const;
 
   void publish_endata(const char* begin, bool present);

From fe0aa31301ac08467a714a12c41d4ec7c7a7f7d1 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Fri, 12 Jun 2026 07:55:12 -0700
Subject: [PATCH 14/22] Comments on the build flags

---
 cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 4ecb1e9a46..e50dc52172 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -461,7 +461,7 @@ if (HOST_LINEINFO)
     set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
 endif ()
 
-# Needed for the fast MPS parser
+# Needed for the fast MPS parser. The flags set below target features available on all x86-64-v3 CPUs (essentially Haswell, ~2013, and newer)
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND
         CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$")
     set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}

From cfaccc3eb52120c36cd8a7b42121d1c6e6c9d6d5 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Sat, 13 Jun 2026 04:36:45 -0700
Subject: [PATCH 15/22] gate O_DIRECT behind non-nfs, add missing license
 notices

---
 .../io/experimental_mps_fast/file_reader.cpp  | 65 ++++++++++++++++++-
 .../io/experimental_mps_fast/file_reader.hpp  |  3 +
 thirdparty/THIRD_PARTY_LICENSES               | 60 +++++++++++++++++
 3 files changed, 125 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index 76ee5b6b5b..c00f84eb5e 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -10,11 +10,13 @@
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <unistd.h>
 
 #include <algorithm>
 #include <atomic>
 #include <cerrno>
+#include <chrono>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
@@ -37,6 +39,7 @@ namespace {
 constexpr std::size_t raw_input_window_bytes              = 64ull * 1024ull * 1024ull;
 constexpr std::size_t raw_input_max_read_threads          = 8;
 constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull;
+constexpr long nfs_super_magic                            = 0x6969;
 
 bool path_has_suffix(const std::string& path, const char* suffix) noexcept
 {
@@ -65,6 +68,12 @@ std::size_t add_input_padding(std::size_t size)
   return size + input_buffer_padding_bytes;
 }
 
+bool is_nfs_backed_path(const std::string& path) noexcept
+{
+  struct statfs fs;
+  return ::statfs(path.c_str(), &fs) == 0 && fs.f_type == nfs_super_magic;
+}
+
 }  // namespace
 
 void ensure_input_buffer_padding(std::vector<char>& buffer, std::size_t input_size)
@@ -157,9 +166,13 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
     if (direct_fd >= 0) { ::close(direct_fd); }
   });
 
-  file_size_  = get_file_size(buffered_fd, path);
-  int read_fd = buffered_fd;
-  if (file_size_ > raw_input_direct_io_threshold_bytes) {
+  file_size_                   = get_file_size(buffered_fd, path);
+  int read_fd                  = buffered_fd;
+  bool large_enough_for_direct = file_size_ > raw_input_direct_io_threshold_bytes;
+  bool nfs_backed              = is_nfs_backed_path(path);
+  // Buffered reads are consistently faster than O_DIRECT on our NFS mounts;
+  // keep direct I/O for large local files where it wins.
+  if (large_enough_for_direct && !nfs_backed) {
 #ifdef O_DIRECT
     direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT);
     if (direct_fd >= 0) {
@@ -231,6 +244,10 @@ void raw_input_stream_t::run_decode_tasks()
   // Each window is read independently and handed to the scanner, which owns the
   // contiguous decoded-byte frontier and the parallel section publication.
   parallel_error_latch_t latch;
+#ifdef MPS_FAST_TIMERS
+  read_window_ms_.assign(window_count_, 0);  // one slot per window; workers store by index
+  auto read_wall_start = std::chrono::steady_clock::now();
+#endif
   parallel_for_indexed(
     window_count_, thread_count, latch, "raw-input-read-", [&](std::size_t index) {
       MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io);
@@ -238,13 +255,56 @@ void raw_input_stream_t::run_decode_tasks()
       std::size_t size   = std::min(window_bytes_, file_size_ - offset);
       {
         MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io);
+#ifdef MPS_FAST_TIMERS
+        auto start = std::chrono::steady_clock::now();
+#endif
         read_window_payload(offset, size);
+#ifdef MPS_FAST_TIMERS
+        auto end     = std::chrono::steady_clock::now();
+        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+        read_window_ms_[index] =
+          (uint32_t)std::min<long long>(elapsed.count(), std::numeric_limits<uint32_t>::max());
+#endif
       }
       MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io);
       section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size);
     });
+#ifdef MPS_FAST_TIMERS
+  auto read_wall_end = std::chrono::steady_clock::now();
+#endif
   latch.rethrow_if_error();
 
+#ifdef MPS_FAST_TIMERS
+  if (!read_window_ms_.empty()) {
+    std::vector<uint32_t> sorted = read_window_ms_;
+    std::sort(sorted.begin(), sorted.end());
+    auto percentile = [&](double pct) {
+      std::size_t idx = (std::size_t)std::min<double>((double)(sorted.size() - 1),
+                                                      pct * (double)(sorted.size() - 1));
+      return sorted[idx];
+    };
+    uint64_t total_ms = 0;
+    for (uint32_t value : read_window_ms_) {
+      total_ms += value;
+    }
+    std::fprintf(
+      stderr,
+      "[RAW_READ_LATENCY] windows=%zu wall_ms=%lld total_window_ms=%llu avg_ms=%.3f min_ms=%u "
+      "p50_ms=%u p90_ms=%u p99_ms=%u max_ms=%u\n",
+      read_window_ms_.size(),
+      (long long)std::chrono::duration_cast<std::chrono::milliseconds>(read_wall_end -
+                                                                       read_wall_start)
+        .count(),
+      (unsigned long long)total_ms,
+      (double)total_ms / (double)read_window_ms_.size(),
+      sorted.front(),
+      percentile(0.50),
+      percentile(0.90),
+      percentile(0.99),
+      sorted.back());
+  }
+#endif
+
   output_view_size_ = section_scanner_->ready_bytes();
   section_scanner_->publish_ready(output_view_size_);
 }
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
index 8c24a3d297..5472434b1a 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.hpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -283,6 +283,9 @@ class raw_input_stream_t : public input_stream_base_t<raw_input_stream_t> {
   std::size_t file_size_          = 0;
   std::size_t window_bytes_       = 0;
   std::size_t window_count_       = 0;
+#ifdef MPS_FAST_TIMERS
+  std::vector<uint32_t> read_window_ms_;
+#endif
   std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
 };
 
diff --git a/thirdparty/THIRD_PARTY_LICENSES b/thirdparty/THIRD_PARTY_LICENSES
index a70fa8ce1c..e09000b56d 100644
--- a/thirdparty/THIRD_PARTY_LICENSES
+++ b/thirdparty/THIRD_PARTY_LICENSES
@@ -512,3 +512,63 @@ Copyright notice:
 
   Jean-loup Gailly        Mark Adler
   jloup@gzip.org          madler@alumni.caltech.edu
+
+
+-----------------------------------------------------------------------------------------
+== LZ4
+
+Usage: cuopt uses LZ4 through dynamically loaded library symbols
+
+Copyright (c) Yann Collet. All rights reserved.
+
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+-----------------------------------------------------------------------------------------
+== SIMDe
+
+Usage: cuopt uses SIMDe in experimental fast MPS parser SIMD compatibility code
+
+Copyright (c) 2017 Evan Nemerson <evan@nemerson.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

From 72208385fea719e1deef478ca1d04505f2789f97 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Mon, 15 Jun 2026 02:08:42 -0700
Subject: [PATCH 16/22] fix bitwise comps, more cleanup and comments

---
 .../io/experimental_mps_fast/fast_parser.cpp  | 112 +++++++++++-----
 .../io/experimental_mps_fast/file_reader.cpp  |  18 +--
 .../experimental_mps_fast/lz4_file_reader.cpp | 125 +++++++++++++-----
 .../mps_section_scanner.cpp                   |  13 +-
 .../mps_section_scanner.hpp                   |  23 ++++
 .../fast_parser_edge_test.cpp                 |  44 ++++--
 6 files changed, 244 insertions(+), 91 deletions(-)

diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index 45eccce23d..8eae082e25 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -435,6 +435,20 @@ static inline void observe_dense_name(bool& candidate,
   observed_count++;
 }
 
+// Maps MPS row/column names to indices via one of two strategies, chosen per problem:
+//
+//   * dense_ordered - when every name in a section is a shared prefix followed by a
+//     contiguous run of integers (e.g. R0001, R0002, ... or x1, x2, ...). The index is
+//     then computed straight from the parsed integer (value - min_id), so no hash table
+//     is built or probed. This is the common, fast case for solver-generated models.
+//   * hash          - the general fallback (smallstr_hash_table_t) for arbitrary names.
+//
+// Each section decides its own mode while scanning: it stays a dense_ordered "candidate"
+// as long as names keep matching the prefix + consecutive-integer + zero-pad-width rule
+// (see observe_dense_name), and the first violation drops it to the hash path. The chosen
+// mode lives in row_index_mode / col_index_mode, and every lookup branches on it
+// (row_lookup / read_row_lookup vs the dense_ordered variants below). Holding this in mind
+// explains most of the paired/dual code paths throughout this file.
 template <typename i_t, typename f_t>
 struct parse_state_t {
   mps_data_model_t<i_t, f_t>& problem;
@@ -510,6 +524,51 @@ struct parse_state_t {
     return true;
   }
 
+  // Insert all rows into the hash table. The perf-counter instrumentation is isolated in
+  // these two helpers so its #ifdefs do not fragment init_row_hash_table_impl's setup flow;
+  // both compile down to a bare insert loop when MPS_FAST_PERF_COUNTERS is off.
+  void insert_rows_partitioned(
+    int num_threads,
+    const std::array<size_t, MPS_ROW_HASH_PARTITIONS + 1>& partition_offsets,
+    const std::vector<size_t>& row_order,
+    const std::vector<uint32_t>& row_hashes)
+  {
+    scoped_timer_t timer("row_hash_insert_partitioned");
+#ifdef MPS_FAST_PERF_COUNTERS
+    std::vector<perf_counter_snapshot_t> perf_snapshots(MPS_ROW_HASH_PARTITIONS);
+#endif
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+    for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) {
+      size_t p = (size_t)part_id;
+#ifdef MPS_FAST_PERF_COUNTERS
+      thread_perf_counters_t perf_counters;
+#endif
+      for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) {
+        size_t idx = row_order[pos];
+        row_hash_.insert_partition(p, row_names_sv[idx], row_hashes[idx], idx);
+      }
+#ifdef MPS_FAST_PERF_COUNTERS
+      perf_snapshots[p] = perf_counters.stop();
+#endif
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("row_hash_insert_partitioned", perf_snapshots);
+#endif
+  }
+
+  void insert_rows_serial(size_t n_rows)
+  {
+#ifdef MPS_FAST_PERF_COUNTERS
+    thread_perf_counters_t perf_counters;
+#endif
+    for (size_t idx = 0; idx < n_rows; ++idx) {
+      row_hash_.insert_serial(row_names_sv[idx], idx);
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("row_hash_insert_all", {perf_counters.stop()});
+#endif
+  }
+
   void init_row_hash_table_impl()
   {
     scoped_timer_t timer("row_hash_init_total");
@@ -580,37 +639,9 @@ struct parse_state_t {
       scoped_timer_t timer("row_hash_insert_all");
       row_hash_.reset_build_probe_stats();
       if (use_partitioned) {
-        scoped_timer_t timer("row_hash_insert_partitioned");
-#ifdef MPS_FAST_PERF_COUNTERS
-        std::vector<perf_counter_snapshot_t> perf_snapshots(MPS_ROW_HASH_PARTITIONS);
-#endif
-#pragma omp parallel for schedule(static) num_threads(num_threads)
-        for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) {
-          size_t p = (size_t)part_id;
-#ifdef MPS_FAST_PERF_COUNTERS
-          thread_perf_counters_t perf_counters;
-#endif
-          for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) {
-            size_t idx = row_order[pos];
-            row_hash_.insert_partition(p, row_names_sv[idx], row_hashes[idx], idx);
-          }
-#ifdef MPS_FAST_PERF_COUNTERS
-          perf_snapshots[p] = perf_counters.stop();
-#endif
-        }
-#ifdef MPS_FAST_PERF_COUNTERS
-        print_perf_totals("row_hash_insert_partitioned", perf_snapshots);
-#endif
+        insert_rows_partitioned(num_threads, partition_offsets, row_order, row_hashes);
       } else {
-#ifdef MPS_FAST_PERF_COUNTERS
-        thread_perf_counters_t perf_counters;
-#endif
-        for (size_t idx = 0; idx < n_rows; ++idx) {
-          row_hash_.insert_serial(row_names_sv[idx], idx);
-        }
-#ifdef MPS_FAST_PERF_COUNTERS
-        print_perf_totals("row_hash_insert_all", {perf_counters.stop()});
-#endif
+        insert_rows_serial(n_rows);
       }
       row_hash_.print_build_probe_report(n_rows);
     }
@@ -798,6 +829,11 @@ static std::vector<row_chunk_boundary_t> compute_row_chunk_boundaries(const char
 }
 
 // reads the row section in chunks and inserts into the worker's hash table partition
+// Parallel ROWS parser: count constraints per chunk, prefix-sum, then fill the output arrays
+// in parallel (with per-chunk dense-name reconciliation at the end). Must keep the same line
+// grammar as its serial twin parse_rows_section_serial_impl; parse_rows_section chooses between
+// them by size. Returns false if a chunk hit a malformed line (nothing committed for the fill
+// pass), so the caller can reset and retry serially for clean error reporting.
 template <typename i_t, typename f_t>
 static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
                                              const char* rows_start,
@@ -1808,6 +1844,9 @@ static void materialize_problem_csr(parse_state_t<i_t, f_t>& state)
   state.temp_A_indices_region.reset();
 }
 
+// COLUMNS is always parsed chunk-parallel: each chunk is counted/parsed by parse_columns_chunk
+// and the per-chunk results are stitched together by merge_chunk_results_to_csr. There is no
+// separate serial implementation -- a single thread just runs one chunk through the same path.
 template <typename i_t, typename f_t>
 static void parse_columns_section_parallel(parse_state_t<i_t, f_t>& state,
                                            int num_threads,
@@ -1997,6 +2036,10 @@ static bool apply_bound_record(std::string_view bound_type,
   return true;
 }
 
+// Parallel BOUNDS parser for the common dense/ordered-name case. Returns false when the section
+// is too small or not safely parallelizable, so parse_bounds_section resets and falls back to its
+// serial path. Bound-type semantics (LO/UP/FX/...) are shared with the serial path through
+// apply_bound_record, so the two cannot drift.
 template <typename i_t, typename f_t>
 static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
                                                 cursor_t& cursor,
@@ -2791,6 +2834,10 @@ static mps_data_model_t<i_t, f_t> parse_mps_fast_stream(Stream& stream,
     input.registry->publish(mps_phase_kind::quadratic, empty);
   };
 
+  // These ints carry no data; they exist only as OpenMP task-dependency tokens. A task's
+  // depend(out: X) "produces" X and depend(in: X) waits on it, so the phase ordering in the
+  // task graph below (e.g. bounds after columns_done, because bounds reference variable names)
+  // is expressed purely through which tokens each task depends on.
   int header_ready = 0, rows_ready = 0, columns_ready = 0;
   int rhs_ready = 0, bounds_ready = 0, ranges_ready = 0, quadratic_ready = 0;
   int header_done = 0, rows_done = 0, columns_done = 0;
@@ -2807,6 +2854,11 @@ static mps_data_model_t<i_t, f_t> parse_mps_fast_stream(Stream& stream,
 
 #pragma omp single
     {
+      // Bridge between the producer and the parse tasks: each detached task below blocks
+      // until run_decode_tasks() publishes that phase's byte range into the registry, then
+      // completes its event and fulfills depend(out: <phase>_ready) -- releasing the matching
+      // parse task. This is what lets ROWS parsing start the instant the ROWS bytes are
+      // decoded, overlapping with the decode of later sections.
       omp_event_handle_t ev_header;
 #pragma omp task detach(ev_header) depend(out : header_ready)
       {
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index c00f84eb5e..1ccedba52e 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -7,6 +7,8 @@
 #include <utilities/error.hpp>
 #include <utilities/scope_guard.hpp>
 
+#include <cuda/cmath>
+
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
@@ -48,18 +50,6 @@ bool path_has_suffix(const std::string& path, const char* suffix) noexcept
          path.compare(path.size() - suffix_len, suffix_len, suffix) == 0;
 }
 
-std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
-{
-  if (alignment == 0) { return value; }
-  std::size_t remainder = value % alignment;
-  if (remainder == 0) { return value; }
-  std::size_t increment = alignment - remainder;
-  if (value > std::numeric_limits<std::size_t>::max() - increment) {
-    mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow");
-  }
-  return value + increment;
-}
-
 std::size_t add_input_padding(std::size_t size)
 {
   if (size > std::numeric_limits<std::size_t>::max() - input_buffer_padding_bytes) {
@@ -184,8 +174,8 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
   window_bytes_ = raw_input_window_bytes;
   window_count_ = std::max<std::size_t>(1, (file_size_ + window_bytes_ - 1) / window_bytes_);
 
-  output_mapped_size_ = round_up_to_multiple(
-    std::max<std::size_t>(add_input_padding(file_size_), 1), system_page_size());
+  output_mapped_size_ =
+    cuda::round_up(std::max<std::size_t>(add_input_padding(file_size_), 1), system_page_size());
   output_region_ = mmap_region_t::anonymous(
     output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer");
   output_data_ = output_region_.char_data();
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index 85309efaa2..4696b0ae81 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -240,20 +240,16 @@ class lz4_resident_windows_t {
     if (windows_.empty()) {
       mps_parser_fail(error_type_t::RuntimeError, "LZ4 resident window lookup with no windows");
     }
-    std::size_t lo = 0;
-    std::size_t hi = windows_.size();
-    while (lo < hi) {
-      std::size_t mid = lo + (hi - lo) / 2;
-      const auto& w   = windows_[mid];
-      if (offset < w.file_offset) {
-        hi = mid;
-      } else if (offset >= w.file_offset + w.size) {
-        lo = mid + 1;
-      } else {
-        return w;
-      }
+    std::size_t window_stride = windows_.size() > 1 ? windows_[1].file_offset : windows_[0].size;
+    std::size_t idx           = offset / window_stride;
+    if (idx >= windows_.size()) {
+      mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows");
     }
-    mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows");
+    const auto& w = windows_[idx];
+    if (offset >= w.file_offset + w.size) {
+      mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows");
+    }
+    return w;
   }
 
   std::vector<lz4_resident_window_t>& windows_;
@@ -431,22 +427,34 @@ struct resident_block_desc_t {
   bool uncompressed               = false;
 };
 
+struct window_state_t {
+  std::atomic<uint32_t> decode_refs{0};  // +1 per zero-copy block in scan_one_block, -1 in release_block_window_ref
+  std::atomic<uint8_t> released{0};      // flipped 0 -> 1 by the CAS in try_release_window
+};
+
 // Two distinct units flow through this pipeline:
 //   * window  - a fixed-size span of the compressed file read by the I/O stage.
 //   * block   - a single independent LZ4 data block (decompressed unit) that the
 //               metadata scanner discovers inside the resident windows.
 // Windows feed blocks; the decoded blocks are handed to the section scanner,
 // which owns the contiguous decoded-byte frontier and section publication.
+//
+// Locking (the grouped members below repeat each guard in context):
+//   * window_mutex          - guards window_done[]   (reader -> scanner readiness)
+//   * desc_mutex            - guards desc_queue + scanner_done (scanner -> decoders)
+//   * window_release_mutex  - serializes freeing a window buffer + RSS accounting
+//   * window_state_[].decode_refs/.released, scanned_through_, blocks_scanned,
+//     compressed_resident_bytes - lock-free atomics
+// Locks are never nested. The scanner thread is the sole writer of the frame walk,
+// so offset / decompressed_offset are mutated without locking.
 struct lz4_pipeline_t {
   explicit lz4_pipeline_t(lz4_input_stream_t& input_)
     : input(input_),
       window_count(cuda::ceil_div(input.compressed_size_, window_bytes)),
       windows(window_count),
+      window_state_(std::make_unique<window_state_t[]>(window_count)),
       io_threads(std::min(lz4_input_max_io_threads, window_count)),
-      window_done(window_count, 0),
-      window_refs(window_count),
-      window_scanned(window_count),
-      window_released(window_count)
+      window_done(window_count, 0)
   {
     for (std::size_t i = 0; i < window_count; ++i) {
       std::size_t offset     = i * window_bytes;
@@ -454,12 +462,23 @@ struct lz4_pipeline_t {
       windows[i].index       = i;
       windows[i].file_offset = offset;
       windows[i].size        = size;
-      window_refs[i].store(0, std::memory_order_relaxed);
-      window_scanned[i].store(0, std::memory_order_relaxed);
-      window_released[i].store(0, std::memory_order_relaxed);
     }
   }
 
+  // Runs the three-stage pipeline to completion:
+  //
+  //   readers --window_done/window_cv--> scanner --desc_queue/desc_cv--> decoders
+  //
+  //   * readers  (io_threads): pread fixed compressed windows into RAM, mark ready.
+  //   * scanner  (1 thread)  : walk the LZ4 frame in order, slice it into block
+  //                            descriptors, push them to decoders in batches.
+  //   * decoders (io_threads): decompress blocks into the output buffer and hand
+  //                            each to the section scanner, which advances the
+  //                            decoded-byte frontier and publishes section ranges.
+  //
+  // Consumers are spawned first so they are parked waiting before the readers (which
+  // run on this thread) start producing. scoped_thread_group joins the background
+  // threads on scope exit; any stage's failure is captured in `latch` and rethrown here.
   void run()
   {
     std::exception_ptr startup_error;
@@ -471,7 +490,7 @@ struct lz4_pipeline_t {
         for (std::size_t t = 0; t < io_threads; ++t) {
           background.emplace([this, t] { run_decoder_stage(t); });
         }
-        run_readers();
+        run_readers();  // produce on the calling thread, now that consumers are parked
       } catch (...) {
         startup_error = std::current_exception();
         fail_and_notify(startup_error);
@@ -503,12 +522,11 @@ struct lz4_pipeline_t {
   void try_release_window(std::size_t index)
   {
     if (index >= window_count) { return; }
-    if (window_scanned[index].load(std::memory_order_acquire) == 0) { return; }
-    if (window_refs[index].load(std::memory_order_acquire) != 0) { return; }
+    if (index >= scanned_through_.load(std::memory_order_acquire)) { return; }
+    window_state_t& state = window_state_[index];
+    if (state.decode_refs.load(std::memory_order_acquire) != 0) { return; }
     uint8_t expected = 0;
-    if (!window_released[index].compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) {
-      return;
-    }
+    if (!state.released.compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) { return; }
     std::lock_guard<std::mutex> lock(window_release_mutex);
     if (windows[index].data) {
       windows[index].data.reset();
@@ -518,9 +536,13 @@ struct lz4_pipeline_t {
 
   void mark_windows_scanned_before(std::size_t offset)
   {
-    std::size_t last_excl = std::min(window_count, offset / window_bytes);
-    for (std::size_t wi = 0; wi < last_excl; ++wi) {
-      window_scanned[wi].store(1, std::memory_order_release);
+    assert(offset >= last_mark_offset_);
+    last_mark_offset_               = offset;
+    std::size_t new_scanned_through = std::min(window_count, offset / window_bytes);
+    std::size_t prev                = scanned_through_.load(std::memory_order_relaxed);
+    if (new_scanned_through <= prev) { return; }
+    scanned_through_.store(new_scanned_through, std::memory_order_release);
+    for (std::size_t wi = prev; wi < new_scanned_through; ++wi) {
       try_release_window(wi);
     }
   }
@@ -625,7 +647,8 @@ struct lz4_pipeline_t {
   void release_block_window_ref(const resident_block_desc_t& block)
   {
     if (block.window_index == std::numeric_limits<std::size_t>::max()) { return; }
-    uint32_t old = window_refs[block.window_index].fetch_sub(1, std::memory_order_acq_rel);
+    uint32_t old =
+      window_state_[block.window_index].decode_refs.fetch_sub(1, std::memory_order_acq_rel);
     assert(old > 0);
     if (old == 1) { try_release_window(block.window_index); }
   }
@@ -743,6 +766,7 @@ struct lz4_pipeline_t {
                                        std::size_t& offset,
                                        std::size_t& decompressed_offset)
   {
+    // --- Decode the block-size word and validate it ---------------------------
     bool uncompressed              = (raw_block_size & lz4_uncompressed_block) != 0;
     std::size_t block_payload_size = raw_block_size & lz4_block_size_mask;
     if (block_payload_size == 0) {
@@ -757,12 +781,16 @@ struct lz4_pipeline_t {
                       "LZ4 frame contains more blocks than content size allows");
     }
 
+    // --- Wait until the payload bytes are resident ----------------------------
     wait_range_ready(offset, block_payload_size);
     if (offset + block_payload_size > input.compressed_size_) {
       mps_parser_fail(error_type_t::ValidationError,
                       "truncated LZ4 frame while reading block payload");
     }
 
+    // --- Determine the decompressed size --------------------------------------
+    // Compressed blocks expand to block_max_size_ (or the content-size remainder
+    // for the final block); uncompressed blocks keep their payload size.
     std::size_t decompressed_size = block_payload_size;
     if (!uncompressed) {
       decompressed_size =
@@ -775,6 +803,12 @@ struct lz4_pipeline_t {
       mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size");
     }
 
+    // --- Stage the payload for the decoder ------------------------------------
+    // Fast path: the whole payload lives in one window, so point the decoder
+    // straight at it (zero copy) and pin that window with a decode_refs bump until
+    // the decode completes. Otherwise it straddles a window boundary: copy it out
+    // into crossing_payloads, which stays alive for the whole run, so no window pin
+    // is needed (and the source window can be released as soon as it is scanned).
     const char* src          = resident.ptr_if_contiguous(offset, block_payload_size);
     std::size_t window_index = std::numeric_limits<std::size_t>::max();
     if (src == nullptr) {
@@ -783,9 +817,10 @@ struct lz4_pipeline_t {
       src = crossing_payloads.back().data();
     } else {
       window_index = offset / window_bytes;
-      window_refs[window_index].fetch_add(1, std::memory_order_acq_rel);
+      window_state_[window_index].decode_refs.fetch_add(1, std::memory_order_acq_rel);
     }
 
+    // --- Record the descriptor and advance past the block (+ optional checksum) -
     resident_block_desc_t block{src,
                                 block_payload_size,
                                 decompressed_offset,
@@ -829,28 +864,50 @@ struct lz4_pipeline_t {
     }
   }
 
+  // ---- Input + chunking (immutable after construction) ------------------------
+  // The compressed file is split into fixed-size `windows`; `io_threads` reader
+  // threads pull them by index.
   lz4_input_stream_t& input;
   const std::size_t window_bytes = lz4_pipeline_batch_bytes;
   const std::size_t window_count;
   std::vector<lz4_resident_window_t> windows;
   const std::size_t io_threads;
 
+  // First-error-wins latch shared by all three stages: stops the pipeline and
+  // retains the first exception for run() to rethrow after the threads join.
   parallel_error_latch_t latch;
 
+  // ---- Reader -> scanner readiness  (guarded by window_mutex) -----------------
+  // A reader sets window_done[i]=1 once window i is resident; the scanner blocks
+  // on window_cv until every window covering the bytes it needs is ready.
   std::vector<unsigned char> window_done;
-  std::vector<std::atomic<uint32_t>> window_refs;
-  std::vector<std::atomic<uint8_t>> window_scanned;
-  std::vector<std::atomic<uint8_t>> window_released;
   std::mutex window_mutex;
   std::condition_variable window_cv;
+
+  // ---- Window lifecycle / early release ---------------------------------------
+  // windows[i].data is freed exactly once, when the metadata scan has passed window i
+  // (scanned_through_ > i) AND no decoder still pins it (window_state_[i].decode_refs == 0).
+  // scanned_through_ advances monotonically in mark_windows_scanned_before (last_mark_offset_
+  // asserts that monotonicity); decode_refs bumps in scan_one_block and drops in
+  // release_block_window_ref; the per-window `released` CAS makes the free exactly-once.
+  // window_release_mutex serializes the data.reset() + compressed_resident_bytes accounting.
+  std::unique_ptr<window_state_t[]> window_state_;
+  std::atomic_size_t scanned_through_{0};
+  std::size_t last_mark_offset_{0};
   std::mutex window_release_mutex;
   std::atomic_size_t compressed_resident_bytes{0};
 
+  // ---- Scanner -> decoder queue  (guarded by desc_mutex) ----------------------
+  // The scanner pushes batches of block descriptors; decoders pop them via desc_cv.
+  // scanner_done signals the scanner has emitted its final batch.
   std::deque<std::vector<resident_block_desc_t>> desc_queue;
   bool scanner_done = false;
   std::mutex desc_mutex;
   std::condition_variable desc_cv;
 
+  // ---- Scanner scratch / progress ---------------------------------------------
+  // blocks_scanned doubles as the running block index; crossing_payloads holds staged
+  // copies of blocks that straddle a window boundary (see scan_one_block).
   std::atomic_size_t blocks_scanned{0};
   std::vector<std::vector<char>> crossing_payloads;
 };
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
index 8d39233e4d..b6b04afbff 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -303,11 +303,14 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index,
                     "MPS section scanner observed invalid LZ4 block index");
   }
 
+  // --- Scan this block, then record its extent and mark it decoded. The release store on
+  //     block_decoded_ publishes the two relaxed offset stores above it.
   scan_section_range(begin, end);
   block_begin_offsets_[block_index].store((std::size_t)(begin - data_), std::memory_order_relaxed);
   block_end_offsets_[block_index].store((std::size_t)(end - data_), std::memory_order_relaxed);
   block_decoded_[block_index].store(1, std::memory_order_release);
 
+  // --- Rescan the seams with already-decoded neighbors, in case a title straddles the boundary.
   if (block_index > 0 && block_decoded_[block_index - 1].load(std::memory_order_acquire)) {
     scan_boundary(block_index - 1, block_index);
   }
@@ -316,6 +319,7 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index,
     scan_boundary(block_index, block_index + 1);
   }
 
+  // --- Extend the contiguous decoded-byte frontier and publish any newly bounded phases.
   advance_ready_frontier();
 }
 
@@ -324,8 +328,6 @@ void mps_section_block_scanner_t::advance_ready_frontier()
   std::size_t new_ready = 0;
   bool grew             = false;
   {
-    // block_decoded_ is stored with release after the begin/end offsets, so an
-    // acquire load of a set flag makes the matching end offset visible here.
     std::lock_guard<std::mutex> lock(frontier_mutex_);
     while (next_block_ < block_count_ &&
            block_decoded_[next_block_].load(std::memory_order_acquire)) {
@@ -409,6 +411,13 @@ void mps_section_block_scanner_t::notify_ready_phases()
     }
   };
 
+  // Three publication shapes follow:
+  //   (1) mandatory header/rows/columns -- each spans from its start to the next mandatory
+  //       section; published as soon as that bounding section is available.
+  //   (2) optional rhs/ranges/bounds via publish_optional -- present=true once bounded, or
+  //       present=false once a later section proves the optional one cannot still appear.
+  //   (3) quadratic -- starts at the earliest of the three quad markers (quadobj/qmatrix/qcmatrix).
+  // final_boundary (ENDATA, or the final ready frontier for truncated files) closes the tail.
   if (available(rows) && !registry_.ready(mps_phase_kind::header)) {
     registry_.publish(mps_phase_kind::header, {data_, rows, true});
   }
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
index 824e976c4f..5d05e8b2f8 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
@@ -61,6 +61,9 @@ class mps_phase_registry_t {
   bool endata_present() const;
 
  private:
+  // mutex_ guards ranges_/events_/has_event_/event_fulfilled_ and the endata_* fields for writers.
+  // Readers observe ready_[phase] / endata_ready_ (release-stored under the lock on publish,
+  // acquire-loaded here) and may then read the matching range lock-free -- see range()'s contract.
   static constexpr std::size_t phase_count = 7;
 
   static std::size_t phase_index(mps_phase_kind phase);
@@ -76,6 +79,19 @@ class mps_phase_registry_t {
   mutable std::mutex mutex_;
 };
 
+// Turns out-of-order decoded blocks into ordered section-range publications for the parser:
+//
+//   producer --observe_block(i,...)--> [SIMD-scan block i for section titles] --> section_hits_
+//                                       [advance contiguous decoded-byte frontier (ready_bytes_)]
+//                                       --> notify_ready_phases --> registry --> parser tasks
+//
+// Producers (the LZ4 decoders / raw readers) call observe_block for each block in any order.
+// Per block the scanner (1) SIMD-scans it for section titles starting in column 1 and records
+// the first byte of each section via a first-writer-wins CAS; (2) advances a contiguous
+// decoded-byte frontier across whatever leading blocks are now present; and (3) recomputes which
+// phases are fully bounded and publishes their [begin,end) ranges to the registry, unblocking the
+// matching parser task. A title can straddle two blocks, so adjacent decoded blocks are also
+// rescanned over a small overlap (boundary_overlap).
 class mps_section_block_scanner_t {
  public:
   mps_section_block_scanner_t(const char* data,
@@ -107,6 +123,13 @@ class mps_section_block_scanner_t {
   void notify_ready_phases();
   void advance_ready_frontier();
 
+  // Concurrency: observe_block runs concurrently on many producer threads.
+  //   * frontier_mutex_ guards next_block_ and the ready_bytes_ frontier advance.
+  //   * publish_mutex_  serializes notify_ready_phases so each phase publishes once, in order.
+  //   * block_decoded_[i] is release-stored after block_begin/end_offsets_[i] (relaxed), so an
+  //     acquire-load of a set flag makes those offsets visible to the reader.
+  //   * section_hits_[k] is a first-writer-wins CAS holding the earliest byte of section k.
+  //   * registry_ carries its own internal lock.
   const char* data_        = nullptr;
   std::size_t block_count_ = 0;
   mps_phase_registry_t& registry_;
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
index fe349b47e0..07cc0139fc 100644
--- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
@@ -21,6 +21,7 @@
 #include <stdexcept>
 #include <string>
 #include <string_view>
+#include <type_traits>
 #include <vector>
 
 #include <unistd.h>
@@ -91,6 +92,19 @@ std::string_view range_text(const mps_phase_range_t& range)
 
 uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }
 
+template <typename T>
+void expect_vectors_bitwise_equal(const std::vector<T>& reference,
+                                  const std::vector<T>& fast,
+                                  std::string_view field,
+                                  std::string_view context)
+{
+  static_assert(std::is_trivially_copyable_v<T>);  // memcmp below compares raw element bytes
+  SCOPED_TRACE(std::string(context) + " " + std::string(field));  // tags failures with the field
+  ASSERT_EQ(reference.size(), fast.size()) << "size";  // fatal: leaves this helper on mismatch
+  if (reference.empty()) { return; }  // nothing to compare; avoids memcmp on empty data()
+  EXPECT_EQ(0, std::memcmp(reference.data(), fast.data(), reference.size() * sizeof(T)));
+}
+
 void check_models_match_reference_bitwise(const parser_model_t<int, double>& fast,
                                           const mps_data_model_t<int, double>& reference,
                                           std::string_view context)
@@ -109,19 +123,27 @@ void check_models_match_reference_bitwise(const parser_model_t<int, double>& fas
   EXPECT_EQ(bits(reference.objective_offset_), bits(fast.objective_offset_))
     << std::string(context) + " objective_offset";
 
-  EXPECT_EQ(reference.A_, fast.A_) << std::string(context) + " A";
+  expect_vectors_bitwise_equal(reference.A_, fast.A_, "A", context);
   EXPECT_EQ(reference.A_indices_, fast.A_indices_) << std::string(context) + " A_indices";
   EXPECT_EQ(reference.A_offsets_, fast.A_offsets_) << std::string(context) + " A_offsets";
-  EXPECT_EQ(reference.b_, fast.b_) << std::string(context) + " b";
-  EXPECT_EQ(reference.c_, fast.c_) << std::string(context) + " c";
-  EXPECT_EQ(reference.variable_lower_bounds_, fast.variable_lower_bounds_)
-    << std::string(context) + " variable_lower_bounds";
-  EXPECT_EQ(reference.variable_upper_bounds_, fast.variable_upper_bounds_)
-    << std::string(context) + " variable_upper_bounds";
-  EXPECT_EQ(reference.constraint_lower_bounds_, fast.constraint_lower_bounds_)
-    << std::string(context) + " constraint_lower_bounds";
-  EXPECT_EQ(reference.constraint_upper_bounds_, fast.constraint_upper_bounds_)
-    << std::string(context) + " constraint_upper_bounds";
+  expect_vectors_bitwise_equal(reference.b_, fast.b_, "b", context);
+  expect_vectors_bitwise_equal(reference.c_, fast.c_, "c", context);
+  expect_vectors_bitwise_equal(reference.variable_lower_bounds_,
+                               fast.variable_lower_bounds_,
+                               "variable_lower_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.variable_upper_bounds_,
+                               fast.variable_upper_bounds_,
+                               "variable_upper_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.constraint_lower_bounds_,
+                               fast.constraint_lower_bounds_,
+                               "constraint_lower_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.constraint_upper_bounds_,
+                               fast.constraint_upper_bounds_,
+                               "constraint_upper_bounds",
+                               context);
   EXPECT_EQ(reference.var_types_, fast.var_types_) << std::string(context) + " var_types";
   EXPECT_EQ(reference.row_types_, fast.row_types_) << std::string(context) + " row_types";
   EXPECT_EQ(reference.var_names_, fast.var_names_) << std::string(context) + " var_names";

From 1990c067d267798334bdc651cbf7483810898d0a Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Mon, 15 Jun 2026 09:00:00 -0700
Subject: [PATCH 17/22] AI review comments

---
 cpp/cuopt_cli.cpp                             |   4 +-
 .../cuopt/linear_programming/io/parser.hpp    |  25 ++-
 .../fast_fp64_parser.hpp                      |   7 +-
 .../io/experimental_mps_fast/fast_parser.cpp  | 162 +++++++++++++++++-
 .../io/experimental_mps_fast/file_reader.cpp  |  13 +-
 .../io/experimental_mps_fast/file_reader.hpp  |   2 +
 .../experimental_mps_fast/lz4_file_reader.cpp |  39 ++---
 .../io/experimental_mps_fast/mmap_region.hpp  |  51 ++++--
 .../mps_section_scanner.cpp                   |   7 +
 .../fast_fp64_parser_test.cpp                 |   9 +
 .../fast_parser_edge_test.cpp                 | 149 +++++++++++++++-
 cpp/tests/linear_programming/parser_test.cpp  |  43 +++--
 12 files changed, 435 insertions(+), 76 deletions(-)

diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index f06e568208..13991ad1e3 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -308,8 +308,8 @@ int main(int argc, char* argv[])
   program.add_argument("--mps-reader")
     .help(
       "MPS reader implementation: default uses the production parser; experimental-fast uses the "
-      "experimental "
-      "SIMD parser for LP/MIP .mps, .mps.lz4, .mps.gz, and .mps.bz2 files")
+      "experimental SIMD parser for free-format LP/MIP/QP/QCQP (SOCP) .mps/.qps files and their "
+      ".gz/.bz2/.lz4 compressed variants")
     .default_value(std::string("default"))
     .choices("default", "experimental-fast");
 
diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp
index 4e46d43224..2c678f4f4e 100644
--- a/cpp/include/cuopt/linear_programming/io/parser.hpp
+++ b/cpp/include/cuopt/linear_programming/io/parser.hpp
@@ -20,7 +20,8 @@ namespace cuopt::linear_programming::io {
 /**
  * @brief Selects which MPS reader implementation should be used by dispatching entry points.
  *
- * The experimental fast reader is intentionally opt-in. It currently supports LP/MIP/QP problems.
+ * The experimental fast reader is intentionally opt-in. It supports the same free-format
+ * MPS/QPS scope as read_mps(): LP, MIP, QP (QUADOBJ/QMATRIX), and QCQP/SOCP (QCMATRIX).
  */
 enum class mps_reader_type_t { default_reader, fast_experimental };
 
@@ -51,11 +52,14 @@ mps_data_model_t<i_t, f_t> read_mps(const std::string& mps_file_path,
                                     bool fixed_mps_format = false);
 
 /**
- * @brief Reads a raw LP/MIP/QP MPS problem with the experimental SIMD-optimized reader. SOCP is
- * unsupported for now.
+ * @brief Reads an MPS/QPS problem with the experimental SIMD-optimized reader.
  *
- * @param[in] mps_file_path Path to a raw or compressed .mps file.
- * @return mps_data_model_t A fully formed LP/MIP problem which represents the given file.
+ * Supports the same free-format LP/MIP/QP/QCQP (SOCP-relevant QCMATRIX) scope as read_mps().
+ * Fixed MPS format forcing is not supported. Accepts .mps/.qps and their .gz/.bz2/.lz4 variants
+ * (compression is detected from the file path, same as read_mps()).
+ *
+ * @param[in] mps_file_path Path to a raw or compressed .mps or .qps file.
+ * @return mps_data_model_t A fully formed LP/MIP/QP/QCQP problem representing the given file.
  */
 template <typename i_t, typename f_t>
 mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path);
@@ -128,9 +132,10 @@ mps_data_model_t<i_t, f_t> read_lp_from_string(std::string_view lp_contents);
  * @brief Reads an optimization problem from a file, dispatching on the file
  *        extension. Extension matching is case-insensitive.
  *
- * Routing:
+ * Routing (case-insensitive extensions):
  *   - .mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4
- *     → read_mps()
+ *     → read_mps() when mps_reader == default_reader, or read_mps_fast_experimental()
+ *       when mps_reader == fast_experimental (fixed_mps_format must be false)
  *   - .lp,  .lp.gz,  .lp.bz2, .lp.lz4 → read_lp()
  *   - anything else → std::logic_error
  *
@@ -160,12 +165,6 @@ inline mps_data_model_t<i_t, f_t> read(const std::string& path,
         throw std::logic_error(
           "experimental fast MPS reader does not support fixed MPS format forcing");
       }
-      if (lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2") ||
-          lower.ends_with(".qps.lz4")) {
-        throw std::logic_error(
-          "experimental fast MPS reader supports .mps, .mps.lz4, .mps.gz, and .mps.bz2 "
-          "LP/MIP files only");
-      }
       return read_mps_fast_experimental<i_t, f_t>(path);
     }
     return read_mps<i_t, f_t>(path, fixed_mps_format);
diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
index f007c0f707..02aca44dc3 100644
--- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
+++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
@@ -423,7 +423,12 @@ static inline double parse_fp64_advance(const char*& p, const char* end)
   }
 
   double v = assemble_fp64(dec);
-  if (v == v) return v;
+  if (v == v) {
+    if (p < end && (unsigned char)*p > 32) {
+      mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token");
+    }
+    return v;
+  }
   return fallback_strtod(std::string_view(start, (size_t)(p - start)));
 }
 
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index 8eae082e25..165d16d066 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -36,6 +36,7 @@
 #include <mutex>
 #include <string>
 #include <string_view>
+#include <tuple>
 #include <unordered_set>
 #include <utility>
 #include <vector>
@@ -494,6 +495,14 @@ struct parse_state_t {
   // some writers introduce zero-column variables only in BOUNDS.
   std::map<std::string_view, bounds_only_var_t> bounds_only_vars;
 
+  struct qcmatrix_block_t {  // one QCMATRIX section: its constraint row plus quadratic terms
+    size_t row_idx = SIZE_MAX;  // row_lookup() result for the row named on the header line
+    std::string_view row_name;  // row name field as read from the QCMATRIX header line
+    std::vector<std::tuple<i_t, i_t, f_t>> entries;  // (var1 index, var2 index, value)
+  };
+
+  std::vector<qcmatrix_block_t> qcmatrix_blocks;
+
   parse_state_t(mps_data_model_t<i_t, f_t>& p, cursor_t& c) : problem(p), cursor(c) {}
 
   void init_row_hash_table()
@@ -2558,11 +2567,13 @@ static void parse_quadratic_sections(parse_state_t<i_t, f_t>& state, cursor_t& c
   auto add_entry = [&](std::string_view var1, std::string_view var2, f_t value) {
     size_t var1_idx = lookup_quadratic_var(state, var1);
     if (var1_idx == SIZE_MAX) {
-      cursor.error("unknown variable name in QUADOBJ/QMATRIX: %.*s", (int)var1.size(), var1.data());
+      cursor.error(
+        "unknown variable name in quadratic section: %.*s", (int)var1.size(), var1.data());
     }
     size_t var2_idx = lookup_quadratic_var(state, var2);
     if (var2_idx == SIZE_MAX) {
-      cursor.error("unknown variable name in QUADOBJ/QMATRIX: %.*s", (int)var2.size(), var2.data());
+      cursor.error(
+        "unknown variable name in quadratic section: %.*s", (int)var2.size(), var2.data());
     }
     active_entries->emplace_back((i_t)var1_idx, (i_t)var2_idx, value);
   };
@@ -2576,18 +2587,42 @@ static void parse_quadratic_sections(parse_state_t<i_t, f_t>& state, cursor_t& c
       active_entries = &qmatrix_entries;
       continue;
     }
-    if (accept_section(cursor, "QCMATRIX")) {
-      cursor.error("QCMATRIX sections are not supported by the experimental fast MPS parser");
+    if (accept(cursor, "QCMATRIX")) {
+      auto row_name = cursor.read_field();
+      if (row_name.empty()) { cursor.error("QCMATRIX missing constraint row name"); }
+      size_t row_idx = state.row_lookup(row_name);
+      if (row_idx == SIZE_MAX) {
+        cursor.error(
+          "unknown constraint row name in QCMATRIX: %.*s", (int)row_name.size(), row_name.data());
+      }
+      char row_type = state.problem.row_types_[row_idx];
+      if (row_type != 'L' && row_type != 'G') {
+        cursor.error(
+          "QCMATRIX row must have ROWS type L or G: %.*s", (int)row_name.size(), row_name.data());
+      }
+      expect_eol(cursor);
+      typename parse_state_t<i_t, f_t>::qcmatrix_block_t block;
+      block.row_idx  = row_idx;
+      block.row_name = row_name;
+      state.qcmatrix_blocks.push_back(std::move(block));
+      active_entries = &state.qcmatrix_blocks.back().entries;
+      continue;
     }
     if (active_entries == nullptr) { break; }
 
-    auto var1 = cursor.read_field();
+    const char* field_start = cursor.ptr;
+    auto var1               = cursor.read_field();
     if (UNLIKELY(var1.empty())) { break; }
-    if (UNLIKELY(var1[0] == '$')) {
+    if (UNLIKELY(var1[0] == '$' || var1[0] == '*')) {
       cursor.skip_to_eol();
       expect_eol(cursor);
       continue;
     }
+    const bool starts_column_one =
+      field_start == cursor.start || field_start[-1] == '\n' || field_start[-1] == '\r';
+    if (UNLIKELY(starts_column_one)) {
+      cursor.error("unknown quadratic section record: %.*s", (int)var1.size(), var1.data());
+    }
     auto var2 = cursor.read_field();
     if (UNLIKELY(!var2.empty() && var2[0] == '$')) {
       cursor.skip_to_eol();
@@ -2679,6 +2714,120 @@ static void parse_quadratic_range(parse_state_t<i_t, f_t>& state, mps_phase_rang
   parse_quadratic_sections(state, cursor);
 }
 
+template <typename i_t, typename f_t>
+static void finalize_qcmatrix_constraints(parse_state_t<i_t, f_t>& state)
+{  // Moves each non-empty QCMATRIX row out of the linear system into quadratic_constraints_.
+  if (state.qcmatrix_blocks.empty()) { return; }
+  scoped_timer_t timer("finalize_qcmatrix_constraints");
+  const size_t original_rows = (size_t)state.problem.n_constraints_;
+  std::vector<uint8_t> quadratic_rows(original_rows, 0);  // 1 = row leaves the linear system
+  std::vector<uint8_t> seen_rows(original_rows, 0);  // 1 = a non-empty block already named it
+  size_t active_blocks = 0;
+
+  for (const auto& block : state.qcmatrix_blocks) {
+    if (block.entries.empty()) { continue; }  // a header with no entries changes nothing
+    if (block.row_idx >= original_rows) {
+      state.cursor.error("QCMATRIX row index is out of range");
+    }
+    if (seen_rows[block.row_idx]) {
+      state.cursor.error("duplicate QCMATRIX block for constraint row: %.*s",
+                         (int)block.row_name.size(),
+                         block.row_name.data());
+    }
+    seen_rows[block.row_idx]      = 1;
+    quadratic_rows[block.row_idx] = 1;
+    ++active_blocks;
+  }
+
+  if (active_blocks == 0) { return; }
+
+  // Copy each quadratic row (linear part, rhs, sorted triples) out before A_ is rebuilt below.
+  // Brute force; parsing QCMATRIX before the COLUMNS CSR build might be better (unclear).
+  for (const auto& block : state.qcmatrix_blocks) {
+    if (block.entries.empty()) { continue; }
+
+    size_t linear_begin = (size_t)state.problem.A_offsets_[block.row_idx];
+    size_t linear_end   = (size_t)state.problem.A_offsets_[block.row_idx + 1];
+    typename mps_data_model_t<i_t, f_t>::quadratic_constraint_t qc;
+    qc.constraint_row_index = (i_t)block.row_idx;
+    qc.constraint_row_name  = state.problem.row_names_[block.row_idx];
+    qc.constraint_row_type  = state.problem.row_types_[block.row_idx];
+    qc.rhs_value            = state.problem.b_[block.row_idx];
+    qc.linear_values.assign(state.problem.A_.begin() + linear_begin,
+                            state.problem.A_.begin() + linear_end);
+    qc.linear_indices.assign(state.problem.A_indices_.begin() + linear_begin,
+                             state.problem.A_indices_.begin() + linear_end);
+
+    std::vector<size_t> perm(block.entries.size());
+    for (size_t i = 0; i < perm.size(); ++i) {
+      perm[i] = i;
+    }
+    std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) {  // by (first, second) index
+      const auto& ea = block.entries[a];
+      const auto& eb = block.entries[b];
+      if (std::get<0>(ea) != std::get<0>(eb)) { return std::get<0>(ea) < std::get<0>(eb); }
+      return std::get<1>(ea) < std::get<1>(eb);
+    });
+
+    qc.rows.reserve(block.entries.size());
+    qc.cols.reserve(block.entries.size());
+    qc.vals.reserve(block.entries.size());
+    for (size_t idx : perm) {
+      const auto& [row, col, val] = block.entries[idx];
+      qc.rows.push_back(row);
+      qc.cols.push_back(col);
+      qc.vals.push_back(val);
+    }
+    state.problem.quadratic_constraints_.push_back(std::move(qc));
+  }
+
+  std::vector<f_t> new_A;
+  std::vector<i_t> new_A_indices;
+  std::vector<i_t> new_A_offsets;
+  std::vector<f_t> new_b;
+  std::vector<f_t> new_clb;
+  std::vector<f_t> new_cub;
+  std::vector<std::string> new_row_names;
+  std::vector<char> new_row_types;
+
+  new_A.reserve(state.problem.A_.size());
+  new_A_indices.reserve(state.problem.A_indices_.size());
+  new_A_offsets.reserve(original_rows + 1 - active_blocks);
+  new_b.reserve(original_rows - active_blocks);
+  new_clb.reserve(original_rows - active_blocks);
+  new_cub.reserve(original_rows - active_blocks);
+  new_row_names.reserve(original_rows - active_blocks);
+  new_row_types.reserve(original_rows - active_blocks);
+  new_A_offsets.push_back(0);
+
+  for (size_t row = 0; row < original_rows; ++row) {
+    if (quadratic_rows[row]) { continue; }  // now lives in quadratic_constraints_
+    size_t begin = (size_t)state.problem.A_offsets_[row];
+    size_t end   = (size_t)state.problem.A_offsets_[row + 1];
+    new_A.insert(new_A.end(), state.problem.A_.begin() + begin, state.problem.A_.begin() + end);
+    new_A_indices.insert(new_A_indices.end(),
+                         state.problem.A_indices_.begin() + begin,
+                         state.problem.A_indices_.begin() + end);
+    new_A_offsets.push_back((i_t)new_A.size());
+    new_b.push_back(state.problem.b_[row]);
+    new_clb.push_back(state.problem.constraint_lower_bounds_[row]);
+    new_cub.push_back(state.problem.constraint_upper_bounds_[row]);
+    new_row_names.push_back(std::move(state.problem.row_names_[row]));
+    new_row_types.push_back(state.problem.row_types_[row]);
+  }
+
+  state.problem.A_                       = std::move(new_A);
+  state.problem.A_indices_               = std::move(new_A_indices);
+  state.problem.A_offsets_               = std::move(new_A_offsets);
+  state.problem.b_                       = std::move(new_b);
+  state.problem.constraint_lower_bounds_ = std::move(new_clb);
+  state.problem.constraint_upper_bounds_ = std::move(new_cub);
+  state.problem.row_names_               = std::move(new_row_names);
+  state.problem.row_types_               = std::move(new_row_types);
+  state.problem.n_constraints_           = (i_t)state.problem.b_.size();
+  state.problem.nnz_                     = (i_t)state.problem.A_.size();
+}
+
 template <typename i_t, typename f_t>
 static void materialize_problem_names(parse_state_t<i_t, f_t>& state)
 {
@@ -2995,6 +3144,7 @@ static mps_data_model_t<i_t, f_t> parse_mps_fast_stream(Stream& stream,
 
   parser_tasks.rethrow_if_error();
 
+  finalize_qcmatrix_constraints(state);
   append_bounds_only_variables(state);
 
   input.size = stream.size();
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index 1ccedba52e..48397ae11e 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -17,6 +17,7 @@
 
 #include <algorithm>
 #include <atomic>
+#include <cctype>
 #include <cerrno>
 #include <chrono>
 #include <cstdint>
@@ -46,8 +47,12 @@ constexpr long nfs_super_magic                            = 0x6969;
 bool path_has_suffix(const std::string& path, const char* suffix) noexcept
 {
   std::size_t suffix_len = std::strlen(suffix);
-  return path.size() >= suffix_len &&
-         path.compare(path.size() - suffix_len, suffix_len, suffix) == 0;
+  if (path.size() < suffix_len) { return false; }  // too short to end with suffix
+  for (std::size_t i = 0; i < suffix_len; ++i) {  // compare the last suffix_len bytes
+    unsigned char path_char = path[path.size() - suffix_len + i];
+    if (std::tolower(path_char) != suffix[i]) { return false; }  // only path is case-folded
+  }
+  return true;
 }
 
 std::size_t add_input_padding(std::size_t size)
@@ -97,6 +102,7 @@ std::size_t get_file_size(const std::string& path)
 {
   int fd = ::open(path.c_str(), O_RDONLY);
   if (fd < 0) {
+    // open() failed, so there is no descriptor to close; close(-1) would only overwrite errno.
     mps_parser_fail(error_type_t::RuntimeError,
                     "Failed to open file '%s': %s",
                     path.c_str(),
@@ -173,6 +179,9 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
   }
   window_bytes_ = raw_input_window_bytes;
   window_count_ = std::max<std::size_t>(1, (file_size_ + window_bytes_ - 1) / window_bytes_);
+#ifdef MPS_FAST_TIMERS
+  read_window_ms_.assign(window_count_, 0);
+#endif
 
   output_mapped_size_ =
     cuda::round_up(std::max<std::size_t>(add_input_padding(file_size_), 1), system_page_size());
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
index 5472434b1a..8ca3456401 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.hpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -159,6 +159,8 @@ void parallel_for_indexed(std::size_t count,
                           const char* thread_name_prefix,
                           Body body)
 {
+  assert(thread_count > 0);
+
   std::atomic_size_t next{0};
   scoped_thread_group workers;
   workers.reserve(thread_count);
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
index 4696b0ae81..5e535ce7f2 100644
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -557,32 +557,29 @@ struct lz4_pipeline_t {
 
   void read_window(std::size_t index)
   {
-    auto& w = windows[index];
-    w.data.reset(new char[w.size]);
-    add_compressed_resident(w.size);
-    bool ok = false;
-    {
-      MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io);
-      ok = pread_full(input.fd_, w.data.get(), w.size, w.file_offset);
-    }
-    if (!ok) {
-      // Capture-and-notify locally so scanner/decoder waiters wake; do not let
-      // the exception escape to parallel_for_indexed without the cv notify.
-      try {
+    try {  // any throw below (allocation, failed pread) is routed to fail_and_notify
+      auto& w = windows[index];
+      w.data.reset(new char[w.size]);
+      add_compressed_resident(w.size);
+      bool ok = false;
+      {
+        MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io);
+        ok = pread_full(input.fd_, w.data.get(), w.size, w.file_offset);
+      }
+      if (!ok) {
         mps_parser_fail(error_type_t::RuntimeError,
                         "Failed to pread LZ4 resident window: %s",
                         std::strerror(errno));
-      } catch (...) {
-        fail_and_notify(std::current_exception());
       }
-      return;
-    }
-    {
-      MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic);
-      std::lock_guard<std::mutex> lock(window_mutex);
-      window_done[index] = 1;
+      {
+        MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic);
+        std::lock_guard<std::mutex> lock(window_mutex);
+        window_done[index] = 1;
+      }
+      window_cv.notify_all();  // wake the scanner waiting for this window
+    } catch (...) {  // capture and notify locally so scanner/decoder waiters wake
+      fail_and_notify(std::current_exception());
     }
-    window_cv.notify_all();
   }
 
   void run_decoder_stage(std::size_t tid)
diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
index 7727e0d2f7..9d5469e860 100644
--- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp
+++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
@@ -35,20 +35,30 @@ class mmap_region_t {
   mmap_region_t(const mmap_region_t&)            = delete;
   mmap_region_t& operator=(const mmap_region_t&) = delete;
 
-  mmap_region_t(mmap_region_t&& other) noexcept : ptr_(other.ptr_), size_(other.size_)
+  mmap_region_t(mmap_region_t&& other) noexcept
+    : ptr_(other.ptr_),
+      size_(other.size_),
+      unmap_ptr_(other.unmap_ptr_),
+      unmap_size_(other.unmap_size_)
   {
-    other.ptr_  = nullptr;
-    other.size_ = 0;
+    other.ptr_        = nullptr;
+    other.size_       = 0;
+    other.unmap_ptr_  = nullptr;
+    other.unmap_size_ = 0;
   }
 
   mmap_region_t& operator=(mmap_region_t&& other) noexcept
   {
     if (this != &other) {
       reset();
-      ptr_        = other.ptr_;
-      size_       = other.size_;
-      other.ptr_  = nullptr;
-      other.size_ = 0;
+      ptr_              = other.ptr_;
+      size_             = other.size_;
+      unmap_ptr_        = other.unmap_ptr_;
+      unmap_size_       = other.unmap_size_;
+      other.ptr_        = nullptr;
+      other.size_       = 0;
+      other.unmap_ptr_  = nullptr;
+      other.unmap_size_ = 0;
     }
     return *this;
   }
@@ -93,11 +103,7 @@ class mmap_region_t {
 
     uintptr_t raw_addr     = reinterpret_cast<uintptr_t>(raw);
     uintptr_t aligned_addr = (raw_addr + alignment - 1) & ~(uintptr_t)(alignment - 1);
-    std::size_t prefix     = (std::size_t)(aligned_addr - raw_addr);
-    std::size_t suffix     = raw_size - prefix - size;
-    if (prefix > 0) { ::munmap(raw, prefix); }
-    if (suffix > 0) { ::munmap(reinterpret_cast<void*>(aligned_addr + size), suffix); }
-    return mmap_region_t(reinterpret_cast<void*>(aligned_addr), size);
+    return mmap_region_t(reinterpret_cast<void*>(aligned_addr), size, raw, raw_size);
   }
 
   static void map_fixed_or_throw(
@@ -112,9 +118,13 @@ class mmap_region_t {
 
   void reset() noexcept
   {
-    if (ptr_ != nullptr && size_ != 0) { ::munmap(ptr_, size_); }
-    ptr_  = nullptr;
-    size_ = 0;
+    void* base      = unmap_ptr_ != nullptr ? unmap_ptr_ : ptr_;
+    std::size_t len = unmap_ptr_ != nullptr ? unmap_size_ : size_;
+    if (base != nullptr && len != 0) { ::munmap(base, len); }
+    ptr_        = nullptr;
+    size_       = 0;
+    unmap_ptr_  = nullptr;
+    unmap_size_ = 0;
   }
 
   void advise(int advice) const noexcept
@@ -127,8 +137,15 @@ class mmap_region_t {
   std::size_t size() const noexcept { return size_; }
 
  private:
-  void* ptr_        = nullptr;
-  std::size_t size_ = 0;
+  mmap_region_t(void* ptr, std::size_t size, void* unmap_ptr, std::size_t unmap_size) noexcept
+    : ptr_(ptr), size_(size), unmap_ptr_(unmap_ptr), unmap_size_(unmap_size)
+  {
+  }
+
+  void* ptr_              = nullptr;
+  std::size_t size_       = 0;
+  void* unmap_ptr_        = nullptr;
+  std::size_t unmap_size_ = 0;
 };
 
 }  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
index b6b04afbff..3924e2dcd5 100644
--- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -125,6 +125,7 @@ mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const
 void mps_phase_registry_t::publish_endata(const char* begin, bool present)
 {
   std::lock_guard<std::mutex> lock(mutex_);
+  if (endata_ready_.load(std::memory_order_acquire)) { return; }
   endata_begin_   = begin;
   endata_present_ = present;
   endata_ready_.store(true, std::memory_order_release);
@@ -168,6 +169,12 @@ static section_record_match_t is_section_record(const char* line_start,
     while (after < line_end && (*after == ' ' || *after == '\t' || *after == '\r')) {
       ++after;
     }
+    // QCMATRIX records are of the form "QCMATRIX <row>"
+    if (record.kind == mps_section_kind::qcmatrix) {
+      if (after == line_end) { return section_record_match_t::invalid; }
+      *kind = record.kind;
+      return section_record_match_t::section;
+    }
     if (after != line_end) { return section_record_match_t::invalid; }
     *kind = record.kind;
     return section_record_match_t::section;
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
index 2ef8339da3..8bde21bb61 100644
--- a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
@@ -165,6 +165,15 @@ TEST(FastFp64ParserTest, CursorAdvancesToTokenEnd)
   EXPECT_EQ(std::string_view("  ABC"), std::string_view(p, 5));
 }
 
+TEST(FastFp64ParserTest, RejectsMalformedNumericSuffix)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  for (const char* token : {"1x", "1e", "1d+", "1e+"}) {
+    SCOPED_TRACE(token);
+    EXPECT_THROW(parse_token(token), std::exception);
+  }
+}
+
 TEST(FastFp64ParserTest, FixedSeedRandomDifferential)
 {
   std::setlocale(LC_NUMERIC, "C");
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
index 07cc0139fc..771462a9ab 100644
--- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
@@ -5,6 +5,7 @@
 #include "mps_section_scanner.hpp"
 
 #include <cuopt/linear_programming/io/parser.hpp>
+#include <mps_parser_internal.hpp>
 
 #include <gtest/gtest.h>
 
@@ -148,13 +149,38 @@ void check_models_match_reference_bitwise(const parser_model_t<int, double>& fas
   EXPECT_EQ(reference.row_types_, fast.row_types_) << std::string(context) + " row_types";
   EXPECT_EQ(reference.var_names_, fast.var_names_) << std::string(context) + " var_names";
   EXPECT_EQ(reference.row_names_, fast.row_names_) << std::string(context) + " row_names";
+
+  ASSERT_EQ(reference.quadratic_constraints_.size(), fast.quadratic_constraints_.size())
+    << std::string(context) + " quadratic_constraints size";
+  for (size_t q = 0; q < reference.quadratic_constraints_.size(); ++q) {
+    const auto& ref_qc  = reference.quadratic_constraints_[q];
+    const auto& fast_qc = fast.quadratic_constraints_[q];
+    SCOPED_TRACE(std::string(context) + " quadratic_constraint " + std::to_string(q));
+    EXPECT_EQ(ref_qc.constraint_row_index, fast_qc.constraint_row_index);
+    EXPECT_EQ(ref_qc.constraint_row_name, fast_qc.constraint_row_name);
+    EXPECT_EQ(ref_qc.constraint_row_type, fast_qc.constraint_row_type);
+    EXPECT_EQ(bits(ref_qc.rhs_value), bits(fast_qc.rhs_value));
+    expect_vectors_bitwise_equal(
+      ref_qc.linear_values, fast_qc.linear_values, "linear_values", context);
+    EXPECT_EQ(ref_qc.linear_indices, fast_qc.linear_indices);
+    expect_vectors_bitwise_equal(ref_qc.vals, fast_qc.vals, "qc_vals", context);
+    EXPECT_EQ(ref_qc.rows, fast_qc.rows);
+    EXPECT_EQ(ref_qc.cols, fast_qc.cols);
+  }
+}
+
+mps_data_model_t<int, double> parse_reference_model(const std::string& path)
+{
+  mps_data_model_t<int, double> reference;
+  mps_parser_t<int, double> parser(reference, path, false);
+  return reference;
 }
 
 void verify_fixture_bitwise(std::string_view fixture_name, std::string contents)
 {
   TempMpsFile file(std::move(contents));
   auto fast      = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
-  auto reference = read_mps<int, double>(file.path, false);
+  auto reference = parse_reference_model(file.path);
   check_models_match_reference_bitwise(fast, reference, fixture_name);
 }
 
@@ -302,6 +328,27 @@ TEST(FastMpsParserEdgeTest, ScannerRejectsUnknownColumnOneRecordsAfterRows)
     std::logic_error);
 }
 
+TEST(FastMpsParserEdgeTest, ParserRejectsUnknownSectionRecords)
+{
+  TempMpsFile file(
+    "NAME BAD_UNKNOWN_SECTION\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 R1 2\n"
+    "RHS\n"
+    " RHS1 R1 3\n"
+    "BOUNDS\n"
+    " FR BND1 X1\n"
+    "QSECTION      R1\n"
+    " X1 X1 1\n"
+    "ENDATA\n");
+
+  EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+               std::exception);
+}
+
 TEST(FastMpsParserEdgeTest, BoundsDefaultsAndTypesMatchReference)
 {
   verify_fixture_bitwise("bounds_defaults_and_types",
@@ -786,4 +833,104 @@ TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch)
   EXPECT_EQ(raw.var_types_, bzip2.var_types_) << "bzip2 var types";
 }
 
+TEST(FastMpsParserEdgeTest, QcMatrixRowsMatchReferenceBitwise)
+{
+  verify_fixture_bitwise("qcmatrix rows",
+                         "NAME QCMATRIX_TEST\n"
+                         "ROWS\n"
+                         " N OBJ\n"
+                         " L LIN\n"
+                         " L QC1\n"
+                         " G QC2\n"
+                         "COLUMNS\n"
+                         " X1 OBJ 1 LIN 2\n"
+                         " X1 QC1 3 QC2 4\n"
+                         " X2 OBJ 2 LIN 5\n"
+                         " X2 QC1 6 QC2 7\n"
+                         "RHS\n"
+                         " RHS1 LIN 10 QC1 11\n"
+                         " RHS1 QC2 12\n"
+                         "QCMATRIX   QC1\n"
+                         " X1 X1 1.25\n"
+                         " X1 X2 -2.5\n"
+                         "QCMATRIX   QC2\n"
+                         " X2 X2 3.75\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, QcMatrixMalformedCasesMatchReference)
+{
+  const std::vector<std::string> cases = {
+    "NAME DUP_QC\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L QC1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 QC1 2\n"
+    "RHS\n"
+    " RHS1 QC1 3\n"
+    "QCMATRIX QC1\n"
+    " X1 X1 1\n"
+    "QCMATRIX QC1\n"
+    " X1 X1 2\n"
+    "ENDATA\n",
+    "NAME BAD_QC_ROW\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L QC1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 QC1 2\n"
+    "RHS\n"
+    " RHS1 QC1 3\n"
+    "QCMATRIX UNKNOWN\n"
+    " X1 X1 1\n"
+    "ENDATA\n",
+    "NAME BAD_QC_VAR\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L QC1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 QC1 2\n"
+    "RHS\n"
+    " RHS1 QC1 3\n"
+    "QCMATRIX QC1\n"
+    " X1 XBAD 1\n"
+    "ENDATA\n"};
+
+  for (const auto& mps : cases) {
+    TempMpsFile file(mps);
+    EXPECT_THROW(((void)parse_reference_model(file.path)), std::exception);
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::exception);
+  }
+}
+
+TEST(FastMpsParserEdgeTest, QuadraticParserRejectsUnknownColumnOneRecords)
+{
+  const std::vector<std::string> records = {"QSECTION      QC1",
+                                            "CSECTION      QC1        0              QUAD"};
+
+  for (const auto& record : records) {
+    TempMpsFile file(
+      "NAME BAD_QUAD_RECORD\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L QC1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 QC1 2\n"
+      " X2 OBJ 3 QC1 4\n"
+      "RHS\n"
+      " RHS1 QC1 5\n"
+      "QMATRIX\n"
+      " X1 X1 1\n" +
+      record +
+      "\n"
+      " X2 X2 2\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::exception)
+      << record;
+  }
+}
+
 }  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp
index 6a47471c09..70f7beb2dc 100644
--- a/cpp/tests/linear_programming/parser_test.cpp
+++ b/cpp/tests/linear_programming/parser_test.cpp
@@ -947,8 +947,9 @@ INSTANTIATE_DEFAULT_MPS_READER_TEST(free_var_bound_test);
 INSTANTIATE_DEFAULT_MPS_READER_TEST(lower_inf_var_bound_test);
 INSTANTIATE_DEFAULT_MPS_READER_TEST(upper_inf_var_bound_test);
 
-#undef INSTANTIATE_MPS_READER_TEST
-#undef INSTANTIATE_DEFAULT_MPS_READER_TEST
+// NOTE: INSTANTIATE_MPS_READER_TEST / INSTANTIATE_DEFAULT_MPS_READER_TEST are intentionally
+// left defined here; the QP/QCQP file fixtures below reuse them. They are #undef-ed after the
+// last instantiation.
 
 #ifdef MPS_PARSER_WITH_BZIP2
 TEST(mps_parser, good_mps_file_bzip2_compressed)
@@ -1051,13 +1052,14 @@ TEST(qps_parser, quadratic_objective_basic)
   EXPECT_EQ(1.0, model.get_quadratic_objective_values()[1]);
 }
 
+class qps_file_reader_test : public parser_fixture_base {};
+
 // Test actual QPS files from the dataset
-TEST(qps_parser, test_qps_files)
+TEST_P(qps_file_reader_test, test_qps_files)
 {
   // Test QP_Test_1.qps if it exists
   if (file_exists("quadratic_programming/QP_Test_1.qps")) {
-    auto parsed_data = read_mps<int, double>(
-      cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_1.qps", false);
+    auto parsed_data = read_mps_file("quadratic_programming/QP_Test_1.qps", false);
 
     EXPECT_EQ("QP_Test_1", parsed_data.get_problem_name());
     EXPECT_EQ(2, parsed_data.get_n_variables());    // C------1 and C------2
@@ -1076,8 +1078,7 @@ TEST(qps_parser, test_qps_files)
 
   // Test QP_Test_2.qps if it exists
   if (file_exists("quadratic_programming/QP_Test_2.qps")) {
-    auto parsed_data = read_mps<int, double>(
-      cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_2.qps", false);
+    auto parsed_data = read_mps_file("quadratic_programming/QP_Test_2.qps", false);
 
     EXPECT_EQ("QP_Test_2", parsed_data.get_problem_name());
     EXPECT_EQ(3, parsed_data.get_n_variables());    // C------1, C------2, C------3
@@ -2635,6 +2636,19 @@ TEST(read, qps_extension_dispatches_to_mps_parser)
   EXPECT_EQ(m.get_variable_names()[0], "x");
 }
 
+TEST(read, qps_extension_dispatches_to_fast_experimental_reader)
+{
+  temp_file_t tmp(".qps");
+  {
+    std::ofstream out(tmp.string());
+    out << kTrivialMps;
+  }
+  auto m = read<int, double>(tmp.string(), mps_reader_type_t::fast_experimental);
+  ASSERT_EQ(m.get_variable_names().size(), 1u);
+  EXPECT_EQ(m.get_variable_names()[0], "x");
+  EXPECT_NEAR(m.get_variable_upper_bounds()[0], 10.0, tolerance);
+}
+
 TEST(read, mps_gz_extension_dispatches_to_mps_parser)
 {
   auto m = read<int, double>(cuopt::test::get_rapids_dataset_root_dir() +
@@ -2849,13 +2863,12 @@ TEST(qps_parser, qcmatrix_append_api)
 }
 
 // QCQP MPS: each quadratic constraint bundles row + linear + rhs + quadratic.
-TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds)
+TEST_P(qps_file_reader_test, qcmatrix_mps_linear_rhs_and_bounds)
 {
   if (!file_exists("qcqp/QC_Test_1.mps")) {
     GTEST_SKIP() << "qcqp/QC_Test_1.mps not in dataset root";
   }
-  const auto model = read_mps<int, double>(
-    cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/QC_Test_1.mps", false);
+  const auto model = read_mps_file("qcqp/QC_Test_1.mps", false);
 
   ASSERT_TRUE(model.has_quadratic_constraints());
   const auto& qcs = model.get_quadratic_constraints();
@@ -2901,13 +2914,12 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds)
   EXPECT_DOUBLE_EQ(10.0, qcs[1].rhs_value);
 }
 
-TEST(qps_parser, qcqp_p0033_mps_sections)
+TEST_P(qps_file_reader_test, qcqp_p0033_mps_sections)
 {
   if (!file_exists("qcqp/p0033_qc1.mps")) {
     GTEST_SKIP() << "qcqp/p0033_qc1.mps not in dataset root";
   }
-  const auto model = read_mps<int, double>(
-    cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps", false);
+  const auto model = read_mps_file("qcqp/p0033_qc1.mps", false);
 
   EXPECT_EQ(12, model.get_n_constraints());
   EXPECT_EQ(33, model.get_n_variables());
@@ -2950,4 +2962,9 @@ TEST(mps_roundtrip, qcqp_p0033_qc1)
   auto reloaded_2 = read_mps<int, double>(temp_file_2.string(), false);
   compare_data_models(reloaded, reloaded_2);
 }
+
+INSTANTIATE_MPS_READER_TEST(qps_file_reader_test);
+
+#undef INSTANTIATE_MPS_READER_TEST
+#undef INSTANTIATE_DEFAULT_MPS_READER_TEST
 }  // namespace cuopt::linear_programming::io

From d7358f68ee4448a3bfb8db380be0518870637549 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Mon, 15 Jun 2026 09:07:37 -0700
Subject: [PATCH 18/22] fix sloppy fix

---
 cpp/src/io/experimental_mps_fast/file_reader.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
index 48397ae11e..78e4219e06 100644
--- a/cpp/src/io/experimental_mps_fast/file_reader.cpp
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -102,12 +102,16 @@ std::size_t get_file_size(const std::string& path)
 {
   int fd = ::open(path.c_str(), O_RDONLY);
   if (fd < 0) {
-    ::close(fd);
     mps_parser_fail(error_type_t::RuntimeError,
                     "Failed to open file '%s': %s",
                     path.c_str(),
                     std::strerror(errno));
   }
+  // The guard is the sole owner of fd: it closes it on normal return and if
+  // get_file_size(fd, path) throws, so there must be no explicit ::close(fd).
+  cuopt::scope_guard close_fd([&] {
+    if (fd >= 0) { ::close(fd); }
+  });
+
   std::size_t size = get_file_size(fd, path);
-  ::close(fd);
   return size;

From 225ae33b5620ce957af99a177d0e09516b47628c Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Tue, 23 Jun 2026 01:58:26 -0700
Subject: [PATCH 19/22] review comments

---
 .../cuopt/linear_programming/io/parser.hpp       | 16 +++++++++-------
 cpp/src/io/experimental_mps_fast/fast_parser.cpp |  2 --
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp
index 2c678f4f4e..7122282e70 100644
--- a/cpp/include/cuopt/linear_programming/io/parser.hpp
+++ b/cpp/include/cuopt/linear_programming/io/parser.hpp
@@ -11,6 +11,7 @@
 
 #include <algorithm>
 #include <cctype>
+#include <cstring>
 #include <stdexcept>
 #include <string>
 #include <string_view>
@@ -157,9 +158,13 @@ inline mps_data_model_t<i_t, f_t> read(const std::string& path,
   std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
     return static_cast<char>(std::tolower(c));
   });
-  if (lower.ends_with(".mps.lz4") || lower.ends_with(".mps.bz2") || lower.ends_with(".mps.gz") ||
-      lower.ends_with(".mps") || lower.ends_with(".qps.lz4") || lower.ends_with(".qps.bz2") ||
-      lower.ends_with(".qps.gz") || lower.ends_with(".qps")) {
+  for (const char* compression_suffix : {".bz2", ".gz", ".lz4"}) {
+    if (lower.ends_with(compression_suffix)) {
+      lower.resize(lower.size() - std::strlen(compression_suffix));
+      break;
+    }
+  }
+  if (lower.ends_with(".mps") || lower.ends_with(".qps")) {
     if (mps_reader == mps_reader_type_t::fast_experimental) {
       if (fixed_mps_format) {
         throw std::logic_error(
@@ -169,10 +174,7 @@ inline mps_data_model_t<i_t, f_t> read(const std::string& path,
     }
     return read_mps<i_t, f_t>(path, fixed_mps_format);
   }
-  if (lower.ends_with(".lp.lz4") || lower.ends_with(".lp.bz2") || lower.ends_with(".lp.gz") ||
-      lower.ends_with(".lp")) {
-    return read_lp<i_t, f_t>(path);
-  }
+  if (lower.ends_with(".lp")) { return read_lp<i_t, f_t>(path); }
   throw std::logic_error(
     "read: unrecognized input file extension. Supported (case-insensitive): "
     ".mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4, "
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
index 165d16d066..02038c6fd9 100644
--- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -3137,8 +3137,6 @@ static mps_data_model_t<i_t, f_t> parse_mps_fast_stream(Stream& stream,
           phase_end("quadratic");
         });
       }
-
-#pragma omp taskwait
     }
   }
 

From 3f8f9fbfb95a7b26fda3182776c009cffb865eaf Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Tue, 23 Jun 2026 02:33:37 -0700
Subject: [PATCH 20/22] hopefully fix wheel CI builds

---
 cpp/CMakeLists.txt | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e50dc52172..d830f501c8 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -458,13 +458,13 @@ set(CUOPT_SRC_FILES)
 set(MPS_FAST_SRC_FILES)
 add_subdirectory(src)
 if (HOST_LINEINFO)
-    set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
+    set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
 endif ()
 
 # Needed for the fast MPS parser, available on all x86-64-v3 compliant x86 CPUs (essentially since Haswell ~2013)
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND
         CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$")
-    set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
+    set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
             APPEND PROPERTY COMPILE_OPTIONS "-mbmi2;-mavx2;-msse4.2")
 endif ()
 
@@ -475,7 +475,7 @@ endif ()
 # Must happen before gRPC files are appended to CUOPT_SRC_FILES.
 # Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO).
 if (DEFINE_ASSERT)
-    set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
+    set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
             APPEND PROPERTY COMPILE_OPTIONS "-UNDEBUG")
 endif ()
 
@@ -500,7 +500,7 @@ if (NOT SKIP_GRPC_BUILD)
     # The conda-forge abseil shared library is built with NDEBUG and does not
     # export that symbol (abseil-cpp#1624).  Without this, Debug builds fail
     # at runtime with "undefined symbol: absl::…::Mutex::Dtor".
-    set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
+    set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
             APPEND PROPERTY COMPILE_OPTIONS "-DNDEBUG")
 endif (NOT SKIP_GRPC_BUILD)
 
@@ -627,10 +627,17 @@ target_link_libraries(cuopt
         PRIVATE
         ${CUOPT_PRIVATE_CUDA_LIBS}
         simde::simde
+        OpenMP::OpenMP_CXX
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )
 
+# Link with -fopenmp so the compiler driver pulls in its own (matching) libgomp
+# and orders its lib dir ahead of the system one. Without it, OpenMP is supplied
+# only as a bare -lgomp, which can resolve to an older system libgomp missing
+# OpenMP 5.0 symbols such as omp_fulfill_event (used by the fast MPS parser).
+target_link_options(cuopt PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
+
 
 # ##################################################################################################
 # - generate tests --------------------------------------------------------------------------------
@@ -754,6 +761,7 @@ if (NOT BUILD_LP_ONLY)
     )
 
     target_link_options(cuopt_cli PRIVATE -pie)
+    target_link_options(cuopt_cli PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
 
     target_include_directories(cuopt_cli
             PRIVATE
@@ -813,6 +821,7 @@ if (BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY)
             OpenMP::OpenMP_CXX
             PRIVATE
     )
+    target_link_options(solve_MIP PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
     if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
         target_link_options(solve_MIP PRIVATE -Wl,--enable-new-dtags)
     endif ()
@@ -843,6 +852,7 @@ if (BUILD_LP_BENCHMARKS)
             OpenMP::OpenMP_CXX
             PRIVATE
     )
+    target_link_options(solve_LP PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
     if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
         target_link_options(solve_LP PRIVATE -Wl,--enable-new-dtags)
     endif ()
@@ -874,6 +884,7 @@ if (NOT SKIP_GRPC_BUILD)
     )
 
     target_link_options(cuopt_grpc_server PRIVATE -pie)
+    target_link_options(cuopt_grpc_server PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
 
     target_include_directories(cuopt_grpc_server
             PRIVATE

From 57e48a2bf8d298ca3bc0e0f054bedee5db4457d0 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Tue, 23 Jun 2026 04:57:19 -0700
Subject: [PATCH 21/22] wheel fix

---
 cpp/CMakeLists.txt | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d830f501c8..385b43b3e2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -633,10 +633,11 @@ target_link_libraries(cuopt
 )
 
 # Link with -fopenmp so the compiler driver pulls in its own (matching) libgomp
-# and orders its lib dir ahead of the system one. Without it, OpenMP is supplied
-# only as a bare -lgomp, which can resolve to an older system libgomp missing
-# OpenMP 5.0 symbols such as omp_fulfill_event (used by the fast MPS parser).
-target_link_options(cuopt PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
+# and orders its lib dir ahead of the system one. OpenMP::OpenMP_CXX alone supplies
+# a bare -lgomp, which can resolve to an older system libgomp missing OpenMP 5.0
+# symbols such as omp_fulfill_event (used by the fast MPS parser). Plain -fopenmp
+# (not gated on LINK_LANGUAGE:CXX) is required because cuopt is CUDA-linked.
+target_link_options(cuopt PRIVATE -fopenmp)
 
 
 # ##################################################################################################
@@ -760,8 +761,7 @@ if (NOT BUILD_LP_ONLY)
             "$<$<COMPILE_LANGUAGE:CUDA>:${CUOPT_CUDA_FLAGS}>"
     )
 
-    target_link_options(cuopt_cli PRIVATE -pie)
-    target_link_options(cuopt_cli PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
+    target_link_options(cuopt_cli PRIVATE -pie -fopenmp)
 
     target_include_directories(cuopt_cli
             PRIVATE
@@ -821,7 +821,7 @@ if (BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY)
             OpenMP::OpenMP_CXX
             PRIVATE
     )
-    target_link_options(solve_MIP PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
+    target_link_options(solve_MIP PRIVATE -fopenmp)
     if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
         target_link_options(solve_MIP PRIVATE -Wl,--enable-new-dtags)
     endif ()
@@ -852,7 +852,7 @@ if (BUILD_LP_BENCHMARKS)
             OpenMP::OpenMP_CXX
             PRIVATE
     )
-    target_link_options(solve_LP PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
+    target_link_options(solve_LP PRIVATE -fopenmp)
     if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
         target_link_options(solve_LP PRIVATE -Wl,--enable-new-dtags)
     endif ()
@@ -883,8 +883,7 @@ if (NOT SKIP_GRPC_BUILD)
             PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUOPT_CXX_FLAGS}>"
     )
 
-    target_link_options(cuopt_grpc_server PRIVATE -pie)
-    target_link_options(cuopt_grpc_server PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>)
+    target_link_options(cuopt_grpc_server PRIVATE -pie -fopenmp)
 
     target_include_directories(cuopt_grpc_server
             PRIVATE

From 2c5fec2fa20343c50c2a7cda8f5c7cc299112849 Mon Sep 17 00:00:00 2001
From: Alice Boucher <yboucher@nvidia.com>
Date: Tue, 23 Jun 2026 07:21:26 -0700
Subject: [PATCH 22/22] wheel fix, hopefully

---
 cpp/CMakeLists.txt | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 385b43b3e2..98f7848fed 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -202,6 +202,21 @@ endif ()
 find_package(OpenMP REQUIRED)
 message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}")
 
+# Resolve libgomp from the active C++ compiler, not FindOpenMP's generic -lgomp (which can
+# resolve to an older system libgomp on Rocky/RHEL wheel builders). The fast MPS parser uses
+# OpenMP 5.0 detached tasks (omp_fulfill_event); compile and link must use the same libgomp.
+execute_process(
+        COMMAND ${CMAKE_CXX_COMPILER} -print-file-name=libgomp.so
+        OUTPUT_VARIABLE CUOPT_LIBGOMP_FILE
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+if (NOT IS_ABSOLUTE "${CUOPT_LIBGOMP_FILE}")
+    message(FATAL_ERROR "Could not resolve libgomp from ${CMAKE_CXX_COMPILER}: '${CUOPT_LIBGOMP_FILE}'")
+endif ()
+get_filename_component(CUOPT_LIBGOMP_DIR "${CUOPT_LIBGOMP_FILE}" DIRECTORY)
+message(STATUS "cuOpt: libgomp for OpenMP link = ${CUOPT_LIBGOMP_FILE}")
+list(APPEND CUOPT_CXX_FLAGS -fopenmp)
+
 # MPS/QPS parser supports compressed inputs via bzip2, zlib and lz4
 option(CUOPT_PARSER_WITH_BZIP2 "Build MPS parser with bzip2 decompression" ON)
 option(CUOPT_PARSER_WITH_ZLIB "Build MPS parser with zlib decompression" ON)
@@ -513,6 +528,7 @@ set_target_properties(cuopt
         INSTALL_RPATH "\$ORIGIN"
         INTERFACE_POSITION_INDEPENDENT_CODE ON
         CXX_SCAN_FOR_MODULES OFF
+        LINKER_LANGUAGE CXX
 )
 
 target_compile_definitions(cuopt
@@ -582,8 +598,7 @@ add_dependencies(cuopt PSLP)
 set(CUOPT_PRIVATE_CUDA_LIBS
         CUDA::curand
         CUDA::cusolver
-        TBB::tbb
-        OpenMP::OpenMP_CXX)
+        TBB::tbb)
 
 list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt)
 
@@ -627,17 +642,15 @@ target_link_libraries(cuopt
         PRIVATE
         ${CUOPT_PRIVATE_CUDA_LIBS}
         simde::simde
-        OpenMP::OpenMP_CXX
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )
 
-# Link with -fopenmp so the compiler driver pulls in its own (matching) libgomp
-# and orders its lib dir ahead of the system one. OpenMP::OpenMP_CXX alone supplies
-# a bare -lgomp, which can resolve to an older system libgomp missing OpenMP 5.0
-# symbols such as omp_fulfill_event (used by the fast MPS parser). Plain -fopenmp
-# (not gated on LINK_LANGUAGE:CXX) is required because cuopt is CUDA-linked.
-target_link_options(cuopt PRIVATE -fopenmp)
+# Force libgomp from the active C++ toolchain into libcuopt.so. OpenMP::OpenMP_CXX and/or -fopenmp
+# alone can leave omp_fulfill_event undefined (CUDA-linked target + --as-needed) or resolve a bare
+# -lgomp to an older system libgomp at executable link time. push/pop-state restores the prior mode.
+target_link_directories(cuopt PRIVATE "${CUOPT_LIBGOMP_DIR}")
+target_link_libraries(cuopt PRIVATE "-Wl,--push-state,--no-as-needed" gomp "-Wl,--pop-state")
 
 
 # ##################################################################################################
@@ -761,7 +774,7 @@ if (NOT BUILD_LP_ONLY)
             "$<$<COMPILE_LANGUAGE:CUDA>:${CUOPT_CUDA_FLAGS}>"
     )
 
-    target_link_options(cuopt_cli PRIVATE -pie -fopenmp)
+    target_link_options(cuopt_cli PRIVATE -pie)
 
     target_include_directories(cuopt_cli
             PRIVATE
@@ -776,7 +789,6 @@ if (NOT BUILD_LP_ONLY)
     target_link_libraries(cuopt_cli
             PUBLIC
             cuopt
-            OpenMP::OpenMP_CXX
             ${CUDSS_LIBRARIES}
             TBB::tbb
             PRIVATE
@@ -818,10 +830,8 @@ if (BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY)
     target_link_libraries(solve_MIP
             PUBLIC
             cuopt
-            OpenMP::OpenMP_CXX
             PRIVATE
     )
-    target_link_options(solve_MIP PRIVATE -fopenmp)
     if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
         target_link_options(solve_MIP PRIVATE -Wl,--enable-new-dtags)
     endif ()
@@ -849,10 +859,8 @@ if (BUILD_LP_BENCHMARKS)
     target_link_libraries(solve_LP
             PUBLIC
             cuopt
-            OpenMP::OpenMP_CXX
             PRIVATE
     )
-    target_link_options(solve_LP PRIVATE -fopenmp)
     if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
         target_link_options(solve_LP PRIVATE -Wl,--enable-new-dtags)
     endif ()
@@ -883,7 +891,7 @@ if (NOT SKIP_GRPC_BUILD)
             PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUOPT_CXX_FLAGS}>"
     )
 
-    target_link_options(cuopt_grpc_server PRIVATE -pie -fopenmp)
+    target_link_options(cuopt_grpc_server PRIVATE -pie)
 
     target_include_directories(cuopt_grpc_server
             PRIVATE
@@ -903,7 +911,6 @@ if (NOT SKIP_GRPC_BUILD)
     target_link_libraries(cuopt_grpc_server
             PUBLIC
             cuopt
-            OpenMP::OpenMP_CXX
             PRIVATE
             protobuf::libprotobuf
             gRPC::grpc++