diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7e2dd099c1..98f7848fed 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -202,9 +202,25 @@ endif ()
 find_package(OpenMP REQUIRED)
 message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}")
 
-# MPS/QPS parser supports compressed inputs via bzip2 and zlib
+# Resolve libgomp from the active C++ compiler, not FindOpenMP's generic -lgomp (which can
+# resolve to an older system libgomp on Rocky/RHEL wheel builders). The fast MPS parser uses
+# OpenMP 5.0 detached tasks (omp_fulfill_event); compile and link must use the same libgomp.
+execute_process(
+        COMMAND ${CMAKE_CXX_COMPILER} -print-file-name=libgomp.so
+        OUTPUT_VARIABLE CUOPT_LIBGOMP_FILE
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+if (NOT IS_ABSOLUTE "${CUOPT_LIBGOMP_FILE}")  # presumably a bare name or empty output on failure
+    message(FATAL_ERROR "Could not resolve libgomp from ${CMAKE_CXX_COMPILER}: '${CUOPT_LIBGOMP_FILE}'")
+endif ()
+get_filename_component(CUOPT_LIBGOMP_DIR "${CUOPT_LIBGOMP_FILE}" DIRECTORY)  # link search dir
+message(STATUS "cuOpt: libgomp for OpenMP link = ${CUOPT_LIBGOMP_FILE}")
+list(APPEND CUOPT_CXX_FLAGS -fopenmp)  # compile-side OpenMP flag, appended to the C++ flag list
+
+# MPS/QPS parser supports compressed inputs via bzip2, zlib and lz4
 option(CUOPT_PARSER_WITH_BZIP2 "Build MPS parser with bzip2 decompression" ON)
 option(CUOPT_PARSER_WITH_ZLIB "Build MPS parser with zlib decompression" ON)
+option(CUOPT_PARSER_WITH_LZ4 "Build experimental fast MPS parser with LZ4 decompression" ON)
 if (CUOPT_PARSER_WITH_BZIP2)
     find_package(BZip2 REQUIRED)
     add_compile_definitions(MPS_PARSER_WITH_BZIP2)
@@ -213,6 +229,10 @@ if (CUOPT_PARSER_WITH_ZLIB)
     find_package(ZLIB REQUIRED)
     add_compile_definitions(MPS_PARSER_WITH_ZLIB)
 endif ()
+if (CUOPT_PARSER_WITH_LZ4)
+    # No headers or link target needed; the experimental reader loads one liblz4 symbol at runtime.
+    add_compile_definitions(MPS_PARSER_WITH_LZ4)
+endif ()
 
 # Debug options
 if (CMAKE_BUILD_TYPE MATCHES Debug)
@@ -250,6 +270,20 @@ else ()
     find_package(RAFT REQUIRED)
 endif ()
 
+rapids_cpm_find(simde 0.8.2
+        CPM_ARGS
+        GIT_REPOSITORY https://github.com/simd-everywhere/simde.git
+        GIT_TAG v0.8.2
+        GIT_SHALLOW TRUE
+        DOWNLOAD_ONLY TRUE  # fetch the sources only; SIMDe's own build is not configured
+)
+# Wrap the fetched headers in an imported target unless a simde::simde target already exists.
+if (NOT TARGET simde::simde)
+    add_library(simde::simde INTERFACE IMPORTED GLOBAL)
+    set_target_properties(simde::simde  # NOTE(review): assumes simde_SOURCE_DIR was populated
+            PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${simde_SOURCE_DIR}")
+endif ()
+
 FetchContent_Declare(
         papilo
         GIT_REPOSITORY "https://github.com/scipopt/papilo.git"
@@ -436,16 +470,27 @@ if (BUILD_TESTS)
 endif ()
 
 set(CUOPT_SRC_FILES)
+set(MPS_FAST_SRC_FILES)
 add_subdirectory(src)
 if (HOST_LINEINFO)
-    set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
+    set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
 endif ()
 
+# BMI2/AVX2/SSE4.2 are needed by the fast MPS parser sources only; every x86-64-v3 CPU (roughly Haswell, 2013, onwards) has them
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND
+        CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$")
+    set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+            APPEND PROPERTY COMPILE_OPTIONS "-mbmi2;-mavx2;-msse4.2")
+endif ()
+
+# TODO: figure out a set of flags for ARM that fits the range of CPUs we wish to support (neoverse?)
+# NEON should be universal on aarch64 and enough for our purposes (parsing) though
+
 # Apply -UNDEBUG only to solver source files (not gRPC infrastructure).
 # Must happen before gRPC files are appended to CUOPT_SRC_FILES.
 # Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO).
 if (DEFINE_ASSERT)
-    set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
+    set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
             APPEND PROPERTY COMPILE_OPTIONS "-UNDEBUG")
 endif ()
 
@@ -470,7 +515,7 @@ if (NOT SKIP_GRPC_BUILD)
     # The conda-forge abseil shared library is built with NDEBUG and does not
     # export that symbol (abseil-cpp#1624).  Without this, Debug builds fail
     # at runtime with "undefined symbol: absl::…::Mutex::Dtor".
-    set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
+    set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
             APPEND PROPERTY COMPILE_OPTIONS "-DNDEBUG")
 endif (NOT SKIP_GRPC_BUILD)
 
@@ -483,6 +528,7 @@ set_target_properties(cuopt
         INSTALL_RPATH "\$ORIGIN"
         INTERFACE_POSITION_INDEPENDENT_CODE ON
         CXX_SCAN_FOR_MODULES OFF
+        LINKER_LANGUAGE CXX
 )
 
 target_compile_definitions(cuopt
@@ -552,8 +598,7 @@ add_dependencies(cuopt PSLP)
 set(CUOPT_PRIVATE_CUDA_LIBS
         CUDA::curand
         CUDA::cusolver
-        TBB::tbb
-        OpenMP::OpenMP_CXX)
+        TBB::tbb)
 
 list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt)
 
@@ -596,10 +641,17 @@ target_link_libraries(cuopt
         ${CUDSS_LIB_FILE}
         PRIVATE
         ${CUOPT_PRIVATE_CUDA_LIBS}
+        simde::simde
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )
 
+# Force libgomp from the active C++ toolchain into libcuopt.so. OpenMP::OpenMP_CXX and/or
+# -fopenmp alone can leave omp_fulfill_event undefined (CUDA-linked target + --as-needed) or
+# resolve a trailing bare -lgomp to an older system libgomp at executable link time.
+target_link_directories(cuopt PRIVATE ${CUOPT_LIBGOMP_DIR})
+target_link_libraries(cuopt PRIVATE "-Wl,--no-as-needed" gomp "-Wl,--as-needed")
+
 
 # ##################################################################################################
 # - generate tests --------------------------------------------------------------------------------
@@ -737,7 +789,6 @@ if (NOT BUILD_LP_ONLY)
     target_link_libraries(cuopt_cli
             PUBLIC
             cuopt
-            OpenMP::OpenMP_CXX
             ${CUDSS_LIBRARIES}
             TBB::tbb
             PRIVATE
@@ -779,7 +830,6 @@ if (BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY)
     target_link_libraries(solve_MIP
             PUBLIC
             cuopt
-            OpenMP::OpenMP_CXX
             PRIVATE
     )
     if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
@@ -809,7 +859,6 @@ if (BUILD_LP_BENCHMARKS)
     target_link_libraries(solve_LP
             PUBLIC
             cuopt
-            OpenMP::OpenMP_CXX
             PRIVATE
     )
     if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "")
@@ -862,7 +911,6 @@ if (NOT SKIP_GRPC_BUILD)
     target_link_libraries(cuopt_grpc_server
             PUBLIC
             cuopt
-            OpenMP::OpenMP_CXX
             PRIVATE
             protobuf::libprotobuf
             gRPC::grpc++
diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index 37876cac7a..13991ad1e3 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -90,11 +90,13 @@ inline cuopt::init_logger_t dummy_logger(
  *                  .mps/.qps and their .gz/.bz2 variants → MPS parser;
  *                  anything else is rejected.
  * @param initial_solution_file Path to initial solution file in SOL format
+ * @param mps_reader MPS reader implementation selected by the CLI
  * @param settings Merged solver settings (config file loaded in main, then CLI overrides applied)
  */
 int run_single_file(const std::string& file_path,
                     const std::string& initial_solution_file,
                     bool solve_relaxation,
+                    cuopt::linear_programming::io::mps_reader_type_t mps_reader,
                     cuopt::linear_programming::solver_settings_t<int, double>& settings)
 {
   cuopt::init_logger_t log(settings.get_parameter<std::string>(CUOPT_LOG_FILE),
@@ -108,7 +110,7 @@ int run_single_file(const std::string& file_path,
   {
     CUOPT_LOG_INFO("Reading file %s", base_filename.c_str());
     try {
-      mps_data_model = cuopt::linear_programming::io::read<int, double>(file_path);
+      mps_data_model = cuopt::linear_programming::io::read<int, double>(file_path, mps_reader);
     } catch (const std::logic_error& e) {
       CUOPT_LOG_ERROR("Parser exception: %s", e.what());
       parsing_failed = true;
@@ -284,8 +286,8 @@ int main(int argc, char* argv[])
   program.add_argument("filename")
     .help(
       "input problem file; format dispatched by extension (case-insensitive). "
-      "Supported: .lp, .mps, .qps and their .gz / .bz2 compressed variants "
-      "(e.g. .lp.gz, .mps.bz2, .qps.gz)")
+      "Supported: .lp, .mps, .qps and their .gz / .bz2 / .lz4 compressed variants "
+      "(e.g. .lp.gz, .mps.bz2, .qps.lz4).")
     .nargs(1)
     .required();
 
@@ -303,6 +305,14 @@ int main(int argc, char* argv[])
     .help("path to parameter config file (key = value format, supports all parameters)")
     .default_value(std::string(""));
 
+  program.add_argument("--mps-reader")
+    .help(
+      "MPS reader implementation: default uses the production parser; experimental-fast uses the "
+      "experimental SIMD parser for free-format LP/MIP/QP/QCQP (SOCP) .mps/.qps files and their "
+      ".gz/.bz2/.lz4 compressed variants")
+    .default_value(std::string("default"))
+    .choices("default", "experimental-fast");
+
   program.add_argument("--dump-hyper-params")
     .help("print hyper-parameters only in config file format and exit")
     .default_value(false)
@@ -403,6 +413,12 @@ int main(int argc, char* argv[])
   const auto initial_solution_file = program.get<std::string>("--initial-solution");
   const auto solve_relaxation      = program.get<bool>("--relaxation");
   const auto params_file           = program.get<std::string>("--params-file");
+  const auto mps_reader_arg        = program.get<std::string>("--mps-reader");
+  // Only "experimental-fast" selects the experimental reader; any other value keeps the default.
+  using cuopt::linear_programming::io::mps_reader_type_t;
+  auto mps_reader = (mps_reader_arg == "experimental-fast")
+                      ? mps_reader_type_t::fast_experimental
+                      : mps_reader_type_t::default_reader;
 
   cuopt::linear_programming::solver_settings_t<int, double> settings;
   try {
@@ -432,5 +448,5 @@ int main(int argc, char* argv[])
     RAFT_CUDA_TRY(cudaSetDevice(0));
   }
 
-  return run_single_file(file_name, initial_solution_file, solve_relaxation, settings);
+  return run_single_file(file_name, initial_solution_file, solve_relaxation, mps_reader, settings);
 }
diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp
index a63e40f31f..7122282e70 100644
--- a/cpp/include/cuopt/linear_programming/io/parser.hpp
+++ b/cpp/include/cuopt/linear_programming/io/parser.hpp
@@ -11,17 +11,26 @@
 
 #include <algorithm>
 #include <cctype>
+#include <cstring>
 #include <stdexcept>
 #include <string>
 #include <string_view>
 
 namespace cuopt::linear_programming::io {
 
+/**
+ * @brief Selects which MPS reader implementation should be used by dispatching entry points.
+ *
+ * The experimental fast reader is intentionally opt-in. It supports the same free-format
+ * MPS/QPS scope as read_mps(): LP, MIP, QP (QUADOBJ/QMATRIX), and QCQP/SOCP (QCMATRIX).
+ */
+enum class mps_reader_type_t { default_reader, fast_experimental };
+
 /**
  * @brief Reads the equation from an MPS or QPS file.
  *
  * The input file can be a plain text file in MPS-/QPS-format or a compressed MPS/QPS
- * file (.mps.gz or .mps.bz2).
+ * file (.mps.gz, .mps.bz2, or .mps.lz4).
  *
  * Read this link http://lpsolve.sourceforge.net/5.5/mps-format.htm for more
  * details on both free and fixed MPS format.
@@ -32,8 +41,8 @@ namespace cuopt::linear_programming::io {
  * - QMATRIX: Full symmetric quadratic objective matrix (alternative to QUADOBJ)
  * - QCMATRIX: Symmetric quadratic terms for a named constraint row (QCQP)
  *
- * Note: Compressed MPS files .mps.gz, .mps.bz2 can only be read if the compression
- * libraries zlib or libbzip2 are installed, respectively.
+ * Note: Compressed MPS files .mps.gz, .mps.bz2, and .mps.lz4 can only be read if
+ * zlib, libbzip2, or liblz4 are installed, respectively.
  *
  * @param[in] mps_file_path Path to MPS/QPSfile.
  * @param[in] fixed_mps_format If MPS/QPS file should be parsed as fixed, false by default
@@ -43,6 +52,19 @@ template <typename i_t, typename f_t>
 mps_data_model_t<i_t, f_t> read_mps(const std::string& mps_file_path,
                                     bool fixed_mps_format = false);
 
+/**
+ * @brief Reads an MPS/QPS problem with the experimental SIMD-optimized reader.
+ *
+ * Supports the same free-format LP/MIP/QP/QCQP (SOCP-relevant QCMATRIX) scope as read_mps().
+ * Fixed MPS format forcing is not supported. Accepts .mps/.qps and their .gz/.bz2/.lz4 variants
+ * (compression is detected from the file path, same as read_mps()).
+ *
+ * @param[in] mps_file_path Path to a raw or compressed .mps or .qps file.
+ * @return mps_data_model_t A fully formed LP/MIP/QP problem which represents the given file.
+ */
+template <typename i_t, typename f_t>
+mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path);
+
 /**
  * @brief Reads an MPS problem from in-memory file contents.
  *
@@ -111,38 +133,72 @@ mps_data_model_t<i_t, f_t> read_lp_from_string(std::string_view lp_contents);
  * @brief Reads an optimization problem from a file, dispatching on the file
  *        extension. Extension matching is case-insensitive.
  *
- * Routing:
- *   - .mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2 → read_mps()
- *   - .lp,  .lp.gz,  .lp.bz2                            → read_lp()
+ * Routing (case-insensitive extensions):
+ *   - .mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4
+ *     → read_mps() when mps_reader == default_reader, or read_mps_fast_experimental()
+ *       when mps_reader == fast_experimental (fixed_mps_format must be false)
+ *   - .lp,  .lp.gz,  .lp.bz2, .lp.lz4 → read_lp()
  *   - anything else → std::logic_error
  *
  * This is the entry point of choice for user-facing tools (CLI, C API) that
  * want both formats to "just work" without an explicit format flag.
  *
  * @param[in] path Path to the input file.
+ * @param[in] mps_reader Selects the MPS reader implementation for MPS/QPS inputs.
  * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format;
  *             ignored for LP inputs. False by default.
  * @return mps_data_model_t The parsed problem.
  */
 template <typename i_t, typename f_t>
-inline mps_data_model_t<i_t, f_t> read(const std::string& path, bool fixed_mps_format = false)
+inline mps_data_model_t<i_t, f_t> read(const std::string& path,
+                                       mps_reader_type_t mps_reader,
+                                       bool fixed_mps_format = false)
 {
   std::string lower(path);
   std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
     return static_cast<char>(std::tolower(c));
   });
-  if (lower.ends_with(".mps") || lower.ends_with(".mps.gz") || lower.ends_with(".mps.bz2") ||
-      lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2")) {
-    return read_mps<i_t, f_t>(path, fixed_mps_format);
+  // Strip one compression suffix at most; the format extension is matched below.
+  for (const std::string_view suffix : {".bz2", ".gz", ".lz4"}) {
+    if (!lower.ends_with(suffix)) { continue; }
+    lower.erase(lower.size() - suffix.size());
+    break;
+  }
-  if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2")) {
-    return read_lp<i_t, f_t>(path);
+  if (lower.ends_with(".mps") || lower.ends_with(".qps")) {
+    if (mps_reader == mps_reader_type_t::fast_experimental) {
+      if (fixed_mps_format) {
+        throw std::logic_error(
+          "experimental fast MPS reader does not support fixed MPS format forcing");
+      }
+      return read_mps_fast_experimental<i_t, f_t>(path);
+    }
+    return read_mps<i_t, f_t>(path, fixed_mps_format);
   }
+  if (lower.ends_with(".lp")) { return read_lp<i_t, f_t>(path); }
   throw std::logic_error(
     "read: unrecognized input file extension. Supported (case-insensitive): "
-    ".mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2, .lp, .lp.gz, .lp.bz2. "
+    ".mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4, "
+    ".lp, .lp.gz, .lp.bz2, .lp.lz4. "
     "Given path: " +
     path);
 }
 
+/**
+ * @brief Reads an optimization problem from a file, dispatching on the file
+ *        extension. Extension matching is case-insensitive.
+ *
+ * Uses the default MPS reader. See the 3-argument read() overload for routing
+ * details and supported extensions.
+ *
+ * @param[in] path Path to the input file.
+ * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format;
+ *             ignored for LP inputs. False by default.
+ * @return mps_data_model_t The parsed problem.
+ */
+template <typename i_t, typename f_t>
+inline mps_data_model_t<i_t, f_t> read(const std::string& path, bool fixed_mps_format = false)
+{
+  return read<i_t, f_t>(path, mps_reader_type_t::default_reader, fixed_mps_format);
+}
+
 }  // namespace cuopt::linear_programming::io
diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt
index 1ae6988466..6883cce82f 100644
--- a/cpp/src/CMakeLists.txt
+++ b/cpp/src/CMakeLists.txt
@@ -25,3 +25,4 @@ add_subdirectory(branch_and_bound)
 add_subdirectory(cuts)
 
 set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${UTIL_SRC_FILES} PARENT_SCOPE)
+set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE)
diff --git a/cpp/src/io/CMakeLists.txt b/cpp/src/io/CMakeLists.txt
index cc4affa890..cafcffb23f 100644
--- a/cpp/src/io/CMakeLists.txt
+++ b/cpp/src/io/CMakeLists.txt
@@ -3,6 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 
+set(MPS_FAST_SRC_FILES
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/file_reader.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/lz4_file_reader.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/mps_section_scanner.cpp
+)
+
 set(PARSERS_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/data_model_view.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/file_to_string.cpp
@@ -13,6 +20,8 @@ set(PARSERS_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/parser.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/writer.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/utilities/cython_parser.cpp
+  ${MPS_FAST_SRC_FILES}
 )
 
 set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${PARSERS_SRC_FILES} PARENT_SCOPE)
+set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE)
diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
new file mode 100644
index 0000000000..02aca44dc3
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
@@ -0,0 +1,436 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <utilities/error.hpp>
+
+#include <array>
+#include <bit>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <stdexcept>
+#include <string_view>
+
+namespace cuopt::linear_programming::io::detail {
+
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_expects;
+using cuopt::linear_programming::io::mps_parser_fail;
+
+namespace fp64 {
+
+#define FASTP64_MIN_EXP_10    (-307)  // smallest decimal exponent present in the power table
+#define FASTP64_MAX_EXP_10    288     // largest decimal exponent present in the power table
+#define FASTP64_POWER_COUNT   (FASTP64_MAX_EXP_10 - FASTP64_MIN_EXP_10 + 1)  // table entry count
+#define FASTP64_MANTISSA_MASK ((uint64_t{1} << 52) - 1)  // low 52 bits kept as the FP64 fraction
+#define FASTP64_EXPONENT_MASK 0x7FF  // biased exponents at or above this are rejected
+#define FASTP64_HALF_MASK     0x1FF  // low 9 bits of the product's high word (rounding window)
+
+// Fast FP64 parser optimized for the <=19digits case, based on the Eisel-Lemire algorithm
+// see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51
+// (8), 2021.
+// verified on a large corpus of FP64 values: https://github.com/lemire/simple_fastfloat_benchmark
+
+struct power_10_lut_entry_t {
+  uint64_t high;  // most significant 64 bits of the normalized power of ten
+  uint64_t low;   // next 64 bits, consulted only to refine near-halfway products
+  int biased_e2;  // exponent term; eisel_lemire subtracts the mantissa's leading-zero count
+};
+
+// constexpr 256-bit unsigned integer, limb[0] least significant; builds the Eisel-Lemire table
+struct cuopt_uint256_t {
+  std::array<uint64_t, 4> limb{};
+
+  // Multiplies in place by m and returns the bits carried out of the top limb.
+  constexpr uint32_t mul_u32(uint32_t m)
+  {
+    unsigned __int128 acc = 0;
+    for (uint64_t& word : limb) {
+      acc += (unsigned __int128)word * m;
+      word = (uint64_t)acc;
+      acc >>= 64;
+    }
+    return (uint32_t)acc;
+  }
+
+  // Returns the value shifted left by bits (0 <= bits < 64); bits leaving the top limb are lost.
+  constexpr cuopt_uint256_t shl_small(int bits) const
+  {
+    if (bits == 0) { return *this; }
+    cuopt_uint256_t shifted;
+    shifted.limb[0] = limb[0] << bits;
+    for (int i = 1; i < 4; ++i) {
+      shifted.limb[i] = (limb[i] << bits) | (limb[i - 1] >> (64 - bits));
+    }
+    return shifted;
+  }
+};
+
+struct cuopt_normalized_uint256_t {  // value = sig * 2^exp2, sig kept with its top bit set
+  cuopt_uint256_t sig;
+  int exp2 = 0;
+  // Returns the constant 1, stored as 2^255 * 2^-255.
+  static constexpr cuopt_normalized_uint256_t one()
+  {
+    cuopt_normalized_uint256_t x;
+    x.sig.limb[3] = uint64_t{1} << 63;
+    x.exp2        = -255;
+    return x;
+  }
+  // Multiplies by 10, then shifts right by the width of the carry to renormalize (truncating).
+  constexpr void mul10()
+  {
+    uint32_t carry = sig.mul_u32(10);
+    int shift      = 32 - std::countl_zero(carry);  // number of significant bits in carry
+    // The normalized 256-bit value always overflows into carry after *10; keep
+    // the guard explicit because the cross-limb path shifts by 64 - shift.
+    if (shift == 0) { return; }
+    cuopt_uint256_t out;
+    for (int i = 0; i < 4; ++i) {
+      uint64_t lower = sig.limb[i] >> shift;
+      uint64_t upper = 0;
+      if (i + 1 < 4) {
+        upper = sig.limb[i + 1] << (64 - shift);
+      } else {
+        upper = (uint64_t)carry << (64 - shift);  // the carry supplies the new top bits
+      }
+      out.limb[i] = lower | upper;
+    }
+    sig = out;
+    exp2 += shift;
+  }
+  // Divides by 10 (truncating); pre-shifts left by 3 or 4 bits so the quotient stays normalized.
+  constexpr void div10()
+  {
+    constexpr uint64_t div10_shift_4_threshold = 0xA000000000000000ULL;
+    int shift                                  = sig.limb[3] < div10_shift_4_threshold ? 4 : 3;
+    uint64_t extra                             = sig.limb[3] >> (64 - shift);
+    cuopt_uint256_t shifted                    = sig.shl_small(shift);
+    // Long division by 10, most significant limb first; the shifted-out bits seed the remainder.
+    cuopt_uint256_t quotient;
+    unsigned __int128 rem = extra;
+    for (int i = 3; i >= 0; --i) {
+      unsigned __int128 cur = (rem << 64) | shifted.limb[i];
+      quotient.limb[i]      = (uint64_t)(cur / 10);
+      rem                   = cur % 10;
+    }
+    sig = quotient;
+    exp2 -= shift;
+  }
+};
+
+constexpr power_10_lut_entry_t make_power(const cuopt_normalized_uint256_t& p)
+{
+  int e2 = p.exp2 + 192;  // rebase the scale from the 256-bit significand to its top 64 bits
+  return {p.sig.limb[3], p.sig.limb[2], 1150 + e2};  // top 128 bits plus the offset exponent
+}
+
+// Compile-time table of normalized powers of ten, indexed by (exp10 - FASTP64_MIN_EXP_10).
+constexpr std::array<power_10_lut_entry_t, FASTP64_POWER_COUNT> make_power_table()
+{
+  constexpr int zero_index = -FASTP64_MIN_EXP_10;  // slot holding 10^0
+  std::array<power_10_lut_entry_t, FASTP64_POWER_COUNT> lut{};
+  lut[zero_index] = make_power(cuopt_normalized_uint256_t::one());
+
+  auto rising = cuopt_normalized_uint256_t::one();  // positive exponents: multiply up from 10^0
+  for (int idx = zero_index + 1; idx < FASTP64_POWER_COUNT; ++idx) {
+    rising.mul10();
+    lut[idx] = make_power(rising);
+  }
+  auto falling = cuopt_normalized_uint256_t::one();  // negative exponents: divide down from 10^0
+  for (int idx = zero_index - 1; idx >= 0; --idx) {
+    falling.div10();
+    lut[idx] = make_power(falling);
+  }
+  return lut;
+}
+
+inline constexpr auto fast_fp64_parse_lut = make_power_table();
+
+inline constexpr std::array<double, 23> small_powers = {
+  1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10, 1e11,
+  1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
+
+inline constexpr std::array<uint64_t, 16> small_integer_powers = {1ULL,
+                                                                  10ULL,
+                                                                  100ULL,
+                                                                  1000ULL,
+                                                                  10000ULL,
+                                                                  100000ULL,
+                                                                  1000000ULL,
+                                                                  10000000ULL,
+                                                                  100000000ULL,
+                                                                  1000000000ULL,
+                                                                  10000000000ULL,
+                                                                  100000000000ULL,
+                                                                  1000000000000ULL,
+                                                                  10000000000000ULL,
+                                                                  100000000000000ULL,
+                                                                  1000000000000000ULL};
+
+struct parsed_decimal_t {
+  bool negative      = false;
+  bool fast_eligible = false;
+  uint64_t mantissa  = 0;
+  int exp10          = 0;
+};
+
+static inline bool is_digit(char c) noexcept { return (unsigned)(c - '0') <= 9u; }  // '0'..'9'
+
+// Parses exactly 8 ASCII digits at p into out with SWAR (SIMD-within-a-register) arithmetic;
+// reads 8 bytes and returns false, leaving out untouched, unless all of them are '0'..'9'.
+// Preferred over AVX2 (no cheap swizzle across 16B lanes); 16B SSE showed no real difference.
+static inline bool parse_8_digits(const char* p, uint32_t& out)
+{
+  // memcpy + bit_cast load the 8 bytes without violating strict aliasing rules
+  std::array<char, sizeof(uint64_t)> bytes{};
+  std::memcpy(bytes.data(), p, bytes.size());
+  uint64_t raw       = std::bit_cast<uint64_t>(bytes);
+  uint64_t high      = raw & 0xF0F0F0F0F0F0F0F0ULL;
+  uint64_t low_check = (raw + 0x0606060606060606ULL) & 0xF0F0F0F0F0F0F0F0ULL;
+  if (high != 0x3030303030303030ULL || low_check != 0x3030303030303030ULL) { return false; }
+  // Merge digits into pairs, quads, then all 8; NOTE(review): assumes a little-endian load.
+  uint64_t v     = raw - 0x3030303030303030ULL;
+  uint64_t pairs = (v * 10 + (v >> 8)) & 0x00FF00FF00FF00FFULL;
+  uint64_t quads = (pairs * 100 + (pairs >> 16)) & 0x0000FFFF0000FFFFULL;
+  out            = (uint32_t)((quads * 10000 + (quads >> 32)) & 0xFFFFFFFFULL);
+  return true;
+}
+
+// Appends the run of decimal digits at p to out (out = out * 10^k + digits) and leaves p on the
+// first non-digit or at end. No overflow check: the unsigned arithmetic wraps.
+static inline void parse_u64_digits_advance(const char*& p, const char* end, uint64_t& out)
+{
+  while (p < end && is_digit(*p)) {
+    uint32_t eight = 0;
+    if (end - p >= 8 && parse_8_digits(p, eight)) {  // bulk path: 8 digits per step
+      out = out * 100000000ULL + (uint64_t)eight;
+      p += 8;
+    } else {
+      out = out * 10 + (uint64_t)(*p - '0');
+      ++p;
+    }
+  }
+}
+
+static inline void scan_digit_run(const char*& p,          // in/out: advanced past the digit run
+                                  const char* end,         // one past the last readable byte
+                                  bool after_dot,          // true while scanning fraction digits
+                                  parsed_decimal_t& out,   // mantissa is accumulated here
+                                  bool& saw_digit,         // set once any digit is consumed
+                                  int& frac_digits,        // digits consumed after the dot
+                                  int& sig_digits,         // digit budget used by the mantissa
+                                  bool& too_many_digits)   // set when the 19-digit budget overflows
+{
+  while (p < end) {
+    uint32_t chunk = 0;
+    if (end - p >= 8 && parse_8_digits(p, chunk)) {
+      saw_digit = true;
+      if (after_dot) frac_digits += 8;
+      // Once the budget is exceeded, digits are still consumed but no longer accumulated.
+      if (!too_many_digits) {
+        if (sig_digits == 0 && chunk == 0) {
+          p += 8;
+          continue;
+        }
+        // A chunk always charges 8 digits to the budget, even if it starts with zeros.
+        if (sig_digits + 8 <= 19) {
+          out.mantissa = out.mantissa * 100000000ULL + chunk;
+          sig_digits += 8;
+        } else {
+          too_many_digits = true;
+        }
+      }
+
+      p += 8;
+      continue;
+    }
+    // Scalar path: one character at a time; stops at the first non-digit.
+    if (!is_digit(*p)) return;
+    saw_digit = true;
+    int digit = *p - '0';
+    if (after_dot) ++frac_digits;
+    if (!too_many_digits && (digit != 0 || sig_digits != 0)) {  // leading zeros are free
+      if (sig_digits < 19) {
+        out.mantissa = (out.mantissa * 10) + (uint64_t)digit;
+        ++sig_digits;
+      } else {
+        too_many_digits = true;
+      }
+    }
+    ++p;
+  }
+}
+// Scans [sign] digits [. digits] [e|E|d|D [sign] digits] at p into out; false if no digit was seen.
+static inline bool parse_decimal_advance(const char*& p, const char* end, parsed_decimal_t& out)
+{
+  if (p < end && (*p == '-' || *p == '+')) {
+    out.negative = *p == '-';
+    ++p;
+  }
+
+  bool saw_digit       = false;
+  int frac_digits      = 0;
+  int sig_digits       = 0;
+  bool too_many_digits = false;
+
+  scan_digit_run(p, end, false, out, saw_digit, frac_digits, sig_digits, too_many_digits);
+  if (p < end && *p == '.') {
+    ++p;
+    scan_digit_run(p, end, true, out, saw_digit, frac_digits, sig_digits, too_many_digits);
+  }
+  // A lone sign or dot without any digit is not a number; p stays where scanning stopped.
+  if (!saw_digit) return false;
+  // Optional exponent; Fortran-style 'd'/'D' markers are accepted alongside 'e'/'E'.
+  int explicit_exp = 0;
+  if (p < end && (*p == 'e' || *p == 'E' || *p == 'd' || *p == 'D')) {
+    const char* exp_start = p;
+    ++p;
+    bool exp_negative = false;
+    if (p < end && (*p == '-' || *p == '+')) {
+      exp_negative = *p == '-';
+      ++p;
+    }
+    if (p == end || !is_digit(*p)) {
+      p = exp_start;  // no exponent digits: rewind so the marker is left unconsumed
+    } else {
+      int exp_value = 0;
+      while (p < end && is_digit(*p)) {
+        if (exp_value < 1000000) exp_value = exp_value * 10 + (*p - '0');
+        ++p;
+      }
+      explicit_exp = exp_negative ? -exp_value : exp_value;
+    }
+  }
+  // Each fraction digit lowers the decimal exponent by one; exp_value stops growing at 1000000.
+  out.exp10         = explicit_exp - frac_digits;
+  out.fast_eligible = !too_many_digits;
+  return true;
+}
+
+// Slow path for rare edge cases and ambiguous roundings: parses the whole token with strtod.
+// Fails via mps_parser_fail() on empty, over-long, partially consumed or out-of-range tokens.
+static inline double fallback_strtod(std::string_view s)
+{
+  char stack_buf[32];
+  // The MPS specs mandate that numeric tokens are not longer than 25 characters
+  if (s.size() >= sizeof(stack_buf)) {
+    mps_parser_fail(error_type_t::ValidationError, "MPS numeric token exceeds supported length");
+  }
+  std::memcpy(stack_buf, s.data(), s.size());
+  stack_buf[s.size()] = '\0';
+  for (size_t i = 0; i < s.size(); ++i)  // strtod wants 'e', not a Fortran-style 'd'/'D'
+    if (stack_buf[i] == 'd' || stack_buf[i] == 'D') stack_buf[i] = 'e';
+  char* parse_end = nullptr;
+  errno           = 0;
+  double value    = std::strtod(stack_buf, &parse_end);
+  // An empty token must be rejected explicitly: strtod reports "no conversion" by leaving
+  // parse_end at the start, which would otherwise satisfy the full-consumption check.
+  if (s.empty() || parse_end != stack_buf + s.size() || errno == ERANGE) {
+    mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token");
+  }
+  return value;
+}
+
+// Eisel-Lemire fast path, see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software:
+// Practice and Experience 51 (8), 2021. Sets bits to the FP64 magnitude of man * 10^exp10; man != 0.
+static inline bool eisel_lemire(uint64_t man, int exp10, uint64_t& bits)
+{
+  if (exp10 < FASTP64_MIN_EXP_10 || exp10 > FASTP64_MAX_EXP_10) { return false; }
+  // A false return means "undecided"; parse_fp64_advance then re-parses the token with strtod.
+  const power_10_lut_entry_t p = fast_fp64_parse_lut[exp10 - FASTP64_MIN_EXP_10];
+  int lz                       = std::countl_zero(man);
+  uint64_t norm                = man << lz;  // mantissa with its top bit set
+  int adj_e2                   = p.biased_e2 - lz;
+
+  unsigned __int128 product = (unsigned __int128)norm * p.high;
+  uint64_t hi               = (uint64_t)(product >> 64);
+  uint64_t lo               = (uint64_t)product;
+
+  // If the high product lands near the 9-bit halfway window, include the low
+  // 64x64 product to disambiguate rounding before deciding whether to fallback.
+  if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK && lo + norm < norm) {
+    unsigned __int128 low_product = (unsigned __int128)norm * p.low;
+    uint64_t low_hi               = (uint64_t)(low_product >> 64);
+    uint64_t low_lo               = (uint64_t)low_product;
+    uint64_t old_lo               = lo;
+    lo += low_hi;
+    hi += lo < old_lo ? 1 : 0;  // propagate the carry out of the low word
+    if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK &&
+        lo == std::numeric_limits<uint64_t>::max() && low_lo + norm < low_lo) {
+      return false;
+    }
+  }
+
+  uint64_t hi_msb = hi >> 63;
+  // Extract 54 bits: 53 significand bits plus one rounding bit. The product
+  // may be shifted by one depending on whether hi already has its top bit set.
+  uint64_t x54 = hi >> (9 + hi_msb);
+  adj_e2 -= (int)(1 - hi_msb);
+
+  // Exact halfway with round-to-even ambiguity; let strtod handle the rare tie.
+  if (lo == 0 && (hi & FASTP64_HALF_MASK) == 0 && (x54 & 3) == 1) { return false; }
+
+  // Round 54 -> 53 bits, carry into the exponent if rounding overflows.
+  uint64_t x53      = (x54 + (x54 & 1)) >> 1;
+  uint64_t overflow = x53 >> 53;
+  uint64_t ret_man  = (x53 >> overflow) & FASTP64_MANTISSA_MASK;
+  int ret_exp       = adj_e2 + (int)overflow;
+  if (ret_exp <= 0 || ret_exp >= FASTP64_EXPONENT_MASK) { return false; }  // not a normal FP64
+
+  bits = ((uint64_t)ret_exp << 52) | ret_man;
+  return true;
+}
+
+// Turns a parsed decimal into a double. A quiet NaN result is the "undecided" marker returned
+// when neither the exact small-power shortcut nor eisel_lemire() applies;
+// parse_fp64_advance() reacts to that marker by re-parsing the token with strtod.
+static inline double assemble_fp64(const parsed_decimal_t& dec)
+{
+  const uint64_t sign_bit = dec.negative ? (uint64_t{1} << 63) : 0;
+  if (dec.mantissa == 0) { return std::bit_cast<double>(sign_bit); }  // signed zero
+  if (!dec.fast_eligible) { return std::numeric_limits<double>::quiet_NaN(); }
+
+  // Shortcut: a mantissa within 2^53 and a small power of ten are both exact doubles.
+  constexpr uint64_t exact_int_bound = uint64_t{1} << 53;
+  if (dec.exp10 >= 0 && dec.exp10 < (int)small_integer_powers.size()) {
+    if (dec.mantissa <= exact_int_bound / small_integer_powers[dec.exp10]) {
+      const double magnitude = (double)dec.mantissa * small_powers[dec.exp10];
+      return dec.negative ? -magnitude : magnitude;
+    }
+  } else if (dec.exp10 < 0 && dec.exp10 >= -22 && dec.mantissa < exact_int_bound) {
+    const double magnitude = (double)dec.mantissa / small_powers[-dec.exp10];
+    return dec.negative ? -magnitude : magnitude;
+  }
+
+  // General case: Eisel-Lemire yields the magnitude bits, the sign bit is OR-ed in.
+  uint64_t magnitude_bits = 0;
+  if (!eisel_lemire(dec.mantissa, dec.exp10, magnitude_bits)) {
+    return std::numeric_limits<double>::quiet_NaN();
+  }
+  return std::bit_cast<double>(sign_bit | magnitude_bits);
+}
+
+// Parses one numeric token at p and advances p past it. The token must end at `end` or at a
+// whitespace/control byte (<= 32); empty, malformed or out-of-range tokens fail via
+// mps_parser_fail(). Falls back to strtod when the fast path cannot decide the rounding.
+static inline double parse_fp64_advance(const char*& p, const char* end)
+{
+  const char* start = p;
+  parsed_decimal_t dec;
+  const bool has_digits = parse_decimal_advance(p, end, dec);
+  // Validate before choosing a conversion path so the fast and strtod paths reject the same
+  // inputs: a token without digits, or one followed directly by a non-whitespace byte.
+  if (!has_digits || (p < end && (unsigned char)*p > 32)) {
+    mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token");
+  }
+  const double v = assemble_fp64(dec);
+  if (v == v) { return v; }  // NaN is assemble_fp64's "undecided" marker
+  return fallback_strtod(std::string_view(start, (size_t)(p - start)));
+}
+
+}  // namespace fp64
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
new file mode 100644
index 0000000000..8897bfef1c
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -0,0 +1,386 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "fast_fp64_parser.hpp"
+
+#include <cstdarg>
+#include <cstddef>
+#include <utility>
+
+#include <simde/x86/avx2.h>
+#include <simde/x86/sse4.2.h>
+
+#ifndef LIKELY
+#define LIKELY(x) __builtin_expect(!!(x), 1)
+#endif
+
+#ifndef UNLIKELY
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#endif
+
+namespace cuopt::linear_programming::io::detail {
+
+enum scan_mode {
+  skip_whitespace,   // scan stops at the first byte > 32, or at '\n'
+  until_whitespace,  // scan stops at the first byte <= 32
+};
+
+// util to serially scan along an in-memory input buffer
+// contains optimized primitives for most parsing operations
+struct cursor_t {
+  const char* start;  // first byte of the buffer; origin for line/column computation
+  const char* ptr;    // current read position
+  const char* end;    // one past the last byte (data + size)
+
+  cursor_t(const char* data, std::size_t size) : start(data), ptr(data), end(data + size) {}
+
+  bool done() const { return ptr >= end; }  // true once every byte has been consumed
+
+  // used in error reporting
+  std::pair<std::size_t, std::size_t> linecol_position() const
+  {
+    std::size_t line       = 1;
+    const char* line_start = start;
+    for (const char* p = start; p < ptr; ++p) {
+      if (*p == '\n') {
+        ++line;
+        line_start = p + 1;
+      }
+    }
+    std::size_t column = (std::size_t)(ptr - line_start) + 1;
+    return {line, column};
+  }
+
+  [[noreturn]] void error(const char* msg, ...)  // printf-style; reports "<line>:<col>: <text>"
+  {
+    auto [line, col] = linecol_position();  // position of ptr at the time of the call
+    va_list args;
+    va_start(args, msg);
+    char msg_buf[512];  // vsnprintf truncates anything that does not fit
+    std::vsnprintf(msg_buf, sizeof(msg_buf), msg, args);
+    va_end(args);
+    mps_parser_fail(error_type_t::ValidationError, "%zu:%zu: %s", line, col, msg_buf);
+  }  // NOTE(review): [[noreturn]] relies on mps_parser_fail never returning — confirm
+
+  void advance(std::size_t n)  // bounds-checked ptr += n; fails when fewer than n bytes remain
+  {
+    if (n <= (std::size_t)(end - ptr)) { ptr += n; return; }  // checked without forming ptr + n
+    mps_parser_fail(error_type_t::ValidationError, "Unexpected end of file");
+  }
+
+  template <scan_mode mode>
+  static const char* scalar_scan(const char* p, const char* end)
+  {
+    // byte-at-a-time scan; every byte is classified by its unsigned value:
+    //   skip_whitespace  -> stop on the first byte > 32, or on '\n'
+    //   until_whitespace -> stop on the first byte <= 32
+    for (; p < end; ++p) {
+      const unsigned int byte = (unsigned char)*p;
+      bool stop_here          = byte <= 32;
+      if constexpr (mode == skip_whitespace) { stop_here = byte > 32 || byte == '\n'; }
+      if (stop_here) { return p; }
+    }
+    return end;
+  }
+
+  // scans for the first non-whitespace (or vice versa), agreeing byte-for-byte with scalar_scan
+  template <scan_mode mode>
+  static const char* simd_scan(const char* p, const char* end)
+  {
+    const simde__m256i v32 = simde_mm256_set1_epi8(32);  // space/control characters
+    const simde__m256i vnl = simde_mm256_set1_epi8('\n');
+
+    while (end - p >= 32) {
+      simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)p);
+      // cmpgt_epi8 is a signed compare, so bytes >= 0x80 (e.g. UTF-8) look "<= 32"; OR-ing in
+      // `data` restores their sign bit for movemask, matching scalar_scan's unsigned test
+      simde__m256i nonws = simde_mm256_or_si256(simde_mm256_cmpgt_epi8(data, v32), data);
+      unsigned int mask;
+      if constexpr (mode == skip_whitespace) {
+        simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
+        mask = (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(nonws, is_nl));
+      } else {
+        mask = ~(unsigned int)simde_mm256_movemask_epi8(nonws);
+      }
+      if (mask != 0) { return p + __builtin_ctz(mask); }  // lowest set bit = first stop byte
+      p += 32;
+    }
+    return scalar_scan<mode>(p, end);  // fewer than 32 bytes left
+  }
+
+  void skip_ws() { ptr = simd_scan<skip_whitespace>(ptr, end); }  // stops at '\n', skips '\r'
+
+  bool eol() const { return ptr < end && (*ptr == '\n' || *ptr == '\r'); }  // false at end of input
+
+  void consume_eol()
+  {
+    // accepts "\r\n", a lone "\r" or a lone "\n"; anything else leaves ptr untouched
+    if (ptr >= end) { return; }
+    const bool had_cr = *ptr == '\r';
+    if (had_cr) { ++ptr; }
+    const bool at_lf = ptr < end && *ptr == '\n';
+    if (at_lf) { ++ptr; }
+  }
+
+  // drops the rest of the line plus its terminator; could be SIMD but comments are usually rare
+  void skip_comment_line()
+  {
+    for (; ptr < end; ++ptr) {
+      if (*ptr == '\n' || *ptr == '\r') { break; }
+    }
+    consume_eol();
+  }
+
+  void skip_to_eol()
+  {
+    for (; ptr < end; ++ptr) {  // stops ON the terminator (or at end); it is not consumed
+      if (*ptr == '\n' || *ptr == '\r') { return; }
+    }
+  }
+
+  // useful for parsing NAME/OBJNAME which may span multiple "fields" according to the MPS spec
+  std::string_view read_rest_of_line_trimmed()
+  {
+    auto is_blank = [](char c) { return c == ' ' || c == '\t'; };
+
+    // [first, last) starts as the whole remainder of the line, terminator excluded
+    const char* first = ptr;
+    const char* last  = ptr;
+    for (; last < end; ++last) {
+      if (*last == '\n' || *last == '\r') { break; }
+    }
+
+    // shrink from both sides; only spaces and tabs are trimmed
+    for (; first < last && is_blank(*first); ++first) {}
+    for (; last > first && is_blank(last[-1]); --last) {}
+    ptr = last;  // the cursor ends on the trimmed end, before any trailing blanks
+    return std::string_view(first, (std::size_t)(last - first));
+  }
+
+  inline __attribute__((always_inline)) std::string_view read_field()
+  {
+    if (UNLIKELY(done())) { return {}; }  // empty view at end of input
+    // Reads one field (a run of bytes > 32, unsigned) and moves ptr to the next field or '\n'.
+    const char* field_start = ptr;
+    if (UNLIKELY(end - ptr < 32)) {
+      ptr                   = scalar_scan<until_whitespace>(ptr, end);
+      const char* field_end = ptr;
+      if (ptr < end) { skip_ws(); }
+      return std::string_view(field_start, field_end - field_start);
+    }
+
+    const simde__m256i v32 = simde_mm256_set1_epi8(32);
+    const simde__m256i vnl = simde_mm256_set1_epi8('\n');
+
+    // end - ptr >= 32 here, so the 32B load stays inside the buffer. cmpgt_epi8 is a signed
+    // compare: OR-ing `data` back in keeps bytes >= 0x80 (e.g. UTF-8) marked as non-whitespace
+    simde__m256i data    = simde_mm256_loadu_si256((const simde__m256i*)ptr);
+    simde__m256i gt32    = simde_mm256_or_si256(simde_mm256_cmpgt_epi8(data, v32), data);
+    unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32);
+
+    if (UNLIKELY(ws_mask == 0)) {  // field is longer than this 32B window
+      ptr                   = simd_scan<until_whitespace>(ptr + 32, end);
+      const char* field_end = ptr;
+      if (ptr < end) { skip_ws(); }
+      return std::string_view(field_start, field_end - field_start);
+    }
+
+    int field_end_off     = __builtin_ctz(ws_mask);  // 0..31 because ws_mask != 0
+    const char* field_end = ptr + field_end_off;
+
+    simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
+    unsigned int stop_mask =
+      (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl));
+    unsigned int after_field = stop_mask & ~((1u << field_end_off) - 1);  // stops at/after field end
+
+    if (LIKELY(after_field != 0)) {
+      ptr = ptr + __builtin_ctz(after_field);  // next field (or '\n') is inside the same window
+    } else {
+      ptr = field_end;
+      if (ptr < end) { skip_ws(); }
+    }
+
+    return std::string_view(field_start, field_end - field_start);
+  }
+
+  // non-consuming lookahead: the field starting at ptr, or an empty view at end of input
+  inline __attribute__((always_inline)) std::string_view peek_field()
+  {
+    const bool exhausted = done();
+    if (UNLIKELY(exhausted)) { return std::string_view{}; }
+    return std::string_view(ptr, (std::size_t)(simd_scan<until_whitespace>(ptr, end) - ptr));
+  }
+
+  static inline std::string_view peek_field_at(const char* line_start, const char* section_end)
+  {
+    cursor_t probe{line_start, (std::size_t)(section_end - line_start)};  // throwaway cursor
+    probe.skip_ws();
+    return probe.peek_field();
+  }
+
+  // MPS fields usually come in pairs; both can usually be extracted from a single 32B load
+  inline __attribute__((always_inline)) std::pair<std::string_view, std::string_view>
+  read_two_fields()
+  {
+    auto slow = [&] {  // generic path: two independent read_field() calls
+      auto f1 = read_field();
+      auto f2 = read_field();
+      return std::pair<std::string_view, std::string_view>{f1, f2};
+    };
+
+    if (UNLIKELY(end - ptr < 32)) { return slow(); }
+
+    const char* field1_start = ptr;
+    const simde__m256i v32   = simde_mm256_set1_epi8(32);
+    const simde__m256i vnl   = simde_mm256_set1_epi8('\n');
+
+    // end - ptr >= 32 here, so the 32B load stays inside the buffer (same as read_field()).
+    simde__m256i data  = simde_mm256_loadu_si256((const simde__m256i*)ptr);
+    simde__m256i nonws = simde_mm256_or_si256(simde_mm256_cmpgt_epi8(data, v32), data);
+    // cmpgt_epi8 is signed; OR-ing `data` in keeps bytes >= 0x80 classified as printable
+    unsigned int printable_mask = (unsigned int)simde_mm256_movemask_epi8(nonws);
+    unsigned int ws_mask        = ~printable_mask;
+
+    if (UNLIKELY(ws_mask == 0)) { return slow(); }
+    int field1_end_off = __builtin_ctz(ws_mask);
+
+    simde__m256i is_nl                = simde_mm256_cmpeq_epi8(data, vnl);
+    unsigned int nl_mask              = (unsigned int)simde_mm256_movemask_epi8(is_nl);
+    unsigned int barrier_after_field1 = (printable_mask | nl_mask) >> field1_end_off;
+    if (UNLIKELY(barrier_after_field1 == 0)) { return slow(); }
+    int field2_rel_off = __builtin_ctz(barrier_after_field1);
+    if (UNLIKELY(ptr[field1_end_off + field2_rel_off] == '\n' ||
+                 ptr[field1_end_off + field2_rel_off] == '\r')) {
+      return slow();
+    }
+    int field2_start_off = field1_end_off + field2_rel_off;
+
+    unsigned int ws_after_field2_start = ws_mask >> field2_start_off;
+    if (UNLIKELY(ws_after_field2_start == 0)) { return slow(); }  // field 2 leaves the window
+    int field2_end_off = field2_start_off + __builtin_ctz(ws_after_field2_start);
+
+    unsigned int stop_mask         = printable_mask | nl_mask;
+    unsigned int stop_after_field2 = stop_mask >> field2_end_off;
+    if (LIKELY(stop_after_field2 != 0)) {
+      ptr = ptr + field2_end_off + __builtin_ctz(stop_after_field2);
+    } else {
+      ptr = ptr + field2_end_off;
+      skip_ws();
+    }
+
+    return {std::string_view(field1_start, field1_end_off),
+            std::string_view(field1_start + field2_start_off, field2_end_off - field2_start_off)};
+  }
+};
+
+static inline void expect(cursor_t& cursor, const char* field)
+{
+  const std::string_view got = cursor.read_field();  // consumed even when it does not match
+  const bool matches         = got == field;
+  if (LIKELY(matches)) { return; }
+  cursor.error("expected '%s', got '%.*s'", field, (int)got.size(), got.data());
+}
+
+static inline void accept_comment_line(cursor_t& cursor)
+{
+  while (true) {  // swallow any mix of blank lines and '*' / '$' comment lines
+    while (cursor.eol()) { cursor.consume_eol(); }  // eol() is already false at end of input
+    if (cursor.done()) { return; }
+    const char lead = cursor.ptr[0];
+    if (lead != '*' && lead != '$') { return; }
+    cursor.skip_comment_line();
+  }
+}
+
+static inline void expect_eol(cursor_t& cursor)
+{
+  // end of input is accepted as a line end, so a last line without '\n' is not an error
+  if (UNLIKELY(!cursor.eol() && !cursor.done())) {
+    auto got = cursor.peek_field();
+    cursor.error("expected end of line, got '%.*s'", (int)got.size(), got.data());
+  }
+  // then skip blank lines, '*' / '$' comment lines and whitespace-only lines
+  for (;;) {
+    while (cursor.eol()) {
+      cursor.consume_eol();
+    }
+    if (UNLIKELY(cursor.done())) { return; }
+
+    if (UNLIKELY(cursor.ptr[0] == '*' || cursor.ptr[0] == '$')) {
+      cursor.skip_comment_line();
+      continue;
+    }
+
+    // common data line: drop one leading space (never the last byte, so ptr stays < end)
+    if (LIKELY(cursor.ptr[0] == ' ') && LIKELY(cursor.ptr + 1 < cursor.end)) { cursor.ptr += 1; }
+    // a letter here starts the next field; otherwise skip blanks and re-check for an empty line
+    char c = cursor.ptr[0];
+    if (UNLIKELY(!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))) {
+      cursor.skip_ws();
+      if (cursor.eol()) { continue; }
+    }
+    break;
+  }
+}
+
+static inline std::string_view peek(cursor_t& cursor) { return cursor.peek_field(); }  // no consume
+
+static inline bool accept(cursor_t& cursor, const char* field)
+{
+  // look ahead first so that a different field is left in place for the caller
+  const bool next_is_field = peek(cursor) == field;
+  if (!next_is_field) { return false; }
+  expect(cursor, field);  // consumes the field that was just peeked
+  return true;
+}
+
+static inline void expect_section(cursor_t& cursor, const char* section)
+{
+  expect(cursor, section);  // the section keyword itself; any other field is an error
+  expect_eol(cursor);       // nothing else may follow it on the line
+}
+
+static inline double expect_number(cursor_t& cursor)
+{
+  const std::string_view token = cursor.read_field();
+  if (token.empty()) { cursor.error("expected number, got empty field"); }
+  const char* first = token.data();  // advanced by the parser; the cursor is already past it
+  return fp64::parse_fp64_advance(first, first + token.size());
+}
+
+static inline double expect_number_fast_pm_one(cursor_t& cursor)
+{
+  const char* p = cursor.ptr;  // shortcut for the exact tokens "-1" and "1"
+  if (cursor.end - p >= 3 && p[0] == '-' && p[1] == '1' && (unsigned char)p[2] <= ' ') {
+    cursor.ptr = p + 2;  // terminator is tested as unsigned: bytes >= 0x80 do not end a token
+    cursor.skip_ws();
+    return -1.0;
+  }
+  if (cursor.end - p >= 2 && p[0] == '1' && (unsigned char)p[1] <= ' ') {
+    cursor.ptr = p + 1;
+    cursor.skip_ws();
+    return 1.0;
+  }
+  return expect_number(cursor);  // anything else goes through the general parser
+}
+
+static inline bool accept_section(cursor_t& cursor, const char* section)
+{
+  // optional section header: when the next field is `section`, consume it together with the
+  // end of its line (and whatever expect_eol skips after it); otherwise consume nothing
+  const bool present = accept(cursor, section);
+  if (present) { expect_eol(cursor); }
+  return present;
+}
+
+static inline bool accept_comment(cursor_t& cursor)
+{
+  // inline '$' comment: skip up to, but not including, the line terminator
+  const bool at_dollar = !cursor.done() && cursor.ptr[0] == '$';
+  if (LIKELY(!at_dollar)) { return false; }
+  cursor.skip_to_eol();
+  return true;
+}
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
new file mode 100644
index 0000000000..02038c6fd9
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -0,0 +1,3219 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "fast_parser.hpp"
+#include "fast_parse_primitives.hpp"
+#include "file_reader.hpp"
+#include "hash_table_smallstr.hpp"
+#include "mmap_region.hpp"
+#include "mps_section_scanner.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <cuda/cmath>
+#if defined(MPS_FAST_PERF_COUNTERS) || defined(MPS_FAST_TIMERS)
+#include <utilities/perf_counters.hpp>
+#endif
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <omp.h>
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cerrno>
+#include <charconv>
+#include <climits>
+#include <concepts>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <limits>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <file_to_string.hpp>
+
+#define MPS_FAST_COMPACT_ROW_HASH
+#define MPS_FAST_THP_PREFAULT
+
+namespace cuopt::linear_programming::io::detail {
+
+static constexpr size_t KiB = 1024;
+static constexpr size_t MiB = 1024 * KiB;
+static constexpr size_t GiB = 1024 * MiB;
+
+// per-chunk row-count scratch tile for the column parsing workers
+// small enough to remain warm in L1
+static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS = 4096;
+static constexpr int MPS_ROWS_THREAD_CAP            = 16;
+static constexpr int MPS_COLUMNS_THREAD_CAP         = 32;
+static constexpr int MPS_BOUNDS_THREAD_CAP          = 32;
+static constexpr int MPS_NAMES_THREAD_CAP           = 16;
+// avoid openmp setup for small bounds sections
+static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES = 256 * MiB;
+// ordered-name fallback is cheap enough to parallelize on smaller bounds sections
+static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8 * MiB;
+// lower bound on columns chunk size to avoid tiny parser tasks
+static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * MiB;
+// parser-wide thread cap switch; very small files lose to scheduling overhead
+static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES = 100ull * 1000ull * 1000ull;
+// thread caps for small and large files
+static constexpr int MPS_SMALL_FILE_THREAD_CAP = 16;
+static constexpr int MPS_LARGE_FILE_THREAD_CAP = 32;
+
+static int parser_thread_cap_for_size(size_t bytes)
+{
+  const bool small_file = bytes < MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES;
+  const int size_cap    = small_file ? MPS_SMALL_FILE_THREAD_CAP : MPS_LARGE_FILE_THREAD_CAP;
+  return std::max(1, std::min(size_cap, omp_get_max_threads()));  // never below one thread
+}
+
+static int phase_thread_count(int phase_cap)
+{
+  const int team = omp_in_parallel() ? omp_get_num_threads() : omp_get_max_threads();
+  return std::min(phase_cap, team) < 1 ? 1 : std::min(phase_cap, team);  // at least one thread
+}
+
+// Arena allocator for the strings (row names, column names) to avoid the dreadful overheads of
+// glibc's malloc and std::vector<std::string>
+class chunk_name_arena_t {
+ public:
+  void reserve(size_t bytes)  // only raises the size of the NEXT slab; allocates nothing now
+  {
+    if (bytes > next_slab_size_) { next_slab_size_ = bytes; }
+  }
+
+  std::string_view copy(std::string_view name)  // NUL-terminated copy; the view excludes the NUL
+  {
+    char* dst = allocate(name.size() + 1);
+    if (!name.empty()) { std::memcpy(dst, name.data(), name.size()); }  // data() may be null
+    dst[name.size()] = '\0';
+    return std::string_view(dst, name.size());
+  }
+
+ private:
+  struct slab_t {
+    std::vector<char> data;
+    size_t used = 0;  // bytes already handed out from the front of data
+  };
+
+  char* allocate(size_t bytes)  // bump allocation from the newest slab
+  {
+    if (slabs_.empty() || slabs_.back().used + bytes > slabs_.back().data.size()) {
+      size_t capacity = std::max(bytes, next_slab_size_);  // oversized requests get their own slab
+      slab_t slab;
+      slab.data.resize(capacity);
+      slabs_.push_back(std::move(slab));
+      next_slab_size_ = std::max(next_slab_size_ * 2, capacity);  // grow geometrically
+    }
+    slab_t& slab = slabs_.back();
+    char* ptr    = slab.data.data() + slab.used;
+    slab.used += bytes;
+    return ptr;
+  }
+
+  std::vector<slab_t> slabs_;
+  size_t next_slab_size_ = 64 * KiB;
+};
+
+struct timer_entry_t {
+  const char* name;   // label given to scoped_timer_t (the pointer is stored, not the text)
+  double elapsed_ms;  // duration of the timed scope, in milliseconds
+  size_t rss_kb;      // first value returned by current_process_rss_kb() at scope exit
+  size_t hwm_kb;      // second value returned by current_process_rss_kb() at scope exit
+};
+
+static std::vector<timer_entry_t>& get_timer_buffer()  // function-local static list of timings
+{
+  static std::vector<timer_entry_t> buffer;
+  buffer.reserve(100);  // runs on every call; a no-op once capacity is already >= 100
+  return buffer;
+}
+
+static std::mutex& get_timer_mutex()  // locked around every access to get_timer_buffer()
+{
+  static std::mutex mutex;
+  return mutex;
+}
+
+static void flush_timers()  // prints and clears recorded timers; empty without MPS_FAST_TIMERS
+{
+#ifdef MPS_FAST_TIMERS
+  std::lock_guard<std::mutex> lock(get_timer_mutex());
+  auto& buffer = get_timer_buffer();
+  for (const auto& entry : buffer) {
+    std::fprintf(stderr,
+                 "[TIMER] %s: %.3f ms rss_GB=%.3f hwm_GB=%.3f\n",
+                 entry.name,
+                 entry.elapsed_ms,
+                 (double)entry.rss_kb / (double)(GiB / KiB),  // divide by 2^20
+                 (double)entry.hwm_kb / (double)(GiB / KiB));
+  }
+  buffer.clear();
+#endif
+}
+
+enum class materialize_touch_t {
+  write_2mb,  // materialize_hugepages touches one byte every 2 MiB
+  write_4kb,  // materialize_hugepages touches one byte every system_page_size() bytes
+};
+
+// hint the kernel (MADV_HUGEPAGE) that the pages covering [data, data + bytes) should be backed
+// by transparent hugepages, then touch the range so the pages are actually faulted in
+static void materialize_hugepages([[maybe_unused]] const char* label,
+                                  void* data,
+                                  size_t bytes,
+                                  materialize_touch_t touch)
+{
+  if (data == nullptr || bytes == 0) return;  // nothing to touch
+
+  constexpr size_t two_mb = 2 * MiB;
+  size_t page_size        = system_page_size();
+  uintptr_t start         = reinterpret_cast<uintptr_t>(data);
+  uintptr_t end           = start + bytes;
+  uintptr_t aligned_start = start & ~(uintptr_t)(page_size - 1);  // down; assumes power-of-two size
+  uintptr_t aligned_end   = (end + page_size - 1) & ~(uintptr_t)(page_size - 1);  // round up
+  size_t aligned_bytes    = (size_t)(aligned_end - aligned_start);
+
+  errno = 0;
+  madvise((void*)(aligned_start), aligned_bytes, MADV_HUGEPAGE);  // result is not checked
+
+  size_t step        = touch == materialize_touch_t::write_2mb ? two_mb : page_size;
+  volatile char* ptr = (volatile char*)(data);  // volatile so the self-assignments are kept
+  for (size_t offset = 0; offset < bytes; offset += step) {
+    ptr[offset] = ptr[offset];  // read a byte and write the same value back
+  }
+  ptr[bytes - 1] = ptr[bytes - 1];  // the last byte may lie beyond the final step
+}
+
+template <typename T>  // vector convenience wrapper: covers size() elements, not capacity()
+static void materialize_vector_hugepages(const char* label,
+                                         std::vector<T>& values,
+                                         materialize_touch_t touch)
+{
+  materialize_hugepages(label, values.data(), values.size() * sizeof(T), touch);
+}
+
+class scoped_timer_t {  // times its own lifetime; records nothing unless MPS_FAST_TIMERS is set
+ public:
+  scoped_timer_t([[maybe_unused]] const char* name, double* accumulator = nullptr)
+#ifdef MPS_FAST_TIMERS
+    : name_(name),
+      accumulator_(accumulator),
+      nvtx_(name, nvtx::color_for_name(name)),
+      start_(std::chrono::high_resolution_clock::now()){}
+#else
+    : accumulator_(accumulator)  // timers compiled out: only the pointer is stored
+  {
+  }
+#endif
+
+      ~scoped_timer_t()
+  {
+#ifdef MPS_FAST_TIMERS
+    auto end          = std::chrono::high_resolution_clock::now();
+    double elapsed_ms = std::chrono::duration<double, std::milli>(end - start_).count();
+    nvtx_.end();
+    if (accumulator_) { *accumulator_ += elapsed_ms; }  // optional running total for the caller
+    auto [rss_kb, hwm_kb] = current_process_rss_kb();
+    std::lock_guard<std::mutex> lock(get_timer_mutex());  // held while appending to the buffer
+    get_timer_buffer().push_back({name_, elapsed_ms, rss_kb, hwm_kb});
+#endif
+  }
+
+  scoped_timer_t(const scoped_timer_t&)            = delete;
+  scoped_timer_t& operator=(const scoped_timer_t&) = delete;
+
+ private:
+#ifdef MPS_FAST_TIMERS
+  const char* name_;  // stored as a pointer and printed later by flush_timers()
+#endif
+  double* accumulator_;  // may be null; the destructor adds elapsed milliseconds to it
+#ifdef MPS_FAST_TIMERS
+  nvtx::scoped_range_t nvtx_;
+  std::chrono::high_resolution_clock::time_point start_;
+#endif
+};
+
+class omp_max_active_levels_guard_t {  // sets OpenMP max-active-levels for the guard's lifetime
+ public:
+  explicit omp_max_active_levels_guard_t(int value) : old_value_(omp_get_max_active_levels())
+  {
+    omp_set_max_active_levels(value);
+  }
+
+  ~omp_max_active_levels_guard_t() { omp_set_max_active_levels(old_value_); }  // restore
+
+  omp_max_active_levels_guard_t(const omp_max_active_levels_guard_t&)            = delete;
+  omp_max_active_levels_guard_t& operator=(const omp_max_active_levels_guard_t&) = delete;
+
+ private:
+  int old_value_ = 0;  // value in effect when the guard was constructed
+};
+
+static inline void error_unknown_row(cursor_t& cursor, const char* row_start, const char* section)
+{
+  // find the end of the offending name; compare as unsigned so bytes >= 0x80 stay part of it
+  const char* row_end = row_start;
+  while (row_end < cursor.end && (unsigned char)*row_end > ' ') { row_end++; }
+  // does not return: cursor.error() is declared [[noreturn]]
+  cursor.error("unknown row name in %s: %.*s", section, (int)(row_end - row_start), row_start);
+}
+
+// Two modes for row/column name lookup:
+// - hash: arbitrary names via hash table (rows) or var_names_map (columns)
+// - dense_ordered: sequential numeric suffixes like R0001/R0002 or V0/V1
+enum class index_mode_t {
+  hash,
+  dense_ordered,
+};
+
+// Every 19-digit decimal string fits in uint64_t; 20+ digits may not and are wildly unlikely in the
+// context of dense MPS rows/cols
+static constexpr size_t dense_suffix_max_digits = 19;
+
+static inline size_t decimal_digits_u64(uint64_t value)
+{
+  // count how many times the value can be divided by ten; zero still prints as one digit
+  size_t count = 1;
+  for (uint64_t rest = value / 10; rest != 0; rest /= 10) {
+    ++count;
+  }
+  return count;
+}
+
+static inline bool parse_trailing_u64(std::string_view name,
+                                      std::string_view& prefix,
+                                      uint64_t& value,
+                                      size_t& suffix_width)
+{
+  // walk backwards over the run of decimal digits that ends the name
+  size_t digits_begin = name.size();
+  for (; digits_begin > 0; --digits_begin) {
+    if (!fp64::is_digit(name[digits_begin - 1])) { break; }
+  }
+  const std::string_view digits = name.substr(digits_begin);
+  if (digits.empty()) { return false; }
+  // note: suffix_width is written even when the suffix is rejected as too long
+  suffix_width = digits.size();
+  if (digits.size() > dense_suffix_max_digits) { return false; }
+  uint64_t accumulated = 0;
+  for (const char digit : digits) {
+    accumulated = accumulated * 10 + (uint64_t)(digit - '0');
+  }
+  prefix = name.substr(0, digits_begin);
+  value  = accumulated;
+  return true;
+}
+
+// true when a multi-digit suffix starts with '0'; needed for cases like R0001, ..., R2000, ...
+static inline bool dense_suffix_is_zero_padded(std::string_view name, size_t suffix_width)
+{
+  return suffix_width <= 1 ? false : name[name.size() - suffix_width] == '0';
+}
+
+static inline size_t dense_initial_pad_width(std::string_view name, size_t suffix_width)
+{
+  return dense_suffix_is_zero_padded(name, suffix_width) ? suffix_width : 0;  // 0 = no padding
+}
+
+static inline bool dense_suffix_width_ok(uint64_t value, size_t suffix_width, size_t pad_width)
+{
+  // a value is written with pad_width digits, or with more once it outgrows the padding
+  const size_t natural_width = decimal_digits_u64(value);
+  return suffix_width == (natural_width < pad_width ? pad_width : natural_width);
+}
+
+// Closed-form name <-> index mapping for names of the form <prefix><number>, where the number
+// runs from min_id to max_id and may be zero-padded to pad_width digits.
+struct dense_name_index_t {
+  std::string prefix;    // text in front of the number
+  uint64_t min_id  = 0;  // number that maps to index 0
+  uint64_t max_id  = 0;  // largest number lookup() accepts
+  size_t pad_width = 0;  // zero-padded width of the number; 0 when unpadded
+
+  // restore the default state (empty prefix, all numbers zero)
+  void reset()
+  {
+    pad_width = 0;
+    max_id    = 0;
+    min_id    = 0;
+    prefix.clear();
+  }
+
+  // does `value`, written with `suffix_width` digits, follow this index's padding rule?
+  bool suffix_width_ok(uint64_t value, size_t suffix_width) const
+  {
+    return dense_suffix_width_ok(value, suffix_width, pad_width);
+  }
+
+  // name -> zero-based index (number - min_id); SIZE_MAX when the name does not belong here
+  size_t lookup(std::string_view name) const
+  {
+    std::string_view name_prefix;
+    uint64_t id  = 0;
+    size_t width = 0;
+
+    // checked in this order, stopping at the first failure: shape, prefix, padding, range
+    const bool belongs = parse_trailing_u64(name, name_prefix, id, width) &&
+                         name_prefix == prefix && suffix_width_ok(id, width) &&
+                         id >= min_id && id <= max_id;
+    return belongs ? (size_t)(id - min_id) : SIZE_MAX;
+  }
+
+  // index -> name, written into `out`: prefix, then zero padding, then the decimal number
+  void format_name(size_t idx, std::string& out) const
+  {
+    char digits_buf[32];
+    const auto conv = std::to_chars(digits_buf, digits_buf + sizeof(digits_buf), min_id + idx);
+    out.assign(prefix);
+    if (conv.ec != std::errc()) { return; }  // conversion failed: emit the bare prefix
+
+    // pad only while the number is shorter than pad_width; longer numbers are written as-is
+    const size_t digits_len = (size_t)(conv.ptr - digits_buf);
+    if (pad_width > digits_len) { out.append(pad_width - digits_len, '0'); }
+    out.append(digits_buf, digits_len);
+  }
+};
+
+struct dense_observe_state_t {  // presumably the state triple fed to observe_dense_name — confirm
+  bool candidate = true;  // starts optimistic
+  dense_name_index_t index;
+  size_t count = 0;  // no names recorded initially
+};
+
+static inline void observe_dense_name(bool& candidate,
+                                      dense_name_index_t& index,
+                                      size_t& observed_count,
+                                      std::string_view name,
+                                      uint64_t expected_id = std::numeric_limits<uint64_t>::max())
+{
+  if (!candidate) { return; }  // already disqualified: nothing more to learn
+
+  std::string_view prefix;
+  uint64_t value      = 0;
+  size_t suffix_width = 0;
+  if (!parse_trailing_u64(name, prefix, value, suffix_width)) {  // no usable numeric suffix
+    candidate = false;
+    return;
+  }
+
+  if (observed_count == 0) {  // first name fixes prefix, first id and padding
+    index.prefix.assign(prefix);
+    index.min_id    = value;
+    index.max_id    = value;
+    index.pad_width = dense_initial_pad_width(name, suffix_width);
+    observed_count  = 1;
+    return;  // note: expected_id is not checked for the first name
+  }
+
+  if (prefix != index.prefix) {
+    candidate = false;
+    return;
+  }
+
+  if (expected_id != std::numeric_limits<uint64_t>::max() && value != expected_id) {
+    candidate = false;  // uint64 max means "no expectation"; any other id must match exactly
+    return;
+  }
+
+  if (!index.suffix_width_ok(value, suffix_width)) {
+    candidate = false;
+    return;
+  }
+  // NOTE(review): max_id becomes the LAST value seen, not the maximum; without expected_id
+  index.max_id = value;  // neither ordering nor gaps are checked here — confirm callers do
+  observed_count++;
+}
+
+// Maps MPS row/column names to indices via one of two strategies, chosen per problem:
+//
+//   * dense_ordered - when every name in a section is a shared prefix followed by a
+//     contiguous run of integers (e.g. R0001, R0002, ... or x1, x2, ...). The index is
+//     then computed straight from the parsed integer (value - min_id), so no hash table
+//     is built or probed. This is the common, fast case for solver-generated models.
+//   * hash          - the general fallback (smallstr_hash_table_t) for arbitrary names.
+//
+// Each section decides its own mode while scanning: it stays a dense_ordered "candidate"
+// as long as names keep matching the prefix + consecutive-integer + zero-pad-width rule
+// (see observe_dense_name), and the first violation drops it to the hash path. The chosen
+// mode lives in row_index_mode / col_index_mode, and every lookup branches on it
+// (row_lookup / read_row_lookup vs the dense_ordered variants below). Holding this in mind
+// explains most of the paired/dual code paths throughout this file.
+template <typename i_t, typename f_t>
+struct parse_state_t {
+  mps_data_model_t<i_t, f_t>& problem;
+  cursor_t& cursor;
+
+  // backed by the input buffer
+  std::vector<std::string_view> row_names_sv;
+  // backed by the arena allocator
+  std::vector<std::string_view> var_names_sv;
+  std::vector<chunk_name_arena_t> var_name_arenas;
+  std::string_view problem_name_sv;
+  std::string_view objective_name_sv;
+  // secondary 'N' rows in ROWS — rare; membership distinguishes them from unknown row names
+  std::unordered_set<std::string_view> ignored_objective_names;
+
+  // Column name lookup for labels like V0, V1, ...
+  index_mode_t col_index_mode = index_mode_t::hash;
+  dense_name_index_t col_dense;
+
+  smallstr_hash_table_t row_hash_;
+
+  // Row name lookup for labels like R0001, R0002, ...
+  index_mode_t row_index_mode = index_mode_t::hash;
+  bool row_dense_candidate    = true;
+  dense_name_index_t row_dense;
+
+  // var_names still uses STL (only used in parse_bounds, not as hot)
+  std::unordered_map<std::string_view, size_t> var_names_map;
+
+  mmap_region_t temp_A_region;
+  mmap_region_t temp_A_indices_region;
+  f_t* temp_A                = nullptr;
+  i_t* temp_A_indices        = nullptr;
+  size_t temp_csr_nnz        = 0;
+  bool temp_csr_materialized = false;
+
+  struct bounds_only_var_t {
+    f_t lb    = f_t{0};
+    f_t ub    = std::numeric_limits<f_t>::infinity();
+    char type = 'C';
+  };
+
+  // some writers introduce zero-column variables only in BOUNDS.
+  std::map<std::string_view, bounds_only_var_t> bounds_only_vars;
+
+  struct qcmatrix_block_t {
+    size_t row_idx = SIZE_MAX;
+    std::string_view row_name;
+    std::vector<std::tuple<i_t, i_t, f_t>> entries;
+  };
+
+  std::vector<qcmatrix_block_t> qcmatrix_blocks;
+
+  parse_state_t(mps_data_model_t<i_t, f_t>& p, cursor_t& c) : problem(p), cursor(c) {}
+
+  void init_row_hash_table()
+  {
+    if (init_row_dense_ordered_table()) { return; }
+    init_row_hash_table_impl();
+  }
+
+  void observe_objective_row_name(std::string_view name)
+  {
+    if (objective_name_sv.empty()) {
+      objective_name_sv = name;
+    } else if (name != objective_name_sv) {
+      ignored_objective_names.insert(name);
+    }
+  }
+
+  bool init_row_dense_ordered_table()
+  {
+    scoped_timer_t timer("row_dense_finalize");
+    size_t n_rows = row_names_sv.size();
+    if (!row_dense_candidate || n_rows == 0) { return false; }
+    if (row_dense.max_id < row_dense.min_id) { return false; }
+    uint64_t dense_count = row_dense.max_id - row_dense.min_id + 1;
+    if (dense_count != n_rows) { return false; }
+
+    row_index_mode = index_mode_t::dense_ordered;
+    return true;
+  }
+
+  // Insert all rows into the hash table. The perf-counter instrumentation is isolated in
+  // these two helpers so its #ifdefs do not fragment init_row_hash_table_impl's setup flow;
+  // both compile down to a bare insert loop when MPS_FAST_PERF_COUNTERS is off.
+  void insert_rows_partitioned(
+    int num_threads,
+    const std::array<size_t, MPS_ROW_HASH_PARTITIONS + 1>& partition_offsets,
+    const std::vector<size_t>& row_order,
+    const std::vector<uint32_t>& row_hashes)
+  {
+    scoped_timer_t timer("row_hash_insert_partitioned");
+#ifdef MPS_FAST_PERF_COUNTERS
+    std::vector<perf_counter_snapshot_t> perf_snapshots(MPS_ROW_HASH_PARTITIONS);
+#endif
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+    for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) {
+      size_t p = (size_t)part_id;
+#ifdef MPS_FAST_PERF_COUNTERS
+      thread_perf_counters_t perf_counters;
+#endif
+      for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) {
+        size_t idx = row_order[pos];
+        row_hash_.insert_partition(p, row_names_sv[idx], row_hashes[idx], idx);
+      }
+#ifdef MPS_FAST_PERF_COUNTERS
+      perf_snapshots[p] = perf_counters.stop();
+#endif
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("row_hash_insert_partitioned", perf_snapshots);
+#endif
+  }
+
+  void insert_rows_serial(size_t n_rows)
+  {
+#ifdef MPS_FAST_PERF_COUNTERS
+    thread_perf_counters_t perf_counters;
+#endif
+    for (size_t idx = 0; idx < n_rows; ++idx) {
+      row_hash_.insert_serial(row_names_sv[idx], idx);
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("row_hash_insert_all", {perf_counters.stop()});
+#endif
+  }
+
+  // Builds the row-name hash table (row_hash_) from row_names_sv.
+  //
+  // One of two build strategies is chosen up front:
+  //   - partitioned: taken when there are at least MPS_ROW_HASH_PARTITIONED_MIN_ROWS rows and
+  //     more than one thread is available. Rows are first grouped by hash partition (count,
+  //     prefix-sum, scatter) and then handed to insert_rows_partitioned.
+  //   - serial: every row is inserted in index order through insert_rows_serial.
+  // The THP prefault and MADV_COLLAPSE phases are compile-time optional.
+  void init_row_hash_table_impl()
+  {
+    scoped_timer_t timer("row_hash_init_total");
+    size_t n_rows              = row_names_sv.size();
+    const int num_threads      = phase_thread_count(MPS_ROWS_THREAD_CAP);
+    const bool use_partitioned = n_rows >= MPS_ROW_HASH_PARTITIONED_MIN_ROWS && num_threads > 1;
+    // compact_row_hash is forwarded unchanged to the bucket-configuration calls below.
+#ifdef MPS_FAST_COMPACT_ROW_HASH
+    constexpr bool compact_row_hash = true;
+#else
+    constexpr bool compact_row_hash = false;
+#endif
+    // Only populated on the partitioned path; they stay empty / zeroed otherwise.
+    std::vector<uint32_t> row_hashes;
+    std::vector<size_t> row_order;
+    std::array<size_t, MPS_ROW_HASH_PARTITIONS> partition_counts      = {};
+    std::array<size_t, MPS_ROW_HASH_PARTITIONS + 1> partition_offsets = {};
+
+    if (use_partitioned) {
+      scoped_timer_t timer("row_hash_partition_metadata");
+      row_hashes.resize(n_rows);
+      size_t inline_rows = 0;
+      // Pass 1: hash every name and count how many rows land in each partition.
+      for (size_t idx = 0; idx < n_rows; ++idx) {
+        std::string_view name = row_names_sv[idx];
+        // Names longer than HASH_KEY_BYTES are handed to note_long_name and are excluded from
+        // the partition counts; their row_hashes entry keeps the zero value from resize().
+        if (UNLIKELY(name.size() > HASH_KEY_BYTES)) {
+          row_hash_.note_long_name(name, idx);
+          continue;
+        }
+        uint32_t hash   = fnv1a_hash(name.data(), name.size());
+        row_hashes[idx] = hash;
+        ++partition_counts[hash_partition_for(hash)];
+        ++inline_rows;
+      }
+
+      // Exclusive prefix sum: partition p owns row_order[partition_offsets[p] ..
+      // partition_offsets[p + 1]).
+      for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
+        partition_offsets[p + 1] = partition_offsets[p] + partition_counts[p];
+      }
+
+      // Pass 2: scatter row indices into their partition's slice. Within a partition the rows
+      // keep ascending index order because idx is visited in ascending order.
+      row_order.resize(inline_rows);
+      auto next_offsets = partition_offsets;
+      for (size_t idx = 0; idx < n_rows; ++idx) {
+        if (UNLIKELY(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; }
+        size_t part                     = hash_partition_for(row_hashes[idx]);
+        row_order[next_offsets[part]++] = idx;
+      }
+    }
+
+    // Bucket sizing happens outside the metadata timer above, before the backing memory is
+    // allocated.
+    if (use_partitioned) {
+      row_hash_.configure_partitioned_buckets(partition_counts, compact_row_hash);
+    } else {
+      row_hash_.configure_serial_buckets(n_rows, compact_row_hash);
+    }
+
+    {
+      scoped_timer_t timer("row_hash_mmap");
+      row_hash_.allocate_mmap("row hash table");
+    }
+
+#ifdef MPS_FAST_THP_PREFAULT
+    {
+      scoped_timer_t timer("row_hash_thp_prefault");
+      materialize_hugepages("row_names_ht",
+                            row_hash_.slots(),
+                            row_hash_.region().size(),
+                            materialize_touch_t::write_2mb);
+    }
+#endif
+
+    {
+      scoped_timer_t timer("row_hash_insert_all");
+      row_hash_.reset_build_probe_stats();
+      if (use_partitioned) {
+        insert_rows_partitioned(num_threads, partition_offsets, row_order, row_hashes);
+      } else {
+        insert_rows_serial(n_rows);
+      }
+      row_hash_.print_build_probe_report(n_rows);
+    }
+
+#ifdef MPS_FAST_MADV_COLLAPSE
+    {
+      scoped_timer_t timer("row_hash_madv_collapse");
+      row_hash_.region().advise(MADV_COLLAPSE);
+    }
+#endif
+  }
+
+  // Resolves a row name to its index through whichever index is active: the dense ordered
+  // index when row_index_mode is dense_ordered, the row hash table otherwise.
+  size_t row_lookup(std::string_view name) const
+  {
+    const bool dense_active = (row_index_mode == index_mode_t::dense_ordered);
+    return LIKELY(dense_active) ? row_dense.lookup(name) : row_hash_.lookup(name);
+  }
+
+  // Dense-index fast path for reading the row-name field at the cursor.
+  //
+  // The field must be exactly `row_dense.prefix` followed by a run of decimal digits whose
+  // value lies in [row_dense.min_id, row_dense.max_id] and whose width passes
+  // row_dense.suffix_width_ok. On a match the cursor is moved past the field and any following
+  // whitespace, and the zero-based offset (value - min_id) is returned. On any mismatch the
+  // field is consumed with cursor.read_field() and SIZE_MAX is returned.
+  size_t read_row_lookup_dense_ordered(cursor_t& cursor) const
+  {
+    const char* const field_begin = cursor.ptr;
+
+    // Shared rejection path: rewind to the field start, skip the field generically, report a
+    // miss. (cursor.ptr is not modified before a rejection, so the rewind is a no-op there.)
+    auto reject = [&cursor, field_begin]() -> size_t {
+      cursor.ptr = field_begin;
+      cursor.read_field();
+      return SIZE_MAX;
+    };
+
+    const char* scan        = field_begin;
+    const size_t prefix_len = row_dense.prefix.size();
+    if (prefix_len > 0) {
+      const bool prefix_fits = (size_t)(cursor.end - scan) >= prefix_len;
+      if (!prefix_fits || std::memcmp(scan, row_dense.prefix.data(), prefix_len) != 0) {
+        return reject();
+      }
+      scan += prefix_len;
+    }
+
+    const char* const digits_begin = scan;
+    uint64_t id                    = 0;
+    fp64::parse_u64_digits_advance(scan, cursor.end, id);
+    const size_t digit_count = (size_t)(scan - digits_begin);
+
+    // The checks run in the same order as a single short-circuit expression would.
+    if (digit_count == 0 || digit_count > dense_suffix_max_digits) { return reject(); }
+    // The digits must be followed by a delimiter byte, i.e. the name must end here.
+    if (scan >= cursor.end || *scan > ' ') { return reject(); }
+    if (!row_dense.suffix_width_ok(id, digit_count)) { return reject(); }
+    if (id < row_dense.min_id || id > row_dense.max_id) { return reject(); }
+
+    cursor.ptr = scan;
+    cursor.skip_ws();
+    return (size_t)(id - row_dense.min_id);
+  }
+
+  // Reads the row-name field at the cursor and resolves it to a row index, using the dense
+  // fast path when the dense ordered index is active and the row hash table otherwise.
+  size_t read_row_lookup(cursor_t& cursor) const
+  {
+    if (UNLIKELY(row_index_mode != index_mode_t::dense_ordered)) {
+      auto field = cursor.read_field();
+      return row_hash_.lookup(field);
+    }
+    return read_row_lookup_dense_ordered(cursor);
+  }
+};
+
+// =============================================================================
+// Section parsers
+// =============================================================================
+
+// Parses the NAME header line. The section is treated as absent when the next token is
+// already "ROWS". Otherwise "NAME" is required, the trimmed remainder of the line (if any)
+// becomes the problem name, and the line must end there.
+template <typename i_t, typename f_t>
+static void parse_name_section(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("parse_name");
+  const bool name_section_absent = (peek(state.cursor) == "ROWS");
+  if (!name_section_absent) {
+    expect(state.cursor, "NAME");
+    const bool has_problem_name = !state.cursor.eol();
+    if (has_problem_name) { state.problem_name_sv = state.cursor.read_rest_of_line_trimmed(); }
+    expect_eol(state.cursor);
+  }
+}
+
+// Parses the optional OBJSENSE section and records the optimisation direction in
+// state.problem.maximize_. Accepted keywords are MIN / MINIMIZE and MAX / MAXIMIZE; anything
+// else is reported through cursor.error. Does nothing when the section header is absent.
+template <typename i_t, typename f_t>
+static void parse_objsense_section(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("parse_objsense");
+  if (accept(state.cursor, "OBJSENSE")) {
+    // The keyword may sit on the header line or on the following line.
+    if (state.cursor.eol()) { expect_eol(state.cursor); }
+    auto sense = state.cursor.read_field();
+    if (sense == "MIN" || sense == "MINIMIZE") {
+      state.problem.maximize_ = false;
+    } else if (sense == "MAX" || sense == "MAXIMIZE") {
+      state.problem.maximize_ = true;
+    } else {
+      // `sense` is a view, not a NUL-terminated string: print it with an explicit length
+      // ("%.*s") so the message cannot run past the field into the rest of the input.
+      state.cursor.error("expected MIN/MAX or MINIMIZE/MAXIMIZE, got '%.*s'",
+                         (int)sense.size(),
+                         sense.data());
+    }
+    accept_comment(state.cursor);
+    expect_eol(state.cursor);
+  }
+}
+
+// Parses the optional OBJNAME section: when the header is present, the next field (on the
+// header line or the following one) is stored as the objective row name. An optional trailing
+// comment is accepted before the end of the line.
+template <typename i_t, typename f_t>
+static void parse_objname_section(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("parse_objname");
+  if (!accept(state.cursor, "OBJNAME")) { return; }
+
+  const bool name_on_next_line = state.cursor.eol();
+  if (name_on_next_line) { expect_eol(state.cursor); }
+
+  state.objective_name_sv = state.cursor.read_field();
+  accept_comment(state.cursor);
+  expect_eol(state.cursor);
+}
+
+// Half-open byte range [start, end) of one chunk of the ROWS section, as produced by
+// compute_row_chunk_boundaries.
+struct row_chunk_boundary_t {
+  const char* start;  // first byte of the chunk
+  const char* end;    // one past the last byte of the chunk
+};
+
+// Per-chunk summary gathered by the counting pass of the parallel ROWS parser.
+struct row_chunk_info_t {
+  size_t constraints = 0;      // number of non-'N' rows seen in the chunk
+  bool malformed     = false;  // set when a line could not be consumed; triggers serial fallback
+  std::vector<std::string_view> objective_names;  // names of 'N' rows, in file order
+  bool has_first_constraint = false;              // true once first_constraint_name is valid
+  std::string_view first_constraint_name;         // name of the first non-'N' row in the chunk
+};
+
+// Returns a pointer to the byte just after the first '\n' in [p, end). When the range holds
+// no newline the scan stops at `end`; when p is already at or past `end`, p is returned as is.
+static const char* rows_find_next_line(const char* p, const char* end)
+{
+  for (; p < end; ++p) {
+    if (*p == '\n') { return p + 1; }
+  }
+  return p;
+}
+
+// Consumes one line of the ROWS section starting at `p` (bounded by `end`).
+//
+// Returns true when a row record was read: `row_type` holds the type character, `row_name`
+// views the name inside the input, and `p` points at the start of the next line.
+//
+// Returns false in two distinguishable situations:
+//   - `p` advanced: a blank line, a '*' / '$' comment line, or trailing whitespace up to `end`
+//     was skipped. The caller should simply continue.
+//   - `p` unchanged: the line has a type character but no row name. `p` is restored to its
+//     entry value so that callers comparing `p` before and after the call can detect the
+//     malformed record instead of silently dropping it.
+static bool parse_rows_line_fast(const char*& p,
+                                 const char* end,
+                                 char& row_type,
+                                 std::string_view& row_name)
+{
+  const char* const entry = p;
+
+  p = cursor_t::simd_scan<skip_whitespace>(p, end);
+  if (p >= end) { return false; }
+  if (*p == '\n') {
+    p++;
+    return false;
+  }
+  if (*p == '*' || *p == '$') {
+    p = rows_find_next_line(p, end);
+    return false;
+  }
+
+  row_type = *p++;
+  p        = cursor_t::simd_scan<skip_whitespace>(p, end);
+
+  const char* name_start = p;
+  p                      = cursor_t::simd_scan<until_whitespace>(p, end);
+  if (name_start == p) {
+    // Type character without a name: report "no progress" so the caller flags the line.
+    p = entry;
+    return false;
+  }
+  row_name = std::string_view(name_start, (size_t)(p - name_start));
+
+  // ROWS only uses fields 1-2. Fields 3-6 are ignored by the MPS spec, and
+  // field 3 may start with '$' to comment the rest of the record.
+  // This could be SIMD'd, but in practice the newline is right after the row name.
+  p = rows_find_next_line(p, end);
+  return true;
+}
+
+// Splits [rows_start, rows_end) into `num_threads` consecutive chunks of roughly equal byte
+// size. A raw byte cut can land in the middle of a row, so every interior cut is pushed
+// forward to the start of the next line; a chunk may therefore be empty.
+static std::vector<row_chunk_boundary_t> compute_row_chunk_boundaries(const char* rows_start,
+                                                                      const char* rows_end,
+                                                                      int num_threads)
+{
+  scoped_timer_t timer("rows_compute_chunk_boundaries");
+
+  const size_t n_chunks = (size_t)num_threads;
+  std::vector<row_chunk_boundary_t> chunks(n_chunks);
+  const size_t section_bytes   = (size_t)(rows_end - rows_start);
+  const size_t bytes_per_chunk = section_bytes / n_chunks;
+
+  // Every chunk but the last ends at a line-aligned cut, which is also the next chunk's start.
+  const char* chunk_begin = rows_start;
+  for (size_t c = 0; c + 1 < n_chunks; ++c) {
+    const char* cut = rows_find_next_line(rows_start + (c + 1) * bytes_per_chunk, rows_end);
+    chunks[c].start = chunk_begin;
+    chunks[c].end   = cut;
+    chunk_begin     = cut;
+  }
+
+  // The last chunk always runs to the end of the section.
+  chunks[n_chunks - 1].start = chunk_begin;
+  chunks[n_chunks - 1].end   = rows_end;
+
+  return chunks;
+}
+
+// Parallel ROWS parser. Works in two passes over line-aligned chunks of the section:
+//   1. count pass: each chunk counts its constraint rows and collects its 'N' (objective) row
+//      names and its first constraint name;
+//   2. after a prefix sum over the counts, a fill pass writes row names and row types
+//      directly into their final positions in the output arrays, checking dense naming per
+//      chunk along the way.
+// Must keep the same line grammar as its serial twin parse_rows_section_serial_impl;
+// parse_rows_section chooses between them by size. Returns false if a chunk hit a malformed
+// line during the count pass (nothing has been committed at that point), so the caller can
+// reset and retry serially for clean error reporting.
+template <typename i_t, typename f_t>
+static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
+                                             const char* rows_start,
+                                             const char* rows_end,
+                                             int num_threads)
+{
+  scoped_timer_t timer("parse_rows_parallel");
+
+  auto boundaries = compute_row_chunk_boundaries(rows_start, rows_end, num_threads);
+  std::vector<row_chunk_info_t> infos((size_t)num_threads);
+
+  {
+    // Pass 1: one loop iteration per chunk; each iteration only writes its own infos[t].
+    scoped_timer_t timer("rows_count_parallel");
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < num_threads; ++t) {
+      MPS_NVTX_RANGE(std::string("rows_count_chunk ") + std::to_string(t), nvtx::colors::rows);
+      const char* p   = boundaries[(size_t)t].start;
+      const char* end = boundaries[(size_t)t].end;
+      row_chunk_info_t info;
+
+      while (p < end) {
+        char row_type = 0;
+        std::string_view row_name;
+        const char* before = p;
+        if (!parse_rows_line_fast(p, end, row_type, row_name)) {
+          // No record and no progress: give up on this chunk and flag it as malformed.
+          if (p == before) {
+            info.malformed = true;
+            break;
+          }
+          // Otherwise a skippable line was consumed.
+          continue;
+        }
+
+        if (row_type == 'N') {
+          info.objective_names.push_back(row_name);
+        } else {
+          if (!info.has_first_constraint) {
+            info.first_constraint_name = row_name;
+            info.has_first_constraint  = true;
+          }
+          info.constraints++;
+        }
+      }
+
+      infos[(size_t)t] = info;
+    }
+  }
+
+  // Bail out before touching any parser state if any chunk was malformed.
+  if (std::any_of(
+        infos.begin(), infos.end(), [](const row_chunk_info_t& info) { return info.malformed; })) {
+    return false;
+  }
+
+  // Prefix sum over the per-chunk constraint counts, so that the fill pass can scatter every
+  // chunk's rows into the global output arrays in parallel.
+  std::vector<size_t> offsets((size_t)num_threads + 1, 0);
+  {
+    scoped_timer_t timer("rows_prefix_sum");
+    for (int t = 0; t < num_threads; ++t) {
+      offsets[(size_t)t + 1] = offsets[(size_t)t] + infos[(size_t)t].constraints;
+    }
+  }
+
+  size_t total_rows = offsets[(size_t)num_threads];
+  if (UNLIKELY(total_rows > (size_t)INT_MAX)) {
+    state.cursor.error("fast MPS parser requires <= INT_MAX rows, got %zu", total_rows);
+  }
+  {
+    scoped_timer_t timer("rows_resize_outputs");
+    state.row_names_sv.resize(total_rows);
+    state.problem.row_types_.resize(total_rows);
+  }
+
+  // If no objective name has been set yet, adopt the first 'N' row in file order (chunks are
+  // visited in order and each chunk's list is in file order).
+  if (state.objective_name_sv.empty()) {
+    for (const auto& info : infos) {
+      if (!info.objective_names.empty()) {
+        state.objective_name_sv = info.objective_names.front();
+        break;
+      }
+    }
+  }
+  // Every other 'N' row name is remembered as an ignored objective.
+  for (const auto& info : infos) {
+    for (std::string_view name : info.objective_names) {
+      if (name != state.objective_name_sv) { state.ignored_objective_names.insert(name); }
+    }
+  }
+
+  // Dense-name detection is seeded from the first constraint row of the whole section: its
+  // prefix, numeric suffix and padding width become the reference for all chunks.
+  bool dense_candidate = total_rows > 0;
+  std::string_view dense_prefix;
+  uint64_t dense_base_id = 0;
+  size_t dense_pad_width = 0;
+
+  if (dense_candidate) {
+    std::string_view first_name;
+    for (const auto& info : infos) {
+      if (info.has_first_constraint) {
+        first_name = info.first_constraint_name;
+        break;
+      }
+    }
+
+    uint64_t first_value      = 0;
+    size_t first_suffix_width = 0;
+    if (!parse_trailing_u64(first_name, dense_prefix, first_value, first_suffix_width)) {
+      dense_candidate = false;
+    } else {
+      dense_base_id   = first_value;
+      dense_pad_width = dense_initial_pad_width(first_name, first_suffix_width);
+    }
+  }
+
+  // One flag per chunk; each fill iteration writes only its own entry.
+  std::vector<uint8_t> dense_ok_by_chunk((size_t)num_threads, 1);
+
+  {
+    // Pass 2: re-parse each chunk and write its rows at [offsets[t], offsets[t + 1]).
+    scoped_timer_t timer("rows_fill_parallel");
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < num_threads; ++t) {
+      MPS_NVTX_RANGE(std::string("rows_fill_chunk ") + std::to_string(t), nvtx::colors::rows);
+      const char* p   = boundaries[(size_t)t].start;
+      const char* end = boundaries[(size_t)t].end;
+      size_t out      = offsets[(size_t)t];
+
+      // Chunk-local dense state, seeded from the section-wide reference values.
+      bool local_dense_ok = dense_candidate;
+      dense_name_index_t dense_index;
+      if (local_dense_ok) {
+        dense_index.prefix.assign(dense_prefix);
+        dense_index.min_id    = dense_base_id;
+        dense_index.max_id    = dense_base_id;
+        dense_index.pad_width = dense_pad_width;
+      }
+
+      while (p < end) {
+        char row_type = 0;
+        std::string_view row_name;
+        const char* before = p;
+        if (!parse_rows_line_fast(p, end, row_type, row_name)) {
+          if (p == before) {
+            local_dense_ok = false;
+            break;
+          }
+          continue;
+        }
+
+        // Objective rows were already handled after the count pass.
+        if (row_type == 'N') { continue; }
+
+        state.row_names_sv[out]       = row_name;
+        state.problem.row_types_[out] = row_type;
+
+        if (local_dense_ok) {
+          // The row at global position `out` is expected to carry id dense_base_id + out.
+          size_t observed_count = out;
+          observe_dense_name(
+            local_dense_ok, dense_index, observed_count, row_name, dense_base_id + out);
+        }
+        out++;
+      }
+
+      dense_ok_by_chunk[(size_t)t] = local_dense_ok ? 1 : 0;
+    }
+  }
+
+  {
+    // The section is dense only if every chunk agreed; the id range then spans all rows.
+    scoped_timer_t timer("rows_dense_metadata");
+    for (uint8_t ok : dense_ok_by_chunk) {
+      dense_candidate = dense_candidate && ok;
+    }
+    state.row_dense_candidate = dense_candidate;
+    if (dense_candidate) {
+      state.row_dense.prefix.assign(dense_prefix);
+      state.row_dense.min_id    = dense_base_id;
+      state.row_dense.max_id    = dense_base_id + total_rows - 1;
+      state.row_dense.pad_width = dense_pad_width;
+    }
+  }
+
+  return true;
+}
+
+// Serial ROWS parser: walks the section record by record from state.cursor up to rows_end.
+// 'N' rows are reported as objective rows; every other row appends its name and type to the
+// output arrays and is fed to the dense-name observer. Fails through cursor.error when the
+// section holds more than INT_MAX constraint rows.
+template <typename i_t, typename f_t>
+static void parse_rows_section_serial_impl(parse_state_t<i_t, f_t>& state, const char* rows_end)
+{
+  scoped_timer_t timer("parse_rows_serial");
+
+  auto& cur = state.cursor;
+  while (cur.ptr < rows_end) {
+    // Field 1: the single-character row type.
+    auto type_char = cur.ptr[0];
+    cur.advance(1);
+    cur.skip_ws();
+
+    // Field 2: the row name. Anything after it on the line (annotations, comments) is unused.
+    auto name = cur.read_field();
+    cur.skip_to_eol();
+
+    const bool is_objective_row = (type_char == 'N');
+    if (is_objective_row) {
+      // The objective row is remembered by name only; it is not a constraint.
+      state.observe_objective_row_name(name);
+    } else {
+      size_t new_idx = state.row_names_sv.size();
+      // The very first row has no expectation yet; later rows are expected to continue the
+      // sequence that started at row_dense.min_id.
+      const uint64_t expected_id = new_idx == 0 ? std::numeric_limits<uint64_t>::max()
+                                                : state.row_dense.min_id + new_idx;
+      state.row_names_sv.push_back(name);
+      observe_dense_name(state.row_dense_candidate, state.row_dense, new_idx, name, expected_id);
+      state.problem.row_types_.push_back(type_char);
+    }
+    expect_eol(cur);
+  }
+
+  const size_t n_rows = state.row_names_sv.size();
+  if (UNLIKELY(n_rows > (size_t)INT_MAX)) {
+    state.cursor.error("fast MPS parser requires <= INT_MAX rows, got %zu", n_rows);
+  }
+}
+
+// Parses the ROWS section, which must start at the cursor and ends at rows_end.
+// Sections of at least 512 MiB are first attempted with the parallel parser when more than
+// one thread is available; everything else, and any failed parallel attempt, goes through
+// the serial parser. Afterwards the constraint count and the b_ vector are sized and the row
+// hash table is initialised.
+template <typename i_t, typename f_t>
+static void parse_rows_section(parse_state_t<i_t, f_t>& state, const char* rows_end)
+{
+  scoped_timer_t timer("parse_rows");
+  expect_section(state.cursor, "ROWS");
+
+  {
+    scoped_timer_t scan_timer("parse_rows_scan");
+    const char* const section_begin = state.cursor.ptr;
+    const size_t section_bytes      = (size_t)(rows_end - section_begin);
+    const int worker_count          = phase_thread_count(MPS_ROWS_THREAD_CAP);
+    const bool attempt_parallel     = section_bytes >= 512 * MiB && worker_count > 1;
+
+    bool parallel_ok = false;
+    if (attempt_parallel) {
+      parallel_ok =
+        parse_rows_section_parallel_impl<i_t, f_t>(state, section_begin, rows_end, worker_count);
+      if (!parallel_ok) {
+        // A chunk looked malformed: wipe the row state and rewind so the serial parser below
+        // can reparse from the top, which makes error reporting much easier.
+        state.row_names_sv.clear();
+        state.problem.row_types_.clear();
+        state.row_dense_candidate = true;
+        state.row_dense.reset();
+        state.cursor.ptr = section_begin;
+      }
+    }
+    if (!parallel_ok) { parse_rows_section_serial_impl(state, rows_end); }
+
+    state.cursor.ptr = rows_end;
+  }
+
+  state.problem.n_constraints_ = (i_t)state.row_names_sv.size();
+  state.problem.b_.resize((size_t)state.problem.n_constraints_);
+
+  {
+    scoped_timer_t hash_timer("parse_rows_hash_init");
+    state.init_row_hash_table();
+  }
+}
+
+// Columns parser
+
+// Integer variable marker ('MARKER' line) recorded while parsing a COLUMNS chunk.
+struct marker_info_t {
+  enum Type { INTORG, INTEND };  // 'INTORG' / 'INTEND' marker keywords
+  Type type;
+  // Chunk-local index of the last variable seen before the marker.
+  size_t after_local_var_idx;  // SIZE_MAX means "before first variable"
+};
+
+// Descriptor of one touched block of COLUMN_ROW_COUNT_BLOCK_ROWS consecutive rows in a
+// chunk's sparse row-count storage.
+struct row_count_block_t {
+  size_t block_id       = 0;  // row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS
+  size_t storage_offset = 0;  // index of the block's first slot in row_count_storage
+};
+
+// Each column parsing worker owns chunks of the global CSC which are parsed in parallel and then
+// later scattered into the final CSR
+struct chunk_result_t {
+  std::vector<double> values;                // non-zero coefficients, in parse order
+  std::vector<uint32_t> row_indices;         // row index of each entry in `values`
+  std::vector<size_t> col_offsets;           // start of each chunk-local column in `values`
+  std::vector<std::string_view> var_names;   // chunk-local column names (views into the arena)
+  chunk_name_arena_t var_name_arena;         // owns the bytes behind `var_names`
+  std::vector<marker_info_t> markers;        // integer markers seen in this chunk
+  std::vector<std::pair<size_t, double>> objective_entries;  // local_col_idx -> coefficient
+  // COLUMNS is parsed as chunk-local CSC. To build the global CSR, each chunk needs row counts
+  // first, then row-local write cursors for scatter. Store those counts only for touched
+  // 4096-row blocks instead of allocating a dense chunks*n_rows matrix
+  // The same slots are rewritten as write cursors after the global CSR row offsets are known
+  std::vector<int64_t> row_count_storage;           // one slot per row of every touched block
+  std::vector<row_count_block_t> row_count_blocks;  // touched blocks, in first-touch order
+  std::vector<int32_t> row_count_block_dir;  // block_id -> index in row_count_blocks, -1 if none
+  dense_observe_state_t dense_col_stats;     // dense-name observation over this chunk's columns
+};
+
+// Byte range [start, end) of one COLUMNS chunk, as produced by compute_chunk_boundaries.
+struct chunk_boundary_t {
+  const char* start;  // first byte of the chunk
+  const char* end;    // one past the last byte of the chunk
+};
+
+// Byte range [start, end) of one BOUNDS chunk, as produced by compute_bounds_chunk_boundaries.
+struct bounds_chunk_boundary_t {
+  const char* start;  // first byte of the chunk
+  const char* end;    // one past the last byte of the chunk
+};
+
+// Returns a reference to the per-chunk counter slot for `row_idx`.
+// Counters are stored sparsely in blocks of COLUMN_ROW_COUNT_BLOCK_ROWS rows: a block's
+// zero-initialised storage is appended the first time any of its rows is touched, and the
+// directory entry for that block is updated to point at it. This works well because non-zeros
+// are often clustered around the same matrix blocks.
+static inline int64_t& column_row_count_slot(chunk_result_t& result, size_t row_idx)
+{
+  const size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
+  const size_t within   = row_idx % COLUMN_ROW_COUNT_BLOCK_ROWS;
+
+  int32_t& dir_entry = result.row_count_block_dir[block_id];
+  if (UNLIKELY(dir_entry < 0)) {
+    // First touch of this block: register it and grow the storage by one zeroed block.
+    dir_entry = (int32_t)result.row_count_blocks.size();
+
+    row_count_block_t fresh;
+    fresh.block_id       = block_id;
+    fresh.storage_offset = result.row_count_storage.size();
+    result.row_count_storage.resize(fresh.storage_offset + COLUMN_ROW_COUNT_BLOCK_ROWS, 0);
+    result.row_count_blocks.push_back(fresh);
+  }
+
+  const size_t block_base = result.row_count_blocks[(size_t)dir_entry].storage_offset;
+  return result.row_count_storage[block_base + within];
+}
+
+// Decides whether a chunk's observed zero-padding width is compatible with the section-wide
+// one. Without global padding the chunk must be unpadded too. With global padding the chunk
+// must either use the same width, or be unpadded with a smallest id that already has at least
+// `global_pad_width` decimal digits.
+static bool dense_col_chunk_padding_compatible(const dense_observe_state_t& stats,
+                                               size_t global_pad_width)
+{
+  const auto chunk_pad = stats.index.pad_width;
+  if (global_pad_width == 0) { return chunk_pad == 0; }
+  if (chunk_pad == global_pad_width) { return true; }
+  return chunk_pad == 0 && decimal_digits_u64(stats.index.min_id) >= global_pad_width;
+}
+
+// Returns a pointer to the byte just after the first '\n' in [p, end), or `end` when the
+// range contains no newline. A `p` at or past `end` is returned unchanged.
+static const char* find_next_line(const char* p, const char* end)
+{
+  if (p >= end) { return p; }
+  const char* newline = std::find(p, end, '\n');
+  return newline == end ? end : newline + 1;
+}
+
+// Returns a view of the third whitespace-separated field of the line starting at
+// `line_start` (scanning no further than `end`), which the BOUNDS chunker treats as the
+// variable name. The result is empty when the line has fewer than three fields.
+//
+// Bytes are classified as unsigned values: comparing a plain `char` against ' ' makes bytes
+// >= 0x80 look like whitespace wherever `char` is signed, which would truncate non-ASCII
+// names on some targets but not on others.
+static std::string_view peek_bounds_line_var_name(const char* line_start, const char* end)
+{
+  // Intra-line blank: space or any control byte except the line terminator.
+  const auto is_blank = [](char c) { return (unsigned char)c <= ' ' && c != '\n'; };
+  // Part of a field: anything above the space character.
+  const auto is_token = [](char c) { return (unsigned char)c > ' '; };
+
+  const char* p = line_start;
+  // Skip the first two fields (each preceded by optional blanks).
+  for (int field = 0; field < 2; ++field) {
+    while (p < end && is_blank(*p))
+      p++;
+    while (p < end && is_token(*p))
+      p++;
+  }
+  while (p < end && is_blank(*p))
+    p++;
+  const char* var_start = p;
+  while (p < end && is_token(*p))
+    p++;
+  return std::string_view(var_start, (size_t)(p - var_start));
+}
+
+// Walks backwards from `p` to the first byte of the line containing it: the result is either
+// `section_start` or a position whose preceding byte is '\n'.
+static const char* find_line_start(const char* section_start, const char* p)
+{
+  const char* cur = p;
+  for (; cur > section_start; --cur) {
+    if (cur[-1] == '\n') { break; }
+  }
+  return cur;
+}
+
+// Splits the BOUNDS section into `num_threads` consecutive chunks of roughly equal byte size.
+// Interior cuts are moved forward to a line start and then further, line by line, for as long
+// as the lines on both sides of the cut name the same variable.
+static std::vector<bounds_chunk_boundary_t> compute_bounds_chunk_boundaries(
+  const char* section_start, const char* section_end, int num_threads)
+{
+  scoped_timer_t timer("bounds_compute_chunk_boundaries");
+
+  const size_t n_chunks        = (size_t)num_threads;
+  const size_t section_bytes   = (size_t)(section_end - section_start);
+  const size_t bytes_per_chunk = section_bytes / n_chunks;
+
+  std::vector<bounds_chunk_boundary_t> chunks(n_chunks);
+  const char* chunk_begin = section_start;
+  for (size_t c = 0; c + 1 < n_chunks; ++c) {
+    const char* cut = find_next_line(section_start + (c + 1) * bytes_per_chunk, section_end);
+
+    // Keep consecutive BOUNDS records for the same variable in one chunk, so that each thread
+    // owns full LO/UP-style groups and can apply file order locally.
+    for (; cut < section_end; cut = find_next_line(cut, section_end)) {
+      const char* line_before = find_line_start(section_start, cut - 1);
+      const auto var_before   = peek_bounds_line_var_name(line_before, section_end);
+      const auto var_after    = peek_bounds_line_var_name(cut, section_end);
+      const bool same_variable =
+        !var_before.empty() && !var_after.empty() && var_before == var_after;
+      if (!same_variable) { break; }
+    }
+
+    chunks[c].start = chunk_begin;
+    chunks[c].end   = cut;
+    chunk_begin     = cut;
+  }
+
+  // The last chunk always runs to the end of the section.
+  chunks[n_chunks - 1].start = chunk_begin;
+  chunks[n_chunks - 1].end   = section_end;
+  return chunks;
+}
+
+// Splits the COLUMNS section into `num_threads` consecutive chunks of roughly equal byte
+// size. Each interior cut is first aligned to the next line start and then advanced line by
+// line until the first field (the column name) differs from the one at the aligned position,
+// so that a column is not split across chunks. A cut never advances onto or past
+// `columns_end`: if no transition is found it stays on the start of the last line reached.
+static std::vector<chunk_boundary_t> compute_chunk_boundaries(const char* columns_start,
+                                                              const char* columns_end,
+                                                              int num_threads)
+{
+  scoped_timer_t timer("compute_chunk_boundaries");
+
+  const size_t n_chunks        = (size_t)num_threads;
+  const size_t section_bytes   = (size_t)(columns_end - columns_start);
+  const size_t bytes_per_chunk = section_bytes / n_chunks;
+
+  std::vector<chunk_boundary_t> chunks(n_chunks);
+  const char* chunk_begin = columns_start;
+  for (size_t c = 0; c + 1 < n_chunks; ++c) {
+    // Estimated cut position, aligned to the next line boundary.
+    const char* cut = find_next_line(columns_start + (c + 1) * bytes_per_chunk, columns_end);
+
+    // Column name on the aligned line; the cut moves forward until this name changes.
+    const std::string_view anchor_col = cursor_t::peek_field_at(cut, columns_end);
+
+    while (cut < columns_end) {
+      const char* next_line = find_next_line(cut, columns_end);
+      if (next_line >= columns_end) { break; }
+      cut = next_line;
+
+      const std::string_view next_col = cursor_t::peek_field_at(cut, columns_end);
+      // Empty fields and fields starting with a quote never count as a column transition.
+      // Marker-state fixup later handles any split near markers.
+      const bool is_transition =
+        !next_col.empty() && next_col[0] != '\'' && next_col != anchor_col;
+      if (is_transition) { break; }
+    }
+
+    chunks[c].start = chunk_begin;
+    chunks[c].end   = cut;
+    chunk_begin     = cut;  // each chunk starts where the previous one ended
+  }
+
+  // The last chunk always runs to the end of the section.
+  chunks[n_chunks - 1].start = chunk_begin;
+  chunks[n_chunks - 1].end   = columns_end;
+
+  return chunks;
+}
+
+// Parses one chunk [chunk_start, chunk_end) of the COLUMNS section into a chunk-local CSC.
+//
+// For every record the column name and one or two (row name, coefficient) pairs are read.
+// Row names are resolved through state.row_lookup; entries on the objective row go to
+// result.objective_entries, entries on ignored objective rows are dropped, and any other
+// unknown row name is reported through cursor.error. 'MARKER' lines are recorded in
+// result.markers together with the index of the last column seen before them. Parsing stops
+// at the end of the chunk or at a line whose first field is "RHS".
+//
+// The returned col_offsets always ends with values.size(); an empty chunk yields {0}.
+template <typename i_t, typename f_t>
+static chunk_result_t parse_columns_chunk(const char* chunk_start,
+                                          const char* chunk_end,
+                                          const parse_state_t<i_t, f_t>& state)
+{
+  chunk_result_t result;
+
+  if (chunk_start >= chunk_end) {
+    result.col_offsets.push_back(0);
+    return result;
+  }
+
+  // Capacity guesses derived from the chunk's byte size; they only affect reserve() calls.
+  size_t chunk_size     = (size_t)(chunk_end - chunk_start);
+  size_t estimated_nnz  = chunk_size / 100;
+  size_t estimated_cols = estimated_nnz / 10;
+  if (UNLIKELY(state.problem.n_constraints_ > (i_t)std::numeric_limits<int32_t>::max())) {
+    state.cursor.error("fast COLUMNS path requires <= INT32_MAX rows for chunk row indices");
+  }
+  result.values.reserve(estimated_nnz);
+  result.row_indices.reserve(estimated_nnz);
+  result.col_offsets.reserve(estimated_cols + 1);
+  result.var_names.reserve(estimated_cols);
+  result.var_name_arena.reserve(std::max<size_t>(4096, estimated_cols * 16));
+  result.objective_entries.reserve(estimated_cols);
+  // One directory entry per row block; -1 marks a block that has not been touched yet.
+  size_t n_row_blocks =
+    cuda::ceil_div((size_t)state.problem.n_constraints_, COLUMN_ROW_COUNT_BLOCK_ROWS);
+  result.row_count_block_dir.resize(n_row_blocks, -1);
+  size_t estimated_touched_blocks = std::min(n_row_blocks, std::max<size_t>(16, estimated_nnz));
+  result.row_count_blocks.reserve(estimated_touched_blocks);
+  result.row_count_storage.reserve(estimated_touched_blocks * COLUMN_ROW_COUNT_BLOCK_ROWS);
+
+  cursor_t cursor(chunk_start, (size_t)(chunk_end - chunk_start));
+  std::string_view prev_var_name = "";
+
+  cursor.skip_ws();
+
+  while (!cursor.done()) {
+    if (UNLIKELY(*cursor.ptr == 'R')) {
+      auto next = cursor.peek_field();
+      // RHS section is mandatory right after COLUMNS section
+      if (next == "RHS") { break; }
+    }
+
+    auto [var_name, field2] = cursor.read_two_fields();
+    // A second field starting with '$' comments out the rest of the record.
+    if (UNLIKELY(!field2.empty() && field2[0] == '$')) {
+      cursor.skip_to_eol();
+      expect_eol(cursor);
+      continue;
+    }
+
+    // Check for integer marker. The emptiness guard keeps the cheap first-character pre-check
+    // within bounds when the record has no second field.
+    if (UNLIKELY(!field2.empty() && field2[0] == '\'' && field2 == "'MARKER'")) {
+      auto marker_type = cursor.read_field();
+
+      marker_info_t marker;
+      marker.after_local_var_idx =
+        result.var_names.empty() ? SIZE_MAX : result.var_names.size() - 1;
+
+      if (marker_type == "'INTORG'") {
+        marker.type = marker_info_t::INTORG;
+      } else if (marker_type == "'INTEND'") {
+        marker.type = marker_info_t::INTEND;
+      } else {
+        cursor.error("unknown integer marker type in COLUMNS: %.*s",
+                     (int)marker_type.size(),
+                     marker_type.data());
+      }
+      result.markers.push_back(marker);
+
+      // Skip whatever is left on the marker line, then the line terminator itself.
+      while (!cursor.done() && !cursor.eol())
+        cursor.ptr++;
+      if (!cursor.done()) cursor.ptr++;
+      cursor.skip_ws();
+      continue;
+    }
+
+    auto row_name = field2;
+    // quite often in MIPs the coefficient is just a single-digit integer
+    double value;
+    double sign = 1.0;
+    if (cursor.ptr[0] == '-') {
+      sign = -1.0;
+      cursor.advance(1);
+    }
+    // Fast path: exactly one digit immediately followed by a line terminator.
+    if (cursor.ptr + 1 < cursor.end && fp64::is_digit(cursor.ptr[0]) &&
+        (cursor.ptr[1] == '\n' || cursor.ptr[1] == '\r')) {
+      value = sign * (cursor.ptr[0] - '0');
+      cursor.advance(1);
+    } else {
+      value = sign * fp64::parse_fp64_advance(cursor.ptr, cursor.end);
+    }
+    // usually EOL directly follows
+    if (UNLIKELY(!cursor.eol())) { cursor.skip_ws(); }
+    accept_comment(cursor);
+
+    // A change of column name starts a new chunk-local column. The name is copied into the
+    // chunk's arena and prev_var_name tracks the arena copy.
+    if (prev_var_name != var_name) {
+      std::string_view owned_var_name = result.var_name_arena.copy(var_name);
+      result.var_names.push_back(owned_var_name);
+      observe_dense_name(result.dense_col_stats.candidate,
+                         result.dense_col_stats.index,
+                         result.dense_col_stats.count,
+                         owned_var_name);
+      result.col_offsets.push_back(result.values.size());
+      prev_var_name = owned_var_name;
+    }
+
+    // Routes one (row name, coefficient) pair: constraint row, objective row, ignored
+    // objective row, or error.
+    auto add_entry = [&](std::string_view rn, double val) {
+      size_t row_idx = state.row_lookup(rn);
+      if (LIKELY(row_idx != SIZE_MAX)) {
+        assert(row_idx <= (size_t)std::numeric_limits<int32_t>::max());
+        result.values.push_back(val);
+        result.row_indices.push_back((uint32_t)row_idx);
+        column_row_count_slot(result, row_idx)++;
+      } else if (LIKELY(rn == state.objective_name_sv)) {
+        result.objective_entries.push_back({result.var_names.size() - 1, val});
+      } else if (state.ignored_objective_names.count(rn)) {
+        return;
+      } else {
+        cursor.error("unknown row name in COLUMNS: %.*s", (int)rn.size(), rn.data());
+      }
+    };
+
+    add_entry(row_name, value);
+
+    // Optional second entry on same line
+    if (!cursor.eol()) {
+      auto row_name2 = cursor.read_field();
+      if (UNLIKELY(!row_name2.empty() && row_name2[0] == '$')) {
+        cursor.skip_to_eol();
+        expect_eol(cursor);
+        continue;
+      }
+      double value2 = fp64::parse_fp64_advance(cursor.ptr, cursor.end);
+      cursor.skip_ws();
+      accept_comment(cursor);
+
+      add_entry(row_name2, value2);
+    }
+
+    expect_eol(cursor);
+  }
+
+  // Closing offset so that column c spans [col_offsets[c], col_offsets[c + 1]).
+  result.col_offsets.push_back(result.values.size());
+
+  return result;
+}
+
+// Fused merge + CSR construction: directly builds CSR from chunks without intermediate global CSC
+//
+// Global dimensions derived from the per-chunk COLUMNS results; filled in by
+// compute_column_merge_shape.
+template <typename i_t>
+struct column_merge_shape_t {
+  int num_chunks = 0;  // number of chunk results being merged
+  i_t n_rows     = 0;  // number of constraint rows
+  // Prefix sum of per-chunk column counts: chunk t's columns start at global_col_offset[t].
+  std::vector<size_t> global_col_offset;
+  size_t total_cols = 0;  // sum of all chunks' column counts
+  size_t total_nnz  = 0;  // sum of all chunks' non-zero counts
+};
+
+// Derives the global column layout from the per-chunk results: the prefix sum of column
+// counts, the total number of columns and the total number of non-zeros. When i_t is narrower
+// than int64_t, the totals are checked against the largest representable index and the parse
+// fails with a RuntimeError if they do not fit.
+template <typename i_t>
+static column_merge_shape_t<i_t> compute_column_merge_shape(
+  const std::vector<chunk_result_t>& chunks, i_t n_rows)
+{
+  column_merge_shape_t<i_t> shape;
+  shape.num_chunks = (int)chunks.size();
+  shape.n_rows     = n_rows;
+
+  const size_t n_chunks = (size_t)shape.num_chunks;
+  shape.global_col_offset.resize(n_chunks + 1);
+  {
+    scoped_timer_t timer("columns_global_offsets");
+    size_t cols_so_far = shape.global_col_offset[0];
+    for (size_t c = 0; c < n_chunks; ++c) {
+      const chunk_result_t& chunk = chunks[c];
+      cols_so_far += chunk.var_names.size();
+      shape.global_col_offset[c + 1] = cols_so_far;
+      shape.total_nnz += chunk.values.size();
+    }
+  }
+  shape.total_cols = shape.global_col_offset[n_chunks];
+
+  if constexpr (std::numeric_limits<i_t>::max() < std::numeric_limits<int64_t>::max()) {
+    const size_t index_max = (size_t)std::numeric_limits<i_t>::max();
+    const bool nnz_fits    = shape.total_nnz <= index_max;
+    if (!nnz_fits) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "fast MPS parser requires 64-bit indices: nnz=%zu exceeds index max=%zu",
+                      shape.total_nnz,
+                      index_max);
+    }
+    const bool dims_fit = shape.total_cols <= index_max && (size_t)n_rows <= index_max;
+    if (!dims_fit) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "fast MPS parser requires 64-bit indices: rows=%zu cols=%zu exceed index "
+                      "max=%zu",
+                      (size_t)n_rows,
+                      shape.total_cols,
+                      index_max);
+    }
+  }
+  return shape;
+}
+
+// Decides whether the column names of all chunks together form one dense, ordered sequence
+// (a common prefix followed by consecutive ids) and selects state.col_index_mode accordingly.
+// Chunks that observed no columns are skipped. Every other chunk must itself be a dense
+// candidate covering all of its columns, must share the first chunk's prefix, must start at
+// the id right after the previous chunk's last id, and must have compatible padding. On
+// success the prefix, id range and padding width are stored in state.col_dense.
+template <typename i_t, typename f_t>
+static void detect_dense_column_metadata(parse_state_t<i_t, f_t>& state,
+                                         const std::vector<chunk_result_t>& chunks,
+                                         const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("columns_dense_metadata");
+  bool dense_ok  = shape.total_cols > 0;
+  bool seen_any  = false;
+  std::string_view dense_prefix;
+  uint64_t next_expected_id = 0;
+  uint64_t first_id         = 0;
+  uint64_t last_id          = 0;
+  size_t pad_width          = 0;
+
+  for (int t = 0; dense_ok && t < shape.num_chunks; ++t) {
+    const chunk_result_t& chunk = chunks[(size_t)t];
+    const auto& stats           = chunk.dense_col_stats;
+    if (stats.count == 0) { continue; }
+
+    // The chunk itself must be dense over all of its columns.
+    if (!stats.candidate || stats.count != chunk.var_names.size()) {
+      dense_ok = false;
+      break;
+    }
+
+    // The first non-empty chunk defines the reference prefix, start id and padding.
+    if (!seen_any) {
+      seen_any         = true;
+      dense_prefix     = stats.index.prefix;
+      next_expected_id = stats.index.min_id;
+      first_id         = stats.index.min_id;
+      pad_width        = stats.index.pad_width;
+    }
+
+    const bool breaks_sequence = stats.index.prefix != dense_prefix ||
+                                 stats.index.min_id != next_expected_id ||
+                                 !dense_col_chunk_padding_compatible(stats, pad_width);
+    // The chunk's id range must contain exactly `count` ids.
+    const bool range_mismatch = breaks_sequence || stats.index.max_id < stats.index.min_id ||
+                                stats.index.max_id - stats.index.min_id + 1 != stats.count;
+    if (range_mismatch) {
+      dense_ok = false;
+      break;
+    }
+
+    last_id = stats.index.max_id;
+    // Guard the +1 below against wrap-around.
+    if (last_id == std::numeric_limits<uint64_t>::max()) {
+      dense_ok = false;
+      break;
+    }
+    next_expected_id = last_id + 1;
+  }
+
+  // The overall id range must account for every column exactly once.
+  const bool range_covers_all_cols =
+    seen_any && last_id >= first_id && last_id - first_id + 1 == shape.total_cols;
+  dense_ok = dense_ok && range_covers_all_cols;
+
+  if (!dense_ok) {
+    state.col_index_mode = index_mode_t::hash;
+    return;
+  }
+
+  state.col_index_mode = index_mode_t::dense_ordered;
+  state.col_dense.prefix.assign(dense_prefix);
+  state.col_dense.min_id    = first_id;
+  state.col_dense.max_id    = last_id;
+  state.col_dense.pad_width = pad_width;
+}
+
+// Sums the per-chunk, per-block row counts into one count per row, then writes the CSR row
+// offsets (exclusive prefix sum of those counts, n_rows + 1 entries) into
+// state.problem.A_offsets_. Returns the per-row counts.
+template <typename i_t, typename f_t>
+static std::vector<i_t> build_csr_row_offsets(parse_state_t<i_t, f_t>& state,
+                                              const std::vector<chunk_result_t>& chunks,
+                                              const column_merge_shape_t<i_t>& shape)
+{
+  const size_t row_total = (size_t)shape.n_rows;
+  std::vector<i_t> global_row_counts(row_total, 0);
+
+  {
+    scoped_timer_t timer("columns_sum_row_counts");
+    for (int t = 0; t < shape.num_chunks; ++t) {
+      const chunk_result_t& chunk = chunks[(size_t)t];
+      for (const auto& block : chunk.row_count_blocks) {
+        const size_t first_row = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+        // The last block may be only partially populated.
+        const size_t rows_in_block = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, row_total - first_row);
+        const int64_t* counts      = chunk.row_count_storage.data() + block.storage_offset;
+        for (size_t k = 0; k < rows_in_block; ++k) {
+          global_row_counts[first_row + k] += (i_t)counts[k];
+        }
+      }
+    }
+  }
+
+  {
+    scoped_timer_t timer("columns_build_row_offsets");
+    auto& offsets = state.problem.A_offsets_;
+    offsets.resize(row_total + 1);
+    i_t running = 0;
+    offsets[0]  = running;
+    for (size_t r = 0; r < row_total; ++r) {
+      running += global_row_counts[r];
+      offsets[r + 1] = running;
+    }
+  }
+
+  return global_row_counts;
+}
+
+// Rewrites each chunk's non-zero per-row count, in place, into the first CSR position that
+// chunk writes for that row: row_offsets[row] plus the counts of all earlier chunks for the same
+// row. Zero counts are left as they are. global_row_counts is reused as scratch; on return it
+// holds the per-row totals accumulated over all chunks.
+template <typename i_t>
+static void convert_counts_to_write_positions(std::vector<chunk_result_t>& chunks,
+                                              const column_merge_shape_t<i_t>& shape,
+                                              const std::vector<i_t>& row_offsets,
+                                              std::vector<i_t>& global_row_counts)
+{
+  scoped_timer_t timer("columns_counts_to_write_positions");
+  global_row_counts.assign(global_row_counts.size(), i_t{0});
+
+  const size_t row_total = (size_t)shape.n_rows;
+  for (int t = 0; t < shape.num_chunks; ++t) {
+    chunk_result_t& chunk = chunks[(size_t)t];
+    for (auto& block : chunk.row_count_blocks) {
+      const size_t first_row     = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+      const size_t rows_in_block = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, row_total - first_row);
+      int64_t* slots             = chunk.row_count_storage.data() + block.storage_offset;
+      for (size_t k = 0; k < rows_in_block; ++k) {
+        const int64_t entries = slots[k];
+        if (entries != 0) {
+          const size_t row        = first_row + k;
+          const i_t write_start   = row_offsets[row] + global_row_counts[row];
+          slots[k]                = (int64_t)write_start;
+          global_row_counts[row] += (i_t)entries;
+        }
+      }
+    }
+  }
+}
+
+// Runs materialize_vector_hugepages (write_2mb touch mode) over every chunk's row_count_storage,
+// one chunk per loop iteration, using up to num_threads OpenMP threads.
+static void materialize_chunk_row_count_storage(std::vector<chunk_result_t>& chunks,
+                                                int num_threads)
+{
+  scoped_timer_t timer("columns_row_count_storage_hugepages");
+  const int chunk_count = (int)chunks.size();
+#pragma omp parallel for num_threads(num_threads)
+  for (int i = 0; i < chunk_count; ++i) {
+    auto& storage = chunks[(size_t)i].row_count_storage;
+    materialize_vector_hugepages(
+      "column_row_count_storage", storage, materialize_touch_t::write_2mb);
+  }
+}
+
+// Allocates everything the COLUMNS scatter phase writes into:
+//   - anonymous mappings for the temporary CSR values (state.temp_A) and column indices
+//     (state.temp_A_indices), each at least one byte long and advised with MADV_HUGEPAGE;
+//   - per-chunk name arenas and the global name table, only when the column index is not in
+//     dense_ordered mode;
+//   - state.problem.var_types_, sized to shape.total_cols.
+// state.temp_csr_nnz is set to shape.total_nnz.
+//
+// The four allocations run as OpenMP sections. An exception must not leave an OpenMP structured
+// block, so each section is wrapped: the first failure is captured and rethrown on the calling
+// thread once the parallel region has finished.
+template <typename i_t, typename f_t>
+static void allocate_column_outputs(parse_state_t<i_t, f_t>& state,
+                                    const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("allocate_temp_csr_arrays");
+  size_t values_bytes  = shape.total_nnz * sizeof(f_t);
+  size_t indices_bytes = shape.total_nnz * sizeof(i_t);
+  state.temp_csr_nnz   = shape.total_nnz;
+
+  std::exception_ptr first_error = nullptr;
+  std::mutex error_mutex;
+  // Runs `work` and records (instead of propagating) the first exception it throws.
+  auto run_guarded = [&](auto&& work) {
+    try {
+      work();
+    } catch (...) {
+      std::lock_guard<std::mutex> lock(error_mutex);
+      if (!first_error) { first_error = std::current_exception(); }
+    }
+  };
+
+#pragma omp parallel sections num_threads(4)
+  {
+#pragma omp section
+    {
+      run_guarded([&] {
+        // max(.., 1): never request a zero-length mapping, even for an empty matrix.
+        state.temp_A_region = mmap_region_t::anonymous(std::max<size_t>(values_bytes, 1),
+                                                       PROT_READ | PROT_WRITE,
+                                                       MAP_PRIVATE,
+                                                       "temp CSR values");
+        state.temp_A        = (f_t*)state.temp_A_region.data();
+        state.temp_A_region.advise(MADV_HUGEPAGE);
+      });
+    }
+#pragma omp section
+    {
+      run_guarded([&] {
+        state.temp_A_indices_region = mmap_region_t::anonymous(std::max<size_t>(indices_bytes, 1),
+                                                               PROT_READ | PROT_WRITE,
+                                                               MAP_PRIVATE,
+                                                               "temp CSR column indices");
+        state.temp_A_indices        = (i_t*)state.temp_A_indices_region.data();
+        state.temp_A_indices_region.advise(MADV_HUGEPAGE);
+      });
+    }
+#pragma omp section
+    {
+      run_guarded([&] {
+        // Dense-ordered columns are looked up through state.col_dense; no name storage needed.
+        if (state.col_index_mode != index_mode_t::dense_ordered) {
+          state.var_name_arenas.clear();
+          state.var_name_arenas.resize((size_t)shape.num_chunks);
+          state.var_names_sv.resize(shape.total_cols);
+        }
+      });
+    }
+#pragma omp section
+    {
+      run_guarded([&] { state.problem.var_types_.resize(shape.total_cols); });
+    }
+  }
+
+  if (first_error) { std::rethrow_exception(first_error); }
+}
+
+// Scatters every chunk's column-ordered entries into the temporary CSR arrays, one chunk per
+// OpenMP loop iteration. The destination of each entry is read from the chunk's
+// row_count_storage slot for that row, which is post-incremented so the chunk's next entry for
+// the same row lands right behind it. Unless the column index is dense_ordered, variable names
+// are afterwards copied into per-chunk arenas and published through state.var_names_sv.
+template <typename i_t, typename f_t>
+static void scatter_column_chunks_to_csr(parse_state_t<i_t, f_t>& state,
+                                         std::vector<chunk_result_t>& chunks,
+                                         const column_merge_shape_t<i_t>& shape,
+                                         int num_threads)
+{
+  scoped_timer_t timer("scatter_into_csr");
+
+  {
+    scoped_timer_t matrix_timer("scatter_matrix_entries");
+#ifdef MPS_FAST_PERF_COUNTERS
+    std::vector<perf_counter_snapshot_t> perf_snapshots((size_t)shape.num_chunks);
+#endif
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < shape.num_chunks; t++) {
+#ifdef MPS_FAST_PERF_COUNTERS
+      thread_perf_counters_t perf_counters;
+#endif
+      auto& chunk               = chunks[(size_t)t];
+      const auto col_base       = shape.global_col_offset[(size_t)t];
+      const size_t n_local_cols = chunk.var_names.size();
+      for (size_t c = 0; c < n_local_cols; ++c) {
+        const i_t global_col   = (i_t)(col_base + c);
+        const size_t entry_end = chunk.col_offsets[c + 1];
+        for (size_t e = chunk.col_offsets[c]; e < entry_end; ++e) {
+          // Locate the write-position slot of this entry's row inside the chunk's blocks.
+          const i_t row           = (i_t)chunk.row_indices[e];
+          const size_t row_idx    = (size_t)row;
+          const size_t block_id   = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
+          const size_t in_block   = row_idx % COLUMN_ROW_COUNT_BLOCK_ROWS;
+          const int32_t dir_entry = chunk.row_count_block_dir[block_id];
+          row_count_block_t& blk  = chunk.row_count_blocks[(size_t)dir_entry];
+          int64_t& slot           = chunk.row_count_storage[blk.storage_offset + in_block];
+
+          // Claim the slot's current position and advance it for the next entry of this row.
+          const i_t dest = (i_t)slot++;
+          state.temp_A[dest]         = (f_t)chunk.values[e];
+          state.temp_A_indices[dest] = global_col;
+        }
+      }
+#ifdef MPS_FAST_PERF_COUNTERS
+      perf_snapshots[(size_t)t] = perf_counters.stop();
+#endif
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("scatter_matrix_entries", perf_snapshots);
+#endif
+  }
+
+  {
+    // The timer is created in both modes; in dense_ordered mode it brackets no work.
+    scoped_timer_t names_timer("scatter_var_names");
+    if (state.col_index_mode != index_mode_t::dense_ordered) {
+#pragma omp parallel for num_threads(num_threads)
+      for (int t = 0; t < shape.num_chunks; t++) {
+        const auto& names         = chunks[(size_t)t].var_names;
+        chunk_name_arena_t& arena = state.var_name_arenas[(size_t)t];
+        arena.reserve(std::max<size_t>(4096, names.size() * 16));
+        const auto col_base = shape.global_col_offset[(size_t)t];
+        for (size_t i = 0; i < names.size(); i++) {
+          state.var_names_sv[col_base + i] = arena.copy(names[i]);
+        }
+      }
+    }
+  }
+}
+
+// An integer-section marker from the COLUMNS section, re-expressed in global column indices.
+struct global_marker_t {
+  marker_info_t::Type type;  // marker kind; INTORG turns integer mode on, any other kind off
+  size_t global_var_idx;     // global index of the variable the marker follows, or SIZE_MAX
+                             // when the marker precedes every variable
+};
+
+// Assigns 'I' or 'C' to every entry of state.problem.var_types_ from the chunks' markers.
+// Each marker is first mapped to the global index of the variable it follows (SIZE_MAX when it
+// comes before every variable), the markers are ordered by that index, and a single sweep over
+// the variables flips the integer flag as markers are passed.
+template <typename i_t, typename f_t>
+static void apply_column_integer_markers(parse_state_t<i_t, f_t>& state,
+                                         const std::vector<chunk_result_t>& chunks,
+                                         const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("columns_apply_markers");
+
+  std::vector<global_marker_t> ordered_markers;
+  for (int t = 0; t < shape.num_chunks; t++) {
+    const size_t col_base = shape.global_col_offset[(size_t)t];
+    for (const auto& local_marker : chunks[(size_t)t].markers) {
+      size_t follows;
+      if (local_marker.after_local_var_idx != SIZE_MAX) {
+        follows = col_base + local_marker.after_local_var_idx;
+      } else if (col_base > 0) {
+        // Marker opens the chunk: it follows the last variable of the preceding chunks.
+        follows = col_base - 1;
+      } else {
+        // Marker precedes every variable in the section.
+        follows = SIZE_MAX;
+      }
+      ordered_markers.push_back(global_marker_t{local_marker.type, follows});
+    }
+  }
+
+  // "Before everything" markers (SIZE_MAX) sort first; the rest by the variable they follow.
+  std::stable_sort(ordered_markers.begin(),
+                   ordered_markers.end(),
+                   [](const auto& lhs, const auto& rhs) {
+                     const bool lhs_leading = lhs.global_var_idx == SIZE_MAX;
+                     const bool rhs_leading = rhs.global_var_idx == SIZE_MAX;
+                     if (lhs_leading != rhs_leading) { return lhs_leading; }
+                     return lhs.global_var_idx < rhs.global_var_idx;
+                   });
+
+  // A marker takes effect for variable v once it is leading or follows a variable before v.
+  const auto takes_effect_before = [](const global_marker_t& marker, size_t v) {
+    return marker.global_var_idx == SIZE_MAX || marker.global_var_idx < v;
+  };
+
+  bool in_integer_block = false;
+  size_t next_marker    = 0;
+  for (size_t v = 0; v < shape.total_cols; v++) {
+    for (; next_marker < ordered_markers.size() &&
+           takes_effect_before(ordered_markers[next_marker], v);
+         ++next_marker) {
+      in_integer_block = ordered_markers[next_marker].type == marker_info_t::INTORG;
+    }
+    state.problem.var_types_[v] = in_integer_block ? 'I' : 'C';
+  }
+}
+
+// Sizes the objective vector state.problem.c_ to shape.total_cols (new entries are zero) and
+// stores every chunk-local objective coefficient at its global column. Entries whose global
+// column falls outside [0, total_cols) are skipped.
+template <typename i_t, typename f_t>
+static void assign_column_objective_entries(parse_state_t<i_t, f_t>& state,
+                                            const std::vector<chunk_result_t>& chunks,
+                                            const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("columns_objective_entries");
+  auto& objective = state.problem.c_;
+  objective.resize(shape.total_cols, f_t{0});
+
+  for (int t = 0; t < shape.num_chunks; t++) {
+    const auto& entries = chunks[(size_t)t].objective_entries;
+    for (const auto& [col_in_chunk, coefficient] : entries) {
+      const size_t target = shape.global_col_offset[(size_t)t] + col_in_chunk;
+      if (target >= shape.total_cols) { continue; }
+      objective[target] = (f_t)coefficient;
+    }
+  }
+}
+
+// Stitches the per-chunk COLUMNS results into the problem: computes the merged shape, picks the
+// column index mode, builds CSR row offsets, turns per-chunk row counts into write positions,
+// allocates the outputs, scatters entries and names, and finally fills in variable types and
+// objective coefficients. Sets n_vars_ and nnz_ at the end. Does nothing for an empty chunk list.
+template <typename i_t, typename f_t>
+static void merge_chunk_results_to_csr(parse_state_t<i_t, f_t>& state,
+                                       std::vector<chunk_result_t>& chunks,
+                                       int num_threads)
+{
+  scoped_timer_t timer("merge_chunks_to_csr");
+  if (chunks.empty()) { return; }
+
+  const auto merged = compute_column_merge_shape<i_t>(chunks, state.problem.n_constraints_);
+
+  // Index mode first: allocation and name scatter below depend on it.
+  detect_dense_column_metadata(state, chunks, merged);
+
+  // Row counts -> CSR offsets -> per-chunk write positions (stored back into the chunks).
+  auto per_row_counts = build_csr_row_offsets(state, chunks, merged);
+  convert_counts_to_write_positions(chunks, merged, state.problem.A_offsets_, per_row_counts);
+  materialize_chunk_row_count_storage(chunks, num_threads);
+
+  // Allocate destinations, then fill them.
+  allocate_column_outputs(state, merged);
+  scatter_column_chunks_to_csr(state, chunks, merged, num_threads);
+  apply_column_integer_markers(state, chunks, merged);
+  assign_column_objective_entries(state, chunks, merged);
+
+  state.problem.nnz_    = (i_t)merged.total_nnz;
+  state.problem.n_vars_ = (i_t)merged.total_cols;
+}
+
+// Moves the temporary CSR arrays (state.temp_A / state.temp_A_indices) into the problem's
+// A_ and A_indices_ vectors: both vectors are resized to temp_csr_nnz entries, the bytes are
+// copied over, and the temporary regions are released. On return the temp pointers are null and
+// state.temp_csr_materialized is true.
+//
+// The two resizes run as OpenMP sections. A failed resize (e.g. std::bad_alloc) must not leave
+// the OpenMP structured block as an exception, so it is captured inside the section and rethrown
+// on the calling thread after the region; in that case nothing is copied or released here.
+template <typename i_t, typename f_t>
+static void materialize_problem_csr(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("materialize_problem_csr");
+  size_t nnz       = state.temp_csr_nnz;
+  int copy_threads = 2;
+  copy_threads     = std::max(1, std::min(copy_threads, MPS_LARGE_FILE_THREAD_CAP));
+
+  std::exception_ptr resize_error = nullptr;
+  std::mutex resize_error_mutex;
+  // Resizes `vec` to nnz entries, recording (instead of propagating) the first failure.
+  auto resize_guarded = [&](auto& vec) {
+    try {
+      vec.resize(nnz);
+    } catch (...) {
+      std::lock_guard<std::mutex> lock(resize_error_mutex);
+      if (!resize_error) { resize_error = std::current_exception(); }
+    }
+  };
+
+  int resize_threads = copy_threads > 1 ? 2 : 1;
+#pragma omp parallel sections num_threads(resize_threads)
+  {
+#pragma omp section
+    {
+      resize_guarded(state.problem.A_);
+    }
+#pragma omp section
+    {
+      resize_guarded(state.problem.A_indices_);
+    }
+  }
+  if (resize_error) { std::rethrow_exception(resize_error); }
+
+  size_t value_bytes = nnz * sizeof(f_t);
+  size_t index_bytes = nnz * sizeof(i_t);
+  size_t total_bytes = value_bytes + index_bytes;
+  // Copying into A_ and A_indices_ here lets this work overlap with the other phases, hiding
+  // the cost of the heap allocation and default initialization behind other parsing/IO instead
+  // of making the column parse wait for it.
+  // TODO: make A_ and A_indices_ anonymous mmap allocations directly in mps_data_model_t; that
+  // needs care to avoid breaking changes in the API, especially on the Cython side.
+  if (total_bytes != 0) {
+    // Values and indices are treated as one logical byte range [0, total_bytes) that is split
+    // evenly between the copy threads; a slice may straddle the values/indices boundary.
+#pragma omp parallel for num_threads(copy_threads) schedule(static)
+    for (int t = 0; t < copy_threads; ++t) {
+      size_t begin = (total_bytes * (size_t)t) / (size_t)copy_threads;
+      size_t end   = (total_bytes * (size_t)(t + 1)) / (size_t)copy_threads;
+      if (begin < value_bytes) {
+        // Part of the slice that lies in the values array.
+        size_t value_end = std::min(end, value_bytes);
+        if (value_end > begin) {
+          std::memcpy((char*)state.problem.A_.data() + begin,
+                      (const char*)state.temp_A + begin,
+                      value_end - begin);
+        }
+      }
+      if (end > value_bytes) {
+        // Part of the slice that lies in the indices array (offsets relative to its start).
+        size_t index_begin = begin > value_bytes ? begin - value_bytes : 0;
+        size_t index_end   = end - value_bytes;
+        std::memcpy((char*)state.problem.A_indices_.data() + index_begin,
+                    (const char*)state.temp_A_indices + index_begin,
+                    index_end - index_begin);
+      }
+    }
+  }
+
+  state.temp_A                = nullptr;
+  state.temp_A_indices        = nullptr;
+  state.temp_csr_materialized = true;
+  state.temp_A_region.reset();
+  state.temp_A_indices_region.reset();
+}
+
+// The COLUMNS section has a single, chunk-parallel implementation: the section body is split
+// into chunks, each chunk is parsed by parse_columns_chunk, and merge_chunk_results_to_csr
+// stitches the per-chunk results together. There is no separate serial code path -- with one
+// thread the same code simply processes one chunk.
+//
+// num_threads <= 0 selects phase_thread_count(MPS_COLUMNS_THREAD_CAP). The thread count is then
+// capped so that each chunk covers at least MPS_COLUMNS_MIN_CHUNK_BYTES where possible. On
+// return state.cursor sits at columns_end with following whitespace skipped. If any chunk
+// throws, the first exception recorded is rethrown after all chunks have finished.
+template <typename i_t, typename f_t>
+static void parse_columns_section_parallel(parse_state_t<i_t, f_t>& state,
+                                           int num_threads,
+                                           const char* columns_end)
+{
+  scoped_timer_t timer("parse_columns_parallel");
+
+  if (num_threads <= 0) { num_threads = phase_thread_count(MPS_COLUMNS_THREAD_CAP); }
+
+  // Step over the "COLUMNS" header.
+  expect_section(state.cursor, "COLUMNS");
+
+  const char* columns_begin = state.cursor.ptr;
+  const size_t body_bytes   = (size_t)(columns_end - columns_begin);
+  const int byte_limited_threads =
+    (int)std::max<size_t>(1, body_bytes / MPS_COLUMNS_MIN_CHUNK_BYTES);
+  if (byte_limited_threads < num_threads) { num_threads = byte_limited_threads; }
+  if (num_threads < 1) { num_threads = 1; }
+
+  auto chunk_bounds = compute_chunk_boundaries(columns_begin, columns_end, num_threads);
+
+  // One result slot per chunk, filled by the parallel loop below.
+  std::vector<chunk_result_t> chunk_results((size_t)num_threads);
+
+  {
+    scoped_timer_t chunk_timer("parse_columns_chunk_parallel");
+#ifdef MPS_FAST_PERF_COUNTERS
+    std::vector<perf_counter_snapshot_t> perf_snapshots((size_t)num_threads);
+#endif
+    // Exceptions may not cross the OpenMP region boundary: keep the first, rethrow afterwards.
+    std::mutex error_mutex;
+    std::exception_ptr first_error = nullptr;
+
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < num_threads; t++) {
+      try {
+        MPS_NVTX_RANGE(std::string("columns_chunk ") + std::to_string(t), nvtx::colors::columns);
+#ifdef MPS_FAST_PERF_COUNTERS
+        thread_perf_counters_t perf_counters;
+#endif
+        const auto& bounds = chunk_bounds[(size_t)t];
+        chunk_results[(size_t)t] = parse_columns_chunk<i_t, f_t>(bounds.start, bounds.end, state);
+#ifdef MPS_FAST_PERF_COUNTERS
+        perf_snapshots[(size_t)t] = perf_counters.stop();
+#endif
+      } catch (...) {
+        std::lock_guard<std::mutex> lock(error_mutex);
+        if (first_error == nullptr) { first_error = std::current_exception(); }
+      }
+    }
+
+    if (first_error != nullptr) { std::rethrow_exception(first_error); }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("parse_columns_chunk_parallel", perf_snapshots);
+#endif
+  }
+
+  // Merge the per-chunk results straight into CSR form.
+  merge_chunk_results_to_csr(state, chunk_results, num_threads);
+
+  // Leave the cursor at the start of whatever follows COLUMNS.
+  state.cursor.ptr = columns_end;
+  state.cursor.skip_ws();
+}
+
+// Parses the RHS section from `cursor`. Every data line carries a set-name field followed by one
+// or two (row, value) entries. A value for a regular row is stored in state.problem.b_; a value
+// for the objective row is stored, negated, in state.problem.objective_offset_; values for rows
+// listed in state.ignored_objective_names are dropped; any other row is reported through
+// error_unknown_row.
+template <typename i_t, typename f_t>
+static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+{
+  scoped_timer_t timer("parse_rhs");
+  expect_section(cursor, "RHS");
+
+  // Cold path only: the hot path resolves the row while reading it and keeps no name, so the
+  // name is re-scanned here from where the field started, up to the first byte <= ' '.
+  const auto field_at = [](const char* begin, const char* limit) {
+    const char* stop = begin;
+    for (; stop < limit && *stop > ' '; ++stop) {}
+    return std::string_view(begin, (size_t)(stop - begin));
+  };
+
+  const auto store_rhs = [&](const char* name_begin, size_t row, f_t rhs) {
+    if (row != SIZE_MAX) {
+      // Regular constraint row.
+      state.problem.b_[row] = rhs;
+      return;
+    }
+    // Lookup returned SIZE_MAX: find out which non-constraint row this is.
+    const std::string_view name = field_at(name_begin, cursor.end);
+    if (name == state.objective_name_sv) {
+      state.problem.objective_offset_ = -rhs;
+    } else if (state.ignored_objective_names.count(name) == 0) {
+      // Neither the objective nor an ignored extra objective.
+      error_unknown_row(cursor, name_begin, "RHS");
+    }
+  };
+
+  // Reads one "row value" pair plus an optional trailing comment.
+  const auto read_entry = [&] {
+    const char* name_begin = cursor.ptr;
+    const size_t row       = state.read_row_lookup(cursor);
+    const auto number      = expect_number_fast_pm_one(cursor);
+    store_rhs(name_begin, row, (f_t)number);
+    accept_comment(cursor);
+  };
+
+  while (cursor.ptr < cursor.end) {
+    [[maybe_unused]] auto set_name = cursor.read_field();
+    if (accept_comment(cursor)) {
+      expect_eol(cursor);
+      continue;
+    }
+
+    read_entry();
+    // A second entry on the same line is optional.
+    if (!cursor.eol()) { read_entry(); }
+    expect_eol(cursor);
+  }
+}
+
+// Finds var_name in var_names, starting near hint_idx. Bound names are almost always listed in
+// column order with occasional gaps, so the slots right after and at the hint are probed first;
+// only then does the search fall back to a linear scan of the entries after the hint, followed
+// by the entries before it. Returns the matching index, or SIZE_MAX when the name is absent.
+static size_t find_var_after_hint(const std::vector<std::string_view>& var_names,
+                                  std::string_view var_name,
+                                  size_t hint_idx)
+{
+  const size_t count = var_names.size();
+  const auto matches = [&](size_t i) { return var_names[i] == var_name; };
+
+  // Fast probes: successor of the hint, then the hint itself.
+  const size_t successor = hint_idx + 1;
+  if (successor < count && matches(successor)) { return successor; }
+  if (hint_idx < count && matches(hint_idx)) { return hint_idx; }
+
+  // Everything after the two probed slots.
+  for (size_t i = std::min(hint_idx + 2, count); i < count; ++i) {
+    if (matches(i)) { return i; }
+  }
+
+  // Wrap around: everything before the hint.
+  const size_t wrap_end = std::min(hint_idx, count);
+  for (size_t i = 0; i < wrap_end; ++i) {
+    if (matches(i)) { return i; }
+  }
+
+  return SIZE_MAX;
+}
+
+// Applies one BOUNDS record through caller-supplied setters, so every caller shares a single
+// definition of what each bound type means.
+//   bound_type           two-letter bound code (LO, UP, FX, FR, MI, PL, BV, LI, UI, SC)
+//   value / has_value    numeric field of the record and whether one was present
+//   first_bound_for_var  true when no earlier record touched this variable
+//   set_lb / set_ub      receive the new lower / upper bound
+//   set_type             receives 'I' (BV, LI, UI) or 'S' (SC)
+//   error                called as error(message, bound_type) for an SC record without a value
+//                        and for an unknown bound type
+// Returns true when the record was applied, false after `error` was called.
+template <typename f_t, typename SetLb, typename SetUb, typename SetType, typename Error>
+static bool apply_bound_record(std::string_view bound_type,
+                               f_t value,
+                               bool has_value,
+                               bool first_bound_for_var,
+                               SetLb&& set_lb,
+                               SetUb&& set_ub,
+                               SetType&& set_type,
+                               Error&& error)
+{
+  constexpr f_t pos_inf = std::numeric_limits<f_t>::infinity();
+  constexpr f_t neg_inf = -std::numeric_limits<f_t>::infinity();
+
+  // Shared by UP and UI: a negative upper bound given as the variable's first bound also opens
+  // the lower bound to -infinity.
+  const auto set_upper = [&] {
+    set_ub(value);
+    if (first_bound_for_var && value < f_t{0}) { set_lb(neg_inf); }
+  };
+
+  if (bound_type == "LO") {
+    set_lb(value);
+    return true;
+  }
+  if (bound_type == "UP") {
+    set_upper();
+    return true;
+  }
+  if (bound_type == "FX") {
+    set_lb(value);
+    set_ub(value);
+    return true;
+  }
+  if (bound_type == "FR") {
+    set_lb(neg_inf);
+    set_ub(pos_inf);
+    return true;
+  }
+  if (bound_type == "MI") {
+    set_lb(neg_inf);
+    return true;
+  }
+  if (bound_type == "PL") {
+    set_ub(pos_inf);
+    return true;
+  }
+  if (bound_type == "BV") {
+    set_lb(f_t{0});
+    set_ub(f_t{1});
+    set_type('I');
+    return true;
+  }
+  if (bound_type == "LI") {
+    set_lb(value);
+    set_type('I');
+    return true;
+  }
+  if (bound_type == "UI") {
+    set_upper();
+    set_type('I');
+    return true;
+  }
+  if (bound_type == "SC") {
+    if (UNLIKELY(!has_value)) {
+      error("SC bound requires an upper bound value", bound_type);
+      return false;
+    }
+    set_ub(value);
+    set_type('S');
+    return true;
+  }
+
+  error("unknown bound type", bound_type);
+  return false;
+}
+
+// Parallel BOUNDS parser for the common dense/ordered-name case. Returns false when the section
+// is too small or not safely parallelizable, so parse_bounds_section resets and falls back to its
+// serial path. Bound-type semantics (LO/UP/FX/...) are shared with the serial path through
+// apply_bound_record, so the two cannot drift.
+//
+// The section body [bounds_body_start, bounds_body_end) is split into one chunk per thread and
+// every chunk is parsed optimistically, writing straight into state.problem's bound and type
+// arrays. The result is accepted only if no chunk had a lookup miss, no chunk saw variable
+// indices go backwards, and the chunks' variable ranges do not overlap.
+//
+// Outcomes:
+//   - false, before any parsing, when the body is below the size threshold or fewer than two
+//     threads are available; `cursor` is not modified;
+//   - false after the optimistic pass when the ordering checks fail; cursor.ptr is put back to
+//     bounds_body_start, and the bound/type arrays may already hold values written by the pass;
+//   - true on success; integer variables that received no bound get [0, 1] and cursor.ptr is
+//     set to bounds_body_end.
+// A parse error recorded by any chunk is reported through cursor.error at the recorded position
+// (presumably throwing -- confirm against cursor_t), before the ordering checks are evaluated.
+template <typename i_t, typename f_t>
+static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
+                                                cursor_t& cursor,
+                                                const char* bounds_body_start,
+                                                const char* bounds_body_end,
+                                                size_t n_vars)
+{
+  const size_t bounds_bytes   = (size_t)(bounds_body_end - bounds_body_start);
+  const int num_threads       = phase_thread_count(MPS_BOUNDS_THREAD_CAP);
+  const bool use_dense_lookup = state.col_index_mode == index_mode_t::dense_ordered;
+  // The hint-based lookup mode uses its own size threshold.
+  const size_t min_parallel_bytes =
+    use_dense_lookup ? MPS_BOUNDS_PARALLEL_MIN_BYTES : MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES;
+  if (bounds_bytes < min_parallel_bytes || num_threads < 2) { return false; }
+
+  MPS_NVTX_RANGE(
+    use_dense_lookup ? "parse_bounds_parallel_dense" : "parse_bounds_parallel_ordered_hint",
+    nvtx::colors::bounds);
+
+  // Per-chunk summary, written only by the thread that owns the chunk.
+  struct bounds_parallel_stats_t {
+    size_t lines            = 0;         // records applied to a known variable
+    size_t dense_hits       = 0;         // successful variable lookups
+    size_t dense_misses     = 0;         // failed lookups (the chunk stops at the first one)
+    size_t comments         = 0;         // comment lines skipped
+    size_t min_var          = SIZE_MAX;  // smallest variable index seen in the chunk
+    size_t max_var          = 0;         // largest variable index seen in the chunk
+    size_t decreasing_order = 0;         // times a variable index was below its predecessor
+    const char* error_ptr   = nullptr;   // input position of a recorded error, if any
+    char error_msg[192]     = {};        // message of the recorded error
+  };
+
+  std::vector<bounds_parallel_stats_t> stats((size_t)num_threads);
+  auto boundaries =
+    compute_bounds_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads);
+
+  // One byte per variable: set once any record for that variable has been applied.
+  std::vector<uint8_t> bound_seen;
+  {
+    scoped_timer_t timer("bounds_parallel_seen_alloc");
+    bound_seen.resize(n_vars, 0);
+  }
+
+  {
+    scoped_timer_t timer(use_dense_lookup ? "parse_bounds_parallel_dense"
+                                          : "parse_bounds_parallel_ordered_hint");
+    // Repeated BOUNDS for the same variable are safe inside a group-owned chunk.
+    // Parse optimistically, then accept only if chunk summaries prove no backward jumps.
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+    for (int t = 0; t < num_threads; ++t) {
+      auto& local = stats[(size_t)t];
+      // Chunk-local cursor; it shadows the function's `cursor` parameter inside this loop.
+      cursor_t cursor(boundaries[(size_t)t].start,
+                      (size_t)(boundaries[(size_t)t].end - boundaries[(size_t)t].start));
+      cursor.skip_ws();
+      size_t prev_var = SIZE_MAX;
+      size_t hint_idx = 0;
+      auto lookup_var = [&](std::string_view var_name) {
+        if (use_dense_lookup) { return state.col_dense.lookup(var_name); }
+        // quite often variables are in order, so a cheap lookup trick is to look for the variable
+        // right after this one
+        return find_var_after_hint(state.var_names_sv, var_name, hint_idx);
+      };
+      // NOTE(review): only std::exception is caught below; any other exception type would leave
+      // the OpenMP region -- confirm the parser helpers throw std::exception-derived types only.
+      try {
+        while (cursor.ptr < cursor.end) {
+          // Comment line starting at the current position.
+          if (UNLIKELY(*cursor.ptr == '$')) {
+            cursor.skip_to_eol();
+            expect_eol(cursor);
+            local.comments++;
+            continue;
+          }
+
+          auto bound_type = cursor.read_field();
+          if (UNLIKELY(bound_type.empty())) { break; }
+          if (UNLIKELY(bound_type[0] == '$')) {
+            cursor.skip_to_eol();
+            expect_eol(cursor);
+            local.comments++;
+            continue;
+          }
+
+          [[maybe_unused]] auto bound_name = cursor.read_field();
+          auto var_name                    = cursor.read_field();
+          if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) {
+            cursor.skip_to_eol();
+            expect_eol(cursor);
+            local.comments++;
+            continue;
+          }
+
+          size_t var_idx = lookup_var(var_name);
+          if (UNLIKELY(var_idx == SIZE_MAX)) {
+            // Unknown variable: stop this chunk; the miss count below forces the fallback.
+            local.dense_misses++;
+            break;
+          }
+          hint_idx = var_idx;
+          local.dense_hits++;
+          local.lines++;
+          local.min_var = std::min(local.min_var, var_idx);
+          local.max_var = std::max(local.max_var, var_idx);
+          if (prev_var != SIZE_MAX && var_idx < prev_var) { local.decreasing_order++; }
+          prev_var = var_idx;
+
+          bool first_bound_for_var = bound_seen[var_idx] == 0;
+          bound_seen[var_idx]      = 1;
+
+          // The numeric field is optional; `value` stays 0 when it is absent.
+          f_t value      = 0;
+          bool has_value = false;
+          accept_comment(cursor);
+          if (!cursor.eol()) {
+            value     = (f_t)expect_number_fast_pm_one(cursor);
+            has_value = true;
+            accept_comment(cursor);
+          }
+
+          auto set_lb    = [&](f_t x) { state.problem.variable_lower_bounds_[var_idx] = x; };
+          auto set_ub    = [&](f_t x) { state.problem.variable_upper_bounds_[var_idx] = x; };
+          auto set_type  = [&](char t) { state.problem.var_types_[var_idx] = t; };
+          // Records the error in the chunk summary; it is raised after the parallel region.
+          // Only the "unknown bound type" message gets the offending type appended.
+          auto set_error = [&](const char* msg, std::string_view type) {
+            if (type.empty() || std::strcmp(msg, "unknown bound type") != 0) {
+              std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", msg);
+            } else {
+              std::snprintf(local.error_msg,
+                            sizeof(local.error_msg),
+                            "%s: %.*s",
+                            msg,
+                            (int)type.size(),
+                            type.data());
+            }
+            local.error_ptr = cursor.ptr;
+          };
+          if (!apply_bound_record(bound_type,
+                                  value,
+                                  has_value,
+                                  first_bound_for_var,
+                                  set_lb,
+                                  set_ub,
+                                  set_type,
+                                  set_error)) {
+            break;
+          }
+
+          expect_eol(cursor);
+        }
+      } catch (const std::exception& e) {
+        // Keep the message and position; the error is re-raised on the calling thread below.
+        std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", e.what());
+        local.error_ptr = cursor.ptr;
+      }
+    }
+  }
+
+  // Combine the chunk summaries in chunk order.
+  size_t dense_misses     = 0;
+  size_t decreasing_order = 0;
+  size_t overlap_chunks   = 0;
+  size_t prev_max         = SIZE_MAX;
+  for (int t = 0; t < num_threads; ++t) {
+    const auto& local = stats[(size_t)t];
+    if (local.error_ptr != nullptr) {
+      cursor.ptr = local.error_ptr;
+      cursor.error("%s", local.error_msg);
+    }
+    dense_misses += local.dense_misses;
+    decreasing_order += local.decreasing_order;
+    if (local.lines > 0) {
+      // A chunk starting at or before the previous non-empty chunk's maximum overlaps it.
+      if (prev_max != SIZE_MAX && local.min_var <= prev_max) { overlap_chunks++; }
+      prev_max = local.max_var;
+    }
+  }
+
+  const bool order_safe = dense_misses == 0 && decreasing_order == 0 && overlap_chunks == 0;
+
+  if (!order_safe) {
+    std::fprintf(stderr,
+                 "[WARN] parallel BOUNDS fallback to serial: lookup_misses=%zu "
+                 "decreasing_order=%zu overlap_chunks=%zu\n",
+                 dense_misses,
+                 decreasing_order,
+                 overlap_chunks);
+    // Rewind so the caller can re-parse the section from its start.
+    cursor.ptr = bounds_body_start;
+    return false;
+  }
+
+  {
+    scoped_timer_t timer("bounds_integer_defaults");
+    // Integer variables that never appeared in BOUNDS get the range [0, 1].
+    for (size_t i = 0; i < n_vars; ++i) {
+      if (!bound_seen[i] && state.problem.var_types_[i] == 'I') {
+        state.problem.variable_lower_bounds_[i] = f_t{0};
+        state.problem.variable_upper_bounds_[i] = f_t{1};
+      }
+    }
+  }
+
+  cursor.ptr = bounds_body_end;
+  return true;
+}
+
+// Sizes the variable bound vectors to n_vars_ entries, filling new entries with lower bound 0
+// and upper bound +infinity, then runs materialize_vector_hugepages (write_4kb touch mode) over
+// both vectors.
+template <typename i_t, typename f_t>
+static void init_variable_bounds_defaults(parse_state_t<i_t, f_t>& state)
+{
+  auto& lower_bounds     = state.problem.variable_lower_bounds_;
+  auto& upper_bounds     = state.problem.variable_upper_bounds_;
+  const size_t var_count = (size_t)state.problem.n_vars_;
+
+  {
+    scoped_timer_t timer("bounds_init_defaults");
+    lower_bounds.resize(var_count, f_t{0});
+    upper_bounds.resize(var_count, std::numeric_limits<f_t>::infinity());
+  }
+
+  {
+    scoped_timer_t timer("bounds_madvise_pretouch");
+    materialize_vector_hugepages(
+      "variable_lower_bounds", lower_bounds, materialize_touch_t::write_4kb);
+    materialize_vector_hugepages(
+      "variable_upper_bounds", upper_bounds, materialize_touch_t::write_4kb);
+  }
+}
+
+// Gives every integer variable ('I' in var_types_) for which has_bound(index) is false the
+// bounds [0, 1]. All other variables are left untouched.
+template <typename i_t, typename f_t, typename HasBound>
+static void apply_unspecified_integer_bounds(parse_state_t<i_t, f_t>& state, HasBound&& has_bound)
+{
+  scoped_timer_t timer("bounds_integer_defaults");
+  auto& problem          = state.problem;
+  const size_t var_count = (size_t)problem.n_vars_;
+  for (size_t v = 0; v < var_count; ++v) {
+    if (has_bound(v)) { continue; }
+    if (problem.var_types_[v] != 'I') { continue; }
+    problem.variable_lower_bounds_[v] = f_t{0};
+    problem.variable_upper_bounds_[v] = f_t{1};
+  }
+}
+
+// Bound setup for the case where no BOUNDS records are applied: default bounds for every
+// variable, then the [0, 1] range for every integer variable (none of them counts as bounded).
+template <typename i_t, typename f_t>
+static void init_variable_bounds_without_bounds_section(parse_state_t<i_t, f_t>& state)
+{
+  init_variable_bounds_defaults(state);
+
+  const auto never_bounded = [](size_t) { return false; };
+  apply_unspecified_integer_bounds(state, never_bounded);
+}
+
+// Parses the BOUNDS section (when the cursor is positioned on one) into the problem's variable
+// bound and type arrays.
+//
+// Flow:
+//   1. init_variable_bounds_defaults() is called.
+//   2. Without a BOUNDS header only the integer [0, 1] default is applied.
+//   3. With `allow_parallel_dense`, parse_bounds_section_parallel_dense() is tried first; when it
+//      returns false the bound arrays are refilled with 0 / +inf and the serial loop runs.
+//   4. The serial loop reads `<type> <set name> <variable> [value]` records. Variables that the
+//      column index does not know are accumulated in state.bounds_only_vars instead.
+//   5. Integer variables that never received a bound record get the [0, 1] default.
+template <typename i_t, typename f_t>
+static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
+                                 cursor_t& cursor,
+                                 bool allow_parallel_dense = false)
+{
+  size_t var_count = static_cast<size_t>(state.problem.n_vars_);
+  init_variable_bounds_defaults(state);
+
+  // One bit per COLUMNS variable, set once a BOUNDS record has been applied to it.
+  std::vector<uint64_t> seen_bits((var_count + 63) / 64, 0);
+  auto has_bound = [&](size_t var_idx) {
+    const uint64_t mask = uint64_t{1} << (var_idx & 63);
+    return (seen_bits[var_idx >> 6] & mask) != 0;
+  };
+  auto mark_bound = [&](size_t var_idx) {
+    const uint64_t mask = uint64_t{1} << (var_idx & 63);
+    seen_bits[var_idx >> 6] |= mask;
+  };
+
+  if (!accept_section(cursor, "BOUNDS")) {
+    apply_unspecified_integer_bounds(state, has_bound);
+    return;
+  }
+
+  if (allow_parallel_dense) {
+    const char* body_begin = cursor.ptr;
+    const char* body_end   = cursor.end;
+    if (parse_bounds_section_parallel_dense(state, cursor, body_begin, body_end, var_count)) {
+      return;
+    }
+    // The parallel attempt reported failure: refill both bound arrays before parsing serially.
+    scoped_timer_t timer("bounds_parallel_fallback_reset");
+    auto& lower_bounds = state.problem.variable_lower_bounds_;
+    auto& upper_bounds = state.problem.variable_upper_bounds_;
+    std::fill(lower_bounds.begin(), lower_bounds.end(), f_t{0});
+    std::fill(upper_bounds.begin(), upper_bounds.end(), std::numeric_limits<f_t>::infinity());
+  }
+
+  size_t hint_idx = 0;
+  {
+    scoped_timer_t timer("parse_bounds");
+    while (!cursor.done()) {
+      auto bound_type                = cursor.read_field();
+      [[maybe_unused]] auto set_name = cursor.read_field();
+      auto var_name                  = cursor.read_field();
+      if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) {
+        cursor.skip_to_eol();
+        expect_eol(cursor);
+        continue;
+      }
+
+      // Resolve the variable; the hint-based search is used when columns are not dense-ordered
+      // (bounds often appear in the same order as columns).
+      size_t var_idx = SIZE_MAX;
+      if (LIKELY(state.col_index_mode == index_mode_t::dense_ordered)) {
+        var_idx = state.col_dense.lookup(var_name);
+      } else {
+        var_idx = find_var_after_hint(state.var_names_sv, var_name, hint_idx);
+      }
+
+      // Variables that appear only in BOUNDS (never declared in COLUMNS) are tracked separately.
+      typename parse_state_t<i_t, f_t>::bounds_only_var_t* aux_var = nullptr;
+      if (var_idx == SIZE_MAX) {
+        aux_var = &state.bounds_only_vars[var_name];
+      } else {
+        hint_idx = var_idx;
+      }
+      const bool first_bound_for_var = aux_var == nullptr && !has_bound(var_idx);
+
+      // The numeric value is optional.
+      f_t value      = 0;
+      bool has_value = false;
+      accept_comment(cursor);
+      if (!cursor.eol()) {
+        value     = (f_t)expect_number(cursor);
+        has_value = true;
+        accept_comment(cursor);
+      }
+
+      // Setters route to the bounds-only record when there is one, otherwise to the problem.
+      auto set_lb = [&](f_t x) {
+        if (aux_var != nullptr) {
+          aux_var->lb = x;
+          return;
+        }
+        state.problem.variable_lower_bounds_[var_idx] = x;
+      };
+      auto set_ub = [&](f_t x) {
+        if (aux_var != nullptr) {
+          aux_var->ub = x;
+          return;
+        }
+        state.problem.variable_upper_bounds_[var_idx] = x;
+      };
+      auto set_type = [&](char t) {
+        if (aux_var != nullptr) {
+          aux_var->type = t;
+          return;
+        }
+        state.problem.var_types_[var_idx] = t;
+      };
+
+      auto set_error = [&](const char* msg, std::string_view type) {
+        if (std::strcmp(msg, "unknown bound type") == 0) {
+          cursor.error("%s: %.*s", msg, (int)type.size(), type.data());
+        }
+        cursor.error("%s", msg);
+      };
+      [[maybe_unused]] bool bound_applied = apply_bound_record(
+        bound_type, value, has_value, first_bound_for_var, set_lb, set_ub, set_type, set_error);
+      if (aux_var == nullptr) { mark_bound(var_idx); }
+
+      expect_eol(cursor);
+    }
+  }
+  apply_unspecified_integer_bounds(state, has_bound);
+}
+
+// Resizes both constraint-bound arrays to n_constraints_ and derives each row's bounds from its
+// ROWS type and right-hand side b:
+//   'E' -> [b, b]      'L' -> [-inf, b]      'G' -> [b, +inf]
+// Rows of any other type are not assigned here.
+template <typename i_t, typename f_t>
+static void init_constraint_bounds_from_rows(parse_state_t<i_t, f_t>& state)
+{
+  auto& problem        = state.problem;
+  const auto row_count = static_cast<size_t>(problem.n_constraints_);
+  const f_t inf        = std::numeric_limits<f_t>::infinity();
+  auto& lower          = problem.constraint_lower_bounds_;
+  auto& upper          = problem.constraint_upper_bounds_;
+  lower.resize(row_count);
+  upper.resize(row_count);
+
+  for (i_t row = 0; row < problem.n_constraints_; ++row) {
+    const f_t rhs = problem.b_[row];
+    switch (problem.row_types_[row]) {
+      case 'E':
+        lower[row] = rhs;
+        upper[row] = rhs;
+        break;
+      case 'L':
+        lower[row] = -inf;
+        upper[row] = rhs;
+        break;
+      case 'G':
+        lower[row] = rhs;
+        upper[row] = inf;
+        break;
+      default: break;
+    }
+  }
+}
+
+// Parses the optional RANGES section. Constraint bounds are first initialised from the row types
+// and right-hand sides; each `<row> <R>` entry then moves one side of that row:
+//   'E', R >= 0 : upper = lower + |R|
+//   'E', R <  0 : lower = upper - |R|
+//   'L'         : lower = upper - |R|
+//   'G'         : upper = lower + |R|
+// A record is the range-set name followed by one or two `<row> <R>` pairs. An unknown row name
+// is reported through cursor.error().
+template <typename i_t, typename f_t>
+static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+{
+  scoped_timer_t timer("parse_ranges");
+  init_constraint_bounds_from_rows(state);
+
+  if (!accept_section(cursor, "RANGES")) { return; }
+
+  auto apply_range = [&](std::string_view row_name, f_t range_val) {
+    const size_t row_idx = state.row_lookup(row_name);
+    if (row_idx == SIZE_MAX) {
+      cursor.error("unknown row name in RANGES: %.*s", (int)row_name.size(), row_name.data());
+    }
+    auto& lower     = state.problem.constraint_lower_bounds_;
+    auto& upper     = state.problem.constraint_upper_bounds_;
+    const f_t width = std::abs(range_val);
+
+    switch (state.problem.row_types_[row_idx]) {
+      case 'E':
+        if (range_val >= 0) {
+          upper[row_idx] = lower[row_idx] + width;
+        } else {
+          lower[row_idx] = upper[row_idx] - width;
+        }
+        break;
+      case 'L': lower[row_idx] = upper[row_idx] - width; break;
+      case 'G': upper[row_idx] = lower[row_idx] + width; break;
+      default: break;
+    }
+  };
+
+  while (cursor.ptr < cursor.end) {
+    [[maybe_unused]] auto set_name = cursor.read_field();
+    if (accept_comment(cursor)) {
+      expect_eol(cursor);
+      continue;
+    }
+
+    // First (mandatory) pair.
+    auto first_row   = cursor.read_field();
+    auto first_value = (f_t)expect_number(cursor);
+    apply_range(first_row, first_value);
+
+    accept_comment(cursor);
+    if (cursor.eol()) {
+      expect_eol(cursor);
+      continue;
+    }
+
+    // Optional second pair on the same line; a '$' field starts a trailing comment instead.
+    auto second_row = cursor.read_field();
+    if (UNLIKELY(!second_row.empty() && second_row[0] == '$')) {
+      cursor.skip_to_eol();
+      expect_eol(cursor);
+      continue;
+    }
+    auto second_value = (f_t)expect_number(cursor);
+    apply_range(second_row, second_value);
+    accept_comment(cursor);
+    expect_eol(cursor);
+  }
+}
+
+// Quadratic-section handling is bare bones for now; optimize if needed.
+
+// Lazily fills state.var_names_map (variable name -> index) from state.var_names_sv.
+// Nothing is done when columns use the dense-ordered index or the map already has entries.
+template <typename i_t, typename f_t>
+static void build_var_name_map_if_needed(parse_state_t<i_t, f_t>& state)
+{
+  const bool dense_columns = state.col_index_mode == index_mode_t::dense_ordered;
+  if (dense_columns) { return; }
+  if (!state.var_names_map.empty()) { return; }
+
+  scoped_timer_t timer("quadratic_build_var_name_map");
+  state.var_names_map.reserve((size_t)state.problem.n_vars_ * 2);
+  size_t index = 0;
+  for (const auto& name : state.var_names_sv) {
+    state.var_names_map.emplace(name, index);
+    ++index;
+  }
+}
+
+// Resolves a variable name used in a quadratic section to its index, or SIZE_MAX when unknown.
+// Dense-ordered columns are resolved through state.col_dense; otherwise state.var_names_map is
+// consulted (it is filled by build_var_name_map_if_needed()).
+template <typename i_t, typename f_t>
+static size_t lookup_quadratic_var(parse_state_t<i_t, f_t>& state, std::string_view name)
+{
+  if (state.col_index_mode != index_mode_t::dense_ordered) {
+    const auto found = state.var_names_map.find(name);
+    if (found == state.var_names_map.end()) { return SIZE_MAX; }
+    return found->second;
+  }
+  return state.col_dense.lookup(name);
+}
+
+// Builds the CSR arrays of the quadratic objective (Q_objective_values_ / _indices_ / _offsets_)
+// from (row, col, value) triplets.
+//   * With `symmetric_upper_triangular`, every off-diagonal triplet is mirrored to (col, row).
+//   * Entries are ordered by (row, col); entries sharing a coordinate keep their input order and
+//     are not merged.
+//   * Each stored value is the input value multiplied by 0.5.
+// An empty `entries` leaves the existing arrays untouched.
+template <typename i_t, typename f_t>
+static void build_quadratic_csr(parse_state_t<i_t, f_t>& state,
+                                const std::vector<std::tuple<i_t, i_t, f_t>>& entries,
+                                bool symmetric_upper_triangular)
+{
+  scoped_timer_t timer("build_quadratic_csr");
+  const size_t n_vars = (size_t)state.problem.n_vars_;
+  if (entries.empty()) { return; }
+
+  struct csr_item_t {
+    size_t row;
+    size_t col;
+    size_t seq;  // insertion order, used as the final sort key
+    f_t value;
+  };
+
+  std::vector<csr_item_t> items;
+  items.reserve(entries.size() * (symmetric_upper_triangular ? 2 : 1));
+  for (const auto& triplet : entries) {
+    const size_t row = (size_t)std::get<0>(triplet);
+    const size_t col = (size_t)std::get<1>(triplet);
+    const f_t value  = std::get<2>(triplet);
+    items.push_back({row, col, items.size(), value});
+    if (symmetric_upper_triangular && row != col) {
+      items.push_back({col, row, items.size(), value});
+    }
+  }
+
+  std::stable_sort(items.begin(), items.end(), [](const csr_item_t& lhs, const csr_item_t& rhs) {
+    return std::tie(lhs.row, lhs.col, lhs.seq) < std::tie(rhs.row, rhs.col, rhs.seq);
+  });
+
+  auto& values  = state.problem.Q_objective_values_;
+  auto& indices = state.problem.Q_objective_indices_;
+  auto& offsets = state.problem.Q_objective_offsets_;
+  values.clear();
+  indices.clear();
+  offsets.assign(n_vars + 1, i_t{0});
+  values.reserve(items.size());
+  indices.reserve(items.size());
+
+  // Emit the entries while counting them per row, then turn the counts into row offsets.
+  for (const auto& item : items) {
+    values.push_back(item.value * f_t{0.5});
+    indices.push_back((i_t)item.col);
+    ++offsets[item.row + 1];
+  }
+  for (size_t row = 0; row < n_vars; ++row) {
+    offsets[row + 1] += offsets[row];
+  }
+}
+
+// Parses the trailing quadratic sections: QUADOBJ, QMATRIX and any number of QCMATRIX blocks.
+//
+// Each data record is `<var1> <var2> <value>` and is appended to the section that is currently
+// open. QCMATRIX headers name a constraint row, which must exist and have ROWS type L or G; the
+// block's entries are stored in state.qcmatrix_blocks. After the loop the quadratic objective CSR
+// is built from QUADOBJ (mirrored as upper triangular) when it has entries, otherwise from
+// QMATRIX (taken as given) when that has entries.
+//
+// Parsing stops at the first record seen while no section is open, or at an empty first field.
+template <typename i_t, typename f_t>
+static void parse_quadratic_sections(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+{
+  scoped_timer_t timer("parse_quadratic_sections");
+  if (cursor.done()) { return; }
+
+  build_var_name_map_if_needed(state);
+  using entry_list_t = std::vector<std::tuple<i_t, i_t, f_t>>;
+  entry_list_t quadobj_entries;
+  entry_list_t qmatrix_entries;
+  entry_list_t* active_entries = nullptr;
+
+  // Looks a variable up and reports an error for unknown names.
+  auto resolve_var = [&](std::string_view name) {
+    const size_t idx = lookup_quadratic_var(state, name);
+    if (idx == SIZE_MAX) {
+      cursor.error(
+        "unknown variable name in quadratic section: %.*s", (int)name.size(), name.data());
+    }
+    return idx;
+  };
+  auto add_entry = [&](std::string_view var1, std::string_view var2, f_t value) {
+    const size_t first_idx  = resolve_var(var1);
+    const size_t second_idx = resolve_var(var2);
+    active_entries->emplace_back((i_t)first_idx, (i_t)second_idx, value);
+  };
+
+  while (cursor.ptr < cursor.end) {
+    // Section headers switch the destination of the records that follow.
+    if (accept_section(cursor, "QUADOBJ")) {
+      active_entries = &quadobj_entries;
+      continue;
+    }
+    if (accept_section(cursor, "QMATRIX")) {
+      active_entries = &qmatrix_entries;
+      continue;
+    }
+    if (accept(cursor, "QCMATRIX")) {
+      auto constraint_name = cursor.read_field();
+      if (constraint_name.empty()) { cursor.error("QCMATRIX missing constraint row name"); }
+      const size_t constraint_idx = state.row_lookup(constraint_name);
+      if (constraint_idx == SIZE_MAX) {
+        cursor.error("unknown constraint row name in QCMATRIX: %.*s",
+                     (int)constraint_name.size(),
+                     constraint_name.data());
+      }
+      const char constraint_type = state.problem.row_types_[constraint_idx];
+      const bool is_inequality   = constraint_type == 'L' || constraint_type == 'G';
+      if (!is_inequality) {
+        cursor.error("QCMATRIX row must have ROWS type L or G: %.*s",
+                     (int)constraint_name.size(),
+                     constraint_name.data());
+      }
+      expect_eol(cursor);
+      typename parse_state_t<i_t, f_t>::qcmatrix_block_t block;
+      block.row_idx  = constraint_idx;
+      block.row_name = constraint_name;
+      state.qcmatrix_blocks.push_back(std::move(block));
+      // Re-taken after every push_back, so the pointer always refers to the newest block.
+      active_entries = &state.qcmatrix_blocks.back().entries;
+      continue;
+    }
+    if (active_entries == nullptr) { break; }
+
+    const char* field_start = cursor.ptr;
+    auto var1               = cursor.read_field();
+    if (UNLIKELY(var1.empty())) { break; }
+    if (UNLIKELY(var1[0] == '$' || var1[0] == '*')) {
+      cursor.skip_to_eol();
+      expect_eol(cursor);
+      continue;
+    }
+    // A token that begins in column one is not a data record.
+    const bool starts_column_one =
+      field_start == cursor.start || field_start[-1] == '\n' || field_start[-1] == '\r';
+    if (UNLIKELY(starts_column_one)) {
+      cursor.error("unknown quadratic section record: %.*s", (int)var1.size(), var1.data());
+    }
+    auto var2 = cursor.read_field();
+    if (UNLIKELY(!var2.empty() && var2[0] == '$')) {
+      cursor.skip_to_eol();
+      expect_eol(cursor);
+      continue;
+    }
+    f_t value = (f_t)expect_number(cursor);
+    add_entry(var1, var2, value);
+    accept_comment(cursor);
+    expect_eol(cursor);
+  }
+
+  const bool use_quadobj = !quadobj_entries.empty();
+  if (use_quadobj || !qmatrix_entries.empty()) {
+    build_quadratic_csr(state, use_quadobj ? quadobj_entries : qmatrix_entries, use_quadobj);
+  }
+}
+
+// Points state.cursor at the byte range [range.begin, range.end).
+template <typename i_t, typename f_t>
+static void set_cursor_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  auto& shared_cursor = state.cursor;
+  shared_cursor.ptr   = range.begin;
+  shared_cursor.end   = range.end;
+}
+
+// Parses the header phase over `range`: after accept_comment_line(), the NAME, OBJSENSE and
+// OBJNAME section parsers run in that order unless the cursor is already done.
+template <typename i_t, typename f_t>
+static void parse_header_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  set_cursor_range(state, range);
+  accept_comment_line(state.cursor);
+  if (!state.cursor.done()) {
+    parse_name_section(state);
+    parse_objsense_section(state);
+    parse_objname_section(state);
+  }
+}
+
+// Parses the ROWS phase: positions state.cursor on `range` and hands range.end to
+// parse_rows_section().
+template <typename i_t, typename f_t>
+static void parse_rows_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  set_cursor_range(state, range);
+  const auto section_end = range.end;
+  parse_rows_section(state, section_end);
+}
+
+// Parses the COLUMNS phase: positions state.cursor on `range` and forwards `num_threads` and
+// range.end to parse_columns_section_parallel().
+template <typename i_t, typename f_t>
+static void parse_columns_range(parse_state_t<i_t, f_t>& state,
+                                mps_phase_range_t range,
+                                int num_threads = 0)
+{
+  set_cursor_range(state, range);
+  const auto section_end = range.end;
+  parse_columns_section_parallel(state, num_threads, section_end);
+}
+
+// Parses the RHS phase with a cursor local to `range`; does nothing when the phase is absent.
+template <typename i_t, typename f_t>
+static void parse_rhs_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  if (range.present) {
+    const auto byte_count = (size_t)(range.end - range.begin);
+    cursor_t section_cursor(range.begin, byte_count);
+    parse_rhs_section(state, section_cursor);
+  }
+}
+
+// Parses the BOUNDS phase with a cursor local to `range`, allowing the parallel dense path.
+// When the phase is absent, the no-BOUNDS initialisation runs instead.
+template <typename i_t, typename f_t>
+static void parse_bounds_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  if (range.present) {
+    const auto byte_count = (size_t)(range.end - range.begin);
+    cursor_t section_cursor(range.begin, byte_count);
+    parse_bounds_section(state, section_cursor, true);
+  } else {
+    init_variable_bounds_without_bounds_section(state);
+  }
+}
+
+// Parses the RANGES phase with a cursor local to `range`. When the phase is absent, constraint
+// bounds are still initialised from the row types and right-hand sides.
+template <typename i_t, typename f_t>
+static void parse_ranges_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  if (range.present) {
+    const auto byte_count = (size_t)(range.end - range.begin);
+    cursor_t section_cursor(range.begin, byte_count);
+    parse_ranges_section(state, section_cursor);
+  } else {
+    init_constraint_bounds_from_rows(state);
+  }
+}
+
+// Parses the quadratic phase with a cursor local to `range`; does nothing when it is absent.
+template <typename i_t, typename f_t>
+static void parse_quadratic_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  if (range.present) {
+    const auto byte_count = (size_t)(range.end - range.begin);
+    cursor_t section_cursor(range.begin, byte_count);
+    parse_quadratic_sections(state, section_cursor);
+  }
+}
+
+// Moves every constraint row that owns a non-empty QCMATRIX block out of the linear constraint
+// set and into state.problem.quadratic_constraints_.
+//
+// For each such row a quadratic_constraint_t is emitted that carries the row's index in the
+// original (pre-removal) numbering, its name, type and right-hand side, a copy of its linear
+// coefficients, and the block's (row, col, value) entries sorted by (row, col). Entries sharing
+// a coordinate keep their input order, matching the tie-break used by build_quadratic_csr().
+// The linear arrays (A_, A_indices_, A_offsets_, b_, constraint bounds, row names and row types)
+// are then rebuilt without those rows, and n_constraints_ / nnz_ are updated.
+//
+// Reports an error through state.cursor when a block's row index is out of range or when a row
+// has more than one non-empty block. QCMATRIX blocks without entries are ignored.
+template <typename i_t, typename f_t>
+static void finalize_qcmatrix_constraints(parse_state_t<i_t, f_t>& state)
+{
+  if (state.qcmatrix_blocks.empty()) { return; }
+  scoped_timer_t timer("finalize_qcmatrix_constraints");
+  const size_t original_rows = (size_t)state.problem.n_constraints_;
+  // quadratic_rows[r] != 0 marks row r as owned by a non-empty QCMATRIX block; it doubles as the
+  // duplicate-block detector.
+  std::vector<uint8_t> quadratic_rows(original_rows, 0);
+  size_t active_blocks = 0;
+
+  for (const auto& block : state.qcmatrix_blocks) {
+    if (block.entries.empty()) { continue; }
+    if (block.row_idx >= original_rows) {
+      state.cursor.error("QCMATRIX row index is out of range");
+    }
+    if (quadratic_rows[block.row_idx]) {
+      state.cursor.error("duplicate QCMATRIX block for constraint row: %.*s",
+                         (int)block.row_name.size(),
+                         block.row_name.data());
+    }
+    quadratic_rows[block.row_idx] = 1;
+    ++active_blocks;
+  }
+
+  if (active_blocks == 0) { return; }
+
+  // Capture each quadratic row's data before the linear arrays are rebuilt below. This is a
+  // brute-force rebuild of A_; parsing QCMATRIX before building the CSR in COLUMNS could avoid
+  // it, but it is unclear whether that would be worth it.
+  for (const auto& block : state.qcmatrix_blocks) {
+    if (block.entries.empty()) { continue; }
+
+    size_t linear_begin = (size_t)state.problem.A_offsets_[block.row_idx];
+    size_t linear_end   = (size_t)state.problem.A_offsets_[block.row_idx + 1];
+    typename mps_data_model_t<i_t, f_t>::quadratic_constraint_t qc;
+    qc.constraint_row_index = (i_t)block.row_idx;
+    qc.constraint_row_name  = state.problem.row_names_[block.row_idx];
+    qc.constraint_row_type  = state.problem.row_types_[block.row_idx];
+    qc.rhs_value            = state.problem.b_[block.row_idx];
+    qc.linear_values.assign(state.problem.A_.begin() + linear_begin,
+                            state.problem.A_.begin() + linear_end);
+    qc.linear_indices.assign(state.problem.A_indices_.begin() + linear_begin,
+                             state.problem.A_indices_.begin() + linear_end);
+
+    // Sort a permutation rather than the entries themselves so the input position is available
+    // as the last sort key.
+    std::vector<size_t> perm(block.entries.size());
+    for (size_t i = 0; i < perm.size(); ++i) {
+      perm[i] = i;
+    }
+    std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) {
+      const auto& ea = block.entries[a];
+      const auto& eb = block.entries[b];
+      if (std::get<0>(ea) != std::get<0>(eb)) { return std::get<0>(ea) < std::get<0>(eb); }
+      if (std::get<1>(ea) != std::get<1>(eb)) { return std::get<1>(ea) < std::get<1>(eb); }
+      // std::sort is not stable: without this key, entries that share a coordinate would be
+      // emitted in an unspecified order.
+      return a < b;
+    });
+
+    qc.rows.reserve(block.entries.size());
+    qc.cols.reserve(block.entries.size());
+    qc.vals.reserve(block.entries.size());
+    for (size_t idx : perm) {
+      const auto& [row, col, val] = block.entries[idx];
+      qc.rows.push_back(row);
+      qc.cols.push_back(col);
+      qc.vals.push_back(val);
+    }
+    state.problem.quadratic_constraints_.push_back(std::move(qc));
+  }
+
+  // Rebuild every per-row array without the rows that became quadratic constraints.
+  std::vector<f_t> new_A;
+  std::vector<i_t> new_A_indices;
+  std::vector<i_t> new_A_offsets;
+  std::vector<f_t> new_b;
+  std::vector<f_t> new_clb;
+  std::vector<f_t> new_cub;
+  std::vector<std::string> new_row_names;
+  std::vector<char> new_row_types;
+
+  const size_t kept_rows = original_rows - active_blocks;
+  new_A.reserve(state.problem.A_.size());
+  new_A_indices.reserve(state.problem.A_indices_.size());
+  new_A_offsets.reserve(kept_rows + 1);
+  new_b.reserve(kept_rows);
+  new_clb.reserve(kept_rows);
+  new_cub.reserve(kept_rows);
+  new_row_names.reserve(kept_rows);
+  new_row_types.reserve(kept_rows);
+  new_A_offsets.push_back(0);
+
+  for (size_t row = 0; row < original_rows; ++row) {
+    if (quadratic_rows[row]) { continue; }
+    size_t begin = (size_t)state.problem.A_offsets_[row];
+    size_t end   = (size_t)state.problem.A_offsets_[row + 1];
+    new_A.insert(new_A.end(), state.problem.A_.begin() + begin, state.problem.A_.begin() + end);
+    new_A_indices.insert(new_A_indices.end(),
+                         state.problem.A_indices_.begin() + begin,
+                         state.problem.A_indices_.begin() + end);
+    new_A_offsets.push_back((i_t)new_A.size());
+    new_b.push_back(state.problem.b_[row]);
+    new_clb.push_back(state.problem.constraint_lower_bounds_[row]);
+    new_cub.push_back(state.problem.constraint_upper_bounds_[row]);
+    // Names of kept rows are moved; names of quadratic rows were copied into their qc above.
+    new_row_names.push_back(std::move(state.problem.row_names_[row]));
+    new_row_types.push_back(state.problem.row_types_[row]);
+  }
+
+  state.problem.A_                       = std::move(new_A);
+  state.problem.A_indices_               = std::move(new_A_indices);
+  state.problem.A_offsets_               = std::move(new_A_offsets);
+  state.problem.b_                       = std::move(new_b);
+  state.problem.constraint_lower_bounds_ = std::move(new_clb);
+  state.problem.constraint_upper_bounds_ = std::move(new_cub);
+  state.problem.row_names_               = std::move(new_row_names);
+  state.problem.row_types_               = std::move(new_row_types);
+  state.problem.n_constraints_           = (i_t)state.problem.b_.size();
+  state.problem.nnz_                     = (i_t)state.problem.A_.size();
+}
+
+// Copies the parser's non-owning names into owning std::string storage on the problem:
+// problem/objective name, then row names, then variable names. With dense-ordered columns the
+// variable names are produced by state.col_dense.format_name(); otherwise they are copied from
+// state.var_names_sv. Lists of at least one million names are filled with an OpenMP parallel
+// loop when more than one thread is available.
+template <typename i_t, typename f_t>
+static void materialize_problem_names(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("materialize_problem_names");
+  const int num_threads             = phase_thread_count(MPS_NAMES_THREAD_CAP);
+  const size_t parallel_name_cutoff = 1'000'000;
+
+  // Copy string_views to actual strings (this is where allocation happens)
+  {
+    scoped_timer_t scalar_timer("materialize_problem_scalar_names");
+    state.problem.problem_name_   = std::string(state.problem_name_sv);
+    state.problem.objective_name_ = std::string(state.objective_name_sv);
+  }
+
+  {
+    scoped_timer_t row_timer("materialize_problem_row_names");
+    const size_t row_count = state.row_names_sv.size();
+    state.problem.row_names_.resize(row_count);
+    // Row names are usually short enough for SSO, so parallel assigns mostly avoid the heap and
+    // can help a lot. Ideally we would allocate an arena and store non-owning string views, but
+    // that would require refactoring the problem representation.
+    if (row_count >= parallel_name_cutoff && num_threads > 1) {
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (size_t row = 0; row < row_count; ++row) {
+        state.problem.row_names_[row].assign(state.row_names_sv[row]);
+      }
+    } else {
+      for (size_t row = 0; row < row_count; ++row) {
+        state.problem.row_names_[row].assign(state.row_names_sv[row]);
+      }
+    }
+  }
+
+  {
+    scoped_timer_t var_timer("materialize_problem_var_names");
+    const bool dense_columns = state.col_index_mode == index_mode_t::dense_ordered;
+    const size_t var_count =
+      dense_columns ? (size_t)state.problem.n_vars_ : state.var_names_sv.size();
+    state.problem.var_names_.resize(var_count);
+    const bool run_parallel = var_count >= parallel_name_cutoff && num_threads > 1;
+
+    if (dense_columns) {
+      if (run_parallel) {
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+        for (size_t var = 0; var < var_count; ++var) {
+          state.col_dense.format_name(var, state.problem.var_names_[var]);
+        }
+      } else {
+        for (size_t var = 0; var < var_count; ++var) {
+          state.col_dense.format_name(var, state.problem.var_names_[var]);
+        }
+      }
+    } else if (run_parallel) {
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (size_t var = 0; var < var_count; ++var) {
+        state.problem.var_names_[var].assign(state.var_names_sv[var]);
+      }
+    } else {
+      for (size_t var = 0; var < var_count; ++var) {
+        state.problem.var_names_[var].assign(state.var_names_sv[var]);
+      }
+    }
+  }
+}
+
+// Appends the variables that were mentioned only in BOUNDS (never in COLUMNS) to the problem.
+// Each one gets its recorded name, type and bounds plus a zero objective coefficient, and
+// n_vars_ is then set to the new variable count. Such variables have no matrix entries, so they
+// are appended after the COLUMNS variables and A_ is not touched.
+//
+// build_quadratic_csr() sizes Q_objective_offsets_ to (n_vars + 1) for the COLUMNS-only variable
+// count. When that array is populated it is padded here with empty rows so that it still has
+// n_vars_ + 1 entries once the extra variables exist.
+template <typename i_t, typename f_t>
+static void append_bounds_only_variables(parse_state_t<i_t, f_t>& state)
+{
+  if (state.bounds_only_vars.empty()) { return; }
+  scoped_timer_t timer("append_bounds_only_variables");
+
+  auto& problem = state.problem;
+  for (const auto& [name, aux] : state.bounds_only_vars) {
+    problem.var_names_.emplace_back(name);
+    problem.var_types_.push_back(aux.type);
+    problem.c_.push_back(f_t{0});
+    problem.variable_lower_bounds_.push_back(aux.lb);
+    problem.variable_upper_bounds_.push_back(aux.ub);
+  }
+  problem.n_vars_ = (i_t)problem.var_names_.size();
+
+  // Keep the quadratic-objective row offsets consistent with the enlarged variable count: every
+  // appended variable is an empty row, so the last offset is simply repeated.
+  auto& q_offsets              = problem.Q_objective_offsets_;
+  const size_t wanted_offsets  = (size_t)problem.n_vars_ + 1;
+  if (!q_offsets.empty() && q_offsets.size() < wanted_offsets) {
+    const auto last_offset = q_offsets.back();  // copied first: resize may reallocate
+    q_offsets.resize(wanted_offsets, last_offset);
+  }
+}
+
+// Resets the scalar fields of `problem` (counts to 0, minimisation, scaling factor 1, offset 0)
+// and reserves capacity in its per-row / per-variable vectors.
+//
+// The reserved element count is max(1000, max(reserve_hint, 1 MiB) / 1000); it is returned so
+// the caller can size related containers the same way.
+template <typename i_t, typename f_t>
+static std::size_t init_problem_storage(mps_data_model_t<i_t, f_t>& problem,
+                                        std::size_t reserve_hint)
+{
+  problem.n_vars_                   = 0;
+  problem.n_constraints_            = 0;
+  problem.nnz_                      = 0;
+  problem.maximize_                 = false;
+  problem.objective_scaling_factor_ = f_t{1};
+  problem.objective_offset_         = f_t{0};
+
+  const std::size_t hinted_bytes = std::max<std::size_t>(reserve_hint, 1 * MiB);
+  const std::size_t reserve_dim  = std::max((size_t)1000, hinted_bytes / 1000);
+
+  // Reserves `reserve_dim` elements in each container, left to right.
+  const auto reserve_each = [reserve_dim](auto&... containers) {
+    (containers.reserve(reserve_dim), ...);
+  };
+  reserve_each(problem.A_offsets_,
+               problem.b_,
+               problem.variable_lower_bounds_,
+               problem.variable_upper_bounds_,
+               problem.var_types_,
+               problem.row_types_,
+               problem.row_names_,
+               problem.var_names_,
+               problem.constraint_lower_bounds_,
+               problem.constraint_upper_bounds_);
+  return reserve_dim;
+}
+
+// Contract every input stream fed to parse_mps_fast_stream must satisfy.
+template <typename Stream>
+concept InputStream = requires(Stream candidate)
+{
+  // Byte access: read-only and writable pointers, both convertible to char pointers.
+  {candidate.data()}->std::convertible_to<const char*>;
+  {candidate.mutable_data()}->std::convertible_to<char*>;
+  // Size queries, all convertible to std::size_t.
+  {candidate.size()}->std::convertible_to<std::size_t>;
+  {candidate.compressed_size()}->std::convertible_to<std::size_t>;
+  {candidate.reserve_size_hint()}->std::convertible_to<std::size_t>;
+  // Phase registry (returned by reference) and the by-value view handed to the parser.
+  {candidate.registry()}->std::same_as<mps_phase_registry_t&>;
+  {candidate.view()}->std::same_as<input_stream_view_t>;
+  // Producer entry point; returns nothing.
+  {candidate.run_decode_tasks()}->std::same_as<void>;
+};
+
+// Parses one MPS input exposed through `stream` into an mps_data_model_t.
+//
+// A single OpenMP parallel region hosts: one producer task that calls stream.run_decode_tasks(),
+// seven detached "<phase>_ready" tasks attached to the phase registry, one parse task per phase
+// (header, rows, columns, rhs, ranges, bounds, quadratic) and two materialization tasks (names,
+// CSR). Their ordering is expressed through depend() clauses. An exception raised inside a task
+// is captured in `parser_tasks` and rethrown after the region ends. QCMATRIX rows and
+// BOUNDS-only variables are then folded into the model and the ENDATA marker is checked.
+//
+// total_timer_name:   label of the timer that covers the whole call.
+// producer_task_name: NVTX range label used for the producer task.
+template <InputStream Stream, typename i_t, typename f_t>
+static mps_data_model_t<i_t, f_t> parse_mps_fast_stream(Stream& stream,
+                                                        const char* total_timer_name,
+                                                        const char* producer_task_name)
+{
+  omp_max_active_levels_guard_t omp_active_levels(2);
+
+  input_stream_view_t input = stream.view();
+  // Held by pointer so the timer can be stopped explicitly before the final flush_timers().
+  auto total_timer          = std::make_unique<scoped_timer_t>(total_timer_name);
+  mps_data_model_t<i_t, f_t> problem;
+  std::size_t reserve_dim = init_problem_storage(problem, stream.reserve_size_hint());
+
+  cursor_t cursor(input.data, 0);
+  parse_state_t<i_t, f_t> state(problem, cursor);
+  state.row_names_sv.reserve(reserve_dim);
+
+  auto phase_end = [](const char*) { flush_timers(); };
+
+  parallel_error_latch_t parser_tasks;
+
+  // Runs one parse task body unless the latch is already stopped, and captures any exception
+  // so it does not escape the OpenMP task.
+  auto run_parser_task = [&](auto&& fn) {
+    if (parser_tasks.stopped()) { return; }
+    try {
+      fn();
+    } catch (...) {
+      parser_tasks.capture(std::current_exception());
+    }
+  };
+
+  // Publishes an empty, not-present range for every phase. Called after the producer fails.
+  auto unblock_phase_waiters_after_error = [&]() {
+    mps_phase_range_t empty{input.data, input.data, false};
+    input.registry->publish(mps_phase_kind::header, empty);
+    input.registry->publish(mps_phase_kind::rows, empty);
+    input.registry->publish(mps_phase_kind::columns, empty);
+    input.registry->publish(mps_phase_kind::rhs, empty);
+    input.registry->publish(mps_phase_kind::bounds, empty);
+    input.registry->publish(mps_phase_kind::ranges, empty);
+    input.registry->publish(mps_phase_kind::quadratic, empty);
+  };
+
+  // These ints carry no data; they exist only as OpenMP task-dependency tokens. A task's
+  // depend(out: X) "produces" X and depend(in: X) waits on it, so the phase ordering in the
+  // task graph below (e.g. bounds after columns_done, because bounds reference variable names)
+  // is expressed purely through which tokens each task depends on.
+  int header_ready = 0, rows_ready = 0, columns_ready = 0;
+  int rhs_ready = 0, bounds_ready = 0, ranges_ready = 0, quadratic_ready = 0;
+  int header_done = 0, rows_done = 0, columns_done = 0;
+  int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0;
+  int csr_done = 0;
+
+  const std::size_t parser_size = std::max(stream.reserve_size_hint(), input.compressed_size);
+  const int parser_threads      = parser_thread_cap_for_size(parser_size);
+
+#pragma omp parallel num_threads(parser_threads)
+  {
+    std::string thread_name = "omp-parser-" + std::to_string(omp_get_thread_num());
+    nvtx::name_current_thread(thread_name.c_str());
+
+#pragma omp single
+    {
+      // Bridge between the producer and the parse tasks: each detached task below blocks
+      // until run_decode_tasks() publishes that phase's byte range into the registry, then
+      // completes its event and fulfills depend(out: <phase>_ready) -- releasing the matching
+      // parse task. This is what lets ROWS parsing start the instant the ROWS bytes are
+      // decoded, overlapping with the decode of later sections.
+      omp_event_handle_t ev_header;
+#pragma omp task detach(ev_header) depend(out : header_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::header, ev_header);
+      }
+      omp_event_handle_t ev_rows;
+#pragma omp task detach(ev_rows) depend(out : rows_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::rows, ev_rows);
+      }
+      omp_event_handle_t ev_columns;
+#pragma omp task detach(ev_columns) depend(out : columns_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::columns, ev_columns);
+      }
+      omp_event_handle_t ev_rhs;
+#pragma omp task detach(ev_rhs) depend(out : rhs_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::rhs, ev_rhs);
+      }
+      omp_event_handle_t ev_bounds;
+#pragma omp task detach(ev_bounds) depend(out : bounds_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::bounds, ev_bounds);
+      }
+      omp_event_handle_t ev_ranges;
+#pragma omp task detach(ev_ranges) depend(out : ranges_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::ranges, ev_ranges);
+      }
+      omp_event_handle_t ev_quadratic;
+#pragma omp task detach(ev_quadratic) depend(out : quadratic_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::quadratic, ev_quadratic);
+      }
+
+      // We intentionally keep LZ4/raw input as a stable full-buffer producer here. The
+      // progressive decoded-page lifetime prototype saved RSS, but made COLUMNS/merge slower
+      // and really wants a separate memory-limited parser pipeline instead of this fast path.
+#pragma omp task
+      {
+        MPS_NVTX_RANGE(producer_task_name, nvtx::colors::io);
+        try {
+          stream.run_decode_tasks();
+        } catch (...) {
+          // Record the error before unblocking, so released parse tasks see the stopped latch.
+          parser_tasks.capture(std::current_exception());
+          unblock_phase_waiters_after_error();
+        }
+      }
+
+      // The header, rows and columns tasks share state.cursor; their depend() chain runs them
+      // one after another.
+#pragma omp task depend(in : header_ready) depend(out : header_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_header", nvtx::colors::generic);
+          parse_header_range(state, input.registry->range(mps_phase_kind::header));
+          phase_end("header");
+        });
+      }
+
+#pragma omp task depend(in : rows_ready, header_done) depend(out : rows_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_rows", nvtx::colors::rows);
+          parse_rows_range(state, input.registry->range(mps_phase_kind::rows));
+          phase_end("rows");
+        });
+      }
+
+#pragma omp task depend(in : rows_done, columns_ready) depend(out : columns_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_columns", nvtx::colors::columns);
+          parse_columns_range(state, input.registry->range(mps_phase_kind::columns));
+          phase_end("columns");
+        });
+      }
+
+      // Every task below waits on columns_done, directly or (for ranges) through rhs_done.
+#pragma omp task depend(in : columns_done) depend(out : names_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_materialize_names", nvtx::colors::names);
+          scoped_timer_t timer("materialize_problem_names_task");
+          materialize_problem_names(state);
+        });
+      }
+
+#pragma omp task depend(in : columns_done) depend(out : csr_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_materialize_csr", nvtx::colors::alloc);
+          materialize_problem_csr(state);
+        });
+      }
+
+#pragma omp task depend(in : rhs_ready, columns_done) depend(out : rhs_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_rhs", nvtx::colors::rhs);
+          parse_rhs_range(state, input.registry->range(mps_phase_kind::rhs));
+          phase_end("rhs");
+        });
+      }
+
+      // Ranges run after RHS: constraint bounds are derived from the right-hand sides.
+#pragma omp task depend(in : ranges_ready, rhs_done) depend(out : ranges_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_ranges", nvtx::colors::ranges);
+          parse_ranges_range(state, input.registry->range(mps_phase_kind::ranges));
+          phase_end("ranges");
+        });
+      }
+
+#pragma omp task depend(in : bounds_ready, columns_done) depend(out : bounds_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_bounds", nvtx::colors::bounds);
+          parse_bounds_range(state, input.registry->range(mps_phase_kind::bounds));
+          phase_end("bounds");
+        });
+      }
+
+#pragma omp task depend(in : quadratic_ready, columns_done) depend(out : quadratic_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_quadratic", nvtx::colors::generic);
+          parse_quadratic_range(state, input.registry->range(mps_phase_kind::quadratic));
+          phase_end("quadratic");
+        });
+      }
+    }
+  }
+
+  // All tasks have finished once the parallel region ends; surface any captured failure.
+  parser_tasks.rethrow_if_error();
+
+  // Serial post-processing of the assembled model.
+  finalize_qcmatrix_constraints(state);
+  append_bounds_only_variables(state);
+
+  // ENDATA validation against the final input size reported by the stream.
+  input.size = stream.size();
+  cursor.end = input.data + input.size;
+  if (!input.registry->endata_ready()) {
+    cursor.ptr = input.data + input.size;
+    cursor.error("input ended before ENDATA boundary was resolved");
+  }
+  if (input.registry->endata_present()) {
+    cursor.ptr = input.registry->endata_begin();
+    expect(cursor, "ENDATA");
+  }
+
+  total_timer.reset();
+  flush_timers();
+  return problem;
+}
+
+// Result of read_compressed_mps_file(): the loaded bytes plus the two sizes handed to
+// memory_input_stream_t. Member order matters for aggregate initialisation.
+struct padded_memory_input_t {
+  std::vector<char> buffer;        // loaded file contents
+  std::size_t input_size{0};       // byte count recorded for `buffer` by the loader
+  std::size_t compressed_size{0};  // get_file_size() of the source file
+};
+
+// Loads `path` through file_to_string() and returns the bytes together with two sizes:
+// input_size (buffer length minus its final byte) and compressed_size (get_file_size(path)).
+// An empty result is replaced by a single '\0', which makes input_size 0. The buffer goes
+// through ensure_input_buffer_padding() before it is returned.
+static padded_memory_input_t read_compressed_mps_file(const std::string& path)
+{
+  std::vector<char> bytes = file_to_string(path);
+  if (bytes.empty()) { bytes.push_back('\0'); }
+
+  std::size_t payload_size = bytes.size() - 1;
+  ensure_input_buffer_padding(bytes, payload_size);
+
+  padded_memory_input_t result;
+  result.buffer          = std::move(bytes);
+  result.input_size      = payload_size;
+  result.compressed_size = get_file_size(path);
+  return result;
+}
+
+// Entry point of the fast MPS parser: resolves the effective read method for `path` and parses
+// the file through the matching input stream.
+//   Read          -> raw_input_stream_t
+//   Lz4           -> lz4_input_stream_t
+//   Gzip / Bzip2  -> contents loaded by read_compressed_mps_file(), then memory_input_stream_t
+template <typename i_t, typename f_t>
+mps_data_model_t<i_t, f_t> parse_mps_fast_file(const std::string& path, FileReadMethod read_method)
+{
+  const FileReadMethod method = effective_file_read_method(path, read_method);
+  switch (method) {
+    case FileReadMethod::Read: {
+      raw_input_stream_t stream(path);
+      return parse_mps_fast_stream<raw_input_stream_t, i_t, f_t>(
+        stream, "parse_mps_fast_file_raw (total)", "task_raw_read");
+    }
+    case FileReadMethod::Lz4: {
+      lz4_input_stream_t stream(path);
+      return parse_mps_fast_stream<lz4_input_stream_t, i_t, f_t>(
+        stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode");
+    }
+    case FileReadMethod::Gzip:
+    case FileReadMethod::Bzip2: {
+      padded_memory_input_t loaded = read_compressed_mps_file(path);
+      memory_input_stream_t stream(
+        std::move(loaded.buffer), loaded.input_size, loaded.compressed_size);
+      const bool is_gzip = method == FileReadMethod::Gzip;
+      return parse_mps_fast_stream<memory_input_stream_t, i_t, f_t>(
+        stream,
+        is_gzip ? "parse_mps_fast_file_gzip (total)" : "parse_mps_fast_file_bzip2 (total)",
+        "task_memory_scan");
+    }
+  }
+  // Every case above returns.
+  __builtin_unreachable();
+}
+
+// Explicit instantiations of parse_mps_fast_file for the index types int / int64_t combined
+// with the floating-point types float / double.
+template mps_data_model_t<int, float> parse_mps_fast_file(const std::string& path,
+                                                          FileReadMethod read_method);
+template mps_data_model_t<int, double> parse_mps_fast_file(const std::string& path,
+                                                           FileReadMethod read_method);
+template mps_data_model_t<int64_t, float> parse_mps_fast_file(const std::string& path,
+                                                              FileReadMethod read_method);
+template mps_data_model_t<int64_t, double> parse_mps_fast_file(const std::string& path,
+                                                               FileReadMethod read_method);
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
new file mode 100644
index 0000000000..6047a55f05
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
@@ -0,0 +1,22 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "file_reader.hpp"
+
+#include <cuopt/linear_programming/io/mps_data_model.hpp>
+
+#include <cstddef>
+#include <string>
+
+namespace cuopt::linear_programming::io::detail {
+
+template <typename i_t, typename f_t>
+using parser_model_t = mps_data_model_t<i_t, f_t>;
+
+template <typename i_t, typename f_t>
+parser_model_t<i_t, f_t> parse_mps_fast_file(const std::string& path,
+                                             FileReadMethod read_method = FileReadMethod::Read);
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
new file mode 100644
index 0000000000..78e4219e06
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -0,0 +1,371 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "file_reader.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <utilities/error.hpp>
+#include <utilities/scope_guard.hpp>
+
+#include <cuda/cmath>
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cctype>
+#include <cerrno>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace cuopt::linear_programming::io::detail {
+
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_fail;
+
+namespace {
+
+constexpr std::size_t raw_input_window_bytes              = 64ull * 1024ull * 1024ull;
+constexpr std::size_t raw_input_max_read_threads          = 8;
+constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull;
+constexpr long nfs_super_magic                            = 0x6969;
+
+// Case-insensitive test that `path` ends with `suffix`; `suffix` must be given in lower case.
+bool path_has_suffix(const std::string& path, const char* suffix) noexcept
+{
+  const std::size_t n = std::strlen(suffix);
+  if (n > path.size()) { return false; }
+  const char* tail = path.data() + (path.size() - n);
+  return std::equal(tail, tail + n, suffix, [](char have, char want) {
+    return std::tolower((unsigned char)have) == want;
+  });
+}
+
+std::size_t add_input_padding(std::size_t size)
+{
+  if (std::numeric_limits<std::size_t>::max() - size < input_buffer_padding_bytes) {
+    mps_parser_fail(error_type_t::OutOfMemoryError, "input padding size overflow");  // would wrap
+  }
+  return input_buffer_padding_bytes + size;
+}
+
+bool is_nfs_backed_path(const std::string& path) noexcept
+{
+  struct statfs info;  // only inspected when statfs() succeeds
+  return ::statfs(path.c_str(), &info) != 0 ? false : info.f_type == nfs_super_magic;
+}
+
+}  // namespace
+
+void ensure_input_buffer_padding(std::vector<char>& buffer, std::size_t input_size)
+{
+  if (buffer.size() < input_size) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "input_size %zu exceeds buffer size %zu",
+                    input_size,
+                    buffer.size());
+  }
+  const std::size_t padded_size = add_input_padding(input_size);
+  if (padded_size > buffer.size()) { buffer.resize(padded_size, '\0'); }  // grows, never shrinks
+}
+
+std::size_t get_file_size(int fd, const std::string& path)
+{
+  struct stat file_info;
+  const int rc = ::fstat(fd, &file_info);
+  if (rc != 0) {
+    const char* reason = std::strerror(errno);
+    mps_parser_fail(
+      error_type_t::RuntimeError, "Failed to stat file '%s': %s", path.c_str(), reason);
+  }
+  if (file_info.st_size < 0) {
+    mps_parser_fail(error_type_t::RuntimeError, "Negative file size for '%s'", path.c_str());
+  }
+  return static_cast<std::size_t>(file_info.st_size);
+}
+
+std::size_t get_file_size(const std::string& path)
+{
+  int fd = ::open(path.c_str(), O_RDONLY);
+  if (fd < 0) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+  // The guard is the sole owner of the descriptor and closes it exactly once, on the normal and
+  // on the error path alike. Do not add an explicit ::close() as well: closing the same number
+  // twice can hit an unrelated descriptor that another thread opened in between.
+  cuopt::scope_guard close_fd([&] {
+    if (fd >= 0) { ::close(fd); }
+  });
+  return get_file_size(fd, path);
+}
+
+std::size_t system_page_size()
+{
+  static const std::size_t cached = [] {  // evaluated once, on first call
+    const long reported = ::sysconf(_SC_PAGESIZE);
+    return reported > 0 ? static_cast<std::size_t>(reported) : std::size_t{4096};
+  }();
+  return cached;
+}
+
+// Reads exactly `bytes` bytes starting at file offset `offset` into `dst`.
+bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset)
+{
+  // A single pread() can transfer at most SSIZE_MAX bytes, so cap every request.
+  constexpr std::size_t max_request = (std::size_t)std::numeric_limits<ssize_t>::max();
+  for (std::size_t filled = 0; filled < bytes;) {
+    const std::size_t request = std::min(bytes - filled, max_request);
+    const ssize_t rc          = ::pread(fd, dst + filled, request, (off_t)(offset + filled));
+    if (rc > 0) {
+      filled += (std::size_t)rc;
+    } else if (rc == 0) {
+      errno = EIO;  // hit end-of-file before `bytes` were read
+      return false;
+    } else if (errno != EINTR) {
+      return false;  // genuine I/O error; errno is left as set by pread()
+    }
+    // rc < 0 with EINTR: interrupted by a signal, retry the same range
+  }
+  return true;
+}
+
+raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
+{
+  MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io);
+  int buffered_fd = ::open(path.c_str(), O_RDONLY);
+  cuopt::scope_guard close_buffered([&] {
+    if (buffered_fd >= 0) { ::close(buffered_fd); }
+  });
+  if (buffered_fd < 0) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open raw MPS file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+
+  int direct_fd = -1;  // stays -1 unless O_DIRECT is selected below
+  cuopt::scope_guard close_direct([&] {
+    if (direct_fd >= 0) { ::close(direct_fd); }
+  });
+
+  file_size_                   = get_file_size(buffered_fd, path);
+  int read_fd                  = buffered_fd;
+  bool large_enough_for_direct = file_size_ > raw_input_direct_io_threshold_bytes;
+  bool nfs_backed              = is_nfs_backed_path(path);
+  // Buffered reads are consistently faster than O_DIRECT on our NFS mounts;
+  // keep direct I/O for large local files where it wins.
+  if (large_enough_for_direct && !nfs_backed) {
+#ifdef O_DIRECT
+    direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT);
+    if (direct_fd >= 0) {  // a failed O_DIRECT open is not an error: reads stay buffered
+      read_fd    = direct_fd;
+      direct_io_ = true;
+    }
+#endif
+  }
+  window_bytes_ = raw_input_window_bytes;
+  window_count_ = std::max<std::size_t>(1, (file_size_ + window_bytes_ - 1) / window_bytes_);
+#ifdef MPS_FAST_TIMERS
+  read_window_ms_.assign(window_count_, 0);
+#endif
+  // Map at least one page: file size plus tail padding, rounded up to the page size.
+  output_mapped_size_ =
+    cuda::round_up(std::max<std::size_t>(add_input_padding(file_size_), 1), system_page_size());
+  output_region_ = mmap_region_t::anonymous(
+    output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer");
+  output_data_ = output_region_.char_data();
+  output_region_.advise(MADV_HUGEPAGE);
+
+  section_scanner_ =
+    std::make_unique<mps_section_block_scanner_t>(output_data_, window_count_, registry_);
+  // Hand the descriptors over to the members; resetting the locals to -1 disarms the guards.
+  buffered_fd_ = buffered_fd;
+  buffered_fd  = -1;
+  fd_          = read_fd;
+  if (read_fd == direct_fd) { direct_fd = -1; }
+}
+
+raw_input_stream_t::~raw_input_stream_t()
+{
+  if (buffered_fd_ >= 0 && buffered_fd_ != fd_) { ::close(buffered_fd_); }  // O_DIRECT case only
+  if (fd_ >= 0) { ::close(fd_); }
+}
+
+const char* raw_input_stream_t::data() const noexcept { return output_data_; }
+char* raw_input_stream_t::mutable_data() noexcept { return output_data_; }
+std::size_t raw_input_stream_t::size() const noexcept { return output_view_size_; }
+std::size_t raw_input_stream_t::compressed_size() const noexcept { return file_size_; }
+std::size_t raw_input_stream_t::reserve_size_hint() const noexcept { return file_size_; }
+
+void raw_input_stream_t::read_window_payload(std::size_t offset, std::size_t size)
+{
+  char* const window = output_data_ + offset;
+  bool loaded        = pread_full(fd_, window, size, offset);
+  if (!loaded && direct_io_ && errno == EINVAL && buffered_fd_ >= 0) {
+    // An unaligned O_DIRECT request is rejected with EINVAL; redo this one window through the
+    // buffered descriptor instead.
+    loaded = pread_full(buffered_fd_, window, size, offset);
+  }
+  if (loaded) { return; }
+  const char* reason = std::strerror(errno);
+  mps_parser_fail(
+    error_type_t::RuntimeError, "Failed to pread raw MPS file '%s': %s", path_.c_str(), reason);
+}
+
+void raw_input_stream_t::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("raw_input_run_read_tasks", nvtx::colors::io);
+  if (file_size_ == 0) {  // empty file: there is no window to read
+    output_view_size_ = 0;
+    section_scanner_->publish_ready(0);
+    return;
+  }
+  // Reader count: capped by raw_input_max_read_threads, the hardware, and the window count.
+  std::size_t hw_threads =
+    std::max<std::size_t>(1, (std::size_t)std::thread::hardware_concurrency());
+  std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads);
+  thread_count             = std::max<std::size_t>(1, std::min(thread_count, window_count_));
+
+  // Each window is read independently and handed to the scanner, which owns the
+  // contiguous decoded-byte frontier and the parallel section publication.
+  parallel_error_latch_t latch;
+#ifdef MPS_FAST_TIMERS
+  auto read_wall_start = std::chrono::steady_clock::now();
+#endif
+  parallel_for_indexed(
+    window_count_, thread_count, latch, "raw-input-read-", [&](std::size_t index) {
+      MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io);
+      std::size_t offset = index * window_bytes_;
+      std::size_t size   = std::min(window_bytes_, file_size_ - offset);  // short at EOF
+      {
+        MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io);
+#ifdef MPS_FAST_TIMERS
+        auto start = std::chrono::steady_clock::now();
+#endif
+        read_window_payload(offset, size);
+#ifdef MPS_FAST_TIMERS
+        auto end     = std::chrono::steady_clock::now();
+        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+        read_window_ms_[index] =
+          (uint32_t)std::min<long long>(elapsed.count(), std::numeric_limits<uint32_t>::max());
+#endif
+      }
+      MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io);
+      section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size);
+    });
+#ifdef MPS_FAST_TIMERS
+  auto read_wall_end = std::chrono::steady_clock::now();
+#endif
+  latch.rethrow_if_error();  // rethrows the first captured worker exception, if any
+
+#ifdef MPS_FAST_TIMERS
+  if (!read_window_ms_.empty()) {
+    std::vector<uint32_t> sorted = read_window_ms_;
+    std::sort(sorted.begin(), sorted.end());  // ascending, for the percentile picks below
+    auto percentile = [&](double pct) {
+      std::size_t idx = (std::size_t)std::min<double>((double)(sorted.size() - 1),
+                                                      pct * (double)(sorted.size() - 1));
+      return sorted[idx];
+    };
+    uint64_t total_ms = 0;
+    for (uint32_t value : read_window_ms_) {
+      total_ms += value;
+    }
+    std::fprintf(
+      stderr,
+      "[RAW_READ_LATENCY] windows=%zu wall_ms=%lld total_window_ms=%llu avg_ms=%.3f min_ms=%u "
+      "p50_ms=%u p90_ms=%u p99_ms=%u max_ms=%u\n",
+      read_window_ms_.size(),
+      (long long)std::chrono::duration_cast<std::chrono::milliseconds>(read_wall_end -
+                                                                       read_wall_start)
+        .count(),
+      (unsigned long long)total_ms,
+      (double)total_ms / (double)read_window_ms_.size(),
+      sorted.front(),
+      percentile(0.50),
+      percentile(0.90),
+      percentile(0.99),
+      sorted.back());
+  }
+#endif
+
+  output_view_size_ = section_scanner_->ready_bytes();
+  section_scanner_->publish_ready(output_view_size_);
+}
+
+memory_input_stream_t::memory_input_stream_t(std::vector<char> buffer,
+                                             std::size_t input_size,
+                                             std::size_t compressed_size)
+  : buffer_(std::move(buffer)), input_size_(input_size), compressed_size_(compressed_size)
+{
+  ensure_input_buffer_padding(buffer_, input_size_);  // may resize, so take data() afterwards
+  section_scanner_ = std::make_unique<mps_section_block_scanner_t>(buffer_.data(), 1, registry_);
+}
+
+const char* memory_input_stream_t::data() const noexcept { return buffer_.data(); }
+char* memory_input_stream_t::mutable_data() noexcept { return buffer_.data(); }
+std::size_t memory_input_stream_t::size() const noexcept { return input_size_; }
+std::size_t memory_input_stream_t::compressed_size() const noexcept { return compressed_size_; }
+std::size_t memory_input_stream_t::reserve_size_hint() const noexcept { return input_size_; }
+
+void memory_input_stream_t::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("memory_input_scan", nvtx::colors::io);
+  // Single block: observe_block advances the frontier and publishes.
+  section_scanner_->observe_block(0, buffer_.data(), buffer_.data() + input_size_);
+}
+
+bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); }
+bool has_gzip_extension(const std::string& path) noexcept { return path_has_suffix(path, ".gz"); }
+bool has_bzip2_extension(const std::string& path) noexcept { return path_has_suffix(path, ".bz2"); }
+
+void drop_file_cache(const std::string& path)
+{
+  MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io);
+  int fd = ::open(path.c_str(), O_RDONLY);
+  if (fd < 0) { return; }  // advisory operation: an unopenable file is silently ignored
+  ::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);  // offset 0, len 0: the whole file
+  ::close(fd);
+}
+
+FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method)
+{
+  if (has_lz4_extension(path)) { return FileReadMethod::Lz4; }  // extension overrides `method`
+  if (has_gzip_extension(path)) { return FileReadMethod::Gzip; }
+  if (has_bzip2_extension(path)) { return FileReadMethod::Bzip2; }
+  if (method == FileReadMethod::Lz4) {  // explicit Lz4 request on a path without .lz4
+    mps_parser_fail(
+      error_type_t::ValidationError, "lz4 read method requires a .lz4 input: %s", path.c_str());
+  }
+  return method;  // NOTE(review): Gzip/Bzip2 are passed through without an extension check
+}
+
+// Human-readable, lower-case label for a read method. Values matching no enumerator yield
+// "unknown".
+const char* file_read_method_name(FileReadMethod method) noexcept
+{
+  if (method == FileReadMethod::Read) { return "read"; }
+  if (method == FileReadMethod::Lz4) { return "lz4"; }
+  if (method == FileReadMethod::Gzip) { return "gzip"; }
+  if (method == FileReadMethod::Bzip2) { return "bzip2"; }
+  return "unknown";
+}
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
new file mode 100644
index 0000000000..8ca3456401
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -0,0 +1,319 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Input layer for the fast MPS parser: turns on-disk bytes (plain or .lz4) into one
+// contiguous parse buffer and publishes MPS section boundaries as data becomes available.
+//
+// Model:
+//   - Output is an anonymous mmap'd buffer (THP-hinted, tail-padded for SIMD/cursor safety).
+//     Raw inputs pread directly into fixed slots; LZ4 decodes into the same layout.
+//   - Work is split into windows (fixed spans of compressed/raw file bytes). Workers use
+//     parallel_for_indexed() — std::thread + shared-index dispatch, not OpenMP — because
+//     blocking pread()/decode does not compose cleanly with OMP team barriers.
+//   - Each completed window/block is handed to mps_section_block_scanner_t::observe_block().
+//     Blocks may finish out of order; the scanner advances a contiguous ready_bytes_
+//     frontier and publishes section ranges into mps_phase_registry_t only once the prefix
+//     up to a section title is contiguous and scannable.
+//   - The parser runs as OpenMP tasks on those published phases while run_decode_tasks()
+//     (raw parallel pread, or the LZ4 reader → metadata scanner → decoder pipeline) fills
+//     the buffer on separate threads. parallel_error_latch_t propagates the first worker
+//     failure and stops the rest.
+//
+// LZ4 adds a resident-window pool (parallel pread of compressed spans), block metadata
+// scanning with ptr_if_contiguous()/copy_to for window-boundary payloads, parallel decode
+// workers, window ref-counting/release, and lazy commit_up_to() of decoded output pages.
+
+#pragma once
+
+#include "mmap_region.hpp"
+#include "mps_section_scanner.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace cuopt::linear_programming::io::detail {
+
+inline constexpr std::size_t input_buffer_padding_bytes = 64;
+
+void ensure_input_buffer_padding(std::vector<char>& buffer, std::size_t input_size);
+
+struct lz4_pipeline_t;
+
+/**
+ * @brief File reading method selection
+ */
+enum class FileReadMethod { Read, Lz4, Gzip, Bzip2 };
+
+/**
+ * @brief Return the effective method for a path: a .lz4/.gz/.bz2 extension overrides @p method.
+ *
+ * Without such an extension @p method is returned as is, except that Lz4 fails validation.
+ */
+FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method);
+
+/**
+ * @brief Human-readable method name.
+ */
+const char* file_read_method_name(FileReadMethod method) noexcept;
+
+/**
+ * @brief True when the file name has an lz4 extension.
+ */
+bool has_lz4_extension(const std::string& path) noexcept;
+bool has_gzip_extension(const std::string& path) noexcept;
+bool has_bzip2_extension(const std::string& path) noexcept;
+
+/**
+ * @brief Ask the OS to evict clean cached pages for this file.
+ *
+ * This is advisory and affects the local client page cache only.
+ */
+void drop_file_cache(const std::string& path);
+
+/**
+ * @brief OS memory page size, queried once and cached.
+ */
+std::size_t system_page_size();
+
+/**
+ * @brief File size in bytes; fails with a parser error if it cannot be determined.
+ */
+std::size_t get_file_size(int fd, const std::string& path);
+std::size_t get_file_size(const std::string& path);
+
+/**
+ * @brief Read exactly @p bytes at @p offset into @p dst, retrying on EINTR.
+ *
+ * Returns false and leaves errno set on error or unexpected EOF.
+ */
+bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset);
+
+// First-error-wins latch shared by the parallel reader/decoder pipelines. The
+// first captured exception is retained and a stop flag is raised so cooperating
+// workers can unwind promptly. The retained exception is rethrown by the
+// orchestrating thread once all workers have joined.
+class parallel_error_latch_t {
+ public:
+  void capture(std::exception_ptr eptr)
+  {
+    std::lock_guard<std::mutex> guard(mutex_);
+    if (first_error_ != nullptr) { return; }  // a previous failure already won
+    first_error_ = std::move(eptr);
+    stopped_.store(true, std::memory_order_release);
+  }
+
+  bool stopped() const noexcept { return stopped_.load(std::memory_order_acquire); }
+
+  // Not synchronized: call only after every worker has been joined.
+  void rethrow_if_error() const
+  {
+    if (first_error_ != nullptr) { std::rethrow_exception(first_error_); }
+  }
+
+ private:
+  std::mutex mutex_;
+  std::exception_ptr first_error_ = nullptr;
+  std::atomic_bool stopped_{false};
+};
+
+class scoped_thread_group {  // joins every owned thread on destruction
+ public:
+  void reserve(std::size_t count) { threads_.reserve(count); }
+
+  template <typename Callable>
+  void emplace(Callable&& callable)
+  {
+    threads_.emplace_back(std::forward<Callable>(callable));
+  }
+
+  ~scoped_thread_group()
+  {
+    for (std::thread& worker : threads_) {
+      if (worker.joinable()) { worker.join(); }
+    }
+  }
+
+ private:
+  std::vector<std::thread> threads_;
+};
+
+// Parallel loop over [0, count) with dynamic dispatch: each of thread_count workers (at least
+// one; 0 is treated as 1) claims the next index from a shared atomic counter and invokes
+// body(index). An exception escaping a worker is captured into the latch and stops the loop; the
+// caller is responsible for calling latch.rethrow_if_error() after this returns. Workers are
+// named "<thread_name_prefix><worker-id>" when a prefix is supplied.
+// OMP just doesn't really play well with blocking pread()
+template <typename Body>
+void parallel_for_indexed(std::size_t count,
+                          std::size_t thread_count,
+                          parallel_error_latch_t& latch,
+                          const char* thread_name_prefix,
+                          Body body)
+{
+  if (thread_count == 0) { thread_count = 1; }
+
+  std::atomic_size_t next{0};
+  scoped_thread_group workers;
+  workers.reserve(thread_count);
+  for (std::size_t t = 0; t < thread_count; ++t) {
+    workers.emplace([&, t] {
+      try {
+        if (thread_name_prefix != nullptr) {
+          std::string name = thread_name_prefix + std::to_string(t);
+          nvtx::name_current_thread(name.c_str());
+        }
+        while (!latch.stopped()) {
+          std::size_t index = next.fetch_add(1, std::memory_order_relaxed);
+          if (index >= count) { break; }
+          body(index);
+        }
+      } catch (...) {
+        // Nothing may escape a std::thread function (that would call std::terminate).
+        latch.capture(std::current_exception());
+      }
+    });
+  }
+}
+
+struct input_stream_view_t {
+  const char* data               = nullptr;
+  char* mutable_data             = nullptr;
+  std::size_t size               = 0;
+  std::size_t compressed_size    = 0;
+  mps_phase_registry_t* registry = nullptr;
+};
+
+/**
+ * @brief CRTP base supplying the registry and view() shared by every input
+ * stream. Derived classes provide data()/mutable_data()/size()/compressed_size().
+ */
+template <typename Derived>
+class input_stream_base_t {
+ public:
+  mps_phase_registry_t& registry() noexcept { return registry_; }
+
+  input_stream_view_t view() noexcept
+  {
+    auto* self = static_cast<Derived*>(this);
+    return {self->data(), self->mutable_data(), self->size(), self->compressed_size(), &registry_};
+  }
+
+ protected:
+  mps_phase_registry_t registry_;
+};
+
+// Handles lz4 compressed files (useful since lz4 is very fast, works well for MPS, and makes
+// parallel decompression trivial)
+class lz4_input_stream_t : public input_stream_base_t<lz4_input_stream_t> {
+ public:
+  explicit lz4_input_stream_t(const std::string& path);
+  ~lz4_input_stream_t();
+
+  lz4_input_stream_t(const lz4_input_stream_t&)            = delete;
+  lz4_input_stream_t& operator=(const lz4_input_stream_t&) = delete;
+
+  const char* data() const noexcept;
+  char* mutable_data() noexcept;
+  std::size_t size() const noexcept;
+  std::size_t compressed_size() const noexcept;
+  std::size_t reserve_size_hint() const noexcept;
+
+  void run_decode_tasks();
+
+ private:
+  friend struct lz4_pipeline_t;
+
+  void commit_up_to(std::size_t bytes);
+
+  std::string path_;
+  int fd_ = -1;
+  mmap_region_t output_region_;
+  std::size_t compressed_size_       = 0;
+  char* output_data_                 = nullptr;
+  std::size_t output_mapped_size_    = 0;
+  std::size_t output_view_size_      = 0;
+  std::size_t output_committed_size_ = 0;
+  std::size_t block_max_size_        = 0;
+  std::size_t content_size_          = 0;
+  std::size_t header_size_           = 0;
+  bool content_size_present_         = false;
+  bool block_checksum_               = false;
+  bool content_checksum_             = false;
+  bool dict_id_                      = false;
+  std::mutex commit_mutex_;
+  std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
+  std::size_t block_slot_count_ = 0;
+};
+
+// Reads an uncompressed file with parallel pread() windows into one padded, anonymous mmap.
+class raw_input_stream_t : public input_stream_base_t<raw_input_stream_t> {
+ public:
+  explicit raw_input_stream_t(const std::string& path);
+  ~raw_input_stream_t();
+
+  raw_input_stream_t(const raw_input_stream_t&)            = delete;
+  raw_input_stream_t& operator=(const raw_input_stream_t&) = delete;
+
+  const char* data() const noexcept;
+  char* mutable_data() noexcept;
+  std::size_t size() const noexcept;
+  std::size_t compressed_size() const noexcept;
+  std::size_t reserve_size_hint() const noexcept;
+
+  void run_decode_tasks();
+
+ private:
+  void read_window_payload(std::size_t offset, std::size_t size);
+
+  std::string path_;
+  int fd_          = -1;
+  int buffered_fd_ = -1;
+  bool direct_io_  = false;
+  mmap_region_t output_region_;
+  char* output_data_              = nullptr;
+  std::size_t output_mapped_size_ = 0;
+  std::size_t output_view_size_   = 0;
+  std::size_t file_size_          = 0;
+  std::size_t window_bytes_       = 0;
+  std::size_t window_count_       = 0;
+#ifdef MPS_FAST_TIMERS
+  std::vector<uint32_t> read_window_ms_;
+#endif
+  std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
+};
+
+// Wraps an in-memory buffer (e.g. decompressed gzip/bzip2 data) and scans it as one block.
+class memory_input_stream_t : public input_stream_base_t<memory_input_stream_t> {
+ public:
+  memory_input_stream_t(std::vector<char> buffer,
+                        std::size_t input_size,
+                        std::size_t compressed_size);
+
+  memory_input_stream_t(const memory_input_stream_t&)            = delete;
+  memory_input_stream_t& operator=(const memory_input_stream_t&) = delete;
+
+  const char* data() const noexcept;
+  char* mutable_data() noexcept;
+  std::size_t size() const noexcept;
+  std::size_t compressed_size() const noexcept;
+  std::size_t reserve_size_hint() const noexcept;
+
+  void run_decode_tasks();
+
+ private:
+  std::vector<char> buffer_;
+  std::size_t input_size_      = 0;
+  std::size_t compressed_size_ = 0;
+  std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
+};
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
new file mode 100644
index 0000000000..b7138fedb6
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
@@ -0,0 +1,304 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "mmap_region.hpp"
+
+#include <cuda/cmath>
+
+#include <simde/x86/avx2.h>
+
+#include <sys/mman.h>
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#ifdef MPS_FAST_PERF_COUNTERS
+#include <cstdio>
+#endif
+#include <limits>
+#include <string_view>
+#include <unordered_map>
+
+namespace cuopt::linear_programming::io::detail {
+
+// below this threshold, the serial row-hash build is usually cheaper than partition setup
+inline constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * 1024;
+inline constexpr int MPS_ROW_HASH_PARTITION_BITS          = 5;
+inline constexpr size_t MPS_ROW_HASH_PARTITIONS           = (1 << MPS_ROW_HASH_PARTITION_BITS);
+
+// 32-bit FNV-1a fed from the last byte to the first; row names commonly share long
+// prefixes.
+static inline uint32_t fnv1a_hash(const char* ptr, std::size_t len)
+{
+  constexpr uint32_t offset_basis = 0x811C9DC5u;  // 2166136261
+  constexpr uint32_t prime        = 0x01000193u;  // 16777619
+
+  uint32_t state = offset_basis;
+  for (std::size_t remaining = len; remaining != 0; --remaining) {
+    const uint8_t byte = (uint8_t)ptr[remaining - 1];
+    state ^= byte;
+    state *= prime;
+  }
+  return state;
+}
+
+// 28-byte inline key + uint32 payload: two slots per 64-byte cache line.
+struct alignas(32) hash_slot_28_t {
+  char key[28];
+  uint32_t count;
+};
+
+using hash_key_t                     = simde__m256i;
+using hash_slot_var_t                = hash_slot_28_t;
+constexpr std::size_t HASH_KEY_BYTES = 28;
+
+static_assert(sizeof(hash_slot_28_t) == 32);
+static_assert(alignof(hash_slot_28_t) == 32);
+static_assert(offsetof(hash_slot_28_t, count) == HASH_KEY_BYTES);
+
+static inline hash_key_t make_key(const char* ptr, std::size_t len)
+{
+  alignas(32) char padded[32] = {};  // zero fill: bytes not copied from the name stay 0
+  std::memcpy(padded, ptr, std::min(len, HASH_KEY_BYTES));
+  return simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(padded));
+}
+
+static inline bool key_cmpeq(const char* slot_key, hash_key_t key)
+{
+  auto stored  = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(slot_key));
+  int equal_at = simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(stored, key));
+  return (~equal_at & 0x0fffffff) == 0;  // only the low 28 byte lanes (the key) must match
+}
+
+static inline void key_store(char* slot_key, hash_key_t key)  // slot_key must be 32-byte aligned
+{
+  simde_mm256_store_si256(reinterpret_cast<simde__m256i*>(slot_key), key);  // stores all 32 bytes
+}
+
+struct hash_partition_t {
+  hash_slot_var_t* slots = nullptr;
+  size_t buckets         = 0;
+  size_t mask            = 0;
+};
+
+static inline size_t hash_partition_for(uint32_t hash)
+{
+  return (size_t)hash >> (32 - MPS_ROW_HASH_PARTITION_BITS);  // the top PARTITION_BITS bits
+}
+
+static inline size_t hash_bucket_count_for(size_t n_rows, bool compact)
+{
+  const size_t wanted = compact ? n_rows + n_rows / 2 : n_rows * 2;  // 1.5x or 2x the rows
+  return cuda::next_power_of_two(std::max(wanted, (size_t)64));
+}
+
+static inline size_t hash_lookup_in(
+  const hash_slot_var_t* slots, size_t buckets, size_t mask, hash_key_t key, uint32_t hash)
+{
+  size_t pos = hash & (uint32_t)mask;
+  for (size_t probes = 0; probes < buckets; ++probes, ++pos) {
+    if (pos >= buckets) { pos = 0; }  // linear probing wraps at the end of the table
+    if (slots[pos].count == 0) { break; }  // an empty slot ends the probe chain: not present
+    if (key_cmpeq(slots[pos].key, key)) { return slots[pos].count - 1; }
+  }
+  return std::numeric_limits<size_t>::max();
+}
+
+static inline size_t hash_insert_into(hash_slot_var_t* slots,
+                                      size_t buckets,
+                                      size_t mask,
+                                      std::string_view name,
+                                      uint32_t hash,
+                                      size_t index)
+{
+  // Slots hold index + 1 (0 means empty). Returns the probe count; an equal key is overwritten.
+  const hash_key_t key   = make_key(name.data(), name.size());
+  const uint32_t encoded = (uint32_t)(index + 1);
+  size_t pos             = hash & (uint32_t)mask;
+  for (size_t probes = 1; probes <= buckets; ++probes, ++pos) {
+    if (pos >= buckets) { pos = 0; }
+    hash_slot_var_t& slot = slots[pos];
+    const bool vacant     = slot.count == 0;
+    if (vacant) { key_store(slot.key, key); }
+    if (vacant || key_cmpeq(slot.key, key)) {
+      slot.count = encoded;
+      return probes;
+    }
+  }
+  __builtin_unreachable();
+}
+
+#ifdef MPS_FAST_PERF_COUNTERS
+struct hash_build_probe_stats_t {
+  size_t total_probes = 0;  // sum of the non-zero probe counts passed to record_insert()
+  size_t max_probes   = 0;  // largest probe count seen
+  size_t long_names   = 0;  // seeded count plus inserts recorded with 0 probes
+
+  void seed_long_names(size_t n) { long_names = n; }
+
+  void record_insert(size_t probes)
+  {
+    if (probes == 0) {  // 0 probes: counted as a long name rather than a slot insert
+      ++long_names;
+    } else {
+      total_probes += probes;
+      max_probes = std::max(max_probes, probes);
+    }
+  }
+
+  void merge(const hash_build_probe_stats_t& other)
+  {
+    total_probes += other.total_probes;
+    max_probes = std::max(max_probes, other.max_probes);
+    long_names += other.long_names;
+  }
+};
+#endif
+
+class smallstr_hash_table_t {  // names <= 28 bytes live in slots, longer ones in a map
+ public:
+  void note_long_name(std::string_view name, size_t index) { long_names_[name] = index; }
+
+  size_t long_name_count() const { return long_names_.size(); }
+
+  void reset_build_probe_stats()
+  {
+#ifdef MPS_FAST_PERF_COUNTERS
+    build_probe_stats_ = {};
+    build_probe_stats_.seed_long_names(long_names_.size());
+    partition_probe_stats_ = {};
+#endif
+  }
+
+  void print_build_probe_report(size_t n_rows) const
+  {
+#ifdef MPS_FAST_PERF_COUNTERS
+    hash_build_probe_stats_t stats = build_probe_stats_;
+    if (partition_count_ != 0) {
+      for (size_t p = 0; p < partition_count_; ++p) {
+        stats.merge(partition_probe_stats_[p]);
+      }
+    }
+    size_t probed_rows = n_rows - stats.long_names;  // NOTE(review): wraps if long_names > n_rows
+    double mean_probes = probed_rows == 0 ? 0.0 : (double)stats.total_probes / (double)probed_rows;
+    double load_factor = buckets_ == 0 ? 0.0 : (double)n_rows / (double)buckets_;
+    std::fprintf(stderr,
+                 "[ROW_HASH_PROBES] rows=%zu buckets=%zu load=%.3f long=%zu mean=%.3f max=%zu\n",
+                 n_rows,
+                 buckets_,
+                 load_factor,
+                 stats.long_names,
+                 mean_probes,
+                 stats.max_probes);
+#endif
+  }
+
+  void configure_serial_buckets(size_t n_rows, bool compact)
+  {
+    partition_count_ = 0;  // 0 selects the single-table (serial) layout in lookup()
+    buckets_         = hash_bucket_count_for(n_rows, compact);
+    mask_            = buckets_ - 1;
+  }
+
+  void configure_partitioned_buckets(
+    const std::array<size_t, MPS_ROW_HASH_PARTITIONS>& partition_counts, bool compact)
+  {
+    partition_count_ = MPS_ROW_HASH_PARTITIONS;
+    buckets_         = 0;
+    for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
+      partitions_[p].buckets = hash_bucket_count_for(partition_counts[p], compact);
+      partitions_[p].mask    = partitions_[p].buckets - 1;
+      buckets_ += partitions_[p].buckets;
+    }
+    mask_ = buckets_ - 1;  // NOTE(review): buckets_ is a sum here, so this is not a bit mask
+  }
+
+  void allocate_mmap(const char* label)
+  {
+    size_t mmap_size = buckets_ * sizeof(hash_slot_var_t);
+    region_ = mmap_region_t::anonymous(mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, label);
+    slots_  = (hash_slot_var_t*)region_.data();
+    if (partition_count_ != 0) {
+      hash_slot_var_t* next_slots = slots_;
+      for (size_t p = 0; p < partition_count_; ++p) {
+        partitions_[p].slots = next_slots;  // partitions are laid out back to back
+        next_slots += partitions_[p].buckets;
+      }
+    }
+    region_.advise(MADV_HUGEPAGE);
+  }
+
+  mmap_region_t& region() noexcept { return region_; }
+  const mmap_region_t& region() const noexcept { return region_; }
+
+  hash_slot_var_t* slots() noexcept { return slots_; }
+  const hash_slot_var_t* slots() const noexcept { return slots_; }
+
+  size_t buckets() const noexcept { return buckets_; }
+  size_t mask() const noexcept { return mask_; }
+  size_t partition_count() const noexcept { return partition_count_; }
+
+  const hash_partition_t& partition(size_t p) const noexcept { return partitions_[p]; }
+
+  size_t lookup(std::string_view name) const
+  {
+    if (name.size() > HASH_KEY_BYTES) {  // too long for an inline key
+      auto it = long_names_.find(name);
+      return it != long_names_.end() ? it->second : std::numeric_limits<size_t>::max();
+    }
+    hash_key_t key = make_key(name.data(), name.size());
+    uint32_t hash  = fnv1a_hash(name.data(), name.size());
+    if (partition_count_ != 0) {
+      const auto& part = partitions_[hash_partition_for(hash)];
+      return hash_lookup_in(part.slots, part.buckets, part.mask, key, hash);
+    }
+    return hash_lookup_in(slots_, buckets_, mask_, key, hash);
+  }
+
+  size_t insert_serial(std::string_view name, size_t index)
+  {
+    size_t probes;
+    if (name.size() > HASH_KEY_BYTES) {
+      note_long_name(name, index);
+      probes = 0;  // 0 is the "long name" marker for the probe statistics
+    } else {
+      probes = hash_insert_into(
+        slots_, buckets_, mask_, name, fnv1a_hash(name.data(), name.size()), index);
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    build_probe_stats_.record_insert(probes);
+#endif
+    return probes;
+  }
+
+  size_t insert_partition(size_t partition, std::string_view name, uint32_t hash, size_t index)
+  {
+    const auto& part = partitions_[partition];  // NOTE(review): long names are not handled here
+    size_t probes    = hash_insert_into(part.slots, part.buckets, part.mask, name, hash, index);
+#ifdef MPS_FAST_PERF_COUNTERS
+    partition_probe_stats_[partition].record_insert(probes);
+#endif
+    return probes;
+  }
+
+ private:
+  mmap_region_t region_;
+  hash_slot_var_t* slots_ = nullptr;
+  size_t buckets_         = 0;
+  size_t mask_            = 0;
+  size_t partition_count_ = 0;
+  std::array<hash_partition_t, MPS_ROW_HASH_PARTITIONS> partitions_{};
+  std::unordered_map<std::string_view, size_t> long_names_{};  // keys are non-owning views
+#ifdef MPS_FAST_PERF_COUNTERS
+  hash_build_probe_stats_t build_probe_stats_{};
+  std::array<hash_build_probe_stats_t, MPS_ROW_HASH_PARTITIONS> partition_probe_stats_{};
+#endif
+};
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
new file mode 100644
index 0000000000..5e535ce7f2
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -0,0 +1,920 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "file_reader.hpp"
+#include "mps_section_scanner.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <utilities/error.hpp>
+#include <utilities/scope_guard.hpp>
+
+#include <cuda/cmath>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cerrno>
+#include <condition_variable>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace cuopt::linear_programming::io::detail {
+
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_expects;
+using cuopt::linear_programming::io::mps_parser_fail;
+
+namespace {
+
+constexpr uint32_t lz4_frame_magic                        = 0x184D2204u;
+constexpr uint32_t lz4_uncompressed_block                 = 0x80000000u;
+constexpr uint32_t lz4_block_size_mask                    = 0x7FFFFFFFu;
+constexpr std::size_t lz4_pipeline_batch_bytes            = 64ull * 1024ull * 1024ull;
+constexpr std::size_t lz4_decode_batch_decompressed_bytes = 256ull * 1024ull * 1024ull;
+constexpr std::size_t lz4_input_max_io_threads            = 8;
+constexpr std::size_t lz4_no_content_size_reserve_ratio   = 128;
+
+using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int);
+
+// Estimates the decompressed size of a frame that does not declare its content size as
+// compressed_size * lz4_no_content_size_reserve_ratio. When that product would overflow
+// std::size_t the result is clamped to SIZE_MAX minus input_buffer_padding_bytes.
+std::size_t estimate_lz4_no_content_size(std::size_t compressed_size)
+{
+  constexpr std::size_t size_limit       = std::numeric_limits<std::size_t>::max();
+  constexpr std::size_t largest_scalable = size_limit / lz4_no_content_size_reserve_ratio;
+  if (compressed_size <= largest_scalable) {
+    return compressed_size * lz4_no_content_size_reserve_ratio;
+  }
+  return size_limit - input_buffer_padding_bytes;
+}
+
+#if defined(MPS_PARSER_WITH_LZ4)
+// Owns a dlopen() handle to liblz4 together with the resolved LZ4_decompress_safe entry point.
+// Construction reports failure through mps_parser_fail when the library cannot be opened or the
+// symbol cannot be resolved; a fully constructed object dlclose()s the handle on destruction.
+struct lz4_runtime_t {
+  void* handle                          = nullptr;
+  LZ4_decompress_safe_t decompress_safe = nullptr;
+
+  lz4_runtime_t()
+  {
+    // Try the versioned soname first, then the unversioned name.
+    for (const char* soname : {"liblz4.so.1", "liblz4.so"}) {
+      handle = ::dlopen(soname, RTLD_LAZY);
+      if (handle != nullptr) { break; }
+    }
+    if (handle == nullptr) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "Could not open .mps.lz4 file since liblz4 was not found "
+                      "(tried liblz4.so.1, liblz4.so). Decompress the .lz4 file manually "
+                      "or install liblz4.");
+    }
+
+    decompress_safe =
+      reinterpret_cast<LZ4_decompress_safe_t>(::dlsym(handle, "LZ4_decompress_safe"));
+    if (decompress_safe == nullptr) {
+      // The destructor does not run for an object whose constructor did not complete, so the
+      // library reference must be dropped here; otherwise each failed attempt would leave
+      // liblz4 loaded (the function-local static in lz4_runtime() is retried after a failure).
+      ::dlclose(handle);
+      handle = nullptr;
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "Error loading LZ4_decompress_safe from liblz4. Decompress the .lz4 file "
+                      "manually or install a compatible liblz4.");
+    }
+  }
+
+  ~lz4_runtime_t()
+  {
+    if (handle != nullptr) { ::dlclose(handle); }
+  }
+
+  // Non-copyable: a copy would dlclose() the same handle twice.
+  lz4_runtime_t(const lz4_runtime_t&)            = delete;
+  lz4_runtime_t& operator=(const lz4_runtime_t&) = delete;
+};
+
+// Returns the shared liblz4 binding, constructing it on the first call.
+const lz4_runtime_t& lz4_runtime()
+{
+  static const lz4_runtime_t shared_binding{};
+  return shared_binding;
+}
+#endif
+
+// Forwards to the LZ4_decompress_safe entry point resolved at runtime and returns its result.
+// In builds without MPS_PARSER_WITH_LZ4 the arguments are unused and the call reports a
+// RuntimeError through mps_parser_fail instead.
+int lz4_decompress_safe_runtime([[maybe_unused]] const char* src,
+                                [[maybe_unused]] char* dst,
+                                [[maybe_unused]] int compressed_size,
+                                [[maybe_unused]] int dst_capacity)
+{
+#if defined(MPS_PARSER_WITH_LZ4)
+  return lz4_runtime().decompress_safe(src, dst, compressed_size, dst_capacity);
+#else
+  // No return statement on this path: presumably mps_parser_fail never returns - confirm.
+  mps_parser_fail(
+    error_type_t::RuntimeError,
+    "Experimental fast MPS parser was built without LZ4 decompression support. "
+    "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
+#endif
+}
+
+// Forces the liblz4 binding to be resolved now so that a missing library is reported at the
+// call site rather than on first decompression. In builds without MPS_PARSER_WITH_LZ4 this
+// always reports a RuntimeError through mps_parser_fail.
+void ensure_lz4_runtime_available()
+{
+#if defined(MPS_PARSER_WITH_LZ4)
+  // Touching the singleton triggers its construction; the reference itself is not needed.
+  [[maybe_unused]] auto& runtime = lz4_runtime();
+#else
+  mps_parser_fail(
+    error_type_t::RuntimeError,
+    "Experimental fast MPS parser was built without LZ4 decompression support. "
+    "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
+#endif
+}
+
+// Opens `path` read-only and returns the file descriptor; the caller owns it and must close it.
+// Reports a RuntimeError through mps_parser_fail (including strerror(errno)) when open fails.
+int open_lz4_fd(const std::string& path)
+{
+  // O_CLOEXEC keeps this descriptor from being inherited by processes the host application
+  // may exec while the stream is alive.
+  int fd = ::open(path.c_str(), O_RDONLY | O_CLOEXEC);
+  if (fd < 0) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open LZ4 file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+  return fd;
+}
+
+// Decodes a 32-bit little-endian unsigned integer from the four bytes starting at `ptr`.
+uint32_t read_le32(const char* ptr)
+{
+  const auto* bytes = reinterpret_cast<const unsigned char*>(ptr);
+  uint32_t value    = 0;
+  // Fold from the most significant byte (index 3) down to the least significant (index 0).
+  for (int i = 3; i >= 0; --i) {
+    value = (value << 8) | static_cast<uint32_t>(bytes[i]);
+  }
+  return value;
+}
+
+// Decodes a 64-bit little-endian unsigned integer from the eight bytes starting at `ptr`.
+uint64_t read_le64(const char* ptr)
+{
+  const auto* bytes = reinterpret_cast<const unsigned char*>(ptr);
+  uint64_t value    = 0;
+  // Byte i contributes bits [8*i, 8*i + 8).
+  for (unsigned i = 0; i < 8; ++i) {
+    value |= static_cast<uint64_t>(bytes[i]) << (8u * i);
+  }
+  return value;
+}
+
+// Maps the block-size ID stored in bits 4..6 of the BD byte to the block maximum size in bytes:
+// 4 -> 64 KiB, 5 -> 256 KiB, 6 -> 1 MiB, 7 -> 4 MiB. Any other ID is reported as a
+// ValidationError through mps_parser_fail.
+std::size_t block_max_size_from_bd(unsigned char bd)
+{
+  const unsigned size_id = (bd >> 4) & 0x7u;
+  if (size_id >= 4) {
+    // Each ID step multiplies the size by four, starting from 64 KiB at ID 4.
+    return (std::size_t{64} * 1024) << (2 * (size_id - 4));
+  }
+  mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame block size ID");
+}
+
+// One fixed span of the compressed file held in memory.
+struct lz4_resident_window_t {
+  std::size_t index       = 0;  // position of this window in the window vector
+  std::size_t file_offset = 0;  // offset of the window's first byte within the compressed file
+  std::size_t size        = 0;  // number of file bytes covered by this window
+  std::unique_ptr<char[]> data;  // the window's bytes; null while not resident
+};
+
+// Accessor over the resident compressed windows, addressed by absolute file offset.
+// Holds a reference to the caller's window vector and does not own the buffers.
+class lz4_resident_windows_t {
+ public:
+  explicit lz4_resident_windows_t(std::vector<lz4_resident_window_t>& windows) : windows_(windows)
+  {
+  }
+
+  // Zero-copy access: yields a pointer into the window containing `offset` when the whole
+  // range [offset, offset + size) lies inside that window. Yields nullptr when size == 0 or
+  // when the range runs past the window; the caller then stages the bytes with copy_to.
+  const char* ptr_if_contiguous(std::size_t offset, std::size_t size) const
+  {
+    if (size == 0) { return nullptr; }
+    const lz4_resident_window_t& window = window_for_offset(offset);
+    const std::size_t within            = offset - window.file_offset;
+    const bool fits = within <= window.size && size <= window.size - within;
+    return fits ? window.data.get() + within : nullptr;
+  }
+
+  // Copies `size` bytes starting at file offset `offset` into `dst`, walking across as many
+  // windows as the range touches.
+  void copy_to(std::size_t offset, char* dst, std::size_t size) const
+  {
+    for (std::size_t done = 0; done < size;) {
+      const std::size_t cursor            = offset + done;
+      const lz4_resident_window_t& window = window_for_offset(cursor);
+      const std::size_t within            = cursor - window.file_offset;
+      const std::size_t chunk             = std::min(window.size - within, size - done);
+      std::memcpy(dst + done, window.data.get() + within, chunk);
+      done += chunk;
+    }
+  }
+
+  // Reads one byte at `offset`.
+  uint8_t read_u8(std::size_t offset) const
+  {
+    char byte = 0;
+    copy_to(offset, &byte, 1);
+    return static_cast<uint8_t>(byte);
+  }
+
+  // Reads a little-endian 32-bit value at `offset` (may straddle a window boundary).
+  uint32_t read_u32(std::size_t offset) const
+  {
+    char raw[sizeof(uint32_t)];
+    copy_to(offset, raw, sizeof(raw));
+    return read_le32(raw);
+  }
+
+  // Reads a little-endian 64-bit value at `offset` (may straddle a window boundary).
+  uint64_t read_u64(std::size_t offset) const
+  {
+    char raw[sizeof(uint64_t)];
+    copy_to(offset, raw, sizeof(raw));
+    return read_le64(raw);
+  }
+
+ private:
+  // Finds the window that contains file offset `offset`, reporting a RuntimeError through
+  // mps_parser_fail when there are no windows or the offset is not covered.
+  // NOTE(review): the index computation assumes windows are laid out back to back with one
+  // uniform stride (only the last may be shorter) - confirm against the code that builds them.
+  const lz4_resident_window_t& window_for_offset(std::size_t offset) const
+  {
+    if (windows_.empty()) {
+      mps_parser_fail(error_type_t::RuntimeError, "LZ4 resident window lookup with no windows");
+    }
+    // With several windows the stride is the second window's start; with one, its own size.
+    const std::size_t stride =
+      windows_.size() == 1 ? windows_.front().size : windows_[1].file_offset;
+    const std::size_t slot = offset / stride;
+    const bool in_table    = slot < windows_.size();
+    if (!in_table || offset >= windows_[slot].file_offset + windows_[slot].size) {
+      mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows");
+    }
+    return windows_[slot];
+  }
+
+  std::vector<lz4_resident_window_t>& windows_;
+};
+
+// Parsed fields of the leading LZ4 frame descriptor (LZ4 frame format: magic, FLG, BD, and
+// optional content size / dictionary id, followed by a header checksum byte).
+struct lz4_frame_header_t {
+  std::size_t block_max_size = 0;      // block maximum size in bytes, decoded from BD
+  std::size_t content_size   = 0;      // declared content size; valid if content_size_present
+  std::size_t header_size    = 0;      // total descriptor length in bytes, including the magic
+  bool content_size_present  = false;  // FLG bit 0x08
+  bool block_checksum        = false;  // FLG bit 0x10
+  bool content_checksum      = false;  // FLG bit 0x04
+  bool dict_id               = false;  // FLG bit 0x01
+};
+
+// Reads the first bytes of `fd` and parses them as an LZ4 frame descriptor.
+//
+// Rejects (ValidationError) inputs that are too short, do not start with the standard frame
+// magic, declare a version other than 1, use linked (non-independent) blocks, carry an
+// unsupported block-size ID, or end before the descriptor is complete. A failed read is a
+// RuntimeError. `path` is used only in the read-failure message. The header checksum byte is
+// skipped, not verified.
+lz4_frame_header_t parse_lz4_frame_header(int fd,
+                                          const std::string& path,
+                                          std::size_t compressed_size)
+{
+  // Smallest possible descriptor: magic (4) + FLG (1) + BD (1) + header checksum (1).
+  constexpr std::size_t min_descriptor_bytes = 7;
+  if (compressed_size < min_descriptor_bytes) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "LZ4 input is too small to contain a frame header");
+  }
+
+  char raw[32];
+  const std::size_t available = std::min<std::size_t>(sizeof(raw), compressed_size);
+  if (!pread_full(fd, raw, available, 0)) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to read LZ4 frame header '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+
+  if (read_le32(raw) != lz4_frame_magic) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "unsupported LZ4 input: expected standard LZ4 frame magic");
+  }
+  const auto flg     = static_cast<unsigned char>(raw[4]);
+  const auto bd      = static_cast<unsigned char>(raw[5]);
+  std::size_t cursor = 6;  // first byte after magic + FLG + BD
+
+  if (((flg >> 6) & 0x3u) != 1) {
+    mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame version");
+  }
+  if ((flg & 0x20u) == 0) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "parallel LZ4 reader requires independent blocks; compress with -BI");
+  }
+
+  lz4_frame_header_t parsed;
+  parsed.block_checksum       = (flg & 0x10u) != 0;
+  parsed.content_size_present = (flg & 0x08u) != 0;
+  parsed.content_checksum     = (flg & 0x04u) != 0;
+  parsed.dict_id              = (flg & 0x01u) != 0;
+  parsed.block_max_size       = block_max_size_from_bd(bd);
+
+  if (parsed.content_size_present) {
+    if (cursor + 8 > available) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading content size");
+    }
+    parsed.content_size = static_cast<std::size_t>(read_le64(raw + cursor));
+    cursor += 8;
+  }
+  if (parsed.dict_id) {
+    // The dictionary id is skipped; only its presence is recorded.
+    if (cursor + 4 > available) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading dictionary id");
+    }
+    cursor += 4;
+  }
+  if (cursor + 1 > available) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "truncated LZ4 frame while reading header checksum");
+  }
+  parsed.header_size = cursor + 1;
+  return parsed;
+}
+
+}  // namespace
+
+// Opens `path`, validates its LZ4 frame header, reserves address space for the decompressed
+// output, and creates the section scanner. No block is decompressed here.
+lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
+{
+  MPS_NVTX_RANGE("lz4_input_constructor", nvtx::colors::io);
+
+  // Fail early if liblz4 cannot be used, before touching the file.
+  ensure_lz4_runtime_available();
+
+  int fd = open_lz4_fd(path);
+  // Closes the descriptor if anything below fails; disarmed at the end by setting fd = -1.
+  cuopt::scope_guard close_fd([&] {
+    if (fd >= 0) { ::close(fd); }
+  });
+  // Advisory only; the return value is intentionally ignored.
+  ::posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+
+  compressed_size_ = get_file_size(fd, path);
+
+  lz4_frame_header_t header = parse_lz4_frame_header(fd, path, compressed_size_);
+  block_max_size_           = header.block_max_size;
+  content_size_             = header.content_size;
+  header_size_              = header.header_size;
+  content_size_present_     = header.content_size_present;
+  block_checksum_           = header.block_checksum;
+  content_checksum_         = header.content_checksum;
+  dict_id_                  = header.dict_id;
+
+  // Without a declared content size, fall back to a ratio-based estimate that is at least one
+  // full block.
+  std::size_t reserve_size = content_size_;
+  if (!content_size_present_) {
+    reserve_size = estimate_lz4_no_content_size(compressed_size_);
+    reserve_size = std::max(reserve_size, block_max_size_);
+  }
+  reserve_size += input_buffer_padding_bytes;
+
+  // The output range is mapped PROT_NONE with MAP_NORESERVE here; commit_up_to later remaps
+  // prefixes of it as readable/writable memory.
+  constexpr std::size_t huge_alignment = 2 * 1024 * 1024;  // 2MiB
+  output_mapped_size_                  = cuda::round_up(reserve_size, system_page_size());
+  output_region_                       = mmap_region_t::anonymous_aligned(output_mapped_size_,
+                                                    huge_alignment,
+                                                    PROT_NONE,
+                                                    MAP_PRIVATE | MAP_NORESERVE,
+                                                    "LZ4 output buffer");
+  output_data_                         = output_region_.char_data();
+
+  // Upper bound on the number of blocks the pipeline will accept for this reservation.
+  block_slot_count_ = std::max<std::size_t>(1, cuda::ceil_div(reserve_size, block_max_size_) + 1);
+
+  section_scanner_ =
+    std::make_unique<mps_section_block_scanner_t>(output_data_, block_slot_count_, registry_);
+
+  // Success: hand the descriptor to the object and disarm the scope guard.
+  fd_ = fd;
+  fd  = -1;
+}
+
+// Closes the compressed-file descriptor if one is held.
+lz4_input_stream_t::~lz4_input_stream_t()
+{
+  if (fd_ >= 0) { ::close(fd_); }
+}
+
+// Start of the decompressed output buffer (read-only and mutable views of the same pointer).
+const char* lz4_input_stream_t::data() const noexcept { return output_data_; }
+char* lz4_input_stream_t::mutable_data() noexcept { return output_data_; }
+// Value of output_view_size_, which lz4_pipeline_t::finalize() sets from the section scanner.
+std::size_t lz4_input_stream_t::size() const noexcept { return output_view_size_; }
+// Size in bytes of the compressed input file.
+std::size_t lz4_input_stream_t::compressed_size() const noexcept { return compressed_size_; }
+// Expected decompressed size: the declared content size when the frame carries one, otherwise
+// the ratio-based estimate, but never less than 1 MiB.
+std::size_t lz4_input_stream_t::reserve_size_hint() const noexcept
+{
+  if (content_size_present_) { return content_size_; }
+  constexpr std::size_t min_hint = 1024 * 1024;
+  return std::max<std::size_t>(estimate_lz4_no_content_size(compressed_size_), min_hint);
+}
+
+// Makes the first `bytes` bytes of the reserved output range readable and writable.
+// Calls are serialized by commit_mutex_; a request at or below the already committed size is a
+// no-op. A request beyond the reserved mapping is reported as an OutOfMemoryError.
+void lz4_input_stream_t::commit_up_to(std::size_t bytes)
+{
+  MPS_NVTX_RANGE("lz4_commit_output", nvtx::colors::alloc);
+  std::lock_guard<std::mutex> lock(commit_mutex_);
+  if (bytes <= output_committed_size_) return;
+  if (bytes > output_mapped_size_) {
+    mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 output exceeded reserved virtual mapping");
+  }
+  // Commit whole pages, clamped to the reserved size.
+  std::size_t new_committed = cuda::round_up(bytes, system_page_size());
+  if (new_committed > output_mapped_size_) new_committed = output_mapped_size_;
+  // Only the not-yet-committed tail is remapped; already committed pages are left untouched.
+  std::size_t add = new_committed - output_committed_size_;
+  void* target    = output_data_ + output_committed_size_;
+  mmap_region_t::map_fixed_or_throw(
+    target, add, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, "LZ4 output commit");
+  // Advisory huge-page hint; the return value is intentionally ignored.
+  ::madvise(target, add, MADV_HUGEPAGE);
+  output_committed_size_ = new_committed;
+}
+
+// Work item describing one LZ4 data block for the decoders.
+// Field order matters: scan_one_block brace-initializes this struct positionally.
+struct resident_block_desc_t {
+  const char* src                 = nullptr;  // block payload (inside a window or a staged copy)
+  std::size_t compressed_size     = 0;        // payload size in bytes as stored in the frame
+  std::size_t decompressed_offset = 0;        // destination offset within the output buffer
+  std::size_t decompressed_size   = 0;        // destination capacity reserved for this block
+  std::size_t index               = 0;        // running block number assigned by the scanner
+  // Window pinned by this block, or SIZE_MAX when the payload was staged and pins no window.
+  std::size_t window_index        = std::numeric_limits<std::size_t>::max();
+  bool uncompressed               = false;    // payload is stored raw and is copied, not decoded
+};
+
+// Per-window release bookkeeping used by lz4_pipeline_t.
+struct window_state_t {
+  std::atomic<uint32_t> decode_refs{0};  // blocks pointing into this window not yet decoded
+  std::atomic<uint8_t> released{0};      // set to 1 by the single thread that frees the window
+};
+
+// Two distinct units flow through this pipeline:
+//   * window  - a fixed-size span of the compressed file read by the I/O stage.
+//   * block   - a single independent LZ4 data block (decompressed unit) that the
+//               metadata scanner discovers inside the resident windows.
+// Windows feed blocks; the decoded blocks are handed to the section scanner,
+// which owns the contiguous decoded-byte frontier and section publication.
+//
+// Locking (the grouped members below repeat each guard in context):
+//   * window_mutex          - guards window_done[]   (reader -> scanner readiness)
+//   * desc_mutex            - guards desc_queue + scanner_done (scanner -> decoders)
+//   * window_release_mutex  - serializes freeing a window buffer + RSS accounting
+//   * window_state_[].decode_refs/.released, scanned_through_, blocks_scanned,
+//     compressed_resident_bytes - lock-free atomics
+// Locks are never nested. The scanner thread is the sole writer of the frame walk,
+// so offset / decompressed_offset are mutated without locking.
+struct lz4_pipeline_t {
+  // Partitions the compressed file into window_count spans of window_bytes each (the last one
+  // may be shorter) and sizes the per-window bookkeeping. No buffer is allocated here.
+  // The initializer list follows member declaration order, which is the order that applies.
+  explicit lz4_pipeline_t(lz4_input_stream_t& input_)
+    : input(input_),
+      window_count(cuda::ceil_div(input.compressed_size_, window_bytes)),
+      windows(window_count),
+      io_threads(std::min(lz4_input_max_io_threads, window_count)),
+      window_done(window_count, 0),
+      window_state_(std::make_unique<window_state_t[]>(window_count))
+  {
+    for (std::size_t slot = 0; slot != window_count; ++slot) {
+      auto& window            = windows[slot];
+      const std::size_t start = slot * window_bytes;
+      window.index            = slot;
+      window.file_offset      = start;
+      window.size             = std::min(window_bytes, input.compressed_size_ - start);
+    }
+  }
+
+  // Runs the three-stage pipeline to completion:
+  //
+  //   readers --window_done/window_cv--> scanner --desc_queue/desc_cv--> decoders
+  //
+  //   * readers  (io_threads): pread fixed compressed windows into RAM, mark ready.
+  //   * scanner  (1 thread)  : walk the LZ4 frame in order, slice it into block
+  //                            descriptors, push them to decoders in batches.
+  //   * decoders (io_threads): decompress blocks into the output buffer and hand
+  //                            each to the section scanner, which advances the
+  //                            decoded-byte frontier and publishes section ranges.
+  //
+  // Consumers are spawned first so they are parked waiting before the readers (which
+  // run on this thread) start producing. scoped_thread_group joins the background
+  // threads on scope exit; any stage's failure is captured in `latch` and rethrown here.
+  void run()
+  {
+    std::exception_ptr startup_error;
+    {
+      scoped_thread_group background;
+      try {
+        background.reserve(io_threads + 1);
+        background.emplace([this] { run_scanner_stage(); });
+        for (std::size_t t = 0; t < io_threads; ++t) {
+          background.emplace([this, t] { run_decoder_stage(t); });
+        }
+        run_readers();  // produce on the calling thread, now that consumers are parked
+      } catch (...) {
+        // Remember the error and wake any already started stage so the scope exit below
+        // does not wait on threads that would otherwise stay parked.
+        startup_error = std::current_exception();
+        fail_and_notify(startup_error);
+      }
+    }
+    // An exception thrown on this thread takes precedence over one captured by a stage.
+    if (startup_error) { std::rethrow_exception(startup_error); }
+    latch.rethrow_if_error();
+  }
+
+  // Publishes the final output size after run() has returned: takes the section scanner's
+  // ready byte count as the view size, commits that many bytes plus
+  // input_buffer_padding_bytes, and tells the scanner to publish up to the view size.
+  void finalize()
+  {
+    input.output_view_size_ = input.section_scanner_->ready_bytes();
+    input.commit_up_to(input.output_view_size_ + input_buffer_padding_bytes);
+    input.section_scanner_->publish_ready(input.output_view_size_);
+  }
+
+  // Records `eptr` in the shared latch and wakes every thread blocked on window_cv or desc_cv
+  // so that it re-evaluates its predicate and sees the stopped latch.
+  //
+  // The latch is updated without holding window_mutex / desc_mutex. Without further care a
+  // waiter could evaluate its predicate as "not stopped", then block only after the notify
+  // below had already fired, and sleep forever. Acquiring and releasing each mutex between the
+  // capture and the notify closes that gap: once the lock is obtained, any waiter that saw the
+  // stale state has already released the mutex inside wait() and will receive the notify.
+  // The two mutexes are taken one after the other, never nested.
+  // Must not be called while the calling thread holds window_mutex or desc_mutex.
+  void fail_and_notify(std::exception_ptr eptr)
+  {
+    latch.capture(eptr);
+    {
+      std::lock_guard<std::mutex> sync(window_mutex);
+    }
+    window_cv.notify_all();
+    {
+      std::lock_guard<std::mutex> sync(desc_mutex);
+    }
+    desc_cv.notify_all();
+  }
+
+  // Adds `bytes` to the running count of compressed bytes currently held in window buffers.
+  void add_compressed_resident(std::size_t bytes)
+  {
+    compressed_resident_bytes.fetch_add(bytes, std::memory_order_relaxed);
+  }
+
+  // Frees window `index`'s buffer if it is safe to do so; otherwise does nothing.
+  // A window is freed only when the scanner has moved past it, no decoder still references it,
+  // and no other thread has already claimed the release.
+  void try_release_window(std::size_t index)
+  {
+    if (index >= window_count) { return; }
+    // The scanner may still read from (and pin) windows at or beyond scanned_through_.
+    if (index >= scanned_through_.load(std::memory_order_acquire)) { return; }
+    window_state_t& state = window_state_[index];
+    if (state.decode_refs.load(std::memory_order_acquire) != 0) { return; }
+    // The CAS lets exactly one caller proceed to the free.
+    uint8_t expected = 0;
+    if (!state.released.compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) { return; }
+    std::lock_guard<std::mutex> lock(window_release_mutex);
+    if (windows[index].data) {
+      windows[index].data.reset();
+      compressed_resident_bytes.fetch_sub(windows[index].size, std::memory_order_relaxed);
+    }
+  }
+
+  // Records that the frame walk has reached compressed offset `offset`: every window that lies
+  // entirely before that offset is marked scanned and offered for release.
+  // `offset` must not decrease between calls (asserted). Per the pipeline's locking notes the
+  // scanner thread is the only caller, which is why last_mark_offset_ is a plain member.
+  void mark_windows_scanned_before(std::size_t offset)
+  {
+    assert(offset >= last_mark_offset_);
+    last_mark_offset_               = offset;
+    // Integer division: the window containing `offset` itself is not counted as scanned.
+    std::size_t new_scanned_through = std::min(window_count, offset / window_bytes);
+    std::size_t prev                = scanned_through_.load(std::memory_order_relaxed);
+    if (new_scanned_through <= prev) { return; }
+    scanned_through_.store(new_scanned_through, std::memory_order_release);
+    for (std::size_t wi = prev; wi < new_scanned_through; ++wi) {
+      try_release_window(wi);
+    }
+  }
+
+  // Reads all windows by handing read_window to parallel_for_indexed with window_count items,
+  // io_threads workers, the shared latch, and the thread-name prefix "lz4-window-read-".
+  void run_readers()
+  {
+    parallel_for_indexed(
+      window_count, io_threads, latch, "lz4-window-read-", [this](std::size_t index) {
+        read_window(index);
+      });
+  }
+
+  // Allocates window `index`'s buffer, fills it with pread from the compressed file, and marks
+  // the window ready for the scanner. Never throws: any failure is routed to fail_and_notify.
+  void read_window(std::size_t index)
+  {
+    try {
+      auto& w = windows[index];
+      w.data.reset(new char[w.size]);
+      add_compressed_resident(w.size);
+      bool ok = false;
+      {
+        MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io);
+        ok = pread_full(input.fd_, w.data.get(), w.size, w.file_offset);
+      }
+      if (!ok) {
+        mps_parser_fail(error_type_t::RuntimeError,
+                        "Failed to pread LZ4 resident window: %s",
+                        std::strerror(errno));
+      }
+      {
+        MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic);
+        // The flag is set under window_mutex, the mutex wait_range_ready waits with.
+        std::lock_guard<std::mutex> lock(window_mutex);
+        window_done[index] = 1;
+      }
+      window_cv.notify_all();
+    } catch (...) {
+      fail_and_notify(std::current_exception());
+    }
+  }
+
+  // Body of decoder thread `tid`: names the thread, then decodes batches until
+  // wait_for_decode_batch returns an empty batch. Never throws: any failure is routed to
+  // fail_and_notify.
+  void run_decoder_stage(std::size_t tid)
+  {
+    try {
+      std::string thread_name = "lz4-window-decode-" + std::to_string(tid);
+      nvtx::name_current_thread(thread_name.c_str());
+      while (true) {
+        std::vector<resident_block_desc_t> batch = wait_for_decode_batch();
+        // An empty batch is the shutdown signal (latch stopped, or scanner done and queue empty).
+        if (batch.empty()) { return; }
+        decode_batch(batch);
+      }
+    } catch (...) {
+      fail_and_notify(std::current_exception());
+    }
+  }
+
+  // Blocks until a batch is queued, the scanner has finished, or the latch has stopped.
+  // Returns the oldest queued batch, or an empty vector when the latch has stopped or the
+  // queue is empty (which, after the wait, means the scanner is done).
+  std::vector<resident_block_desc_t> wait_for_decode_batch()
+  {
+    MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io);
+    std::unique_lock<std::mutex> lock(desc_mutex);
+    desc_cv.wait(lock, [&] { return latch.stopped() || scanner_done || !desc_queue.empty(); });
+    // A stopped latch wins even if batches remain queued.
+    if (latch.stopped() || desc_queue.empty()) { return {}; }
+    std::vector<resident_block_desc_t> batch = std::move(desc_queue.front());
+    desc_queue.pop_front();
+    return batch;
+  }
+
+  // Decodes every block descriptor of `batch`, in order, on the calling thread.
+  void decode_batch(const std::vector<resident_block_desc_t>& batch)
+  {
+    MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode);
+    std::for_each(batch.begin(), batch.end(), [this](const resident_block_desc_t& desc) {
+      decode_block(desc);
+    });
+  }
+
+  // Writes one block's decoded bytes to its slot in the output buffer, drops the block's window
+  // pin, and hands the decoded range to the section scanner.
+  // Reports a ValidationError when decoding fails or yields more than decompressed_size bytes.
+  void decode_block(const resident_block_desc_t& block)
+  {
+    char* dst  = input.output_data_ + block.decompressed_offset;
+    int actual = 0;
+    {
+      MPS_NVTX_RANGE("lz4_decode_block_payload", nvtx::colors::decode);
+      if (block.uncompressed) {
+        // Stored block: the payload is the content.
+        std::memcpy(dst, block.src, block.decompressed_size);
+        actual = (int)block.decompressed_size;
+      } else if (block.compressed_size > (std::size_t)std::numeric_limits<int>::max() ||
+                 block.decompressed_size > (std::size_t)std::numeric_limits<int>::max()) {
+        // The decompress entry point takes int sizes; treat anything larger as a failure.
+        actual = -1;
+      } else {
+        actual = lz4_decompress_safe_runtime(
+          block.src, dst, (int)block.compressed_size, (int)block.decompressed_size);
+      }
+    }
+    if (actual < 0 || (std::size_t)actual > block.decompressed_size) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 input block decompressed to invalid size");
+    }
+    // block.src is not read past this point, so the window pin can be dropped before publishing.
+    release_block_window_ref(block);
+    publish_decoded_block(block, dst, (std::size_t)actual);
+  }
+
+  // Drops the decode reference `block` holds on its source window and, if that was the last
+  // reference, offers the window for release. Blocks with window_index == SIZE_MAX pin no
+  // window and are ignored.
+  void release_block_window_ref(const resident_block_desc_t& block)
+  {
+    if (block.window_index == std::numeric_limits<std::size_t>::max()) { return; }
+    // fetch_sub returns the value before the decrement.
+    uint32_t old =
+      window_state_[block.window_index].decode_refs.fetch_sub(1, std::memory_order_acq_rel);
+    assert(old > 0);
+    if (old == 1) { try_release_window(block.window_index); }
+  }
+
+  // Reports the decoded byte range [dst, dst + actual_size) of block `block.index` to the
+  // section scanner.
+  void publish_decoded_block(const resident_block_desc_t& block, char* dst, std::size_t actual_size)
+  {
+    MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic);
+    // The scanner advances the contiguous decoded-byte frontier and publishes
+    // section ranges as blocks complete, regardless of decode order.
+    input.section_scanner_->observe_block(block.index, dst, dst + actual_size);
+  }
+
+  // Blocks until every window covering compressed bytes [begin, begin + size) has been read.
+  // A zero-size range returns immediately. A range that extends past the end of the file is a
+  // ValidationError; a latch stop while a needed window is still missing is a RuntimeError.
+  void wait_range_ready(std::size_t begin, std::size_t size)
+  {
+    if (size == 0) return;
+    // Written as a subtraction so that begin + size cannot overflow.
+    if (begin > input.compressed_size_ || size > input.compressed_size_ - begin) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading resident window");
+    }
+    std::size_t first = begin / window_bytes;
+    std::size_t last  = (begin + size - 1) / window_bytes;
+    if (last >= window_done.size()) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading resident window");
+    }
+    for (std::size_t wi = first; wi <= last; ++wi) {
+      MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io);
+      std::unique_lock<std::mutex> lock(window_mutex);
+      window_cv.wait(lock, [&] { return latch.stopped() || window_done[wi] != 0; });
+      // A window that became ready is still usable even if the latch stopped meanwhile.
+      if (latch.stopped() && window_done[wi] == 0) {
+        mps_parser_fail(error_type_t::RuntimeError,
+                        "LZ4 metadata scanner stopped before required window was ready");
+      }
+    }
+  }
+
+  // Hands `batch` to the decoders and leaves it empty for reuse. Does nothing for an empty
+  // batch. The output range up to the end of the batch's last block is committed before the
+  // batch is queued, i.e. before any decoder can pick it up.
+  void push_batch(std::vector<resident_block_desc_t>& batch)
+  {
+    if (batch.empty()) return;
+    {
+      MPS_NVTX_RANGE("lz4_metadata_commit_batch", nvtx::colors::alloc);
+      input.commit_up_to(batch.back().decompressed_offset + batch.back().decompressed_size);
+    }
+    {
+      MPS_NVTX_RANGE("lz4_metadata_enqueue_batch", nvtx::colors::generic);
+      std::lock_guard<std::mutex> lock(desc_mutex);
+      desc_queue.push_back(std::move(batch));
+    }
+    // Bring the moved-from vector back to a known empty state.
+    batch.clear();
+    // One queued batch is consumed by one decoder, so waking a single waiter suffices.
+    desc_cv.notify_one();
+  }
+
+  // Body of the scanner thread: names the thread, walks the frame, then marks the scanner as
+  // done and wakes all decoders. Never throws: on failure scanner_done is still set (under
+  // desc_mutex) and the error is routed to fail_and_notify.
+  void run_scanner_stage()
+  {
+    try {
+      nvtx::name_current_thread("lz4-metadata-scan");
+      scan_lz4_metadata();
+      {
+        std::lock_guard<std::mutex> lock(desc_mutex);
+        scanner_done = true;
+      }
+      desc_cv.notify_all();
+    } catch (...) {
+      {
+        // The lock is released before fail_and_notify so no mutex is held across that call.
+        std::lock_guard<std::mutex> lock(desc_mutex);
+        scanner_done = true;
+      }
+      fail_and_notify(std::current_exception());
+    }
+  }
+
+  // Walks the frame's block headers in file order, starting right after the frame header.
+  // Each data block becomes a descriptor; descriptors are queued for the decoders in batches
+  // of at least lz4_decode_batch_decompressed_bytes of planned output. A zero block-size word
+  // (the end mark) ends the walk, after which the footer is validated, the last partial batch
+  // is queued, and all windows before the end of the file are marked scanned.
+  void scan_lz4_metadata()
+  {
+    lz4_resident_windows_t resident(windows);
+    std::vector<resident_block_desc_t> batch;
+    batch.reserve(lz4_decode_batch_decompressed_bytes / input.block_max_size_ + 1);
+    std::size_t batch_decoded_bytes = 0;
+    std::size_t offset              = input.header_size_;
+    std::size_t decompressed_offset = 0;
+    blocks_scanned.store(0, std::memory_order_relaxed);
+
+    while (true) {
+      MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic);
+      wait_range_ready(offset, 4);
+      if (offset + 4 > input.compressed_size_) {
+        mps_parser_fail(error_type_t::ValidationError,
+                        "truncated LZ4 frame while reading block header");
+      }
+      uint32_t raw_block_size = resident.read_u32(offset);
+      offset += 4;
+      // A zero size word is the frame's end mark.
+      if (raw_block_size == 0) { break; }
+
+      // scan_one_block advances offset and decompressed_offset past the block it returns.
+      resident_block_desc_t block =
+        scan_one_block(resident, raw_block_size, offset, decompressed_offset);
+      batch_decoded_bytes += block.decompressed_size;
+      batch.push_back(block);
+      blocks_scanned.fetch_add(1, std::memory_order_relaxed);
+      // Checked before the batch is queued, so a block beyond the slot limit is never decoded.
+      if (blocks_scanned.load(std::memory_order_relaxed) > input.block_slot_count_) {
+        mps_parser_fail(error_type_t::OutOfMemoryError,
+                        "LZ4 input block count exceeded reserved metadata slots");
+      }
+      if (batch_decoded_bytes >= lz4_decode_batch_decompressed_bytes) {
+        push_batch(batch);
+        batch_decoded_bytes = 0;
+      }
+    }
+
+    scan_frame_footer(offset, decompressed_offset);
+    push_batch(batch);
+    mark_windows_scanned_before(input.compressed_size_);
+  }
+
+  // Validates one data block and builds its decode descriptor.
+  //
+  //   resident             accessor over the resident compressed windows
+  //   raw_block_size       the block's 32-bit size word (high bit = stored uncompressed)
+  //   offset               in: file offset of the payload; out: offset just past the payload
+  //                        and, when block checksums are enabled, past the 4-byte checksum
+  //   decompressed_offset  in: output offset for this block; out: advanced by the size
+  //                        reserved for it
+  //
+  // Malformed input is reported as a ValidationError through mps_parser_fail. The block
+  // checksum is skipped, not verified.
+  resident_block_desc_t scan_one_block(lz4_resident_windows_t& resident,
+                                       uint32_t raw_block_size,
+                                       std::size_t& offset,
+                                       std::size_t& decompressed_offset)
+  {
+    // --- Decode the block-size word and validate it ---------------------------
+    bool uncompressed              = (raw_block_size & lz4_uncompressed_block) != 0;
+    std::size_t block_payload_size = raw_block_size & lz4_block_size_mask;
+    if (block_payload_size == 0) {
+      mps_parser_fail(error_type_t::ValidationError, "invalid zero-sized LZ4 data block");
+    }
+    if (block_payload_size > input.block_max_size_ && uncompressed) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 uncompressed block exceeds frame block maximum");
+    }
+    if (input.content_size_present_ && decompressed_offset >= input.content_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 frame contains more blocks than content size allows");
+    }
+
+    // --- Wait until the payload bytes are resident ----------------------------
+    wait_range_ready(offset, block_payload_size);
+    if (offset + block_payload_size > input.compressed_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading block payload");
+    }
+
+    // --- Determine the decompressed size --------------------------------------
+    // Compressed blocks expand to block_max_size_ (or the content-size remainder
+    // for the final block); uncompressed blocks keep their payload size.
+    std::size_t decompressed_size = block_payload_size;
+    if (!uncompressed) {
+      decompressed_size =
+        input.content_size_present_
+          ? std::min(input.block_max_size_, input.content_size_ - decompressed_offset)
+          : input.block_max_size_;
+    }
+    if (input.content_size_present_ &&
+        decompressed_size > input.content_size_ - decompressed_offset) {
+      mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size");
+    }
+
+    // --- Stage the payload for the decoder ------------------------------------
+    // Fast path: the whole payload lives in one window, so point the decoder
+    // straight at it (zero copy) and pin that window with a decode_refs bump until
+    // the decode completes. Otherwise it straddles a window boundary: copy it out
+    // into crossing_payloads, which stays alive for the whole run, so no window pin
+    // is needed (and the source window can be released as soon as it is scanned).
+    const char* src          = resident.ptr_if_contiguous(offset, block_payload_size);
+    std::size_t window_index = std::numeric_limits<std::size_t>::max();
+    if (src == nullptr) {
+      crossing_payloads.emplace_back(block_payload_size);
+      resident.copy_to(offset, crossing_payloads.back().data(), block_payload_size);
+      src = crossing_payloads.back().data();
+    } else {
+      window_index = offset / window_bytes;
+      window_state_[window_index].decode_refs.fetch_add(1, std::memory_order_acq_rel);
+    }
+
+    // --- Record the descriptor and advance past the block (+ optional checksum) -
+    // Positional initialization: the order must match resident_block_desc_t's field order.
+    resident_block_desc_t block{src,
+                                block_payload_size,
+                                decompressed_offset,
+                                decompressed_size,
+                                blocks_scanned.load(std::memory_order_relaxed),
+                                window_index,
+                                uncompressed};
+    decompressed_offset += decompressed_size;
+    offset += block_payload_size;
+    mark_windows_scanned_before(offset);
+    if (input.block_checksum_) {
+      wait_range_ready(offset, 4);
+      if (offset + 4 > input.compressed_size_) {
+        mps_parser_fail(error_type_t::ValidationError,
+                        "truncated LZ4 frame while reading block checksum");
+      }
+      offset += 4;
+      mark_windows_scanned_before(offset);
+    }
+    return block;
+  }
+
+  // Validates what follows the end mark. `offset` is the file offset just past the end mark
+  // and is advanced past the 4-byte content checksum when the frame has one (the checksum is
+  // skipped, not verified). Reports a ValidationError when the frame is truncated, when the
+  // planned output size differs from a declared content size, or when bytes remain after the
+  // frame.
+  void scan_frame_footer(std::size_t& offset, std::size_t decompressed_offset)
+  {
+    if (input.content_checksum_) {
+      wait_range_ready(offset, 4);
+      if (offset + 4 > input.compressed_size_) {
+        mps_parser_fail(error_type_t::ValidationError,
+                        "truncated LZ4 frame while reading content checksum");
+      }
+      offset += 4;
+      mark_windows_scanned_before(offset);
+    }
+    if (input.content_size_present_ && decompressed_offset != input.content_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 frame ended before declared content size was reached");
+    }
+    // Only a single frame is supported; anything after it is rejected.
+    if (offset != input.compressed_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 input contains trailing data after the first frame");
+    }
+  }
+
+  // ---- Input + chunking (immutable after construction) ------------------------
+  // The compressed file is split into fixed-size `windows`; `io_threads` reader
+  // threads pull them by index.
+  lz4_input_stream_t& input;
+  const std::size_t window_bytes = lz4_pipeline_batch_bytes;
+  const std::size_t window_count;
+  std::vector<lz4_resident_window_t> windows;
+  const std::size_t io_threads;
+
+  // First-error-wins latch shared by all three stages: stops the pipeline and
+  // retains the first exception for run() to rethrow after the threads join.
+  parallel_error_latch_t latch;
+
+  // ---- Reader -> scanner readiness  (guarded by window_mutex) -----------------
+  // A reader sets window_done[i]=1 once window i is resident; the scanner blocks
+  // on window_cv until every window covering the bytes it needs is ready.
+  std::vector<unsigned char> window_done;
+  std::mutex window_mutex;
+  std::condition_variable window_cv;
+
+  // ---- Window lifecycle / early release ---------------------------------------
+  // windows[i].data is freed exactly once, when the metadata scan has passed window i
+  // (scanned_through_ > i) AND no decoder still pins it (window_state_[i].decode_refs == 0).
+  // scanned_through_ advances monotonically in mark_windows_scanned_before (last_mark_offset_
+  // asserts that monotonicity); decode_refs bumps in scan_one_block and drops in
+  // release_block_window_ref; the per-window `released` CAS makes the free exactly-once.
+  // window_release_mutex serializes the data.reset() + compressed_resident_bytes accounting.
+  std::unique_ptr<window_state_t[]> window_state_;
+  std::atomic_size_t scanned_through_{0};
+  std::size_t last_mark_offset_{0};
+  std::mutex window_release_mutex;
+  std::atomic_size_t compressed_resident_bytes{0};
+
+  // ---- Scanner -> decoder queue  (guarded by desc_mutex) ----------------------
+  // The scanner pushes batches of block descriptors; decoders pop them via desc_cv.
+  // scanner_done signals the scanner has emitted its final batch.
+  std::deque<std::vector<resident_block_desc_t>> desc_queue;
+  bool scanner_done = false;
+  std::mutex desc_mutex;
+  std::condition_variable desc_cv;
+
+  // ---- Scanner scratch / progress ---------------------------------------------
+  // blocks_scanned doubles as the running block index; crossing_payloads holds staged
+  // copies of blocks that straddle a window boundary (see scan_one_block).
+  std::atomic_size_t blocks_scanned{0};
+  std::vector<std::vector<char>> crossing_payloads;
+};
+
+// Decompresses the whole file: builds a pipeline over this stream, runs it to completion, and
+// then publishes the final output size. An exception from run() skips finalize().
+void lz4_input_stream_t::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io);
+  lz4_pipeline_t pipeline(*this);
+  pipeline.run();
+  pipeline.finalize();
+}
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
new file mode 100644
index 0000000000..9d5469e860
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
@@ -0,0 +1,151 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#include <cuda/cmath>
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include <utilities/error.hpp>
+
+#include <limits>
+#include <stdexcept>
+#include <string>
+
+namespace cuopt::linear_programming::io::detail {
+
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_expects;
+using cuopt::linear_programming::io::mps_parser_fail;
+
+// Move-only owner for a Linux mmap range. Fixed sub-maps inside a reserved range
+// are still released by unmapping the owning outer range.
+class mmap_region_t {
+ public:
+  mmap_region_t() = default;
+  mmap_region_t(void* ptr, std::size_t size) noexcept : ptr_(ptr), size_(size) {}
+
+  // Copying is disabled so that exactly one object unmaps a given range.
+  mmap_region_t(const mmap_region_t&)            = delete;
+  mmap_region_t& operator=(const mmap_region_t&) = delete;
+
+  // Takes over the range of `other` and leaves `other` empty.
+  mmap_region_t(mmap_region_t&& other) noexcept
+    : ptr_(other.ptr_),
+      size_(other.size_),
+      unmap_ptr_(other.unmap_ptr_),
+      unmap_size_(other.unmap_size_)
+  {
+    other.forget();
+  }
+
+  // Unmaps the current range, then takes over `other`; self-assignment does nothing.
+  mmap_region_t& operator=(mmap_region_t&& other) noexcept
+  {
+    if (this == &other) { return *this; }
+    reset();
+    ptr_        = other.ptr_;
+    size_       = other.size_;
+    unmap_ptr_  = other.unmap_ptr_;
+    unmap_size_ = other.unmap_size_;
+    other.forget();
+    return *this;
+  }
+
+  ~mmap_region_t() { reset(); }
+
+  // Anonymous mapping of `size` bytes; MAP_ANONYMOUS is OR-ed into `flags`.
+  static mmap_region_t anonymous(std::size_t size, int prot, int flags, const char* context)
+  {
+    return map(nullptr, size, prot, flags | MAP_ANONYMOUS, -1, 0, context);
+  }
+
+  // Anonymous mapping whose data() is a multiple of `alignment` (must be a power of two). Maps
+  // size + alignment bytes, exposes the aligned interior, and later unmaps the whole outer range.
+  static mmap_region_t anonymous_aligned(
+    std::size_t size, std::size_t alignment, int prot, int flags, const char* context)
+  {
+    if (!cuda::is_power_of_two(alignment)) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "mmap aligned allocation requires power-of-two alignment");
+    }
+    if (size > std::numeric_limits<std::size_t>::max() - alignment) {
+      mps_parser_fail(error_type_t::OutOfMemoryError, "mmap aligned allocation size overflow");
+    }
+
+    const std::size_t outer_size = size + alignment;
+    void* outer = mmap_or_fail(nullptr, outer_size, prot, flags | MAP_ANONYMOUS, -1, 0, context);
+    const uintptr_t low_bits = (uintptr_t)(alignment - 1);
+    const uintptr_t aligned  = (reinterpret_cast<uintptr_t>(outer) + alignment - 1) & ~low_bits;
+    return mmap_region_t(reinterpret_cast<void*>(aligned), size, outer, outer_size);
+  }
+
+  // Maps at `address` with MAP_FIXED OR-ed into `flags`; nothing is returned or owned here.
+  static void map_fixed_or_throw(
+    void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context)
+  {
+    mmap_or_fail(address, size, prot, flags | MAP_FIXED, fd, offset, context);
+  }
+
+  // Unmaps the owned range (the outer range when one was recorded) and empties the object.
+  void reset() noexcept
+  {
+    const bool has_outer  = unmap_ptr_ != nullptr;
+    void* const base      = has_outer ? unmap_ptr_ : ptr_;
+    const std::size_t len = has_outer ? unmap_size_ : size_;
+    if (base != nullptr && len != 0) { ::munmap(base, len); }
+    forget();
+  }
+
+  // Passes `advice` to madvise over [data(), data() + size()); the result is ignored.
+  void advise(int advice) const noexcept
+  {
+    if (ptr_ != nullptr && size_ != 0) { ::madvise(ptr_, size_, advice); }
+  }
+
+  // data()/size() describe the usable (possibly aligned) interior, not the outer mapping.
+  void* data() noexcept { return ptr_; }
+  char* char_data() noexcept { return (char*)ptr_; }
+  std::size_t size() const noexcept { return size_; }
+
+ private:
+  mmap_region_t(void* ptr, std::size_t size, void* unmap_ptr, std::size_t unmap_size) noexcept
+    : ptr_(ptr), size_(size), unmap_ptr_(unmap_ptr), unmap_size_(unmap_size)
+  {
+  }
+
+  // ::mmap wrapper: MAP_FAILED is reported through mps_parser_fail with `context` and errno text.
+  static void* mmap_or_fail(
+    void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context)
+  {
+    void* mapped = ::mmap(address, size, prot, flags, fd, offset);
+    if (mapped == MAP_FAILED) {
+      mps_parser_fail(
+        error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno));
+    }
+    return mapped;
+  }
+
+  static mmap_region_t map(
+    void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context)
+  {
+    return mmap_region_t(mmap_or_fail(address, size, prot, flags, fd, offset, context), size);
+  }
+
+  // Clears all four fields without unmapping anything.
+  void forget() noexcept { ptr_ = unmap_ptr_ = nullptr; size_ = unmap_size_ = 0; }
+
+  void* ptr_              = nullptr;
+  std::size_t size_       = 0;
+  void* unmap_ptr_        = nullptr;
+  std::size_t unmap_size_ = 0;
+};
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
new file mode 100644
index 0000000000..3924e2dcd5
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -0,0 +1,478 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "mps_section_scanner.hpp"
+
+#include <utilities/error.hpp>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <initializer_list>
+#include <stdexcept>
+
+#include <simde/x86/avx2.h>
+#include <simde/x86/sse4.2.h>
+
+namespace cuopt::linear_programming::io::detail {
+
+using cuopt::linear_programming::io::error_type_t;
+using cuopt::linear_programming::io::mps_parser_expects;
+using cuopt::linear_programming::io::mps_parser_fail;
+
+namespace {
+
+struct section_record_t {  // one recognized section title
+  mps_section_kind kind;   // section that the title opens
+  const char* name;        // title text compared against the start of a line
+  std::size_t len;         // byte length of `name` used for that comparison
+};
+
+constexpr section_record_t section_records[] = {  // {kind, title, title length}
+  {mps_section_kind::rows, "ROWS", 4},
+  {mps_section_kind::columns, "COLUMNS", 7},
+  {mps_section_kind::rhs, "RHS", 3},
+  {mps_section_kind::bounds, "BOUNDS", 6},
+  {mps_section_kind::ranges, "RANGES", 6},
+  {mps_section_kind::quadobj, "QUADOBJ", 7},
+  {mps_section_kind::qmatrix, "QMATRIX", 7},
+  {mps_section_kind::qcmatrix, "QCMATRIX", 8},
+  {mps_section_kind::endata, "ENDATA", 6},
+};
+// Keywords that may start a line in column 1 without opening one of the sections above.
+constexpr const char* header_records[] = {"NAME", "OBJSENSE", "OBJNAME"};
+
+constexpr std::size_t kSimdWidth = sizeof(simde__m256i);  // bytes per 256-bit vector
+static_assert(kSimdWidth == 32);
+static_assert((std::size_t)mps_section_kind::rows == 0);  // enum values are used as indices
+static_assert((std::size_t)mps_section_kind::endata + 1 == std::size(section_records));
+static_assert((std::size_t)mps_phase_kind::header == 0);
+static_assert((std::size_t)mps_phase_kind::quadratic + 1 == 7);  // 7 == registry phase_count
+
+bool is_nonblank_column1(unsigned char c) noexcept { return c > ' '; }  // scalar column-1 test
+// 0xFF in lanes whose byte is > ' ' unsigned; a signed cmpgt_epi8 would miss bytes >= 0x80.
+simde__m256i nonblank_column1_mask(simde__m256i bytes)
+{
+  return simde_mm256_cmpeq_epi8(simde_mm256_max_epu8(bytes, simde_mm256_set1_epi8('!')), bytes);
+}
+
+enum class section_record_match_t { invalid, header, section };  // is_section_record result
+
+// True when [line_start, line_end) begins with `name` followed by end-of-line or a byte <= ' '.
+bool line_has_record_prefix(const char* line_start, const char* line_end, const char* name)
+{
+  const std::size_t name_len = std::strlen(name);
+  const std::size_t line_len = (std::size_t)(line_end - line_start);
+  if (line_len < name_len || std::memcmp(line_start, name, name_len) != 0) { return false; }
+  // Compare as unsigned char: where plain char is signed, bytes >= 0x80 would pass `<= ' '`.
+  return line_len == name_len || (unsigned char)line_start[name_len] <= ' ';
+}
+
+}  // namespace
+
+// Phase enumerators are used directly as indices into the registry's per-phase arrays.
+std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase) { return (std::size_t)phase; }
+void mps_phase_registry_t::publish(mps_phase_kind phase, mps_phase_range_t range)
+{
+  // First publication wins; a call for a phase that is already ready returns without changes.
+  const std::size_t slot = phase_index(phase);
+  std::unique_lock<std::mutex> guard(mutex_);
+  if (ready_[slot].load(std::memory_order_acquire)) { return; }
+  ranges_[slot] = range;
+  ready_[slot].store(true, std::memory_order_release);
+  // Take the attached, not-yet-fulfilled event (if any) and fulfill it after unlocking.
+  const bool must_fulfill = has_event_[slot] && !event_fulfilled_[slot];
+  omp_event_handle_t pending{};
+  if (must_fulfill) {
+    pending                = events_[slot];
+    event_fulfilled_[slot] = true;
+  }
+  guard.unlock();
+  if (must_fulfill) { omp_fulfill_event(pending); }
+}
+
+void mps_phase_registry_t::attach_event(mps_phase_kind phase, omp_event_handle_t event)
+{
+  // Registers `event` for `phase`. When the phase has already been published and its event
+  // has not been fulfilled yet, the event is fulfilled by this call instead, once the lock
+  // has been released.
+  const std::size_t slot = phase_index(phase);
+  std::unique_lock<std::mutex> guard(mutex_);
+  events_[slot]                = event;
+  has_event_[slot]             = true;
+  const bool already_published = ready_[slot].load(std::memory_order_acquire);
+  const bool must_fulfill      = already_published && !event_fulfilled_[slot];
+  if (must_fulfill) { event_fulfilled_[slot] = true; }
+  guard.unlock();
+  if (must_fulfill) { omp_fulfill_event(event); }
+}
+
+bool mps_phase_registry_t::ready(mps_phase_kind phase) const
+{
+  return ready_[phase_index(phase)].load(std::memory_order_acquire);  // pairs with publish()
+}
+
+mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const
+{
+  const std::size_t idx = phase_index(phase);
+  [[maybe_unused]] const bool is_ready = ready_[idx].load(std::memory_order_acquire);
+  assert(is_ready);  // the load above stays in NDEBUG builds; only this check is compiled out
+  return ranges_[idx];
+}
+
+void mps_phase_registry_t::publish_endata(const char* begin, bool present)
+{
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (endata_ready_.load(std::memory_order_acquire)) { return; }  // first call wins
+  endata_begin_   = begin;
+  endata_present_ = present;
+  endata_ready_.store(true, std::memory_order_release);  // publishes the two fields above
+}
+
+bool mps_phase_registry_t::endata_ready() const
+{
+  return endata_ready_.load(std::memory_order_acquire);  // pairs with publish_endata()'s store
+}
+
+const char* mps_phase_registry_t::endata_begin() const
+{
+  assert(endata_ready());  // NOTE(review): compiled out under NDEBUG, so no acquire load then
+  return endata_begin_;
+}
+
+bool mps_phase_registry_t::endata_present() const
+{
+  assert(endata_ready());  // NOTE(review): compiled out under NDEBUG, so no acquire load then
+  return endata_present_;
+}
+
+static section_record_match_t is_section_record(const char* line_start,
+                                                const char* line_end,
+                                                mps_section_kind* kind)
+{
+  // Classifies the column-1 line [line_start, line_end) as a header keyword, a section title,
+  // or invalid. *kind is written only when the result is `section`.
+  if (line_start >= line_end) { return section_record_match_t::invalid; }
+
+  // Header keywords are tested before section titles.
+  const auto starts_line = [&](const char* name) {
+    return line_has_record_prefix(line_start, line_end, name);
+  };
+  if (std::any_of(std::begin(header_records), std::end(header_records), starts_line)) {
+    return section_record_match_t::header;
+  }
+
+  const std::size_t line_len = (std::size_t)(line_end - line_start);
+  const auto is_padding      = [](char c) { return c == ' ' || c == '\t' || c == '\r'; };
+  for (const section_record_t& record : section_records) {
+    const bool title_matches =
+      line_len >= record.len && std::memcmp(line_start, record.name, record.len) == 0;
+    if (!title_matches) { continue; }
+    // The first title that prefixes the line decides the outcome. "QCMATRIX <row>" must carry
+    // an argument after the padding; every other title must be alone on its line.
+    const char* rest          = std::find_if_not(line_start + record.len, line_end, is_padding);
+    const bool has_argument   = rest != line_end;
+    const bool wants_argument = record.kind == mps_section_kind::qcmatrix;
+    if (has_argument != wants_argument) { return section_record_match_t::invalid; }
+    *kind = record.kind;
+    return section_record_match_t::section;
+  }
+  return section_record_match_t::invalid;
+}
+
+mps_section_block_scanner_t::mps_section_block_scanner_t(const char* data,
+                                                         std::size_t block_count,
+                                                         mps_phase_registry_t& registry)
+  : data_(data),
+    block_count_(block_count),
+    registry_(registry),
+    block_decoded_(std::make_unique<std::atomic<unsigned char>[]>(block_count)),
+    block_begin_offsets_(std::make_unique<std::atomic_size_t[]>(block_count)),
+    block_end_offsets_(std::make_unique<std::atomic_size_t[]>(block_count))
+{
+  // Explicitly zero every per-block flag and offset with relaxed stores.
+  const auto zero = [](auto& cell) { cell.store(0, std::memory_order_relaxed); };
+  std::for_each_n(block_decoded_.get(), block_count_, zero);
+  std::for_each_n(block_begin_offsets_.get(), block_count_, zero);
+  std::for_each_n(block_end_offsets_.get(), block_count_, zero);
+}
+
+std::size_t mps_section_block_scanner_t::section_hit_index(mps_section_kind kind)
+{
+  return static_cast<std::size_t>(kind);  // enumerators double as section_hits_ indices
+}
+
+void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, const char* ptr)
+{
+  // Keep the lowest address, not the first writer: blocks are scanned out of order and titles
+  // such as QCMATRIX repeat. `seen` still satisfies the loop guard iff our CAS installed `ptr`.
+  std::atomic<const char*>& slot = section_hits_[section_hit_index(kind)];
+  const char* seen               = slot.load(std::memory_order_acquire);
+  while ((seen == nullptr || ptr < seen) && !slot.compare_exchange_weak(seen, ptr)) {}
+  if (seen == nullptr || ptr < seen) { notify_ready_phases(); }
+}
+
+void mps_section_block_scanner_t::scan_section_range(const char* begin, const char* end)
+{
+  if (begin >= end) return;
+  const char* p = begin;
+  // Scans [begin, end) for lines whose column 1 is non-blank and classifies each of them.
+  // A scan that does not start at data_ first skips to the byte after the first newline, i.e.
+  // it drops the leading (possibly partial) line. Section titles whose newline/title bytes
+  // straddle adjacent LZ4 blocks are covered by the separate boundary scan.
+  if (p != data_) {
+    const void* nl = __builtin_memchr(p, '\n', (std::size_t)(end - p));
+    if (nl == nullptr) { return; }
+    p = (const char*)nl + 1;
+  }
+  // Classifies one candidate line: records section titles, rejects unknown column-1 records.
+  auto try_candidate = [&](const char* line_start) {
+    const void* nl       = __builtin_memchr(line_start, '\n', (std::size_t)(end - line_start));
+    const char* line_end = nullptr;
+    if (nl == nullptr) {
+      const char* ready_ptr = data_ + ready_bytes_.load(std::memory_order_acquire);
+      if (end != ready_ptr) { return; }  // unterminated line: judged only at the frontier
+      line_end = end;
+    } else {
+      line_end = (const char*)nl;
+    }
+    if (*line_start == '*' || *line_start == '$') { return; }  // skipped without classification
+    mps_section_kind kind;
+    section_record_match_t match = is_section_record(line_start, line_end, &kind);
+    if (match == section_record_match_t::section) {
+      record_section_hit(kind, line_start);
+      return;
+    }
+    if (match == section_record_match_t::invalid) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "unknown section record: %.*s",
+                      (int)(line_end - line_start),
+                      line_start);
+    }
+  };
+
+  // The very first line of the buffer (usually the NAME indicator) has no preceding newline.
+  if (p == data_) {
+    if (p < end && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); }
+    ++p;
+  }
+
+  // In compliant MPS, indicator records begin in column 1 while data records begin in
+  // column 2+, so the SIMD scan uses "\n[nonblank]" as its needle.
+  const simde__m256i newline = simde_mm256_set1_epi8('\n');
+  while ((std::size_t)(end - p) >= kSimdWidth) {
+    // p > data_ here (the first-line path above advances p when p == data_), so p - 1 is
+    // in-bounds. Loading the vector shifted back by one byte lets a single AVX2 mask test
+    // "\nX" for all 32 candidate column-1 bytes.
+    // Unaligned loads (loadu) are comparable to aligned reads on modern SSE/AVX.
+    // NOTE(review): might warrant a check of how this lowers on ARM.
+    simde__m256i current  = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p));
+    simde__m256i previous = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p - 1));
+    std::uint32_t mask    = (std::uint32_t)simde_mm256_movemask_epi8(simde_mm256_and_si256(
+      simde_mm256_cmpeq_epi8(previous, newline), nonblank_column1_mask(current)));
+    while (mask != 0) {
+      int bit = __builtin_ctz(mask);
+      try_candidate(p + bit);
+      mask &= mask - 1;
+    }
+    p += kSimdWidth;
+  }
+
+  // Scalar tail: fewer than kSimdWidth bytes remain.
+  while (p < end) {
+    if (*(p - 1) == '\n' && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); }
+    ++p;
+  }
+}
+
+void mps_section_block_scanner_t::scan_boundary(std::size_t left_index, std::size_t right_index)
+{
+  // Rescan at most boundary_overlap bytes on each side of the seam, clamped to both blocks.
+  const std::size_t lo     = block_begin_offsets_[left_index].load(std::memory_order_acquire);
+  const std::size_t seam   = block_begin_offsets_[right_index].load(std::memory_order_acquire);
+  const std::size_t hi     = block_end_offsets_[right_index].load(std::memory_order_acquire);
+  const std::size_t reach  = boundary_overlap;
+  const std::size_t before = std::min(seam - lo, reach);
+  const std::size_t after  = std::min(hi - seam, reach);
+  scan_section_range(data_ + (seam - before), data_ + (seam + after));
+}
+
+// Entry point for producers: scans one freshly decoded block for section titles, records its
+// extent, rescans the seams shared with already-decoded neighbours, and advances the frontier.
+void mps_section_block_scanner_t::observe_block(std::size_t block_index,
+                                                const char* begin,
+                                                const char* end)
+{
+  if (block_index >= block_count_) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "MPS section scanner observed invalid LZ4 block index");
+  }
+  const auto is_decoded = [this](std::size_t index) {
+    return block_decoded_[index].load(std::memory_order_acquire) != 0;
+  };
+
+  // Scan first, then record the extent: the release store on block_decoded_ is what makes the
+  // two relaxed offset stores visible to threads that acquire-load the flag.
+  scan_section_range(begin, end);
+  block_begin_offsets_[block_index].store((std::size_t)(begin - data_), std::memory_order_relaxed);
+  block_end_offsets_[block_index].store((std::size_t)(end - data_), std::memory_order_relaxed);
+  block_decoded_[block_index].store(1, std::memory_order_release);
+
+  // A title may straddle a seam, so rescan around each neighbour that is already decoded.
+  const bool has_left  = block_index > 0;
+  const bool has_right = block_index + 1 < block_count_;
+  if (has_left && is_decoded(block_index - 1)) { scan_boundary(block_index - 1, block_index); }
+  if (has_right && is_decoded(block_index + 1)) { scan_boundary(block_index, block_index + 1); }
+
+  // Extend the contiguous decoded-byte frontier and publish any newly bounded phases.
+  advance_ready_frontier();
+}
+
+void mps_section_block_scanner_t::advance_ready_frontier()
+{
+  std::size_t new_ready = 0;
+  bool grew             = false;
+  std::unique_lock<std::mutex> lock(frontier_mutex_);  // guards next_block_
+  while (next_block_ < block_count_ &&
+         block_decoded_[next_block_].load(std::memory_order_acquire)) {
+    new_ready = block_end_offsets_[next_block_++].load(std::memory_order_acquire);
+    grew      = true;
+  }
+  lock.unlock();  // the rescan and publication below run outside frontier_mutex_
+  if (grew) { publish_ready(new_ready); }
+}
+
+void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes)
+{
+  // Raise-only: callers can arrive out of order, and a stale value must not shrink the frontier.
+  std::size_t seen = ready_bytes_.load(std::memory_order_relaxed);
+  while (seen < ready_bytes && !ready_bytes_.compare_exchange_weak(seen, ready_bytes)) {}
+  std::size_t begin = ready_bytes > boundary_overlap ? ready_bytes - boundary_overlap : 0;
+  scan_section_range(data_ + begin, data_ + ready_bytes);
+  notify_ready_phases();
+}
+
+std::size_t mps_section_block_scanner_t::ready_bytes() const noexcept
+{
+  return ready_bytes_.load(std::memory_order_acquire);  // current decoded-byte frontier
+}
+
+void mps_section_block_scanner_t::notify_ready_phases()
+{
+  // Publication model: a present phase spans from its own section title to the first later
+  // title that has been discovered; an optional phase is published present=false once a later
+  // boundary proves it is absent. ENDATA, or the final ready frontier when no ENDATA is
+  // available (truncated/non-newline files), is the closing boundary for the trailing phases.
+  std::lock_guard<std::mutex> lock(publish_mutex_);
+  std::size_t ready     = ready_bytes_.load(std::memory_order_acquire);
+  const char* ready_ptr = data_ + ready;
+  const char* rows =
+    section_hits_[section_hit_index(mps_section_kind::rows)].load(std::memory_order_acquire);
+  const char* columns =
+    section_hits_[section_hit_index(mps_section_kind::columns)].load(std::memory_order_acquire);
+  const char* rhs =
+    section_hits_[section_hit_index(mps_section_kind::rhs)].load(std::memory_order_acquire);
+  const char* bounds =
+    section_hits_[section_hit_index(mps_section_kind::bounds)].load(std::memory_order_acquire);
+  const char* ranges =
+    section_hits_[section_hit_index(mps_section_kind::ranges)].load(std::memory_order_acquire);
+  const char* quadobj =
+    section_hits_[section_hit_index(mps_section_kind::quadobj)].load(std::memory_order_acquire);
+  const char* qmatrix =
+    section_hits_[section_hit_index(mps_section_kind::qmatrix)].load(std::memory_order_acquire);
+  const char* qcmatrix =
+    section_hits_[section_hit_index(mps_section_kind::qcmatrix)].load(std::memory_order_acquire);
+  const char* endata =
+    section_hits_[section_hit_index(mps_section_kind::endata)].load(std::memory_order_acquire);
+  // A recorded hit counts only once the ready frontier has reached it.
+  auto available = [&](const char* p) { return p != nullptr && p <= ready_ptr; };
+  bool final_ready =  // no blocks at all, or last block decoded and the frontier at its end
+    block_count_ == 0 ||
+    (block_decoded_[block_count_ - 1].load(std::memory_order_acquire) &&
+     ready == block_end_offsets_[block_count_ - 1].load(std::memory_order_acquire));
+  const char* final_boundary    = available(endata) ? endata : (final_ready ? ready_ptr : nullptr);
+  // Lowest available candidate strictly after `after` (any available one when `after` is null).
+  auto earliest_available_after = [&](const char* after,
+                                      std::initializer_list<const char*> candidates) {
+    const char* best = nullptr;
+    for (const char* p : candidates) {
+      if (!available(p) || (after != nullptr && p <= after)) { continue; }
+      if (best == nullptr || p < best) { best = p; }
+    }
+    return best;
+  };
+  // Publishes `phase` as present once bounded, or as absent once a later boundary is available.
+  auto publish_optional = [&](mps_phase_kind phase,
+                              const char* self,
+                              const char* predecessor,
+                              std::initializer_list<const char*> later_candidates) {
+    if (registry_.ready(phase)) { return; }
+    if (available(self)) {
+      const char* end = earliest_available_after(self, later_candidates);
+      if (end != nullptr) { registry_.publish(phase, {self, end, true}); }
+      return;
+    }
+    if (predecessor != nullptr &&
+        earliest_available_after(predecessor, later_candidates) != nullptr) {
+      registry_.publish(phase, {nullptr, nullptr, false});
+    }
+  };
+
+  // Three publication shapes follow:
+  //   (1) header/rows/columns: published as soon as the section that bounds them is available.
+  //   (2) rhs/ranges/bounds via publish_optional: present once bounded, absent once a later
+  //       boundary is available. NOTE(review): this assumes file order RHS, RANGES, BOUNDS.
+  //   (3) quadratic: begins at the earliest quad title and always ends at final_boundary.
+  if (available(rows) && !registry_.ready(mps_phase_kind::header)) {
+    registry_.publish(mps_phase_kind::header, {data_, rows, true});
+  }
+  if (available(rows) && available(columns) && !registry_.ready(mps_phase_kind::rows)) {
+    registry_.publish(mps_phase_kind::rows, {rows, columns, true});
+  }
+  if (available(columns) && !registry_.ready(mps_phase_kind::columns)) {
+    const char* columns_end = earliest_available_after(
+      columns, {rhs, ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+    if (columns_end != nullptr) {
+      registry_.publish(mps_phase_kind::columns, {columns, columns_end, true});
+    }
+  }
+
+  publish_optional(mps_phase_kind::rhs,
+                   rhs,
+                   columns,
+                   {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+  publish_optional(mps_phase_kind::ranges,
+                   ranges,
+                   rhs ? rhs : columns,
+                   {bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+  publish_optional(mps_phase_kind::bounds,
+                   bounds,
+                   ranges ? ranges : (rhs ? rhs : columns),
+                   {quadobj, qmatrix, qcmatrix, final_boundary});
+
+  if (!registry_.ready(mps_phase_kind::quadratic)) {
+    const char* quadratic_begin = nullptr;
+    if (available(quadobj)) { quadratic_begin = quadobj; }
+    if (available(qmatrix) && (quadratic_begin == nullptr || qmatrix < quadratic_begin)) {
+      quadratic_begin = qmatrix;
+    }
+    if (available(qcmatrix) && (quadratic_begin == nullptr || qcmatrix < quadratic_begin)) {
+      quadratic_begin = qcmatrix;
+    }
+    if (quadratic_begin != nullptr && final_boundary != nullptr) {
+      registry_.publish(mps_phase_kind::quadratic, {quadratic_begin, final_boundary, true});
+    } else if (quadratic_begin == nullptr && final_boundary != nullptr) {
+      registry_.publish(mps_phase_kind::quadratic, {nullptr, nullptr, false});
+    }
+  }
+
+  if (available(endata)) {
+    registry_.publish_endata(endata, true);
+  } else if (final_ready && final_boundary != nullptr) {
+    registry_.publish_endata(final_boundary, false);
+  }
+}
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
new file mode 100644
index 0000000000..5d05e8b2f8
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
@@ -0,0 +1,146 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+
+#include <omp.h>
+
+// The section scanner takes freshly read/decoded blocks and scans them for section titles while
+// they are still warm in cache. It then publishes the read/decoded input ranges to the parser
+// workers, which handle their respective sections in parallel.
+
+namespace cuopt::linear_programming::io::detail {
+
+enum class mps_section_kind {  // column-1 section titles; values index section_hits_
+  rows,      // "ROWS"
+  columns,   // "COLUMNS"
+  rhs,       // "RHS"
+  bounds,    // "BOUNDS"
+  ranges,    // "RANGES"
+  quadobj,   // "QUADOBJ"
+  qmatrix,   // "QMATRIX"
+  qcmatrix,  // "QCMATRIX <row>"
+  endata,    // "ENDATA"
+};
+
+enum class mps_phase_kind {  // parser phases; values index the registry's per-phase arrays
+  header,     // from the start of the input up to ROWS
+  rows,       // ROWS up to COLUMNS
+  columns,    // COLUMNS up to the next discovered section
+  rhs,        // optional
+  bounds,     // optional
+  ranges,     // optional
+  quadratic,  // from the first QUADOBJ/QMATRIX/QCMATRIX title up to the final boundary
+};
+
+struct mps_phase_range_t {      // half-open byte range [begin, end) of one phase
+  const char* begin = nullptr;  // first byte of the phase (its section title; data start for header)
+  const char* end   = nullptr;  // first byte of the bounding section, or the final boundary
+  bool present      = false;    // false, with null pointers, when the phase is published absent
+};
+
+// Hand-off point between the scanner (publish/publish_endata) and readers (ready/range/endata_*).
+class mps_phase_registry_t {
+ public:
+  void publish(mps_phase_kind phase, mps_phase_range_t range);  // first range per phase wins
+  void attach_event(mps_phase_kind phase, omp_event_handle_t event);  // fulfilled once ready
+
+  bool ready(mps_phase_kind phase) const;
+  // range() acquire-loads ready_[phase] (pairs with publish()'s release store) before
+  // reading ranges_[phase]. Callers must not invoke range() until the phase is published.
+  mps_phase_range_t range(mps_phase_kind phase) const;
+
+  void publish_endata(const char* begin, bool present);  // first call wins
+  bool endata_ready() const;
+  const char* endata_begin() const;  // NOTE(review): check endata_ready() first (assert only)
+  bool endata_present() const;       // NOTE(review): check endata_ready() first (assert only)
+
+ private:
+  static constexpr std::size_t phase_count = 7;  // number of mps_phase_kind enumerators
+  static std::size_t phase_index(mps_phase_kind phase);
+
+  // mutex_ guards ranges_/events_/has_event_/event_fulfilled_ and the endata_* fields for writers.
+  // Readers observe ready_[phase] / endata_ready_ (release-stored under the lock on publish,
+  // acquire-loaded here) and may then read the matching range lock-free -- see range()'s contract.
+  mps_phase_range_t ranges_[phase_count]{};
+  std::atomic<bool> ready_[phase_count]{};
+  omp_event_handle_t events_[phase_count]{};
+  bool has_event_[phase_count]{};
+  bool event_fulfilled_[phase_count]{};
+  const char* endata_begin_ = nullptr;
+  bool endata_present_      = false;
+  std::atomic<bool> endata_ready_{false};
+  mutable std::mutex mutex_;
+};
+
+// Turns out-of-order decoded blocks into ordered section-range publications for the parser:
+//
+//   producer --observe_block(i,...)--> [SIMD-scan block i for section titles] --> section_hits_
+//                                       [advance contiguous decoded-byte frontier (ready_bytes_)]
+//                                       --> notify_ready_phases --> registry --> parser tasks
+//
+// Producers (the LZ4 decoders / raw readers) call observe_block for each block in any order.
+// Per block the scanner (1) SIMD-scans it for section titles starting in column 1 and records
+// the first byte of each title in section_hits_ with a CAS; (2) advances a contiguous
+// decoded-byte frontier across whatever leading blocks are now present; and (3) recomputes which
+// phases are fully bounded and publishes their [begin,end) ranges to the registry, unblocking the
+// matching parser task. A title can straddle two blocks, so adjacent decoded blocks are also
+// rescanned over a small overlap (boundary_overlap).
+class mps_section_block_scanner_t {
+ public:
+  mps_section_block_scanner_t(const char* data,
+                              std::size_t block_count,
+                              mps_phase_registry_t& registry);
+
+  // Records a freshly decoded block, scans it for section titles, advances the contiguous
+  // decoded-byte frontier across out-of-order completions, and publishes any newly available
+  // section ranges. Producers may feed blocks in any order.
+  void observe_block(std::size_t block_index, const char* begin, const char* end);
+  // Publishes `ready_bytes` as the frontier, rescans the bytes just before it, and re-notifies.
+  void publish_ready(std::size_t ready_bytes);
+
+  // Current contiguous decoded-byte frontier; producers use this as the final
+  // view size once all blocks have been observed.
+  std::size_t ready_bytes() const noexcept;
+
+ private:
+  static constexpr std::size_t section_count = 9;  // number of mps_section_kind enumerators
+  // Section titles are short; 128 bytes is enough to rescan around a decoded
+  // block boundary and catch a newline/title pair split across adjacent blocks.
+  static constexpr std::size_t boundary_overlap = 128;
+
+  static std::size_t section_hit_index(mps_section_kind kind);
+
+  void scan_section_range(const char* begin, const char* end);
+  void scan_boundary(std::size_t left_index, std::size_t right_index);
+  void record_section_hit(mps_section_kind kind, const char* ptr);
+  void notify_ready_phases();
+  void advance_ready_frontier();
+
+  // Concurrency: observe_block runs concurrently on many producer threads.
+  //   * frontier_mutex_ guards next_block_; publish_ready stores ready_bytes_ outside that lock.
+  //   * publish_mutex_  serializes notify_ready_phases so each phase publishes once, in order.
+  //   * block_decoded_[i] is release-stored after block_begin/end_offsets_[i] (relaxed), so an
+  //     acquire-load of a set flag makes those offsets visible to the reader.
+  //   * section_hits_[k] is CAS-updated and holds a recorded first byte of section k's title.
+  //   * registry_ carries its own internal lock.
+  const char* data_        = nullptr;
+  std::size_t block_count_ = 0;
+  mps_phase_registry_t& registry_;
+  std::mutex publish_mutex_;
+  std::unique_ptr<std::atomic<unsigned char>[]> block_decoded_;
+  std::unique_ptr<std::atomic_size_t[]> block_begin_offsets_;
+  std::unique_ptr<std::atomic_size_t[]> block_end_offsets_;
+  std::atomic_size_t ready_bytes_{0};
+  std::atomic<const char*> section_hits_[section_count]{};
+  std::mutex frontier_mutex_;
+  std::size_t next_block_ = 0;
+};
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
new file mode 100644
index 0000000000..0f47b45f56
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
@@ -0,0 +1,132 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#ifdef MPS_FAST_NVTX
+#include <nvtx3/nvToolsExt.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+namespace cuopt::linear_programming::io::detail::nvtx {
+
+namespace colors {  // 0xAARRGGBB values; scoped_range_t passes them to NVTX as NVTX_COLOR_ARGB
+constexpr std::uint32_t generic  = 0xff8b949e;  // fallback returned by color_for_name
+constexpr std::uint32_t io       = 0xff58a6ff;
+constexpr std::uint32_t decode   = 0xff3fb950;
+constexpr std::uint32_t rows     = 0xffd29922;
+constexpr std::uint32_t columns  = 0xffff7b72;
+constexpr std::uint32_t rhs      = 0xffa371f7;
+constexpr std::uint32_t bounds   = 0xfff0883e;
+constexpr std::uint32_t ranges   = 0xff79c0ff;
+constexpr std::uint32_t names    = 0xff56d364;
+constexpr std::uint32_t alloc    = 0xffdb61a2;
+constexpr std::uint32_t finalize = 0xffc9d1d9;
+}  // namespace colors
+
+// Maps a range name to a palette color by substring search.
+//
+// The rules are tried in table order and the first keyword found anywhere in `name` decides the
+// color, so earlier rows shadow later ones (a name holding both "lz4" and "decode" is colors::io).
+// Names matching no keyword get colors::generic.
+inline std::uint32_t color_for_name(std::string_view name) noexcept
+{
+  struct keyword_color_t {
+    std::string_view keyword;
+    std::uint32_t color;
+  };
+  static constexpr keyword_color_t rules[] = {
+    {"lz4", colors::io},         {"read", colors::io},
+    {"decode", colors::decode},  {"decompress", colors::decode},
+    {"row", colors::rows},
+    {"column", colors::columns}, {"csr", colors::columns},
+    {"rhs", colors::rhs},
+    {"bound", colors::bounds},
+    {"range", colors::ranges},
+    {"name", colors::names},     {"materialize", colors::names},
+    {"alloc", colors::alloc},    {"resize", colors::alloc},      {"mmap", colors::alloc},
+    {"finalize", colors::finalize},
+  };
+  for (const keyword_color_t& rule : rules) {
+    if (name.find(rule.keyword) != std::string_view::npos) { return rule.color; }
+  }
+  return colors::generic;
+}
+
+// RAII guard for one NVTX range; without MPS_FAST_NVTX push() and end() compile to nothing.
+class scoped_range_t {
+ public:
+  scoped_range_t(const scoped_range_t&)            = delete;
+  scoped_range_t& operator=(const scoped_range_t&) = delete;
+
+  explicit scoped_range_t(const char* name,
+                          std::uint32_t color    = colors::generic,
+                          std::uint32_t category = 0)
+  {
+    push(name, color, category);
+  }
+
+  explicit scoped_range_t(std::string name,
+                          std::uint32_t color    = colors::generic,
+                          std::uint32_t category = 0)
+    : owned_name_(std::move(name))  // label is kept alive in this guard for its whole lifetime
+  {
+    push(owned_name_.c_str(), color, category);
+  }
+
+  ~scoped_range_t() { end(); }
+
+  void end()  // pops the range early; repeated calls and the destructor then do nothing
+  {
+#ifdef MPS_FAST_NVTX
+    if (!active_) { return; }
+    nvtxRangePop();
+    active_ = false;
+#endif
+  }
+
+ private:
+  void push([[maybe_unused]] const char* name,
+            [[maybe_unused]] std::uint32_t color,
+            [[maybe_unused]] std::uint32_t category)
+  {
+#ifdef MPS_FAST_NVTX
+    nvtxEventAttributes_t attributes{};
+    attributes.version       = NVTX_VERSION;
+    attributes.size          = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+    attributes.category      = category;
+    attributes.colorType     = NVTX_COLOR_ARGB;
+    attributes.color         = color;
+    attributes.messageType   = NVTX_MESSAGE_TYPE_ASCII;
+    attributes.message.ascii = name;
+    nvtxRangePushEx(&attributes);
+    active_ = true;
+#endif
+  }
+
+  std::string owned_name_;
+#ifdef MPS_FAST_NVTX
+  bool active_ = false;
+#endif
+};
+
+inline void name_current_thread([[maybe_unused]] const char* name)
+{
+#ifdef MPS_FAST_NVTX  // without NVTX this function is an empty no-op
+  nvtxNameOsThreadA(static_cast<std::uint32_t>(::syscall(SYS_gettid)), name);
+#endif  // MPS_FAST_NVTX
+}
+
+}  // namespace cuopt::linear_programming::io::detail::nvtx
+
+#define MPS_FAST_NVTX_CONCAT_INNER(a, b) a##b  // raw token paste
+#define MPS_FAST_NVTX_CONCAT(a, b)       MPS_FAST_NVTX_CONCAT_INNER(a, b)  // expands a, b first
+#define MPS_NVTX_RANGE(name, color)                                                   \
+  ::cuopt::linear_programming::io::detail::nvtx::scoped_range_t MPS_FAST_NVTX_CONCAT( \
+    _mps_nvtx_range_, __LINE__)(name, color)  // declares a guard named _mps_nvtx_range_<line>
diff --git a/cpp/src/io/file_to_string.cpp b/cpp/src/io/file_to_string.cpp
index 77b92d90e9..30d9c41f9f 100644
--- a/cpp/src/io/file_to_string.cpp
+++ b/cpp/src/io/file_to_string.cpp
@@ -9,6 +9,8 @@
 
 #include <utilities/error.hpp>
 
+#include <algorithm>
+#include <cctype>
 #include <cstdio>
 #include <memory>
 #include <string>
@@ -22,9 +24,9 @@
 #include <zlib.h>
 #endif  // MPS_PARSER_WITH_ZLIB
 
-#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB)
+#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB) || defined(MPS_PARSER_WITH_LZ4)
 #include <dlfcn.h>
-#endif  // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB
+#endif  // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB || MPS_PARSER_WITH_LZ4
 
 namespace {
 using cuopt::linear_programming::io::error_type_t;
@@ -207,22 +209,184 @@ std::vector<char> zlib_file_to_string(const std::string& file)
 }  // end namespace
 #endif  // MPS_PARSER_WITH_ZLIB
 
+#ifdef MPS_PARSER_WITH_LZ4
+namespace {
+// Minimal liblz4 frame ABI declarations (lz4frame.h is not included); keep in sync with lz4frame.h.
+struct LZ4F_dctx;
+using LZ4F_errorCode_t = size_t;
+struct LZ4F_frameInfo_t {  // filled in by LZ4F_getFrameInfo through a pointer
+  int blockSizeID;
+  int blockMode;
+  int contentChecksumFlag;
+  int frameType;
+  unsigned long long contentSize;  // only read as a reserve() hint by lz4_file_to_string
+  unsigned dictID;
+  int blockChecksumFlag;
+};
+using LZ4F_createDecompressionContext_t = LZ4F_errorCode_t (*)(LZ4F_dctx**, unsigned);
+using LZ4F_freeDecompressionContext_t   = LZ4F_errorCode_t (*)(LZ4F_dctx*);
+using LZ4F_getFrameInfo_t               = LZ4F_errorCode_t (*)(LZ4F_dctx*,
+                                                 LZ4F_frameInfo_t*,
+                                                 const void*,
+                                                 size_t*);
+using LZ4F_decompress_t =
+  LZ4F_errorCode_t (*)(LZ4F_dctx*, void*, size_t*, const void*, size_t*, const void*);
+using LZ4F_isError_t      = unsigned (*)(LZ4F_errorCode_t);
+using LZ4F_getErrorName_t = const char* (*)(LZ4F_errorCode_t);
+
+// Decompresses the LZ4-frame file `file` and returns its bytes followed by a '\0' terminator.
+// liblz4 is resolved with dlopen/dlsym inside this call, so it is only required when a .lz4
+// input is actually opened; failures are reported through mps_parser_expects.
+std::vector<char> lz4_file_to_string(const std::string& file)
+{
+  struct DlCloseDeleter {
+    void operator()(void* fp)
+    {
+      mps_parser_expects_fatal(
+        dlclose(fp) == 0, error_type_t::ValidationError, "Error closing liblz4.so!");
+    }
+  };
+  struct Lz4DctxDeleter {
+    void operator()(LZ4F_dctx* f)
+    {
+      if (f != nullptr) {
+        const LZ4F_errorCode_t err = fptr(f);
+        mps_parser_expects_fatal(
+          !is_error(err), error_type_t::ValidationError, "Error closing lz4 file!");
+      }
+    }
+    LZ4F_freeDecompressionContext_t fptr = nullptr;
+    LZ4F_isError_t is_error              = nullptr;
+  };
+
+  void* raw_lz4handle = nullptr;
+  for (const char* soname : {"liblz4.so.1", "liblz4.so"}) {
+    raw_lz4handle = dlopen(soname, RTLD_LAZY);
+    if (raw_lz4handle != nullptr) break;
+  }
+  std::unique_ptr<void, DlCloseDeleter> lz4handle{raw_lz4handle};
+  mps_parser_expects(lz4handle != nullptr,
+                     error_type_t::ValidationError,
+                     "Could not open .lz4 file since liblz4 was not found "
+                     "(tried liblz4.so.1, liblz4.so). In order to open .lz4 files directly, "
+                     "please ensure liblz4 is installed. Alternatively, decompress the .lz4 file "
+                     "manually and open the uncompressed file. Given path: %s",
+                     file.c_str());
+
+  const auto sym = [&lz4handle](const char* name) { return dlsym(lz4handle.get(), name); };
+  const auto LZ4F_createDecompressionContext =
+    reinterpret_cast<LZ4F_createDecompressionContext_t>(sym("LZ4F_createDecompressionContext"));
+  const auto LZ4F_freeDecompressionContext =
+    reinterpret_cast<LZ4F_freeDecompressionContext_t>(sym("LZ4F_freeDecompressionContext"));
+  const auto LZ4F_getFrameInfo = reinterpret_cast<LZ4F_getFrameInfo_t>(sym("LZ4F_getFrameInfo"));
+  const auto LZ4F_decompress   = reinterpret_cast<LZ4F_decompress_t>(sym("LZ4F_decompress"));
+  const auto LZ4F_isError      = reinterpret_cast<LZ4F_isError_t>(sym("LZ4F_isError"));
+  const auto LZ4F_getErrorName = reinterpret_cast<LZ4F_getErrorName_t>(sym("LZ4F_getErrorName"));
+  mps_parser_expects(
+    LZ4F_createDecompressionContext != nullptr && LZ4F_freeDecompressionContext != nullptr &&
+      LZ4F_getFrameInfo != nullptr && LZ4F_decompress != nullptr && LZ4F_isError != nullptr &&
+      LZ4F_getErrorName != nullptr,
+    error_type_t::ValidationError,
+    "Error loading liblz4! Library version might be incompatible. Please decompress the .lz4 "
+    "file manually and open the uncompressed file. Given path: %s",
+    file.c_str());
+
+  std::unique_ptr<FILE, FcloseDeleter> fp{fopen(file.c_str(), "rb")};
+  mps_parser_expects(fp != nullptr,
+                     error_type_t::ValidationError,
+                     "Error opening input file! Given path: %s",
+                     file.c_str());
+  mps_parser_expects(fseek(fp.get(), 0L, SEEK_END) == 0,
+                     error_type_t::ValidationError,
+                     "Error seeking input file! Given path: %s",
+                     file.c_str());
+  const long compressed_size = ftell(fp.get());
+  mps_parser_expects(compressed_size != -1L,
+                     error_type_t::ValidationError,
+                     "Error sizing input file! Given path: %s",
+                     file.c_str());
+  std::vector<char> compressed(compressed_size);
+  rewind(fp.get());
+  mps_parser_expects(fread(compressed.data(), sizeof(char), compressed_size, fp.get()) ==
+                       static_cast<size_t>(compressed_size),
+                     error_type_t::ValidationError,
+                     "Error reading input file! Given path: %s",
+                     file.c_str());
+
+  constexpr unsigned lz4f_version = 100;
+  LZ4F_dctx* raw_dctx             = nullptr;
+  LZ4F_errorCode_t lz4_status     = LZ4F_createDecompressionContext(&raw_dctx, lz4f_version);
+  mps_parser_expects(!LZ4F_isError(lz4_status),
+                     error_type_t::ValidationError,
+                     "Could not open lz4 compressed file '%s': %s",
+                     file.c_str(),
+                     LZ4F_getErrorName(lz4_status));
+  std::unique_ptr<LZ4F_dctx, Lz4DctxDeleter> dctx{raw_dctx,
+                                                  {LZ4F_freeDecompressionContext, LZ4F_isError}};
+
+  const char* src = compressed.data();
+  size_t src_size = compressed.size();
+  LZ4F_frameInfo_t frame_info{};
+  size_t src_used = src_size;
+  lz4_status      = LZ4F_getFrameInfo(dctx.get(), &frame_info, src, &src_used);
+  mps_parser_expects(!LZ4F_isError(lz4_status),
+                     error_type_t::ValidationError,
+                     "Error reading lz4 frame info for input file '%s': %s",
+                     file.c_str(),
+                     LZ4F_getErrorName(lz4_status));
+  src += src_used;
+  src_size -= src_used;
+
+  std::vector<char> buf;
+  // contentSize is read straight from the unverified frame header: use it only as a capacity
+  // hint and ignore it when it exceeds roughly 255x the compressed bytes (LZ4's best-case ratio).
+  const unsigned long long size_hint = frame_info.contentSize;
+  if (size_hint > 0 && size_hint / 255 <= compressed.size()) { buf.reserve((size_t)size_hint + 1); }
+  std::vector<char> readbuf(size_t{1} << 24);  // 16MiB output window reused by every call below
+  while (lz4_status != 0) {  // LZ4F_decompress returns 0 once the frame is fully decoded
+    size_t dst_size = readbuf.size();
+    src_used        = src_size;
+    lz4_status = LZ4F_decompress(dctx.get(), readbuf.data(), &dst_size, src, &src_used, nullptr);
+    mps_parser_expects(!LZ4F_isError(lz4_status),
+                       error_type_t::ValidationError,
+                       "Error in lz4 decompression of input file '%s': %s",
+                       file.c_str(),
+                       LZ4F_getErrorName(lz4_status));
+    if (dst_size > 0) { buf.insert(buf.end(), begin(readbuf), begin(readbuf) + dst_size); }
+    src += src_used;
+    src_size -= src_used;
+    mps_parser_expects(src_used != 0 || dst_size != 0 || lz4_status == 0,
+                       error_type_t::ValidationError,
+                       "Stalled lz4 decompression of input file! Given path: %s",
+                       file.c_str());
+  }
+  buf.push_back('\0');
+  return buf;
+}
+}  // end namespace
+#endif  // MPS_PARSER_WITH_LZ4
+
 namespace cuopt::linear_programming::io::detail {
 
 std::vector<char> file_to_string(const std::string& file)
 {
+  std::string lower(file);
+  std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
+    return (char)std::tolower(c);
+  });
+
 #ifdef MPS_PARSER_WITH_BZIP2
-  if (file.size() > 4 && file.substr(file.size() - 4, 4) == ".bz2") {
-    return bz2_file_to_string(file);
-  }
+  if (lower.ends_with(".bz2")) { return bz2_file_to_string(file); }
 #endif  // MPS_PARSER_WITH_BZIP2
 
 #ifdef MPS_PARSER_WITH_ZLIB
-  if (file.size() > 3 && file.substr(file.size() - 3, 3) == ".gz") {
-    return zlib_file_to_string(file);
-  }
+  if (lower.ends_with(".gz")) { return zlib_file_to_string(file); }
 #endif  // MPS_PARSER_WITH_ZLIB
 
+#ifdef MPS_PARSER_WITH_LZ4
+  if (lower.ends_with(".lz4")) { return lz4_file_to_string(file); }
+#endif  // MPS_PARSER_WITH_LZ4
+
   // Faster than using C++ I/O
   std::unique_ptr<FILE, FcloseDeleter> fp{fopen(file.c_str(), "r")};
   mps_parser_expects(fp != nullptr,
diff --git a/cpp/src/io/file_to_string.hpp b/cpp/src/io/file_to_string.hpp
index 94b2df821d..3b1924e12c 100644
--- a/cpp/src/io/file_to_string.hpp
+++ b/cpp/src/io/file_to_string.hpp
@@ -17,6 +17,7 @@ namespace cuopt::linear_programming::io::detail {
 // The dispatcher looks at the extension:
 //   - ".bz2" → libbz2 (dlopen'd at runtime), if MPS_PARSER_WITH_BZIP2.
 //   - ".gz"  → libz   (dlopen'd at runtime), if MPS_PARSER_WITH_ZLIB.
+//   - ".lz4" → liblz4 (dlopen'd at runtime), if MPS_PARSER_WITH_LZ4.
 //   - otherwise → plain fopen.
 // The returned buffer's size includes the null terminator.
 std::vector<char> file_to_string(const std::string& file);
diff --git a/cpp/src/io/mps_parser.cpp b/cpp/src/io/mps_parser.cpp
index 5f7cecda94..9d4dea2bbf 100644
--- a/cpp/src/io/mps_parser.cpp
+++ b/cpp/src/io/mps_parser.cpp
@@ -797,9 +797,9 @@ void mps_parser_t<i_t, f_t>::parse_rows(std::string_view line)
   }
   if (type == Objective) {
     // Keep only the first name or OBJNAME since it was set before
-    if (objective_name.empty())
-      objective_name = name;
-    else
+    if (objective_name.empty()) objective_name = name;
+    // aligns with CPLEX/SCIP behavior
+    else if (name != objective_name)
       ignored_objective_names.emplace(name);
     // If we wanted to strictly follow MPS definition: a new objective row ('N') should be treated
     // as an unbounded constraints, aka an extra contraints row with lower bound -infinity and upper
diff --git a/cpp/src/io/parser.cpp b/cpp/src/io/parser.cpp
index 93d9d9c73c..c9b3a351c6 100644
--- a/cpp/src/io/parser.cpp
+++ b/cpp/src/io/parser.cpp
@@ -7,8 +7,13 @@
 
 #include <cuopt/linear_programming/io/parser.hpp>
 
+#include <experimental_mps_fast/fast_parser.hpp>
 #include <mps_parser_internal.hpp>
 
+#include <utilities/logger.hpp>
+
+#include <cstdint>
+
 namespace cuopt::linear_programming::io {
 
 template <typename i_t, typename f_t>
@@ -35,4 +40,18 @@ template mps_data_model_t<int, float> read_mps_from_string(std::string_view mps_
 template mps_data_model_t<int, double> read_mps_from_string(std::string_view mps_contents,
                                                             bool fixed_mps_format);
 
+template <typename i_t, typename f_t>
+mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path)
+{  // experimental entry point: log the path, then delegate to detail::parse_mps_fast_file
+  CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str());
+  return detail::parse_mps_fast_file<i_t, f_t>(mps_file_path);
+}
+
+template mps_data_model_t<int, float> read_mps_fast_experimental(const std::string& mps_file_path);
+template mps_data_model_t<int, double> read_mps_fast_experimental(const std::string& mps_file_path);
+template mps_data_model_t<int64_t, float> read_mps_fast_experimental(
+  const std::string& mps_file_path);
+template mps_data_model_t<int64_t, double> read_mps_fast_experimental(
+  const std::string& mps_file_path);
+
 }  // namespace cuopt::linear_programming::io
diff --git a/cpp/src/io/utilities/error.hpp b/cpp/src/io/utilities/error.hpp
index 58ac3891e1..c1b28fc7ff 100644
--- a/cpp/src/io/utilities/error.hpp
+++ b/cpp/src/io/utilities/error.hpp
@@ -34,6 +34,30 @@ inline std::string error_to_string(error_type_t error)
   return std::string("UnAccountedError");
 }
 
+[[noreturn]] inline void mps_parser_throw(error_type_t error_type, const char* msg)
+{
+  const std::string type = error_to_string(error_type);  // msg is embedded verbatim, unescaped
+  throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + type + "\", \"msg\": \"" + msg + "\"}");
+}
+
+/**
+ * @brief Unconditionally raise a parser error built from a printf-style message.
+ *
+ * @param error_type Category of the error, forwarded to mps_parser_throw
+ * @param fmt printf-style format string for the error message
+ * @param ... Arguments consumed by fmt; the formatted text is cut off after 2047 characters
+ * @throw std::logic_error always
+ */
+[[noreturn]] inline void mps_parser_fail(error_type_t error_type, const char* fmt, ...)
+{
+  char text[2048];
+  va_list ap;
+  va_start(ap, fmt);
+  vsnprintf(text, sizeof(text), fmt, ap);
+  va_end(ap);
+  mps_parser_throw(error_type, text);
+}
+
 /**
  * @brief Function for checking (pre-)conditions that throws an exception when a
  * condition is false
@@ -52,9 +76,7 @@ inline void mps_parser_expects(bool cond, error_type_t error_type, const char* f
     char msg[2048];
     vsnprintf(msg, sizeof(msg), fmt, args);
     va_end(args);
-
-    throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + error_to_string(error_type) +
-                           "\", \"msg\": " + "\"" + std::string(msg) + "\"}");
+    mps_parser_throw(error_type, msg);
   }
 }
 
diff --git a/cpp/src/utilities/perf_counters.hpp b/cpp/src/utilities/perf_counters.hpp
new file mode 100644
index 0000000000..70658aa9b3
--- /dev/null
+++ b/cpp/src/utilities/perf_counters.hpp
@@ -0,0 +1,194 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <array>
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+namespace cuopt::linear_programming::io::detail {
+
+// Helpers that report this process's resident set size (physical pages in use), in kB.
+// Returns the unsigned decimal value that follows `key` at the start of `line`, or 0 when the
+// line does not start with `key` or no digits follow it. Uses no library calls.
+inline size_t parse_status_kb_line(const char* line, const char* key)
+{
+  for (; *key != '\0'; ++key, ++line) {
+    if (*line != *key) { return 0; }  // also stops safely when `line` is shorter than `key`
+  }
+  while (*line == ' ' || *line == '\t') { ++line; }
+  size_t value = 0;
+  for (; *line >= '0' && *line <= '9'; ++line) { value = value * 10 + (size_t)(*line - '0'); }
+  return value;
+}
+
+// Reads VmRSS and VmHWM from /proc/self/status and returns them as {rss_kb, hwm_kb}.
+// A field that is missing, or the whole pair when the file cannot be opened, is reported as 0.
+inline std::pair<size_t, size_t> current_process_rss_kb()
+{
+  std::pair<size_t, size_t> usage{0, 0};
+  FILE* status = std::fopen("/proc/self/status", "r");
+  if (status == nullptr) { return usage; }
+  char line[256];
+  // Stop reading as soon as both fields are known; a field is only parsed while it is still 0.
+  while ((usage.first == 0 || usage.second == 0) && std::fgets(line, sizeof(line), status)) {
+    if (usage.first == 0) { usage.first = parse_status_kb_line(line, "VmRSS:"); }
+    if (usage.second == 0) { usage.second = parse_status_kb_line(line, "VmHWM:"); }
+  }
+  std::fclose(status);
+  return usage;
+}
+
+struct perf_counter_spec_t {
+  const char* name;  // label printed by print_perf_totals
+  uint32_t type;     // copied into perf_event_attr::type
+  uint64_t config;   // copied into perf_event_attr::config
+};
+
+static constexpr uint64_t perf_cache_config(uint64_t cache, uint64_t op, uint64_t result)
+{
+  return (result << 16) | (op << 8) | cache;  // config word for PERF_TYPE_HW_CACHE events
+}
+
+// Small scoped Linux perf_event_open wrapper for coarse phase diagnostics.
+//
+// Important limitations:
+// - Counters are per-thread: construct one instance inside each worker whose
+//   work should be measured, then aggregate snapshots.
+// - These are generic perf events; exact mappings vary by CPU. Some events may
+//   be unavailable or unhelpful, e.g. store-side DTLB misses on this node.
+// - This deliberately does not use event groups or time_enabled/time_running
+//   scaling, so counts are approximate if the kernel multiplexes counters.
+static constexpr std::array<perf_counter_spec_t, 8> PERF_COUNTER_SPECS = {{
+  {"cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES},
+  {"instructions", PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS},
+  {"cache_refs", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES},
+  {"cache_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES},
+  {"branch_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES},
+  {"backend_stall_cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND},
+  {"dtlb_load_misses",
+   PERF_TYPE_HW_CACHE,
+   perf_cache_config(
+     PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_RESULT_MISS)},
+  {"dtlb_store_misses",
+   PERF_TYPE_HW_CACHE,
+   perf_cache_config(
+     PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_WRITE, PERF_COUNT_HW_CACHE_RESULT_MISS)},
+}};
+
+struct perf_counter_snapshot_t {
+  bool active                                            = false;  // any counter was opened
+  int open_errno                                         = 0;  // errno of the first failed open
+  std::array<uint64_t, PERF_COUNTER_SPECS.size()> values = {};  // indexed like PERF_COUNTER_SPECS
+};
+
+class thread_perf_counters_t {  // opens and starts every PERF_COUNTER_SPECS event on construction
+ public:
+  thread_perf_counters_t()
+  {
+    fds_.fill(-1);
+    for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) {
+      perf_event_attr attr = {};
+      attr.type            = PERF_COUNTER_SPECS[i].type;
+      attr.size            = sizeof(attr);
+      attr.config          = PERF_COUNTER_SPECS[i].config;
+      attr.disabled        = 1;  // opened stopped; enabled below after every open was attempted
+      attr.exclude_kernel  = 1;
+      attr.exclude_hv      = 1;
+
+      int fd = (int)syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);  // pid, cpu, group, flags
+      if (fd < 0) {
+        if (first_errno_ == 0) { first_errno_ = errno; }
+        continue;  // this counter stays at fd -1; the remaining ones are still tried
+      }
+      fds_[i] = fd;
+      active_ = true;
+    }
+
+    if (active_) {
+      for (int fd : fds_) {
+        if (fd >= 0) {
+          ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+          ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+        }
+      }
+    }
+  }
+
+  thread_perf_counters_t(const thread_perf_counters_t&)            = delete;
+  thread_perf_counters_t& operator=(const thread_perf_counters_t&) = delete;
+
+  ~thread_perf_counters_t() { close_all(); }
+
+  perf_counter_snapshot_t stop()  // disables, reads and closes every open counter
+  {
+    perf_counter_snapshot_t snapshot;
+    snapshot.active     = active_;
+    snapshot.open_errno = first_errno_;
+
+    for (size_t i = 0; i < fds_.size(); ++i) {
+      int fd = fds_[i];
+      if (fd < 0) continue;
+      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+      uint64_t value = 0;
+      if (read(fd, &value, sizeof(value)) == (ssize_t)sizeof(value)) { snapshot.values[i] = value; }
+    }
+    close_all();
+    active_ = false;  // a second stop() therefore returns an inactive, all-zero snapshot
+    return snapshot;
+  }
+
+ private:
+  void close_all()
+  {
+    for (int& fd : fds_) {
+      if (fd >= 0) {
+        close(fd);
+        fd = -1;
+      }
+    }
+  }
+
+  bool active_     = false;  // true once at least one counter was opened
+  int first_errno_ = 0;
+  std::array<int, PERF_COUNTER_SPECS.size()> fds_;  // -1 marks a counter that is not open
+};
+
+// Sums the counters of all active snapshots and prints one "[PERF] <label> ..." line to stderr,
+// ending with ipc (instructions/cycles) and cache_miss_rate (cache_misses/cache_refs).
+static inline void print_perf_totals(const char* label,
+                                     const std::vector<perf_counter_snapshot_t>& snapshots)
+{
+  std::array<unsigned long long, PERF_COUNTER_SPECS.size()> sums = {};
+  int reported_errno                                             = 0;
+  bool have_data                                                 = false;
+  for (const perf_counter_snapshot_t& snap : snapshots) {
+    if (reported_errno == 0) { reported_errno = snap.open_errno; }
+    if (snap.active) {
+      have_data = true;
+      for (size_t c = 0; c < sums.size(); ++c) { sums[c] += snap.values[c]; }
+    }
+  }
+  if (!have_data) {
+    std::fprintf(stderr, "[PERF] %s unavailable errno=%d\n", label, reported_errno);
+    return;
+  }
+  std::fprintf(stderr, "[PERF] %s", label);
+  for (size_t c = 0; c < sums.size(); ++c) {
+    std::fprintf(stderr, " %s=%llu", PERF_COUNTER_SPECS[c].name, sums[c]);
+  }
+  // Indices follow PERF_COUNTER_SPECS: 0 cycles, 1 instructions, 2 cache_refs, 3 cache_misses.
+  const double ipc       = sums[0] == 0 ? 0.0 : (double)sums[1] / (double)sums[0];
+  const double miss_rate = sums[2] == 0 ? 0.0 : (double)sums[3] / (double)sums[2];
+  std::fprintf(stderr, " ipc=%.3f cache_miss_rate=%.6f\n", ipc, miss_rate);
+}
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/tests/linear_programming/CMakeLists.txt b/cpp/tests/linear_programming/CMakeLists.txt
index bc057db1e2..6db30755c3 100644
--- a/cpp/tests/linear_programming/CMakeLists.txt
+++ b/cpp/tests/linear_programming/CMakeLists.txt
@@ -21,6 +21,16 @@ ConfigureTest(MPS_PARSER_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/parser_test.cpp
     LABELS numopt)
 
+ConfigureTest(MPS_FAST_PARSER_TEST
+    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_fp64_parser_test.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_edge_test.cpp
+    LABELS numopt)
+target_include_directories(MPS_FAST_PARSER_TEST
+    PRIVATE
+    "${CUOPT_TEST_DIR}/../src/io/experimental_mps_fast"
+)
+target_link_libraries(MPS_FAST_PARSER_TEST PRIVATE simde::simde)
+
 # ##################################################################################################
 # - C API Tests----------------------------------------------------------------------
 # The C API tests require a separate library to be linked against. So we don't use the ConfigureTest macro.
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
new file mode 100644
index 0000000000..8bde21bb61
--- /dev/null
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
@@ -0,0 +1,188 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "fast_fp64_parser.hpp"
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <bit>
+#include <cerrno>
+#include <clocale>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <random>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace cuopt::linear_programming::io::detail {
+
+namespace {
+
+uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }
+
+// Reference conversion used by the tests: rewrites 'd'/'D' exponent markers to 'e' and then
+// lets std::strtod parse the text.
+double reference_strtod(std::string_view token)
+{
+  std::string text(token);
+  std::replace(text.begin(), text.end(), 'd', 'e');
+  std::replace(text.begin(), text.end(), 'D', 'e');
+  errno = 0;
+  return std::strtod(text.c_str(), nullptr);
+}
+
+double parse_token(std::string_view token)
+{
+  const char* p = token.data();  // cursor handed to the parser; its final position is dropped
+  return fp64::parse_fp64_advance(p, token.data() + token.size());
+}
+
+// Expects parse_fp64_advance to match std::strtod bit-for-bit on `token`, both for the bare
+// token and for the token followed by 40 spaces (where the cursor must stop at the token's end).
+void check_bitwise_strtod(std::string_view token)
+{
+  std::string c_text(token);  // strtod input: 'd'/'D' exponent markers rewritten to 'e'
+  std::replace(c_text.begin(), c_text.end(), 'd', 'e');
+  std::replace(c_text.begin(), c_text.end(), 'D', 'e');
+  errno       = 0;
+  char* c_end = nullptr;
+  const double expected = std::strtod(c_text.c_str(), &c_end);
+  EXPECT_EQ(c_end, c_text.c_str() + c_text.size());
+
+  const std::string spaced = std::string(token) + std::string(40, ' ');
+  const char* cursor       = spaced.data();
+  const double from_spaced = fp64::parse_fp64_advance(cursor, spaced.data() + spaced.size());
+  EXPECT_EQ(cursor, spaced.data() + token.size());
+
+  const uint64_t want_bits = bits(expected);
+  EXPECT_EQ(want_bits, bits(parse_token(token))) << "token parse mismatch for '" << token << "'";
+  EXPECT_EQ(want_bits, bits(from_spaced)) << "padded parse mismatch for '" << token << "'";
+}
+
+std::string random_token(std::mt19937_64& rng)  // builds one numeric token of at most 25 chars
+{
+  std::uniform_int_distribution<int> sign_dist(0, 4);   // 0: '-', 1: '+', 2-4: no sign
+  std::uniform_int_distribution<int> digit_dist(0, 9);
+  std::uniform_int_distribution<int> shape_dist(0, 5);  // 0: 0.frac, 1: int, 2-5: int.frac, 5: +exp
+  std::uniform_int_distribution<int> len_dist(1, 19);
+  std::uniform_int_distribution<int> exp_dist(-30, 30);
+
+  std::string token;
+  int sign = sign_dist(rng);
+  if (sign == 0) {
+    token.push_back('-');
+  } else if (sign == 1) {
+    token.push_back('+');
+  }
+
+  int shape = shape_dist(rng);
+  if (shape == 0) {
+    token.append("0.");
+    int frac_len = std::uniform_int_distribution<int>(1, 19)(rng);
+    for (int i = 0; i < frac_len; ++i) {
+      token.push_back(static_cast<char>('0' + digit_dist(rng)));
+    }
+  } else {
+    int int_len = len_dist(rng);
+    token.push_back(static_cast<char>('1' + std::uniform_int_distribution<int>(0, 8)(rng)));  // 1-9
+    for (int i = 1; i < int_len; ++i) {
+      token.push_back(static_cast<char>('0' + digit_dist(rng)));
+    }
+    if (shape >= 2) {
+      token.push_back('.');
+      int remaining = 24 - static_cast<int>(token.size());  // fraction may fill up to 24 chars
+      int max_frac  = std::max(0, std::min(19, remaining));
+      int frac_len  = max_frac == 0 ? 0 : std::uniform_int_distribution<int>(0, max_frac)(rng);
+      for (int i = 0; i < frac_len; ++i) {
+        token.push_back(static_cast<char>('0' + digit_dist(rng)));
+      }
+    }
+  }
+
+  if (shape == 5) {
+    int exp            = exp_dist(rng);
+    std::string suffix = "e" + std::to_string(exp);
+    if (token.size() + suffix.size() <= 25) { token += suffix; }  // exponent dropped if too long
+  }
+
+  if (token.size() > 25) { token.resize(25); }  // safety net; the steps above already stay <= 25
+  return token;
+}
+
+}  // namespace
+
+TEST(FastFp64ParserTest, CommonTableMatchesStrtodBitwise)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  const std::vector<std::string_view> cases = {
+    "0",
+    "-0",
+    "1",
+    "-1",
+    "+1",
+    "2",
+    "42",
+    "123456789",
+    "57.",
+    "-57.",
+    "0.1",
+    "0.01",
+    "0.12345678901234",
+    "0.1234567890123456",
+    "0.3333333333333333",
+    "0.6508282938248958",
+    "3.14159",
+    "3130000",
+    "8594600.16",
+    "2344.55",
+    "0.000000000000001",
+    "9999999999999999",
+    "1844674407370955161",
+    "1e0",
+    "1e-9",
+    "1E12",
+    "-2.5e3",
+    "3.125D-2",
+  };
+
+  for (std::string_view token : cases) {
+    check_bitwise_strtod(token);
+  }
+}
+
+TEST(FastFp64ParserTest, CursorAdvancesToTokenEnd)
+{
+  std::setlocale(LC_NUMERIC, "C");  // reference_strtod relies on '.' as the decimal point
+  std::string text = "123.45  ABC";
+  const char* p    = text.data();
+  double value     = fp64::parse_fp64_advance(p, text.data() + text.size());
+
+  EXPECT_EQ(bits(reference_strtod("123.45")), bits(value));
+  EXPECT_EQ(text.data() + 6, p);  // 6 == length of "123.45"
+  EXPECT_EQ(std::string_view("  ABC"), std::string_view(p, 5));  // rest of the text is untouched
+}
+
+TEST(FastFp64ParserTest, RejectsMalformedNumericSuffix)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  for (const char* token : {"1x", "1e", "1d+", "1e+"}) {  // stray letter or exponent without digits
+    SCOPED_TRACE(token);
+    EXPECT_THROW(parse_token(token), std::exception);
+  }
+}
+
+TEST(FastFp64ParserTest, FixedSeedRandomDifferential)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  std::mt19937_64 rng(0x4d50535f46415354ULL);  // fixed seed keeps the token sequence reproducible
+  for (int i = 0; i < 100000; ++i) {
+    std::string token = random_token(rng);
+    ASSERT_LE(token.size(), 25U);  // aborts the test at the first over-long token
+    check_bitwise_strtod(token);
+  }
+}
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
new file mode 100644
index 0000000000..771462a9ab
--- /dev/null
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
@@ -0,0 +1,936 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "fast_parser.hpp"
+#include "mps_section_scanner.hpp"
+
+#include <cuopt/linear_programming/io/parser.hpp>
+#include <mps_parser_internal.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <bit>
+#include <cerrno>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <vector>
+
+#include <unistd.h>
+
+namespace cuopt::linear_programming::io::detail {
+
+namespace {
+
+// Temporary .mps file under /tmp holding `contents`; unlinked when this object is destroyed.
+struct TempMpsFile {
+  explicit TempMpsFile(std::string contents)
+  {
+    char path_template[128];
+    std::snprintf(path_template, sizeof(path_template),
+                  "/tmp/mps_fast_parser_edge_%ld_XXXXXX.mps", static_cast<long>(getpid()));
+    int fd = mkstemps(path_template, 4);
+    if (fd < 0) {
+      throw std::runtime_error(std::string("mkstemps failed: ") + std::strerror(errno));
+    }
+    path       = path_template;
+    FILE* file = fdopen(fd, "wb");
+    if (file == nullptr) { fail("fdopen failed: ", errno, fd); }
+    if (!contents.empty() &&
+        std::fwrite(contents.data(), 1, contents.size(), file) != contents.size()) {
+      const int write_errno = errno;  // fclose() may overwrite errno
+      std::fclose(file);
+      fail("failed to write temporary MPS file: ", write_errno);
+    }
+    if (std::fclose(file) != 0) { fail("failed to close temporary MPS file: ", errno); }
+  }
+
+  TempMpsFile(const TempMpsFile&)            = delete;
+  TempMpsFile& operator=(const TempMpsFile&) = delete;
+
+  ~TempMpsFile()
+  {
+    if (!path.empty()) { std::remove(path.c_str()); }
+  }
+  std::string path;
+ private:
+  // A throwing constructor skips ~TempMpsFile, so release fd (if given) and unlink the file here.
+  [[noreturn]] void fail(const char* what, int err, int fd = -1) const
+  {
+    if (fd >= 0) { close(fd); }
+    std::remove(path.c_str());
+    throw std::runtime_error(std::string(what) + std::strerror(err));
+  }
+};
+
+struct TempOwnedPath {  // deletes the file at `path` on destruction; never creates it
+  explicit TempOwnedPath(std::string p) : path(std::move(p)) {}
+  TempOwnedPath(const TempOwnedPath&)            = delete;
+  TempOwnedPath& operator=(const TempOwnedPath&) = delete;
+
+  ~TempOwnedPath()
+  {
+    if (!path.empty()) { std::remove(path.c_str()); }  // result ignored: file may not exist
+  }
+
+  std::string path;
+};
+
+std::string_view range_text(const mps_phase_range_t& range)  // empty view if range absent
+{
+  return range.present ? std::string_view(range.begin, static_cast<size_t>(range.end - range.begin))
+                       : std::string_view{};
+}
+
+uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }  // for exact compares
+
+// Expects equal sizes and identical bytes (memcmp) instead of element-wise operator==.
+template <typename T>
+void expect_vectors_bitwise_equal(const std::vector<T>& reference,
+                                  const std::vector<T>& fast,
+                                  std::string_view field, std::string_view context)
+{
+  static_assert(std::is_trivially_copyable_v<T>);
+  SCOPED_TRACE(std::string(context) + " " + std::string(field));
+  ASSERT_EQ(reference.size(), fast.size()) << "size";
+  const size_t byte_count = reference.size() * sizeof(T);
+  if (byte_count != 0) { EXPECT_EQ(0, std::memcmp(reference.data(), fast.data(), byte_count)); }
+}
+
+// Field-by-field comparison of the fast parser's model against the reference parser's model.
+void check_models_match_reference_bitwise(const parser_model_t<int, double>& fast,
+                                          const mps_data_model_t<int, double>& reference,
+                                          std::string_view context)
+{
+  const std::string ctx(context);
+  EXPECT_EQ(reference.n_vars_, fast.n_vars_) << ctx << " n_vars";
+  EXPECT_EQ(reference.n_constraints_, fast.n_constraints_) << ctx << " n_constraints";
+  EXPECT_EQ(reference.get_nnz(), fast.get_nnz()) << ctx << " nnz";
+  EXPECT_EQ(reference.maximize_, fast.maximize_) << ctx << " maximize";
+  EXPECT_EQ(reference.problem_name_, fast.problem_name_) << ctx << " problem_name";
+  EXPECT_EQ(reference.objective_name_, fast.objective_name_) << ctx << " objective_name";
+
+  EXPECT_EQ(bits(reference.objective_scaling_factor_), bits(fast.objective_scaling_factor_))
+    << ctx << " objective_scaling_factor";
+  EXPECT_EQ(bits(reference.objective_offset_), bits(fast.objective_offset_))
+    << ctx << " objective_offset";
+
+  expect_vectors_bitwise_equal(reference.A_, fast.A_, "A", context);
+  EXPECT_EQ(reference.A_indices_, fast.A_indices_) << ctx << " A_indices";
+  EXPECT_EQ(reference.A_offsets_, fast.A_offsets_) << ctx << " A_offsets";
+  expect_vectors_bitwise_equal(reference.b_, fast.b_, "b", context);
+  expect_vectors_bitwise_equal(reference.c_, fast.c_, "c", context);
+  expect_vectors_bitwise_equal(reference.variable_lower_bounds_,
+                               fast.variable_lower_bounds_,
+                               "variable_lower_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.variable_upper_bounds_,
+                               fast.variable_upper_bounds_,
+                               "variable_upper_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.constraint_lower_bounds_,
+                               fast.constraint_lower_bounds_,
+                               "constraint_lower_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.constraint_upper_bounds_,
+                               fast.constraint_upper_bounds_,
+                               "constraint_upper_bounds",
+                               context);
+  EXPECT_EQ(reference.var_types_, fast.var_types_) << ctx << " var_types";
+  EXPECT_EQ(reference.row_types_, fast.row_types_) << ctx << " row_types";
+  EXPECT_EQ(reference.var_names_, fast.var_names_) << ctx << " var_names";
+  EXPECT_EQ(reference.row_names_, fast.row_names_) << ctx << " row_names";
+
+  ASSERT_EQ(reference.quadratic_constraints_.size(), fast.quadratic_constraints_.size())
+    << ctx << " quadratic_constraints size";
+  for (size_t qi = 0; qi < reference.quadratic_constraints_.size(); ++qi) {
+    const auto& expected = reference.quadratic_constraints_[qi];
+    const auto& actual   = fast.quadratic_constraints_[qi];
+    SCOPED_TRACE(ctx + " quadratic_constraint " + std::to_string(qi));
+    EXPECT_EQ(expected.constraint_row_index, actual.constraint_row_index);
+    EXPECT_EQ(expected.constraint_row_name, actual.constraint_row_name);
+    EXPECT_EQ(expected.constraint_row_type, actual.constraint_row_type);
+    EXPECT_EQ(bits(expected.rhs_value), bits(actual.rhs_value));
+    expect_vectors_bitwise_equal(
+      expected.linear_values, actual.linear_values, "linear_values", context);
+    EXPECT_EQ(expected.linear_indices, actual.linear_indices);
+    expect_vectors_bitwise_equal(expected.vals, actual.vals, "qc_vals", context);
+    EXPECT_EQ(expected.rows, actual.rows);
+    EXPECT_EQ(expected.cols, actual.cols);
+  }
+}
+
+mps_data_model_t<int, double> parse_reference_model(const std::string& path)
+{
+  mps_data_model_t<int, double> reference;
+  // NOTE(review): presumably the constructor parses `path` into `reference`; confirm what `false` selects.
+  mps_parser_t<int, double> parser(reference, path, false);
+  return reference;
+}
+
+void verify_fixture_bitwise(std::string_view fixture_name, std::string contents)
+{
+  const TempMpsFile fixture(std::move(contents));  // both parsers read the same on-disk file
+  const auto fast_model = parse_mps_fast_file<int, double>(fixture.path, FileReadMethod::Read);
+  const auto ref_model  = parse_reference_model(fixture.path);
+  check_models_match_reference_bitwise(fast_model, ref_model, fixture_name);
+}
+
+std::string row_name(size_t i)  // "R" + i zero-padded to at least six digits, e.g. R000042
+{
+  char label[32];
+  std::snprintf(label, sizeof(label), "R%06zu", i);
+  return label;
+}
+
+// Position of the first variable named `name` in model.var_names_, or -1 when absent.
+int find_var_index(const parser_model_t<int, double>& model, std::string_view name)
+{
+  const auto& names = model.var_names_;
+  const auto hit    = std::find(names.begin(), names.end(), name);
+  return hit == names.end() ? -1 : static_cast<int>(hit - names.begin());
+}
+
+void check_model_shapes(
+  const parser_model_t<int, double>& model, int rows, int vars, int nnz, std::string_view context)
+{
+  const std::string ctx(context);  // prefix for every failure message
+  EXPECT_EQ(rows, model.n_constraints_) << ctx << " rows";
+  EXPECT_EQ(vars, model.n_vars_) << ctx << " vars";
+  EXPECT_EQ(nnz, model.nnz_) << ctx << " nnz";
+  EXPECT_EQ(static_cast<size_t>(rows + 1), model.A_offsets_.size()) << ctx << " offsets";
+  EXPECT_EQ(static_cast<size_t>(nnz), model.A_.size()) << ctx << " values";
+  EXPECT_EQ(static_cast<size_t>(nnz), model.A_indices_.size()) << ctx << " indices";
+}
+
+std::string section_split_fixture()
+{
+  return "NAME SPLITS\n"
+         "ROWS\n"
+         " N OBJ\n"
+         " L R1\n"
+         "COLUMNS\n"
+         " X1 OBJ 1 R1 2\n"
+         "RHS\n"
+         " RHS1 R1 3\n"
+         "BOUNDS\n"
+         " UP BND X1 4\n"
+         "ENDATA\n";
+}
+
+// Returns a copy of `text` with every '\n' expanded to "\r\n"; all other bytes are copied as-is.
+// An existing "\r\n" pair therefore becomes "\r\r\n".
+std::string to_crlf(std::string text)
+{
+  std::string crlf;
+  // Capacity hint only: the original size plus one eighth for the inserted '\r' bytes.
+  crlf.reserve(text.size() + text.size() / 8);
+  for (const char ch : text) {
+    if (ch == '\n') { crlf.push_back('\r'); }
+    crlf.push_back(ch);
+  }
+  return crlf;
+}
+
+}  // namespace
+
+TEST(FastMpsParserEdgeTest, ScannerFindsSectionSplitAcrossBlocks)
+{
+  const std::string mps =
+    "NAME EDGE\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " x1 OBJ 1\n"
+    " x1 rowA 2\n"
+    "RHS\n"
+    " rhs rowA 3\n"
+    "ENDATA\n";
+
+  const size_t columns_pos = mps.find("COLUMNS");
+  ASSERT_NE(std::string::npos, columns_pos) << "failed to place COLUMNS split";
+  const size_t split = columns_pos + 3;  // cut inside the COLUMNS keyword, after "COL"
+
+  mps_phase_registry_t registry;
+  mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+  // The trailing block (index 1) is observed before the leading block (index 0).
+  scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());
+  scanner.publish_ready(0);
+  scanner.observe_block(0, mps.data(), mps.data() + split);
+  scanner.publish_ready(mps.size());
+
+  EXPECT_TRUE(registry.ready(mps_phase_kind::header)) << "header not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic)) << "quadratic sentinel not ready";
+
+  EXPECT_TRUE(range_text(registry.range(mps_phase_kind::columns)).starts_with("COLUMNS"))
+    << "columns range begins at wrong boundary";
+  EXPECT_TRUE(range_text(registry.range(mps_phase_kind::rhs)).starts_with("RHS"))
+    << "rhs range begins at wrong boundary";
+}
+
+TEST(FastMpsParserEdgeTest, ScannerFindsHeadersSplitAtEveryByte)
+{
+  const std::string mps                       = section_split_fixture();
+  const std::vector<std::string_view> headers = {"ROWS", "COLUMNS", "RHS", "BOUNDS", "ENDATA"};
+  // For each header, try every block boundary that falls strictly inside the keyword.
+  for (std::string_view header : headers) {
+    const size_t pos = mps.find(header);
+    ASSERT_NE(std::string::npos, pos) << "missing header in split fixture";
+    for (size_t offset = 1; offset < header.size(); ++offset) {
+      const size_t split = pos + offset;
+      mps_phase_registry_t registry;
+      mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+
+      scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());
+      scanner.observe_block(0, mps.data(), mps.data() + split);
+      scanner.publish_ready(mps.size());
+
+      EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::bounds)) << "bounds not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic))
+        << "quadratic sentinel not ready after split";
+    }
+  }
+}
+
+TEST(FastMpsParserEdgeTest, ScannerRejectsUnknownColumnOneRecordsAfterRows)
+{
+  const std::string mps =
+    "NAME BAD\n"
+    "ROWS\n"
+    " N OBJ\n"
+    "FOO\n"
+    "COLUMNS\n"
+    " x OBJ 1\n"
+    "ENDATA\n";
+
+  EXPECT_THROW(
+    {
+      mps_phase_registry_t registry;
+      mps_section_block_scanner_t scanner(mps.data(), 1, registry);
+      scanner.observe_block(0, mps.data(), mps.data() + mps.size());
+      scanner.publish_ready(mps.size());
+    },
+    std::logic_error);
+}
+
+TEST(FastMpsParserEdgeTest, ParserRejectsUnknownSectionRecords)
+{
+  TempMpsFile file(
+    "NAME BAD_UNKNOWN_SECTION\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 R1 2\n"
+    "RHS\n"
+    " RHS1 R1 3\n"
+    "BOUNDS\n"
+    " FR BND1 X1\n"
+    "QSECTION      R1\n"
+    " X1 X1 1\n"
+    "ENDATA\n");
+
+  EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+               std::exception);
+}
+
+TEST(FastMpsParserEdgeTest, BoundsDefaultsAndTypesMatchReference)
+{
+  verify_fixture_bitwise("bounds_defaults_and_types",
+                         "NAME BOUNDS_EDGE\n"
+                         "ROWS\n"
+                         " N OBJ\n"
+                         " L rowA\n"
+                         "COLUMNS\n"
+                         " XFREE rowA 1\n"
+                         " XUP0 rowA 1\n"
+                         " XNEG rowA 1\n"
+                         " XBV rowA 1\n"
+                         " XFX rowA 1\n"
+                         " XLI rowA 1\n"
+                         "RHS\n"
+                         " RHS1 rowA 10\n"
+                         "BOUNDS\n"
+                         " FR BND XFREE\n"
+                         " UP BND XUP0 0\n"
+                         " UP BND XNEG -1\n"
+                         " BV BND XBV\n"
+                         " FX BND XFX 7\n"
+                         " LI BND XLI 2\n"
+                         " UI BND XLI 9\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, DuplicateBoundsLastStatementWins)
+{
+  const std::string contents =
+    "NAME BOUNDS_DUP\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " X1 rowA 1\n"
+    "RHS\n"
+    " RHS1 rowA 10\n"
+    "BOUNDS\n"
+    " LO BND X1 0\n"
+    " UP BND X1 5\n"
+    " UP BND X1 3\n"
+    " LO BND X1 2\n"
+    "ENDATA\n";
+
+  verify_fixture_bitwise("duplicate_bounds_last_statement_wins", contents);
+  TempMpsFile file(contents);
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  EXPECT_EQ(1, model.n_vars_) << "n_vars";
+  EXPECT_EQ(2.0, model.variable_lower_bounds_.at(0)) << "duplicate lower bound";
+  EXPECT_EQ(3.0, model.variable_upper_bounds_.at(0)) << "duplicate upper bound";
+}
+
+TEST(FastMpsParserEdgeTest, NondenseRowAndColumnNamesUseHashPath)
+{
+  verify_fixture_bitwise("nondense_row_and_column_names",
+                         "NAME HASH_NAMES\n"
+                         "ROWS\n"
+                         " N obj.row\n"
+                         " G demand-east\n"
+                         " L capacity-west\n"
+                         " E balance.17\n"
+                         "COLUMNS\n"
+                         " alpha obj.row 4.5 demand-east 1\n"
+                         " beta_two capacity-west -2 balance.17 3\n"
+                         " z-last demand-east 7 balance.17 -1\n"
+                         "RHS\n"
+                         " rhs demand-east 2 capacity-west 9\n"
+                         " rhs balance.17 0\n"
+                         "BOUNDS\n"
+                         " LO b alpha -5\n"
+                         " UP b beta_two 6\n"
+                         " FR b z-last\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, MissingOptionalBoundsFastPath)
+{
+  TempMpsFile file(
+    "NAME OPTIONALS\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 rowA 2\n"
+    "RHS\n"
+    " RHS1 rowA 0\n"
+    "ENDATA\n");
+
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  EXPECT_EQ(1, model.n_vars_) << "missing optional n_vars";
+  EXPECT_EQ(1, model.n_constraints_) << "missing optional n_constraints";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(0)) << "missing BOUNDS lower default";
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), model.variable_upper_bounds_.at(0));
+}
+
+TEST(FastMpsParserEdgeTest, BoundsOnlyVariablesAreAppendedDeterministically)
+{
+  TempMpsFile file(
+    "NAME BOUNDS_ONLY\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " XMAIN OBJ 1 R1 2\n"
+    "RHS\n"
+    " RHS1 R1 0\n"
+    "BOUNDS\n"
+    " UP B AUX_Z 9\n"
+    " LO B AUX_Z -3\n"
+    " BV B AUX_A\n"
+    " SC B AUX_S 5\n"
+    "ENDATA\n");
+
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  check_model_shapes(model, 1, 4, 1, "bounds-only");
+  EXPECT_EQ(std::string("XMAIN"), model.var_names_.at(0)) << "main var name";
+  EXPECT_EQ(std::string("AUX_A"), model.var_names_.at(1)) << "bounds-only sorted name 1";
+  EXPECT_EQ(std::string("AUX_S"), model.var_names_.at(2)) << "bounds-only sorted name 2";
+  EXPECT_EQ(std::string("AUX_Z"), model.var_names_.at(3)) << "bounds-only sorted name 3";
+
+  const int aux_a = find_var_index(model, "AUX_A");
+  const int aux_s = find_var_index(model, "AUX_S");
+  const int aux_z = find_var_index(model, "AUX_Z");
+  ASSERT_GE(aux_a, 0);
+  ASSERT_GE(aux_s, 0);
+  ASSERT_GE(aux_z, 0);
+  EXPECT_EQ('I', model.var_types_.at(aux_a)) << "bounds-only BV type";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(aux_a)) << "bounds-only BV lb";
+  EXPECT_EQ(1.0, model.variable_upper_bounds_.at(aux_a)) << "bounds-only BV ub";
+  EXPECT_EQ('S', model.var_types_.at(aux_s)) << "bounds-only SC type";
+  EXPECT_EQ(5.0, model.variable_upper_bounds_.at(aux_s)) << "bounds-only SC ub";
+  EXPECT_EQ(-3.0, model.variable_lower_bounds_.at(aux_z)) << "bounds-only duplicate lb";
+  EXPECT_EQ(9.0, model.variable_upper_bounds_.at(aux_z)) << "bounds-only duplicate ub";
+}
+
+TEST(FastMpsParserEdgeTest, IntegerMarkersAssignTypesAndDefaultBounds)
+{
+  TempMpsFile file(
+    "NAME MARKERS\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " MARK000 'MARKER' 'INTORG'\n"
+    " XINT OBJ 1 R1 1\n"
+    " MARK001 'MARKER' 'INTEND'\n"
+    " XCONT OBJ 2 R1 2\n"
+    " MARK002 'MARKER' 'INTORG'\n"
+    " XBIN OBJ 3 R1 3\n"
+    " MARK003 'MARKER' 'INTEND'\n"
+    "RHS\n"
+    " RHS1 R1 10\n"
+    "ENDATA\n");
+
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  check_model_shapes(model, 1, 3, 3, "integer markers");
+  const int xint  = find_var_index(model, "XINT");
+  const int xcont = find_var_index(model, "XCONT");
+  const int xbin  = find_var_index(model, "XBIN");
+  ASSERT_GE(xint, 0);
+  ASSERT_GE(xcont, 0);
+  ASSERT_GE(xbin, 0);
+  EXPECT_EQ('I', model.var_types_.at(xint)) << "XINT type";
+  EXPECT_EQ('C', model.var_types_.at(xcont)) << "XCONT type";
+  EXPECT_EQ('I', model.var_types_.at(xbin)) << "XBIN type";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xint)) << "XINT default lb";
+  EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xint)) << "XINT default ub";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xbin)) << "XBIN default lb";
+  EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xbin)) << "XBIN default ub";
+}
+
+TEST(FastMpsParserEdgeTest, NumericParsingIntegrationMatchesReferenceBitwise)
+{
+  verify_fixture_bitwise("numeric_parsing_integration",
+                         "NAME NUMBERS\n"
+                         "ROWS\n"
+                         " N OBJ\n"
+                         " L R1\n"
+                         " G R2\n"
+                         " E R3\n"
+                         "COLUMNS\n"
+                         " X0 OBJ 0.12345678901234 R1 1e-9\n"
+                         " X1 OBJ -2.5E3 R2 0.12345678901234567890123\n"
+                         " X2 R3 9999999999999999\n"
+                         "RHS\n"
+                         " RHS1 R1 3.14159 R2 -0.000000000000001\n"
+                         " RHS1 R3 42\n"
+                         "RANGES\n"
+                         " RNG R1 0.25 R2 1E2\n"
+                         "BOUNDS\n"
+                         " LO B X0 -123456789\n"
+                         " UP B X0 123456789\n"
+                         " FX B X1 0.3333333333333333\n"
+                         " FR B X2\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, CrlfLineEndingsMatchReferenceBitwise)
+{
+  verify_fixture_bitwise("crlf_line_endings",
+                         to_crlf("NAME CRLF_EDGE\n"
+                                 "OBJSENSE\n"
+                                 " MAX\n"
+                                 "ROWS\n"
+                                 " N OBJ\n"
+                                 " L R1\n"
+                                 "COLUMNS\n"
+                                 " X1 OBJ 1 R1 2\n"
+                                 "RHS\n"
+                                 " RHS1 R1 3\n"
+                                 "BOUNDS\n"
+                                 " UP B X1 4\n"
+                                 "ENDATA\n"));
+}
+
+TEST(FastMpsParserEdgeTest, CommentPlacementSupportedCasesMatchReferenceBitwise)
+{
+  verify_fixture_bitwise("comment_placement_supported_cases",
+                         "* leading star comment\n"
+                         "$ leading dollar comment\n"
+                         "NAME COMMENTS\n"
+                         "$ comment between NAME and ROWS\n"
+                         "ROWS\n"
+                         "* comment after ROWS header\n"
+                         " N OBJ $ row objective comment\n"
+                         "$ comment between ROW records\n"
+                         " L R1 $ row constraint comment\n"
+                         "COLUMNS\n"
+                         "* comment after COLUMNS header\n"
+                         " X1 OBJ 1 R1 2 $ inline column comment\n"
+                         "$ comment before next column\n"
+                         " X2 OBJ -1 R1 3\n"
+                         "RHS\n"
+                         "$ comment after RHS header\n"
+                         " RHS1 R1 5 $ inline rhs comment\n"
+                         "BOUNDS\n"
+                         "* comment after BOUNDS header\n"
+                         " LO B X1 0 $ inline bound comment\n"
+                         "$ comment before ENDATA\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, ObjectiveMetadataSelectsNamedObjective)
+{
+  verify_fixture_bitwise("objective_metadata",
+                         "NAME OBJMETA\n"
+                         "OBJSENSE\n"
+                         " MAX\n"
+                         "OBJNAME\n"
+                         " COST\n"
+                         "ROWS\n"
+                         " N ALT\n"
+                         " N COST\n"
+                         " L R1\n"
+                         "COLUMNS\n"
+                         " X1 ALT 100 COST 5\n"
+                         " X1 R1 1\n"
+                         " X2 COST -2 R1 3\n"
+                         "RHS\n"
+                         " RHS1 COST 7 R1 11\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors)
+{
+  {
+    TempMpsFile file(
+      "NAME BADOBJ\n"
+      "OBJSENSE\n"
+      " SIDEWAYS\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+
+  {
+    TempMpsFile file(
+      "NAME BADCOLROW\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 MISSING 1\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+
+  {
+    TempMpsFile file(
+      "NAME BADRHSROW\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 MISSING 1\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+
+  {
+    TempMpsFile file(
+      "NAME BADBOUND\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "BOUNDS\n"
+      " XX B X1 1\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+
+  {
+    TempMpsFile file(
+      "NAME BADSC\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "BOUNDS\n"
+      " SC B X1\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+}
+
+TEST(FastMpsParserEdgeTest, LargeColumnsRepeatedColumnChunkBoundary)
+{
+  constexpr size_t row_count = 180000;
+  std::string mps;
+  mps.reserve(8 * 1024 * 1024);
+  mps += "NAME BIGCOLS\nROWS\n N OBJ\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps += " L ";
+    mps += row_name(i);
+    mps += '\n';
+  }
+  mps += "COLUMNS\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps += " XBIG ";
+    mps += row_name(i);
+    mps += " 1\n";
+  }
+  mps += " XTAIL ";
+  mps += row_name(1);
+  mps += " 2\nRHS\n RHS1 ";
+  mps += row_name(1);
+  mps += " 0\nENDATA\n";
+
+  TempMpsFile file(std::move(mps));
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  check_model_shapes(
+    model, static_cast<int>(row_count), 2, static_cast<int>(row_count + 1), "large columns");
+  EXPECT_EQ(std::string("XBIG"), model.var_names_.at(0)) << "large repeated column name";
+  EXPECT_EQ(std::string("XTAIL"), model.var_names_.at(1)) << "large tail column name";
+}
+
+TEST(FastMpsParserEdgeTest, LargeBoundsRepeatedVarStaysOrdered)
+{
+  constexpr size_t repeat_count = 700000;
+  std::string mps;
+  mps.reserve(12 * 1024 * 1024);
+  mps +=
+    "NAME BIGBOUNDS\nROWS\n N OBJ\n L R1\nCOLUMNS\n alpha OBJ 1 R1 1\nRHS\n RHS1 R1 0\nBOUNDS\n";
+  for (size_t i = 0; i < repeat_count; ++i) {
+    mps += " UP B alpha ";
+    mps += std::to_string(i % 1000);
+    mps += '\n';
+  }
+  mps += "ENDATA\n";
+
+  TempMpsFile file(std::move(mps));
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  check_model_shapes(model, 1, 1, 1, "large bounds");
+  EXPECT_EQ(static_cast<double>((repeat_count - 1) % 1000), model.variable_upper_bounds_.at(0))
+    << "large repeated bounds last value";
+}
+
+TEST(FastMpsParserEdgeTest, Lz4AndRawPathsMatchOnMultiblockInput)
+{
+  constexpr size_t row_count = 70000;
+  std::string mps;
+  mps.reserve(4 * 1024 * 1024);
+  mps += "NAME LZ4PARITY\nROWS\n N OBJ\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps += " L ";
+    mps += row_name(i);
+    mps += '\n';
+  }
+  mps += "COLUMNS\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps += " X";
+    mps += std::to_string(i);
+    mps += ' ';
+    mps += row_name(i);
+    mps += " 0.125\n";
+  }
+  mps += "RHS\n RHS1 ";
+  mps += row_name(1);
+  mps += " 1\nENDATA\n";
+
+  TempMpsFile raw_file(std::move(mps));
+  TempOwnedPath lz4_file(raw_file.path + ".lz4");
+  const std::string cmd = "lz4 -f -q " + raw_file.path + " " + lz4_file.path;
+  if (std::system(cmd.c_str()) != 0) { GTEST_SKIP() << "lz4 CLI unavailable"; }
+
+  auto raw = parse_mps_fast_file<int, double>(raw_file.path, FileReadMethod::Read);
+  auto lz4 = parse_mps_fast_file<int, double>(lz4_file.path, FileReadMethod::Read);
+
+  check_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity");
+  EXPECT_EQ(raw.var_names_.size(), lz4.var_names_.size()) << "lz4 var name count";
+  EXPECT_EQ(raw.row_names_.size(), lz4.row_names_.size()) << "lz4 row name count";
+  EXPECT_EQ(raw.A_, lz4.A_) << "lz4 A values";
+  EXPECT_EQ(raw.A_indices_, lz4.A_indices_) << "lz4 A indices";
+  EXPECT_EQ(raw.A_offsets_, lz4.A_offsets_) << "lz4 A offsets";
+  EXPECT_EQ(raw.c_, lz4.c_) << "lz4 objective";
+  EXPECT_EQ(raw.b_, lz4.b_) << "lz4 rhs";
+  EXPECT_EQ(raw.var_types_, lz4.var_types_) << "lz4 var types";
+  EXPECT_EQ(raw.variable_lower_bounds_, lz4.variable_lower_bounds_) << "lz4 lower bounds";
+  EXPECT_EQ(raw.variable_upper_bounds_, lz4.variable_upper_bounds_) << "lz4 upper bounds";
+}
+
+TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch)
+{
+  std::string mps;
+  mps += "NAME COMPRESSED\nROWS\n N OBJ\n L R1\n G R2\nCOLUMNS\n";
+  mps += " X1 OBJ 1 R1 2.5\n X2 R1 -3.25 R2 4\n";
+  mps += "RHS\n RHS1 R1 7 R2 8\nBOUNDS\n BV BND X1\n UP BND X2 10\nENDATA\n";
+
+  TempMpsFile raw_file(std::move(mps));
+  TempOwnedPath gzip_file(raw_file.path + ".gz");
+  TempOwnedPath bzip2_file(raw_file.path + ".bz2");
+
+  const std::string gzip_cmd  = "gzip -c " + raw_file.path + " > " + gzip_file.path;
+  const std::string bzip2_cmd = "bzip2 -c " + raw_file.path + " > " + bzip2_file.path;
+  if (std::system(gzip_cmd.c_str()) != 0) { GTEST_SKIP() << "gzip CLI unavailable"; }
+  if (std::system(bzip2_cmd.c_str()) != 0) { GTEST_SKIP() << "bzip2 CLI unavailable"; }
+
+  auto raw   = parse_mps_fast_file<int, double>(raw_file.path, FileReadMethod::Read);
+  auto gzip  = parse_mps_fast_file<int, double>(gzip_file.path, FileReadMethod::Read);
+  auto bzip2 = parse_mps_fast_file<int, double>(bzip2_file.path, FileReadMethod::Read);
+
+  check_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity");
+  check_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity");
+  EXPECT_EQ(raw.A_, gzip.A_) << "gzip A values";
+  EXPECT_EQ(raw.A_, bzip2.A_) << "bzip2 A values";
+  EXPECT_EQ(raw.A_indices_, gzip.A_indices_) << "gzip A indices";
+  EXPECT_EQ(raw.A_indices_, bzip2.A_indices_) << "bzip2 A indices";
+  EXPECT_EQ(raw.A_offsets_, gzip.A_offsets_) << "gzip A offsets";
+  EXPECT_EQ(raw.A_offsets_, bzip2.A_offsets_) << "bzip2 A offsets";
+  EXPECT_EQ(raw.c_, gzip.c_) << "gzip objective";
+  EXPECT_EQ(raw.c_, bzip2.c_) << "bzip2 objective";
+  EXPECT_EQ(raw.b_, gzip.b_) << "gzip rhs";
+  EXPECT_EQ(raw.b_, bzip2.b_) << "bzip2 rhs";
+  EXPECT_EQ(raw.variable_lower_bounds_, gzip.variable_lower_bounds_) << "gzip lower bounds";
+  EXPECT_EQ(raw.variable_lower_bounds_, bzip2.variable_lower_bounds_) << "bzip2 lower bounds";
+  EXPECT_EQ(raw.variable_upper_bounds_, gzip.variable_upper_bounds_) << "gzip upper bounds";
+  EXPECT_EQ(raw.variable_upper_bounds_, bzip2.variable_upper_bounds_) << "bzip2 upper bounds";
+  EXPECT_EQ(raw.var_types_, gzip.var_types_) << "gzip var types";
+  EXPECT_EQ(raw.var_types_, bzip2.var_types_) << "bzip2 var types";
+}
+
+TEST(FastMpsParserEdgeTest, QcMatrixRowsMatchReferenceBitwise)
+{
+  verify_fixture_bitwise("qcmatrix rows",
+                         "NAME QCMATRIX_TEST\n"
+                         "ROWS\n"
+                         " N OBJ\n"
+                         " L LIN\n"
+                         " L QC1\n"
+                         " G QC2\n"
+                         "COLUMNS\n"
+                         " X1 OBJ 1 LIN 2\n"
+                         " X1 QC1 3 QC2 4\n"
+                         " X2 OBJ 2 LIN 5\n"
+                         " X2 QC1 6 QC2 7\n"
+                         "RHS\n"
+                         " RHS1 LIN 10 QC1 11\n"
+                         " RHS1 QC2 12\n"
+                         "QCMATRIX   QC1\n"
+                         " X1 X1 1.25\n"
+                         " X1 X2 -2.5\n"
+                         "QCMATRIX   QC2\n"
+                         " X2 X2 3.75\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, QcMatrixMalformedCasesMatchReference)
+{
+  const std::vector<std::string> cases = {
+    "NAME DUP_QC\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L QC1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 QC1 2\n"
+    "RHS\n"
+    " RHS1 QC1 3\n"
+    "QCMATRIX QC1\n"
+    " X1 X1 1\n"
+    "QCMATRIX QC1\n"
+    " X1 X1 2\n"
+    "ENDATA\n",
+    "NAME BAD_QC_ROW\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L QC1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 QC1 2\n"
+    "RHS\n"
+    " RHS1 QC1 3\n"
+    "QCMATRIX UNKNOWN\n"
+    " X1 X1 1\n"
+    "ENDATA\n",
+    "NAME BAD_QC_VAR\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L QC1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 QC1 2\n"
+    "RHS\n"
+    " RHS1 QC1 3\n"
+    "QCMATRIX QC1\n"
+    " X1 XBAD 1\n"
+    "ENDATA\n"};
+  // Every case shares one source line, so stream the fixture text into each failure message.
+  for (const auto& mps : cases) {
+    TempMpsFile file(mps);
+    EXPECT_THROW(((void)parse_reference_model(file.path)), std::exception) << mps;
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::exception) << mps;
+  }
+}
+
+TEST(FastMpsParserEdgeTest, QuadraticParserRejectsUnknownColumnOneRecords)
+{
+  const std::vector<std::string> records = {"QSECTION      QC1",
+                                            "CSECTION      QC1        0              QUAD"};
+
+  for (const auto& record : records) {
+    TempMpsFile file(
+      "NAME BAD_QUAD_RECORD\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L QC1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 QC1 2\n"
+      " X2 OBJ 3 QC1 4\n"
+      "RHS\n"
+      " RHS1 QC1 5\n"
+      "QMATRIX\n"
+      " X1 X1 1\n" +
+      record +
+      "\n"
+      " X2 X2 2\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::exception)
+      << record;
+  }
+}
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp
index af1368865d..70f7beb2dc 100644
--- a/cpp/tests/linear_programming/parser_test.cpp
+++ b/cpp/tests/linear_programming/parser_test.cpp
@@ -56,6 +56,21 @@ bool file_exists(const std::string& file)
 
 namespace {
 
+struct mps_reader_param_t {
+  const char* name;          // returned by mps_reader_param_name as the gtest parameter name
+  mps_reader_type_t reader;  // forwarded to read<int, double>() by read_mps_file()
+};
+
+constexpr mps_reader_param_t default_mps_reader_param{"default_reader",
+                                                      mps_reader_type_t::default_reader};
+constexpr mps_reader_param_t fast_mps_reader_param{"fast_experimental",
+                                                   mps_reader_type_t::fast_experimental};
+
+std::string mps_reader_param_name(const ::testing::TestParamInfo<mps_reader_param_t>& info)
+{
+  return std::string(info.param.name);  // used as the test-name suffix for each reader
+}
+
 // Non-template forwarding wrapper around read_lp_from_string<int, double>.
 // Exists only so EXPECT_THROW(read_lp_string(R"LP(...)LP"), exc) is parsed
 // correctly — gtest's macro splits its args on top-level commas, and the
@@ -115,20 +130,21 @@ double q_entry(const mps_data_model_t<int, double>& m, int row, int col)
 // ===========================================================================
 // Per-fixture test classes. Each class describes one named problem fixture
 // and owns the checker for that problem's expected parsed data model. The
-// MPS and LP TEST_F cases within a fixture share the same `check_model`
+// MPS TEST_P and LP TEST_F cases within a fixture share the same `check_model`
 // method, so the expected values live in exactly one place per fixture.
 //
 // All fixtures inherit a common base that supplies read_mps_file and
 // read_lp_file helpers.
 // ===========================================================================
 
-class parser_fixture_base : public ::testing::Test {
+class parser_fixture_base : public ::testing::TestWithParam<mps_reader_param_t> {
  protected:
-  static mps_data_model_t<int, double> read_mps_file(const std::string& file,
-                                                     bool fixed_format = true)
+  mps_data_model_t<int, double> read_mps_file(const std::string& file,
+                                              bool fixed_format = true) const
   {
     const std::string& root = cuopt::test::get_rapids_dataset_root_dir();
-    return read_mps<int, double>(root + "/" + file, fixed_format);
+    const auto reader       = GetParam().reader;
+    return read<int, double>(root + "/" + file, reader, fixed_format);
   }
 
   static mps_data_model_t<int, double> read_lp_file(const std::string& file)
@@ -357,9 +373,13 @@ TEST(mps_parser, bad_mps_files)
   }
 }
 
-TEST_F(good_mps_1_test, mps)
+TEST_P(good_mps_1_test, mps)
+{
+  check_model(read_mps_file("linear_programming/good-mps-1.mps", false));
+}
+
+TEST_F(good_mps_1_test, mps_parser_internals)
 {
-  check_model(read_mps_file("linear_programming/good-mps-1.mps"));
   // Parser-struct fields that are MPS-only (not exposed via the data model).
   auto mps = read_from_mps("linear_programming/good-mps-1.mps");
   EXPECT_EQ("good-1", mps.problem_name);
@@ -592,9 +612,13 @@ TEST(mps_parser_free_format, bad_mps_files_free_format)
   }
 }
 
-TEST_F(up_low_bounds_test, mps)
+TEST_P(up_low_bounds_test, mps)
 {
   check_model(read_mps_file("linear_programming/lp_model_with_var_bounds.mps", false));
+}
+
+TEST_F(up_low_bounds_test, mps_parser_internals)
+{
   auto mps = read_from_mps("linear_programming/lp_model_with_var_bounds.mps", false);
   EXPECT_EQ("lp_model_with_var_bounds", mps.problem_name);
   EXPECT_EQ("OBJ", mps.objective_name);
@@ -607,14 +631,14 @@ TEST_F(up_low_bounds_test, lp)
   check_model(read_lp_file("linear_programming/lp_model_with_var_bounds.lp"));
 }
 
-TEST_F(good_mps_1_test, mps_free_format)
+TEST_P(good_mps_1_test, mps_free_format)
 {
   // free-format-mps-1.mps encodes the same problem as good-mps-1 with default
   // [0, +inf) bounds (no BOUNDS section), so it satisfies the same checker.
   check_model(read_mps_file("linear_programming/free-format-mps-1.mps", false));
 }
 
-TEST_F(some_var_bounds_test, mps)
+TEST_P(some_var_bounds_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-some-var-bounds.mps"));
 }
@@ -624,7 +648,7 @@ TEST_F(some_var_bounds_test, lp)
   check_model(read_lp_file("linear_programming/good-mps-some-var-bounds.lp"));
 }
 
-TEST_F(fixed_var_bound_test, mps)
+TEST_P(fixed_var_bound_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-fixed-var.mps"));
 }
@@ -634,7 +658,7 @@ TEST_F(fixed_var_bound_test, lp)
   check_model(read_lp_file("linear_programming/good-mps-fixed-var.lp"));
 }
 
-TEST_F(free_var_bound_test, mps)
+TEST_P(free_var_bound_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-free-var.mps"));
 }
@@ -644,7 +668,7 @@ TEST_F(free_var_bound_test, lp)
   check_model(read_lp_file("linear_programming/good-mps-free-var.lp"));
 }
 
-TEST_F(lower_inf_var_bound_test, mps)
+TEST_P(lower_inf_var_bound_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps"));
 }
@@ -662,7 +686,7 @@ TEST(mps_bounds, rhs_cost)
   EXPECT_EQ(int(-5), mps.objective_offset_value);
 }
 
-TEST_F(upper_inf_var_bound_test, mps)
+TEST_P(upper_inf_var_bound_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps"));
 }
@@ -817,9 +841,13 @@ TEST(mps_bounds, unsupported_or_invalid_mps_types)
   };
 }
 
-TEST_F(mip_with_bounds_test, mps)
+TEST_P(mip_with_bounds_test, mps)
 {
   check_model(read_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false));
+}
+
+TEST_F(mip_with_bounds_test, mps_parser_internals)
+{
   auto mps = read_from_mps("mixed_integer_programming/good-mip-mps-1.mps", false);
   EXPECT_EQ("COST", mps.objective_name);
   ASSERT_EQ(int(2), mps.row_types.size());
@@ -877,7 +905,7 @@ TEST(mps_parser, good_mps_file_mip_no_marker)
   EXPECT_EQ(10., mps.variable_upper_bounds[1]);
 }
 
-TEST_F(mip_no_bounds_test, mps)
+TEST_P(mip_no_bounds_test, mps)
 {
   check_model(read_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false));
 }
@@ -887,7 +915,7 @@ TEST_F(mip_no_bounds_test, lp)
   check_model(read_lp_file("mixed_integer_programming/good-mip-mps-no-bounds.lp"));
 }
 
-TEST_F(mip_partial_bounds_test, mps)
+TEST_P(mip_partial_bounds_test, mps)
 {
   check_model(read_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false));
 }
@@ -897,6 +925,32 @@ TEST_F(mip_partial_bounds_test, lp)
   check_model(read_lp_file("mixed_integer_programming/good-mip-mps-partial-bounds.lp"));
 }
 
+#define INSTANTIATE_MPS_READER_TEST(Fixture)                                                   \
+  INSTANTIATE_TEST_SUITE_P(mps_readers,                                                        \
+                           Fixture,                                                            \
+                           ::testing::Values(default_mps_reader_param, fast_mps_reader_param), \
+                           mps_reader_param_name)
+
+#define INSTANTIATE_DEFAULT_MPS_READER_TEST(Fixture) \
+  INSTANTIATE_TEST_SUITE_P(                          \
+    mps_readers, Fixture, ::testing::Values(default_mps_reader_param), mps_reader_param_name)
+
+INSTANTIATE_MPS_READER_TEST(good_mps_1_test);
+INSTANTIATE_MPS_READER_TEST(up_low_bounds_test);
+INSTANTIATE_MPS_READER_TEST(mip_with_bounds_test);
+INSTANTIATE_MPS_READER_TEST(mip_no_bounds_test);
+INSTANTIATE_MPS_READER_TEST(mip_partial_bounds_test);
+// The fast MPS parser doesn't support fixed format, so these run with the default reader only.
+INSTANTIATE_DEFAULT_MPS_READER_TEST(some_var_bounds_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(fixed_var_bound_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(free_var_bound_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(lower_inf_var_bound_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(upper_inf_var_bound_test);
+
+// NOTE: INSTANTIATE_MPS_READER_TEST / INSTANTIATE_DEFAULT_MPS_READER_TEST are intentionally
+// left defined here; the QP/QCQP file fixtures below reuse them. They are #undef-ed after the
+// last instantiation.
+
 #ifdef MPS_PARSER_WITH_BZIP2
 TEST(mps_parser, good_mps_file_bzip2_compressed)
 {
@@ -998,13 +1052,14 @@ TEST(qps_parser, quadratic_objective_basic)
   EXPECT_EQ(1.0, model.get_quadratic_objective_values()[1]);
 }
 
+class qps_file_reader_test : public parser_fixture_base {};  // QP/QCQP dataset-file TEST_P suite
+
 // Test actual QPS files from the dataset
-TEST(qps_parser, test_qps_files)
+TEST_P(qps_file_reader_test, test_qps_files)
 {
   // Test QP_Test_1.qps if it exists
   if (file_exists("quadratic_programming/QP_Test_1.qps")) {
-    auto parsed_data = read_mps<int, double>(
-      cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_1.qps", false);
+    auto parsed_data = read_mps_file("quadratic_programming/QP_Test_1.qps", false);
 
     EXPECT_EQ("QP_Test_1", parsed_data.get_problem_name());
     EXPECT_EQ(2, parsed_data.get_n_variables());    // C------1 and C------2
@@ -1023,8 +1078,7 @@ TEST(qps_parser, test_qps_files)
 
   // Test QP_Test_2.qps if it exists
   if (file_exists("quadratic_programming/QP_Test_2.qps")) {
-    auto parsed_data = read_mps<int, double>(
-      cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_2.qps", false);
+    auto parsed_data = read_mps_file("quadratic_programming/QP_Test_2.qps", false);
 
     EXPECT_EQ("QP_Test_2", parsed_data.get_problem_name());
     EXPECT_EQ(3, parsed_data.get_n_variables());    // C------1, C------2, C------3
@@ -2582,6 +2636,19 @@ TEST(read, qps_extension_dispatches_to_mps_parser)
   EXPECT_EQ(m.get_variable_names()[0], "x");
 }
 
+TEST(read, qps_extension_dispatches_to_fast_experimental_reader)
+{
+  temp_file_t tmp(".qps");
+  std::ofstream out(tmp.string());
+  out << kTrivialMps;
+  out.close();  // flush the model to disk before the reader opens the file
+  auto m = read<int, double>(tmp.string(), mps_reader_type_t::fast_experimental);
+  ASSERT_EQ(m.get_variable_names().size(), 1u);
+  EXPECT_EQ(m.get_variable_names()[0], "x");
+  ASSERT_EQ(m.get_variable_upper_bounds().size(), 1u);  // guard the [0] access below
+  EXPECT_NEAR(m.get_variable_upper_bounds()[0], 10.0, tolerance);
+}
+
 TEST(read, mps_gz_extension_dispatches_to_mps_parser)
 {
   auto m = read<int, double>(cuopt::test::get_rapids_dataset_root_dir() +
@@ -2796,13 +2863,12 @@ TEST(qps_parser, qcmatrix_append_api)
 }
 
 // QCQP MPS: each quadratic constraint bundles row + linear + rhs + quadratic.
-TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds)
+TEST_P(qps_file_reader_test, qcmatrix_mps_linear_rhs_and_bounds)
 {
   if (!file_exists("qcqp/QC_Test_1.mps")) {
     GTEST_SKIP() << "qcqp/QC_Test_1.mps not in dataset root";
   }
-  const auto model = read_mps<int, double>(
-    cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/QC_Test_1.mps", false);
+  const auto model = read_mps_file("qcqp/QC_Test_1.mps", false);
 
   ASSERT_TRUE(model.has_quadratic_constraints());
   const auto& qcs = model.get_quadratic_constraints();
@@ -2848,13 +2914,12 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds)
   EXPECT_DOUBLE_EQ(10.0, qcs[1].rhs_value);
 }
 
-TEST(qps_parser, qcqp_p0033_mps_sections)
+TEST_P(qps_file_reader_test, qcqp_p0033_mps_sections)
 {
   if (!file_exists("qcqp/p0033_qc1.mps")) {
     GTEST_SKIP() << "qcqp/p0033_qc1.mps not in dataset root";
   }
-  const auto model = read_mps<int, double>(
-    cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps", false);
+  const auto model = read_mps_file("qcqp/p0033_qc1.mps", false);
 
   EXPECT_EQ(12, model.get_n_constraints());
   EXPECT_EQ(33, model.get_n_variables());
@@ -2897,4 +2962,9 @@ TEST(mps_roundtrip, qcqp_p0033_qc1)
   auto reloaded_2 = read_mps<int, double>(temp_file_2.string(), false);
   compare_data_models(reloaded, reloaded_2);
 }
+
+INSTANTIATE_MPS_READER_TEST(qps_file_reader_test);
+
+#undef INSTANTIATE_MPS_READER_TEST
+#undef INSTANTIATE_DEFAULT_MPS_READER_TEST
 }  // namespace cuopt::linear_programming::io
diff --git a/thirdparty/THIRD_PARTY_LICENSES b/thirdparty/THIRD_PARTY_LICENSES
index a70fa8ce1c..e09000b56d 100644
--- a/thirdparty/THIRD_PARTY_LICENSES
+++ b/thirdparty/THIRD_PARTY_LICENSES
@@ -512,3 +512,63 @@ Copyright notice:
 
   Jean-loup Gailly        Mark Adler
   jloup@gzip.org          madler@alumni.caltech.edu
+
+
+-----------------------------------------------------------------------------------------
+== LZ4
+
+Usage: cuopt uses LZ4 through dynamically loaded library symbols
+
+Copyright (c) Yann Collet. All rights reserved.
+
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+-----------------------------------------------------------------------------------------
+== SIMDe
+
+Usage: cuopt uses SIMDe in experimental fast MPS parser SIMD compatibility code
+
+Copyright (c) 2017 Evan Nemerson <evan@nemerson.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.