diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7e2dd099c1..98f7848fed 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -202,9 +202,25 @@ endif () find_package(OpenMP REQUIRED) message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") -# MPS/QPS parser supports compressed inputs via bzip2 and zlib +# Resolve libgomp from the active C++ compiler, not FindOpenMP's generic -lgomp (which can +# resolve to an older system libgomp on Rocky/RHEL wheel builders). The fast MPS parser uses +# OpenMP 5.0 detached tasks (omp_fulfill_event); compile and link must use the same libgomp. +execute_process( + COMMAND ${CMAKE_CXX_COMPILER} -print-file-name=libgomp.so + OUTPUT_VARIABLE CUOPT_LIBGOMP_FILE + OUTPUT_STRIP_TRAILING_WHITESPACE +) +if (NOT IS_ABSOLUTE "${CUOPT_LIBGOMP_FILE}") + message(FATAL_ERROR "Could not resolve libgomp from ${CMAKE_CXX_COMPILER}: '${CUOPT_LIBGOMP_FILE}'") +endif () +get_filename_component(CUOPT_LIBGOMP_DIR "${CUOPT_LIBGOMP_FILE}" DIRECTORY) +message(STATUS "cuOpt: libgomp for OpenMP link = ${CUOPT_LIBGOMP_FILE}") +list(APPEND CUOPT_CXX_FLAGS -fopenmp) + +# MPS/QPS parser supports compressed inputs via bzip2, zlib and lz4 option(CUOPT_PARSER_WITH_BZIP2 "Build MPS parser with bzip2 decompression" ON) option(CUOPT_PARSER_WITH_ZLIB "Build MPS parser with zlib decompression" ON) +option(CUOPT_PARSER_WITH_LZ4 "Build experimental fast MPS parser with LZ4 decompression" ON) if (CUOPT_PARSER_WITH_BZIP2) find_package(BZip2 REQUIRED) add_compile_definitions(MPS_PARSER_WITH_BZIP2) @@ -213,6 +229,10 @@ if (CUOPT_PARSER_WITH_ZLIB) find_package(ZLIB REQUIRED) add_compile_definitions(MPS_PARSER_WITH_ZLIB) endif () +if (CUOPT_PARSER_WITH_LZ4) + # No headers or link target needed; the experimental reader loads one liblz4 symbol at runtime. 
+ add_compile_definitions(MPS_PARSER_WITH_LZ4) +endif () # Debug options if (CMAKE_BUILD_TYPE MATCHES Debug) @@ -250,6 +270,20 @@ else () find_package(RAFT REQUIRED) endif () +rapids_cpm_find(simde 0.8.2 + CPM_ARGS + GIT_REPOSITORY https://github.com/simd-everywhere/simde.git + GIT_TAG v0.8.2 + GIT_SHALLOW TRUE + DOWNLOAD_ONLY TRUE +) + +if (NOT TARGET simde::simde) + add_library(simde::simde INTERFACE IMPORTED GLOBAL) + set_target_properties(simde::simde + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${simde_SOURCE_DIR}") +endif () + FetchContent_Declare( papilo GIT_REPOSITORY "https://github.com/scipopt/papilo.git" @@ -436,16 +470,27 @@ if (BUILD_TESTS) endif () set(CUOPT_SRC_FILES) +set(MPS_FAST_SRC_FILES) add_subdirectory(src) if (HOST_LINEINFO) - set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") + set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") endif () +# Needed for the fast MPS parser, available on all x86-64-v3 compliant x86 CPUs (essentially since Haswell ~2013) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND + CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$") + set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + APPEND PROPERTY COMPILE_OPTIONS "-mbmi2;-mavx2;-msse4.2") +endif () + +# TODO: figure out a set of flags for ARM that fits the range of CPUs we wish to support (neoverse?) +# NEON should be universal on aarch64 and enough for our purposes (parsing) though + # Apply -UNDEBUG only to solver source files (not gRPC infrastructure). # Must happen before gRPC files are appended to CUOPT_SRC_FILES. # Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO). 
if (DEFINE_ASSERT) - set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} + set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} APPEND PROPERTY COMPILE_OPTIONS "-UNDEBUG") endif () @@ -470,7 +515,7 @@ if (NOT SKIP_GRPC_BUILD) # The conda-forge abseil shared library is built with NDEBUG and does not # export that symbol (abseil-cpp#1624). Without this, Debug builds fail # at runtime with "undefined symbol: absl::…::Mutex::Dtor". - set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} + set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} APPEND PROPERTY COMPILE_OPTIONS "-DNDEBUG") endif (NOT SKIP_GRPC_BUILD) @@ -483,6 +528,7 @@ set_target_properties(cuopt INSTALL_RPATH "\$ORIGIN" INTERFACE_POSITION_INDEPENDENT_CODE ON CXX_SCAN_FOR_MODULES OFF + LINKER_LANGUAGE CXX ) target_compile_definitions(cuopt @@ -552,8 +598,7 @@ add_dependencies(cuopt PSLP) set(CUOPT_PRIVATE_CUDA_LIBS CUDA::curand CUDA::cusolver - TBB::tbb - OpenMP::OpenMP_CXX) + TBB::tbb) list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt) @@ -596,10 +641,17 @@ target_link_libraries(cuopt ${CUDSS_LIB_FILE} PRIVATE ${CUOPT_PRIVATE_CUDA_LIBS} + simde::simde $<$:protobuf::libprotobuf> $<$:gRPC::grpc++> ) +# Force libgomp from the active C++ toolchain into libcuopt.so. OpenMP::OpenMP_CXX and/or +# -fopenmp alone can leave omp_fulfill_event undefined (CUDA-linked target + --as-needed) or +# resolve a trailing bare -lgomp to an older system libgomp at executable link time. 
+target_link_directories(cuopt PRIVATE ${CUOPT_LIBGOMP_DIR}) +target_link_libraries(cuopt PRIVATE "-Wl,--no-as-needed" gomp "-Wl,--as-needed") + # ################################################################################################## # - generate tests -------------------------------------------------------------------------------- @@ -737,7 +789,6 @@ if (NOT BUILD_LP_ONLY) target_link_libraries(cuopt_cli PUBLIC cuopt - OpenMP::OpenMP_CXX ${CUDSS_LIBRARIES} TBB::tbb PRIVATE @@ -779,7 +830,6 @@ if (BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY) target_link_libraries(solve_MIP PUBLIC cuopt - OpenMP::OpenMP_CXX PRIVATE ) if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") @@ -809,7 +859,6 @@ if (BUILD_LP_BENCHMARKS) target_link_libraries(solve_LP PUBLIC cuopt - OpenMP::OpenMP_CXX PRIVATE ) if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") @@ -862,7 +911,6 @@ if (NOT SKIP_GRPC_BUILD) target_link_libraries(cuopt_grpc_server PUBLIC cuopt - OpenMP::OpenMP_CXX PRIVATE protobuf::libprotobuf gRPC::grpc++ diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index 37876cac7a..13991ad1e3 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -90,11 +90,13 @@ inline cuopt::init_logger_t dummy_logger( * .mps/.qps and their .gz/.bz2 variants → MPS parser; * anything else is rejected. 
* @param initial_solution_file Path to initial solution file in SOL format + * @param mps_reader MPS reader implementation selected by the CLI * @param settings Merged solver settings (config file loaded in main, then CLI overrides applied) */ int run_single_file(const std::string& file_path, const std::string& initial_solution_file, bool solve_relaxation, + cuopt::linear_programming::io::mps_reader_type_t mps_reader, cuopt::linear_programming::solver_settings_t& settings) { cuopt::init_logger_t log(settings.get_parameter(CUOPT_LOG_FILE), @@ -108,7 +110,7 @@ int run_single_file(const std::string& file_path, { CUOPT_LOG_INFO("Reading file %s", base_filename.c_str()); try { - mps_data_model = cuopt::linear_programming::io::read(file_path); + mps_data_model = cuopt::linear_programming::io::read(file_path, mps_reader); } catch (const std::logic_error& e) { CUOPT_LOG_ERROR("Parser exception: %s", e.what()); parsing_failed = true; @@ -284,8 +286,8 @@ int main(int argc, char* argv[]) program.add_argument("filename") .help( "input problem file; format dispatched by extension (case-insensitive). " - "Supported: .lp, .mps, .qps and their .gz / .bz2 compressed variants " - "(e.g. .lp.gz, .mps.bz2, .qps.gz)") + "Supported: .lp, .mps, .qps and their .gz / .bz2 / .lz4 compressed variants " + "(e.g. 
.lp.gz, .mps.bz2, .qps.lz4).") .nargs(1) .required(); @@ -303,6 +305,14 @@ int main(int argc, char* argv[]) .help("path to parameter config file (key = value format, supports all parameters)") .default_value(std::string("")); + program.add_argument("--mps-reader") + .help( + "MPS reader implementation: default uses the production parser; experimental-fast uses the " + "experimental SIMD parser for free-format LP/MIP/QP/QCQP (SOCP) .mps/.qps files and their " + ".gz/.bz2/.lz4 compressed variants") + .default_value(std::string("default")) + .choices("default", "experimental-fast"); + program.add_argument("--dump-hyper-params") .help("print hyper-parameters only in config file format and exit") .default_value(false) @@ -403,6 +413,12 @@ int main(int argc, char* argv[]) const auto initial_solution_file = program.get("--initial-solution"); const auto solve_relaxation = program.get("--relaxation"); const auto params_file = program.get("--params-file"); + const auto mps_reader_arg = program.get("--mps-reader"); + + auto mps_reader = cuopt::linear_programming::io::mps_reader_type_t::default_reader; + if (mps_reader_arg == "experimental-fast") { + mps_reader = cuopt::linear_programming::io::mps_reader_type_t::fast_experimental; + } cuopt::linear_programming::solver_settings_t settings; try { @@ -432,5 +448,5 @@ int main(int argc, char* argv[]) RAFT_CUDA_TRY(cudaSetDevice(0)); } - return run_single_file(file_name, initial_solution_file, solve_relaxation, settings); + return run_single_file(file_name, initial_solution_file, solve_relaxation, mps_reader, settings); } diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp index a63e40f31f..7122282e70 100644 --- a/cpp/include/cuopt/linear_programming/io/parser.hpp +++ b/cpp/include/cuopt/linear_programming/io/parser.hpp @@ -11,17 +11,26 @@ #include #include +#include #include #include #include namespace cuopt::linear_programming::io { +/** + * @brief Selects which MPS 
reader implementation should be used by dispatching entry points. + * + * The experimental fast reader is intentionally opt-in. It supports the same free-format + * MPS/QPS scope as read_mps(): LP, MIP, QP (QUADOBJ/QMATRIX), and QCQP/SOCP (QCMATRIX). + */ +enum class mps_reader_type_t { default_reader, fast_experimental }; + /** * @brief Reads the equation from an MPS or QPS file. * * The input file can be a plain text file in MPS-/QPS-format or a compressed MPS/QPS - * file (.mps.gz or .mps.bz2). + * file (.mps.gz, .mps.bz2, or .mps.lz4). * * Read this link http://lpsolve.sourceforge.net/5.5/mps-format.htm for more * details on both free and fixed MPS format. @@ -32,8 +41,8 @@ namespace cuopt::linear_programming::io { * - QMATRIX: Full symmetric quadratic objective matrix (alternative to QUADOBJ) * - QCMATRIX: Symmetric quadratic terms for a named constraint row (QCQP) * - * Note: Compressed MPS files .mps.gz, .mps.bz2 can only be read if the compression - * libraries zlib or libbzip2 are installed, respectively. + * Note: Compressed MPS files .mps.gz, .mps.bz2, and .mps.lz4 can only be read if + * zlib, libbzip2, or liblz4 are installed, respectively. * * @param[in] mps_file_path Path to MPS/QPSfile. * @param[in] fixed_mps_format If MPS/QPS file should be parsed as fixed, false by default @@ -43,6 +52,19 @@ template mps_data_model_t read_mps(const std::string& mps_file_path, bool fixed_mps_format = false); +/** + * @brief Reads an MPS/QPS problem with the experimental SIMD-optimized reader. + * + * Supports the same free-format LP/MIP/QP/QCQP (SOCP-relevant QCMATRIX) scope as read_mps(). + * Fixed MPS format forcing is not supported. Accepts .mps/.qps and their .gz/.bz2/.lz4 variants + * (compression is detected from the file path, same as read_mps()). + * + * @param[in] mps_file_path Path to a raw or compressed .mps or .qps file. + * @return mps_data_model_t A fully formed LP/MIP/QP problem which represents the given file. 
+ */ +template +mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); + /** * @brief Reads an MPS problem from in-memory file contents. * @@ -111,38 +133,72 @@ mps_data_model_t read_lp_from_string(std::string_view lp_contents); * @brief Reads an optimization problem from a file, dispatching on the file * extension. Extension matching is case-insensitive. * - * Routing: - * - .mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2 → read_mps() - * - .lp, .lp.gz, .lp.bz2 → read_lp() + * Routing (case-insensitive extensions): + * - .mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4 + * → read_mps() when mps_reader == default_reader, or read_mps_fast_experimental() + * when mps_reader == fast_experimental (fixed_mps_format must be false) + * - .lp, .lp.gz, .lp.bz2, .lp.lz4 → read_lp() * - anything else → std::logic_error * * This is the entry point of choice for user-facing tools (CLI, C API) that * want both formats to "just work" without an explicit format flag. * * @param[in] path Path to the input file. + * @param[in] mps_reader Selects the MPS reader implementation for MPS/QPS inputs. * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format; * ignored for LP inputs. False by default. * @return mps_data_model_t The parsed problem. 
*/ template -inline mps_data_model_t read(const std::string& path, bool fixed_mps_format = false) +inline mps_data_model_t read(const std::string& path, + mps_reader_type_t mps_reader, + bool fixed_mps_format = false) { std::string lower(path); std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); - if (lower.ends_with(".mps") || lower.ends_with(".mps.gz") || lower.ends_with(".mps.bz2") || - lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2")) { - return read_mps(path, fixed_mps_format); + for (const char* compression_suffix : {".bz2", ".gz", ".lz4"}) { + if (lower.ends_with(compression_suffix)) { + lower.resize(lower.size() - std::strlen(compression_suffix)); + break; + } } - if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2")) { - return read_lp(path); + if (lower.ends_with(".mps") || lower.ends_with(".qps")) { + if (mps_reader == mps_reader_type_t::fast_experimental) { + if (fixed_mps_format) { + throw std::logic_error( + "experimental fast MPS reader does not support fixed MPS format forcing"); + } + return read_mps_fast_experimental(path); + } + return read_mps(path, fixed_mps_format); } + if (lower.ends_with(".lp")) { return read_lp(path); } throw std::logic_error( "read: unrecognized input file extension. Supported (case-insensitive): " - ".mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2, .lp, .lp.gz, .lp.bz2. " + ".mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4, " + ".lp, .lp.gz, .lp.bz2, .lp.lz4. " "Given path: " + path); } +/** + * @brief Reads an optimization problem from a file, dispatching on the file + * extension. Extension matching is case-insensitive. + * + * Uses the default MPS reader. See the 3-argument read() overload for routing + * details and supported extensions. + * + * @param[in] path Path to the input file. 
+ * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format; + * ignored for LP inputs. False by default. + * @return mps_data_model_t The parsed problem. + */ +template +inline mps_data_model_t read(const std::string& path, bool fixed_mps_format = false) +{ + return read(path, mps_reader_type_t::default_reader, fixed_mps_format); +} + } // namespace cuopt::linear_programming::io diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt index 1ae6988466..6883cce82f 100644 --- a/cpp/src/CMakeLists.txt +++ b/cpp/src/CMakeLists.txt @@ -25,3 +25,4 @@ add_subdirectory(branch_and_bound) add_subdirectory(cuts) set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${UTIL_SRC_FILES} PARENT_SCOPE) +set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE) diff --git a/cpp/src/io/CMakeLists.txt b/cpp/src/io/CMakeLists.txt index cc4affa890..cafcffb23f 100644 --- a/cpp/src/io/CMakeLists.txt +++ b/cpp/src/io/CMakeLists.txt @@ -3,6 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 # cmake-format: on +set(MPS_FAST_SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/file_reader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/lz4_file_reader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/mps_section_scanner.cpp +) + set(PARSERS_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/data_model_view.cpp ${CMAKE_CURRENT_SOURCE_DIR}/file_to_string.cpp @@ -13,6 +20,8 @@ set(PARSERS_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/writer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/utilities/cython_parser.cpp + ${MPS_FAST_SRC_FILES} ) set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${PARSERS_SRC_FILES} PARENT_SCOPE) +set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE) diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp new file mode 100644 index 0000000000..02aca44dc3 --- /dev/null +++ 
b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp @@ -0,0 +1,436 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::io::detail { + +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_fail; + +namespace fp64 { + +#define FASTP64_MIN_EXP_10 (-307) +#define FASTP64_MAX_EXP_10 288 +#define FASTP64_POWER_COUNT (FASTP64_MAX_EXP_10 - FASTP64_MIN_EXP_10 + 1) +#define FASTP64_MANTISSA_MASK ((uint64_t{1} << 52) - 1) +#define FASTP64_EXPONENT_MASK 0x7FF +#define FASTP64_HALF_MASK 0x1FF + +// Fast FP64 parser optimized for the <=19digits case, based on the Eisel-Lemire algorithm +// see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 +// (8), 2021. 
+// verified on a large corpus of FP64 values: https://github.com/lemire/simple_fastfloat_benchmark
+
+// One entry of the power-of-ten lookup table consumed by eisel_lemire().
+//   high / low : the two most significant 64-bit limbs (limb[3], limb[2]) of the normalized
+//                256-bit significand of 10^e
+//   biased_e2  : binary exponent of that significand with the bias applied by make_power()
+struct power_10_lut_entry_t {
+  uint64_t high;
+  uint64_t low;
+  int biased_e2;
+};
+
+// util class to perform 256bit precision arithmetic in constexpr to build the eisel-lemire lookup
+// table
+// limb[0] is the least significant 64-bit word, limb[3] the most significant one.
+struct cuopt_uint256_t {
+  std::array<uint64_t, 4> limb{};
+
+  // Multiplies the value in place by m and returns the carry that left the top limb.
+  constexpr uint32_t mul_u32(uint32_t m)
+  {
+    unsigned __int128 carry = 0;
+    for (uint64_t& v : limb) {
+      unsigned __int128 x = (unsigned __int128)v * m + carry;
+      v                   = (uint64_t)x;
+      carry               = x >> 64;
+    }
+    return (uint32_t)carry;
+  }
+
+  // Returns the value shifted left by `bits`; bits shifted out of limb[3] are dropped.
+  // Only shifts in [0, 63] are meaningful: the cross-limb term shifts right by 64 - bits,
+  // which is why bits == 0 is handled separately.
+  constexpr cuopt_uint256_t shl_small(int bits) const
+  {
+    cuopt_uint256_t out;
+    if (bits == 0) return *this;
+    for (int i = 3; i >= 0; --i) {
+      uint64_t v = limb[i] << bits;
+      if (i > 0) v |= limb[i - 1] >> (64 - bits);
+      out.limb[i] = v;
+    }
+    return out;
+  }
+};
+
+// 256-bit significand plus binary exponent, i.e. the value sig * 2^exp2.
+// one() stores 2^255 with exp2 = -255; mul10()/div10() renormalize after every step so the
+// significand keeps its top bit set.
+struct cuopt_normalized_uint256_t {
+  cuopt_uint256_t sig;
+  int exp2 = 0;
+
+  // The value 1, normalized (top bit of limb[3] set).
+  static constexpr cuopt_normalized_uint256_t one()
+  {
+    cuopt_normalized_uint256_t x;
+    x.sig.limb[3] = uint64_t{1} << 63;
+    x.exp2        = -255;
+    return x;
+  }
+
+  // Multiplies by 10, then shifts right by the bit length of the carry so the carry bits become
+  // the new top bits. The low bits shifted out are discarded (truncation).
+  constexpr void mul10()
+  {
+    uint32_t carry = sig.mul_u32(10);
+    int shift      = 32 - std::countl_zero(carry);
+    // The normalized 256-bit value always overflows into carry after *10; keep
+    // the guard explicit because the cross-limb path shifts by 64 - shift.
+    if (shift == 0) { return; }
+    cuopt_uint256_t out;
+    for (int i = 0; i < 4; ++i) {
+      uint64_t lower = sig.limb[i] >> shift;
+      uint64_t upper = 0;
+      if (i + 1 < 4) {
+        upper = sig.limb[i + 1] << (64 - shift);
+      } else {
+        // top limb: the bits coming in from above are the multiplication carry
+        upper = (uint64_t)carry << (64 - shift);
+      }
+      out.limb[i] = lower | upper;
+    }
+    sig = out;
+    exp2 += shift;
+  }
+
+  // Divides by 10. The significand is first pre-shifted left by 3 or 4 bits (4 when the top limb
+  // is below 0xA000...), the bits leaving limb[3] seed the remainder, and a limb-wise long
+  // division by 10 follows. The final remainder is discarded (truncation).
+  constexpr void div10()
+  {
+    constexpr uint64_t div10_shift_4_threshold = 0xA000000000000000ULL;
+    int shift               = sig.limb[3] < div10_shift_4_threshold ? 4 : 3;
+    uint64_t extra          = sig.limb[3] >> (64 - shift);
+    cuopt_uint256_t shifted = sig.shl_small(shift);
+
+    cuopt_uint256_t quotient;
+    unsigned __int128 rem = extra;
+    for (int i = 3; i >= 0; --i) {
+      unsigned __int128 cur = (rem << 64) | shifted.limb[i];
+      quotient.limb[i]      = (uint64_t)(cur / 10);
+      rem                   = cur % 10;
+    }
+    sig = quotient;
+    exp2 -= shift;
+  }
+};
+
+// Packs the top 128 bits of a normalized power of ten into a LUT entry.
+constexpr power_10_lut_entry_t make_power(const cuopt_normalized_uint256_t& p)
+{
+  int e2 = p.exp2 + 192;
+  return {p.sig.limb[3], p.sig.limb[2], 1150 + e2};
+}
+
+// build time LUT for the lemire trick
+// Entry [e - FASTP64_MIN_EXP_10] describes 10^e for e in [FASTP64_MIN_EXP_10, FASTP64_MAX_EXP_10].
+// Positive powers are produced by repeated mul10() from 1, negative powers by repeated div10().
+constexpr std::array<power_10_lut_entry_t, FASTP64_POWER_COUNT> make_power_table()
+{
+  std::array<power_10_lut_entry_t, FASTP64_POWER_COUNT> table{};
+  cuopt_normalized_uint256_t p = cuopt_normalized_uint256_t::one();
+  table[-FASTP64_MIN_EXP_10]   = make_power(p);
+
+  for (int e = 1; e <= FASTP64_MAX_EXP_10; ++e) {
+    p.mul10();
+    table[e - FASTP64_MIN_EXP_10] = make_power(p);
+  }
+
+  p = cuopt_normalized_uint256_t::one();
+  for (int e = -1; e >= FASTP64_MIN_EXP_10; --e) {
+    p.div10();
+    table[e - FASTP64_MIN_EXP_10] = make_power(p);
+  }
+  return table;
+}
+
+inline constexpr auto fast_fp64_parse_lut = make_power_table();
+
+// 10^0 .. 10^22 as doubles, used by the small-exponent shortcut in assemble_fp64().
+inline constexpr std::array<double, 23> small_powers = {
+  1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10, 1e11,
+  1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
+
+// 10^0 .. 10^15 as integers, used to bound the mantissa in the same shortcut.
+inline constexpr std::array<uint64_t, 16> small_integer_powers = {1ULL,
+                                                                  10ULL,
+                                                                  100ULL,
+                                                                  1000ULL,
+                                                                  10000ULL,
+                                                                  100000ULL,
+                                                                  1000000ULL,
+                                                                  10000000ULL,
+                                                                  100000000ULL,
+                                                                  1000000000ULL,
+                                                                  10000000000ULL,
+                                                                  100000000000ULL,
+                                                                  1000000000000ULL,
+                                                                  10000000000000ULL,
+                                                                  100000000000000ULL,
+                                                                  1000000000000000ULL};
+
+// Result of scanning a decimal literal: value = (negative ? -1 : 1) * mantissa * 10^exp10.
+// fast_eligible is false when more significant digits were seen than the mantissa can hold.
+struct parsed_decimal_t {
+  bool negative      = false;
+  bool fast_eligible = false;
+  uint64_t mantissa  = 0;
+  int exp10          = 0;
+};
+
+static inline bool is_digit(char c) noexcept { return c >= '0' && c <= '9'; }
+
+// SWAR 8char run of digits -> integer representation
+// better and more portable than AVX2 stuff since AVX2 doesn't like swizzling across 16B lanes
+// saw no real difference w/ 16B SSE
+// Reads exactly 8 bytes at p (the caller guarantees they are readable). Returns false and leaves
+// `out` untouched unless all 8 bytes are ASCII digits.
+static inline bool parse_8_digits(const char* p, uint32_t& out)
+{
+  // The pairwise combination below relies on p[0] landing in the least significant byte.
+  static_assert(std::endian::native == std::endian::little,
+                "parse_8_digits assumes a little-endian host");
+  // comply with strict aliasing rules
+  std::array<char, 8> bytes{};
+  std::memcpy(bytes.data(), p, bytes.size());
+  uint64_t raw = std::bit_cast<uint64_t>(bytes);
+  // every byte must have high nibble 3, and adding 6 must not carry into the high nibble
+  // (i.e. low nibble <= 9)
+  uint64_t high      = raw & 0xF0F0F0F0F0F0F0F0ULL;
+  uint64_t low_check = (raw + 0x0606060606060606ULL) & 0xF0F0F0F0F0F0F0F0ULL;
+  if (high != 0x3030303030303030ULL || low_check != 0x3030303030303030ULL) { return false; }
+
+  // combine digits pairwise: 8 x 1 digit -> 4 x 2 digits -> 2 x 4 digits -> 1 x 8 digits
+  uint64_t v     = raw - 0x3030303030303030ULL;
+  uint64_t pairs = (v * 10 + (v >> 8)) & 0x00FF00FF00FF00FFULL;
+  uint64_t quads = (pairs * 100 + (pairs >> 16)) & 0x0000FFFF0000FFFFULL;
+  out            = (uint32_t)((quads * 10000 + (quads >> 32)) & 0xFFFFFFFFULL);
+  return true;
+}
+
+// Appends the run of digits starting at p to `out` (out = out * 10^k + digits) and advances p
+// past it. Overflow of `out` is not detected.
+static inline void parse_u64_digits_advance(const char*& p, const char* end, uint64_t& out)
+{
+  while (p < end && is_digit(*p)) {
+    if (end - p >= 8) {
+      uint32_t chunk = 0;
+      if (parse_8_digits(p, chunk)) {
+        out = out * 100000000ULL + (uint64_t)chunk;
+        p += 8;
+        continue;
+      }
+    }
+    out = out * 10 + (uint64_t)(*p - '0');
+    ++p;
+  }
+}
+
+// Consumes one run of digits (integer part, or fraction part when after_dot is true) and folds
+// it into out.mantissa, stopping at the first non-digit or at `end`.
+//   saw_digit       : set as soon as any digit is consumed
+//   frac_digits     : incremented for every digit consumed while after_dot is true
+//   sig_digits      : number of digits accumulated into the mantissa so far (at most 19)
+//   too_many_digits : set once a digit no longer fits; later digits are skipped, not accumulated
+// Leading zeros are not counted as significant, except inside an 8-digit chunk that also
+// contains a non-zero digit (the whole chunk then counts as 8).
+static inline void scan_digit_run(const char*& p,
+                                  const char* end,
+                                  bool after_dot,
+                                  parsed_decimal_t& out,
+                                  bool& saw_digit,
+                                  int& frac_digits,
+                                  int& sig_digits,
+                                  bool& too_many_digits)
+{
+  while (p < end) {
+    uint32_t chunk = 0;
+    if (end - p >= 8 && parse_8_digits(p, chunk)) {
+      saw_digit = true;
+      if (after_dot) frac_digits += 8;
+
+      if (!too_many_digits) {
+        if (sig_digits == 0 && chunk == 0) {
+          // eight leading zeros: nothing to accumulate
+          p += 8;
+          continue;
+        }
+
+        if (sig_digits + 8 <= 19) {
+          out.mantissa = out.mantissa * 100000000ULL + chunk;
+          sig_digits += 8;
+        } else {
+          too_many_digits = true;
+        }
+      }
+
+      p += 8;
+      continue;
+    }
+
+    if (!is_digit(*p)) return;
+    saw_digit = true;
+    int digit = *p - '0';
+    if (after_dot) ++frac_digits;
+    if (!too_many_digits && (digit != 0 || sig_digits != 0)) {
+      if (sig_digits < 19) {
+        out.mantissa = (out.mantissa * 10) + (uint64_t)digit;
+        ++sig_digits;
+      } else {
+        too_many_digits = true;
+      }
+    }
+    ++p;
+  }
+}
+
+// Scans a decimal literal of the form [sign] digits ['.' digits] [(e|E|d|D) [sign] digits]
+// starting at p and advances p past what was consumed.
+// Returns false when no digit was found in the integer/fraction part (p may still have moved
+// past a sign and/or a '.'). On success:
+//   out.negative      : true when a leading '-' was present
+//   out.mantissa      : the significant digits accumulated by scan_digit_run()
+//   out.exp10         : explicit exponent minus the number of fraction digits
+//   out.fast_eligible : false when the literal had more digits than the mantissa can hold
+// An exponent marker that is not followed by at least one digit is left unconsumed.
+static inline bool parse_decimal_advance(const char*& p, const char* end, parsed_decimal_t& out)
+{
+  if (p < end && (*p == '-' || *p == '+')) {
+    out.negative = *p == '-';
+    ++p;
+  }
+
+  bool saw_digit       = false;
+  int frac_digits      = 0;
+  int sig_digits       = 0;
+  bool too_many_digits = false;
+
+  scan_digit_run(p, end, false, out, saw_digit, frac_digits, sig_digits, too_many_digits);
+  if (p < end && *p == '.') {
+    ++p;
+    scan_digit_run(p, end, true, out, saw_digit, frac_digits, sig_digits, too_many_digits);
+  }
+
+  if (!saw_digit) return false;
+
+  int explicit_exp = 0;
+  if (p < end && (*p == 'e' || *p == 'E' || *p == 'd' || *p == 'D')) {
+    const char* exp_start = p;
+    ++p;
+    bool exp_negative = false;
+    if (p < end && (*p == '-' || *p == '+')) {
+      exp_negative = *p == '-';
+      ++p;
+    }
+    if (p == end || !is_digit(*p)) {
+      // not an exponent after all: rewind so the marker stays part of the unparsed input
+      p = exp_start;
+    } else {
+      int exp_value = 0;
+      while (p < end && is_digit(*p)) {
+        // stop accumulating once the value is huge so the int cannot overflow
+        if (exp_value < 1000000) exp_value = exp_value * 10 + (*p - '0');
+        ++p;
+      }
+      explicit_exp = exp_negative ? -exp_value : exp_value;
+    }
+  }
+
+  out.exp10         = explicit_exp - frac_digits;
+  out.fast_eligible = !too_many_digits;
+  return true;
+}
+
+// fallback to stdlib for edge case or ambiguous roundings (very rare)
+// Converts the whole token `s` with std::strtod after mapping Fortran-style 'd'/'D' exponent
+// markers to 'e'. Fails with a ValidationError when the token is empty, is 32 characters or
+// longer, is not consumed entirely by strtod, or strtod reports ERANGE.
+static inline double fallback_strtod(std::string_view s)
+{
+  char stack_buf[32];
+  // The MPS specs mandate that numeric tokens are not longer than 25 characters
+  if (s.size() >= sizeof(stack_buf)) {
+    mps_parser_fail(error_type_t::ValidationError, "MPS numeric token exceeds supported length");
+  }
+  // strtod("") consumes nothing and reports no error, so an empty token would otherwise pass
+  // the "consumed everything" check below and be returned as 0.0.
+  if (s.empty()) {
+    mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token");
+  }
+  std::memcpy(stack_buf, s.data(), s.size());
+  stack_buf[s.size()] = '\0';
+  for (size_t i = 0; i < s.size(); ++i) {
+    if (stack_buf[i] == 'd' || stack_buf[i] == 'D') stack_buf[i] = 'e';
+  }
+
+  char* parse_end = nullptr;
+  errno           = 0;
+  double value    = std::strtod(stack_buf, &parse_end);
+  if (parse_end != stack_buf + s.size() || errno == ERANGE) {
+    mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token");
+  }
+  return value;
+}
+
+// see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51
+// (8), 2021.
+// Computes the IEEE-754 binary64 bit pattern (without sign) of man * 10^exp10 into `bits`.
+// `man` must be non-zero (it is normalized with countl_zero). Returns false, leaving the
+// decision to the caller's fallback, when exp10 is outside the table, the rounding is
+// ambiguous, or the result would not be a normal double.
+static inline bool eisel_lemire(uint64_t man, int exp10, uint64_t& bits)
+{
+  if (exp10 < FASTP64_MIN_EXP_10 || exp10 > FASTP64_MAX_EXP_10) { return false; }
+
+  const power_10_lut_entry_t p = fast_fp64_parse_lut[exp10 - FASTP64_MIN_EXP_10];
+  int lz                       = std::countl_zero(man);
+  uint64_t norm                = man << lz;
+  int adj_e2                   = p.biased_e2 - lz;
+
+  unsigned __int128 product = (unsigned __int128)norm * p.high;
+  uint64_t hi               = (uint64_t)(product >> 64);
+  uint64_t lo               = (uint64_t)product;
+
+  // If the high product lands near the 9-bit halfway window, include the low
+  // 64x64 product to disambiguate rounding before deciding whether to fallback.
+  if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK && lo + norm < norm) {
+    unsigned __int128 low_product = (unsigned __int128)norm * p.low;
+    uint64_t low_hi               = (uint64_t)(low_product >> 64);
+    uint64_t low_lo               = (uint64_t)low_product;
+    uint64_t old_lo               = lo;
+    lo += low_hi;
+    hi += lo < old_lo ? 1 : 0;
+    if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK &&
+        lo == std::numeric_limits<uint64_t>::max() && low_lo + norm < low_lo) {
+      return false;
+    }
+  }
+
+  uint64_t hi_msb = hi >> 63;
+  // Extract 54 bits: 53 significand bits plus one rounding bit. The product
+  // may be shifted by one depending on whether hi already has its top bit set.
+  uint64_t x54 = hi >> (9 + hi_msb);
+  adj_e2 -= (int)(1 - hi_msb);
+
+  // Exact halfway with round-to-even ambiguity; let strtod handle the rare tie.
+  if (lo == 0 && (hi & FASTP64_HALF_MASK) == 0 && (x54 & 3) == 1) { return false; }
+
+  // Round 54 -> 53 bits, carry into the exponent if rounding overflows.
+  uint64_t x53      = (x54 + (x54 & 1)) >> 1;
+  uint64_t overflow = x53 >> 53;
+  uint64_t ret_man  = (x53 >> overflow) & FASTP64_MANTISSA_MASK;
+  int ret_exp       = adj_e2 + (int)overflow;
+  // subnormal / zero (<= 0) and inf / NaN (>= 0x7FF) exponents are left to the fallback
+  if (ret_exp <= 0 || ret_exp >= FASTP64_EXPONENT_MASK) { return false; }
+
+  bits = ((uint64_t)ret_exp << 52) | ret_man;
+  return true;
+}
+
+// Turns a scanned decimal into a double.
+// Returns a quiet NaN as the "could not decide" marker; the caller then falls back to strtod.
+static inline double assemble_fp64(const parsed_decimal_t& dec)
+{
+  uint64_t bits = dec.negative ? (uint64_t{1} << 63) : 0;
+  // zero mantissa: signed zero, whatever the exponent
+  if (dec.mantissa == 0) { return std::bit_cast<double>(bits); }
+
+  if (dec.fast_eligible) {
+    // Shortcut: when mantissa and power of ten are both exactly representable (and, for
+    // non-negative exponents, the product stays within 2^53), one multiply/divide is enough.
+    double small    = 0.0;
+    bool used_small = false;
+    if (dec.exp10 >= 0 && dec.exp10 < (int)small_integer_powers.size()) {
+      uint64_t limit = (uint64_t{1} << 53) / small_integer_powers[dec.exp10];
+      if (dec.mantissa <= limit) {
+        small      = (double)dec.mantissa * small_powers[dec.exp10];
+        used_small = true;
+      }
+    } else if (dec.exp10 < 0 && dec.exp10 >= -22 && dec.mantissa < (uint64_t{1} << 53)) {
+      small      = (double)dec.mantissa / small_powers[-dec.exp10];
+      used_small = true;
+    }
+    if (used_small) { return dec.negative ? -small : small; }
+
+    uint64_t mag_bits = 0;
+    if (eisel_lemire(dec.mantissa, dec.exp10, mag_bits)) {
+      return std::bit_cast<double>(bits | mag_bits);
+    }
+  }
+
+  return std::numeric_limits<double>::quiet_NaN();
+}
+
+// Parses one numeric token starting at p and advances p past it.
+// The token must contain at least one digit and must be followed by end-of-buffer or by a
+// whitespace/control character (byte value <= 32); anything else is a ValidationError.
+// Values the fast paths cannot decide are re-parsed with fallback_strtod().
+static inline double parse_fp64_advance(const char*& p, const char* end)
+{
+  const char* start = p;
+  parsed_decimal_t dec;
+  if (!parse_decimal_advance(p, end, dec)) {
+    // No digit was found. The consumed span is empty, a lone sign and/or a lone '.';
+    // fallback_strtod() rejects all of these with a ValidationError.
+    return fallback_strtod(std::string_view(start, (size_t)(p - start)));
+  }
+
+  // Validate the token boundary before choosing a conversion path, so that the strtod
+  // fallback cannot accept a literal with trailing garbage that the fast path would reject.
+  if (p < end && (unsigned char)*p > 32) {
+    mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token");
+  }
+
+  double v = assemble_fp64(dec);
+  // v == v is false only for the NaN "undecided" marker
+  if (v == v) { return v; }
+  return fallback_strtod(std::string_view(start, (size_t)(p - start)));
+}
+
+}  // namespace fp64
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
new file mode 100644
index 0000000000..8897bfef1c
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -0,0 +1,386 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "fast_fp64_parser.hpp" + +#include +#include +#include + +#include +#include + +#ifndef LIKELY +#define LIKELY(x) __builtin_expect(!!(x), 1) +#endif + +#ifndef UNLIKELY +#define UNLIKELY(x) __builtin_expect(!!(x), 0) +#endif + +namespace cuopt::linear_programming::io::detail { + +enum scan_mode { + skip_whitespace, + until_whitespace, +}; + +// util to serially scan along an in-memory input buffer +// contains optimized primitives for most parsing operations +struct cursor_t { + const char* start; + const char* ptr; + const char* end; + + cursor_t(const char* data, std::size_t size) : start(data), ptr(data), end(data + size) {} + + bool done() const { return ptr >= end; } + + // used in error reporting + std::pair linecol_position() const + { + std::size_t line = 1; + const char* line_start = start; + for (const char* p = start; p < ptr; ++p) { + if (*p == '\n') { + ++line; + line_start = p + 1; + } + } + std::size_t column = (std::size_t)(ptr - line_start) + 1; + return {line, column}; + } + + [[noreturn]] void error(const char* msg, ...) 
+ { + auto [line, col] = linecol_position(); + va_list args; + va_start(args, msg); + char msg_buf[512]; + std::vsnprintf(msg_buf, sizeof(msg_buf), msg, args); + va_end(args); + mps_parser_fail(error_type_t::ValidationError, "%zu:%zu: %s", line, col, msg_buf); + } + + void advance(std::size_t n) + { + if (ptr + n > end) { mps_parser_fail(error_type_t::ValidationError, "Unexpected end of file"); } + ptr += n; + } + + template + static const char* scalar_scan(const char* p, const char* end) + { + while (p < end) { + unsigned char c = (unsigned char)*p; + if constexpr (mode == skip_whitespace) { + if (c > 32 || c == '\n') return p; + } else { + if (c <= 32) return p; + } + p++; + } + return end; + } + + // scans for the first non-whitespace (or vice versa) + template + static const char* simd_scan(const char* p, const char* end) + { + const simde__m256i v32 = simde_mm256_set1_epi8(32); // space/control characters + const simde__m256i vnl = simde_mm256_set1_epi8('\n'); + + while (p + 32 <= end) { + simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)p); + simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); + + unsigned int mask; + if constexpr (mode == skip_whitespace) { + simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); + mask = (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl)); + } else { + mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32); + } + + if (mask != 0) { return p + __builtin_ctz(mask); } + p += 32; + } + return scalar_scan(p, end); + } + + void skip_ws() { ptr = simd_scan(ptr, end); } + + bool eol() const { return ptr < end && (*ptr == '\n' || *ptr == '\r'); } + + void consume_eol() + { + if (ptr < end && *ptr == '\r') { + ptr++; + if (ptr < end && *ptr == '\n') { ptr++; } + return; + } + if (ptr < end && *ptr == '\n') { ptr++; } + } + + // could be SIMD but comments are usually rare + void skip_comment_line() + { + while (!done() && *ptr != '\n' && *ptr != '\r') { + ptr++; + } + consume_eol(); + } + + 
void skip_to_eol() + { + while (!done() && *ptr != '\n' && *ptr != '\r') { + ptr++; + } + } + + // useful for parsing NAME/OBJNAME which may span multiple "fields" according to the MPS spec + std::string_view read_rest_of_line_trimmed() + { + const char* begin = ptr; + const char* line_end = begin; + while (line_end < end && *line_end != '\n' && *line_end != '\r') { + ++line_end; + } + + while (begin < line_end && (*begin == ' ' || *begin == '\t')) { + ++begin; + } + while (line_end > begin && (line_end[-1] == ' ' || line_end[-1] == '\t')) { + --line_end; + } + ptr = line_end; + return std::string_view(begin, (std::size_t)(line_end - begin)); + } + + inline __attribute__((always_inline)) std::string_view read_field() + { + if (UNLIKELY(done())) { return {}; } + + const char* field_start = ptr; + if (UNLIKELY(end - ptr < 32)) { + ptr = scalar_scan(ptr, end); + const char* field_end = ptr; + if (ptr < end) { skip_ws(); } + return std::string_view(field_start, field_end - field_start); + } + + const simde__m256i v32 = simde_mm256_set1_epi8(32); + const simde__m256i vnl = simde_mm256_set1_epi8('\n'); + + // all input streams provide trailing padding, so this 32B load is valid + // whenever end - ptr >= 32 + simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr); + simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); + unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32); + + if (UNLIKELY(ws_mask == 0)) { + ptr = simd_scan(ptr + 32, end); + const char* field_end = ptr; + if (ptr < end) { skip_ws(); } + return std::string_view(field_start, field_end - field_start); + } + + int field_end_off = __builtin_ctz(ws_mask); + const char* field_end = ptr + field_end_off; + + simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); + unsigned int stop_mask = + (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl)); + unsigned int after_field = stop_mask & ~((1u << field_end_off) - 1); + + if (LIKELY(after_field != 0)) { + ptr = 
ptr + __builtin_ctz(after_field); + } else { + ptr = field_end; + if (ptr < end) { skip_ws(); } + } + + return std::string_view(field_start, field_end - field_start); + } + + // read but do not consume + inline __attribute__((always_inline)) std::string_view peek_field() + { + if (UNLIKELY(done())) { return {}; } + const char* field_end = simd_scan(ptr, end); + return std::string_view(ptr, field_end - ptr); + } + + static inline std::string_view peek_field_at(const char* line_start, const char* section_end) + { + cursor_t cursor(line_start, (std::size_t)(section_end - line_start)); + cursor.skip_ws(); + return cursor.peek_field(); + } + + // usually in MPS fields go in pair. these can usually be extracted in a single 32B load + inline __attribute__((always_inline)) std::pair + read_two_fields() + { + auto slow = [&] { + auto f1 = read_field(); + auto f2 = read_field(); + return std::pair{f1, f2}; + }; + + if (UNLIKELY(end - ptr < 32)) { return slow(); } + + const char* field1_start = ptr; + const simde__m256i v32 = simde_mm256_set1_epi8(32); + const simde__m256i vnl = simde_mm256_set1_epi8('\n'); + + // Same padded-buffer contract as read_field(). 
+ simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr); + simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); + + unsigned int printable_mask = (unsigned int)simde_mm256_movemask_epi8(gt32); + unsigned int ws_mask = ~printable_mask; + + if (UNLIKELY(ws_mask == 0)) { return slow(); } + int field1_end_off = __builtin_ctz(ws_mask); + + simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); + unsigned int nl_mask = (unsigned int)simde_mm256_movemask_epi8(is_nl); + unsigned int barrier_after_field1 = (printable_mask | nl_mask) >> field1_end_off; + if (UNLIKELY(barrier_after_field1 == 0)) { return slow(); } + int field2_rel_off = __builtin_ctz(barrier_after_field1); + if (UNLIKELY(ptr[field1_end_off + field2_rel_off] == '\n' || + ptr[field1_end_off + field2_rel_off] == '\r')) { + return slow(); + } + int field2_start_off = field1_end_off + field2_rel_off; + + unsigned int ws_after_field2_start = ws_mask >> field2_start_off; + if (UNLIKELY(ws_after_field2_start == 0)) { return slow(); } + int field2_end_off = field2_start_off + __builtin_ctz(ws_after_field2_start); + + unsigned int stop_mask = printable_mask | nl_mask; + unsigned int stop_after_field2 = stop_mask >> field2_end_off; + if (LIKELY(stop_after_field2 != 0)) { + ptr = ptr + field2_end_off + __builtin_ctz(stop_after_field2); + } else { + ptr = ptr + field2_end_off; + skip_ws(); + } + + return {std::string_view(field1_start, field1_end_off), + std::string_view(field1_start + field2_start_off, field2_end_off - field2_start_off)}; + } +}; + +static inline void expect(cursor_t& cursor, const char* field) +{ + auto id = cursor.read_field(); + if (UNLIKELY(id != field)) { + cursor.error("expected '%s', got '%.*s'", field, (int)id.size(), id.data()); + } +} + +static inline void accept_comment_line(cursor_t& cursor) +{ + for (;;) { + while (!cursor.done() && cursor.eol()) { + cursor.consume_eol(); + } + if (cursor.done() || (cursor.ptr[0] != '*' && cursor.ptr[0] != '$')) { return; } + 
cursor.skip_comment_line(); + } +} + +static inline void expect_eol(cursor_t& cursor) +{ + if (UNLIKELY(!cursor.eol())) { + auto got = cursor.peek_field(); + cursor.error("expected end of line, got '%.*s'", (int)got.size(), got.data()); + } + + for (;;) { + while (cursor.eol()) { + cursor.consume_eol(); + } + if (UNLIKELY(cursor.done())) { return; } + + if (UNLIKELY(cursor.ptr[0] == '*' || cursor.ptr[0] == '$')) { + cursor.skip_comment_line(); + continue; + } + + if (LIKELY(cursor.ptr[0] == ' ') && LIKELY(cursor.ptr + 1 < cursor.end)) { cursor.ptr += 1; } + + if (UNLIKELY(cursor.done())) { return; } + char c = cursor.ptr[0]; + if (UNLIKELY(!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))) { + cursor.skip_ws(); + if (cursor.eol()) { continue; } + } + break; + } +} + +static inline std::string_view peek(cursor_t& cursor) { return cursor.peek_field(); } + +static inline bool accept(cursor_t& cursor, const char* field) +{ + if (peek(cursor) == field) { + expect(cursor, field); + return true; + } + return false; +} + +static inline void expect_section(cursor_t& cursor, const char* section) +{ + expect(cursor, section); + expect_eol(cursor); +} + +static inline double expect_number(cursor_t& cursor) +{ + auto num = cursor.read_field(); + if (num.empty()) { cursor.error("expected number, got empty field"); } + const char* p = num.data(); + return fp64::parse_fp64_advance(p, p + num.size()); +} + +static inline double expect_number_fast_pm_one(cursor_t& cursor) +{ + const char* p = cursor.ptr; + if (cursor.end - p >= 3 && p[0] == '-' && p[1] == '1' && p[2] <= ' ') { + cursor.ptr = p + 2; + cursor.skip_ws(); + return -1.0; + } + if (cursor.end - p >= 2 && p[0] == '1' && p[1] <= ' ') { + cursor.ptr = p + 1; + cursor.skip_ws(); + return 1.0; + } + return expect_number(cursor); +} + +static inline bool accept_section(cursor_t& cursor, const char* section) +{ + if (accept(cursor, section)) { + expect_eol(cursor); + return true; + } + return false; +} + +static inline bool 
accept_comment(cursor_t& cursor) +{ + if (UNLIKELY(!cursor.done() && cursor.ptr[0] == '$')) { + cursor.skip_to_eol(); + return true; + } + return false; +} + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp new file mode 100644 index 0000000000..02038c6fd9 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -0,0 +1,3219 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#include "fast_parser.hpp" +#include "fast_parse_primitives.hpp" +#include "file_reader.hpp" +#include "hash_table_smallstr.hpp" +#include "mmap_region.hpp" +#include "mps_section_scanner.hpp" +#include "nvtx_ranges.hpp" + +#include +#if defined(MPS_FAST_PERF_COUNTERS) || defined(MPS_FAST_TIMERS) +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MPS_FAST_COMPACT_ROW_HASH +#define MPS_FAST_THP_PREFAULT + +namespace cuopt::linear_programming::io::detail { + +static constexpr size_t KiB = 1024; +static constexpr size_t MiB = 1024 * KiB; +static constexpr size_t GiB = 1024 * MiB; + +// per-chunk row-count scratch tile for the column parsing workers +// small enough to remain warm in L1 +static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS = 4096; +static constexpr int MPS_ROWS_THREAD_CAP = 16; +static constexpr int MPS_COLUMNS_THREAD_CAP = 32; +static constexpr int MPS_BOUNDS_THREAD_CAP = 32; +static constexpr int MPS_NAMES_THREAD_CAP = 16; +// avoid openmp setup for small bounds sections +static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES = 256 * MiB; +// ordered-name fallback is cheap enough to parallelize on smaller bounds 
sections +static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8 * MiB; +// lower bound on columns chunk size to avoid tiny parser tasks +static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * MiB; +// parser-wide thread cap switch; very small files lose to scheduling overhead +static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES = 100ull * 1000ull * 1000ull; +// thread caps for small and large files +static constexpr int MPS_SMALL_FILE_THREAD_CAP = 16; +static constexpr int MPS_LARGE_FILE_THREAD_CAP = 32; + +static int parser_thread_cap_for_size(size_t bytes) +{ + int size_cap = bytes < MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES ? MPS_SMALL_FILE_THREAD_CAP + : MPS_LARGE_FILE_THREAD_CAP; + return std::max(1, std::min(size_cap, omp_get_max_threads())); +} + +static int phase_thread_count(int phase_cap) +{ + const int available_threads = omp_in_parallel() ? omp_get_num_threads() : omp_get_max_threads(); + return std::max(1, std::min(phase_cap, available_threads)); +} + +// Arena allocator for the strings (row names, column names) to avoid the dreadful overheads of +// glibc's malloc and std::vector +class chunk_name_arena_t { + public: + void reserve(size_t bytes) + { + if (bytes > next_slab_size_) { next_slab_size_ = bytes; } + } + + std::string_view copy(std::string_view name) + { + char* dst = allocate(name.size() + 1); + std::memcpy(dst, name.data(), name.size()); + dst[name.size()] = '\0'; + return std::string_view(dst, name.size()); + } + + private: + struct slab_t { + std::vector data; + size_t used = 0; + }; + + char* allocate(size_t bytes) + { + if (slabs_.empty() || slabs_.back().used + bytes > slabs_.back().data.size()) { + size_t capacity = std::max(bytes, next_slab_size_); + slab_t slab; + slab.data.resize(capacity); + slabs_.push_back(std::move(slab)); + next_slab_size_ = std::max(next_slab_size_ * 2, capacity); + } + slab_t& slab = slabs_.back(); + char* ptr = slab.data.data() + slab.used; + slab.used += bytes; + return ptr; + 
} + + std::vector slabs_; + size_t next_slab_size_ = 64 * KiB; +}; + +struct timer_entry_t { + const char* name; + double elapsed_ms; + size_t rss_kb; + size_t hwm_kb; +}; + +static std::vector& get_timer_buffer() +{ + static std::vector buffer; + buffer.reserve(100); + return buffer; +} + +static std::mutex& get_timer_mutex() +{ + static std::mutex mutex; + return mutex; +} + +static void flush_timers() +{ +#ifdef MPS_FAST_TIMERS + std::lock_guard lock(get_timer_mutex()); + auto& buffer = get_timer_buffer(); + for (const auto& entry : buffer) { + std::fprintf(stderr, + "[TIMER] %s: %.3f ms rss_GB=%.3f hwm_GB=%.3f\n", + entry.name, + entry.elapsed_ms, + (double)entry.rss_kb / (double)(GiB / KiB), + (double)entry.hwm_kb / (double)(GiB / KiB)); + } + buffer.clear(); +#endif +} + +enum class materialize_touch_t { + write_2mb, + write_4kb, +}; + +// instanciate a range using mmap anon pages with hugepage hints, and materialize them +// by touching each to nudge the kernel into invoking its THP mechanism +static void materialize_hugepages([[maybe_unused]] const char* label, + void* data, + size_t bytes, + materialize_touch_t touch) +{ + if (data == nullptr || bytes == 0) return; + + constexpr size_t two_mb = 2 * MiB; + size_t page_size = system_page_size(); + uintptr_t start = reinterpret_cast(data); + uintptr_t end = start + bytes; + uintptr_t aligned_start = start & ~(uintptr_t)(page_size - 1); + uintptr_t aligned_end = (end + page_size - 1) & ~(uintptr_t)(page_size - 1); + size_t aligned_bytes = (size_t)(aligned_end - aligned_start); + + errno = 0; + madvise((void*)(aligned_start), aligned_bytes, MADV_HUGEPAGE); + + size_t step = touch == materialize_touch_t::write_2mb ? 
two_mb : page_size; + volatile char* ptr = (volatile char*)(data); + for (size_t offset = 0; offset < bytes; offset += step) { + ptr[offset] = ptr[offset]; + } + ptr[bytes - 1] = ptr[bytes - 1]; +} + +template +static void materialize_vector_hugepages(const char* label, + std::vector& values, + materialize_touch_t touch) +{ + materialize_hugepages(label, values.data(), values.size() * sizeof(T), touch); +} + +class scoped_timer_t { + public: + scoped_timer_t([[maybe_unused]] const char* name, double* accumulator = nullptr) +#ifdef MPS_FAST_TIMERS + : name_(name), + accumulator_(accumulator), + nvtx_(name, nvtx::color_for_name(name)), + start_(std::chrono::high_resolution_clock::now()){} +#else + : accumulator_(accumulator) + { + } +#endif + + ~scoped_timer_t() + { +#ifdef MPS_FAST_TIMERS + auto end = std::chrono::high_resolution_clock::now(); + double elapsed_ms = std::chrono::duration(end - start_).count(); + nvtx_.end(); + if (accumulator_) { *accumulator_ += elapsed_ms; } + auto [rss_kb, hwm_kb] = current_process_rss_kb(); + std::lock_guard lock(get_timer_mutex()); + get_timer_buffer().push_back({name_, elapsed_ms, rss_kb, hwm_kb}); +#endif + } + + scoped_timer_t(const scoped_timer_t&) = delete; + scoped_timer_t& operator=(const scoped_timer_t&) = delete; + + private: +#ifdef MPS_FAST_TIMERS + const char* name_; +#endif + double* accumulator_; +#ifdef MPS_FAST_TIMERS + nvtx::scoped_range_t nvtx_; + std::chrono::high_resolution_clock::time_point start_; +#endif +}; + +class omp_max_active_levels_guard_t { + public: + explicit omp_max_active_levels_guard_t(int value) : old_value_(omp_get_max_active_levels()) + { + omp_set_max_active_levels(value); + } + + ~omp_max_active_levels_guard_t() { omp_set_max_active_levels(old_value_); } + + omp_max_active_levels_guard_t(const omp_max_active_levels_guard_t&) = delete; + omp_max_active_levels_guard_t& operator=(const omp_max_active_levels_guard_t&) = delete; + + private: + int old_value_ = 0; +}; + +static inline void 
error_unknown_row(cursor_t& cursor, const char* row_start, const char* section) +{ + const char* row_end = row_start; + while (row_end < cursor.end && *row_end > ' ') { + row_end++; + } + cursor.error("unknown row name in %s: %.*s", section, (int)(row_end - row_start), row_start); +} + +// Two modes for row/column name lookup: +// - hash: arbitrary names via hash table (rows) or var_names_map (columns) +// - dense_ordered: sequential numeric suffixes like R0001/R0002 or V0/V1 +enum class index_mode_t { + hash, + dense_ordered, +}; + +// Every 19-digit decimal string fits in uint64_t; 20+ digits may not and are wildly unlikely in the +// context of dense MPS rows/cols +static constexpr size_t dense_suffix_max_digits = 19; + +static inline size_t decimal_digits_u64(uint64_t value) +{ + size_t digits = 1; + while (value >= 10) { + value /= 10; + digits++; + } + return digits; +} + +static inline bool parse_trailing_u64(std::string_view name, + std::string_view& prefix, + uint64_t& value, + size_t& suffix_width) +{ + size_t pos = name.size(); + while (pos > 0 && fp64::is_digit(name[pos - 1])) { + pos--; + } + if (pos == name.size()) { return false; } + + suffix_width = name.size() - pos; + if (suffix_width > dense_suffix_max_digits) { return false; } + + uint64_t parsed = 0; + for (size_t i = pos; i < name.size(); ++i) { + parsed = parsed * 10 + (uint64_t)(name[i] - '0'); + } + + prefix = std::string_view(name.data(), pos); + value = parsed; + return true; +} + +// necessary to handle cases like R0001, ..., R2000, ... +static inline bool dense_suffix_is_zero_padded(std::string_view name, size_t suffix_width) +{ + return suffix_width > 1 && name[name.size() - suffix_width] == '0'; +} + +static inline size_t dense_initial_pad_width(std::string_view name, size_t suffix_width) +{ + return dense_suffix_is_zero_padded(name, suffix_width) ? 
suffix_width : 0; +} + +static inline bool dense_suffix_width_ok(uint64_t value, size_t suffix_width, size_t pad_width) +{ + size_t digits = decimal_digits_u64(value); + size_t expected_width = std::max(pad_width, digits); + return suffix_width == expected_width; +} + +struct dense_name_index_t { + std::string prefix; + uint64_t min_id = 0; + uint64_t max_id = 0; + size_t pad_width = 0; + + void reset() + { + prefix.clear(); + min_id = 0; + max_id = 0; + pad_width = 0; + } + + bool suffix_width_ok(uint64_t value, size_t suffix_width) const + { + return dense_suffix_width_ok(value, suffix_width, pad_width); + } + + size_t lookup(std::string_view name) const + { + std::string_view parsed_prefix; + uint64_t value = 0; + size_t suffix_width = 0; + if (!parse_trailing_u64(name, parsed_prefix, value, suffix_width)) { return SIZE_MAX; } + if (parsed_prefix != prefix || !suffix_width_ok(value, suffix_width)) { return SIZE_MAX; } + if (value < min_id || value > max_id) { return SIZE_MAX; } + return (size_t)(value - min_id); + } + + void format_name(size_t idx, std::string& out) const + { + uint64_t value = min_id + idx; + char digits_buf[32]; + auto [digits_end, ec] = std::to_chars(digits_buf, digits_buf + sizeof(digits_buf), value); + if (ec != std::errc()) { + out.assign(prefix); + return; + } + size_t digits_len = (size_t)(digits_end - digits_buf); + size_t width = std::max(pad_width, digits_len); + out.resize(prefix.size() + width); + std::memcpy(out.data(), prefix.data(), prefix.size()); + char* suffix = out.data() + prefix.size(); + if (width > digits_len) { + std::memset(suffix, '0', width - digits_len); + suffix += width - digits_len; + } + std::memcpy(suffix, digits_buf, digits_len); + } +}; + +struct dense_observe_state_t { + bool candidate = true; + dense_name_index_t index; + size_t count = 0; +}; + +static inline void observe_dense_name(bool& candidate, + dense_name_index_t& index, + size_t& observed_count, + std::string_view name, + uint64_t expected_id = 
std::numeric_limits::max()) +{ + if (!candidate) { return; } + + std::string_view prefix; + uint64_t value = 0; + size_t suffix_width = 0; + if (!parse_trailing_u64(name, prefix, value, suffix_width)) { + candidate = false; + return; + } + + if (observed_count == 0) { + index.prefix.assign(prefix); + index.min_id = value; + index.max_id = value; + index.pad_width = dense_initial_pad_width(name, suffix_width); + observed_count = 1; + return; + } + + if (prefix != index.prefix) { + candidate = false; + return; + } + + if (expected_id != std::numeric_limits::max() && value != expected_id) { + candidate = false; + return; + } + + if (!index.suffix_width_ok(value, suffix_width)) { + candidate = false; + return; + } + + index.max_id = value; + observed_count++; +} + +// Maps MPS row/column names to indices via one of two strategies, chosen per problem: +// +// * dense_ordered - when every name in a section is a shared prefix followed by a +// contiguous run of integers (e.g. R0001, R0002, ... or x1, x2, ...). The index is +// then computed straight from the parsed integer (value - min_id), so no hash table +// is built or probed. This is the common, fast case for solver-generated models. +// * hash - the general fallback (smallstr_hash_table_t) for arbitrary names. +// +// Each section decides its own mode while scanning: it stays a dense_ordered "candidate" +// as long as names keep matching the prefix + consecutive-integer + zero-pad-width rule +// (see observe_dense_name), and the first violation drops it to the hash path. The chosen +// mode lives in row_index_mode / col_index_mode, and every lookup branches on it +// (row_lookup / read_row_lookup vs the dense_ordered variants below). Holding this in mind +// explains most of the paired/dual code paths throughout this file. 
+template +struct parse_state_t { + mps_data_model_t& problem; + cursor_t& cursor; + + // backed by the input buffer + std::vector row_names_sv; + // backed by the arena allocator + std::vector var_names_sv; + std::vector var_name_arenas; + std::string_view problem_name_sv; + std::string_view objective_name_sv; + // secondary 'N' rows in ROWS — rare; membership distinguishes them from unknown row names + std::unordered_set ignored_objective_names; + + // Column name lookup for labels like V0, V1, ... + index_mode_t col_index_mode = index_mode_t::hash; + dense_name_index_t col_dense; + + smallstr_hash_table_t row_hash_; + + // Row name lookup for labels like R0001, R0002, ... + index_mode_t row_index_mode = index_mode_t::hash; + bool row_dense_candidate = true; + dense_name_index_t row_dense; + + // var_names still uses STL (only used in parse_bounds, not as hot) + std::unordered_map var_names_map; + + mmap_region_t temp_A_region; + mmap_region_t temp_A_indices_region; + f_t* temp_A = nullptr; + i_t* temp_A_indices = nullptr; + size_t temp_csr_nnz = 0; + bool temp_csr_materialized = false; + + struct bounds_only_var_t { + f_t lb = f_t{0}; + f_t ub = std::numeric_limits::infinity(); + char type = 'C'; + }; + + // some writers introduce zero-column variables only in BOUNDS. 
+ std::map bounds_only_vars; + + struct qcmatrix_block_t { + size_t row_idx = SIZE_MAX; + std::string_view row_name; + std::vector> entries; + }; + + std::vector qcmatrix_blocks; + + parse_state_t(mps_data_model_t& p, cursor_t& c) : problem(p), cursor(c) {} + + void init_row_hash_table() + { + if (init_row_dense_ordered_table()) { return; } + init_row_hash_table_impl(); + } + + void observe_objective_row_name(std::string_view name) + { + if (objective_name_sv.empty()) { + objective_name_sv = name; + } else if (name != objective_name_sv) { + ignored_objective_names.insert(name); + } + } + + bool init_row_dense_ordered_table() + { + scoped_timer_t timer("row_dense_finalize"); + size_t n_rows = row_names_sv.size(); + if (!row_dense_candidate || n_rows == 0) { return false; } + if (row_dense.max_id < row_dense.min_id) { return false; } + uint64_t dense_count = row_dense.max_id - row_dense.min_id + 1; + if (dense_count != n_rows) { return false; } + + row_index_mode = index_mode_t::dense_ordered; + return true; + } + + // Insert all rows into the hash table. The perf-counter instrumentation is isolated in + // these two helpers so its #ifdefs do not fragment init_row_hash_table_impl's setup flow; + // both compile down to a bare insert loop when MPS_FAST_PERF_COUNTERS is off. 
+ void insert_rows_partitioned( + int num_threads, + const std::array& partition_offsets, + const std::vector& row_order, + const std::vector& row_hashes) + { + scoped_timer_t timer("row_hash_insert_partitioned"); +#ifdef MPS_FAST_PERF_COUNTERS + std::vector perf_snapshots(MPS_ROW_HASH_PARTITIONS); +#endif +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) { + size_t p = (size_t)part_id; +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; +#endif + for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) { + size_t idx = row_order[pos]; + row_hash_.insert_partition(p, row_names_sv[idx], row_hashes[idx], idx); + } +#ifdef MPS_FAST_PERF_COUNTERS + perf_snapshots[p] = perf_counters.stop(); +#endif + } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("row_hash_insert_partitioned", perf_snapshots); +#endif + } + + void insert_rows_serial(size_t n_rows) + { +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; +#endif + for (size_t idx = 0; idx < n_rows; ++idx) { + row_hash_.insert_serial(row_names_sv[idx], idx); + } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("row_hash_insert_all", {perf_counters.stop()}); +#endif + } + + void init_row_hash_table_impl() + { + scoped_timer_t timer("row_hash_init_total"); + size_t n_rows = row_names_sv.size(); + const int num_threads = phase_thread_count(MPS_ROWS_THREAD_CAP); + const bool use_partitioned = n_rows >= MPS_ROW_HASH_PARTITIONED_MIN_ROWS && num_threads > 1; +#ifdef MPS_FAST_COMPACT_ROW_HASH + constexpr bool compact_row_hash = true; +#else + constexpr bool compact_row_hash = false; +#endif + std::vector row_hashes; + std::vector row_order; + std::array partition_counts = {}; + std::array partition_offsets = {}; + + if (use_partitioned) { + scoped_timer_t timer("row_hash_partition_metadata"); + row_hashes.resize(n_rows); + size_t inline_rows = 0; + for (size_t idx = 0; idx 
< n_rows; ++idx) { + std::string_view name = row_names_sv[idx]; + if (UNLIKELY(name.size() > HASH_KEY_BYTES)) { + row_hash_.note_long_name(name, idx); + continue; + } + uint32_t hash = fnv1a_hash(name.data(), name.size()); + row_hashes[idx] = hash; + ++partition_counts[hash_partition_for(hash)]; + ++inline_rows; + } + + for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { + partition_offsets[p + 1] = partition_offsets[p] + partition_counts[p]; + } + + row_order.resize(inline_rows); + auto next_offsets = partition_offsets; + for (size_t idx = 0; idx < n_rows; ++idx) { + if (UNLIKELY(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; } + size_t part = hash_partition_for(row_hashes[idx]); + row_order[next_offsets[part]++] = idx; + } + } + + if (use_partitioned) { + row_hash_.configure_partitioned_buckets(partition_counts, compact_row_hash); + } else { + row_hash_.configure_serial_buckets(n_rows, compact_row_hash); + } + + { + scoped_timer_t timer("row_hash_mmap"); + row_hash_.allocate_mmap("row hash table"); + } + +#ifdef MPS_FAST_THP_PREFAULT + { + scoped_timer_t timer("row_hash_thp_prefault"); + materialize_hugepages("row_names_ht", + row_hash_.slots(), + row_hash_.region().size(), + materialize_touch_t::write_2mb); + } +#endif + + { + scoped_timer_t timer("row_hash_insert_all"); + row_hash_.reset_build_probe_stats(); + if (use_partitioned) { + insert_rows_partitioned(num_threads, partition_offsets, row_order, row_hashes); + } else { + insert_rows_serial(n_rows); + } + row_hash_.print_build_probe_report(n_rows); + } + +#ifdef MPS_FAST_MADV_COLLAPSE + { + scoped_timer_t timer("row_hash_madv_collapse"); + row_hash_.region().advise(MADV_COLLAPSE); + } +#endif + } + + size_t row_lookup(std::string_view name) const + { + if (LIKELY(row_index_mode == index_mode_t::dense_ordered)) { return row_dense.lookup(name); } + return row_hash_.lookup(name); + } + + size_t read_row_lookup_dense_ordered(cursor_t& cursor) const + { + const char* start = cursor.ptr; + const char* 
p = start; + + size_t prefix_len = row_dense.prefix.size(); + if (prefix_len > 0) { + if ((size_t)(cursor.end - p) < prefix_len || + std::memcmp(p, row_dense.prefix.data(), prefix_len) != 0) { + cursor.read_field(); + return SIZE_MAX; + } + p += prefix_len; + } + + const char* digits_start = p; + uint64_t value = 0; + fp64::parse_u64_digits_advance(p, cursor.end, value); + + size_t suffix_width = (size_t)(p - digits_start); + if (suffix_width == 0 || suffix_width > dense_suffix_max_digits || p >= cursor.end || + *p > ' ' || !row_dense.suffix_width_ok(value, suffix_width) || value < row_dense.min_id || + value > row_dense.max_id) { + cursor.ptr = start; + cursor.read_field(); + return SIZE_MAX; + } + + cursor.ptr = p; + cursor.skip_ws(); + return (size_t)(value - row_dense.min_id); + } + + size_t read_row_lookup(cursor_t& cursor) const + { + if (LIKELY(row_index_mode == index_mode_t::dense_ordered)) { + return read_row_lookup_dense_ordered(cursor); + } + + auto row_name = cursor.read_field(); + return row_hash_.lookup(row_name); + } +}; + +// ============================================================================= +// Section parsers +// ============================================================================= + +template +static void parse_name_section(parse_state_t& state) +{ + scoped_timer_t timer("parse_name"); + if (peek(state.cursor) == "ROWS") { return; } + expect(state.cursor, "NAME"); + if (!state.cursor.eol()) { state.problem_name_sv = state.cursor.read_rest_of_line_trimmed(); } + expect_eol(state.cursor); +} + +template +static void parse_objsense_section(parse_state_t& state) +{ + scoped_timer_t timer("parse_objsense"); + if (accept(state.cursor, "OBJSENSE")) { + if (state.cursor.eol()) { expect_eol(state.cursor); } + auto sense = state.cursor.read_field(); + if (sense == "MIN" || sense == "MINIMIZE") { + state.problem.maximize_ = false; + } else if (sense == "MAX" || sense == "MAXIMIZE") { + state.problem.maximize_ = true; + } else { + 
state.cursor.error("expected MIN/MAX or MINIMIZE/MAXIMIZE, got '%s'", sense.data()); + } + accept_comment(state.cursor); + expect_eol(state.cursor); + } +} + +template +static void parse_objname_section(parse_state_t& state) +{ + scoped_timer_t timer("parse_objname"); + if (accept(state.cursor, "OBJNAME")) { + if (state.cursor.eol()) { expect_eol(state.cursor); } + state.objective_name_sv = state.cursor.read_field(); + accept_comment(state.cursor); + expect_eol(state.cursor); + } +} + +struct row_chunk_boundary_t { + const char* start; + const char* end; +}; + +struct row_chunk_info_t { + size_t constraints = 0; + bool malformed = false; + std::vector objective_names; + bool has_first_constraint = false; + std::string_view first_constraint_name; +}; + +static const char* rows_find_next_line(const char* p, const char* end) +{ + while (p < end && *p != '\n') + p++; + if (p < end) p++; + return p; +} + +static bool parse_rows_line_fast(const char*& p, + const char* end, + char& row_type, + std::string_view& row_name) +{ + p = cursor_t::simd_scan(p, end); + if (p >= end) { return false; } + if (*p == '\n') { + p++; + return false; + } + if (*p == '*' || *p == '$') { + p = rows_find_next_line(p, end); + return false; + } + + row_type = *p++; + p = cursor_t::simd_scan(p, end); + + const char* name_start = p; + p = cursor_t::simd_scan(p, end); + if (name_start == p) { return false; } + row_name = std::string_view(name_start, (size_t)(p - name_start)); + + // ROWS only uses fields 1-2. Fields 3-6 are ignored by the MPS spec, and + // field 3 may start with '$' to comment the rest of the record. 
+ // could be SIMD'd, but in practice the newline is right after the row name + p = rows_find_next_line(p, end); + return true; +} + +// row chunks are established based on byte count, thus boundaries can land in the middle of a row +// this cleans up chunks to have row line boundaries +static std::vector compute_row_chunk_boundaries(const char* rows_start, + const char* rows_end, + int num_threads) +{ + scoped_timer_t timer("rows_compute_chunk_boundaries"); + + std::vector boundaries((size_t)num_threads); + size_t total_size = (size_t)(rows_end - rows_start); + size_t chunk_size = total_size / (size_t)num_threads; + + boundaries[0].start = rows_start; + for (int t = 0; t < num_threads; ++t) { + if (t == num_threads - 1) { + boundaries[(size_t)t].end = rows_end; + } else { + const char* boundary = rows_start + (size_t)(t + 1) * chunk_size; + boundary = rows_find_next_line(boundary, rows_end); + boundaries[(size_t)t].end = boundary; + boundaries[(size_t)t + 1].start = boundary; + } + } + + return boundaries; +} + +// reads the row section in chunks and inserts into the worker's hash table partition +// Parallel ROWS parser: count constraints per chunk, prefix-sum, then fill the output arrays +// in parallel (with per-chunk dense-name reconciliation at the end). Must keep the same line +// grammar as its serial twin parse_rows_section_serial_impl; parse_rows_section chooses between +// them by size. Returns false if a chunk hit a malformed line (nothing committed for the fill +// pass), so the caller can reset and retry serially for clean error reporting. 
+template +static bool parse_rows_section_parallel_impl(parse_state_t& state, + const char* rows_start, + const char* rows_end, + int num_threads) +{ + scoped_timer_t timer("parse_rows_parallel"); + + auto boundaries = compute_row_chunk_boundaries(rows_start, rows_end, num_threads); + std::vector infos((size_t)num_threads); + + { + scoped_timer_t timer("rows_count_parallel"); +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < num_threads; ++t) { + MPS_NVTX_RANGE(std::string("rows_count_chunk ") + std::to_string(t), nvtx::colors::rows); + const char* p = boundaries[(size_t)t].start; + const char* end = boundaries[(size_t)t].end; + row_chunk_info_t info; + + while (p < end) { + char row_type = 0; + std::string_view row_name; + const char* before = p; + if (!parse_rows_line_fast(p, end, row_type, row_name)) { + if (p == before) { + info.malformed = true; + break; + } + continue; + } + + if (row_type == 'N') { + info.objective_names.push_back(row_name); + } else { + if (!info.has_first_constraint) { + info.first_constraint_name = row_name; + info.has_first_constraint = true; + } + info.constraints++; + } + } + + infos[(size_t)t] = info; + } + } + + if (std::any_of( + infos.begin(), infos.end(), [](const row_chunk_info_t& info) { return info.malformed; })) { + return false; + } + + // prefix sum of the per-chunk constraint counts, so each chunk can scatter its rows in parallel into the global output arrays + std::vector offsets((size_t)num_threads + 1, 0); + { + scoped_timer_t timer("rows_prefix_sum"); + for (int t = 0; t < num_threads; ++t) { + offsets[(size_t)t + 1] = offsets[(size_t)t] + infos[(size_t)t].constraints; + } + } + + size_t total_rows = offsets[(size_t)num_threads]; + if (UNLIKELY(total_rows > (size_t)INT_MAX)) { + state.cursor.error("fast MPS parser requires <= INT_MAX rows, got %zu", total_rows); + } + { + scoped_timer_t timer("rows_resize_outputs"); + state.row_names_sv.resize(total_rows); + state.problem.row_types_.resize(total_rows); + } + + if 
(state.objective_name_sv.empty()) { + for (const auto& info : infos) { + if (!info.objective_names.empty()) { + state.objective_name_sv = info.objective_names.front(); + break; + } + } + } + for (const auto& info : infos) { + for (std::string_view name : info.objective_names) { + if (name != state.objective_name_sv) { state.ignored_objective_names.insert(name); } + } + } + + bool dense_candidate = total_rows > 0; + std::string_view dense_prefix; + uint64_t dense_base_id = 0; + size_t dense_pad_width = 0; + + if (dense_candidate) { + std::string_view first_name; + for (const auto& info : infos) { + if (info.has_first_constraint) { + first_name = info.first_constraint_name; + break; + } + } + + uint64_t first_value = 0; + size_t first_suffix_width = 0; + if (!parse_trailing_u64(first_name, dense_prefix, first_value, first_suffix_width)) { + dense_candidate = false; + } else { + dense_base_id = first_value; + dense_pad_width = dense_initial_pad_width(first_name, first_suffix_width); + } + } + + std::vector dense_ok_by_chunk((size_t)num_threads, 1); + + { + scoped_timer_t timer("rows_fill_parallel"); +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < num_threads; ++t) { + MPS_NVTX_RANGE(std::string("rows_fill_chunk ") + std::to_string(t), nvtx::colors::rows); + const char* p = boundaries[(size_t)t].start; + const char* end = boundaries[(size_t)t].end; + size_t out = offsets[(size_t)t]; + + bool local_dense_ok = dense_candidate; + dense_name_index_t dense_index; + if (local_dense_ok) { + dense_index.prefix.assign(dense_prefix); + dense_index.min_id = dense_base_id; + dense_index.max_id = dense_base_id; + dense_index.pad_width = dense_pad_width; + } + + while (p < end) { + char row_type = 0; + std::string_view row_name; + const char* before = p; + if (!parse_rows_line_fast(p, end, row_type, row_name)) { + if (p == before) { + local_dense_ok = false; + break; + } + continue; + } + + if (row_type == 'N') { continue; } + + state.row_names_sv[out] = 
row_name; + state.problem.row_types_[out] = row_type; + + if (local_dense_ok) { + size_t observed_count = out; + observe_dense_name( + local_dense_ok, dense_index, observed_count, row_name, dense_base_id + out); + } + out++; + } + + dense_ok_by_chunk[(size_t)t] = local_dense_ok ? 1 : 0; + } + } + + { + scoped_timer_t timer("rows_dense_metadata"); + for (uint8_t ok : dense_ok_by_chunk) { + dense_candidate = dense_candidate && ok; + } + state.row_dense_candidate = dense_candidate; + if (dense_candidate) { + state.row_dense.prefix.assign(dense_prefix); + state.row_dense.min_id = dense_base_id; + state.row_dense.max_id = dense_base_id + total_rows - 1; + state.row_dense.pad_width = dense_pad_width; + } + } + + return true; +} + +template +static void parse_rows_section_serial_impl(parse_state_t& state, const char* rows_end) +{ + scoped_timer_t timer("parse_rows_serial"); + + while (state.cursor.ptr < rows_end) { + auto row_type = state.cursor.ptr[0]; + state.cursor.advance(1); + state.cursor.skip_ws(); + + auto row_name = state.cursor.read_field(); + // ROWS fields after the row name are unused; tolerate annotations/comments there. + state.cursor.skip_to_eol(); + + // 'N' type is the objective row - store its name but don't add to constraints + if (row_type == 'N') { + state.observe_objective_row_name(row_name); + } else { + size_t row_idx = state.row_names_sv.size(); + state.row_names_sv.push_back(row_name); + observe_dense_name( + state.row_dense_candidate, + state.row_dense, + row_idx, + row_name, + row_idx == 0 ? 
std::numeric_limits::max() : state.row_dense.min_id + row_idx); + state.problem.row_types_.push_back(row_type); + } + expect_eol(state.cursor); + } + if (UNLIKELY(state.row_names_sv.size() > (size_t)INT_MAX)) { + state.cursor.error("fast MPS parser requires <= INT_MAX rows, got %zu", + state.row_names_sv.size()); + } +} + +template +static void parse_rows_section(parse_state_t& state, const char* rows_end) +{ + scoped_timer_t timer("parse_rows"); + expect_section(state.cursor, "ROWS"); + + { + scoped_timer_t timer("parse_rows_scan"); + const char* rows_start = state.cursor.ptr; + + size_t rows_bytes = (size_t)(rows_end - state.cursor.ptr); + int num_threads = phase_thread_count(MPS_ROWS_THREAD_CAP); + bool parsed_parallel = false; + if (rows_bytes >= 512 * MiB && num_threads > 1) { + parsed_parallel = + parse_rows_section_parallel_impl(state, state.cursor.ptr, rows_end, num_threads); + // serial fallback in case a likely malformed chunk has been encountered; + // this makes error reporting much easier + if (!parsed_parallel) { + state.row_names_sv.clear(); + state.problem.row_types_.clear(); + state.row_dense_candidate = true; + state.row_dense.reset(); + state.cursor.ptr = rows_start; + parse_rows_section_serial_impl(state, rows_end); + } + } else { + parse_rows_section_serial_impl(state, rows_end); + } + state.cursor.ptr = rows_end; + } + + state.problem.n_constraints_ = (i_t)state.row_names_sv.size(); + state.problem.b_.resize((size_t)state.problem.n_constraints_); + + { + scoped_timer_t timer("parse_rows_hash_init"); + state.init_row_hash_table(); + } +} + +// Columns parser + +// integer variable markers +struct marker_info_t { + enum Type { INTORG, INTEND }; + Type type; + size_t after_local_var_idx; // SIZE_MAX means "before first variable" +}; + +struct row_count_block_t { + size_t block_id = 0; + size_t storage_offset = 0; +}; + +// Each column parsing worker owns chunks of the global CSC which are parsed in parallel and then +// later scattered into the final CSR 
+struct chunk_result_t { + std::vector values; + std::vector row_indices; + std::vector col_offsets; + std::vector var_names; + chunk_name_arena_t var_name_arena; + std::vector markers; + std::vector> objective_entries; // local_col_idx -> coefficient + // COLUMNS is parsed as chunk-local CSC. To build the global CSR, each chunk needs row counts + // first, then row-local write cursors for scatter. Store those counts only for touched + // 4096-row blocks instead of allocating a dense chunks*n_rows matrix + // The same slots are rewritten as write cursors after the global CSR row offsets are known + std::vector row_count_storage; + std::vector row_count_blocks; + std::vector row_count_block_dir; + dense_observe_state_t dense_col_stats; +}; + +struct chunk_boundary_t { + const char* start; + const char* end; +}; + +struct bounds_chunk_boundary_t { + const char* start; + const char* end; +}; + +// enables representing row counts per chunk as a sparse representation w/ 4096 granularity +// works well since nnzs are often clustered around the same matrix blocks +static inline int64_t& column_row_count_slot(chunk_result_t& result, size_t row_idx) +{ + size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t local = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + int32_t block_pos = result.row_count_block_dir[block_id]; + if (UNLIKELY(block_pos < 0)) { + block_pos = (int32_t)result.row_count_blocks.size(); + result.row_count_block_dir[block_id] = block_pos; + row_count_block_t block; + block.block_id = block_id; + block.storage_offset = result.row_count_storage.size(); + result.row_count_storage.resize(block.storage_offset + COLUMN_ROW_COUNT_BLOCK_ROWS, 0); + result.row_count_blocks.push_back(std::move(block)); + } + return result + .row_count_storage[result.row_count_blocks[(size_t)block_pos].storage_offset + local]; +} + +static bool dense_col_chunk_padding_compatible(const dense_observe_state_t& stats, + size_t global_pad_width) +{ + if (global_pad_width > 
0) { + return stats.index.pad_width == global_pad_width || + (stats.index.pad_width == 0 && + decimal_digits_u64(stats.index.min_id) >= global_pad_width); + } + return stats.index.pad_width == 0; +} + +static const char* find_next_line(const char* p, const char* end) +{ + while (p < end && *p != '\n') + p++; + if (p < end) p++; + return p; +} + +static std::string_view peek_bounds_line_var_name(const char* line_start, const char* end) +{ + const char* p = line_start; + for (int field = 0; field < 2; ++field) { + while (p < end && *p <= ' ' && *p != '\n') + p++; + while (p < end && *p > ' ') + p++; + } + while (p < end && *p <= ' ' && *p != '\n') + p++; + const char* var_start = p; + while (p < end && *p > ' ') + p++; + return std::string_view(var_start, (size_t)(p - var_start)); +} + +static const char* find_line_start(const char* section_start, const char* p) +{ + while (p > section_start && p[-1] != '\n') + --p; + return p; +} + +static std::vector compute_bounds_chunk_boundaries( + const char* section_start, const char* section_end, int num_threads) +{ + scoped_timer_t timer("bounds_compute_chunk_boundaries"); + + const size_t total_size = (size_t)(section_end - section_start); + const size_t chunk_size = total_size / (size_t)num_threads; + + std::vector boundaries((size_t)num_threads); + boundaries[0].start = section_start; + for (int t = 0; t < num_threads; ++t) { + if (t == num_threads - 1) { + boundaries[(size_t)t].end = section_end; + } else { + const char* boundary = + find_next_line(section_start + (size_t)(t + 1) * chunk_size, section_end); + + // Keep consecutive BOUNDS records for the same variable in one chunk. + // Then each thread owns full LO/UP-style groups and can apply file order locally. 
+ while (boundary < section_end) { + const char* prev_line = find_line_start(section_start, boundary - 1); + const auto prev_var = peek_bounds_line_var_name(prev_line, section_end); + const auto next_var = peek_bounds_line_var_name(boundary, section_end); + if (prev_var.empty() || next_var.empty() || prev_var != next_var) { break; } + boundary = find_next_line(boundary, section_end); + } + + boundaries[(size_t)t].end = boundary; + boundaries[(size_t)t + 1].start = boundary; + } + } + return boundaries; +} + +static std::vector compute_chunk_boundaries(const char* columns_start, + const char* columns_end, + int num_threads) +{ + scoped_timer_t timer("compute_chunk_boundaries"); + + size_t total_size = (size_t)(columns_end - columns_start); + size_t chunk_size = total_size / (size_t)num_threads; + + std::vector boundaries(num_threads); + + for (int t = 0; t < num_threads; t++) { + if (t == 0) { boundaries[t].start = columns_start; } + + if (t == num_threads - 1) { + boundaries[t].end = columns_end; + } else { + // Find estimated position and align to line boundary + const char* estimated_end = columns_start + (t + 1) * chunk_size; + const char* line_start = estimated_end; + while (line_start < columns_end && *line_start != '\n') + line_start++; + if (line_start < columns_end) line_start++; + + // Read column name at this line + std::string_view col_name = cursor_t::peek_field_at(line_start, columns_end); + + // Scan forward until column name changes (to avoid splitting a column) + const char* boundary = line_start; + while (boundary < columns_end) { + const char* next_line = find_next_line(boundary, columns_end); + if (next_line >= columns_end) break; + + std::string_view next_col = cursor_t::peek_field_at(next_line, columns_end); + if (next_col != col_name && !next_col.empty() && next_col[0] != '\'') { + // Found a column transition. Marker-state fixup later handles any split near markers. 
+ boundary = next_line; + break; + } + boundary = next_line; + } + boundaries[t].end = boundary; + } + } + + // Fix up start pointers (each start is previous end) + for (int t = 1; t < num_threads; t++) { + boundaries[t].start = boundaries[t - 1].end; + } + + return boundaries; +} + +template +static chunk_result_t parse_columns_chunk(const char* chunk_start, + const char* chunk_end, + const parse_state_t& state) +{ + chunk_result_t result; + + if (chunk_start >= chunk_end) { + result.col_offsets.push_back(0); + return result; + } + + size_t chunk_size = (size_t)(chunk_end - chunk_start); + size_t estimated_nnz = chunk_size / 100; + size_t estimated_cols = estimated_nnz / 10; + if (UNLIKELY(state.problem.n_constraints_ > (i_t)std::numeric_limits::max())) { + state.cursor.error("fast COLUMNS path requires <= INT32_MAX rows for chunk row indices"); + } + result.values.reserve(estimated_nnz); + result.row_indices.reserve(estimated_nnz); + result.col_offsets.reserve(estimated_cols + 1); + result.var_names.reserve(estimated_cols); + result.var_name_arena.reserve(std::max(4096, estimated_cols * 16)); + result.objective_entries.reserve(estimated_cols); + size_t n_row_blocks = + cuda::ceil_div((size_t)state.problem.n_constraints_, COLUMN_ROW_COUNT_BLOCK_ROWS); + result.row_count_block_dir.resize(n_row_blocks, -1); + size_t estimated_touched_blocks = std::min(n_row_blocks, std::max(16, estimated_nnz)); + result.row_count_blocks.reserve(estimated_touched_blocks); + result.row_count_storage.reserve(estimated_touched_blocks * COLUMN_ROW_COUNT_BLOCK_ROWS); + + cursor_t cursor(chunk_start, (size_t)(chunk_end - chunk_start)); + std::string_view prev_var_name = ""; + + cursor.skip_ws(); + + while (!cursor.done()) { + if (UNLIKELY(*cursor.ptr == 'R')) { + auto next = cursor.peek_field(); + // RHS section is mandatory right after COLUMNS section + if (next == "RHS") { break; } + } + + auto [var_name, field2] = cursor.read_two_fields(); + if (UNLIKELY(!field2.empty() && field2[0] == 
'$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + + // Check for integer marker; field2 may be empty here, so test that before indexing it + if (UNLIKELY(!field2.empty() && field2[0] == '\'' && field2 == "'MARKER'")) { + auto marker_type = cursor.read_field(); + + marker_info_t marker; + marker.after_local_var_idx = + result.var_names.empty() ? SIZE_MAX : result.var_names.size() - 1; + + if (marker_type == "'INTORG'") { + marker.type = marker_info_t::INTORG; + } else if (marker_type == "'INTEND'") { + marker.type = marker_info_t::INTEND; + } else { + cursor.error("unknown integer marker type in COLUMNS: %.*s", + (int)marker_type.size(), + marker_type.data()); + } + result.markers.push_back(marker); + + while (!cursor.done() && !cursor.eol()) + cursor.ptr++; + if (!cursor.done()) cursor.ptr++; + cursor.skip_ws(); + continue; + } + + auto row_name = field2; + // quite often in MIPs the coefficient is just a single-digit integer + double value; + double sign = 1.0; + if (cursor.ptr[0] == '-') { + sign = -1.0; + cursor.advance(1); + } + if (cursor.ptr + 1 < cursor.end && fp64::is_digit(cursor.ptr[0]) && + (cursor.ptr[1] == '\n' || cursor.ptr[1] == '\r')) { + value = sign * (cursor.ptr[0] - '0'); + cursor.advance(1); + } else { + value = sign * fp64::parse_fp64_advance(cursor.ptr, cursor.end); + } + // usually EOL directly follows + if (UNLIKELY(!cursor.eol())) { cursor.skip_ws(); } + accept_comment(cursor); + + if (prev_var_name != var_name) { + std::string_view owned_var_name = result.var_name_arena.copy(var_name); + result.var_names.push_back(owned_var_name); + observe_dense_name(result.dense_col_stats.candidate, + result.dense_col_stats.index, + result.dense_col_stats.count, + owned_var_name); + result.col_offsets.push_back(result.values.size()); + prev_var_name = owned_var_name; + } + + auto add_entry = [&](std::string_view rn, double val) { + size_t row_idx = state.row_lookup(rn); + if (LIKELY(row_idx != SIZE_MAX)) { + assert(row_idx <= 
(size_t)std::numeric_limits::max()); + result.values.push_back(val); + 
result.row_indices.push_back((uint32_t)row_idx); + column_row_count_slot(result, row_idx)++; + } else if (LIKELY(rn == state.objective_name_sv)) { + result.objective_entries.push_back({result.var_names.size() - 1, val}); + } else if (state.ignored_objective_names.count(rn)) { + return; + } else { + cursor.error("unknown row name in COLUMNS: %.*s", (int)rn.size(), rn.data()); + } + }; + + add_entry(row_name, value); + + // Optional second entry on same line + if (!cursor.eol()) { + auto row_name2 = cursor.read_field(); + if (UNLIKELY(!row_name2.empty() && row_name2[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + double value2 = fp64::parse_fp64_advance(cursor.ptr, cursor.end); + cursor.skip_ws(); + accept_comment(cursor); + + add_entry(row_name2, value2); + } + + expect_eol(cursor); + } + + result.col_offsets.push_back(result.values.size()); + + return result; +} + +// Fused merge + CSR construction: directly builds CSR from chunks without intermediate global CSC +template +struct column_merge_shape_t { + int num_chunks = 0; + i_t n_rows = 0; + std::vector global_col_offset; + size_t total_cols = 0; + size_t total_nnz = 0; +}; + +template +static column_merge_shape_t compute_column_merge_shape( + const std::vector& chunks, i_t n_rows) +{ + column_merge_shape_t shape; + shape.num_chunks = (int)chunks.size(); + shape.n_rows = n_rows; + shape.global_col_offset.resize((size_t)shape.num_chunks + 1); + { + scoped_timer_t timer("columns_global_offsets"); + for (int t = 0; t < shape.num_chunks; t++) { + shape.global_col_offset[(size_t)t + 1] = + shape.global_col_offset[(size_t)t] + chunks[(size_t)t].var_names.size(); + shape.total_nnz += chunks[(size_t)t].values.size(); + } + } + shape.total_cols = shape.global_col_offset[(size_t)shape.num_chunks]; + if constexpr (std::numeric_limits::max() < std::numeric_limits::max()) { + const size_t index_max = (size_t)std::numeric_limits::max(); + if (shape.total_nnz > index_max) { + 
mps_parser_fail(error_type_t::RuntimeError, + "fast MPS parser requires 64-bit indices: nnz=%zu exceeds index max=%zu", + shape.total_nnz, + index_max); + } + if (shape.total_cols > index_max || (size_t)n_rows > index_max) { + mps_parser_fail(error_type_t::RuntimeError, + "fast MPS parser requires 64-bit indices: rows=%zu cols=%zu exceed index " + "max=%zu", + (size_t)n_rows, + shape.total_cols, + index_max); + } + } + return shape; +} + +template +static void detect_dense_column_metadata(parse_state_t& state, + const std::vector& chunks, + const column_merge_shape_t& shape) +{ + scoped_timer_t timer("columns_dense_metadata"); + bool dense_ok = shape.total_cols > 0; + bool have_first = false; + std::string_view dense_prefix; + uint64_t expected_next_id = 0; + uint64_t dense_min_id = 0; + uint64_t dense_max_id = 0; + size_t dense_pad_width = 0; + + for (int t = 0; t < shape.num_chunks && dense_ok; ++t) { + const auto& stats = chunks[(size_t)t].dense_col_stats; + if (stats.count == 0) { continue; } + if (!stats.candidate || stats.count != chunks[(size_t)t].var_names.size()) { + dense_ok = false; + break; + } + if (!have_first) { + have_first = true; + dense_prefix = stats.index.prefix; + expected_next_id = stats.index.min_id; + dense_min_id = stats.index.min_id; + dense_pad_width = stats.index.pad_width; + } + if (stats.index.prefix != dense_prefix || stats.index.min_id != expected_next_id || + !dense_col_chunk_padding_compatible(stats, dense_pad_width)) { + dense_ok = false; + break; + } + if (stats.index.max_id < stats.index.min_id || + stats.index.max_id - stats.index.min_id + 1 != stats.count) { + dense_ok = false; + break; + } + dense_max_id = stats.index.max_id; + if (stats.index.max_id == std::numeric_limits::max()) { + dense_ok = false; + break; + } + expected_next_id = stats.index.max_id + 1; + } + + if (!have_first || dense_max_id < dense_min_id || + dense_max_id - dense_min_id + 1 != shape.total_cols) { + dense_ok = false; + } + + state.col_index_mode = 
dense_ok ? index_mode_t::dense_ordered : index_mode_t::hash; + if (dense_ok) { + state.col_dense.prefix.assign(dense_prefix); + state.col_dense.min_id = dense_min_id; + state.col_dense.max_id = dense_max_id; + state.col_dense.pad_width = dense_pad_width; + } +} + +template +static std::vector build_csr_row_offsets(parse_state_t& state, + const std::vector& chunks, + const column_merge_shape_t& shape) +{ + std::vector global_row_counts((size_t)shape.n_rows, 0); + { + scoped_timer_t timer("columns_sum_row_counts"); + for (int t = 0; t < shape.num_chunks; t++) { + for (const auto& block : chunks[(size_t)t].row_count_blocks) { + const int64_t* block_counts = + chunks[(size_t)t].row_count_storage.data() + block.storage_offset; + size_t row_base = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)shape.n_rows - row_base); + for (size_t local = 0; local < block_limit; ++local) { + global_row_counts[row_base + local] += (i_t)block_counts[local]; + } + } + } + } + { + scoped_timer_t timer("columns_build_row_offsets"); + state.problem.A_offsets_.resize((size_t)shape.n_rows + 1); + state.problem.A_offsets_[0] = 0; + for (i_t r = 0; r < shape.n_rows; r++) { + state.problem.A_offsets_[(size_t)r + 1] = + state.problem.A_offsets_[(size_t)r] + global_row_counts[(size_t)r]; + } + } + return global_row_counts; +} + +template +static void convert_counts_to_write_positions(std::vector& chunks, + const column_merge_shape_t& shape, + const std::vector& row_offsets, + std::vector& global_row_counts) +{ + scoped_timer_t timer("columns_counts_to_write_positions"); + std::fill(global_row_counts.begin(), global_row_counts.end(), i_t{0}); + for (int t = 0; t < shape.num_chunks; t++) { + for (auto& block : chunks[(size_t)t].row_count_blocks) { + int64_t* block_counts = chunks[(size_t)t].row_count_storage.data() + block.storage_offset; + size_t row_base = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t block_limit = 
std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)shape.n_rows - row_base); + for (size_t local = 0; local < block_limit; ++local) { + int64_t count = block_counts[local]; + if (count == 0) continue; + size_t row = row_base + local; + i_t pos = row_offsets[row] + global_row_counts[row]; + block_counts[local] = (int64_t)pos; + global_row_counts[row] += (i_t)count; + } + } + } +} + +static void materialize_chunk_row_count_storage(std::vector& chunks, + int num_threads) +{ + scoped_timer_t timer("columns_row_count_storage_hugepages"); +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < (int)chunks.size(); ++t) { + materialize_vector_hugepages("column_row_count_storage", + chunks[(size_t)t].row_count_storage, + materialize_touch_t::write_2mb); + } +} + +template +static void allocate_column_outputs(parse_state_t& state, + const column_merge_shape_t& shape) +{ + scoped_timer_t timer("allocate_temp_csr_arrays"); + size_t values_bytes = shape.total_nnz * sizeof(f_t); + size_t indices_bytes = shape.total_nnz * sizeof(i_t); + state.temp_csr_nnz = shape.total_nnz; + +#pragma omp parallel sections num_threads(4) + { +#pragma omp section + { + state.temp_A_region = mmap_region_t::anonymous( + std::max(values_bytes, 1), PROT_READ | PROT_WRITE, MAP_PRIVATE, "temp CSR values"); + state.temp_A = (f_t*)state.temp_A_region.data(); + state.temp_A_region.advise(MADV_HUGEPAGE); + } +#pragma omp section + { + state.temp_A_indices_region = mmap_region_t::anonymous(std::max(indices_bytes, 1), + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + "temp CSR column indices"); + state.temp_A_indices = (i_t*)state.temp_A_indices_region.data(); + state.temp_A_indices_region.advise(MADV_HUGEPAGE); + } +#pragma omp section + { + if (state.col_index_mode != index_mode_t::dense_ordered) { + state.var_name_arenas.clear(); + state.var_name_arenas.resize((size_t)shape.num_chunks); + state.var_names_sv.resize(shape.total_cols); + } + } +#pragma omp section + { + 
state.problem.var_types_.resize(shape.total_cols); + } + } +} + +template +static void scatter_column_chunks_to_csr(parse_state_t& state, + std::vector& chunks, + const column_merge_shape_t& shape, + int num_threads) +{ + scoped_timer_t timer("scatter_into_csr"); + { + scoped_timer_t matrix_timer("scatter_matrix_entries"); +#ifdef MPS_FAST_PERF_COUNTERS + std::vector perf_snapshots((size_t)shape.num_chunks); +#endif +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < shape.num_chunks; t++) { +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; +#endif + auto& chunk = chunks[(size_t)t]; + for (size_t local_col = 0; local_col < chunk.var_names.size(); local_col++) { + i_t global_col = (i_t)(shape.global_col_offset[(size_t)t] + local_col); + size_t col_start = chunk.col_offsets[local_col]; + size_t col_end = chunk.col_offsets[local_col + 1]; + for (size_t idx = col_start; idx < col_end; idx++) { + i_t row = (i_t)chunk.row_indices[idx]; + size_t row_idx = (size_t)row; + size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t local = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + int32_t block_pos = chunk.row_count_block_dir[block_id]; + row_count_block_t& block = chunk.row_count_blocks[(size_t)block_pos]; + int64_t& write_pos = chunk.row_count_storage[block.storage_offset + local]; + i_t dest = (i_t)write_pos++; + state.temp_A[dest] = (f_t)chunk.values[idx]; + state.temp_A_indices[dest] = global_col; + } + } +#ifdef MPS_FAST_PERF_COUNTERS + perf_snapshots[(size_t)t] = perf_counters.stop(); +#endif + } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("scatter_matrix_entries", perf_snapshots); +#endif + } + + if (state.col_index_mode != index_mode_t::dense_ordered) { + scoped_timer_t names_timer("scatter_var_names"); +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < shape.num_chunks; t++) { + chunk_name_arena_t& arena = state.var_name_arenas[(size_t)t]; + arena.reserve(std::max(4096, 
chunks[(size_t)t].var_names.size() * 16)); + for (size_t i = 0; i < chunks[(size_t)t].var_names.size(); i++) { + state.var_names_sv[shape.global_col_offset[(size_t)t] + i] = + arena.copy(chunks[(size_t)t].var_names[i]); + } + } + } else { + scoped_timer_t names_timer("scatter_var_names"); + } +} + +struct global_marker_t { + marker_info_t::Type type; + size_t global_var_idx; +}; + +template +static void apply_column_integer_markers(parse_state_t& state, + const std::vector& chunks, + const column_merge_shape_t& shape) +{ + scoped_timer_t timer("columns_apply_markers"); + std::vector all_markers; + for (int t = 0; t < shape.num_chunks; t++) { + for (const auto& m : chunks[(size_t)t].markers) { + global_marker_t gm; + gm.type = m.type; + gm.global_var_idx = + m.after_local_var_idx == SIZE_MAX + ? (shape.global_col_offset[(size_t)t] > 0 ? shape.global_col_offset[(size_t)t] - 1 + : SIZE_MAX) + : shape.global_col_offset[(size_t)t] + m.after_local_var_idx; + all_markers.push_back(gm); + } + } + + std::stable_sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) { + if (a.global_var_idx == SIZE_MAX && b.global_var_idx != SIZE_MAX) return true; + if (b.global_var_idx == SIZE_MAX && a.global_var_idx != SIZE_MAX) return false; + return a.global_var_idx < b.global_var_idx; + }); + + bool is_integer = false; + size_t marker_idx = 0; + for (size_t v = 0; v < shape.total_cols; v++) { + while (marker_idx < all_markers.size() && (all_markers[marker_idx].global_var_idx == SIZE_MAX || + all_markers[marker_idx].global_var_idx < v)) { + is_integer = all_markers[marker_idx].type == marker_info_t::INTORG; + marker_idx++; + } + state.problem.var_types_[v] = is_integer ? 
'I' : 'C'; + } +} + +template +static void assign_column_objective_entries(parse_state_t& state, + const std::vector& chunks, + const column_merge_shape_t& shape) +{ + scoped_timer_t timer("columns_objective_entries"); + state.problem.c_.resize(shape.total_cols, f_t{0}); + for (int t = 0; t < shape.num_chunks; t++) { + for (const auto& [local_col, coeff] : chunks[(size_t)t].objective_entries) { + size_t global_col = shape.global_col_offset[(size_t)t] + local_col; + if (global_col < shape.total_cols) { state.problem.c_[global_col] = (f_t)coeff; } + } + } +} + +template +static void merge_chunk_results_to_csr(parse_state_t& state, + std::vector& chunks, + int num_threads) +{ + scoped_timer_t timer("merge_chunks_to_csr"); + if (chunks.empty()) return; + + auto shape = compute_column_merge_shape(chunks, state.problem.n_constraints_); + detect_dense_column_metadata(state, chunks, shape); + auto global_row_counts = build_csr_row_offsets(state, chunks, shape); + convert_counts_to_write_positions(chunks, shape, state.problem.A_offsets_, global_row_counts); + materialize_chunk_row_count_storage(chunks, num_threads); + allocate_column_outputs(state, shape); + scatter_column_chunks_to_csr(state, chunks, shape, num_threads); + apply_column_integer_markers(state, chunks, shape); + assign_column_objective_entries(state, chunks, shape); + + state.problem.n_vars_ = (i_t)shape.total_cols; + state.problem.nnz_ = (i_t)shape.total_nnz; +} + +template +static void materialize_problem_csr(parse_state_t& state) +{ + scoped_timer_t timer("materialize_problem_csr"); + size_t nnz = state.temp_csr_nnz; + int copy_threads = 2; + copy_threads = std::max(1, std::min(copy_threads, MPS_LARGE_FILE_THREAD_CAP)); + + int resize_threads = copy_threads > 1 ? 
2 : 1; +#pragma omp parallel sections num_threads(resize_threads) + { +#pragma omp section + { + state.problem.A_.resize(nnz); + } +#pragma omp section + { + state.problem.A_indices_.resize(nnz); + } + } + + size_t value_bytes = nnz * sizeof(f_t); + size_t index_bytes = nnz * sizeof(i_t); + size_t total_bytes = value_bytes + index_bytes; + // Copy A_ and A_indices overlapping with the other phases + // this hides the latency costs of heap alloc and default init with other parsing/IO + // instead of making it blocking for the column parse + // TODO: just have A_ and A_indices_ be mmap anon allocs directly in the mps_data_model_t + // but that'd require careful work around avoiding breaking changes and the API esp cython stuff + if (total_bytes != 0) { +#pragma omp parallel for num_threads(copy_threads) schedule(static) + for (int t = 0; t < copy_threads; ++t) { + size_t begin = (total_bytes * (size_t)t) / (size_t)copy_threads; + size_t end = (total_bytes * (size_t)(t + 1)) / (size_t)copy_threads; + if (begin < value_bytes) { + size_t value_end = std::min(end, value_bytes); + if (value_end > begin) { + std::memcpy((char*)state.problem.A_.data() + begin, + (const char*)state.temp_A + begin, + value_end - begin); + } + } + if (end > value_bytes) { + size_t index_begin = begin > value_bytes ? begin - value_bytes : 0; + size_t index_end = end - value_bytes; + std::memcpy((char*)state.problem.A_indices_.data() + index_begin, + (const char*)state.temp_A_indices + index_begin, + index_end - index_begin); + } + } + } + + state.temp_A = nullptr; + state.temp_A_indices = nullptr; + state.temp_csr_materialized = true; + state.temp_A_region.reset(); + state.temp_A_indices_region.reset(); +} + +// COLUMNS is always parsed chunk-parallel: each chunk is counted/parsed by parse_columns_chunk +// and the per-chunk results are stitched together by merge_chunk_results_to_csr. There is no +// separate serial implementation -- a single thread just runs one chunk through the same path. 
+template +static void parse_columns_section_parallel(parse_state_t& state, + int num_threads, + const char* columns_end) +{ + scoped_timer_t timer("parse_columns_parallel"); + + if (num_threads <= 0) { num_threads = phase_thread_count(MPS_COLUMNS_THREAD_CAP); } + + // Skip the "COLUMNS" header + expect_section(state.cursor, "COLUMNS"); + + const char* columns_start = state.cursor.ptr; + size_t columns_bytes = (size_t)(columns_end - columns_start); + /* Clamp the thread count to at most one thread per MPS_COLUMNS_MIN_CHUNK_BYTES of section body, and to at least one thread. */ + size_t chunk_limited_threads = std::max(1, columns_bytes / MPS_COLUMNS_MIN_CHUNK_BYTES); + num_threads = std::max(1, std::min(num_threads, (int)chunk_limited_threads)); + + auto chunk_bounds = compute_chunk_boundaries(columns_start, columns_end, num_threads); + + // Parse chunks in parallel + std::vector results(num_threads); + + { + scoped_timer_t timer("parse_columns_chunk_parallel"); +#ifdef MPS_FAST_PERF_COUNTERS + std::vector perf_snapshots((size_t)num_threads); +#endif + /* The first exception thrown by any chunk is stored here under error_mutex and rethrown after the parallel loop has finished. */ + std::exception_ptr first_error = nullptr; + std::mutex error_mutex; + { +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < num_threads; t++) { + try { + MPS_NVTX_RANGE(std::string("columns_chunk ") + std::to_string(t), nvtx::colors::columns); +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; +#endif + results[t] = + parse_columns_chunk(chunk_bounds[t].start, chunk_bounds[t].end, state); +#ifdef MPS_FAST_PERF_COUNTERS + perf_snapshots[(size_t)t] = perf_counters.stop(); +#endif + } catch (...)
{ + std::lock_guard lock(error_mutex); + if (!first_error) { first_error = std::current_exception(); } + } + } + } + if (first_error) { std::rethrow_exception(first_error); } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("parse_columns_chunk_parallel", perf_snapshots); +#endif + } + + // Merge results directly into CSR format + merge_chunk_results_to_csr(state, results, num_threads); + + // Update cursor to RHS section + state.cursor.ptr = columns_end; + state.cursor.skip_ws(); +} + +/* Parses the RHS section. Each record holds one or two (row, value) pairs. A value for a row with a valid index is stored in b_. When the lookup returns SIZE_MAX the row name is re-read: the active objective name sets objective_offset_ to the negated value, a name in ignored_objective_names is skipped, and any other name is reported through error_unknown_row. */ +template +static void parse_rhs_section(parse_state_t& state, cursor_t& cursor) +{ + scoped_timer_t timer("parse_rhs"); + expect_section(cursor, "RHS"); + + // necessary on the cold path since we directly read and lookup on the hot path + auto reread_field_name = [](const char* start, const char* end) { + const char* p = start; + while (p < end && *p > ' ') { + p++; + } + return std::string_view(start, (size_t)(p - start)); + }; + + auto apply_rhs = [&](const char* row_start, size_t row_idx, f_t value) { + // This is a regular non-obj row. + if (row_idx != SIZE_MAX) { + state.problem.b_[row_idx] = value; + return; + } + // This is the objective row. + std::string_view row_name = reread_field_name(row_start, cursor.end); + if (row_name == state.objective_name_sv) { + state.problem.objective_offset_ = -value; + return; + } + // Other objectives, ignored currently. cold path + if (state.ignored_objective_names.count(row_name)) { return; } + // Unexpected!
+ error_unknown_row(cursor, row_start, "RHS"); + }; + + while (cursor.ptr < cursor.end) { + [[maybe_unused]] auto rhs_name = cursor.read_field(); + if (accept_comment(cursor)) { + expect_eol(cursor); + continue; + } + const char* row_start = cursor.ptr; + size_t row_idx = state.read_row_lookup(cursor); + auto value = expect_number_fast_pm_one(cursor); + apply_rhs(row_start, row_idx, (f_t)value); + + accept_comment(cursor); + // Optional second entry + if (!cursor.eol()) { + const char* row_start2 = cursor.ptr; + size_t row_idx2 = state.read_row_lookup(cursor); + auto value2 = expect_number_fast_pm_one(cursor); + apply_rhs(row_start2, row_idx2, (f_t)value2); + accept_comment(cursor); + } + expect_eol(cursor); + } +} + +// does the job on 99% of instances, in the vast majority of cases bound names are sequential with +// occasional sparsity +/* Returns the index of var_name in var_names, or SIZE_MAX when it is absent. Probe order: the slot right after hint_idx, then hint_idx itself, then a forward scan from two past the hint to the end, then a scan from index 0 up to (not including) the hint. */ +static size_t find_var_after_hint(const std::vector& var_names, + std::string_view var_name, + size_t hint_idx) +{ + const size_t n_vars = var_names.size(); + if (hint_idx + 1 < n_vars && var_names[hint_idx + 1] == var_name) { return hint_idx + 1; } + if (hint_idx < n_vars && var_names[hint_idx] == var_name) { return hint_idx; } + + const size_t first_begin = std::min(hint_idx + 2, n_vars); + for (size_t i = first_begin; i < n_vars; ++i) { + if (var_names[i] == var_name) { return i; } + } + for (size_t i = 0; i < hint_idx && i < n_vars; ++i) { + if (var_names[i] == var_name) { return i; } + } + return SIZE_MAX; +} + +/* Applies one BOUNDS record through the supplied callbacks and returns true on success. Calls error and returns false for an unknown bound type or for an SC record with has_value false. For UP and UI, a negative value on the first bound seen for the variable also sets the lower bound to negative infinity. BV, LI and UI mark the variable type 'I'; SC marks it 'S'. */ +template +static bool apply_bound_record(std::string_view bound_type, + f_t value, + bool has_value, + bool first_bound_for_var, + SetLb&& set_lb, + SetUb&& set_ub, + SetType&& set_type, + Error&& error) +{ + if (bound_type == "LO") { + set_lb(value); + } else if (bound_type == "UP") { + set_ub(value); + if (first_bound_for_var && value < f_t{0}) { set_lb(-std::numeric_limits::infinity()); } + } else if (bound_type == "FX") { + set_lb(value); + set_ub(value); + } else if (bound_type == "FR") { +
set_lb(-std::numeric_limits::infinity()); + set_ub(std::numeric_limits::infinity()); + } else if (bound_type == "MI") { + set_lb(-std::numeric_limits::infinity()); + } else if (bound_type == "PL") { + set_ub(std::numeric_limits::infinity()); + } else if (bound_type == "BV") { + set_lb(f_t{0}); + set_ub(f_t{1}); + set_type('I'); + } else if (bound_type == "LI") { + set_lb(value); + set_type('I'); + } else if (bound_type == "UI") { + set_ub(value); + if (first_bound_for_var && value < f_t{0}) { set_lb(-std::numeric_limits::infinity()); } + set_type('I'); + } else if (bound_type == "SC") { + if (UNLIKELY(!has_value)) { + error("SC bound requires an upper bound value", bound_type); + return false; + } + set_ub(value); + set_type('S'); + } else { + error("unknown bound type", bound_type); + return false; + } + return true; +} + +// Parallel BOUNDS parser for the common dense/ordered-name case. Returns false when the section +// is too small or not safely parallelizable, so parse_bounds_section resets and falls back to its +// serial path. Bound-type semantics (LO/UP/FX/...) are shared with the serial path through +// apply_bound_record, so the two cannot drift. +template +static bool parse_bounds_section_parallel_dense(parse_state_t& state, + cursor_t& cursor, + const char* bounds_body_start, + const char* bounds_body_end, + size_t n_vars) +{ + const size_t bounds_bytes = (size_t)(bounds_body_end - bounds_body_start); + const int num_threads = phase_thread_count(MPS_BOUNDS_THREAD_CAP); + const bool use_dense_lookup = state.col_index_mode == index_mode_t::dense_ordered; + const size_t min_parallel_bytes = + use_dense_lookup ? MPS_BOUNDS_PARALLEL_MIN_BYTES : MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES; + if (bounds_bytes < min_parallel_bytes || num_threads < 2) { return false; } + + MPS_NVTX_RANGE( + use_dense_lookup ?
"parse_bounds_parallel_dense" : "parse_bounds_parallel_ordered_hint", + nvtx::colors::bounds); + + /* Per-chunk summary: filled by the loop iteration that parses the chunk and examined after the parallel loop to decide whether the result can be kept. */ + struct bounds_parallel_stats_t { + size_t lines = 0; + size_t dense_hits = 0; + size_t dense_misses = 0; + size_t comments = 0; + size_t min_var = SIZE_MAX; + size_t max_var = 0; + size_t decreasing_order = 0; + const char* error_ptr = nullptr; + char error_msg[192] = {}; + }; + + std::vector stats((size_t)num_threads); + auto boundaries = + compute_bounds_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads); + + std::vector bound_seen; + { + scoped_timer_t timer("bounds_parallel_seen_alloc"); + bound_seen.resize(n_vars, 0); + } + + { + scoped_timer_t timer(use_dense_lookup ? "parse_bounds_parallel_dense" + : "parse_bounds_parallel_ordered_hint"); + // Repeated BOUNDS for the same variable are safe inside a group-owned chunk. + // Parse optimistically, then accept only if chunk summaries prove no backward jumps. +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (int t = 0; t < num_threads; ++t) { + auto& local = stats[(size_t)t]; + cursor_t cursor(boundaries[(size_t)t].start, + (size_t)(boundaries[(size_t)t].end - boundaries[(size_t)t].start)); + cursor.skip_ws(); + size_t prev_var = SIZE_MAX; + size_t hint_idx = 0; + auto lookup_var = [&](std::string_view var_name) { + if (use_dense_lookup) { return state.col_dense.lookup(var_name); } + // quite often variables are in order, so a cheap lookup trick is to look for the variable + // right after this one + return find_var_after_hint(state.var_names_sv, var_name, hint_idx); + }; + try { + while (cursor.ptr < cursor.end) { + if (UNLIKELY(*cursor.ptr == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + local.comments++; + continue; + } + + auto bound_type = cursor.read_field(); + if (UNLIKELY(bound_type.empty())) { break; } + if (UNLIKELY(bound_type[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + local.comments++; + continue; + } + + [[maybe_unused]] auto bound_name
= cursor.read_field(); + auto var_name = cursor.read_field(); + if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + local.comments++; + continue; + } + + /* A name that cannot be resolved ends this chunk; the recorded miss makes the function return false after the loop. */ + size_t var_idx = lookup_var(var_name); + if (UNLIKELY(var_idx == SIZE_MAX)) { + local.dense_misses++; + break; + } + hint_idx = var_idx; + local.dense_hits++; + local.lines++; + local.min_var = std::min(local.min_var, var_idx); + local.max_var = std::max(local.max_var, var_idx); + if (prev_var != SIZE_MAX && var_idx < prev_var) { local.decreasing_order++; } + prev_var = var_idx; + + bool first_bound_for_var = bound_seen[var_idx] == 0; + bound_seen[var_idx] = 1; + + f_t value = 0; + bool has_value = false; + accept_comment(cursor); + if (!cursor.eol()) { + value = (f_t)expect_number_fast_pm_one(cursor); + has_value = true; + accept_comment(cursor); + } + + auto set_lb = [&](f_t x) { state.problem.variable_lower_bounds_[var_idx] = x; }; + auto set_ub = [&](f_t x) { state.problem.variable_upper_bounds_[var_idx] = x; }; + auto set_type = [&](char t) { state.problem.var_types_[var_idx] = t; }; + /* Errors are written into the chunk summary (message and position) instead of being raised inside the parallel loop. */ + auto set_error = [&](const char* msg, std::string_view type) { + if (type.empty() || std::strcmp(msg, "unknown bound type") != 0) { + std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", msg); + } else { + std::snprintf(local.error_msg, + sizeof(local.error_msg), + "%s: %.*s", + msg, + (int)type.size(), + type.data()); + } + local.error_ptr = cursor.ptr; + }; + if (!apply_bound_record(bound_type, + value, + has_value, + first_bound_for_var, + set_lb, + set_ub, + set_type, + set_error)) { + break; + } + + expect_eol(cursor); + } + } catch (const std::exception& e) { + std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", e.what()); + local.error_ptr = cursor.ptr; + } + } + } + + size_t dense_misses = 0; + size_t decreasing_order = 0; + size_t overlap_chunks = 0; + size_t prev_max = SIZE_MAX; + for (int t = 0; t < num_threads; ++t) { + const auto& local =
stats[(size_t)t]; + if (local.error_ptr != nullptr) { + cursor.ptr = local.error_ptr; + cursor.error("%s", local.error_msg); + } + dense_misses += local.dense_misses; + decreasing_order += local.decreasing_order; + if (local.lines > 0) { + if (prev_max != SIZE_MAX && local.min_var <= prev_max) { overlap_chunks++; } + prev_max = local.max_var; + } + } + + /* Keep the parallel result only if every lookup hit, indices never decreased inside a chunk, and no chunk's smallest index is at or below the largest index of the previous non-empty chunk. */ + const bool order_safe = dense_misses == 0 && decreasing_order == 0 && overlap_chunks == 0; + + if (!order_safe) { + std::fprintf(stderr, + "[WARN] parallel BOUNDS fallback to serial: lookup_misses=%zu " + "decreasing_order=%zu overlap_chunks=%zu\n", + dense_misses, + decreasing_order, + overlap_chunks); + cursor.ptr = bounds_body_start; + return false; + } + + { + scoped_timer_t timer("bounds_integer_defaults"); + for (size_t i = 0; i < n_vars; ++i) { + if (!bound_seen[i] && state.problem.var_types_[i] == 'I') { + state.problem.variable_lower_bounds_[i] = f_t{0}; + state.problem.variable_upper_bounds_[i] = f_t{1}; + } + } + } + + cursor.ptr = bounds_body_end; + return true; +} + +/* Resizes the variable bound vectors to n_vars_ (new lower bounds 0, new upper bounds positive infinity) and then passes both vectors to materialize_vector_hugepages. */ +template +static void init_variable_bounds_defaults(parse_state_t& state) +{ + size_t n_vars = (size_t)state.problem.n_vars_; + { + scoped_timer_t timer("bounds_init_defaults"); + state.problem.variable_lower_bounds_.resize(n_vars, f_t{0}); + state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits::infinity()); + } + { + scoped_timer_t timer("bounds_madvise_pretouch"); + materialize_vector_hugepages("variable_lower_bounds", + state.problem.variable_lower_bounds_, + materialize_touch_t::write_4kb); + materialize_vector_hugepages("variable_upper_bounds", + state.problem.variable_upper_bounds_, + materialize_touch_t::write_4kb); + } +} + +/* Sets the bounds of every variable of type 'I' for which has_bound(i) is false to [0, 1]. */ +template +static void apply_unspecified_integer_bounds(parse_state_t& state, HasBound&& has_bound) +{ + scoped_timer_t timer("bounds_integer_defaults"); + size_t n_vars = (size_t)state.problem.n_vars_; + for (size_t i = 0; i < n_vars; ++i) { + if (!has_bound(i) && state.problem.var_types_[i]
== 'I') { + state.problem.variable_lower_bounds_[i] = f_t{0}; + state.problem.variable_upper_bounds_[i] = f_t{1}; + } + } +} + +/* Bounds setup for input without a BOUNDS section: default bounds for every variable, then [0, 1] for every variable of type 'I'. */ +template +static void init_variable_bounds_without_bounds_section(parse_state_t& state) +{ + init_variable_bounds_defaults(state); + apply_unspecified_integer_bounds(state, [](size_t) { return false; }); +} + +/* Serial BOUNDS parser. Initialises default bounds, and when the BOUNDS header is absent only applies the integer defaults. With allow_parallel_dense set it first tries parse_bounds_section_parallel_dense; if that returns false the bound vectors are refilled with their defaults before the serial loop runs. Names that are not found in the column index are stored in state.bounds_only_vars instead of the problem arrays. */ +template +static void parse_bounds_section(parse_state_t& state, + cursor_t& cursor, + bool allow_parallel_dense = false) +{ + size_t n_vars = (size_t)state.problem.n_vars_; + init_variable_bounds_defaults(state); + + /* One bit per variable, packed 64 to a word; has_bound and mark_bound read and set the bit for a variable index. */ + std::vector bound_seen((n_vars + 63) / 64, 0); + auto has_bound = [&](size_t var_idx) { + return (bound_seen[var_idx >> 6] & (uint64_t{1} << (var_idx & 63))) != 0; + }; + auto mark_bound = [&](size_t var_idx) { + bound_seen[var_idx >> 6] |= uint64_t{1} << (var_idx & 63); + }; + + if (!accept_section(cursor, "BOUNDS")) { + apply_unspecified_integer_bounds(state, has_bound); + return; + } + + const char* bounds_body_start = cursor.ptr; + const char* bounds_body_end = cursor.end; + if (allow_parallel_dense) { + if (parse_bounds_section_parallel_dense( + state, cursor, bounds_body_start, bounds_body_end, n_vars)) { + return; + } + { + scoped_timer_t timer("bounds_parallel_fallback_reset"); + std::fill(state.problem.variable_lower_bounds_.begin(), + state.problem.variable_lower_bounds_.end(), + f_t{0}); + std::fill(state.problem.variable_upper_bounds_.begin(), + state.problem.variable_upper_bounds_.end(), + std::numeric_limits::infinity()); + } + } + + size_t hint_idx = 0; + { + scoped_timer_t timer("parse_bounds"); + while (!cursor.done()) { + auto bound_type = cursor.read_field(); + [[maybe_unused]] auto bound_name = cursor.read_field(); + auto var_name = cursor.read_field(); + if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + + // optimized lookup using hint (bounds often in same order as columns) + size_t var_idx = SIZE_MAX; + // handle annoying
bounds-only vars that weren't declared in COLUMNS + typename parse_state_t::bounds_only_var_t* aux_var = nullptr; + if (LIKELY(state.col_index_mode == index_mode_t::dense_ordered)) { + var_idx = state.col_dense.lookup(var_name); + if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; } + } else { + var_idx = find_var_after_hint(state.var_names_sv, var_name, hint_idx); + if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; } + } + if (var_idx != SIZE_MAX) { hint_idx = var_idx; } + bool first_bound_for_var = aux_var == nullptr && !has_bound(var_idx); + + f_t value = 0; + bool has_value = false; + accept_comment(cursor); + if (!cursor.eol()) { + value = (f_t)expect_number(cursor); + has_value = true; + accept_comment(cursor); + } + + auto set_lb = [&](f_t x) { + if (aux_var) { + aux_var->lb = x; + } else { + state.problem.variable_lower_bounds_[var_idx] = x; + } + }; + auto set_ub = [&](f_t x) { + if (aux_var) { + aux_var->ub = x; + } else { + state.problem.variable_upper_bounds_[var_idx] = x; + } + }; + auto set_type = [&](char t) { + if (aux_var) { + aux_var->type = t; + } else { + state.problem.var_types_[var_idx] = t; + } + }; + + auto set_error = [&](const char* msg, std::string_view type) { + if (std::strcmp(msg, "unknown bound type") == 0) { + cursor.error("%s: %.*s", msg, (int)type.size(), type.data()); + } + cursor.error("%s", msg); + }; + [[maybe_unused]] bool bound_applied = apply_bound_record( + bound_type, value, has_value, first_bound_for_var, set_lb, set_ub, set_type, set_error); + if (aux_var == nullptr) { mark_bound(var_idx); } + + expect_eol(cursor); + } + } + apply_unspecified_integer_bounds(state, has_bound); +} + +template +static void init_constraint_bounds_from_rows(parse_state_t& state) +{ + state.problem.constraint_lower_bounds_.resize((size_t)state.problem.n_constraints_); + state.problem.constraint_upper_bounds_.resize((size_t)state.problem.n_constraints_); + + for (i_t i = 0; i <
state.problem.n_constraints_; ++i) { + char row_type = state.problem.row_types_[i]; + f_t b = state.problem.b_[i]; + /* E rows are pinned to b on both sides, L rows are bounded above by b, G rows are bounded below by b; any other row type keeps the values left by resize. */ + if (row_type == 'E') { + state.problem.constraint_lower_bounds_[i] = b; + state.problem.constraint_upper_bounds_[i] = b; + } else if (row_type == 'L') { + state.problem.constraint_lower_bounds_[i] = -std::numeric_limits::infinity(); + state.problem.constraint_upper_bounds_[i] = b; + } else if (row_type == 'G') { + state.problem.constraint_lower_bounds_[i] = b; + state.problem.constraint_upper_bounds_[i] = std::numeric_limits::infinity(); + } + } +} + +/* Initialises constraint bounds from the rows, then parses the RANGES section if its header is present. Each record holds one or two (row, range) pairs. For an E row a non-negative range raises the upper bound to lower plus |range| and a negative range lowers the lower bound to upper minus |range|; an L row gets lower = upper minus |range|; a G row gets upper = lower plus |range|. An unknown row name is reported through cursor.error. */ +template +static void parse_ranges_section(parse_state_t& state, cursor_t& cursor) +{ + scoped_timer_t timer("parse_ranges"); + init_constraint_bounds_from_rows(state); + + if (!accept_section(cursor, "RANGES")) { return; } + + auto apply_range = [&](std::string_view row_name, f_t range_val) { + size_t row_idx = state.row_lookup(row_name); + if (row_idx == SIZE_MAX) { + cursor.error("unknown row name in RANGES: %.*s", (int)row_name.size(), row_name.data()); + } + char row_type = state.problem.row_types_[row_idx]; + f_t abs_range = std::abs(range_val); + + if (row_type == 'E') { + if (range_val >= 0) { + state.problem.constraint_upper_bounds_[row_idx] = + state.problem.constraint_lower_bounds_[row_idx] + abs_range; + } else { + state.problem.constraint_lower_bounds_[row_idx] = + state.problem.constraint_upper_bounds_[row_idx] - abs_range; + } + } else if (row_type == 'L') { + state.problem.constraint_lower_bounds_[row_idx] = + state.problem.constraint_upper_bounds_[row_idx] - abs_range; + } else if (row_type == 'G') { + state.problem.constraint_upper_bounds_[row_idx] = + state.problem.constraint_lower_bounds_[row_idx] + abs_range; + } + }; + + while (cursor.ptr < cursor.end) { + [[maybe_unused]] auto range_name = cursor.read_field(); + if (accept_comment(cursor)) { + expect_eol(cursor); + continue; + } + auto row_name = cursor.read_field(); + auto value = (f_t)expect_number(cursor); +
apply_range(row_name, value); + + accept_comment(cursor); + if (!cursor.eol()) { + auto row_name2 = cursor.read_field(); + if (UNLIKELY(!row_name2.empty() && row_name2[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + auto value2 = (f_t)expect_number(cursor); + apply_range(row_name2, value2); + accept_comment(cursor); + } + expect_eol(cursor); + } +} + +// quadratic stuff is bare bones for now, optimize if needed + +/* Fills state.var_names_map with (name, index) pairs from var_names_sv. Does nothing in dense_ordered mode or when the map already has entries. */ +template +static void build_var_name_map_if_needed(parse_state_t& state) +{ + if (state.col_index_mode == index_mode_t::dense_ordered || !state.var_names_map.empty()) { + return; + } + scoped_timer_t timer("quadratic_build_var_name_map"); + state.var_names_map.reserve((size_t)state.problem.n_vars_ * 2); + for (size_t i = 0; i < state.var_names_sv.size(); ++i) { + state.var_names_map.emplace(state.var_names_sv[i], i); + } +} + +/* Resolves a variable name to its index: through col_dense in dense_ordered mode, otherwise through var_names_map. Returns SIZE_MAX when the map has no such name. */ +template +static size_t lookup_quadratic_var(parse_state_t& state, std::string_view name) +{ + if (state.col_index_mode == index_mode_t::dense_ordered) { return state.col_dense.lookup(name); } + auto it = state.var_names_map.find(name); + return it == state.var_names_map.end() ? SIZE_MAX : it->second; +} + +/* Builds the objective quadratic term in CSR form (Q_objective_values_, Q_objective_indices_, Q_objective_offsets_) from (row, col, value) entries. Entries are ordered by row, then column, then input sequence; with symmetric_upper_triangular set every off-diagonal entry is also added mirrored. Stored values are the input values multiplied by 0.5. Leaves the outputs untouched when entries is empty. */ +template +static void build_quadratic_csr(parse_state_t& state, + const std::vector>& entries, + bool symmetric_upper_triangular) +{ + scoped_timer_t timer("build_quadratic_csr"); + const size_t n_vars = (size_t)state.problem.n_vars_; + if (entries.empty()) { return; } + + struct expanded_entry_t { + size_t row; + size_t col; + size_t seq; + f_t value; + }; + + std::vector expanded; + expanded.reserve(symmetric_upper_triangular ?
entries.size() * 2 : entries.size()); + size_t seq = 0; + for (const auto& [row_i, col_i, value] : entries) { + size_t row = (size_t)row_i; + size_t col = (size_t)col_i; + expanded.push_back({row, col, seq++, value}); + if (symmetric_upper_triangular && row != col) { expanded.push_back({col, row, seq++, value}); } + } + + std::stable_sort(expanded.begin(), expanded.end(), [](const auto& a, const auto& b) { + if (a.row != b.row) return a.row < b.row; + if (a.col != b.col) return a.col < b.col; + return a.seq < b.seq; + }); + + auto& values = state.problem.Q_objective_values_; + auto& indices = state.problem.Q_objective_indices_; + auto& offsets = state.problem.Q_objective_offsets_; + values.clear(); + indices.clear(); + offsets.assign(n_vars + 1, i_t{0}); + values.reserve(expanded.size()); + indices.reserve(expanded.size()); + + /* Walk the sorted entries once; whenever the row advances, close every skipped row by recording the current value count as its offset. */ + size_t current_row = 0; + offsets[0] = 0; + for (const auto& entry : expanded) { + while (current_row < entry.row) { + offsets[++current_row] = (i_t)values.size(); + } + values.push_back(entry.value * f_t{0.5}); + indices.push_back((i_t)entry.col); + } + while (current_row < n_vars) { + offsets[++current_row] = (i_t)values.size(); + } +} + +/* Parses the QUADOBJ, QMATRIX and QCMATRIX sections in whatever order they appear. QCMATRIX entries are collected per block in state.qcmatrix_blocks, and the named row must exist and have type L or G. After the loop the objective CSR is built from the QUADOBJ entries (mirrored) when there are any, otherwise from the QMATRIX entries (not mirrored). Parsing stops at the first record that is outside any of these sections. */ +template +static void parse_quadratic_sections(parse_state_t& state, cursor_t& cursor) +{ + scoped_timer_t timer("parse_quadratic_sections"); + if (cursor.done()) { return; } + + build_var_name_map_if_needed(state); + std::vector> quadobj_entries; + std::vector> qmatrix_entries; + std::vector>* active_entries = nullptr; + + auto add_entry = [&](std::string_view var1, std::string_view var2, f_t value) { + size_t var1_idx = lookup_quadratic_var(state, var1); + if (var1_idx == SIZE_MAX) { + cursor.error( + "unknown variable name in quadratic section: %.*s", (int)var1.size(), var1.data()); + } + size_t var2_idx = lookup_quadratic_var(state, var2); + if (var2_idx == SIZE_MAX) { + cursor.error( + "unknown variable name in quadratic section: %.*s", (int)var2.size(), var2.data()); + } +
active_entries->emplace_back((i_t)var1_idx, (i_t)var2_idx, value); + }; + + while (cursor.ptr < cursor.end) { + if (accept_section(cursor, "QUADOBJ")) { + active_entries = &quadobj_entries; + continue; + } + if (accept_section(cursor, "QMATRIX")) { + active_entries = &qmatrix_entries; + continue; + } + if (accept(cursor, "QCMATRIX")) { + auto row_name = cursor.read_field(); + if (row_name.empty()) { cursor.error("QCMATRIX missing constraint row name"); } + size_t row_idx = state.row_lookup(row_name); + if (row_idx == SIZE_MAX) { + cursor.error( + "unknown constraint row name in QCMATRIX: %.*s", (int)row_name.size(), row_name.data()); + } + char row_type = state.problem.row_types_[row_idx]; + if (row_type != 'L' && row_type != 'G') { + cursor.error( + "QCMATRIX row must have ROWS type L or G: %.*s", (int)row_name.size(), row_name.data()); + } + expect_eol(cursor); + typename parse_state_t::qcmatrix_block_t block; + block.row_idx = row_idx; + block.row_name = row_name; + state.qcmatrix_blocks.push_back(std::move(block)); + /* Point at the entry list of the element just stored; the pointer is refreshed after every push_back. */ + active_entries = &state.qcmatrix_blocks.back().entries; + continue; + } + if (active_entries == nullptr) { break; } + + const char* field_start = cursor.ptr; + auto var1 = cursor.read_field(); + if (UNLIKELY(var1.empty())) { break; } + if (UNLIKELY(var1[0] == '$' || var1[0] == '*')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + /* A record whose read position was at the start of the buffer or directly after a line break is rejected as an unknown record. */ + const bool starts_column_one = + field_start == cursor.start || field_start[-1] == '\n' || field_start[-1] == '\r'; + if (UNLIKELY(starts_column_one)) { + cursor.error("unknown quadratic section record: %.*s", (int)var1.size(), var1.data()); + } + auto var2 = cursor.read_field(); + if (UNLIKELY(!var2.empty() && var2[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + f_t value = (f_t)expect_number(cursor); + add_entry(var1, var2, value); + accept_comment(cursor); + expect_eol(cursor); + } + + if (!quadobj_entries.empty()) { + build_quadratic_csr(state, quadobj_entries, true); + }
else if (!qmatrix_entries.empty()) { + build_quadratic_csr(state, qmatrix_entries, false); + } +} + +/* Points state.cursor at the byte range [range.begin, range.end). */ +template +static void set_cursor_range(parse_state_t& state, mps_phase_range_t range) +{ + state.cursor.ptr = range.begin; + state.cursor.end = range.end; +} + +/* Parses the header range: an optional comment line, then the NAME, OBJSENSE and OBJNAME sections. Stops after the comment line when the cursor is done. */ +template +static void parse_header_range(parse_state_t& state, mps_phase_range_t range) +{ + set_cursor_range(state, range); + accept_comment_line(state.cursor); + if (state.cursor.done()) { return; } + parse_name_section(state); + parse_objsense_section(state); + parse_objname_section(state); +} + +/* Runs parse_rows_section over the given range using state.cursor. */ +template +static void parse_rows_range(parse_state_t& state, mps_phase_range_t range) +{ + set_cursor_range(state, range); + parse_rows_section(state, range.end); +} + +/* Runs the chunk-parallel COLUMNS parser over the given range; num_threads of 0 lets the parser pick its own thread count. */ +template +static void parse_columns_range(parse_state_t& state, + mps_phase_range_t range, + int num_threads = 0) +{ + set_cursor_range(state, range); + parse_columns_section_parallel(state, num_threads, range.end); +} + +/* Parses RHS on a private cursor over the range; does nothing when the range is not present. */ +template +static void parse_rhs_range(parse_state_t& state, mps_phase_range_t range) +{ + if (!range.present) { return; } + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_rhs_section(state, cursor); +} + +/* Parses BOUNDS on a private cursor with the parallel path allowed; when the range is not present only the default bounds are set up. */ +template +static void parse_bounds_range(parse_state_t& state, mps_phase_range_t range) +{ + if (!range.present) { + init_variable_bounds_without_bounds_section(state); + return; + } + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_bounds_section(state, cursor, true); +} + +/* Parses RANGES on a private cursor; when the range is not present the constraint bounds are still initialised from the rows. */ +template +static void parse_ranges_range(parse_state_t& state, mps_phase_range_t range) +{ + if (!range.present) { + init_constraint_bounds_from_rows(state); + return; + } + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_ranges_section(state, cursor); +} + +/* Parses the quadratic sections on a private cursor; does nothing when the range is not present. */ +template +static void parse_quadratic_range(parse_state_t& state, mps_phase_range_t range) +{ + if (!range.present) { return; } + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_quadratic_sections(state,
cursor); +} + +template +static void finalize_qcmatrix_constraints(parse_state_t& state) +{ + if (state.qcmatrix_blocks.empty()) { return; } + scoped_timer_t timer("finalize_qcmatrix_constraints"); + const size_t original_rows = (size_t)state.problem.n_constraints_; + std::vector quadratic_rows(original_rows, 0); + std::vector seen_rows(original_rows, 0); + size_t active_blocks = 0; + + for (const auto& block : state.qcmatrix_blocks) { + if (block.entries.empty()) { continue; } + if (block.row_idx >= original_rows) { + state.cursor.error("QCMATRIX row index is out of range"); + } + if (seen_rows[block.row_idx]) { + state.cursor.error("duplicate QCMATRIX block for constraint row: %.*s", + (int)block.row_name.size(), + block.row_name.data()); + } + seen_rows[block.row_idx] = 1; + quadratic_rows[block.row_idx] = 1; + ++active_blocks; + } + + if (active_blocks == 0) { return; } + + // rebuild the A_ matrix. fairly ugly and brute force, could do better if we parsed the QCMATRIX + // entries before building the CSR in COLUMNS but unclear if worth it + for (const auto& block : state.qcmatrix_blocks) { + if (block.entries.empty()) { continue; } + + size_t linear_begin = (size_t)state.problem.A_offsets_[block.row_idx]; + size_t linear_end = (size_t)state.problem.A_offsets_[block.row_idx + 1]; + typename mps_data_model_t::quadratic_constraint_t qc; + qc.constraint_row_index = (i_t)block.row_idx; + qc.constraint_row_name = state.problem.row_names_[block.row_idx]; + qc.constraint_row_type = state.problem.row_types_[block.row_idx]; + qc.rhs_value = state.problem.b_[block.row_idx]; + qc.linear_values.assign(state.problem.A_.begin() + linear_begin, + state.problem.A_.begin() + linear_end); + qc.linear_indices.assign(state.problem.A_indices_.begin() + linear_begin, + state.problem.A_indices_.begin() + linear_end); + + std::vector perm(block.entries.size()); + for (size_t i = 0; i < perm.size(); ++i) { + perm[i] = i; + } + std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) 
{ + const auto& ea = block.entries[a]; + const auto& eb = block.entries[b]; + if (std::get<0>(ea) != std::get<0>(eb)) { return std::get<0>(ea) < std::get<0>(eb); } + return std::get<1>(ea) < std::get<1>(eb); + }); + + qc.rows.reserve(block.entries.size()); + qc.cols.reserve(block.entries.size()); + qc.vals.reserve(block.entries.size()); + for (size_t idx : perm) { + const auto& [row, col, val] = block.entries[idx]; + qc.rows.push_back(row); + qc.cols.push_back(col); + qc.vals.push_back(val); + } + state.problem.quadratic_constraints_.push_back(std::move(qc)); + } + + std::vector new_A; + std::vector new_A_indices; + std::vector new_A_offsets; + std::vector new_b; + std::vector new_clb; + std::vector new_cub; + std::vector new_row_names; + std::vector new_row_types; + + new_A.reserve(state.problem.A_.size()); + new_A_indices.reserve(state.problem.A_indices_.size()); + new_A_offsets.reserve(original_rows + 1 - active_blocks); + new_b.reserve(original_rows - active_blocks); + new_clb.reserve(original_rows - active_blocks); + new_cub.reserve(original_rows - active_blocks); + new_row_names.reserve(original_rows - active_blocks); + new_row_types.reserve(original_rows - active_blocks); + new_A_offsets.push_back(0); + + for (size_t row = 0; row < original_rows; ++row) { + if (quadratic_rows[row]) { continue; } + size_t begin = (size_t)state.problem.A_offsets_[row]; + size_t end = (size_t)state.problem.A_offsets_[row + 1]; + new_A.insert(new_A.end(), state.problem.A_.begin() + begin, state.problem.A_.begin() + end); + new_A_indices.insert(new_A_indices.end(), + state.problem.A_indices_.begin() + begin, + state.problem.A_indices_.begin() + end); + new_A_offsets.push_back((i_t)new_A.size()); + new_b.push_back(state.problem.b_[row]); + new_clb.push_back(state.problem.constraint_lower_bounds_[row]); + new_cub.push_back(state.problem.constraint_upper_bounds_[row]); + new_row_names.push_back(std::move(state.problem.row_names_[row])); + 
new_row_types.push_back(state.problem.row_types_[row]); + } + + state.problem.A_ = std::move(new_A); + state.problem.A_indices_ = std::move(new_A_indices); + state.problem.A_offsets_ = std::move(new_A_offsets); + state.problem.b_ = std::move(new_b); + state.problem.constraint_lower_bounds_ = std::move(new_clb); + state.problem.constraint_upper_bounds_ = std::move(new_cub); + state.problem.row_names_ = std::move(new_row_names); + state.problem.row_types_ = std::move(new_row_types); + state.problem.n_constraints_ = (i_t)state.problem.b_.size(); + state.problem.nnz_ = (i_t)state.problem.A_.size(); +} + +template +static void materialize_problem_names(parse_state_t& state) +{ + scoped_timer_t timer("materialize_problem_names"); + int num_threads = phase_thread_count(MPS_NAMES_THREAD_CAP); + // Copy string_views to actual strings (this is where allocation happens) + { + scoped_timer_t timer("materialize_problem_scalar_names"); + state.problem.problem_name_ = std::string(state.problem_name_sv); + state.problem.objective_name_ = std::string(state.objective_name_sv); + } + + { + scoped_timer_t timer("materialize_problem_row_names"); + size_t n = state.row_names_sv.size(); + state.problem.row_names_.resize(n); + // row names are usually small enough for SSO - parallel assigns mostly don't touch the heap and + // as such may help a lot ideally we could just allocate an arena and store non-owning string + // views but that'd require a refactor of the problem representation + if (n >= 1'000'000 && num_threads > 1) { +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (size_t i = 0; i < n; ++i) { + state.problem.row_names_[i].assign(state.row_names_sv[i]); + } + } else { + for (size_t i = 0; i < n; ++i) { + state.problem.row_names_[i].assign(state.row_names_sv[i]); + } + } + } + + { + scoped_timer_t timer("materialize_problem_var_names"); + const bool col_dense_ordered = state.col_index_mode == index_mode_t::dense_ordered; + size_t n = col_dense_ordered ? 
(size_t)state.problem.n_vars_ : state.var_names_sv.size(); + state.problem.var_names_.resize(n); + if (col_dense_ordered && n >= 1'000'000 && num_threads > 1) { +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (size_t i = 0; i < n; ++i) { + state.col_dense.format_name(i, state.problem.var_names_[i]); + } + } else if (col_dense_ordered) { + for (size_t i = 0; i < n; ++i) { + state.col_dense.format_name(i, state.problem.var_names_[i]); + } + } else if (n >= 1'000'000 && num_threads > 1) { +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (size_t i = 0; i < n; ++i) { + state.problem.var_names_[i].assign(state.var_names_sv[i]); + } + } else { + for (size_t i = 0; i < n; ++i) { + state.problem.var_names_[i].assign(state.var_names_sv[i]); + } + } + } +} + +template +static void append_bounds_only_variables(parse_state_t& state) +{ + if (state.bounds_only_vars.empty()) { return; } + scoped_timer_t timer("append_bounds_only_variables"); + + // BOUNDS-only variables have no matrix entries; append after COLUMNS vars. 
+ for (const auto& [name, aux] : state.bounds_only_vars) { + state.problem.var_names_.emplace_back(name); + state.problem.var_types_.push_back(aux.type); + state.problem.c_.push_back(f_t{0}); + state.problem.variable_lower_bounds_.push_back(aux.lb); + state.problem.variable_upper_bounds_.push_back(aux.ub); + } + state.problem.n_vars_ = (i_t)state.problem.var_names_.size(); +} + +template +static std::size_t init_problem_storage(mps_data_model_t& problem, + std::size_t reserve_hint) +{ + problem.n_vars_ = 0; + problem.n_constraints_ = 0; + problem.nnz_ = 0; + problem.maximize_ = false; + problem.objective_scaling_factor_ = f_t{1}; + problem.objective_offset_ = f_t{0}; + + std::size_t reserve_size = std::max(reserve_hint, 1 * MiB); + std::size_t reserve_dim = std::max((size_t)1000, reserve_size / 1000); + problem.A_offsets_.reserve(reserve_dim); + problem.b_.reserve(reserve_dim); + problem.variable_lower_bounds_.reserve(reserve_dim); + problem.variable_upper_bounds_.reserve(reserve_dim); + problem.var_types_.reserve(reserve_dim); + problem.row_types_.reserve(reserve_dim); + problem.row_names_.reserve(reserve_dim); + problem.var_names_.reserve(reserve_dim); + problem.constraint_lower_bounds_.reserve(reserve_dim); + problem.constraint_upper_bounds_.reserve(reserve_dim); + return reserve_dim; +} + +// Contract every input stream fed to parse_mps_fast_stream must satisfy. 
+template +concept InputStream = requires(Stream stream) +{ + {stream.data()}->std::convertible_to; + {stream.mutable_data()}->std::convertible_to; + {stream.size()}->std::convertible_to; + {stream.compressed_size()}->std::convertible_to; + {stream.reserve_size_hint()}->std::convertible_to; + {stream.registry()}->std::same_as; + {stream.view()}->std::same_as; + {stream.run_decode_tasks()}->std::same_as; +}; + +template +static mps_data_model_t parse_mps_fast_stream(Stream& stream, + const char* total_timer_name, + const char* producer_task_name) +{ + omp_max_active_levels_guard_t omp_active_levels(2); + + input_stream_view_t input = stream.view(); + auto total_timer = std::make_unique(total_timer_name); + mps_data_model_t problem; + std::size_t reserve_dim = init_problem_storage(problem, stream.reserve_size_hint()); + + cursor_t cursor(input.data, 0); + parse_state_t state(problem, cursor); + state.row_names_sv.reserve(reserve_dim); + + auto phase_end = [](const char*) { flush_timers(); }; + + parallel_error_latch_t parser_tasks; + + auto run_parser_task = [&](auto&& fn) { + if (parser_tasks.stopped()) { return; } + try { + fn(); + } catch (...) { + parser_tasks.capture(std::current_exception()); + } + }; + + auto unblock_phase_waiters_after_error = [&]() { + mps_phase_range_t empty{input.data, input.data, false}; + input.registry->publish(mps_phase_kind::header, empty); + input.registry->publish(mps_phase_kind::rows, empty); + input.registry->publish(mps_phase_kind::columns, empty); + input.registry->publish(mps_phase_kind::rhs, empty); + input.registry->publish(mps_phase_kind::bounds, empty); + input.registry->publish(mps_phase_kind::ranges, empty); + input.registry->publish(mps_phase_kind::quadratic, empty); + }; + + // These ints carry no data; they exist only as OpenMP task-dependency tokens. A task's + // depend(out: X) "produces" X and depend(in: X) waits on it, so the phase ordering in the + // task graph below (e.g. 
bounds after columns_done, because bounds reference variable names) + // is expressed purely through which tokens each task depends on. + int header_ready = 0, rows_ready = 0, columns_ready = 0; + int rhs_ready = 0, bounds_ready = 0, ranges_ready = 0, quadratic_ready = 0; + int header_done = 0, rows_done = 0, columns_done = 0; + int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0; + int csr_done = 0; + + const std::size_t parser_size = std::max(stream.reserve_size_hint(), input.compressed_size); + const int parser_threads = parser_thread_cap_for_size(parser_size); + +#pragma omp parallel num_threads(parser_threads) + { + std::string thread_name = "omp-parser-" + std::to_string(omp_get_thread_num()); + nvtx::name_current_thread(thread_name.c_str()); + +#pragma omp single + { + // Bridge between the producer and the parse tasks: each detached task below blocks + // until run_decode_tasks() publishes that phase's byte range into the registry, then + // completes its event and fulfills depend(out: _ready) -- releasing the matching + // parse task. This is what lets ROWS parsing start the instant the ROWS bytes are + // decoded, overlapping with the decode of later sections. 
+ omp_event_handle_t ev_header; +#pragma omp task detach(ev_header) depend(out : header_ready) + { + input.registry->attach_event(mps_phase_kind::header, ev_header); + } + omp_event_handle_t ev_rows; +#pragma omp task detach(ev_rows) depend(out : rows_ready) + { + input.registry->attach_event(mps_phase_kind::rows, ev_rows); + } + omp_event_handle_t ev_columns; +#pragma omp task detach(ev_columns) depend(out : columns_ready) + { + input.registry->attach_event(mps_phase_kind::columns, ev_columns); + } + omp_event_handle_t ev_rhs; +#pragma omp task detach(ev_rhs) depend(out : rhs_ready) + { + input.registry->attach_event(mps_phase_kind::rhs, ev_rhs); + } + omp_event_handle_t ev_bounds; +#pragma omp task detach(ev_bounds) depend(out : bounds_ready) + { + input.registry->attach_event(mps_phase_kind::bounds, ev_bounds); + } + omp_event_handle_t ev_ranges; +#pragma omp task detach(ev_ranges) depend(out : ranges_ready) + { + input.registry->attach_event(mps_phase_kind::ranges, ev_ranges); + } + omp_event_handle_t ev_quadratic; +#pragma omp task detach(ev_quadratic) depend(out : quadratic_ready) + { + input.registry->attach_event(mps_phase_kind::quadratic, ev_quadratic); + } + + // We intentionally keep LZ4/raw input as a stable full-buffer producer here. The + // progressive decoded-page lifetime prototype saved RSS, but made COLUMNS/merge slower + // and really wants a separate memory-limited parser pipeline instead of this fast path. +#pragma omp task + { + MPS_NVTX_RANGE(producer_task_name, nvtx::colors::io); + try { + stream.run_decode_tasks(); + } catch (...) 
{ + parser_tasks.capture(std::current_exception()); + unblock_phase_waiters_after_error(); + } + } + +#pragma omp task depend(in : header_ready) depend(out : header_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_header", nvtx::colors::generic); + parse_header_range(state, input.registry->range(mps_phase_kind::header)); + phase_end("header"); + }); + } + +#pragma omp task depend(in : rows_ready, header_done) depend(out : rows_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_rows", nvtx::colors::rows); + parse_rows_range(state, input.registry->range(mps_phase_kind::rows)); + phase_end("rows"); + }); + } + +#pragma omp task depend(in : rows_done, columns_ready) depend(out : columns_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_columns", nvtx::colors::columns); + parse_columns_range(state, input.registry->range(mps_phase_kind::columns)); + phase_end("columns"); + }); + } + +#pragma omp task depend(in : columns_done) depend(out : names_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_materialize_names", nvtx::colors::names); + scoped_timer_t timer("materialize_problem_names_task"); + materialize_problem_names(state); + }); + } + +#pragma omp task depend(in : columns_done) depend(out : csr_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_materialize_csr", nvtx::colors::alloc); + materialize_problem_csr(state); + }); + } + +#pragma omp task depend(in : rhs_ready, columns_done) depend(out : rhs_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_rhs", nvtx::colors::rhs); + parse_rhs_range(state, input.registry->range(mps_phase_kind::rhs)); + phase_end("rhs"); + }); + } + +#pragma omp task depend(in : ranges_ready, rhs_done) depend(out : ranges_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_ranges", nvtx::colors::ranges); + parse_ranges_range(state, input.registry->range(mps_phase_kind::ranges)); + phase_end("ranges"); + }); + } + +#pragma omp task depend(in : bounds_ready, columns_done) depend(out : bounds_done) + 
{ + run_parser_task([&] { + MPS_NVTX_RANGE("task_bounds", nvtx::colors::bounds); + parse_bounds_range(state, input.registry->range(mps_phase_kind::bounds)); + phase_end("bounds"); + }); + } + +#pragma omp task depend(in : quadratic_ready, columns_done) depend(out : quadratic_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_quadratic", nvtx::colors::generic); + parse_quadratic_range(state, input.registry->range(mps_phase_kind::quadratic)); + phase_end("quadratic"); + }); + } + } + } + + parser_tasks.rethrow_if_error(); + + finalize_qcmatrix_constraints(state); + append_bounds_only_variables(state); + + input.size = stream.size(); + cursor.end = input.data + input.size; + if (!input.registry->endata_ready()) { + cursor.ptr = input.data + input.size; + cursor.error("input ended before ENDATA boundary was resolved"); + } + if (input.registry->endata_present()) { + cursor.ptr = input.registry->endata_begin(); + expect(cursor, "ENDATA"); + } + + total_timer.reset(); + flush_timers(); + return problem; +} + +struct padded_memory_input_t { + std::vector buffer; + std::size_t input_size = 0; + std::size_t compressed_size = 0; +}; + +static padded_memory_input_t read_compressed_mps_file(const std::string& path) +{ + std::vector buffer = file_to_string(path); + if (buffer.empty()) { buffer.push_back('\0'); } + + std::size_t input_size = buffer.size() - 1; + ensure_input_buffer_padding(buffer, input_size); + return {std::move(buffer), input_size, get_file_size(path)}; +} + +template +mps_data_model_t parse_mps_fast_file(const std::string& path, FileReadMethod read_method) +{ + FileReadMethod effective_method = effective_file_read_method(path, read_method); + switch (effective_method) { + case FileReadMethod::Lz4: { + lz4_input_stream_t stream(path); + return parse_mps_fast_stream( + stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode"); + } + case FileReadMethod::Gzip: + case FileReadMethod::Bzip2: { + padded_memory_input_t input = 
read_compressed_mps_file(path); + memory_input_stream_t stream( + std::move(input.buffer), input.input_size, input.compressed_size); + const char* timer_name = effective_method == FileReadMethod::Gzip + ? "parse_mps_fast_file_gzip (total)" + : "parse_mps_fast_file_bzip2 (total)"; + return parse_mps_fast_stream( + stream, timer_name, "task_memory_scan"); + } + case FileReadMethod::Read: { + raw_input_stream_t stream(path); + return parse_mps_fast_stream( + stream, "parse_mps_fast_file_raw (total)", "task_raw_read"); + } + } + __builtin_unreachable(); +} + +template mps_data_model_t parse_mps_fast_file(const std::string& path, + FileReadMethod read_method); +template mps_data_model_t parse_mps_fast_file(const std::string& path, + FileReadMethod read_method); +template mps_data_model_t parse_mps_fast_file(const std::string& path, + FileReadMethod read_method); +template mps_data_model_t parse_mps_fast_file(const std::string& path, + FileReadMethod read_method); + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_parser.hpp new file mode 100644 index 0000000000..6047a55f05 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/fast_parser.hpp @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "file_reader.hpp" + +#include + +#include +#include + +namespace cuopt::linear_programming::io::detail { + +template +using parser_model_t = mps_data_model_t; + +template +parser_model_t parse_mps_fast_file(const std::string& path, + FileReadMethod read_method = FileReadMethod::Read); + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp new file mode 100644 index 0000000000..78e4219e06 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -0,0 +1,371 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#include "file_reader.hpp" +#include "nvtx_ranges.hpp" + +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::io::detail { + +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_fail; + +namespace { + +constexpr std::size_t raw_input_window_bytes = 64ull * 1024ull * 1024ull; +constexpr std::size_t raw_input_max_read_threads = 8; +constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull; +constexpr long nfs_super_magic = 0x6969; + +bool path_has_suffix(const std::string& path, const char* suffix) noexcept +{ + std::size_t suffix_len = std::strlen(suffix); + if (path.size() < suffix_len) { return false; } + for (std::size_t i = 0; i < suffix_len; ++i) { + unsigned char path_char = path[path.size() - suffix_len + i]; + if (std::tolower(path_char) != suffix[i]) { return false; } + } + return true; +} + +std::size_t add_input_padding(std::size_t size) +{ + if (size > 
std::numeric_limits::max() - input_buffer_padding_bytes) { + mps_parser_fail(error_type_t::OutOfMemoryError, "input padding size overflow"); + } + return size + input_buffer_padding_bytes; +} + +bool is_nfs_backed_path(const std::string& path) noexcept +{ + struct statfs fs; + return ::statfs(path.c_str(), &fs) == 0 && fs.f_type == nfs_super_magic; +} + +} // namespace + +void ensure_input_buffer_padding(std::vector& buffer, std::size_t input_size) +{ + if (input_size > buffer.size()) { + mps_parser_fail(error_type_t::ValidationError, + "input_size %zu exceeds buffer size %zu", + input_size, + buffer.size()); + } + std::size_t required = add_input_padding(input_size); + if (buffer.size() < required) { buffer.resize(required, '\0'); } +} + +std::size_t get_file_size(int fd, const std::string& path) +{ + struct stat st; + if (::fstat(fd, &st) != 0) { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to stat file '%s': %s", + path.c_str(), + std::strerror(errno)); + } + if (st.st_size < 0) { + mps_parser_fail(error_type_t::RuntimeError, "Negative file size for '%s'", path.c_str()); + } + return (std::size_t)st.st_size; +} + +std::size_t get_file_size(const std::string& path) +{ + int fd = ::open(path.c_str(), O_RDONLY); + if (fd < 0) { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to open file '%s': %s", + path.c_str(), + std::strerror(errno)); + } + cuopt::scope_guard close_fd([&] { + if (fd >= 0) { ::close(fd); } + }); + + std::size_t size = get_file_size(fd, path); + // No explicit ::close(fd) here: close_fd releases the descriptor exactly once on every exit path, and a second close could hit a descriptor reopened by another thread. + return size; +} + +std::size_t system_page_size() +{ + static std::size_t page_size = [] { + long value = ::sysconf(_SC_PAGESIZE); + return value > 0 ? 
(std::size_t)value : (std::size_t)4096; + }(); + return page_size; +} + +bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset) +{ + std::size_t done = 0; + while (done < bytes) { + std::size_t remaining = bytes - done; + std::size_t chunk = + std::min(remaining, (std::size_t)std::numeric_limits::max()); + ssize_t got = ::pread(fd, dst + done, chunk, (off_t)(offset + done)); + if (got < 0) { + if (errno == EINTR) { continue; } + return false; + } + if (got == 0) { + errno = EIO; + return false; + } + done += (std::size_t)got; + } + return true; +} + +raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) +{ + MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io); + int buffered_fd = ::open(path.c_str(), O_RDONLY); + cuopt::scope_guard close_buffered([&] { + if (buffered_fd >= 0) { ::close(buffered_fd); } + }); + if (buffered_fd < 0) { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to open raw MPS file '%s': %s", + path.c_str(), + std::strerror(errno)); + } + + int direct_fd = -1; + cuopt::scope_guard close_direct([&] { + if (direct_fd >= 0) { ::close(direct_fd); } + }); + + file_size_ = get_file_size(buffered_fd, path); + int read_fd = buffered_fd; + bool large_enough_for_direct = file_size_ > raw_input_direct_io_threshold_bytes; + bool nfs_backed = is_nfs_backed_path(path); + // Buffered reads are consistently faster than O_DIRECT on our NFS mounts; + // keep direct I/O for large local files where it wins. 
+ if (large_enough_for_direct && !nfs_backed) { +#ifdef O_DIRECT + direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT); + if (direct_fd >= 0) { + read_fd = direct_fd; + direct_io_ = true; + } +#endif + } + window_bytes_ = raw_input_window_bytes; + window_count_ = std::max(1, (file_size_ + window_bytes_ - 1) / window_bytes_); +#ifdef MPS_FAST_TIMERS + read_window_ms_.assign(window_count_, 0); +#endif + + output_mapped_size_ = + cuda::round_up(std::max(add_input_padding(file_size_), 1), system_page_size()); + output_region_ = mmap_region_t::anonymous( + output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer"); + output_data_ = output_region_.char_data(); + output_region_.advise(MADV_HUGEPAGE); + + section_scanner_ = + std::make_unique(output_data_, window_count_, registry_); + + buffered_fd_ = buffered_fd; + buffered_fd = -1; + fd_ = read_fd; + if (read_fd == direct_fd) { direct_fd = -1; } +} + +raw_input_stream_t::~raw_input_stream_t() +{ + if (fd_ >= 0) { ::close(fd_); } + if (buffered_fd_ >= 0 && buffered_fd_ != fd_) { ::close(buffered_fd_); } +} + +const char* raw_input_stream_t::data() const noexcept { return output_data_; } +char* raw_input_stream_t::mutable_data() noexcept { return output_data_; } +std::size_t raw_input_stream_t::size() const noexcept { return output_view_size_; } +std::size_t raw_input_stream_t::compressed_size() const noexcept { return file_size_; } +std::size_t raw_input_stream_t::reserve_size_hint() const noexcept { return file_size_; } + +void raw_input_stream_t::read_window_payload(std::size_t offset, std::size_t size) +{ + if (pread_full(fd_, output_data_ + offset, size, offset)) { return; } + // O_DIRECT can reject an unaligned request with EINVAL; fall back to the + // buffered descriptor for this window when that happens. 
+ if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0 && + pread_full(buffered_fd_, output_data_ + offset, size, offset)) { + return; + } + mps_parser_fail(error_type_t::RuntimeError, + "Failed to pread raw MPS file '%s': %s", + path_.c_str(), + std::strerror(errno)); +} + +void raw_input_stream_t::run_decode_tasks() +{ + MPS_NVTX_RANGE("raw_input_run_read_tasks", nvtx::colors::io); + if (file_size_ == 0) { + output_view_size_ = 0; + section_scanner_->publish_ready(0); + return; + } + + std::size_t hw_threads = + std::max(1, (std::size_t)std::thread::hardware_concurrency()); + std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads); + thread_count = std::max(1, std::min(thread_count, window_count_)); + + // Each window is read independently and handed to the scanner, which owns the + // contiguous decoded-byte frontier and the parallel section publication. + parallel_error_latch_t latch; +#ifdef MPS_FAST_TIMERS + auto read_wall_start = std::chrono::steady_clock::now(); +#endif + parallel_for_indexed( + window_count_, thread_count, latch, "raw-input-read-", [&](std::size_t index) { + MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io); + std::size_t offset = index * window_bytes_; + std::size_t size = std::min(window_bytes_, file_size_ - offset); + { + MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io); +#ifdef MPS_FAST_TIMERS + auto start = std::chrono::steady_clock::now(); +#endif + read_window_payload(offset, size); +#ifdef MPS_FAST_TIMERS + auto end = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(end - start); + read_window_ms_[index] = + (uint32_t)std::min(elapsed.count(), std::numeric_limits::max()); +#endif + } + MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io); + section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size); + }); +#ifdef MPS_FAST_TIMERS + auto read_wall_end = std::chrono::steady_clock::now(); +#endif + latch.rethrow_if_error(); + +#ifdef 
MPS_FAST_TIMERS + if (!read_window_ms_.empty()) { + std::vector sorted = read_window_ms_; + std::sort(sorted.begin(), sorted.end()); + auto percentile = [&](double pct) { + std::size_t idx = (std::size_t)std::min((double)(sorted.size() - 1), + pct * (double)(sorted.size() - 1)); + return sorted[idx]; + }; + uint64_t total_ms = 0; + for (uint32_t value : read_window_ms_) { + total_ms += value; + } + std::fprintf( + stderr, + "[RAW_READ_LATENCY] windows=%zu wall_ms=%lld total_window_ms=%llu avg_ms=%.3f min_ms=%u " + "p50_ms=%u p90_ms=%u p99_ms=%u max_ms=%u\n", + read_window_ms_.size(), + (long long)std::chrono::duration_cast(read_wall_end - + read_wall_start) + .count(), + (unsigned long long)total_ms, + (double)total_ms / (double)read_window_ms_.size(), + sorted.front(), + percentile(0.50), + percentile(0.90), + percentile(0.99), + sorted.back()); + } +#endif + + output_view_size_ = section_scanner_->ready_bytes(); + section_scanner_->publish_ready(output_view_size_); +} + +memory_input_stream_t::memory_input_stream_t(std::vector buffer, + std::size_t input_size, + std::size_t compressed_size) + : buffer_(std::move(buffer)), input_size_(input_size), compressed_size_(compressed_size) +{ + ensure_input_buffer_padding(buffer_, input_size_); + section_scanner_ = std::make_unique(buffer_.data(), 1, registry_); +} + +const char* memory_input_stream_t::data() const noexcept { return buffer_.data(); } +char* memory_input_stream_t::mutable_data() noexcept { return buffer_.data(); } +std::size_t memory_input_stream_t::size() const noexcept { return input_size_; } +std::size_t memory_input_stream_t::compressed_size() const noexcept { return compressed_size_; } +std::size_t memory_input_stream_t::reserve_size_hint() const noexcept { return input_size_; } + +void memory_input_stream_t::run_decode_tasks() +{ + MPS_NVTX_RANGE("memory_input_scan", nvtx::colors::io); + // Single block: observe_block advances the frontier and publishes. 
+ section_scanner_->observe_block(0, buffer_.data(), buffer_.data() + input_size_); +} + +bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); } +bool has_gzip_extension(const std::string& path) noexcept { return path_has_suffix(path, ".gz"); } +bool has_bzip2_extension(const std::string& path) noexcept { return path_has_suffix(path, ".bz2"); } + +void drop_file_cache(const std::string& path) +{ + MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io); + int fd = ::open(path.c_str(), O_RDONLY); + if (fd < 0) { return; } + ::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); + ::close(fd); +} + +FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method) +{ + if (has_lz4_extension(path)) { return FileReadMethod::Lz4; } + if (has_gzip_extension(path)) { return FileReadMethod::Gzip; } + if (has_bzip2_extension(path)) { return FileReadMethod::Bzip2; } + if (method == FileReadMethod::Lz4) { + mps_parser_fail( + error_type_t::ValidationError, "lz4 read method requires a .lz4 input: %s", path.c_str()); + } + return method; +} + +const char* file_read_method_name(FileReadMethod method) noexcept +{ + switch (method) { + case FileReadMethod::Read: return "read"; + case FileReadMethod::Lz4: return "lz4"; + case FileReadMethod::Gzip: return "gzip"; + case FileReadMethod::Bzip2: return "bzip2"; + default: return "unknown"; + } +} + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp new file mode 100644 index 0000000000..8ca3456401 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp @@ -0,0 +1,319 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. 
SPDX-License-Identifier: Apache-2.0 + +// Input layer for the fast MPS parser: turns on-disk bytes (plain or .lz4) into one +// contiguous parse buffer and publishes MPS section boundaries as data becomes available. +// +// Model: +// - Output is an anonymous mmap'd buffer (THP-hinted, tail-padded for SIMD/cursor safety). +// Raw inputs pread directly into fixed slots; LZ4 decodes into the same layout. +// - Work is split into windows (fixed spans of compressed/raw file bytes). Workers use +// parallel_for_indexed() — std::thread + shared-index dispatch, not OpenMP — because +// blocking pread()/decode does not compose cleanly with OMP team barriers. +// - Each completed window/block is handed to mps_section_block_scanner_t::observe_block(). +// Blocks may finish out of order; the scanner advances a contiguous ready_bytes_ +// frontier and publishes section ranges into mps_phase_registry_t only once the prefix +// up to a section title is contiguous and scannable. +// - The parser runs as OpenMP tasks on those published phases while run_decode_tasks() +// (raw parallel pread, or the LZ4 reader → metadata scanner → decoder pipeline) fills +// the buffer on separate threads. parallel_error_latch_t propagates the first worker +// failure and stops the rest. +// +// LZ4 adds a resident-window pool (parallel pread of compressed spans), block metadata +// scanning with ptr_if_contiguous()/copy_to for window-boundary payloads, parallel decode +// workers, window ref-counting/release, and lazy commit_up_to() of decoded output pages. 
+ +#pragma once + +#include "mmap_region.hpp" +#include "mps_section_scanner.hpp" +#include "nvtx_ranges.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::io::detail { + +inline constexpr std::size_t input_buffer_padding_bytes = 64; + +void ensure_input_buffer_padding(std::vector& buffer, std::size_t input_size); + +struct lz4_pipeline_t; + +/** + * @brief File reading method selection + */ +enum class FileReadMethod { Read, Lz4, Gzip, Bzip2 }; + +/** + * @brief Return the effective method for a path. + * + * Compressed inputs are auto-detected by extension; all other inputs use raw input reads. + */ +FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method); + +/** + * @brief Human-readable method name. + */ +const char* file_read_method_name(FileReadMethod method) noexcept; + +/** + * @brief True when the file name has an lz4 extension. + */ +bool has_lz4_extension(const std::string& path) noexcept; +bool has_gzip_extension(const std::string& path) noexcept; +bool has_bzip2_extension(const std::string& path) noexcept; + +/** + * @brief Ask the OS to evict clean cached pages for this file. + * + * This is advisory and affects the local client page cache only. + */ +void drop_file_cache(const std::string& path); + +/** + * @brief OS memory page size, queried once and cached. + */ +std::size_t system_page_size(); + +/** + * @brief File size in bytes; fails with a parser error if it cannot be determined. + */ +std::size_t get_file_size(int fd, const std::string& path); +std::size_t get_file_size(const std::string& path); + +/** + * @brief Read exactly @p bytes at @p offset into @p dst, retrying on EINTR. + * + * Returns false and leaves errno set on error or unexpected EOF. + */ +bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset); + +// First-error-wins latch shared by the parallel reader/decoder pipelines. 
The +// first captured exception is retained and a stop flag is raised so cooperating +// workers can unwind promptly. The retained exception is rethrown by the +// orchestrating thread once all workers have joined. +class parallel_error_latch_t { + public: + // Stores eptr and raises the stop flag if no error was captured before; later calls are ignored. + void capture(std::exception_ptr eptr) + { + std::lock_guard lock(mutex_); + if (!first_error_) { + first_error_ = eptr; + stopped_.store(true, std::memory_order_release); + } + } + + bool stopped() const noexcept { return stopped_.load(std::memory_order_acquire); } + + // Reads first_error_ without taking mutex_, so call it only after every capturing worker has joined. + void rethrow_if_error() const + { + if (first_error_) { std::rethrow_exception(first_error_); } + } + + private: + std::mutex mutex_; + std::exception_ptr first_error_ = nullptr; + std::atomic_bool stopped_{false}; +}; + +// Holds worker threads and joins every joinable one in its destructor. +class scoped_thread_group { + public: + void reserve(std::size_t count) { threads_.reserve(count); } + + template + void emplace(F&& f) + { + threads_.emplace_back(std::forward(f)); + } + + ~scoped_thread_group() + { + for (auto& thread : threads_) { + if (thread.joinable()) { thread.join(); } + } + } + + private: + std::vector threads_; +}; + +// Dynamically dispatched parallel loop over [0, count). Each of thread_count workers pulls +// the next index from a shared counter and invokes body(index). An exception +// escaping body is captured into the latch and stops the loop; the caller is +// responsible for calling latch.rethrow_if_error() after this returns. Workers +// are named "<prefix><worker index>" when a prefix is supplied. 
+// OMP just doesn't really play well with blocking pread() +template +void parallel_for_indexed(std::size_t count, + std::size_t thread_count, + parallel_error_latch_t& latch, + const char* thread_name_prefix, + Body body) +{ + assert(thread_count > 0); + + std::atomic_size_t next{0}; + scoped_thread_group workers; + workers.reserve(thread_count); + for (std::size_t t = 0; t < thread_count; ++t) { + workers.emplace([&, t] { + if (thread_name_prefix != nullptr) { + std::string name = thread_name_prefix + std::to_string(t); + nvtx::name_current_thread(name.c_str()); + } + while (!latch.stopped()) { + std::size_t index = next.fetch_add(1, std::memory_order_relaxed); + if (index >= count) { break; } + try { + body(index); + } catch (...) { + latch.capture(std::current_exception()); + return; + } + } + }); + } +} + +struct input_stream_view_t { + const char* data = nullptr; + char* mutable_data = nullptr; + std::size_t size = 0; + std::size_t compressed_size = 0; + mps_phase_registry_t* registry = nullptr; +}; + +/** + * @brief CRTP base supplying the registry and view() shared by every input + * stream. Derived classes provide data()/mutable_data()/size()/compressed_size(). 
+ */ +template +class input_stream_base_t { + public: + mps_phase_registry_t& registry() noexcept { return registry_; } + + input_stream_view_t view() noexcept + { + auto* self = static_cast(this); + // The view stores a non-owning pointer to this stream's registry, so it must not outlive the stream. + return {self->data(), self->mutable_data(), self->size(), self->compressed_size(), &registry_}; + } + + protected: + mps_phase_registry_t registry_; +}; + +// Handles lz4 compressed files (useful since lz4 is very fast, works well for MPS, and makes +// parallel decompression trivial) +class lz4_input_stream_t : public input_stream_base_t { + public: + explicit lz4_input_stream_t(const std::string& path); + ~lz4_input_stream_t(); + + lz4_input_stream_t(const lz4_input_stream_t&) = delete; + lz4_input_stream_t& operator=(const lz4_input_stream_t&) = delete; + + const char* data() const noexcept; + char* mutable_data() noexcept; + std::size_t size() const noexcept; + std::size_t compressed_size() const noexcept; + std::size_t reserve_size_hint() const noexcept; + + void run_decode_tasks(); + + private: + friend struct lz4_pipeline_t; + + void commit_up_to(std::size_t bytes); + + std::string path_; + int fd_ = -1; + mmap_region_t output_region_; + std::size_t compressed_size_ = 0; + char* output_data_ = nullptr; + std::size_t output_mapped_size_ = 0; + std::size_t output_view_size_ = 0; + std::size_t output_committed_size_ = 0; + std::size_t block_max_size_ = 0; + std::size_t content_size_ = 0; + std::size_t header_size_ = 0; + bool content_size_present_ = false; + bool block_checksum_ = false; + bool content_checksum_ = false; + bool dict_id_ = false; + std::mutex commit_mutex_; + std::unique_ptr section_scanner_; + std::size_t block_slot_count_ = 0; +}; + +// Takes a file path +class raw_input_stream_t : public input_stream_base_t { + public: + explicit raw_input_stream_t(const std::string& path); + ~raw_input_stream_t(); + + raw_input_stream_t(const raw_input_stream_t&) = delete; + raw_input_stream_t& operator=(const raw_input_stream_t&) = delete; + + const char* data() 
const noexcept; + char* mutable_data() noexcept; + std::size_t size() const noexcept; + std::size_t compressed_size() const noexcept; + std::size_t reserve_size_hint() const noexcept; + + void run_decode_tasks(); + + private: + void read_window_payload(std::size_t offset, std::size_t size); + + std::string path_; + int fd_ = -1; + int buffered_fd_ = -1; + bool direct_io_ = false; + mmap_region_t output_region_; + char* output_data_ = nullptr; + std::size_t output_mapped_size_ = 0; + std::size_t output_view_size_ = 0; + std::size_t file_size_ = 0; + std::size_t window_bytes_ = 0; + std::size_t window_count_ = 0; +#ifdef MPS_FAST_TIMERS + std::vector read_window_ms_; +#endif + std::unique_ptr section_scanner_; +}; + +// Takes an in-memory buffer +class memory_input_stream_t : public input_stream_base_t { + public: + memory_input_stream_t(std::vector buffer, + std::size_t input_size, + std::size_t compressed_size); + + memory_input_stream_t(const memory_input_stream_t&) = delete; + memory_input_stream_t& operator=(const memory_input_stream_t&) = delete; + + const char* data() const noexcept; + char* mutable_data() noexcept; + std::size_t size() const noexcept; + std::size_t compressed_size() const noexcept; + std::size_t reserve_size_hint() const noexcept; + + void run_decode_tasks(); + + private: + std::vector buffer_; + std::size_t input_size_ = 0; + std::size_t compressed_size_ = 0; + std::unique_ptr section_scanner_; +}; + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp new file mode 100644 index 0000000000..b7138fedb6 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp @@ -0,0 +1,304 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// 32-bit FNV-1a computed over the bytes of [ptr, ptr + len) from the LAST byte to
// the first. Row names commonly share long prefixes, so the distinguishing tail
// bytes are folded in first. An empty range yields the FNV offset basis.
static inline uint32_t fnv1a_hash(const char* ptr, std::size_t len)
{
  constexpr uint32_t offset_basis = 2166136261u;
  constexpr uint32_t prime        = 16777619u;

  uint32_t state = offset_basis;
  for (std::size_t remaining = len; remaining != 0; --remaining) {
    // Go through uint8_t so bytes >= 0x80 are not sign-extended.
    const uint8_t byte = static_cast<uint8_t>(ptr[remaining - 1]);
    state              = (state ^ byte) * prime;
  }
  return state;
}
len : HASH_KEY_BYTES); + return simde_mm256_load_si256(reinterpret_cast(buf)); +} + +static inline bool key_cmpeq(const char* slot_key, hash_key_t key) +{ + simde__m256i slot_vec = simde_mm256_loadu_si256(reinterpret_cast(slot_key)); + int mask = simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot_vec, key)); + return (mask & 0x0fffffff) == 0x0fffffff; +} + +static inline void key_store(char* slot_key, hash_key_t key) +{ + simde_mm256_store_si256(reinterpret_cast(slot_key), key); +} + +struct hash_partition_t { + hash_slot_var_t* slots = nullptr; + size_t buckets = 0; + size_t mask = 0; +}; + +static inline size_t hash_partition_for(uint32_t hash) +{ + return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS)); +} + +static inline size_t hash_bucket_count_for(size_t n_rows, bool compact) +{ + if (compact) { return cuda::next_power_of_two(std::max(n_rows + n_rows / 2, (size_t)64)); } + return cuda::next_power_of_two(std::max(n_rows * 2, (size_t)64)); +} + +static inline size_t hash_lookup_in( + const hash_slot_var_t* slots, size_t buckets, size_t mask, hash_key_t key, uint32_t hash) +{ + const hash_slot_var_t* slot = &slots[hash & (uint32_t)mask]; + for (size_t i = 0; i < buckets; ++i, ++slot) { + if (slot >= &slots[buckets]) { slot = &slots[0]; } + if (slot->count == 0) { return std::numeric_limits::max(); } + if (key_cmpeq(slot->key, key)) { return slot->count - 1; } + } + return std::numeric_limits::max(); +} + +static inline size_t hash_insert_into(hash_slot_var_t* slots, + size_t buckets, + size_t mask, + std::string_view name, + uint32_t hash, + size_t index) +{ + hash_key_t key = make_key(name.data(), name.size()); + hash_slot_var_t* slot = &slots[hash & (uint32_t)mask]; + for (size_t i = 0; i < buckets; ++i, ++slot) { + if (slot >= &slots[buckets]) { slot = &slots[0]; } + if (slot->count == 0) { + key_store(slot->key, key); + slot->count = (uint32_t)(index + 1); + return i + 1; + } + if (key_cmpeq(slot->key, key)) { + slot->count = (uint32_t)(index + 1); 
// Probe-length statistics gathered while building the row-name hash table.
struct hash_build_probe_stats_t {
  size_t total_probes = 0;  // sum of probe counts over all recorded non-zero inserts
  size_t max_probes   = 0;  // largest single probe count recorded
  size_t long_names   = 0;  // inserts recorded with a probe count of 0

  // Overwrite the long-name counter with an already known total.
  void seed_long_names(size_t n) { long_names = n; }

  // Record one insert. A probe count of 0 is counted as a long name and does not
  // contribute to the probe totals.
  void record_insert(size_t probes)
  {
    if (probes == 0) {
      long_names += 1;
      return;
    }
    total_probes += probes;
    if (probes > max_probes) { max_probes = probes; }
  }

  // Fold another accumulator into this one: counters add, the maximum is kept.
  void merge(const hash_build_probe_stats_t& other)
  {
    long_names += other.long_names;
    total_probes += other.total_probes;
    if (other.max_probes > max_probes) { max_probes = other.max_probes; }
  }
};
0.0 : (double)n_rows / (double)buckets_; + std::fprintf(stderr, + "[ROW_HASH_PROBES] rows=%zu buckets=%zu load=%.3f long=%zu mean=%.3f max=%zu\n", + n_rows, + buckets_, + load_factor, + stats.long_names, + mean_probes, + stats.max_probes); +#endif + } + + void configure_serial_buckets(size_t n_rows, bool compact) + { + partition_count_ = 0; + buckets_ = hash_bucket_count_for(n_rows, compact); + mask_ = buckets_ - 1; + } + + void configure_partitioned_buckets( + const std::array& partition_counts, bool compact) + { + partition_count_ = MPS_ROW_HASH_PARTITIONS; + buckets_ = 0; + for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { + partitions_[p].buckets = hash_bucket_count_for(partition_counts[p], compact); + partitions_[p].mask = partitions_[p].buckets - 1; + buckets_ += partitions_[p].buckets; + } + mask_ = buckets_ - 1; + } + + void allocate_mmap(const char* label) + { + size_t mmap_size = buckets_ * sizeof(hash_slot_var_t); + region_ = mmap_region_t::anonymous(mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, label); + slots_ = (hash_slot_var_t*)region_.data(); + if (partition_count_ != 0) { + hash_slot_var_t* next_slots = slots_; + for (size_t p = 0; p < partition_count_; ++p) { + partitions_[p].slots = next_slots; + next_slots += partitions_[p].buckets; + } + } + region_.advise(MADV_HUGEPAGE); + } + + mmap_region_t& region() noexcept { return region_; } + const mmap_region_t& region() const noexcept { return region_; } + + hash_slot_var_t* slots() noexcept { return slots_; } + const hash_slot_var_t* slots() const noexcept { return slots_; } + + size_t buckets() const noexcept { return buckets_; } + size_t mask() const noexcept { return mask_; } + size_t partition_count() const noexcept { return partition_count_; } + + const hash_partition_t& partition(size_t p) const noexcept { return partitions_[p]; } + + size_t lookup(std::string_view name) const + { + if (name.size() > HASH_KEY_BYTES) { + auto it = long_names_.find(name); + return it != long_names_.end() ? 
it->second : std::numeric_limits::max(); + } + hash_key_t key = make_key(name.data(), name.size()); + uint32_t hash = fnv1a_hash(name.data(), name.size()); + if (partition_count_ != 0) { + const auto& part = partitions_[hash_partition_for(hash)]; + return hash_lookup_in(part.slots, part.buckets, part.mask, key, hash); + } + return hash_lookup_in(slots_, buckets_, mask_, key, hash); + } + + size_t insert_serial(std::string_view name, size_t index) + { + size_t probes; + if (name.size() > HASH_KEY_BYTES) { + note_long_name(name, index); + probes = 0; + } else { + probes = hash_insert_into( + slots_, buckets_, mask_, name, fnv1a_hash(name.data(), name.size()), index); + } +#ifdef MPS_FAST_PERF_COUNTERS + build_probe_stats_.record_insert(probes); +#endif + return probes; + } + + size_t insert_partition(size_t partition, std::string_view name, uint32_t hash, size_t index) + { + const auto& part = partitions_[partition]; + size_t probes = hash_insert_into(part.slots, part.buckets, part.mask, name, hash, index); +#ifdef MPS_FAST_PERF_COUNTERS + partition_probe_stats_[partition].record_insert(probes); +#endif + return probes; + } + + private: + mmap_region_t region_; + hash_slot_var_t* slots_ = nullptr; + size_t buckets_ = 0; + size_t mask_ = 0; + size_t partition_count_ = 0; + std::array partitions_{}; + std::unordered_map long_names_{}; +#ifdef MPS_FAST_PERF_COUNTERS + hash_build_probe_stats_t build_probe_stats_{}; + std::array partition_probe_stats_{}; +#endif +}; + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp new file mode 100644 index 0000000000..5e535ce7f2 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -0,0 +1,920 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. 
SPDX-License-Identifier: Apache-2.0 + +#include "file_reader.hpp" +#include "mps_section_scanner.hpp" +#include "nvtx_ranges.hpp" + +#include +#include + +#include + +#ifdef _OPENMP +#include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::io::detail { + +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_fail; + +namespace { + +constexpr uint32_t lz4_frame_magic = 0x184D2204u; +constexpr uint32_t lz4_uncompressed_block = 0x80000000u; +constexpr uint32_t lz4_block_size_mask = 0x7FFFFFFFu; +constexpr std::size_t lz4_pipeline_batch_bytes = 64ull * 1024ull * 1024ull; +constexpr std::size_t lz4_decode_batch_decompressed_bytes = 256ull * 1024ull * 1024ull; +constexpr std::size_t lz4_input_max_io_threads = 8; +constexpr std::size_t lz4_no_content_size_reserve_ratio = 128; + +using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int); + +std::size_t estimate_lz4_no_content_size(std::size_t compressed_size) +{ + constexpr std::size_t max_size = std::numeric_limits::max(); + if (compressed_size > max_size / lz4_no_content_size_reserve_ratio) { + return max_size - input_buffer_padding_bytes; + } + return compressed_size * lz4_no_content_size_reserve_ratio; +} + +#if defined(MPS_PARSER_WITH_LZ4) +struct lz4_runtime_t { + void* handle = nullptr; + LZ4_decompress_safe_t decompress_safe = nullptr; + + lz4_runtime_t() + { + for (const char* soname : {"liblz4.so.1", "liblz4.so"}) { + handle = ::dlopen(soname, RTLD_LAZY); + if (handle != nullptr) { break; } + } + if (handle == nullptr) { + mps_parser_fail(error_type_t::RuntimeError, + "Could not open .mps.lz4 file since liblz4 was not found " + "(tried liblz4.so.1, liblz4.so). 
Decompress the .lz4 file manually " + "or install liblz4."); + } + + decompress_safe = + reinterpret_cast(::dlsym(handle, "LZ4_decompress_safe")); + if (decompress_safe == nullptr) { + mps_parser_fail(error_type_t::RuntimeError, + "Error loading LZ4_decompress_safe from liblz4. Decompress the .lz4 file " + "manually or install a compatible liblz4."); + } + } + + ~lz4_runtime_t() + { + if (handle != nullptr) { ::dlclose(handle); } + } + + lz4_runtime_t(const lz4_runtime_t&) = delete; + lz4_runtime_t& operator=(const lz4_runtime_t&) = delete; +}; + +const lz4_runtime_t& lz4_runtime() +{ + static const lz4_runtime_t runtime; + return runtime; +} +#endif + +int lz4_decompress_safe_runtime([[maybe_unused]] const char* src, + [[maybe_unused]] char* dst, + [[maybe_unused]] int compressed_size, + [[maybe_unused]] int dst_capacity) +{ +#if defined(MPS_PARSER_WITH_LZ4) + return lz4_runtime().decompress_safe(src, dst, compressed_size, dst_capacity); +#else + mps_parser_fail( + error_type_t::RuntimeError, + "Experimental fast MPS parser was built without LZ4 decompression support. " + "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually."); +#endif +} + +void ensure_lz4_runtime_available() +{ +#if defined(MPS_PARSER_WITH_LZ4) + [[maybe_unused]] auto& runtime = lz4_runtime(); +#else + mps_parser_fail( + error_type_t::RuntimeError, + "Experimental fast MPS parser was built without LZ4 decompression support. 
// Decode a little-endian 32-bit unsigned integer from the 4 bytes at `ptr`.
// The bytes are read as unsigned so values >= 0x80 are not sign-extended, and the
// result does not depend on host endianness or on the alignment of `ptr`.
uint32_t read_le32(const char* ptr)
{
  const auto* p = reinterpret_cast<const unsigned char*>(ptr);
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

// Decode a little-endian 64-bit unsigned integer from the 8 bytes at `ptr`:
// the low word comes from bytes 0-3 and the high word from bytes 4-7.
uint64_t read_le64(const char* ptr)
{
  const uint64_t low  = read_le32(ptr);
  const uint64_t high = read_le32(ptr + 4);
  return (high << 32) | low;
}
+ const char* ptr_if_contiguous(std::size_t offset, std::size_t size) const + { + if (size == 0) return nullptr; + const auto& w = window_for_offset(offset); + std::size_t local = offset - w.file_offset; + if (local <= w.size && size <= w.size - local) { return w.data.get() + local; } + return nullptr; + } + + void copy_to(std::size_t offset, char* dst, std::size_t size) const + { + std::size_t copied = 0; + while (copied < size) { + const auto& w = window_for_offset(offset + copied); + std::size_t local = offset + copied - w.file_offset; + std::size_t take = std::min(w.size - local, size - copied); + std::memcpy(dst + copied, w.data.get() + local, take); + copied += take; + } + } + + uint8_t read_u8(std::size_t offset) const + { + uint8_t value = 0; + copy_to(offset, reinterpret_cast(&value), sizeof(value)); + return value; + } + + uint32_t read_u32(std::size_t offset) const + { + char bytes[4]; + copy_to(offset, bytes, sizeof(bytes)); + return read_le32(bytes); + } + + uint64_t read_u64(std::size_t offset) const + { + char bytes[8]; + copy_to(offset, bytes, sizeof(bytes)); + return read_le64(bytes); + } + + private: + const lz4_resident_window_t& window_for_offset(std::size_t offset) const + { + if (windows_.empty()) { + mps_parser_fail(error_type_t::RuntimeError, "LZ4 resident window lookup with no windows"); + } + std::size_t window_stride = windows_.size() > 1 ? windows_[1].file_offset : windows_[0].size; + std::size_t idx = offset / window_stride; + if (idx >= windows_.size()) { + mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows"); + } + const auto& w = windows_[idx]; + if (offset >= w.file_offset + w.size) { + mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows"); + } + return w; + } + + std::vector& windows_; +}; + +// Parsed fields of the leading LZ4 frame descriptor (RFC: magic, FLG, BD, and +// optional content size / dictionary id / header checksum). 
+struct lz4_frame_header_t { + std::size_t block_max_size = 0; + std::size_t content_size = 0; + std::size_t header_size = 0; + bool content_size_present = false; + bool block_checksum = false; + bool content_checksum = false; + bool dict_id = false; +}; + +lz4_frame_header_t parse_lz4_frame_header(int fd, + const std::string& path, + std::size_t compressed_size) +{ + if (compressed_size < 7) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 input is too small to contain a frame header"); + } + char header[32]; + std::size_t header_bytes = std::min(sizeof(header), compressed_size); + if (!pread_full(fd, header, header_bytes, 0)) { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to read LZ4 frame header '%s': %s", + path.c_str(), + std::strerror(errno)); + } + + std::size_t offset = 0; + uint32_t magic = read_le32(header + offset); + if (magic != lz4_frame_magic) { + mps_parser_fail(error_type_t::ValidationError, + "unsupported LZ4 input: expected standard LZ4 frame magic"); + } + offset += 4; + unsigned char flg = (unsigned char)header[offset++]; + unsigned char bd = (unsigned char)header[offset++]; + unsigned version = (flg >> 6) & 0x3u; + if (version != 1) { + mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame version"); + } + bool block_independent = (flg & 0x20u) != 0; + if (!block_independent) { + mps_parser_fail(error_type_t::ValidationError, + "parallel LZ4 reader requires independent blocks; compress with -BI"); + } + + lz4_frame_header_t info; + info.block_checksum = (flg & 0x10u) != 0; + info.content_size_present = (flg & 0x08u) != 0; + info.content_checksum = (flg & 0x04u) != 0; + info.dict_id = (flg & 0x01u) != 0; + info.block_max_size = block_max_size_from_bd(bd); + if (info.content_size_present) { + if (offset + 8 > header_bytes) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading content size"); + } + info.content_size = (std::size_t)read_le64(header + offset); + offset += 8; + } 
+ if (info.dict_id) { + if (offset + 4 > header_bytes) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading dictionary id"); + } + offset += 4; + } + if (offset + 1 > header_bytes) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading header checksum"); + } + offset += 1; + info.header_size = offset; + return info; +} + +} // namespace + +lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path) +{ + MPS_NVTX_RANGE("lz4_input_constructor", nvtx::colors::io); + + ensure_lz4_runtime_available(); + + int fd = open_lz4_fd(path); + cuopt::scope_guard close_fd([&] { + if (fd >= 0) { ::close(fd); } + }); + ::posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); + + compressed_size_ = get_file_size(fd, path); + + lz4_frame_header_t header = parse_lz4_frame_header(fd, path, compressed_size_); + block_max_size_ = header.block_max_size; + content_size_ = header.content_size; + header_size_ = header.header_size; + content_size_present_ = header.content_size_present; + block_checksum_ = header.block_checksum; + content_checksum_ = header.content_checksum; + dict_id_ = header.dict_id; + + std::size_t reserve_size = content_size_; + if (!content_size_present_) { + reserve_size = estimate_lz4_no_content_size(compressed_size_); + reserve_size = std::max(reserve_size, block_max_size_); + } + reserve_size += input_buffer_padding_bytes; + + constexpr std::size_t huge_alignment = 2 * 1024 * 1024; // 2MiB + output_mapped_size_ = cuda::round_up(reserve_size, system_page_size()); + output_region_ = mmap_region_t::anonymous_aligned(output_mapped_size_, + huge_alignment, + PROT_NONE, + MAP_PRIVATE | MAP_NORESERVE, + "LZ4 output buffer"); + output_data_ = output_region_.char_data(); + + block_slot_count_ = std::max(1, cuda::ceil_div(reserve_size, block_max_size_) + 1); + + section_scanner_ = + std::make_unique(output_data_, block_slot_count_, registry_); + + fd_ = fd; + fd = -1; +} + 
+lz4_input_stream_t::~lz4_input_stream_t() +{ + if (fd_ >= 0) { ::close(fd_); } +} + +const char* lz4_input_stream_t::data() const noexcept { return output_data_; } +char* lz4_input_stream_t::mutable_data() noexcept { return output_data_; } +std::size_t lz4_input_stream_t::size() const noexcept { return output_view_size_; } +std::size_t lz4_input_stream_t::compressed_size() const noexcept { return compressed_size_; } +std::size_t lz4_input_stream_t::reserve_size_hint() const noexcept +{ + return content_size_present_ + ? content_size_ + : std::max(estimate_lz4_no_content_size(compressed_size_), 1024 * 1024); +} + +void lz4_input_stream_t::commit_up_to(std::size_t bytes) +{ + MPS_NVTX_RANGE("lz4_commit_output", nvtx::colors::alloc); + std::lock_guard lock(commit_mutex_); + if (bytes <= output_committed_size_) return; + if (bytes > output_mapped_size_) { + mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 output exceeded reserved virtual mapping"); + } + std::size_t new_committed = cuda::round_up(bytes, system_page_size()); + if (new_committed > output_mapped_size_) new_committed = output_mapped_size_; + std::size_t add = new_committed - output_committed_size_; + void* target = output_data_ + output_committed_size_; + mmap_region_t::map_fixed_or_throw( + target, add, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, "LZ4 output commit"); + ::madvise(target, add, MADV_HUGEPAGE); + output_committed_size_ = new_committed; +} + +struct resident_block_desc_t { + const char* src = nullptr; + std::size_t compressed_size = 0; + std::size_t decompressed_offset = 0; + std::size_t decompressed_size = 0; + std::size_t index = 0; + std::size_t window_index = std::numeric_limits::max(); + bool uncompressed = false; +}; + +struct window_state_t { + std::atomic decode_refs{0}; + std::atomic released{0}; +}; + +// Two distinct units flow through this pipeline: +// * window - a fixed-size span of the compressed file read by the I/O stage. 
+// * block - a single independent LZ4 data block (decompressed unit) that the +// metadata scanner discovers inside the resident windows. +// Windows feed blocks; the decoded blocks are handed to the section scanner, +// which owns the contiguous decoded-byte frontier and section publication. +// +// Locking (the grouped members below repeat each guard in context): +// * window_mutex - guards window_done[] (reader -> scanner readiness) +// * desc_mutex - guards desc_queue + scanner_done (scanner -> decoders) +// * window_release_mutex - serializes freeing a window buffer + RSS accounting +// * window_state_[].decode_refs/.released, scanned_through_, blocks_scanned, +// compressed_resident_bytes - lock-free atomics +// Locks are never nested. The scanner thread is the sole writer of the frame walk, +// so offset / decompressed_offset are mutated without locking. +struct lz4_pipeline_t { + explicit lz4_pipeline_t(lz4_input_stream_t& input_) + : input(input_), + window_count(cuda::ceil_div(input.compressed_size_, window_bytes)), + windows(window_count), + window_state_(std::make_unique(window_count)), + io_threads(std::min(lz4_input_max_io_threads, window_count)), + window_done(window_count, 0) + { + for (std::size_t i = 0; i < window_count; ++i) { + std::size_t offset = i * window_bytes; + std::size_t size = std::min(window_bytes, input.compressed_size_ - offset); + windows[i].index = i; + windows[i].file_offset = offset; + windows[i].size = size; + } + } + + // Runs the three-stage pipeline to completion: + // + // readers --window_done/window_cv--> scanner --desc_queue/desc_cv--> decoders + // + // * readers (io_threads): pread fixed compressed windows into RAM, mark ready. + // * scanner (1 thread) : walk the LZ4 frame in order, slice it into block + // descriptors, push them to decoders in batches. 
+ // * decoders (io_threads): decompress blocks into the output buffer and hand + // each to the section scanner, which advances the + // decoded-byte frontier and publishes section ranges. + // + // Consumers are spawned first so they are parked waiting before the readers (which + // run on this thread) start producing. scoped_thread_group joins the background + // threads on scope exit; any stage's failure is captured in `latch` and rethrown here. + void run() + { + std::exception_ptr startup_error; + { + scoped_thread_group background; + try { + background.reserve(io_threads + 1); + background.emplace([this] { run_scanner_stage(); }); + for (std::size_t t = 0; t < io_threads; ++t) { + background.emplace([this, t] { run_decoder_stage(t); }); + } + run_readers(); // produce on the calling thread, now that consumers are parked + } catch (...) { + startup_error = std::current_exception(); + fail_and_notify(startup_error); + } + } + if (startup_error) { std::rethrow_exception(startup_error); } + latch.rethrow_if_error(); + } + + void finalize() + { + input.output_view_size_ = input.section_scanner_->ready_bytes(); + input.commit_up_to(input.output_view_size_ + input_buffer_padding_bytes); + input.section_scanner_->publish_ready(input.output_view_size_); + } + + void fail_and_notify(std::exception_ptr eptr) + { + latch.capture(eptr); + window_cv.notify_all(); + desc_cv.notify_all(); + } + + void add_compressed_resident(std::size_t bytes) + { + compressed_resident_bytes.fetch_add(bytes, std::memory_order_relaxed); + } + + void try_release_window(std::size_t index) + { + if (index >= window_count) { return; } + if (index >= scanned_through_.load(std::memory_order_acquire)) { return; } + window_state_t& state = window_state_[index]; + if (state.decode_refs.load(std::memory_order_acquire) != 0) { return; } + uint8_t expected = 0; + if (!state.released.compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) { return; } + std::lock_guard lock(window_release_mutex); 
+ if (windows[index].data) { + windows[index].data.reset(); + compressed_resident_bytes.fetch_sub(windows[index].size, std::memory_order_relaxed); + } + } + + void mark_windows_scanned_before(std::size_t offset) + { + assert(offset >= last_mark_offset_); + last_mark_offset_ = offset; + std::size_t new_scanned_through = std::min(window_count, offset / window_bytes); + std::size_t prev = scanned_through_.load(std::memory_order_relaxed); + if (new_scanned_through <= prev) { return; } + scanned_through_.store(new_scanned_through, std::memory_order_release); + for (std::size_t wi = prev; wi < new_scanned_through; ++wi) { + try_release_window(wi); + } + } + + void run_readers() + { + parallel_for_indexed( + window_count, io_threads, latch, "lz4-window-read-", [this](std::size_t index) { + read_window(index); + }); + } + + void read_window(std::size_t index) + { + try { + auto& w = windows[index]; + w.data.reset(new char[w.size]); + add_compressed_resident(w.size); + bool ok = false; + { + MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io); + ok = pread_full(input.fd_, w.data.get(), w.size, w.file_offset); + } + if (!ok) { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to pread LZ4 resident window: %s", + std::strerror(errno)); + } + { + MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic); + std::lock_guard lock(window_mutex); + window_done[index] = 1; + } + window_cv.notify_all(); + } catch (...) { + fail_and_notify(std::current_exception()); + } + } + + void run_decoder_stage(std::size_t tid) + { + try { + std::string thread_name = "lz4-window-decode-" + std::to_string(tid); + nvtx::name_current_thread(thread_name.c_str()); + while (true) { + std::vector batch = wait_for_decode_batch(); + if (batch.empty()) { return; } + decode_batch(batch); + } + } catch (...) 
{ + fail_and_notify(std::current_exception()); + } + } + + std::vector wait_for_decode_batch() + { + MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io); + std::unique_lock lock(desc_mutex); + desc_cv.wait(lock, [&] { return latch.stopped() || scanner_done || !desc_queue.empty(); }); + if (latch.stopped() || desc_queue.empty()) { return {}; } + std::vector batch = std::move(desc_queue.front()); + desc_queue.pop_front(); + return batch; + } + + void decode_batch(const std::vector& batch) + { + MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode); + for (const auto& block : batch) { + decode_block(block); + } + } + + void decode_block(const resident_block_desc_t& block) + { + char* dst = input.output_data_ + block.decompressed_offset; + int actual = 0; + { + MPS_NVTX_RANGE("lz4_decode_block_payload", nvtx::colors::decode); + if (block.uncompressed) { + std::memcpy(dst, block.src, block.decompressed_size); + actual = (int)block.decompressed_size; + } else if (block.compressed_size > (std::size_t)std::numeric_limits::max() || + block.decompressed_size > (std::size_t)std::numeric_limits::max()) { + actual = -1; + } else { + actual = lz4_decompress_safe_runtime( + block.src, dst, (int)block.compressed_size, (int)block.decompressed_size); + } + } + if (actual < 0 || (std::size_t)actual > block.decompressed_size) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 input block decompressed to invalid size"); + } + release_block_window_ref(block); + publish_decoded_block(block, dst, (std::size_t)actual); + } + + void release_block_window_ref(const resident_block_desc_t& block) + { + if (block.window_index == std::numeric_limits::max()) { return; } + uint32_t old = + window_state_[block.window_index].decode_refs.fetch_sub(1, std::memory_order_acq_rel); + assert(old > 0); + if (old == 1) { try_release_window(block.window_index); } + } + + void publish_decoded_block(const resident_block_desc_t& block, char* dst, std::size_t actual_size) + { + 
MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic); + // The scanner advances the contiguous decoded-byte frontier and publishes + // section ranges as blocks complete, regardless of decode order. + input.section_scanner_->observe_block(block.index, dst, dst + actual_size); + } + + void wait_range_ready(std::size_t begin, std::size_t size) + { + if (size == 0) return; + if (begin > input.compressed_size_ || size > input.compressed_size_ - begin) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading resident window"); + } + std::size_t first = begin / window_bytes; + std::size_t last = (begin + size - 1) / window_bytes; + if (last >= window_done.size()) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading resident window"); + } + for (std::size_t wi = first; wi <= last; ++wi) { + MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io); + std::unique_lock lock(window_mutex); + window_cv.wait(lock, [&] { return latch.stopped() || window_done[wi] != 0; }); + if (latch.stopped() && window_done[wi] == 0) { + mps_parser_fail(error_type_t::RuntimeError, + "LZ4 metadata scanner stopped before required window was ready"); + } + } + } + + void push_batch(std::vector& batch) + { + if (batch.empty()) return; + { + MPS_NVTX_RANGE("lz4_metadata_commit_batch", nvtx::colors::alloc); + input.commit_up_to(batch.back().decompressed_offset + batch.back().decompressed_size); + } + { + MPS_NVTX_RANGE("lz4_metadata_enqueue_batch", nvtx::colors::generic); + std::lock_guard lock(desc_mutex); + desc_queue.push_back(std::move(batch)); + } + batch.clear(); + desc_cv.notify_one(); + } + + void run_scanner_stage() + { + try { + nvtx::name_current_thread("lz4-metadata-scan"); + scan_lz4_metadata(); + { + std::lock_guard lock(desc_mutex); + scanner_done = true; + } + desc_cv.notify_all(); + } catch (...) 
{ + { + std::lock_guard lock(desc_mutex); + scanner_done = true; + } + fail_and_notify(std::current_exception()); + } + } + + void scan_lz4_metadata() + { + lz4_resident_windows_t resident(windows); + std::vector batch; + batch.reserve(lz4_decode_batch_decompressed_bytes / input.block_max_size_ + 1); + std::size_t batch_decoded_bytes = 0; + std::size_t offset = input.header_size_; + std::size_t decompressed_offset = 0; + blocks_scanned.store(0, std::memory_order_relaxed); + + while (true) { + MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic); + wait_range_ready(offset, 4); + if (offset + 4 > input.compressed_size_) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading block header"); + } + uint32_t raw_block_size = resident.read_u32(offset); + offset += 4; + if (raw_block_size == 0) { break; } + + resident_block_desc_t block = + scan_one_block(resident, raw_block_size, offset, decompressed_offset); + batch_decoded_bytes += block.decompressed_size; + batch.push_back(block); + blocks_scanned.fetch_add(1, std::memory_order_relaxed); + if (blocks_scanned.load(std::memory_order_relaxed) > input.block_slot_count_) { + mps_parser_fail(error_type_t::OutOfMemoryError, + "LZ4 input block count exceeded reserved metadata slots"); + } + if (batch_decoded_bytes >= lz4_decode_batch_decompressed_bytes) { + push_batch(batch); + batch_decoded_bytes = 0; + } + } + + scan_frame_footer(offset, decompressed_offset); + push_batch(batch); + mark_windows_scanned_before(input.compressed_size_); + } + + resident_block_desc_t scan_one_block(lz4_resident_windows_t& resident, + uint32_t raw_block_size, + std::size_t& offset, + std::size_t& decompressed_offset) + { + // --- Decode the block-size word and validate it --------------------------- + bool uncompressed = (raw_block_size & lz4_uncompressed_block) != 0; + std::size_t block_payload_size = raw_block_size & lz4_block_size_mask; + if (block_payload_size == 0) { + 
mps_parser_fail(error_type_t::ValidationError, "invalid zero-sized LZ4 data block"); + } + if (block_payload_size > input.block_max_size_ && uncompressed) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 uncompressed block exceeds frame block maximum"); + } + if (input.content_size_present_ && decompressed_offset >= input.content_size_) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 frame contains more blocks than content size allows"); + } + + // --- Wait until the payload bytes are resident ---------------------------- + wait_range_ready(offset, block_payload_size); + if (offset + block_payload_size > input.compressed_size_) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading block payload"); + } + + // --- Determine the decompressed size -------------------------------------- + // Compressed blocks expand to block_max_size_ (or the content-size remainder + // for the final block); uncompressed blocks keep their payload size. + std::size_t decompressed_size = block_payload_size; + if (!uncompressed) { + decompressed_size = + input.content_size_present_ + ? std::min(input.block_max_size_, input.content_size_ - decompressed_offset) + : input.block_max_size_; + } + if (input.content_size_present_ && + decompressed_size > input.content_size_ - decompressed_offset) { + mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size"); + } + + // --- Stage the payload for the decoder ------------------------------------ + // Fast path: the whole payload lives in one window, so point the decoder + // straight at it (zero copy) and pin that window with a decode_refs bump until + // the decode completes. Otherwise it straddles a window boundary: copy it out + // into crossing_payloads, which stays alive for the whole run, so no window pin + // is needed (and the source window can be released as soon as it is scanned). 
+ const char* src = resident.ptr_if_contiguous(offset, block_payload_size); + std::size_t window_index = std::numeric_limits::max(); + if (src == nullptr) { + crossing_payloads.emplace_back(block_payload_size); + resident.copy_to(offset, crossing_payloads.back().data(), block_payload_size); + src = crossing_payloads.back().data(); + } else { + window_index = offset / window_bytes; + window_state_[window_index].decode_refs.fetch_add(1, std::memory_order_acq_rel); + } + + // --- Record the descriptor and advance past the block (+ optional checksum) - + resident_block_desc_t block{src, + block_payload_size, + decompressed_offset, + decompressed_size, + blocks_scanned.load(std::memory_order_relaxed), + window_index, + uncompressed}; + decompressed_offset += decompressed_size; + offset += block_payload_size; + mark_windows_scanned_before(offset); + if (input.block_checksum_) { + wait_range_ready(offset, 4); + if (offset + 4 > input.compressed_size_) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading block checksum"); + } + offset += 4; + mark_windows_scanned_before(offset); + } + return block; + } + + void scan_frame_footer(std::size_t& offset, std::size_t decompressed_offset) + { + if (input.content_checksum_) { + wait_range_ready(offset, 4); + if (offset + 4 > input.compressed_size_) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading content checksum"); + } + offset += 4; + mark_windows_scanned_before(offset); + } + if (input.content_size_present_ && decompressed_offset != input.content_size_) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 frame ended before declared content size was reached"); + } + if (offset != input.compressed_size_) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 input contains trailing data after the first frame"); + } + } + + // ---- Input + chunking (immutable after construction) ------------------------ + // The compressed file is split into 
fixed-size `windows`; `io_threads` reader + // threads pull them by index. + lz4_input_stream_t& input; + const std::size_t window_bytes = lz4_pipeline_batch_bytes; + const std::size_t window_count; + std::vector windows; + const std::size_t io_threads; + + // First-error-wins latch shared by all three stages: stops the pipeline and + // retains the first exception for run() to rethrow after the threads join. + parallel_error_latch_t latch; + + // ---- Reader -> scanner readiness (guarded by window_mutex) ----------------- + // A reader sets window_done[i]=1 once window i is resident; the scanner blocks + // on window_cv until every window covering the bytes it needs is ready. + std::vector window_done; + std::mutex window_mutex; + std::condition_variable window_cv; + + // ---- Window lifecycle / early release --------------------------------------- + // windows[i].data is freed exactly once, when the metadata scan has passed window i + // (scanned_through_ > i) AND no decoder still pins it (window_state_[i].decode_refs == 0). + // scanned_through_ advances monotonically in mark_windows_scanned_before (last_mark_offset_ + // asserts that monotonicity); decode_refs bumps in scan_one_block and drops in + // release_block_window_ref; the per-window `released` CAS makes the free exactly-once. + // window_release_mutex serializes the data.reset() + compressed_resident_bytes accounting. + std::unique_ptr window_state_; + std::atomic_size_t scanned_through_{0}; + std::size_t last_mark_offset_{0}; + std::mutex window_release_mutex; + std::atomic_size_t compressed_resident_bytes{0}; + + // ---- Scanner -> decoder queue (guarded by desc_mutex) ---------------------- + // The scanner pushes batches of block descriptors; decoders pop them via desc_cv. + // scanner_done signals the scanner has emitted its final batch. 
+ std::deque> desc_queue; + bool scanner_done = false; + std::mutex desc_mutex; + std::condition_variable desc_cv; + + // ---- Scanner scratch / progress --------------------------------------------- + // blocks_scanned doubles as the running block index; crossing_payloads holds staged + // copies of blocks that straddle a window boundary (see scan_one_block). + std::atomic_size_t blocks_scanned{0}; + std::vector> crossing_payloads; +}; + +void lz4_input_stream_t::run_decode_tasks() +{ + MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io); + lz4_pipeline_t pipeline(*this); + pipeline.run(); + pipeline.finalize(); +} + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp new file mode 100644 index 0000000000..9d5469e860 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuopt::linear_programming::io::detail { + +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_fail; + +// Move-only owner for a Linux mmap range. Fixed sub-maps inside a reserved range +// are still released by unmapping the owning outer range. 
+class mmap_region_t { + public: + mmap_region_t() = default; + mmap_region_t(void* ptr, std::size_t size) noexcept : ptr_(ptr), size_(size) {} + + mmap_region_t(const mmap_region_t&) = delete; + mmap_region_t& operator=(const mmap_region_t&) = delete; + + mmap_region_t(mmap_region_t&& other) noexcept + : ptr_(other.ptr_), + size_(other.size_), + unmap_ptr_(other.unmap_ptr_), + unmap_size_(other.unmap_size_) + { + other.ptr_ = nullptr; + other.size_ = 0; + other.unmap_ptr_ = nullptr; + other.unmap_size_ = 0; + } + + mmap_region_t& operator=(mmap_region_t&& other) noexcept + { + if (this != &other) { + reset(); + ptr_ = other.ptr_; + size_ = other.size_; + unmap_ptr_ = other.unmap_ptr_; + unmap_size_ = other.unmap_size_; + other.ptr_ = nullptr; + other.size_ = 0; + other.unmap_ptr_ = nullptr; + other.unmap_size_ = 0; + } + return *this; + } + + ~mmap_region_t() { reset(); } + + private: + static mmap_region_t map( + void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context) + { + void* ptr = ::mmap(address, size, prot, flags, fd, offset); + if (ptr == MAP_FAILED) { + mps_parser_fail( + error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno)); + } + return mmap_region_t(ptr, size); + } + + public: + static mmap_region_t anonymous(std::size_t size, int prot, int flags, const char* context) + { + return map(nullptr, size, prot, flags | MAP_ANONYMOUS, -1, 0, context); + } + + static mmap_region_t anonymous_aligned( + std::size_t size, std::size_t alignment, int prot, int flags, const char* context) + { + if (!cuda::is_power_of_two(alignment)) { + mps_parser_fail(error_type_t::RuntimeError, + "mmap aligned allocation requires power-of-two alignment"); + } + if (size > std::numeric_limits::max() - alignment) { + mps_parser_fail(error_type_t::OutOfMemoryError, "mmap aligned allocation size overflow"); + } + + std::size_t raw_size = size + alignment; + void* raw = ::mmap(nullptr, raw_size, prot, flags | 
MAP_ANONYMOUS, -1, 0); + if (raw == MAP_FAILED) { + mps_parser_fail( + error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno)); + } + + uintptr_t raw_addr = reinterpret_cast(raw); + uintptr_t aligned_addr = (raw_addr + alignment - 1) & ~(uintptr_t)(alignment - 1); + return mmap_region_t(reinterpret_cast(aligned_addr), size, raw, raw_size); + } + + static void map_fixed_or_throw( + void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context) + { + void* ptr = ::mmap(address, size, prot, flags | MAP_FIXED, fd, offset); + if (ptr == MAP_FAILED) { + mps_parser_fail( + error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno)); + } + } + + void reset() noexcept + { + void* base = unmap_ptr_ != nullptr ? unmap_ptr_ : ptr_; + std::size_t len = unmap_ptr_ != nullptr ? unmap_size_ : size_; + if (base != nullptr && len != 0) { ::munmap(base, len); } + ptr_ = nullptr; + size_ = 0; + unmap_ptr_ = nullptr; + unmap_size_ = 0; + } + + void advise(int advice) const noexcept + { + if (ptr_ != nullptr && size_ != 0) { ::madvise(ptr_, size_, advice); } + } + + void* data() noexcept { return ptr_; } + char* char_data() noexcept { return (char*)ptr_; } + std::size_t size() const noexcept { return size_; } + + private: + mmap_region_t(void* ptr, std::size_t size, void* unmap_ptr, std::size_t unmap_size) noexcept + : ptr_(ptr), size_(size), unmap_ptr_(unmap_ptr), unmap_size_(unmap_size) + { + } + + void* ptr_ = nullptr; + std::size_t size_ = 0; + void* unmap_ptr_ = nullptr; + std::size_t unmap_size_ = 0; +}; + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp new file mode 100644 index 0000000000..3924e2dcd5 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp @@ -0,0 +1,478 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION 
& AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#include "mps_section_scanner.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cuopt::linear_programming::io::detail { + +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_fail; + +namespace { + +struct section_record_t { + mps_section_kind kind; + const char* name; + std::size_t len; +}; + +constexpr section_record_t section_records[] = { + {mps_section_kind::rows, "ROWS", 4}, + {mps_section_kind::columns, "COLUMNS", 7}, + {mps_section_kind::rhs, "RHS", 3}, + {mps_section_kind::bounds, "BOUNDS", 6}, + {mps_section_kind::ranges, "RANGES", 6}, + {mps_section_kind::quadobj, "QUADOBJ", 7}, + {mps_section_kind::qmatrix, "QMATRIX", 7}, + {mps_section_kind::qcmatrix, "QCMATRIX", 8}, + {mps_section_kind::endata, "ENDATA", 6}, +}; + +constexpr const char* header_records[] = {"NAME", "OBJSENSE", "OBJNAME"}; + +constexpr std::size_t kSimdWidth = sizeof(simde__m256i); +static_assert(kSimdWidth == 32); +static_assert((std::size_t)mps_section_kind::rows == 0); +static_assert((std::size_t)mps_section_kind::endata + 1 == std::size(section_records)); +static_assert((std::size_t)mps_phase_kind::header == 0); +static_assert((std::size_t)mps_phase_kind::quadratic + 1 == 7); + +// Scalar column-1 test: true when the byte is greater than ' ' as an unsigned value, so bytes +// >= 0x80 count as nonblank. +bool is_nonblank_column1(unsigned char c) noexcept { return c > ' '; } + +// Vector form of is_nonblank_column1: returns 0xFF in every lane whose byte is > ' ' unsigned. +// cmpgt_epi8 compares signed lanes, so on its own it reports bytes >= 0x80 (negative as int8) +// as blank; those lanes are OR-ed back in so the SIMD scan and the scalar first-line/tail +// paths classify every byte value the same way. +simde__m256i nonblank_column1_mask(simde__m256i bytes) +{ + const simde__m256i above_space = simde_mm256_cmpgt_epi8(bytes, simde_mm256_set1_epi8(' ')); + const simde__m256i high_bit = simde_mm256_cmpgt_epi8(simde_mm256_setzero_si256(), bytes); + return simde_mm256_or_si256(above_space, high_bit); +} + +enum class section_record_match_t { invalid, header, section }; + +bool line_has_record_prefix(const char* line_start, const char* line_end, const char* name) +{ + std::size_t len = std::strlen(name); + if ((std::size_t)(line_end - line_start) < len || std::memcmp(line_start, name, len) != 0) { + return false; + } + const char* after = line_start + len; 
+ return after == line_end || *after <= ' '; +} + +} // namespace + +std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase) { return (std::size_t)phase; } + +void mps_phase_registry_t::publish(mps_phase_kind phase, mps_phase_range_t range) +{ + std::size_t idx = phase_index(phase); + omp_event_handle_t event{}; + bool fulfill = false; + { + std::lock_guard lock(mutex_); + if (ready_[idx].load(std::memory_order_acquire)) { return; } + ranges_[idx] = range; + ready_[idx].store(true, std::memory_order_release); + if (has_event_[idx] && !event_fulfilled_[idx]) { + event = events_[idx]; + event_fulfilled_[idx] = true; + fulfill = true; + } + } + if (fulfill) { omp_fulfill_event(event); } +} + +void mps_phase_registry_t::attach_event(mps_phase_kind phase, omp_event_handle_t event) +{ + std::size_t idx = phase_index(phase); + bool fulfill = false; + { + std::lock_guard lock(mutex_); + events_[idx] = event; + has_event_[idx] = true; + if (ready_[idx].load(std::memory_order_acquire) && !event_fulfilled_[idx]) { + event_fulfilled_[idx] = true; + fulfill = true; + } + } + if (fulfill) { omp_fulfill_event(event); } +} + +bool mps_phase_registry_t::ready(mps_phase_kind phase) const +{ + return ready_[phase_index(phase)].load(std::memory_order_acquire); +} + +mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const +{ + std::size_t idx = phase_index(phase); + bool is_ready = ready_[idx].load(std::memory_order_acquire); + assert(is_ready); + return ranges_[idx]; +} + +void mps_phase_registry_t::publish_endata(const char* begin, bool present) +{ + std::lock_guard lock(mutex_); + if (endata_ready_.load(std::memory_order_acquire)) { return; } + endata_begin_ = begin; + endata_present_ = present; + endata_ready_.store(true, std::memory_order_release); +} + +bool mps_phase_registry_t::endata_ready() const +{ + return endata_ready_.load(std::memory_order_acquire); +} + +const char* mps_phase_registry_t::endata_begin() const +{ + assert(endata_ready()); + 
return endata_begin_; +} + +bool mps_phase_registry_t::endata_present() const +{ + assert(endata_ready()); + return endata_present_; +} + +static section_record_match_t is_section_record(const char* line_start, + const char* line_end, + mps_section_kind* kind) +{ + if (line_start >= line_end) { return section_record_match_t::invalid; } + + for (const char* name : header_records) { + if (line_has_record_prefix(line_start, line_end, name)) { + return section_record_match_t::header; + } + } + + for (const section_record_t& record : section_records) { + if ((std::size_t)(line_end - line_start) < record.len || + std::memcmp(line_start, record.name, record.len) != 0) { + continue; + } + const char* after = line_start + record.len; + while (after < line_end && (*after == ' ' || *after == '\t' || *after == '\r')) { + ++after; + } + // QCMATRIX records are of the form "QCMATRIX " + if (record.kind == mps_section_kind::qcmatrix) { + if (after == line_end) { return section_record_match_t::invalid; } + *kind = record.kind; + return section_record_match_t::section; + } + if (after != line_end) { return section_record_match_t::invalid; } + *kind = record.kind; + return section_record_match_t::section; + } + return section_record_match_t::invalid; +} + +mps_section_block_scanner_t::mps_section_block_scanner_t(const char* data, + std::size_t block_count, + mps_phase_registry_t& registry) + : data_(data), + block_count_(block_count), + registry_(registry), + block_decoded_(std::make_unique[]>(block_count)), + block_begin_offsets_(std::make_unique(block_count)), + block_end_offsets_(std::make_unique(block_count)) +{ + for (std::size_t i = 0; i < block_count_; ++i) { + block_decoded_[i].store(0, std::memory_order_relaxed); + block_begin_offsets_[i].store(0, std::memory_order_relaxed); + block_end_offsets_[i].store(0, std::memory_order_relaxed); + } +} + +std::size_t mps_section_block_scanner_t::section_hit_index(mps_section_kind kind) +{ + return (std::size_t)kind; +} + +void 
mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, const char* ptr) +{ + std::atomic& slot = section_hits_[section_hit_index(kind)]; + const char* expected = nullptr; + if (slot.compare_exchange_strong( + expected, ptr, std::memory_order_release, std::memory_order_acquire)) { + notify_ready_phases(); + } +} + +void mps_section_block_scanner_t::scan_section_range(const char* begin, const char* end) +{ + if (begin >= end) return; + const char* p = begin; + + // Interior scans that start inside a decoded block skip the leading partial + // line. A separate boundary scan covers section titles whose newline/title + // bytes straddle adjacent LZ4 blocks. + if (p != data_) { + const void* nl = __builtin_memchr(p, '\n', (std::size_t)(end - p)); + if (nl == nullptr) { return; } + p = (const char*)nl + 1; + } + + auto try_candidate = [&](const char* line_start) { + const void* nl = __builtin_memchr(line_start, '\n', (std::size_t)(end - line_start)); + const char* line_end = nullptr; + if (nl == nullptr) { + const char* ready_ptr = data_ + ready_bytes_.load(std::memory_order_acquire); + if (end != ready_ptr) { return; } + line_end = end; + } else { + line_end = (const char*)nl; + } + if (*line_start == '*' || *line_start == '$') { return; } + mps_section_kind kind; + section_record_match_t match = is_section_record(line_start, line_end, &kind); + if (match == section_record_match_t::section) { + record_section_hit(kind, line_start); + return; + } + if (match == section_record_match_t::invalid) { + mps_parser_fail(error_type_t::ValidationError, + "unknown section record: %.*s", + (int)(line_end - line_start), + line_start); + } + }; + + // Handle the very first line of a file (NAME indicator, usually) + if (p == data_) { + if (p < end && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); } + ++p; + } + + // In compliant MPS, indicator records begin in column 1 while data records + // begin in column 2+. 
use "\n[nonblank]" as a needle for the SIMD scan + const simde__m256i newline = simde_mm256_set1_epi8('\n'); + while ((std::size_t)(end - p) >= kSimdWidth) { + // The first-line path above increments p when p == data_, so p - 1 is + // in-bounds here. Loading the previous vector lets us test "\nX" for all + // 32 candidate column-1 bytes with one AVX2 mask. + // loadu is comparable to aligned reads on modern SSE/AVX. + // might warrant some checks on ARM though + simde__m256i current = simde_mm256_loadu_si256(reinterpret_cast(p)); + simde__m256i previous = simde_mm256_loadu_si256(reinterpret_cast(p - 1)); + std::uint32_t mask = (std::uint32_t)simde_mm256_movemask_epi8(simde_mm256_and_si256( + simde_mm256_cmpeq_epi8(previous, newline), nonblank_column1_mask(current))); + while (mask != 0) { + int bit = __builtin_ctz(mask); + try_candidate(p + bit); + mask &= mask - 1; + } + p += kSimdWidth; + } + + // scalar tail + while (p < end) { + if (*(p - 1) == '\n' && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); } + ++p; + } +} + +void mps_section_block_scanner_t::scan_boundary(std::size_t left_index, std::size_t right_index) +{ + std::size_t left_begin = block_begin_offsets_[left_index].load(std::memory_order_acquire); + std::size_t boundary = block_begin_offsets_[right_index].load(std::memory_order_acquire); + std::size_t right_end = block_end_offsets_[right_index].load(std::memory_order_acquire); + std::size_t begin = + boundary - left_begin > boundary_overlap ? boundary - boundary_overlap : left_begin; + std::size_t end = + right_end - boundary > boundary_overlap ? 
boundary + boundary_overlap : right_end; + scan_section_range(data_ + begin, data_ + end); +} + +// scans a freshly decoded block for section titles, along with the start/end boundaries if a +// section title straddles blocks +void mps_section_block_scanner_t::observe_block(std::size_t block_index, + const char* begin, + const char* end) +{ + if (block_index >= block_count_) { + mps_parser_fail(error_type_t::RuntimeError, + "MPS section scanner observed invalid LZ4 block index"); + } + + // --- Scan this block, then record its extent and mark it decoded. The store on + // block_decoded_ publishes the two relaxed offset stores above it. It is seq_cst (not just + // release) because it is followed by loads of the neighbours' flags: with release/acquire + // only, two adjacent blocks finishing at the same time could each read the other's flag as + // still 0 and the seam between them would never be rescanned. With seq_cst on both sides at + // least one of the two producers observes the other and runs scan_boundary. + scan_section_range(begin, end); + block_begin_offsets_[block_index].store((std::size_t)(begin - data_), std::memory_order_relaxed); + block_end_offsets_[block_index].store((std::size_t)(end - data_), std::memory_order_relaxed); + block_decoded_[block_index].store(1, std::memory_order_seq_cst); + + // --- Rescan the seams with already-decoded neighbors, in case a title straddles the boundary. + if (block_index > 0 && block_decoded_[block_index - 1].load(std::memory_order_seq_cst)) { + scan_boundary(block_index - 1, block_index); + } + if (block_index + 1 < block_count_ && + block_decoded_[block_index + 1].load(std::memory_order_seq_cst)) { + scan_boundary(block_index, block_index + 1); + } + + // --- Extend the contiguous decoded-byte frontier and publish any newly bounded phases. 
+ advance_ready_frontier(); +} + +void mps_section_block_scanner_t::advance_ready_frontier() +{ + std::size_t new_ready = 0; + bool grew = false; + { + std::lock_guard lock(frontier_mutex_); + while (next_block_ < block_count_ && + block_decoded_[next_block_].load(std::memory_order_acquire)) { + new_ready = block_end_offsets_[next_block_].load(std::memory_order_acquire); + ++next_block_; + grew = true; + } + } + if (grew) { publish_ready(new_ready); } +} + +void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes) +{ + ready_bytes_.store(ready_bytes, std::memory_order_release); + std::size_t begin = ready_bytes > boundary_overlap ? ready_bytes - boundary_overlap : 0; + scan_section_range(data_ + begin, data_ + ready_bytes); + notify_ready_phases(); +} + +std::size_t mps_section_block_scanner_t::ready_bytes() const noexcept +{ + return ready_bytes_.load(std::memory_order_acquire); +} + +void mps_section_block_scanner_t::notify_ready_phases() +{ + // Publication model: each present phase runs from its own section header to + // the first later section header that has been discovered. Optional sections + // publish present=false once a later boundary proves they cannot still appear. + // ENDATA, or final ready bytes for truncated/non-newline files, is the final + // boundary for the trailing optional/quadratic phases. 
+ std::lock_guard lock(publish_mutex_); + std::size_t ready = ready_bytes_.load(std::memory_order_acquire); + const char* ready_ptr = data_ + ready; + const char* rows = + section_hits_[section_hit_index(mps_section_kind::rows)].load(std::memory_order_acquire); + const char* columns = + section_hits_[section_hit_index(mps_section_kind::columns)].load(std::memory_order_acquire); + const char* rhs = + section_hits_[section_hit_index(mps_section_kind::rhs)].load(std::memory_order_acquire); + const char* bounds = + section_hits_[section_hit_index(mps_section_kind::bounds)].load(std::memory_order_acquire); + const char* ranges = + section_hits_[section_hit_index(mps_section_kind::ranges)].load(std::memory_order_acquire); + const char* quadobj = + section_hits_[section_hit_index(mps_section_kind::quadobj)].load(std::memory_order_acquire); + const char* qmatrix = + section_hits_[section_hit_index(mps_section_kind::qmatrix)].load(std::memory_order_acquire); + const char* qcmatrix = + section_hits_[section_hit_index(mps_section_kind::qcmatrix)].load(std::memory_order_acquire); + const char* endata = + section_hits_[section_hit_index(mps_section_kind::endata)].load(std::memory_order_acquire); + auto available = [&](const char* p) { return p != nullptr && p <= ready_ptr; }; + bool final_ready = + block_count_ == 0 || + (block_decoded_[block_count_ - 1].load(std::memory_order_acquire) && + ready == block_end_offsets_[block_count_ - 1].load(std::memory_order_acquire)); + const char* final_boundary = available(endata) ? endata : (final_ready ? 
ready_ptr : nullptr); + auto earliest_available_after = [&](const char* after, + std::initializer_list candidates) { + const char* best = nullptr; + for (const char* p : candidates) { + if (!available(p) || (after != nullptr && p <= after)) { continue; } + if (best == nullptr || p < best) { best = p; } + } + return best; + }; + auto publish_optional = [&](mps_phase_kind phase, + const char* self, + const char* predecessor, + std::initializer_list later_candidates) { + if (registry_.ready(phase)) { return; } + if (available(self)) { + const char* end = earliest_available_after(self, later_candidates); + if (end != nullptr) { registry_.publish(phase, {self, end, true}); } + return; + } + if (predecessor != nullptr && + earliest_available_after(predecessor, later_candidates) != nullptr) { + registry_.publish(phase, {nullptr, nullptr, false}); + } + }; + + // Three publication shapes follow: + // (1) mandatory header/rows/columns -- each spans from its start to the next mandatory + // section; published as soon as that bounding section is available. + // (2) optional rhs/ranges/bounds via publish_optional -- present=true once bounded, or + // present=false once a later section proves the optional one cannot still appear. + // (3) quadratic -- starts at the earliest of the three quad markers (quadobj/qmatrix/qcmatrix). + // final_boundary (ENDATA, or the final ready frontier for truncated files) closes the tail. 
+ if (available(rows) && !registry_.ready(mps_phase_kind::header)) { + registry_.publish(mps_phase_kind::header, {data_, rows, true}); + } + if (available(rows) && available(columns) && !registry_.ready(mps_phase_kind::rows)) { + registry_.publish(mps_phase_kind::rows, {rows, columns, true}); + } + if (available(columns) && !registry_.ready(mps_phase_kind::columns)) { + const char* columns_end = earliest_available_after( + columns, {rhs, ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + if (columns_end != nullptr) { + registry_.publish(mps_phase_kind::columns, {columns, columns_end, true}); + } + } + + publish_optional(mps_phase_kind::rhs, + rhs, + columns, + {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + publish_optional(mps_phase_kind::ranges, + ranges, + rhs ? rhs : columns, + {bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + publish_optional(mps_phase_kind::bounds, + bounds, + ranges ? ranges : (rhs ? rhs : columns), + {quadobj, qmatrix, qcmatrix, final_boundary}); + + if (!registry_.ready(mps_phase_kind::quadratic)) { + const char* quadratic_begin = nullptr; + if (available(quadobj)) { quadratic_begin = quadobj; } + if (available(qmatrix) && (quadratic_begin == nullptr || qmatrix < quadratic_begin)) { + quadratic_begin = qmatrix; + } + if (available(qcmatrix) && (quadratic_begin == nullptr || qcmatrix < quadratic_begin)) { + quadratic_begin = qcmatrix; + } + if (quadratic_begin != nullptr && final_boundary != nullptr) { + registry_.publish(mps_phase_kind::quadratic, {quadratic_begin, final_boundary, true}); + } else if (quadratic_begin == nullptr && final_boundary != nullptr) { + registry_.publish(mps_phase_kind::quadratic, {nullptr, nullptr, false}); + } + } + + if (available(endata)) { + registry_.publish_endata(endata, true); + } else if (final_ready && final_boundary != nullptr) { + registry_.publish_endata(final_boundary, false); + } +} + +} // namespace cuopt::linear_programming::io::detail diff --git 
a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp new file mode 100644 index 0000000000..5d05e8b2f8 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp @@ -0,0 +1,146 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +#include + +// The section scanner handles freshly read/decoded blocks and scans them for section titles while +// they're still warm in cache it then publishes read/decoded input ranges to the parser workers, +// which handle their respective sections in parallel. + +namespace cuopt::linear_programming::io::detail { + +enum class mps_section_kind { + rows, + columns, + rhs, + bounds, + ranges, + quadobj, + qmatrix, + qcmatrix, + endata, +}; + +enum class mps_phase_kind { + header, + rows, + columns, + rhs, + bounds, + ranges, + quadratic, +}; + +struct mps_phase_range_t { + const char* begin = nullptr; + const char* end = nullptr; + bool present = false; +}; + +class mps_phase_registry_t { + public: + void publish(mps_phase_kind phase, mps_phase_range_t range); + void attach_event(mps_phase_kind phase, omp_event_handle_t event); + + bool ready(mps_phase_kind phase) const; + // range() acquire-loads ready_[phase] (pairs with publish()'s release store) before + // reading ranges_[phase]. Callers must not invoke range() until the phase is published. + mps_phase_range_t range(mps_phase_kind phase) const; + + void publish_endata(const char* begin, bool present); + bool endata_ready() const; + const char* endata_begin() const; + bool endata_present() const; + + private: + // mutex_ guards ranges_/events_/has_event_/event_fulfilled_ and the endata_* fields for writers. 
+ // Readers observe ready_[phase] / endata_ready_ (release-stored under the lock on publish, + // acquire-loaded here) and may then read the matching range lock-free -- see range()'s contract. + static constexpr std::size_t phase_count = 7; + + static std::size_t phase_index(mps_phase_kind phase); + + mps_phase_range_t ranges_[phase_count]{}; + std::atomic ready_[phase_count]{}; + omp_event_handle_t events_[phase_count]{}; + bool has_event_[phase_count]{}; + bool event_fulfilled_[phase_count]{}; + const char* endata_begin_ = nullptr; + bool endata_present_ = false; + std::atomic endata_ready_{false}; + mutable std::mutex mutex_; +}; + +// Turns out-of-order decoded blocks into ordered section-range publications for the parser: +// +// producer --observe_block(i,...)--> [SIMD-scan block i for section titles] --> section_hits_ +// [advance contiguous decoded-byte frontier (ready_bytes_)] +// --> notify_ready_phases --> registry --> parser tasks +// +// Producers (the LZ4 decoders / raw readers) call observe_block for each block in any order. +// Per block the scanner (1) SIMD-scans it for section titles starting in column 1 and records +// the first byte of each section via a first-writer-wins CAS; (2) advances a contiguous +// decoded-byte frontier across whatever leading blocks are now present; and (3) recomputes which +// phases are fully bounded and publishes their [begin,end) ranges to the registry, unblocking the +// matching parser task. A title can straddle two blocks, so adjacent decoded blocks are also +// rescanned over a small overlap (boundary_overlap). +class mps_section_block_scanner_t { + public: + mps_section_block_scanner_t(const char* data, + std::size_t block_count, + mps_phase_registry_t& registry); + + // Records a freshly decoded block, scans it for section titles, advances the + // contiguous decoded-byte frontier across out-of-order completions, and + // publishes any newly available section ranges. 
Producers only need to feed + // blocks in any order; the frontier and publication live entirely here. + void observe_block(std::size_t block_index, const char* begin, const char* end); + void publish_ready(std::size_t ready_bytes); + + // Current contiguous decoded-byte frontier; producers use this as the final + // view size once all blocks have been observed. + std::size_t ready_bytes() const noexcept; + + private: + static constexpr std::size_t section_count = 9; + // Section titles are short; 128 bytes is enough to rescan around a decoded + // block boundary and catch a newline/title pair split across adjacent blocks. + static constexpr std::size_t boundary_overlap = 128; + + static std::size_t section_hit_index(mps_section_kind kind); + + void scan_section_range(const char* begin, const char* end); + void scan_boundary(std::size_t left_index, std::size_t right_index); + void record_section_hit(mps_section_kind kind, const char* ptr); + void notify_ready_phases(); + void advance_ready_frontier(); + + // Concurrency: observe_block runs concurrently on many producer threads. + // * frontier_mutex_ guards next_block_ and the ready_bytes_ frontier advance. + // * publish_mutex_ serializes notify_ready_phases so each phase publishes once, in order. + // * block_decoded_[i] is release-stored after block_begin/end_offsets_[i] (relaxed), so an + // acquire-load of a set flag makes those offsets visible to the reader. + // * section_hits_[k] is a first-writer-wins CAS holding the earliest byte of section k. + // * registry_ carries its own internal lock. 
+ const char* data_ = nullptr; + std::size_t block_count_ = 0; + mps_phase_registry_t& registry_; + std::mutex publish_mutex_; + std::unique_ptr[]> block_decoded_; + std::unique_ptr block_begin_offsets_; + std::unique_ptr block_end_offsets_; + std::atomic_size_t ready_bytes_{0}; + std::atomic section_hits_[section_count]{}; + std::mutex frontier_mutex_; + std::size_t next_block_ = 0; +}; + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp new file mode 100644 index 0000000000..0f47b45f56 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp @@ -0,0 +1,132 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#ifdef MPS_FAST_NVTX +#include +#include +#include +#endif + +namespace cuopt::linear_programming::io::detail::nvtx { + +namespace colors { +constexpr std::uint32_t generic = 0xff8b949e; +constexpr std::uint32_t io = 0xff58a6ff; +constexpr std::uint32_t decode = 0xff3fb950; +constexpr std::uint32_t rows = 0xffd29922; +constexpr std::uint32_t columns = 0xffff7b72; +constexpr std::uint32_t rhs = 0xffa371f7; +constexpr std::uint32_t bounds = 0xfff0883e; +constexpr std::uint32_t ranges = 0xff79c0ff; +constexpr std::uint32_t names = 0xff56d364; +constexpr std::uint32_t alloc = 0xffdb61a2; +constexpr std::uint32_t finalize = 0xffc9d1d9; +} // namespace colors + +inline std::uint32_t color_for_name(std::string_view name) noexcept +{ + if (name.find("lz4") != std::string_view::npos || name.find("read") != std::string_view::npos) { + return colors::io; + } + if (name.find("decode") != std::string_view::npos || + name.find("decompress") != std::string_view::npos) { + return colors::decode; + } + if (name.find("row") != std::string_view::npos) { return colors::rows; } + if (name.find("column") 
!= std::string_view::npos || name.find("csr") != std::string_view::npos) { + return colors::columns; + } + if (name.find("rhs") != std::string_view::npos) { return colors::rhs; } + if (name.find("bound") != std::string_view::npos) { return colors::bounds; } + if (name.find("range") != std::string_view::npos) { return colors::ranges; } + if (name.find("name") != std::string_view::npos || + name.find("materialize") != std::string_view::npos) { + return colors::names; + } + if (name.find("alloc") != std::string_view::npos || + name.find("resize") != std::string_view::npos || + name.find("mmap") != std::string_view::npos) { + return colors::alloc; + } + if (name.find("finalize") != std::string_view::npos) { return colors::finalize; } + return colors::generic; +} + +class scoped_range_t { + public: + explicit scoped_range_t(const char* name, + std::uint32_t color = colors::generic, + std::uint32_t category = 0) + { + push(name, color, category); + } + + explicit scoped_range_t(std::string name, + std::uint32_t color = colors::generic, + std::uint32_t category = 0) + : owned_name_(std::move(name)) + { + push(owned_name_.c_str(), color, category); + } + + ~scoped_range_t() { end(); } + + void end() + { +#ifdef MPS_FAST_NVTX + if (active_) { + nvtxRangePop(); + active_ = false; + } +#endif + } + + scoped_range_t(const scoped_range_t&) = delete; + scoped_range_t& operator=(const scoped_range_t&) = delete; + + private: + void push([[maybe_unused]] const char* name, + [[maybe_unused]] std::uint32_t color, + [[maybe_unused]] std::uint32_t category) + { +#ifdef MPS_FAST_NVTX + nvtxEventAttributes_t event{}; + event.version = NVTX_VERSION; + event.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + event.colorType = NVTX_COLOR_ARGB; + event.color = color; + event.messageType = NVTX_MESSAGE_TYPE_ASCII; + event.message.ascii = name; + event.category = category; + nvtxRangePushEx(&event); + active_ = true; +#endif + } + + std::string owned_name_; +#ifdef MPS_FAST_NVTX + bool active_ = false; 
+#endif +}; + +inline void name_current_thread([[maybe_unused]] const char* name) +{ +#ifdef MPS_FAST_NVTX + nvtxNameOsThreadA((std::uint32_t)::syscall(SYS_gettid), name); +#endif +} + +} // namespace cuopt::linear_programming::io::detail::nvtx + +#define MPS_FAST_NVTX_CONCAT_INNER(a, b) a##b +#define MPS_FAST_NVTX_CONCAT(a, b) MPS_FAST_NVTX_CONCAT_INNER(a, b) +#define MPS_NVTX_RANGE(name, color) \ + ::cuopt::linear_programming::io::detail::nvtx::scoped_range_t MPS_FAST_NVTX_CONCAT( \ + _mps_nvtx_range_, __LINE__)(name, color) diff --git a/cpp/src/io/file_to_string.cpp b/cpp/src/io/file_to_string.cpp index 77b92d90e9..30d9c41f9f 100644 --- a/cpp/src/io/file_to_string.cpp +++ b/cpp/src/io/file_to_string.cpp @@ -9,6 +9,8 @@ #include +#include +#include #include #include #include @@ -22,9 +24,9 @@ #include #endif // MPS_PARSER_WITH_ZLIB -#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB) +#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB) || defined(MPS_PARSER_WITH_LZ4) #include -#endif // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB +#endif // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB || MPS_PARSER_WITH_LZ4 namespace { using cuopt::linear_programming::io::error_type_t; @@ -207,22 +209,184 @@ std::vector zlib_file_to_string(const std::string& file) } // end namespace #endif // MPS_PARSER_WITH_ZLIB +#ifdef MPS_PARSER_WITH_LZ4 +namespace { +// Minimal liblz4 frame ABI declarations; keep in sync with lz4frame.h. 
+struct LZ4F_dctx; +using LZ4F_errorCode_t = size_t; +struct LZ4F_frameInfo_t { + int blockSizeID; + int blockMode; + int contentChecksumFlag; + int frameType; + unsigned long long contentSize; + unsigned dictID; + int blockChecksumFlag; +}; +using LZ4F_createDecompressionContext_t = LZ4F_errorCode_t (*)(LZ4F_dctx**, unsigned); +using LZ4F_freeDecompressionContext_t = LZ4F_errorCode_t (*)(LZ4F_dctx*); +using LZ4F_getFrameInfo_t = LZ4F_errorCode_t (*)(LZ4F_dctx*, + LZ4F_frameInfo_t*, + const void*, + size_t*); +using LZ4F_decompress_t = + LZ4F_errorCode_t (*)(LZ4F_dctx*, void*, size_t*, const void*, size_t*, const void*); +using LZ4F_isError_t = unsigned (*)(LZ4F_errorCode_t); +using LZ4F_getErrorName_t = const char* (*)(LZ4F_errorCode_t); + +std::vector lz4_file_to_string(const std::string& file) +{ + struct DlCloseDeleter { + void operator()(void* fp) + { + mps_parser_expects_fatal( + dlclose(fp) == 0, error_type_t::ValidationError, "Error closing liblz4.so!"); + } + }; + struct Lz4DctxDeleter { + void operator()(LZ4F_dctx* f) + { + if (f != nullptr) { + const LZ4F_errorCode_t err = fptr(f); + mps_parser_expects_fatal( + !is_error(err), error_type_t::ValidationError, "Error closing lz4 file!"); + } + } + LZ4F_freeDecompressionContext_t fptr = nullptr; + LZ4F_isError_t is_error = nullptr; + }; + + void* raw_lz4handle = nullptr; + for (const char* soname : {"liblz4.so.1", "liblz4.so"}) { + raw_lz4handle = dlopen(soname, RTLD_LAZY); + if (raw_lz4handle != nullptr) break; + } + std::unique_ptr lz4handle{raw_lz4handle}; + mps_parser_expects(lz4handle != nullptr, + error_type_t::ValidationError, + "Could not open .lz4 file since liblz4 was not found " + "(tried liblz4.so.1, liblz4.so). In order to open .lz4 files directly, " + "please ensure liblz4 is installed. Alternatively, decompress the .lz4 file " + "manually and open the uncompressed file. 
Given path: %s", + file.c_str()); + + LZ4F_createDecompressionContext_t LZ4F_createDecompressionContext = + reinterpret_cast( + dlsym(lz4handle.get(), "LZ4F_createDecompressionContext")); + LZ4F_freeDecompressionContext_t LZ4F_freeDecompressionContext = + reinterpret_cast( + dlsym(lz4handle.get(), "LZ4F_freeDecompressionContext")); + LZ4F_getFrameInfo_t LZ4F_getFrameInfo = + reinterpret_cast(dlsym(lz4handle.get(), "LZ4F_getFrameInfo")); + LZ4F_decompress_t LZ4F_decompress = + reinterpret_cast(dlsym(lz4handle.get(), "LZ4F_decompress")); + LZ4F_isError_t LZ4F_isError = + reinterpret_cast(dlsym(lz4handle.get(), "LZ4F_isError")); + LZ4F_getErrorName_t LZ4F_getErrorName = + reinterpret_cast(dlsym(lz4handle.get(), "LZ4F_getErrorName")); + mps_parser_expects( + LZ4F_createDecompressionContext != nullptr && LZ4F_freeDecompressionContext != nullptr && + LZ4F_getFrameInfo != nullptr && LZ4F_decompress != nullptr && LZ4F_isError != nullptr && + LZ4F_getErrorName != nullptr, + error_type_t::ValidationError, + "Error loading liblz4! Library version might be incompatible. Please decompress the .lz4 " + "file manually and open the uncompressed file. Given path: %s", + file.c_str()); + + std::unique_ptr fp{fopen(file.c_str(), "rb")}; + mps_parser_expects(fp != nullptr, + error_type_t::ValidationError, + "Error opening input file! Given path: %s", + file.c_str()); + mps_parser_expects(fseek(fp.get(), 0L, SEEK_END) == 0, + error_type_t::ValidationError, + "Error seeking input file! Given path: %s", + file.c_str()); + const long compressed_size = ftell(fp.get()); + mps_parser_expects(compressed_size != -1L, + error_type_t::ValidationError, + "Error sizing input file! Given path: %s", + file.c_str()); + std::vector compressed(compressed_size); + rewind(fp.get()); + mps_parser_expects(fread(compressed.data(), sizeof(char), compressed_size, fp.get()) == + static_cast(compressed_size), + error_type_t::ValidationError, + "Error reading input file! 
Given path: %s", + file.c_str()); + + constexpr unsigned lz4f_version = 100; + LZ4F_dctx* raw_dctx = nullptr; + LZ4F_errorCode_t lz4_status = LZ4F_createDecompressionContext(&raw_dctx, lz4f_version); + mps_parser_expects(!LZ4F_isError(lz4_status), + error_type_t::ValidationError, + "Could not open lz4 compressed file '%s': %s", + file.c_str(), + LZ4F_getErrorName(lz4_status)); + std::unique_ptr dctx{raw_dctx, + {LZ4F_freeDecompressionContext, LZ4F_isError}}; + + const char* src = compressed.data(); + size_t src_size = compressed.size(); + LZ4F_frameInfo_t frame_info{}; + size_t src_used = src_size; + lz4_status = LZ4F_getFrameInfo(dctx.get(), &frame_info, src, &src_used); + mps_parser_expects(!LZ4F_isError(lz4_status), + error_type_t::ValidationError, + "Error reading lz4 frame info for input file '%s': %s", + file.c_str(), + LZ4F_getErrorName(lz4_status)); + src += src_used; + src_size -= src_used; + + std::vector buf; + if (frame_info.contentSize > 0) { buf.reserve((size_t)frame_info.contentSize + 1); } + const size_t readbufsize = 1ull << 24; // 16MiB + std::vector readbuf(readbufsize); + while (lz4_status != 0) { + size_t dst_size = readbuf.size(); + src_used = src_size; + lz4_status = LZ4F_decompress(dctx.get(), readbuf.data(), &dst_size, src, &src_used, nullptr); + mps_parser_expects(!LZ4F_isError(lz4_status), + error_type_t::ValidationError, + "Error in lz4 decompression of input file '%s': %s", + file.c_str(), + LZ4F_getErrorName(lz4_status)); + if (dst_size > 0) { buf.insert(buf.end(), begin(readbuf), begin(readbuf) + dst_size); } + src += src_used; + src_size -= src_used; + mps_parser_expects(src_used != 0 || dst_size != 0 || lz4_status == 0, + error_type_t::ValidationError, + "Stalled lz4 decompression of input file! 
Given path: %s", + file.c_str()); + } + buf.push_back('\0'); + return buf; +} +} // end namespace +#endif // MPS_PARSER_WITH_LZ4 + namespace cuopt::linear_programming::io::detail { std::vector file_to_string(const std::string& file) { + std::string lower(file); + std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { + return (char)std::tolower(c); + }); + #ifdef MPS_PARSER_WITH_BZIP2 - if (file.size() > 4 && file.substr(file.size() - 4, 4) == ".bz2") { - return bz2_file_to_string(file); - } + if (lower.ends_with(".bz2")) { return bz2_file_to_string(file); } #endif // MPS_PARSER_WITH_BZIP2 #ifdef MPS_PARSER_WITH_ZLIB - if (file.size() > 3 && file.substr(file.size() - 3, 3) == ".gz") { - return zlib_file_to_string(file); - } + if (lower.ends_with(".gz")) { return zlib_file_to_string(file); } #endif // MPS_PARSER_WITH_ZLIB +#ifdef MPS_PARSER_WITH_LZ4 + if (lower.ends_with(".lz4")) { return lz4_file_to_string(file); } +#endif // MPS_PARSER_WITH_LZ4 + // Faster than using C++ I/O std::unique_ptr fp{fopen(file.c_str(), "r")}; mps_parser_expects(fp != nullptr, diff --git a/cpp/src/io/file_to_string.hpp b/cpp/src/io/file_to_string.hpp index 94b2df821d..3b1924e12c 100644 --- a/cpp/src/io/file_to_string.hpp +++ b/cpp/src/io/file_to_string.hpp @@ -17,6 +17,7 @@ namespace cuopt::linear_programming::io::detail { // The dispatcher looks at the extension: // - ".bz2" → libbz2 (dlopen'd at runtime), if MPS_PARSER_WITH_BZIP2. // - ".gz" → libz (dlopen'd at runtime), if MPS_PARSER_WITH_ZLIB. +// - ".lz4" → liblz4 (dlopen'd at runtime), if MPS_PARSER_WITH_LZ4. // - otherwise → plain fopen. // The returned buffer's size includes the null terminator. 
std::vector file_to_string(const std::string& file); diff --git a/cpp/src/io/mps_parser.cpp b/cpp/src/io/mps_parser.cpp index 5f7cecda94..9d4dea2bbf 100644 --- a/cpp/src/io/mps_parser.cpp +++ b/cpp/src/io/mps_parser.cpp @@ -797,9 +797,9 @@ void mps_parser_t::parse_rows(std::string_view line) } if (type == Objective) { // Keep only the first name or OBJNAME since it was set before - if (objective_name.empty()) - objective_name = name; - else + if (objective_name.empty()) objective_name = name; + // aligns with CPLEX/SCIP behavior + else if (name != objective_name) ignored_objective_names.emplace(name); // If we wanted to strictly follow MPS definition: a new objective row ('N') should be treated // as an unbounded constraints, aka an extra contraints row with lower bound -infinity and upper diff --git a/cpp/src/io/parser.cpp b/cpp/src/io/parser.cpp index 93d9d9c73c..c9b3a351c6 100644 --- a/cpp/src/io/parser.cpp +++ b/cpp/src/io/parser.cpp @@ -7,8 +7,13 @@ #include +#include #include +#include + +#include + namespace cuopt::linear_programming::io { template @@ -35,4 +40,18 @@ template mps_data_model_t read_mps_from_string(std::string_view mps_ template mps_data_model_t read_mps_from_string(std::string_view mps_contents, bool fixed_mps_format); +template +mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path) +{ + CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str()); + return detail::parse_mps_fast_file(mps_file_path); +} + +template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); +template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); +template mps_data_model_t read_mps_fast_experimental( + const std::string& mps_file_path); +template mps_data_model_t read_mps_fast_experimental( + const std::string& mps_file_path); + } // namespace cuopt::linear_programming::io diff --git a/cpp/src/io/utilities/error.hpp b/cpp/src/io/utilities/error.hpp index 
58ac3891e1..c1b28fc7ff 100644 --- a/cpp/src/io/utilities/error.hpp +++ b/cpp/src/io/utilities/error.hpp @@ -34,6 +34,30 @@ inline std::string error_to_string(error_type_t error) return std::string("UnAccountedError"); } +[[noreturn]] inline void mps_parser_throw(error_type_t error_type, const char* msg) +{ + throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + error_to_string(error_type) + + "\", \"msg\": " + "\"" + std::string(msg) + "\"}"); +} + +/** + * @brief Report an unrecoverable parser error. + * + * @param[error_type_t] error enum error type + * @param[const char *] fmt String format for error message + * @param variable set of arguments used for fmt + * @throw std::logic_error always + */ +[[noreturn]] inline void mps_parser_fail(error_type_t error_type, const char* fmt, ...) +{ + va_list args; + va_start(args, fmt); + char msg[2048]; + vsnprintf(msg, sizeof(msg), fmt, args); + va_end(args); + mps_parser_throw(error_type, msg); +} + /** * @brief Function for checking (pre-)conditions that throws an exception when a * condition is false @@ -52,9 +76,7 @@ inline void mps_parser_expects(bool cond, error_type_t error_type, const char* f char msg[2048]; vsnprintf(msg, sizeof(msg), fmt, args); va_end(args); - - throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + error_to_string(error_type) + - "\", \"msg\": " + "\"" + std::string(msg) + "\"}"); + mps_parser_throw(error_type, msg); } } diff --git a/cpp/src/utilities/perf_counters.hpp b/cpp/src/utilities/perf_counters.hpp new file mode 100644 index 0000000000..70658aa9b3 --- /dev/null +++ b/cpp/src/utilities/perf_counters.hpp @@ -0,0 +1,194 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::io::detail { + +// Utils to return to total resident set size (used physical pages) +static size_t parse_status_kb_line(const char* line, const char* key) +{ + size_t key_len = std::strlen(key); + if (std::strncmp(line, key, key_len) != 0) { return 0; } + const char* p = line + key_len; + while (*p == ' ' || *p == '\t') { + ++p; + } + char* end_ptr = nullptr; + size_t value = std::strtol(p, &end_ptr, 10); + return value; +} + +static std::pair current_process_rss_kb() +{ + FILE* file = std::fopen("/proc/self/status", "r"); + if (file == nullptr) { return {0, 0}; } + + size_t rss_kb = 0; + size_t hwm_kb = 0; + char line[256]; + while (std::fgets(line, sizeof(line), file) != nullptr) { + if (rss_kb == 0) { rss_kb = parse_status_kb_line(line, "VmRSS:"); } + if (hwm_kb == 0) { hwm_kb = parse_status_kb_line(line, "VmHWM:"); } + if (rss_kb != 0 && hwm_kb != 0) { break; } + } + std::fclose(file); + return {rss_kb, hwm_kb}; +} + +struct perf_counter_spec_t { + const char* name; + uint32_t type; + uint64_t config; +}; + +static constexpr uint64_t perf_cache_config(uint64_t cache, uint64_t op, uint64_t result) +{ + return cache | (op << 8) | (result << 16); +} + +// Small scoped Linux perf_event_open wrapper for coarse phase diagnostics. +// +// Important limitations: +// - Counters are per-thread: construct one instance inside each worker whose +// work should be measured, then aggregate snapshots. +// - These are generic perf events; exact mappings vary by CPU. Some events may +// be unavailable or unhelpful, e.g. store-side DTLB misses on this node. +// - This deliberately does not use event groups or time_enabled/time_running +// scaling, so counts are approximate if the kernel multiplexes counters. 
+static constexpr std::array PERF_COUNTER_SPECS = {{ + {"cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES}, + {"instructions", PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS}, + {"cache_refs", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES}, + {"cache_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES}, + {"branch_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES}, + {"backend_stall_cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND}, + {"dtlb_load_misses", + PERF_TYPE_HW_CACHE, + perf_cache_config( + PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_RESULT_MISS)}, + {"dtlb_store_misses", + PERF_TYPE_HW_CACHE, + perf_cache_config( + PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_WRITE, PERF_COUNT_HW_CACHE_RESULT_MISS)}, +}}; + +struct perf_counter_snapshot_t { + bool active = false; + int open_errno = 0; + std::array values = {}; +}; + +class thread_perf_counters_t { + public: + thread_perf_counters_t() + { + fds_.fill(-1); + for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) { + perf_event_attr attr = {}; + attr.type = PERF_COUNTER_SPECS[i].type; + attr.size = sizeof(attr); + attr.config = PERF_COUNTER_SPECS[i].config; + attr.disabled = 1; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + + int fd = (int)syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); + if (fd < 0) { + if (first_errno_ == 0) { first_errno_ = errno; } + continue; + } + fds_[i] = fd; + active_ = true; + } + + if (active_) { + for (int fd : fds_) { + if (fd >= 0) { + ioctl(fd, PERF_EVENT_IOC_RESET, 0); + ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); + } + } + } + } + + thread_perf_counters_t(const thread_perf_counters_t&) = delete; + thread_perf_counters_t& operator=(const thread_perf_counters_t&) = delete; + + ~thread_perf_counters_t() { close_all(); } + + perf_counter_snapshot_t stop() + { + perf_counter_snapshot_t snapshot; + snapshot.active = active_; + snapshot.open_errno = first_errno_; + + for (size_t i = 0; i < fds_.size(); ++i) 
{ + int fd = fds_[i]; + if (fd < 0) continue; + ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + uint64_t value = 0; + if (read(fd, &value, sizeof(value)) == (ssize_t)sizeof(value)) { snapshot.values[i] = value; } + } + close_all(); + active_ = false; + return snapshot; + } + + private: + void close_all() + { + for (int& fd : fds_) { + if (fd >= 0) { + close(fd); + fd = -1; + } + } + } + + bool active_ = false; + int first_errno_ = 0; + std::array fds_; +}; + +static inline void print_perf_totals(const char* label, + const std::vector& snapshots) +{ + std::array totals = {}; + bool any_active = false; + int first_errno = 0; + for (const auto& snapshot : snapshots) { + if (snapshot.open_errno != 0 && first_errno == 0) { first_errno = snapshot.open_errno; } + if (!snapshot.active) continue; + any_active = true; + for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) { + totals[i] += snapshot.values[i]; + } + } + + if (!any_active) { + std::fprintf(stderr, "[PERF] %s unavailable errno=%d\n", label, first_errno); + return; + } + + double ipc = totals[0] == 0 ? 0.0 : (double)totals[1] / (double)totals[0]; + double miss_rate = totals[2] == 0 ? 
0.0 : (double)totals[3] / (double)totals[2]; + std::fprintf(stderr, "[PERF] %s", label); + for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) { + std::fprintf(stderr, " %s=%llu", PERF_COUNTER_SPECS[i].name, totals[i]); + } + std::fprintf(stderr, " ipc=%.3f cache_miss_rate=%.6f\n", ipc, miss_rate); +} + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/tests/linear_programming/CMakeLists.txt b/cpp/tests/linear_programming/CMakeLists.txt index bc057db1e2..6db30755c3 100644 --- a/cpp/tests/linear_programming/CMakeLists.txt +++ b/cpp/tests/linear_programming/CMakeLists.txt @@ -21,6 +21,16 @@ ConfigureTest(MPS_PARSER_TEST ${CMAKE_CURRENT_SOURCE_DIR}/parser_test.cpp LABELS numopt) +ConfigureTest(MPS_FAST_PARSER_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_fp64_parser_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_edge_test.cpp + LABELS numopt) +target_include_directories(MPS_FAST_PARSER_TEST + PRIVATE + "${CUOPT_TEST_DIR}/../src/io/experimental_mps_fast" +) +target_link_libraries(MPS_FAST_PARSER_TEST PRIVATE simde::simde) + # ################################################################################################## # - C API Tests---------------------------------------------------------------------- # The C API tests require a separate library to be linked against. So we don't use the ConfigureTest macro. diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp new file mode 100644 index 0000000000..8bde21bb61 --- /dev/null +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp @@ -0,0 +1,188 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "fast_fp64_parser.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::io::detail { + +namespace { + +uint64_t bits(double value) { return std::bit_cast(value); } + +double reference_strtod(std::string_view token) +{ + std::string normalized(token); + for (char& c : normalized) { + if (c == 'd' || c == 'D') { c = 'e'; } + } + char* end = nullptr; + errno = 0; + return std::strtod(normalized.c_str(), &end); +} + +double parse_token(std::string_view token) +{ + const char* p = token.data(); + return fp64::parse_fp64_advance(p, token.data() + token.size()); +} + +void check_bitwise_strtod(std::string_view token) +{ + std::string normalized(token); + for (char& c : normalized) { + if (c == 'd' || c == 'D') { c = 'e'; } + } + char* end = nullptr; + errno = 0; + const double ref = std::strtod(normalized.c_str(), &end); + EXPECT_EQ(end, normalized.c_str() + normalized.size()); + + std::string padded(token); + padded.append(40, ' '); + const char* p = padded.data(); + const double padded_value = fp64::parse_fp64_advance(p, padded.data() + padded.size()); + EXPECT_EQ(p, padded.data() + token.size()); + + const uint64_t ref_bits = bits(ref); + EXPECT_EQ(ref_bits, bits(parse_token(token))) << "token parse mismatch for '" << token << "'"; + EXPECT_EQ(ref_bits, bits(padded_value)) << "padded parse mismatch for '" << token << "'"; +} + +std::string random_token(std::mt19937_64& rng) +{ + std::uniform_int_distribution sign_dist(0, 4); + std::uniform_int_distribution digit_dist(0, 9); + std::uniform_int_distribution shape_dist(0, 5); + std::uniform_int_distribution len_dist(1, 19); + std::uniform_int_distribution exp_dist(-30, 30); + + std::string token; + int sign = sign_dist(rng); + if (sign == 0) { + token.push_back('-'); + } else if (sign == 1) { + token.push_back('+'); + } + + int shape = shape_dist(rng); + if 
(shape == 0) { + token.append("0."); + int frac_len = std::uniform_int_distribution(1, 19)(rng); + for (int i = 0; i < frac_len; ++i) { + token.push_back(static_cast('0' + digit_dist(rng))); + } + } else { + int int_len = len_dist(rng); + token.push_back(static_cast('1' + std::uniform_int_distribution(0, 8)(rng))); + for (int i = 1; i < int_len; ++i) { + token.push_back(static_cast('0' + digit_dist(rng))); + } + if (shape >= 2) { + token.push_back('.'); + int remaining = 24 - static_cast(token.size()); + int max_frac = std::max(0, std::min(19, remaining)); + int frac_len = max_frac == 0 ? 0 : std::uniform_int_distribution(0, max_frac)(rng); + for (int i = 0; i < frac_len; ++i) { + token.push_back(static_cast('0' + digit_dist(rng))); + } + } + } + + if (shape == 5) { + int exp = exp_dist(rng); + std::string suffix = "e" + std::to_string(exp); + if (token.size() + suffix.size() <= 25) { token += suffix; } + } + + if (token.size() > 25) { token.resize(25); } + return token; +} + +} // namespace + +TEST(FastFp64ParserTest, CommonTableMatchesStrtodBitwise) +{ + std::setlocale(LC_NUMERIC, "C"); + const std::vector cases = { + "0", + "-0", + "1", + "-1", + "+1", + "2", + "42", + "123456789", + "57.", + "-57.", + "0.1", + "0.01", + "0.12345678901234", + "0.1234567890123456", + "0.3333333333333333", + "0.6508282938248958", + "3.14159", + "3130000", + "8594600.16", + "2344.55", + "0.000000000000001", + "9999999999999999", + "1844674407370955161", + "1e0", + "1e-9", + "1E12", + "-2.5e3", + "3.125D-2", + }; + + for (std::string_view token : cases) { + check_bitwise_strtod(token); + } +} + +TEST(FastFp64ParserTest, CursorAdvancesToTokenEnd) +{ + std::setlocale(LC_NUMERIC, "C"); + std::string text = "123.45 ABC"; + const char* p = text.data(); + double value = fp64::parse_fp64_advance(p, text.data() + text.size()); + + EXPECT_EQ(bits(reference_strtod("123.45")), bits(value)); + EXPECT_EQ(text.data() + 6, p); + EXPECT_EQ(std::string_view(" ABC"), std::string_view(p, 5)); +} + 
+TEST(FastFp64ParserTest, RejectsMalformedNumericSuffix)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  for (const char* token : {"1x", "1e", "1d+", "1e+"}) {  // stray suffix / exponent without digits
+    SCOPED_TRACE(token);  // names the offending token in the gtest failure output
+    EXPECT_THROW(parse_token(token), std::exception);
+  }
+}
+
+TEST(FastFp64ParserTest, FixedSeedRandomDifferential)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  std::mt19937_64 rng(0x4d50535f46415354ULL);  // fixed seed (ASCII bytes "MPS_FAST"): reproducible
+  for (int i = 0; i < 100000; ++i) {
+    std::string token = random_token(rng);
+    ASSERT_LE(token.size(), 25U);  // random_token() truncates to 25 characters
+    check_bitwise_strtod(token);  // fast parser must agree with strtod bit-for-bit
+  }
+}
+
+}  // namespace cuopt::linear_programming::io::detail
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
new file mode 100644
index 0000000000..771462a9ab
--- /dev/null
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
@@ -0,0 +1,936 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 + +#include "fast_parser.hpp" +#include "mps_section_scanner.hpp" + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuopt::linear_programming::io::detail { + +namespace { + +struct TempMpsFile { + explicit TempMpsFile(std::string contents) + { + char path_template[128]; + std::snprintf(path_template, + sizeof(path_template), + "/tmp/mps_fast_parser_edge_%ld_XXXXXX.mps", + static_cast(getpid())); + int fd = mkstemps(path_template, 4); + if (fd < 0) { + throw std::runtime_error(std::string("mkstemps failed: ") + std::strerror(errno)); + } + path = path_template; + FILE* file = fdopen(fd, "wb"); + if (file == nullptr) { + close(fd); + throw std::runtime_error(std::string("fdopen failed: ") + std::strerror(errno)); + } + if (!contents.empty() && + std::fwrite(contents.data(), 1, contents.size(), file) != contents.size()) { + std::fclose(file); + throw std::runtime_error(std::string("failed to write temporary MPS file: ") + + std::strerror(errno)); + } + if (std::fclose(file) != 0) { + throw std::runtime_error(std::string("failed to close temporary MPS file: ") + + std::strerror(errno)); + } + } + + TempMpsFile(const TempMpsFile&) = delete; + TempMpsFile& operator=(const TempMpsFile&) = delete; + + ~TempMpsFile() + { + if (!path.empty()) { std::remove(path.c_str()); } + } + + std::string path; +}; + +struct TempOwnedPath { + explicit TempOwnedPath(std::string p) : path(std::move(p)) {} + TempOwnedPath(const TempOwnedPath&) = delete; + TempOwnedPath& operator=(const TempOwnedPath&) = delete; + + ~TempOwnedPath() + { + if (!path.empty()) { std::remove(path.c_str()); } + } + + std::string path; +}; + +std::string_view range_text(const mps_phase_range_t& range) +{ + if (!range.present) { return {}; } + return std::string_view(range.begin, static_cast(range.end - range.begin)); 
+}
+
+// Raw bit pattern of a double, so comparisons are bit-exact instead of value-based.
+uint64_t bits(double value) { return std::bit_cast(value); }
+
+// Compares two vectors byte-for-byte with memcmp. A size mismatch is a fatal failure of this
+// helper (it returns early); `context` and `field` are attached to any failure via SCOPED_TRACE.
+template
+void expect_vectors_bitwise_equal(const std::vector& reference,
+                                  const std::vector& fast,
+                                  std::string_view field,
+                                  std::string_view context)
+{
+  static_assert(std::is_trivially_copyable_v);
+  SCOPED_TRACE(std::string(context) + " " + std::string(field));
+  ASSERT_EQ(reference.size(), fast.size()) << "size";
+  // Empty vectors have nothing to compare; skips memcmp on possibly-null data().
+  if (reference.empty()) { return; }
+  EXPECT_EQ(0, std::memcmp(reference.data(), fast.data(), reference.size() * sizeof(T)));
+}
+
+// Checks that the model produced by the fast parser matches the reference parser's model field
+// by field. Floating-point data is compared bitwise; names, indices and types with EXPECT_EQ.
+void check_models_match_reference_bitwise(const parser_model_t& fast,
+                                          const mps_data_model_t& reference,
+                                          std::string_view context)
+{
+  EXPECT_EQ(reference.n_vars_, fast.n_vars_) << std::string(context) + " n_vars";
+  EXPECT_EQ(reference.n_constraints_, fast.n_constraints_)
+    << std::string(context) + " n_constraints";
+  EXPECT_EQ(reference.get_nnz(), fast.get_nnz()) << std::string(context) + " nnz";
+  EXPECT_EQ(reference.maximize_, fast.maximize_) << std::string(context) + " maximize";
+  EXPECT_EQ(reference.problem_name_, fast.problem_name_) << std::string(context) + " problem_name";
+  EXPECT_EQ(reference.objective_name_, fast.objective_name_)
+    << std::string(context) + " objective_name";
+
+  EXPECT_EQ(bits(reference.objective_scaling_factor_), bits(fast.objective_scaling_factor_))
+    << std::string(context) + " objective_scaling_factor";
+  EXPECT_EQ(bits(reference.objective_offset_), bits(fast.objective_offset_))
+    << std::string(context) + " objective_offset";
+
+  expect_vectors_bitwise_equal(reference.A_, fast.A_, "A", context);
+  EXPECT_EQ(reference.A_indices_, fast.A_indices_) << std::string(context) + " A_indices";
+  EXPECT_EQ(reference.A_offsets_, fast.A_offsets_) << std::string(context) + " A_offsets";
+  expect_vectors_bitwise_equal(reference.b_, fast.b_, "b", context);
+  expect_vectors_bitwise_equal(reference.c_, fast.c_, "c", context);
+  expect_vectors_bitwise_equal(reference.variable_lower_bounds_,
+                               fast.variable_lower_bounds_,
+                               "variable_lower_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.variable_upper_bounds_,
+                               fast.variable_upper_bounds_,
+                               "variable_upper_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.constraint_lower_bounds_,
+                               fast.constraint_lower_bounds_,
+                               "constraint_lower_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.constraint_upper_bounds_,
+                               fast.constraint_upper_bounds_,
+                               "constraint_upper_bounds",
+                               context);
+  EXPECT_EQ(reference.var_types_, fast.var_types_) << std::string(context) + " var_types";
+  EXPECT_EQ(reference.row_types_, fast.row_types_) << std::string(context) + " row_types";
+  EXPECT_EQ(reference.var_names_, fast.var_names_) << std::string(context) + " var_names";
+  EXPECT_EQ(reference.row_names_, fast.row_names_) << std::string(context) + " row_names";
+
+  // Fatal on size mismatch so the loop below never indexes past the shorter container.
+  ASSERT_EQ(reference.quadratic_constraints_.size(), fast.quadratic_constraints_.size())
+    << std::string(context) + " quadratic_constraints size";
+  for (size_t q = 0; q < reference.quadratic_constraints_.size(); ++q) {
+    const auto& ref_qc = reference.quadratic_constraints_[q];
+    const auto& fast_qc = fast.quadratic_constraints_[q];
+    SCOPED_TRACE(std::string(context) + " quadratic_constraint " + std::to_string(q));
+    EXPECT_EQ(ref_qc.constraint_row_index, fast_qc.constraint_row_index);
+    EXPECT_EQ(ref_qc.constraint_row_name, fast_qc.constraint_row_name);
+    EXPECT_EQ(ref_qc.constraint_row_type, fast_qc.constraint_row_type);
+    EXPECT_EQ(bits(ref_qc.rhs_value), bits(fast_qc.rhs_value));
+    expect_vectors_bitwise_equal(
+      ref_qc.linear_values, fast_qc.linear_values, "linear_values", context);
+    EXPECT_EQ(ref_qc.linear_indices, fast_qc.linear_indices);
+    expect_vectors_bitwise_equal(ref_qc.vals, fast_qc.vals, "qc_vals", context);
+    EXPECT_EQ(ref_qc.rows, fast_qc.rows);
+    EXPECT_EQ(ref_qc.cols, fast_qc.cols);
+  }
+}
+
+// Parses `path` with the reference parser and returns the populated data model.
+// NOTE(review): the trailing `false` looks like the fixed-format flag - confirm against
+// mps_parser_t's constructor.
+mps_data_model_t parse_reference_model(const std::string& path)
+{
+  mps_data_model_t reference;
+  mps_parser_t parser(reference, path, false);
+  return reference;
+}
+
+// Writes `contents` to a temporary file, parses it with both the fast and the reference parser,
+// and checks that the resulting models agree.
+void verify_fixture_bitwise(std::string_view fixture_name, std::string contents)
+{
+  TempMpsFile file(std::move(contents));
+  auto fast = parse_mps_fast_file(file.path, FileReadMethod::Read);
+  auto reference = parse_reference_model(file.path);
+  check_models_match_reference_bitwise(fast, reference, fixture_name);
+}
+
+// Builds a row name of the form 'R' followed by `i` zero-padded to at least six digits.
+std::string row_name(size_t i)
+{
+  std::ostringstream out;
+  out << 'R' << std::setw(6) << std::setfill('0') << i;
+  return out.str();
+}
+
+// Linear search for `name` in the model's variable names; returns its index or -1 if absent.
+int find_var_index(const parser_model_t& model, std::string_view name)
+{
+  for (size_t i = 0; i < model.var_names_.size(); ++i) {
+    if (model.var_names_[i] == name) { return static_cast(i); }
+  }
+  return -1;
+}
+
+// Checks the model's row/variable/non-zero counts and that the constraint-matrix arrays have
+// the matching sizes (rows + 1 offsets, nnz values, nnz indices).
+void check_model_shapes(
+  const parser_model_t& model, int rows, int vars, int nnz, std::string_view context)
+{
+  EXPECT_EQ(rows, model.n_constraints_) << std::string(context) + " rows";
+  EXPECT_EQ(vars, model.n_vars_) << std::string(context) + " vars";
+  EXPECT_EQ(nnz, model.nnz_) << std::string(context) + " nnz";
+  EXPECT_EQ(static_cast(rows + 1), model.A_offsets_.size())
+    << std::string(context) + " offsets";
+  EXPECT_EQ(static_cast(nnz), model.A_.size()) << std::string(context) + " values";
+  EXPECT_EQ(static_cast(nnz), model.A_indices_.size()) << std::string(context) + " indices";
+}
+
+// Small MPS document containing ROWS, COLUMNS, RHS, BOUNDS and ENDATA headers; used by the
+// test that splits the input inside every header.
+std::string section_split_fixture()
+{
+  return "NAME SPLITS\n"
+         "ROWS\n"
+         " N OBJ\n"
+         " L R1\n"
+         "COLUMNS\n"
+         " X1 OBJ 1 R1 2\n"
+         "RHS\n"
+         " RHS1 R1 3\n"
+         "BOUNDS\n"
+         " UP BND X1 4\n"
+         "ENDATA\n";
+}
+
+// Returns a copy of `text` with every '\n' replaced by "\r\n".
+std::string to_crlf(std::string text)
+{
+  std::string converted;
+  // Reserve a little headroom for the inserted '\r' characters.
+  converted.reserve(text.size() + text.size() / 8);
+  for (char c : text) {
+    if (c == '\n') {
+      converted += "\r\n";
+    } else {
+      converted.push_back(c);
+    }
+  }
+  return converted;
+}
+
+} // namespace
+
+// Feeds the scanner two blocks, split three bytes into the "COLUMNS" header and observed out of
+// order (block 1 first), then checks that all sections are reported with correct boundaries.
+TEST(FastMpsParserEdgeTest, ScannerFindsSectionSplitAcrossBlocks)
+{
+  const std::string mps =
+    "NAME EDGE\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " x1 OBJ 1\n"
+    " x1 rowA 2\n"
+    "RHS\n"
+    " rhs rowA 3\n"
+    "ENDATA\n";
+
+  const size_t columns_pos = mps.find("COLUMNS");
+  // Fatal: continuing with npos would make `mps.data() + split` an out-of-range pointer.
+  ASSERT_TRUE(columns_pos != std::string::npos) << "failed to place COLUMNS split";
+  const size_t split = columns_pos + 3;
+
+  mps_phase_registry_t registry;
+  mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+
+  scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());
+  scanner.publish_ready(0);
+  scanner.observe_block(0, mps.data(), mps.data() + split);
+  scanner.publish_ready(mps.size());
+
+  EXPECT_TRUE(registry.ready(mps_phase_kind::header)) << "header not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic)) << "quadratic sentinel not ready";
+
+  EXPECT_TRUE(range_text(registry.range(mps_phase_kind::columns)).starts_with("COLUMNS"))
+    << "columns range begins at wrong boundary";
+  EXPECT_TRUE(range_text(registry.range(mps_phase_kind::rhs)).starts_with("RHS"))
+    << "rhs range begins at wrong boundary";
+}
+
+// For each section header, splits the input at every interior byte of the header and checks
+// that the scanner still reports every section as ready.
+TEST(FastMpsParserEdgeTest, ScannerFindsHeadersSplitAtEveryByte)
+{
+  const std::string mps = section_split_fixture();
+  const std::vector headers = {"ROWS", "COLUMNS", "RHS", "BOUNDS", "ENDATA"};
+
+  for (std::string_view header : headers) {
+    const size_t pos = mps.find(header);
+    // Fatal: the split offsets below are derived from `pos` and must stay inside `mps`.
+    ASSERT_TRUE(pos != std::string::npos) << "missing header in split fixture";
+    for (size_t offset = 1; offset < header.size(); ++offset) {
+      const size_t split = pos + offset;
+      mps_phase_registry_t registry;
+      mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+
+      scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());
+      scanner.observe_block(0, mps.data(), mps.data() + split);
+      scanner.publish_ready(mps.size());
+
+      EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready after split";
+
EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns not ready after split"; + EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready after split"; + EXPECT_TRUE(registry.ready(mps_phase_kind::bounds)) << "bounds not ready after split"; + EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic)) + << "quadratic sentinel not ready after split"; + } + } +} + +TEST(FastMpsParserEdgeTest, ScannerRejectsUnknownColumnOneRecordsAfterRows) +{ + const std::string mps = + "NAME BAD\n" + "ROWS\n" + " N OBJ\n" + "FOO\n" + "COLUMNS\n" + " x OBJ 1\n" + "ENDATA\n"; + + EXPECT_THROW( + { + mps_phase_registry_t registry; + mps_section_block_scanner_t scanner(mps.data(), 1, registry); + scanner.observe_block(0, mps.data(), mps.data() + mps.size()); + scanner.publish_ready(mps.size()); + }, + std::logic_error); +} + +TEST(FastMpsParserEdgeTest, ParserRejectsUnknownSectionRecords) +{ + TempMpsFile file( + "NAME BAD_UNKNOWN_SECTION\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 3\n" + "BOUNDS\n" + " FR BND1 X1\n" + "QSECTION R1\n" + " X1 X1 1\n" + "ENDATA\n"); + + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::exception); +} + +TEST(FastMpsParserEdgeTest, BoundsDefaultsAndTypesMatchReference) +{ + verify_fixture_bitwise("bounds_defaults_and_types", + "NAME BOUNDS_EDGE\n" + "ROWS\n" + " N OBJ\n" + " L rowA\n" + "COLUMNS\n" + " XFREE rowA 1\n" + " XUP0 rowA 1\n" + " XNEG rowA 1\n" + " XBV rowA 1\n" + " XFX rowA 1\n" + " XLI rowA 1\n" + "RHS\n" + " RHS1 rowA 10\n" + "BOUNDS\n" + " FR BND XFREE\n" + " UP BND XUP0 0\n" + " UP BND XNEG -1\n" + " BV BND XBV\n" + " FX BND XFX 7\n" + " LI BND XLI 2\n" + " UI BND XLI 9\n" + "ENDATA\n"); +} + +TEST(FastMpsParserEdgeTest, DuplicateBoundsLastStatementWins) +{ + const std::string contents = + "NAME BOUNDS_DUP\n" + "ROWS\n" + " N OBJ\n" + " L rowA\n" + "COLUMNS\n" + " X1 rowA 1\n" + "RHS\n" + " RHS1 rowA 10\n" + "BOUNDS\n" + " LO BND X1 0\n" + " 
UP BND X1 5\n" + " UP BND X1 3\n" + " LO BND X1 2\n" + "ENDATA\n"; + + verify_fixture_bitwise("duplicate_bounds_last_statement_wins", contents); + TempMpsFile file(contents); + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); + EXPECT_EQ(1, model.n_vars_) << "n_vars"; + EXPECT_EQ(2.0, model.variable_lower_bounds_.at(0)) << "duplicate lower bound"; + EXPECT_EQ(3.0, model.variable_upper_bounds_.at(0)) << "duplicate upper bound"; +} + +TEST(FastMpsParserEdgeTest, NondenseRowAndColumnNamesUseHashPath) +{ + verify_fixture_bitwise("nondense_row_and_column_names", + "NAME HASH_NAMES\n" + "ROWS\n" + " N obj.row\n" + " G demand-east\n" + " L capacity-west\n" + " E balance.17\n" + "COLUMNS\n" + " alpha obj.row 4.5 demand-east 1\n" + " beta_two capacity-west -2 balance.17 3\n" + " z-last demand-east 7 balance.17 -1\n" + "RHS\n" + " rhs demand-east 2 capacity-west 9\n" + " rhs balance.17 0\n" + "BOUNDS\n" + " LO b alpha -5\n" + " UP b beta_two 6\n" + " FR b z-last\n" + "ENDATA\n"); +} + +TEST(FastMpsParserEdgeTest, MissingOptionalBoundsFastPath) +{ + TempMpsFile file( + "NAME OPTIONALS\n" + "ROWS\n" + " N OBJ\n" + " L rowA\n" + "COLUMNS\n" + " X1 OBJ 1 rowA 2\n" + "RHS\n" + " RHS1 rowA 0\n" + "ENDATA\n"); + + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); + EXPECT_EQ(1, model.n_vars_) << "missing optional n_vars"; + EXPECT_EQ(1, model.n_constraints_) << "missing optional n_constraints"; + EXPECT_EQ(0.0, model.variable_lower_bounds_.at(0)) << "missing BOUNDS lower default"; + EXPECT_EQ(std::numeric_limits::infinity(), model.variable_upper_bounds_.at(0)); +} + +TEST(FastMpsParserEdgeTest, BoundsOnlyVariablesAreAppendedDeterministically) +{ + TempMpsFile file( + "NAME BOUNDS_ONLY\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " XMAIN OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "BOUNDS\n" + " UP B AUX_Z 9\n" + " LO B AUX_Z -3\n" + " BV B AUX_A\n" + " SC B AUX_S 5\n" + "ENDATA\n"); + + auto model = parse_mps_fast_file(file.path, 
FileReadMethod::Read); + check_model_shapes(model, 1, 4, 1, "bounds-only"); + EXPECT_EQ(std::string("XMAIN"), model.var_names_.at(0)) << "main var name"; + EXPECT_EQ(std::string("AUX_A"), model.var_names_.at(1)) << "bounds-only sorted name 1"; + EXPECT_EQ(std::string("AUX_S"), model.var_names_.at(2)) << "bounds-only sorted name 2"; + EXPECT_EQ(std::string("AUX_Z"), model.var_names_.at(3)) << "bounds-only sorted name 3"; + + const int aux_a = find_var_index(model, "AUX_A"); + const int aux_s = find_var_index(model, "AUX_S"); + const int aux_z = find_var_index(model, "AUX_Z"); + ASSERT_GE(aux_a, 0); + ASSERT_GE(aux_s, 0); + ASSERT_GE(aux_z, 0); + EXPECT_EQ('I', model.var_types_.at(aux_a)) << "bounds-only BV type"; + EXPECT_EQ(0.0, model.variable_lower_bounds_.at(aux_a)) << "bounds-only BV lb"; + EXPECT_EQ(1.0, model.variable_upper_bounds_.at(aux_a)) << "bounds-only BV ub"; + EXPECT_EQ('S', model.var_types_.at(aux_s)) << "bounds-only SC type"; + EXPECT_EQ(5.0, model.variable_upper_bounds_.at(aux_s)) << "bounds-only SC ub"; + EXPECT_EQ(-3.0, model.variable_lower_bounds_.at(aux_z)) << "bounds-only duplicate lb"; + EXPECT_EQ(9.0, model.variable_upper_bounds_.at(aux_z)) << "bounds-only duplicate ub"; +} + +TEST(FastMpsParserEdgeTest, IntegerMarkersAssignTypesAndDefaultBounds) +{ + TempMpsFile file( + "NAME MARKERS\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " MARK000 'MARKER' 'INTORG'\n" + " XINT OBJ 1 R1 1\n" + " MARK001 'MARKER' 'INTEND'\n" + " XCONT OBJ 2 R1 2\n" + " MARK002 'MARKER' 'INTORG'\n" + " XBIN OBJ 3 R1 3\n" + " MARK003 'MARKER' 'INTEND'\n" + "RHS\n" + " RHS1 R1 10\n" + "ENDATA\n"); + + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); + check_model_shapes(model, 1, 3, 3, "integer markers"); + const int xint = find_var_index(model, "XINT"); + const int xcont = find_var_index(model, "XCONT"); + const int xbin = find_var_index(model, "XBIN"); + ASSERT_GE(xint, 0); + ASSERT_GE(xcont, 0); + ASSERT_GE(xbin, 0); + EXPECT_EQ('I', 
model.var_types_.at(xint)) << "XINT type"; + EXPECT_EQ('C', model.var_types_.at(xcont)) << "XCONT type"; + EXPECT_EQ('I', model.var_types_.at(xbin)) << "XBIN type"; + EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xint)) << "XINT default lb"; + EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xint)) << "XINT default ub"; + EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xbin)) << "XBIN default lb"; + EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xbin)) << "XBIN default ub"; +} + +TEST(FastMpsParserEdgeTest, NumericParsingIntegrationMatchesReferenceBitwise) +{ + verify_fixture_bitwise("numeric_parsing_integration", + "NAME NUMBERS\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + " G R2\n" + " E R3\n" + "COLUMNS\n" + " X0 OBJ 0.12345678901234 R1 1e-9\n" + " X1 OBJ -2.5E3 R2 0.12345678901234567890123\n" + " X2 R3 9999999999999999\n" + "RHS\n" + " RHS1 R1 3.14159 R2 -0.000000000000001\n" + " RHS1 R3 42\n" + "RANGES\n" + " RNG R1 0.25 R2 1E2\n" + "BOUNDS\n" + " LO B X0 -123456789\n" + " UP B X0 123456789\n" + " FX B X1 0.3333333333333333\n" + " FR B X2\n" + "ENDATA\n"); +} + +TEST(FastMpsParserEdgeTest, CrlfLineEndingsMatchReferenceBitwise) +{ + verify_fixture_bitwise("crlf_line_endings", + to_crlf("NAME CRLF_EDGE\n" + "OBJSENSE\n" + " MAX\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 3\n" + "BOUNDS\n" + " UP B X1 4\n" + "ENDATA\n")); +} + +TEST(FastMpsParserEdgeTest, CommentPlacementSupportedCasesMatchReferenceBitwise) +{ + verify_fixture_bitwise("comment_placement_supported_cases", + "* leading star comment\n" + "$ leading dollar comment\n" + "NAME COMMENTS\n" + "$ comment between NAME and ROWS\n" + "ROWS\n" + "* comment after ROWS header\n" + " N OBJ $ row objective comment\n" + "$ comment between ROW records\n" + " L R1 $ row constraint comment\n" + "COLUMNS\n" + "* comment after COLUMNS header\n" + " X1 OBJ 1 R1 2 $ inline column comment\n" + "$ comment before next column\n" + " X2 OBJ -1 R1 3\n" + "RHS\n" + "$ comment after 
RHS header\n" + " RHS1 R1 5 $ inline rhs comment\n" + "BOUNDS\n" + "* comment after BOUNDS header\n" + " LO B X1 0 $ inline bound comment\n" + "$ comment before ENDATA\n" + "ENDATA\n"); +} + +TEST(FastMpsParserEdgeTest, ObjectiveMetadataSelectsNamedObjective) +{ + verify_fixture_bitwise("objective_metadata", + "NAME OBJMETA\n" + "OBJSENSE\n" + " MAX\n" + "OBJNAME\n" + " COST\n" + "ROWS\n" + " N ALT\n" + " N COST\n" + " L R1\n" + "COLUMNS\n" + " X1 ALT 100 COST 5\n" + " X1 R1 1\n" + " X2 COST -2 R1 3\n" + "RHS\n" + " RHS1 COST 7 R1 11\n" + "ENDATA\n"); +} + +TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors) +{ + { + TempMpsFile file( + "NAME BADOBJ\n" + "OBJSENSE\n" + " SIDEWAYS\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "ENDATA\n"); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::logic_error); + } + + { + TempMpsFile file( + "NAME BADCOLROW\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 MISSING 1\n" + "RHS\n" + " RHS1 R1 0\n" + "ENDATA\n"); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::logic_error); + } + + { + TempMpsFile file( + "NAME BADRHSROW\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 MISSING 1\n" + "ENDATA\n"); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::logic_error); + } + + { + TempMpsFile file( + "NAME BADBOUND\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "BOUNDS\n" + " XX B X1 1\n" + "ENDATA\n"); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::logic_error); + } + + { + TempMpsFile file( + "NAME BADSC\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "BOUNDS\n" + " SC B X1\n" + "ENDATA\n"); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + 
std::logic_error); + } +} + +TEST(FastMpsParserEdgeTest, LargeColumnsRepeatedColumnChunkBoundary) +{ + constexpr size_t row_count = 180000; + std::string mps; + mps.reserve(8 * 1024 * 1024); + mps += "NAME BIGCOLS\nROWS\n N OBJ\n"; + for (size_t i = 1; i <= row_count; ++i) { + mps += " L "; + mps += row_name(i); + mps += '\n'; + } + mps += "COLUMNS\n"; + for (size_t i = 1; i <= row_count; ++i) { + mps += " XBIG "; + mps += row_name(i); + mps += " 1\n"; + } + mps += " XTAIL "; + mps += row_name(1); + mps += " 2\nRHS\n RHS1 "; + mps += row_name(1); + mps += " 0\nENDATA\n"; + + TempMpsFile file(std::move(mps)); + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); + check_model_shapes( + model, static_cast(row_count), 2, static_cast(row_count + 1), "large columns"); + EXPECT_EQ(std::string("XBIG"), model.var_names_.at(0)) << "large repeated column name"; + EXPECT_EQ(std::string("XTAIL"), model.var_names_.at(1)) << "large tail column name"; +} + +TEST(FastMpsParserEdgeTest, LargeBoundsRepeatedVarStaysOrdered) +{ + constexpr size_t repeat_count = 700000; + std::string mps; + mps.reserve(12 * 1024 * 1024); + mps += + "NAME BIGBOUNDS\nROWS\n N OBJ\n L R1\nCOLUMNS\n alpha OBJ 1 R1 1\nRHS\n RHS1 R1 0\nBOUNDS\n"; + for (size_t i = 0; i < repeat_count; ++i) { + mps += " UP B alpha "; + mps += std::to_string(i % 1000); + mps += '\n'; + } + mps += "ENDATA\n"; + + TempMpsFile file(std::move(mps)); + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); + check_model_shapes(model, 1, 1, 1, "large bounds"); + EXPECT_EQ(static_cast((repeat_count - 1) % 1000), model.variable_upper_bounds_.at(0)) + << "large repeated bounds last value"; +} + +TEST(FastMpsParserEdgeTest, Lz4AndRawPathsMatchOnMultiblockInput) +{ + constexpr size_t row_count = 70000; + std::string mps; + mps.reserve(4 * 1024 * 1024); + mps += "NAME LZ4PARITY\nROWS\n N OBJ\n"; + for (size_t i = 1; i <= row_count; ++i) { + mps += " L "; + mps += row_name(i); + mps += '\n'; + } + mps += 
"COLUMNS\n"; + for (size_t i = 1; i <= row_count; ++i) { + mps += " X"; + mps += std::to_string(i); + mps += ' '; + mps += row_name(i); + mps += " 0.125\n"; + } + mps += "RHS\n RHS1 "; + mps += row_name(1); + mps += " 1\nENDATA\n"; + + TempMpsFile raw_file(std::move(mps)); + TempOwnedPath lz4_file(raw_file.path + ".lz4"); + const std::string cmd = "lz4 -f -q " + raw_file.path + " " + lz4_file.path; + if (std::system(cmd.c_str()) != 0) { GTEST_SKIP() << "lz4 CLI unavailable"; } + + auto raw = parse_mps_fast_file(raw_file.path, FileReadMethod::Read); + auto lz4 = parse_mps_fast_file(lz4_file.path, FileReadMethod::Read); + + check_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity"); + EXPECT_EQ(raw.var_names_.size(), lz4.var_names_.size()) << "lz4 var name count"; + EXPECT_EQ(raw.row_names_.size(), lz4.row_names_.size()) << "lz4 row name count"; + EXPECT_EQ(raw.A_, lz4.A_) << "lz4 A values"; + EXPECT_EQ(raw.A_indices_, lz4.A_indices_) << "lz4 A indices"; + EXPECT_EQ(raw.A_offsets_, lz4.A_offsets_) << "lz4 A offsets"; + EXPECT_EQ(raw.c_, lz4.c_) << "lz4 objective"; + EXPECT_EQ(raw.b_, lz4.b_) << "lz4 rhs"; + EXPECT_EQ(raw.var_types_, lz4.var_types_) << "lz4 var types"; + EXPECT_EQ(raw.variable_lower_bounds_, lz4.variable_lower_bounds_) << "lz4 lower bounds"; + EXPECT_EQ(raw.variable_upper_bounds_, lz4.variable_upper_bounds_) << "lz4 upper bounds"; +} + +TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch) +{ + std::string mps; + mps += "NAME COMPRESSED\nROWS\n N OBJ\n L R1\n G R2\nCOLUMNS\n"; + mps += " X1 OBJ 1 R1 2.5\n X2 R1 -3.25 R2 4\n"; + mps += "RHS\n RHS1 R1 7 R2 8\nBOUNDS\n BV BND X1\n UP BND X2 10\nENDATA\n"; + + TempMpsFile raw_file(std::move(mps)); + TempOwnedPath gzip_file(raw_file.path + ".gz"); + TempOwnedPath bzip2_file(raw_file.path + ".bz2"); + + const std::string gzip_cmd = "gzip -c " + raw_file.path + " > " + gzip_file.path; + const std::string bzip2_cmd = "bzip2 -c " + raw_file.path + " > " + bzip2_file.path; + if 
(std::system(gzip_cmd.c_str()) != 0) { GTEST_SKIP() << "gzip CLI unavailable"; } + if (std::system(bzip2_cmd.c_str()) != 0) { GTEST_SKIP() << "bzip2 CLI unavailable"; } + + auto raw = parse_mps_fast_file(raw_file.path, FileReadMethod::Read); + auto gzip = parse_mps_fast_file(gzip_file.path, FileReadMethod::Read); + auto bzip2 = parse_mps_fast_file(bzip2_file.path, FileReadMethod::Read); + + check_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity"); + check_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity"); + EXPECT_EQ(raw.A_, gzip.A_) << "gzip A values"; + EXPECT_EQ(raw.A_, bzip2.A_) << "bzip2 A values"; + EXPECT_EQ(raw.A_indices_, gzip.A_indices_) << "gzip A indices"; + EXPECT_EQ(raw.A_indices_, bzip2.A_indices_) << "bzip2 A indices"; + EXPECT_EQ(raw.A_offsets_, gzip.A_offsets_) << "gzip A offsets"; + EXPECT_EQ(raw.A_offsets_, bzip2.A_offsets_) << "bzip2 A offsets"; + EXPECT_EQ(raw.c_, gzip.c_) << "gzip objective"; + EXPECT_EQ(raw.c_, bzip2.c_) << "bzip2 objective"; + EXPECT_EQ(raw.b_, gzip.b_) << "gzip rhs"; + EXPECT_EQ(raw.b_, bzip2.b_) << "bzip2 rhs"; + EXPECT_EQ(raw.variable_lower_bounds_, gzip.variable_lower_bounds_) << "gzip lower bounds"; + EXPECT_EQ(raw.variable_lower_bounds_, bzip2.variable_lower_bounds_) << "bzip2 lower bounds"; + EXPECT_EQ(raw.variable_upper_bounds_, gzip.variable_upper_bounds_) << "gzip upper bounds"; + EXPECT_EQ(raw.variable_upper_bounds_, bzip2.variable_upper_bounds_) << "bzip2 upper bounds"; + EXPECT_EQ(raw.var_types_, gzip.var_types_) << "gzip var types"; + EXPECT_EQ(raw.var_types_, bzip2.var_types_) << "bzip2 var types"; +} + +TEST(FastMpsParserEdgeTest, QcMatrixRowsMatchReferenceBitwise) +{ + verify_fixture_bitwise("qcmatrix rows", + "NAME QCMATRIX_TEST\n" + "ROWS\n" + " N OBJ\n" + " L LIN\n" + " L QC1\n" + " G QC2\n" + "COLUMNS\n" + " X1 OBJ 1 LIN 2\n" + " X1 QC1 3 QC2 4\n" + " X2 OBJ 2 LIN 5\n" + " X2 QC1 6 QC2 7\n" + "RHS\n" + " RHS1 LIN 10 QC1 11\n" + " RHS1 QC2 12\n" + 
"QCMATRIX QC1\n" + " X1 X1 1.25\n" + " X1 X2 -2.5\n" + "QCMATRIX QC2\n" + " X2 X2 3.75\n" + "ENDATA\n"); +} + +TEST(FastMpsParserEdgeTest, QcMatrixMalformedCasesMatchReference) +{ + const std::vector cases = { + "NAME DUP_QC\n" + "ROWS\n" + " N OBJ\n" + " L QC1\n" + "COLUMNS\n" + " X1 OBJ 1 QC1 2\n" + "RHS\n" + " RHS1 QC1 3\n" + "QCMATRIX QC1\n" + " X1 X1 1\n" + "QCMATRIX QC1\n" + " X1 X1 2\n" + "ENDATA\n", + "NAME BAD_QC_ROW\n" + "ROWS\n" + " N OBJ\n" + " L QC1\n" + "COLUMNS\n" + " X1 OBJ 1 QC1 2\n" + "RHS\n" + " RHS1 QC1 3\n" + "QCMATRIX UNKNOWN\n" + " X1 X1 1\n" + "ENDATA\n", + "NAME BAD_QC_VAR\n" + "ROWS\n" + " N OBJ\n" + " L QC1\n" + "COLUMNS\n" + " X1 OBJ 1 QC1 2\n" + "RHS\n" + " RHS1 QC1 3\n" + "QCMATRIX QC1\n" + " X1 XBAD 1\n" + "ENDATA\n"}; + + for (const auto& mps : cases) { + TempMpsFile file(mps); + EXPECT_THROW(((void)parse_reference_model(file.path)), std::exception); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::exception); + } +} + +TEST(FastMpsParserEdgeTest, QuadraticParserRejectsUnknownColumnOneRecords) +{ + const std::vector records = {"QSECTION QC1", + "CSECTION QC1 0 QUAD"}; + + for (const auto& record : records) { + TempMpsFile file( + "NAME BAD_QUAD_RECORD\n" + "ROWS\n" + " N OBJ\n" + " L QC1\n" + "COLUMNS\n" + " X1 OBJ 1 QC1 2\n" + " X2 OBJ 3 QC1 4\n" + "RHS\n" + " RHS1 QC1 5\n" + "QMATRIX\n" + " X1 X1 1\n" + + record + + "\n" + " X2 X2 2\n" + "ENDATA\n"); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::exception) + << record; + } +} + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp index af1368865d..70f7beb2dc 100644 --- a/cpp/tests/linear_programming/parser_test.cpp +++ b/cpp/tests/linear_programming/parser_test.cpp @@ -56,6 +56,21 @@ bool file_exists(const std::string& file) namespace { +struct mps_reader_param_t { + const char* name; + mps_reader_type_t 
reader; +}; + +constexpr mps_reader_param_t default_mps_reader_param{"default_reader", + mps_reader_type_t::default_reader}; +constexpr mps_reader_param_t fast_mps_reader_param{"fast_experimental", + mps_reader_type_t::fast_experimental}; + +std::string mps_reader_param_name(const ::testing::TestParamInfo& info) +{ + return info.param.name; +} + // Non-template forwarding wrapper around read_lp_from_string. // Exists only so EXPECT_THROW(read_lp_string(R"LP(...)LP"), exc) is parsed // correctly — gtest's macro splits its args on top-level commas, and the @@ -115,20 +130,21 @@ double q_entry(const mps_data_model_t& m, int row, int col) // =========================================================================== // Per-fixture test classes. Each class describes one named problem fixture // and owns the checker for that problem's expected parsed data model. The -// MPS and LP TEST_F cases within a fixture share the same `check_model` +// MPS TEST_P and LP TEST_F cases within a fixture share the same `check_model` // method, so the expected values live in exactly one place per fixture. // // All fixtures inherit a common base that supplies read_mps_file and // read_lp_file helpers. 
// =========================================================================== -class parser_fixture_base : public ::testing::Test { +class parser_fixture_base : public ::testing::TestWithParam { protected: - static mps_data_model_t read_mps_file(const std::string& file, - bool fixed_format = true) + mps_data_model_t read_mps_file(const std::string& file, + bool fixed_format = true) const { const std::string& root = cuopt::test::get_rapids_dataset_root_dir(); - return read_mps(root + "/" + file, fixed_format); + const auto reader = GetParam().reader; + return read(root + "/" + file, reader, fixed_format); } static mps_data_model_t read_lp_file(const std::string& file) @@ -357,9 +373,13 @@ TEST(mps_parser, bad_mps_files) } } -TEST_F(good_mps_1_test, mps) +TEST_P(good_mps_1_test, mps) +{ + check_model(read_mps_file("linear_programming/good-mps-1.mps", false)); +} + +TEST_F(good_mps_1_test, mps_parser_internals) { - check_model(read_mps_file("linear_programming/good-mps-1.mps")); // Parser-struct fields that are MPS-only (not exposed via the data model). 
auto mps = read_from_mps("linear_programming/good-mps-1.mps"); EXPECT_EQ("good-1", mps.problem_name); @@ -592,9 +612,13 @@ TEST(mps_parser_free_format, bad_mps_files_free_format) } } -TEST_F(up_low_bounds_test, mps) +TEST_P(up_low_bounds_test, mps) { check_model(read_mps_file("linear_programming/lp_model_with_var_bounds.mps", false)); +} + +TEST_F(up_low_bounds_test, mps_parser_internals) +{ auto mps = read_from_mps("linear_programming/lp_model_with_var_bounds.mps", false); EXPECT_EQ("lp_model_with_var_bounds", mps.problem_name); EXPECT_EQ("OBJ", mps.objective_name); @@ -607,14 +631,14 @@ TEST_F(up_low_bounds_test, lp) check_model(read_lp_file("linear_programming/lp_model_with_var_bounds.lp")); } -TEST_F(good_mps_1_test, mps_free_format) +TEST_P(good_mps_1_test, mps_free_format) { // free-format-mps-1.mps encodes the same problem as good-mps-1 with default // [0, +inf) bounds (no BOUNDS section), so it satisfies the same checker. check_model(read_mps_file("linear_programming/free-format-mps-1.mps", false)); } -TEST_F(some_var_bounds_test, mps) +TEST_P(some_var_bounds_test, mps) { check_model(read_mps_file("linear_programming/good-mps-some-var-bounds.mps")); } @@ -624,7 +648,7 @@ TEST_F(some_var_bounds_test, lp) check_model(read_lp_file("linear_programming/good-mps-some-var-bounds.lp")); } -TEST_F(fixed_var_bound_test, mps) +TEST_P(fixed_var_bound_test, mps) { check_model(read_mps_file("linear_programming/good-mps-fixed-var.mps")); } @@ -634,7 +658,7 @@ TEST_F(fixed_var_bound_test, lp) check_model(read_lp_file("linear_programming/good-mps-fixed-var.lp")); } -TEST_F(free_var_bound_test, mps) +TEST_P(free_var_bound_test, mps) { check_model(read_mps_file("linear_programming/good-mps-free-var.mps")); } @@ -644,7 +668,7 @@ TEST_F(free_var_bound_test, lp) check_model(read_lp_file("linear_programming/good-mps-free-var.lp")); } -TEST_F(lower_inf_var_bound_test, mps) +TEST_P(lower_inf_var_bound_test, mps) { 
check_model(read_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps")); } @@ -662,7 +686,7 @@ TEST(mps_bounds, rhs_cost) EXPECT_EQ(int(-5), mps.objective_offset_value); } -TEST_F(upper_inf_var_bound_test, mps) +TEST_P(upper_inf_var_bound_test, mps) { check_model(read_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps")); } @@ -817,9 +841,13 @@ TEST(mps_bounds, unsupported_or_invalid_mps_types) }; } -TEST_F(mip_with_bounds_test, mps) +TEST_P(mip_with_bounds_test, mps) { check_model(read_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false)); +} + +TEST_F(mip_with_bounds_test, mps_parser_internals) +{ auto mps = read_from_mps("mixed_integer_programming/good-mip-mps-1.mps", false); EXPECT_EQ("COST", mps.objective_name); ASSERT_EQ(int(2), mps.row_types.size()); @@ -877,7 +905,7 @@ TEST(mps_parser, good_mps_file_mip_no_marker) EXPECT_EQ(10., mps.variable_upper_bounds[1]); } -TEST_F(mip_no_bounds_test, mps) +TEST_P(mip_no_bounds_test, mps) { check_model(read_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false)); } @@ -887,7 +915,7 @@ TEST_F(mip_no_bounds_test, lp) check_model(read_lp_file("mixed_integer_programming/good-mip-mps-no-bounds.lp")); } -TEST_F(mip_partial_bounds_test, mps) +TEST_P(mip_partial_bounds_test, mps) { check_model(read_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false)); } @@ -897,6 +925,32 @@ TEST_F(mip_partial_bounds_test, lp) check_model(read_lp_file("mixed_integer_programming/good-mip-mps-partial-bounds.lp")); } +#define INSTANTIATE_MPS_READER_TEST(Fixture) \ + INSTANTIATE_TEST_SUITE_P(mps_readers, \ + Fixture, \ + ::testing::Values(default_mps_reader_param, fast_mps_reader_param), \ + mps_reader_param_name) + +#define INSTANTIATE_DEFAULT_MPS_READER_TEST(Fixture) \ + INSTANTIATE_TEST_SUITE_P( \ + mps_readers, Fixture, ::testing::Values(default_mps_reader_param), mps_reader_param_name) + +INSTANTIATE_MPS_READER_TEST(good_mps_1_test); 
+INSTANTIATE_MPS_READER_TEST(up_low_bounds_test); +INSTANTIATE_MPS_READER_TEST(mip_with_bounds_test); +INSTANTIATE_MPS_READER_TEST(mip_no_bounds_test); +INSTANTIATE_MPS_READER_TEST(mip_partial_bounds_test); +// fast mps parser doesn't support fixed format +INSTANTIATE_DEFAULT_MPS_READER_TEST(some_var_bounds_test); +INSTANTIATE_DEFAULT_MPS_READER_TEST(fixed_var_bound_test); +INSTANTIATE_DEFAULT_MPS_READER_TEST(free_var_bound_test); +INSTANTIATE_DEFAULT_MPS_READER_TEST(lower_inf_var_bound_test); +INSTANTIATE_DEFAULT_MPS_READER_TEST(upper_inf_var_bound_test); + +// NOTE: INSTANTIATE_MPS_READER_TEST / INSTANTIATE_DEFAULT_MPS_READER_TEST are intentionally +// left defined here; the QP/QCQP file fixtures below reuse them. They are #undef-ed after the +// last instantiation. + #ifdef MPS_PARSER_WITH_BZIP2 TEST(mps_parser, good_mps_file_bzip2_compressed) { @@ -998,13 +1052,14 @@ TEST(qps_parser, quadratic_objective_basic) EXPECT_EQ(1.0, model.get_quadratic_objective_values()[1]); } +class qps_file_reader_test : public parser_fixture_base {}; + // Test actual QPS files from the dataset -TEST(qps_parser, test_qps_files) +TEST_P(qps_file_reader_test, test_qps_files) { // Test QP_Test_1.qps if it exists if (file_exists("quadratic_programming/QP_Test_1.qps")) { - auto parsed_data = read_mps( - cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_1.qps", false); + auto parsed_data = read_mps_file("quadratic_programming/QP_Test_1.qps", false); EXPECT_EQ("QP_Test_1", parsed_data.get_problem_name()); EXPECT_EQ(2, parsed_data.get_n_variables()); // C------1 and C------2 @@ -1023,8 +1078,7 @@ TEST(qps_parser, test_qps_files) // Test QP_Test_2.qps if it exists if (file_exists("quadratic_programming/QP_Test_2.qps")) { - auto parsed_data = read_mps( - cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_2.qps", false); + auto parsed_data = read_mps_file("quadratic_programming/QP_Test_2.qps", false); EXPECT_EQ("QP_Test_2", 
parsed_data.get_problem_name()); EXPECT_EQ(3, parsed_data.get_n_variables()); // C------1, C------2, C------3 @@ -2582,6 +2636,19 @@ TEST(read, qps_extension_dispatches_to_mps_parser) EXPECT_EQ(m.get_variable_names()[0], "x"); } +TEST(read, qps_extension_dispatches_to_fast_experimental_reader) +{ + temp_file_t tmp(".qps"); + { + std::ofstream out(tmp.string()); + out << kTrivialMps; + } + auto m = read(tmp.string(), mps_reader_type_t::fast_experimental); + ASSERT_EQ(m.get_variable_names().size(), 1u); + EXPECT_EQ(m.get_variable_names()[0], "x"); + EXPECT_NEAR(m.get_variable_upper_bounds()[0], 10.0, tolerance); +} + TEST(read, mps_gz_extension_dispatches_to_mps_parser) { auto m = read(cuopt::test::get_rapids_dataset_root_dir() + @@ -2796,13 +2863,12 @@ TEST(qps_parser, qcmatrix_append_api) } // QCQP MPS: each quadratic constraint bundles row + linear + rhs + quadratic. -TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) +TEST_P(qps_file_reader_test, qcmatrix_mps_linear_rhs_and_bounds) { if (!file_exists("qcqp/QC_Test_1.mps")) { GTEST_SKIP() << "qcqp/QC_Test_1.mps not in dataset root"; } - const auto model = read_mps( - cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/QC_Test_1.mps", false); + const auto model = read_mps_file("qcqp/QC_Test_1.mps", false); ASSERT_TRUE(model.has_quadratic_constraints()); const auto& qcs = model.get_quadratic_constraints(); @@ -2848,13 +2914,12 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) EXPECT_DOUBLE_EQ(10.0, qcs[1].rhs_value); } -TEST(qps_parser, qcqp_p0033_mps_sections) +TEST_P(qps_file_reader_test, qcqp_p0033_mps_sections) { if (!file_exists("qcqp/p0033_qc1.mps")) { GTEST_SKIP() << "qcqp/p0033_qc1.mps not in dataset root"; } - const auto model = read_mps( - cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps", false); + const auto model = read_mps_file("qcqp/p0033_qc1.mps", false); EXPECT_EQ(12, model.get_n_constraints()); EXPECT_EQ(33, model.get_n_variables()); @@ -2897,4 +2962,9 @@ 
TEST(mps_roundtrip, qcqp_p0033_qc1) auto reloaded_2 = read_mps(temp_file_2.string(), false); compare_data_models(reloaded, reloaded_2); } + +INSTANTIATE_MPS_READER_TEST(qps_file_reader_test); + +#undef INSTANTIATE_MPS_READER_TEST +#undef INSTANTIATE_DEFAULT_MPS_READER_TEST } // namespace cuopt::linear_programming::io diff --git a/thirdparty/THIRD_PARTY_LICENSES b/thirdparty/THIRD_PARTY_LICENSES index a70fa8ce1c..e09000b56d 100644 --- a/thirdparty/THIRD_PARTY_LICENSES +++ b/thirdparty/THIRD_PARTY_LICENSES @@ -512,3 +512,63 @@ Copyright notice: Jean-loup Gailly Mark Adler jloup@gzip.org madler@alumni.caltech.edu + + +----------------------------------------------------------------------------------------- +== LZ4 + +Usage: cuopt uses LZ4 through dynamically loaded library symbols + +Copyright (c) Yann Collet. All rights reserved. + +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +----------------------------------------------------------------------------------------- +== SIMDe + +Usage: cuopt uses SIMDe in experimental fast MPS parser SIMD compatibility code + +Copyright (c) 2017 Evan Nemerson + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.