From 4d2ec827c22028afc947acbe06859a0b6b6af8b9 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Wed, 3 Jun 2026 04:09:10 -0700 Subject: [PATCH 01/22] port fast mps parser to tree --- cpp/CMakeLists.txt | 30 +- cpp/cuopt_cli.cpp | 22 +- .../cuopt/linear_programming/io/parser.hpp | 59 +- cpp/src/CMakeLists.txt | 1 + cpp/src/io/CMakeLists.txt | 10 + .../fast_parse_primitives.hpp | 590 ++++ .../io/experimental_mps_fast/fast_parser.cpp | 2770 +++++++++++++++++ .../io/experimental_mps_fast/fast_parser.hpp | 19 + .../fast_parser_adapter.cpp | 23 + .../io/experimental_mps_fast/file_reader.cpp | 252 ++ .../io/experimental_mps_fast/file_reader.hpp | 168 + .../hash_table_smallstr.hpp | 330 ++ .../experimental_mps_fast/lz4_file_reader.cpp | 759 +++++ .../io/experimental_mps_fast/mmap_region.hpp | 141 + .../mps_section_scanner.cpp | 413 +++ .../mps_section_scanner.hpp | 98 + .../io/experimental_mps_fast/nvtx_ranges.hpp | 135 + .../io/experimental_mps_fast/simd_compat.hpp | 10 + cpp/tests/linear_programming/parser_test.cpp | 107 +- 19 files changed, 5905 insertions(+), 32 deletions(-) create mode 100644 cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp create mode 100644 cpp/src/io/experimental_mps_fast/fast_parser.cpp create mode 100644 cpp/src/io/experimental_mps_fast/fast_parser.hpp create mode 100644 cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp create mode 100644 cpp/src/io/experimental_mps_fast/file_reader.cpp create mode 100644 cpp/src/io/experimental_mps_fast/file_reader.hpp create mode 100644 cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp create mode 100644 cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp create mode 100644 cpp/src/io/experimental_mps_fast/mmap_region.hpp create mode 100644 cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp create mode 100644 cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp create mode 100644 cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp create mode 100644
cpp/src/io/experimental_mps_fast/simd_compat.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7e2dd099c1..60227547b4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -202,9 +202,11 @@ endif () find_package(OpenMP REQUIRED) message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") -# MPS/QPS parser supports compressed inputs via bzip2 and zlib +# MPS/QPS parser supports compressed inputs via bzip2 and zlib; the experimental fast MPS parser +# supports LZ4 via runtime-loaded liblz4. option(CUOPT_PARSER_WITH_BZIP2 "Build MPS parser with bzip2 decompression" ON) option(CUOPT_PARSER_WITH_ZLIB "Build MPS parser with zlib decompression" ON) +option(CUOPT_PARSER_WITH_LZ4 "Build experimental fast MPS parser with LZ4 decompression" ON) if (CUOPT_PARSER_WITH_BZIP2) find_package(BZip2 REQUIRED) add_compile_definitions(MPS_PARSER_WITH_BZIP2) @@ -213,6 +215,10 @@ if (CUOPT_PARSER_WITH_ZLIB) find_package(ZLIB REQUIRED) add_compile_definitions(MPS_PARSER_WITH_ZLIB) endif () +if (CUOPT_PARSER_WITH_LZ4) + # No headers or link target needed; the experimental reader loads one liblz4 symbol at runtime. 
+ add_compile_definitions(MPS_PARSER_WITH_LZ4) +endif () # Debug options if (CMAKE_BUILD_TYPE MATCHES Debug) @@ -250,6 +256,20 @@ else () find_package(RAFT REQUIRED) endif () +rapids_cpm_find(simde 0.8.2 + CPM_ARGS + GIT_REPOSITORY https://github.com/simd-everywhere/simde.git + GIT_TAG v0.8.2 + GIT_SHALLOW TRUE + DOWNLOAD_ONLY TRUE +) + +if (NOT TARGET simde::simde) + add_library(simde::simde INTERFACE IMPORTED GLOBAL) + set_target_properties(simde::simde + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${simde_SOURCE_DIR}") +endif () + FetchContent_Declare( papilo GIT_REPOSITORY "https://github.com/scipopt/papilo.git" @@ -436,11 +456,18 @@ if (BUILD_TESTS) endif () set(CUOPT_SRC_FILES) +set(MPS_FAST_SRC_FILES) add_subdirectory(src) if (HOST_LINEINFO) set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") endif () +if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND + CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$") + set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} + APPEND PROPERTY COMPILE_OPTIONS "-mavx2;-maes;-msse4.2") +endif () + # Apply -UNDEBUG only to solver source files (not gRPC infrastructure). # Must happen before gRPC files are appended to CUOPT_SRC_FILES. # Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO). @@ -596,6 +623,7 @@ target_link_libraries(cuopt ${CUDSS_LIB_FILE} PRIVATE ${CUOPT_PRIVATE_CUDA_LIBS} + simde::simde $<$:protobuf::libprotobuf> $<$:gRPC::grpc++> ) diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index 37876cac7a..e99462091e 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -90,11 +90,13 @@ inline cuopt::init_logger_t dummy_logger( * .mps/.qps and their .gz/.bz2 variants → MPS parser; * anything else is rejected. 
* @param initial_solution_file Path to initial solution file in SOL format + * @param mps_reader MPS reader implementation selected by the CLI * @param settings Merged solver settings (config file loaded in main, then CLI overrides applied) */ int run_single_file(const std::string& file_path, const std::string& initial_solution_file, bool solve_relaxation, + cuopt::linear_programming::io::mps_reader_type_t mps_reader, cuopt::linear_programming::solver_settings_t& settings) { cuopt::init_logger_t log(settings.get_parameter(CUOPT_LOG_FILE), @@ -108,7 +110,7 @@ int run_single_file(const std::string& file_path, { CUOPT_LOG_INFO("Reading file %s", base_filename.c_str()); try { - mps_data_model = cuopt::linear_programming::io::read(file_path); + mps_data_model = cuopt::linear_programming::io::read(file_path, mps_reader); } catch (const std::logic_error& e) { CUOPT_LOG_ERROR("Parser exception: %s", e.what()); parsing_failed = true; @@ -285,7 +287,8 @@ int main(int argc, char* argv[]) .help( "input problem file; format dispatched by extension (case-insensitive). " "Supported: .lp, .mps, .qps and their .gz / .bz2 compressed variants " - "(e.g. .lp.gz, .mps.bz2, .qps.gz)") + "(e.g. .lp.gz, .mps.bz2, .qps.gz). 
Experimental .mps.lz4 inputs require " + "--mps-reader fast") .nargs(1) .required(); @@ -303,6 +306,13 @@ int main(int argc, char* argv[]) .help("path to parameter config file (key = value format, supports all parameters)") .default_value(std::string("")); + program.add_argument("--mps-reader") + .help( + "MPS reader implementation: default uses the production parser; fast uses the experimental " + "SIMD parser for LP/MIP .mps and .mps.lz4 files") + .default_value(std::string("default")) + .choices("default", "fast"); + program.add_argument("--dump-hyper-params") .help("print hyper-parameters only in config file format and exit") .default_value(false) @@ -403,6 +413,12 @@ int main(int argc, char* argv[]) const auto initial_solution_file = program.get("--initial-solution"); const auto solve_relaxation = program.get("--relaxation"); const auto params_file = program.get("--params-file"); + const auto mps_reader_arg = program.get("--mps-reader"); + + auto mps_reader = cuopt::linear_programming::io::mps_reader_type_t::default_reader; + if (mps_reader_arg == "fast") { + mps_reader = cuopt::linear_programming::io::mps_reader_type_t::fast_experimental; + } cuopt::linear_programming::solver_settings_t settings; try { @@ -432,5 +448,5 @@ int main(int argc, char* argv[]) RAFT_CUDA_TRY(cudaSetDevice(0)); } - return run_single_file(file_name, initial_solution_file, solve_relaxation, settings); + return run_single_file(file_name, initial_solution_file, solve_relaxation, mps_reader, settings); } diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp index a63e40f31f..1d47590287 100644 --- a/cpp/include/cuopt/linear_programming/io/parser.hpp +++ b/cpp/include/cuopt/linear_programming/io/parser.hpp @@ -17,6 +17,14 @@ namespace cuopt::linear_programming::io { +/** + * @brief Selects which MPS reader implementation should be used by dispatching entry points. + * + * The experimental fast reader is intentionally opt-in. 
It currently supports LP/MIP problems + * from raw .mps and .mps.lz4 files only. + */ +enum class mps_reader_type_t { default_reader, fast_experimental }; + /** * @brief Reads the equation from an MPS or QPS file. * @@ -43,6 +51,18 @@ template mps_data_model_t read_mps(const std::string& mps_file_path, bool fixed_mps_format = false); +/** + * @brief Reads a raw LP/MIP MPS problem with the experimental SIMD-optimized reader. + * + * This prototype reader supports raw .mps and .mps.lz4 files only. It does not support LP, QPS, + * quadratic MPS sections, fixed-format forcing, or .gz/.bz2 compressed inputs. + * + * @param[in] mps_file_path Path to a raw .mps or .mps.lz4 file. + * @return mps_data_model_t A fully formed LP/MIP problem which represents the given file. + */ +template +mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); + /** * @brief Reads an MPS problem from in-memory file contents. * @@ -107,13 +127,19 @@ mps_data_model_t read_lp(const std::string& lp_file_path); template mps_data_model_t read_lp_from_string(std::string_view lp_contents); +template +inline mps_data_model_t read(const std::string& path, + mps_reader_type_t mps_reader, + bool fixed_mps_format = false); + /** * @brief Reads an optimization problem from a file, dispatching on the file * extension. Extension matching is case-insensitive. 
* * Routing: * - .mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2 → read_mps() - * - .lp, .lp.gz, .lp.bz2 → read_lp() + * - .mps.lz4 → experimental fast MPS reader only + * - .lp, .lp.gz, .lp.bz2 → read_lp() * - anything else → std::logic_error * * This is the entry point of choice for user-facing tools (CLI, C API) that @@ -126,13 +152,37 @@ mps_data_model_t read_lp_from_string(std::string_view lp_contents); */ template inline mps_data_model_t read(const std::string& path, bool fixed_mps_format = false) +{ + return read(path, mps_reader_type_t::default_reader, fixed_mps_format); +} + +template +inline mps_data_model_t read(const std::string& path, + mps_reader_type_t mps_reader, + bool fixed_mps_format) { std::string lower(path); std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); - if (lower.ends_with(".mps") || lower.ends_with(".mps.gz") || lower.ends_with(".mps.bz2") || - lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2")) { + const bool is_mps_lz4 = lower.ends_with(".mps.lz4"); + if (lower.ends_with(".mps") || is_mps_lz4 || lower.ends_with(".mps.gz") || + lower.ends_with(".mps.bz2") || lower.ends_with(".qps") || lower.ends_with(".qps.gz") || + lower.ends_with(".qps.bz2")) { + if (mps_reader == mps_reader_type_t::fast_experimental) { + if (fixed_mps_format) { + throw std::logic_error( + "experimental fast MPS reader does not support fixed MPS format forcing"); + } + if (!lower.ends_with(".mps") && !is_mps_lz4) { + throw std::logic_error( + "experimental fast MPS reader supports raw .mps and .mps.lz4 LP/MIP files only"); + } + return read_mps_fast_experimental(path); + } + if (is_mps_lz4) { + throw std::logic_error(".mps.lz4 inputs require the experimental fast MPS reader"); + } return read_mps(path, fixed_mps_format); } if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2")) { @@ -140,7 +190,8 @@ inline mps_data_model_t 
read(const std::string& path, bool fixed_mps_f } throw std::logic_error( "read: unrecognized input file extension. Supported (case-insensitive): " - ".mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2, .lp, .lp.gz, .lp.bz2. " + ".mps, .mps.lz4, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2, .lp, .lp.gz, " + ".lp.bz2. " "Given path: " + path); } diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt index 1ae6988466..6883cce82f 100644 --- a/cpp/src/CMakeLists.txt +++ b/cpp/src/CMakeLists.txt @@ -25,3 +25,4 @@ add_subdirectory(branch_and_bound) add_subdirectory(cuts) set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${UTIL_SRC_FILES} PARENT_SCOPE) +set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE) diff --git a/cpp/src/io/CMakeLists.txt b/cpp/src/io/CMakeLists.txt index cc4affa890..4c99b1848b 100644 --- a/cpp/src/io/CMakeLists.txt +++ b/cpp/src/io/CMakeLists.txt @@ -3,6 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 # cmake-format: on +set(MPS_FAST_SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_adapter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/file_reader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/lz4_file_reader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/mps_section_scanner.cpp +) + set(PARSERS_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/data_model_view.cpp ${CMAKE_CURRENT_SOURCE_DIR}/file_to_string.cpp @@ -13,6 +21,8 @@ set(PARSERS_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/writer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/utilities/cython_parser.cpp + ${MPS_FAST_SRC_FILES} ) set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${PARSERS_SRC_FILES} PARENT_SCOPE) +set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE) diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp new file mode 100644 index 0000000000..9da59e7b44 --- /dev/null +++ 
b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp @@ -0,0 +1,590 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "simd_compat.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __likely +#define __likely(x) __builtin_expect(!!(x), 1) +#endif + +#ifndef __unlikely +#define __unlikely(x) __builtin_expect(!!(x), 0) +#endif + +namespace mps_fast { + +// double values in MPS data rarely need more than this many fractional digits. +inline constexpr double decimals[16][10] = { + {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}, + {0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09}, + {0.000, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009}, + {0.0000, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009}, + {0.00000, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009}, + {0.000000, + 0.000001, + 0.000002, + 0.000003, + 0.000004, + 0.000005, + 0.000006, + 0.000007, + 0.000008, + 0.000009}, + {0.0000000, + 0.0000001, + 0.0000002, + 0.0000003, + 0.0000004, + 0.0000005, + 0.0000006, + 0.0000007, + 0.0000008, + 0.0000009}, + {0.00000000, + 0.00000001, + 0.00000002, + 0.00000003, + 0.00000004, + 0.00000005, + 0.00000006, + 0.00000007, + 0.00000008, + 0.00000009}, + {0.000000000, + 0.000000001, + 0.000000002, + 0.000000003, + 0.000000004, + 0.000000005, + 0.000000006, + 0.000000007, + 0.000000008, + 0.000000009}, + {0.0000000000, + 0.0000000001, + 0.0000000002, + 0.0000000003, + 0.0000000004, + 0.0000000005, + 0.0000000006, + 0.0000000007, + 0.0000000008, + 0.0000000009}, + {0.00000000000, + 0.00000000001, + 0.00000000002, + 0.00000000003, + 0.00000000004, + 0.00000000005, + 0.00000000006, + 0.00000000007, + 0.00000000008, + 0.00000000009}, + {0.000000000000, + 0.000000000001, + 0.000000000002, + 
0.000000000003, + 0.000000000004, + 0.000000000005, + 0.000000000006, + 0.000000000007, + 0.000000000008, + 0.000000000009}, + {0.0000000000000, + 0.0000000000001, + 0.0000000000002, + 0.0000000000003, + 0.0000000000004, + 0.0000000000005, + 0.0000000000006, + 0.0000000000007, + 0.0000000000008, + 0.0000000000009}, + {0.00000000000000, + 0.00000000000001, + 0.00000000000002, + 0.00000000000003, + 0.00000000000004, + 0.00000000000005, + 0.00000000000006, + 0.00000000000007, + 0.00000000000008, + 0.00000000000009}, + {0.000000000000000, + 0.000000000000001, + 0.000000000000002, + 0.000000000000003, + 0.000000000000004, + 0.000000000000005, + 0.000000000000006, + 0.000000000000007, + 0.000000000000008, + 0.000000000000009}, + {0.0000000000000000, + 0.0000000000000001, + 0.0000000000000002, + 0.0000000000000003, + 0.0000000000000004, + 0.0000000000000005, + 0.0000000000000006, + 0.0000000000000007, + 0.0000000000000008, + 0.0000000000000009}}; + +inline constexpr int EXP10_TABLE_MAX = 308; + +constexpr double constexpr_pow10(int exp) +{ + if (exp == 0) return 1.0; + double result = 1.0; + if (exp > 0) { + for (int i = 0; i < exp; ++i) + result *= 10.0; + } else { + for (int i = 0; i > exp; --i) + result /= 10.0; + } + return result; +} + +constexpr auto make_exp10_table() +{ + std::array table{}; + for (int i = -EXP10_TABLE_MAX; i <= EXP10_TABLE_MAX; ++i) { + table[(size_t)(i + EXP10_TABLE_MAX)] = constexpr_pow10(i); + } + return table; +} + +inline constexpr auto table_exp10 = make_exp10_table(); + +static inline bool is_digit_byte(char c) noexcept { return c >= '0' && c <= '9'; } + +static inline double fast_frac_atoi(const char*& data, const char* end) +{ + double val = 0.0; + +#define MPS_FAST_FRAC_DIGIT(i) \ + do { \ + if (data >= end || !is_digit_byte(*data)) return val; \ + val += decimals[i][static_cast(*data) & 0xF]; \ + ++data; \ + } while (0) + + MPS_FAST_FRAC_DIGIT(0); + MPS_FAST_FRAC_DIGIT(1); + MPS_FAST_FRAC_DIGIT(2); + MPS_FAST_FRAC_DIGIT(3); + 
MPS_FAST_FRAC_DIGIT(4); + MPS_FAST_FRAC_DIGIT(5); + MPS_FAST_FRAC_DIGIT(6); + MPS_FAST_FRAC_DIGIT(7); + MPS_FAST_FRAC_DIGIT(8); + MPS_FAST_FRAC_DIGIT(9); + MPS_FAST_FRAC_DIGIT(10); + MPS_FAST_FRAC_DIGIT(11); + MPS_FAST_FRAC_DIGIT(12); + MPS_FAST_FRAC_DIGIT(13); + MPS_FAST_FRAC_DIGIT(14); + MPS_FAST_FRAC_DIGIT(15); + +#undef MPS_FAST_FRAC_DIGIT + + while (data < end && is_digit_byte(*data)) { + ++data; + } + return val; +} + +static inline double fast_atof_core(const char*& data, const char* end) +{ + double sign = 1.0; + if (data < end && *data == '-') { + sign = -1.0; + ++data; + } else if (data < end && *data == '+') { + ++data; + } + + uint64_t int_part = 0; + while (data < end && is_digit_byte(*data)) { + int_part = int_part * 10 + (*data - '0'); + ++data; + } + + double result = static_cast(int_part); + + if (data < end && *data == '.') { + ++data; + result += fast_frac_atoi(data, end); + } + + if (data < end && (*data == 'e' || *data == 'E' || *data == 'd' || *data == 'D')) { + ++data; + int exp_sign = 1; + if (data < end && *data == '-') { + exp_sign = -1; + ++data; + } else if (data < end && *data == '+') { + ++data; + } + + int exponent = 0; + while (data < end && is_digit_byte(*data)) { + exponent = exponent * 10 + (*data - '0'); + ++data; + } + + exponent *= exp_sign; + if (exponent >= -EXP10_TABLE_MAX && exponent <= EXP10_TABLE_MAX) { + result *= table_exp10[static_cast(exponent + EXP10_TABLE_MAX)]; + } else { + result *= std::pow(10.0, exponent); + } + } + + return sign * result; +} + +static inline double fast_atof(const char* data, const char* end) +{ + return fast_atof_core(data, end); +} + +static inline double fast_atof_advance(const char*& ptr, const char* end) +{ + return fast_atof_core(ptr, end); +} + +struct cursor_t { + const char* start; + const char* ptr; + const char* end; + + cursor_t(const char* data, std::size_t size) : start(data), ptr(data), end(data + size) {} + + bool done() const { return ptr >= end; } + + std::pair position() 
const + { + std::size_t line = 1; + const char* line_start = start; + for (const char* p = start; p < ptr; ++p) { + if (*p == '\n') { + ++line; + line_start = p + 1; + } + } + std::size_t column = static_cast(ptr - line_start) + 1; + return {line, column}; + } + + [[noreturn]] void error(const char* msg, ...) + { + auto [line, col] = position(); + va_list args; + va_start(args, msg); + char msg_buf[512]; + std::vsnprintf(msg_buf, sizeof(msg_buf), msg, args); + va_end(args); + char buf[1024]; + std::snprintf(buf, sizeof(buf), "%zu:%zu: %s", line, col, msg_buf); + throw std::runtime_error(buf); + } + + void advance(std::size_t n) + { + if (ptr + n > end) { throw std::runtime_error("cursor advanced past end of file"); } + ptr += n; + } + + template + static const char* scalar_scan(const char* p, const char* end) + { + while (p < end) { + unsigned char c = static_cast(*p); + if constexpr (skip_ws_mode) { + if (c > 32 || c == '\n') return p; + } else { + if (c <= 32) return p; + } + p++; + } + return end; + } + + template + static const char* simd_scan(const char* p, const char* end) + { + const simde__m256i v32 = simde_mm256_set1_epi8(32); + const simde__m256i vnl = simde_mm256_set1_epi8('\n'); + + while (p + 32 <= end) { + simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)p); + simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); + + unsigned int mask; + if (skip_ws_mode) { + simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); + mask = (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl)); + } else { + mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32); + } + + if (mask != 0) { return p + __builtin_ctz(mask); } + p += 32; + } + return scalar_scan(p, end); + } + + void skip_ws() { ptr = simd_scan(ptr, end); } + + void skip_comment_line() + { + while (!done() && *ptr != '\n') { + ptr++; + } + if (!done()) ptr++; + } + + void skip_to_eol() + { + while (!done() && *ptr != '\n') { + ptr++; + } + } + + inline 
__attribute__((always_inline)) std::string_view read_field() + { + if (__unlikely(done())) { return {}; } + + const char* field_start = ptr; + if (__unlikely(end - ptr < 32)) { + ptr = scalar_scan(ptr, end); + const char* field_end = ptr; + if (ptr < end) { skip_ws(); } + return std::string_view(field_start, field_end - field_start); + } + + const simde__m256i v32 = simde_mm256_set1_epi8(32); + const simde__m256i vnl = simde_mm256_set1_epi8('\n'); + + simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr); + simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); + unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32); + + if (__unlikely(ws_mask == 0)) { + ptr = simd_scan(ptr + 32, end); + const char* field_end = ptr; + if (ptr < end) { skip_ws(); } + return std::string_view(field_start, field_end - field_start); + } + + int field_end_off = __builtin_ctz(ws_mask); + const char* field_end = ptr + field_end_off; + + simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); + unsigned int stop_mask = + (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl)); + unsigned int after_field = stop_mask & ~((1u << field_end_off) - 1); + + if (__likely(after_field != 0)) { + ptr = ptr + __builtin_ctz(after_field); + } else { + ptr = field_end; + if (ptr < end) { skip_ws(); } + } + + return std::string_view(field_start, field_end - field_start); + } + + inline __attribute__((always_inline)) std::string_view peek_field() + { + if (__unlikely(done())) { return {}; } + const char* field_end = simd_scan(ptr, end); + return std::string_view(ptr, field_end - ptr); + } + + inline __attribute__((always_inline)) std::pair + read_two_fields() + { + if (__unlikely(end - ptr < 32)) { + auto f1 = read_field(); + auto f2 = read_field(); + return {f1, f2}; + } + + const char* field1_start = ptr; + const simde__m256i v32 = simde_mm256_set1_epi8(32); + const simde__m256i vnl = simde_mm256_set1_epi8('\n'); + + simde__m256i data = 
simde_mm256_loadu_si256((const simde__m256i*)ptr); + simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); + simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); + + unsigned int printable_mask = (unsigned int)simde_mm256_movemask_epi8(gt32); + unsigned int ws_mask = ~printable_mask; + unsigned int nl_mask = (unsigned int)simde_mm256_movemask_epi8(is_nl); + unsigned int stop_mask = printable_mask | nl_mask; + + if (__unlikely(ws_mask == 0)) { + auto f1 = read_field(); + auto f2 = read_field(); + return {f1, f2}; + } + int field1_end_off = __builtin_ctz(ws_mask); + + unsigned int after_field1 = stop_mask & ~((1u << field1_end_off) - 1); + if (__unlikely(after_field1 == 0)) { + auto f1 = read_field(); + auto f2 = read_field(); + return {f1, f2}; + } + int field2_start_off = __builtin_ctz(after_field1); + + if (__unlikely(ptr[field2_start_off] == '\n')) { + auto f1 = read_field(); + auto f2 = read_field(); + return {f1, f2}; + } + + unsigned int ws_after_field2_start = ws_mask & ~((1u << field2_start_off) - 1); + if (__unlikely(ws_after_field2_start == 0)) { + auto f1 = read_field(); + auto f2 = read_field(); + return {f1, f2}; + } + int field2_end_off = __builtin_ctz(ws_after_field2_start); + + unsigned int after_field2 = stop_mask & ~((1u << field2_end_off) - 1); + if (__likely(after_field2 != 0)) { + ptr = ptr + __builtin_ctz(after_field2); + } else { + ptr = ptr + field2_end_off; + skip_ws(); + } + + return {std::string_view(field1_start, field1_end_off), + std::string_view(field1_start + field2_start_off, field2_end_off - field2_start_off)}; + } + + bool eol() const { return ptr < end && *ptr == '\n'; } +}; + +static inline void expect(cursor_t& cursor, const char* field) +{ + auto id = cursor.read_field(); + if (__unlikely(id != field)) { cursor.error("expected '%s', got '%s'", field, id.data()); } +} + +static inline void accept_comment_line(cursor_t& cursor) +{ + for (;;) { + while (!cursor.done() && cursor.eol()) { + cursor.advance(1); + } + if 
(cursor.done() || (cursor.ptr[0] != '*' && cursor.ptr[0] != '$')) { return; } + cursor.skip_comment_line(); + } +} + +static inline void expect_eol(cursor_t& cursor) +{ + if (__unlikely(!cursor.eol())) { cursor.error("expected end of line, got '%s'", cursor.ptr); } + + for (;;) { + while (cursor.eol()) { + cursor.advance(1); + } + if (__unlikely(cursor.done())) { return; } + + if (__unlikely(cursor.ptr[0] == '*' || cursor.ptr[0] == '$')) { + cursor.skip_comment_line(); + continue; + } + + if (__likely(cursor.ptr[0] == ' ') && __likely(cursor.ptr + 1 < cursor.end)) { + cursor.ptr += 1; + } + + if (__unlikely(cursor.done())) { return; } + if (__unlikely(!std::isalpha(static_cast(cursor.ptr[0])))) { + cursor.skip_ws(); + if (cursor.eol()) { continue; } + } + break; + } +} + +static inline std::string_view peek(cursor_t& cursor) { return cursor.peek_field(); } + +static inline bool accept(cursor_t& cursor, const char* field) +{ + if (peek(cursor) == field) { + expect(cursor, field); + return true; + } + return false; +} + +static inline void expect_section(cursor_t& cursor, const char* section) +{ + expect(cursor, section); + expect_eol(cursor); +} + +static inline double expect_number(cursor_t& cursor) +{ + auto num = cursor.read_field(); + if (num.empty()) { cursor.error("expected number, got '%s'", num.data()); } + return fast_atof(num.data(), num.data() + num.size()); +} + +static inline double expect_number_fast_pm_one(cursor_t& cursor) +{ + const char* p = cursor.ptr; + if (p[0] == '-' && p[1] == '1' && p[2] <= ' ') { + cursor.ptr = p + 2; + cursor.skip_ws(); + return -1.0; + } + if (p[0] == '1' && p[1] <= ' ') { + cursor.ptr = p + 1; + cursor.skip_ws(); + return 1.0; + } + return expect_number(cursor); +} + +static inline bool accept_section(cursor_t& cursor, const char* section) +{ + if (accept(cursor, section)) { + expect_eol(cursor); + return true; + } + return false; +} + +static inline bool accept_comment(cursor_t& cursor) +{ + if (__unlikely(!cursor.done() 
&& cursor.ptr[0] == '$')) { + cursor.skip_to_eol(); + return true; + } + return false; +} + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp new file mode 100644 index 0000000000..bce17a435f --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -0,0 +1,2770 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#include "fast_parser.hpp" +#include "fast_parse_primitives.hpp" +#include "file_reader.hpp" +#include "hash_table_smallstr.hpp" +#include "mmap_region.hpp" +#include "mps_section_scanner.hpp" +#include "nvtx_ranges.hpp" +#ifdef MPS_FAST_PERF_COUNTERS +#include "perf_counters.hpp" +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + +namespace mps_fast { + +static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS = 4096; +static constexpr int MPS_ROWS_THREAD_CAP = 16; +static constexpr int MPS_COLUMNS_THREAD_CAP = 32; +static constexpr int MPS_BOUNDS_THREAD_CAP = 32; +static constexpr int MPS_NAMES_THREAD_CAP = 16; +static constexpr size_t MPS_BOUNDS_PARALLEL_INIT_MIN_VARS = 16 * 1024 * 1024; +static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES = 256ull * 1024ull * 1024ull; +static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * 1024 * 1024; + +static int phase_thread_count(int phase_cap) +{ + return std::max(1, std::min(phase_cap, omp_get_max_threads())); +} + +// ============================================================================= +// RAII Timer for profiling with deferred output +// ============================================================================= + +struct TimerEntry { + const 
char* name; + double elapsed_ms; +}; + +// Process-wide list of finished timer records. The visible callers take get_timer_mutex() before +// touching it. +static std::vector& get_timer_buffer() +{ + static std::vector buffer; + buffer.reserve(100); + return buffer; +} + +// Mutex guarding the buffer returned by get_timer_buffer(). +static std::mutex& get_timer_mutex() +{ + static std::mutex mutex; + return mutex; +} + +// Prints every buffered timer record to stderr as "[TIMER] <name>: <ms> ms" and empties the buffer. +static void flush_timers() +{ + std::lock_guard lock(get_timer_mutex()); + auto& buffer = get_timer_buffer(); + for (const auto& entry : buffer) { + std::fprintf(stderr, "[TIMER] %s: %.3f ms\n", entry.name, entry.elapsed_ms); + } + buffer.clear(); +} + +// Returns sysconf(_SC_PAGESIZE), computed once; falls back to 4096 when sysconf reports a +// non-positive value. +static size_t system_page_size() +{ + static size_t page_size = [] { + long value = sysconf(_SC_PAGESIZE); + return value > 0 ? (size_t)value : (size_t)4096; + }(); + return page_size; +} + +// Stride used when touching a range in materialize_hugepages(): 2 MiB or the system page size. +enum class materialize_touch_t { + write_2mb, + write_4kb, +}; + +// Applies a MADV_HUGEPAGE hint to the page-aligned span covering [data, data + bytes) and then +// materializes the range by touching it at the stride selected by `touch`, to nudge the kernel +// into invoking its THP mechanism. The madvise() result is not checked; `label` is unused. +// NOTE(review): presumably the range is anonymous mmap memory - confirm with the callers. +static void materialize_hugepages(const char* label, + void* data, + size_t bytes, + materialize_touch_t touch) +{ + (void)label; + if (data == nullptr || bytes == 0) return; + + constexpr size_t two_mb = 2 * 1024 * 1024; + size_t page_size = system_page_size(); + uintptr_t start = reinterpret_cast(data); + uintptr_t end = start + bytes; + uintptr_t aligned_start = start & ~(uintptr_t)(page_size - 1); + uintptr_t aligned_end = (end + page_size - 1) & ~(uintptr_t)(page_size - 1); + size_t aligned_bytes = (size_t)(aligned_end - aligned_start); + + errno = 0; + madvise(reinterpret_cast(aligned_start), aligned_bytes, MADV_HUGEPAGE); + + size_t step = touch == materialize_touch_t::write_2mb ?
// Returns true when the bytes at `p` spell `token` (exactly `len` bytes) and the token
// is terminated either by the end of the buffer or by a byte that compares <= ' '.
// Never reads at or beyond `end`.
static inline bool section_token_matches(const char* p,
                                         const char* end,
                                         const char* token,
                                         size_t len)
{
  return (size_t)(end - p) >= len && std::memcmp(p, token, len) == 0 &&
         ((size_t)(end - p) == len || p[len] <= ' ');
}

// Returns true when `p` starts one of the quadratic section keywords
// (QUADOBJ, QMATRIX or QCMATRIX).
static inline bool is_quadratic_section_start(const char* p, const char* end)
{
  return section_token_matches(p, end, "QUADOBJ", 7) ||
         section_token_matches(p, end, "QMATRIX", 7) ||
         section_token_matches(p, end, "QCMATRIX", 8);
}

// Returns true when `p` starts a keyword that terminates the RHS section:
// BOUNDS, RANGES, ENDATA or any quadratic section keyword.
//
// All comparisons go through section_token_matches(), so nothing is read at or past
// `end`: an empty range or a truncated keyword yields false, and a keyword that ends
// exactly at `end` (no trailing newline) is accepted.
static inline bool is_rhs_section_end(const char* p, const char* end)
{
  if (p >= end) { return false; }
  // Dispatch on the first byte so ordinary data lines cost a single comparison.
  switch (p[0]) {
    case 'B': return section_token_matches(p, end, "BOUNDS", 6);
    case 'Q': return is_quadratic_section_start(p, end);
    case 'R': return section_token_matches(p, end, "RANGES", 6);
    case 'E': return section_token_matches(p, end, "ENDATA", 6);
    default: return false;
  }
}
suffix_width) +{ + return suffix_width > 1 && name[name.size() - suffix_width] == '0'; +} + +static inline bool dense_suffix_width_ok(uint64_t value, + size_t suffix_width, + bool zero_padded, + size_t pad_width) +{ + size_t digits = decimal_digits_u64(value); + size_t expected_width = zero_padded ? std::max(pad_width, digits) : digits; + return suffix_width == expected_width; +} + +template +struct parse_state_t { + cuopt::linear_programming::io::mps_data_model_t& problem; + cursor_t& cursor; + + // Temporary string_view storage (points into input buffer, no allocation) + std::vector row_names_sv; + std::vector var_names_sv; + std::string_view problem_name_sv; + std::string_view objective_name_sv; + std::vector ignored_objective_names_sv; + + // Optional dense ordered column index for labels like V0, V1, ... + bool col_dense_ordered = false; + std::string_view col_dense_prefix; + uint64_t col_dense_min_id = 0; + uint64_t col_dense_max_id = 0; + size_t col_dense_pad_width = 0; + bool col_dense_zero_padded = false; + + // Row name hash table - sized at runtime based on row count + size_t row_hash_buckets = 0; + size_t row_hash_mask = 0; // buckets - 1, for fast modulo via & + mmap_region_t row_hash_region; + hash_slot_var_t* row_names_ht = nullptr; + // Overflow map for row names longer than HASH_KEY_BYTES + std::unordered_map row_names_long; + + // Optional dense ordered row index for labels like R0001, R0002, ... 
+ row_index_mode_t row_index_mode = row_index_mode_t::hash; + bool row_dense_candidate = true; + std::string_view row_dense_prefix; + uint64_t row_dense_min_id = 0; + uint64_t row_dense_max_id = 0; + uint64_t row_dense_base_id = 0; + size_t row_dense_pad_width = 0; + bool row_dense_zero_padded = false; + + // var_names still uses STL (only used in parse_bounds, not as hot) + std::unordered_map var_names_map; + + parse_state_t(cuopt::linear_programming::io::mps_data_model_t& p, cursor_t& c) + : problem(p), cursor(c) + { + } + + void init_row_hash_table() + { + if (init_row_dense_ordered_table()) { return; } + init_row_hash_table_impl(); + } + + bool row_dense_has_expected_width(uint64_t value, size_t suffix_width) const + { + return dense_suffix_width_ok(value, suffix_width, row_dense_zero_padded, row_dense_pad_width); + } + + bool col_dense_has_expected_width(uint64_t value, size_t suffix_width) const + { + return dense_suffix_width_ok(value, suffix_width, col_dense_zero_padded, col_dense_pad_width); + } + + bool is_ignored_objective_name(std::string_view name) const + { + return std::find(ignored_objective_names_sv.begin(), ignored_objective_names_sv.end(), name) != + ignored_objective_names_sv.end(); + } + + void add_ignored_objective_name(std::string_view name) + { + if (name == objective_name_sv || is_ignored_objective_name(name)) { return; } + ignored_objective_names_sv.push_back(name); + } + + void observe_objective_row_name(std::string_view name) + { + if (objective_name_sv.empty()) { + objective_name_sv = name; + } else { + add_ignored_objective_name(name); + } + } + + void observe_row_name_for_dense_index(std::string_view name, size_t row_index) + { + if (!row_dense_candidate) { return; } + + std::string_view prefix; + uint64_t value = 0; + size_t suffix_width = 0; + if (!parse_trailing_u64(name, prefix, value, suffix_width)) { + row_dense_candidate = false; + return; + } + + if (row_index == 0) { + row_dense_prefix = prefix; + row_dense_min_id = value; + 
row_dense_max_id = value; + row_dense_base_id = value; + row_dense_pad_width = suffix_width; + row_dense_zero_padded = dense_suffix_is_zero_padded(name, suffix_width); + return; + } + + if (prefix != row_dense_prefix) { + row_dense_candidate = false; + return; + } + + if (row_dense_base_id > std::numeric_limits::max() - row_index) { + row_dense_candidate = false; + return; + } + + uint64_t expected = row_dense_base_id + row_index; + if (value != expected || !row_dense_has_expected_width(value, suffix_width)) { + row_dense_candidate = false; + return; + } + + row_dense_min_id = std::min(row_dense_min_id, value); + row_dense_max_id = std::max(row_dense_max_id, value); + } + + bool init_row_dense_ordered_table() + { + scoped_timer_t timer("row_dense_finalize"); + size_t n_rows = row_names_sv.size(); + if (!row_dense_candidate || n_rows == 0) { return false; } + if (row_dense_max_id < row_dense_min_id) { return false; } + uint64_t dense_count = row_dense_max_id - row_dense_min_id + 1; + if (dense_count != n_rows) { return false; } + + row_index_mode = row_index_mode_t::dense_ordered; + return true; + } + + void init_row_hash_table_impl() + { + scoped_timer_t timer("row_hash_init_total"); + size_t n_rows = row_names_sv.size(); + // load factor 50% + row_hash_buckets = next_power_of_2(std::max((size_t)(n_rows * 2), (size_t)64)); + row_hash_mask = row_hash_buckets - 1; + size_t row_hash_mmap_size = row_hash_buckets * sizeof(hash_slot_var_t); + + { + scoped_timer_t timer("row_hash_mmap"); + // Use mmap for allocation - the OS provides zero'd pages + row_hash_region = mmap_region_t::anonymous( + row_hash_mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, "row hash table"); + row_names_ht = static_cast(row_hash_region.data()); + // Request huge pages to reduce TLB misses + row_hash_region.advise(MADV_HUGEPAGE); + } + + // pre-touch the 2MB huge pages to nudge the kernel into allocating them +#ifdef MPS_FAST_THP_PREFAULT + { + scoped_timer_t timer("row_hash_thp_prefault"); + 
materialize_hugepages( + "row_names_ht", row_names_ht, row_hash_region.size(), materialize_touch_t::write_2mb); + } +#endif + + { + scoped_timer_t timer("row_hash_insert_all"); + for (size_t idx = 0; idx < n_rows; ++idx) { + row_insert(row_names_sv[idx], idx); + } + } + + // Force the kernel to please please collapse the page range into THP pages +#ifdef MPS_FAST_MADV_COLLAPSE + { + scoped_timer_t timer("row_hash_madv_collapse"); + row_hash_region.advise(MADV_COLLAPSE); + } +#endif + } + + size_t row_lookup_dense_ordered(std::string_view name) const + { + std::string_view prefix; + uint64_t value = 0; + size_t suffix_width = 0; + if (!parse_trailing_u64(name, prefix, value, suffix_width)) { return SIZE_MAX; } + if (prefix != row_dense_prefix || !row_dense_has_expected_width(value, suffix_width)) { + return SIZE_MAX; + } + if (value < row_dense_min_id || value > row_dense_max_id) { return SIZE_MAX; } + return (size_t)(value - row_dense_min_id); + } + + size_t row_lookup(std::string_view name) const + { + if (__likely(row_index_mode == row_index_mode_t::dense_ordered)) { + return row_lookup_dense_ordered(name); + } + return row_lookup_hash(name); + } + + size_t read_row_lookup_dense_ordered(cursor_t& cursor) const + { + const char* start = cursor.ptr; + const char* p = start; + + size_t prefix_len = row_dense_prefix.size(); + if (prefix_len > 0) { + if ((size_t)(cursor.end - p) < prefix_len || + std::memcmp(p, row_dense_prefix.data(), prefix_len) != 0) { + cursor.read_field(); + return SIZE_MAX; + } + p += prefix_len; + } + + const char* digits_start = p; + uint64_t value = 0; + while (p < cursor.end && is_decimal_digit(*p)) { + uint64_t digit = (uint64_t)(*p - '0'); + if (value > (std::numeric_limits::max() - digit) / 10) { + cursor.ptr = start; + cursor.read_field(); + return SIZE_MAX; + } + value = value * 10 + digit; + p++; + } + + size_t suffix_width = (size_t)(p - digits_start); + if (suffix_width == 0 || p >= cursor.end || *p > ' ' || + 
!row_dense_has_expected_width(value, suffix_width) || value < row_dense_min_id || + value > row_dense_max_id) { + cursor.ptr = start; + cursor.read_field(); + return SIZE_MAX; + } + + cursor.ptr = p; + cursor.skip_ws(); + return (size_t)(value - row_dense_min_id); + } + + size_t read_row_lookup(cursor_t& cursor) const + { + if (__likely(row_index_mode == row_index_mode_t::dense_ordered)) { + return read_row_lookup_dense_ordered(cursor); + } + + auto row_name = cursor.read_field(); + return row_lookup_hash(row_name); + } + + size_t row_lookup_hash(std::string_view name) const + { + if (__unlikely(name.size() > HASH_KEY_BYTES)) { + auto it = row_names_long.find(name); + return it != row_names_long.end() ? it->second : SIZE_MAX; + } + hash_key_t key = make_key(name.data(), name.size()); + uint32_t hash = fnv1a_hash(name.data(), name.size()) & (uint32_t)row_hash_mask; + const hash_slot_var_t* slots = row_names_ht; + const hash_slot_var_t* slot = &slots[hash]; + + for (size_t i = 0; i < row_hash_buckets; ++i, ++slot) { + if (slot >= &slots[row_hash_buckets]) { slot = &slots[0]; } + if (slot->count == 0) { return SIZE_MAX; } + if (key_cmpeq(slot->key, key)) { return slot->count - 1; } + } + return SIZE_MAX; + } + + size_t col_lookup_dense_ordered(std::string_view name) const + { + std::string_view prefix; + uint64_t value = 0; + size_t suffix_width = 0; + if (!parse_trailing_u64(name, prefix, value, suffix_width)) { return SIZE_MAX; } + if (prefix != col_dense_prefix || !col_dense_has_expected_width(value, suffix_width)) { + return SIZE_MAX; + } + if (value < col_dense_min_id || value > col_dense_max_id) { return SIZE_MAX; } + return (size_t)(value - col_dense_min_id); + } + + void dense_col_name(size_t idx, std::string& out) const + { + uint64_t value = col_dense_min_id + idx; + char digits_buf[32]; + auto [digits_end, ec] = std::to_chars(digits_buf, digits_buf + sizeof(digits_buf), value); + if (ec != std::errc()) { + out.assign(col_dense_prefix); + return; + } + 
size_t digits_len = (size_t)(digits_end - digits_buf); + size_t width = col_dense_zero_padded ? std::max(col_dense_pad_width, digits_len) : digits_len; + out.resize(col_dense_prefix.size() + width); + std::memcpy(out.data(), col_dense_prefix.data(), col_dense_prefix.size()); + char* suffix = out.data() + col_dense_prefix.size(); + if (width > digits_len) { + std::memset(suffix, '0', width - digits_len); + suffix += width - digits_len; + } + std::memcpy(suffix, digits_buf, digits_len); + } + + void row_insert(std::string_view name, size_t index) + { + if (__unlikely(name.size() > HASH_KEY_BYTES)) { + row_names_long[name] = index; + return; + } + hash_key_t key = make_key(name.data(), name.size()); + uint32_t hash = fnv1a_hash(name.data(), name.size()) & (uint32_t)row_hash_mask; + hash_slot_var_t* slots = row_names_ht; + hash_slot_var_t* slot = &slots[hash]; + + for (size_t i = 0; i < row_hash_buckets; ++i, ++slot) { + if (slot >= &slots[row_hash_buckets]) { slot = &slots[0]; } + if (slot->count == 0) { + key_store(slot->key, key); // Writes 32 bytes, including garbage in last 4 + slot->count = (uint32_t)(index + 1); // Overwrite last 4 bytes with actual count + return; + } + if (key_cmpeq(slot->key, key)) { + slot->count = (uint32_t)(index + 1); + return; + } + } + __builtin_trap(); + } +}; + +// ============================================================================= +// Section parsers +// ============================================================================= + +template +static void parse_name_section(parse_state_t& state) +{ + scoped_timer_t timer("parse_name"); + if (peek(state.cursor) == "ROWS") { return; } + expect(state.cursor, "NAME"); + if (!state.cursor.eol()) { + state.problem_name_sv = state.cursor.read_field(); + accept_comment(state.cursor); + } + expect_eol(state.cursor); +} + +template +static void parse_objsense_section(parse_state_t& state) +{ + scoped_timer_t timer("parse_objsense"); + if (accept(state.cursor, "OBJSENSE")) { + if 
(state.cursor.eol()) { expect_eol(state.cursor); } + if (accept(state.cursor, "MIN")) { + state.problem.maximize_ = false; + } else if (accept(state.cursor, "MAX")) { + state.problem.maximize_ = true; + } else { + state.cursor.error("expected MIN or MAX, got '%s'", state.cursor.read_field().data()); + } + accept_comment(state.cursor); + expect_eol(state.cursor); + } +} + +template +static void parse_objname_section(parse_state_t& state) +{ + scoped_timer_t timer("parse_objname"); + if (accept(state.cursor, "OBJNAME")) { + if (state.cursor.eol()) { expect_eol(state.cursor); } + state.objective_name_sv = state.cursor.read_field(); + accept_comment(state.cursor); + expect_eol(state.cursor); + } +} + +struct RowChunkBoundary { + const char* start; + const char* end; +}; + +struct RowChunkInfo { + size_t constraints = 0; + bool malformed = false; + std::vector objective_names; + bool has_first_constraint = false; + std::string_view first_constraint_name; +}; + +static const char* rows_find_next_line(const char* p, const char* end) +{ + while (p < end && *p != '\n') + p++; + if (p < end) p++; + return p; +} + +static bool parse_rows_line_fast(const char*& p, + const char* end, + char& row_type, + std::string_view& row_name) +{ + while (p < end && *p <= ' ' && *p != '\n') + p++; + if (p >= end) { return false; } + if (*p == '\n') { + p++; + return false; + } + if (*p == '*' || *p == '$') { + p = rows_find_next_line(p, end); + return false; + } + + row_type = *p++; + while (p < end && *p <= ' ' && *p != '\n') + p++; + + const char* name_start = p; + while (p < end && *p > ' ') + p++; + if (name_start == p) { return false; } + row_name = std::string_view(name_start, (size_t)(p - name_start)); + + // ROWS only uses fields 1-2. Fields 3-6 are ignored by the MPS spec, and + // field 3 may start with '$' to comment the rest of the record. 
+ p = rows_find_next_line(p, end); + return true; +} + +static std::vector compute_row_chunk_boundaries(const char* rows_start, + const char* rows_end, + int num_threads) +{ + scoped_timer_t timer("rows_compute_chunk_boundaries"); + + std::vector boundaries((size_t)num_threads); + size_t total_size = (size_t)(rows_end - rows_start); + size_t chunk_size = total_size / (size_t)num_threads; + + boundaries[0].start = rows_start; + for (int t = 0; t < num_threads; ++t) { + if (t == num_threads - 1) { + boundaries[(size_t)t].end = rows_end; + } else { + const char* boundary = rows_start + (size_t)(t + 1) * chunk_size; + boundary = rows_find_next_line(boundary, rows_end); + boundaries[(size_t)t].end = boundary; + boundaries[(size_t)t + 1].start = boundary; + } + } + + return boundaries; +} + +template +static bool parse_rows_section_parallel_impl(parse_state_t& state, + const char* rows_start, + const char* rows_end, + int num_threads) +{ + scoped_timer_t timer("parse_rows_parallel"); + + auto boundaries = compute_row_chunk_boundaries(rows_start, rows_end, num_threads); + std::vector infos((size_t)num_threads); + + { + scoped_timer_t timer("rows_count_parallel"); +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < num_threads; ++t) { + MPS_NVTX_RANGE(std::string("rows_count_chunk ") + std::to_string(t), nvtx::colors::rows); + const char* p = boundaries[(size_t)t].start; + const char* end = boundaries[(size_t)t].end; + RowChunkInfo info; + + while (p < end) { + char row_type = 0; + std::string_view row_name; + const char* before = p; + if (!parse_rows_line_fast(p, end, row_type, row_name)) { + if (p == before) { + info.malformed = true; + break; + } + continue; + } + + if (row_type == 'N') { + info.objective_names.push_back(row_name); + } else { + if (!info.has_first_constraint) { + info.first_constraint_name = row_name; + info.has_first_constraint = true; + } + info.constraints++; + } + } + + infos[(size_t)t] = info; + } + } + + for (const auto& info 
: infos) { + if (info.malformed) { return false; } + } + + std::vector offsets((size_t)num_threads + 1, 0); + { + scoped_timer_t timer("rows_prefix_sum"); + for (int t = 0; t < num_threads; ++t) { + offsets[(size_t)t + 1] = offsets[(size_t)t] + infos[(size_t)t].constraints; + } + } + + size_t total_rows = offsets[(size_t)num_threads]; + { + scoped_timer_t timer("rows_resize_outputs"); + state.row_names_sv.resize(total_rows); + state.problem.row_types_.resize(total_rows); + } + + if (state.objective_name_sv.empty()) { + for (const auto& info : infos) { + if (!info.objective_names.empty()) { + state.objective_name_sv = info.objective_names.front(); + break; + } + } + } + for (const auto& info : infos) { + for (std::string_view name : info.objective_names) { + state.add_ignored_objective_name(name); + } + } + + bool dense_candidate = total_rows > 0; + std::string_view dense_prefix; + uint64_t dense_base_id = 0; + size_t dense_pad_width = 0; + bool dense_zero_padded = false; + + if (dense_candidate) { + std::string_view first_name; + for (const auto& info : infos) { + if (info.has_first_constraint) { + first_name = info.first_constraint_name; + break; + } + } + + uint64_t first_value = 0; + size_t first_suffix_width = 0; + if (!parse_trailing_u64(first_name, dense_prefix, first_value, first_suffix_width)) { + dense_candidate = false; + } else { + dense_base_id = first_value; + dense_pad_width = first_suffix_width; + dense_zero_padded = dense_suffix_is_zero_padded(first_name, first_suffix_width); + } + } + + std::vector dense_ok_by_chunk((size_t)num_threads, 1); + + { + scoped_timer_t timer("rows_fill_parallel"); +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < num_threads; ++t) { + MPS_NVTX_RANGE(std::string("rows_fill_chunk ") + std::to_string(t), nvtx::colors::rows); + const char* p = boundaries[(size_t)t].start; + const char* end = boundaries[(size_t)t].end; + size_t out = offsets[(size_t)t]; + + bool local_dense_ok = dense_candidate; + + 
while (p < end) { + char row_type = 0; + std::string_view row_name; + const char* before = p; + if (!parse_rows_line_fast(p, end, row_type, row_name)) { + if (p == before) { + local_dense_ok = false; + break; + } + continue; + } + + if (row_type == 'N') { continue; } + + state.row_names_sv[out] = row_name; + state.problem.row_types_[out] = row_type; + + if (local_dense_ok) { + std::string_view prefix; + uint64_t value = 0; + size_t suffix_width = 0; + uint64_t expected = dense_base_id + out; + local_dense_ok = + parse_trailing_u64(row_name, prefix, value, suffix_width) && prefix == dense_prefix && + value == expected && + dense_suffix_width_ok(value, suffix_width, dense_zero_padded, dense_pad_width); + } + out++; + } + + dense_ok_by_chunk[(size_t)t] = local_dense_ok ? 1 : 0; + } + } + + { + scoped_timer_t timer("rows_dense_metadata"); + for (uint8_t ok : dense_ok_by_chunk) { + dense_candidate = dense_candidate && ok; + } + state.row_dense_candidate = dense_candidate; + if (dense_candidate) { + state.row_dense_prefix = dense_prefix; + state.row_dense_min_id = dense_base_id; + state.row_dense_max_id = dense_base_id + total_rows - 1; + state.row_dense_base_id = dense_base_id; + state.row_dense_pad_width = dense_pad_width; + state.row_dense_zero_padded = dense_zero_padded; + } + } + + return true; +} + +template +static void parse_rows_section_serial_impl(parse_state_t& state, const char* rows_end) +{ + scoped_timer_t timer("parse_rows_serial"); + + while (state.cursor.ptr < rows_end) { + auto row_type = state.cursor.ptr[0]; + state.cursor.advance(1); + state.cursor.skip_ws(); + // if (row_type != "E" && row_type != "L" && row_type != "G" && row_type != "N") { + // state.cursor.error("expected E, L, G, or N, got '%s'", row_type.data()); + // } + + auto row_name = state.cursor.read_field(); + // ROWS fields after the row name are unused; tolerate annotations/comments there. 
+ state.cursor.skip_to_eol(); + + // 'N' type is the objective row - store its name but don't add to constraints + if (row_type == 'N') { + state.observe_objective_row_name(row_name); + } else { + size_t row_idx = state.row_names_sv.size(); + state.row_names_sv.push_back(row_name); + state.observe_row_name_for_dense_index(row_name, row_idx); + state.problem.row_types_.push_back(row_type); + } + expect_eol(state.cursor); + } +} + +template +static void parse_rows_section(parse_state_t& state, const char* rows_end) +{ + scoped_timer_t timer("parse_rows"); + expect_section(state.cursor, "ROWS"); + + { + scoped_timer_t timer("parse_rows_scan"); + const char* rows_start = state.cursor.ptr; + + size_t rows_bytes = (size_t)(rows_end - state.cursor.ptr); + int num_threads = phase_thread_count(MPS_ROWS_THREAD_CAP); + bool parsed_parallel = false; + if (rows_bytes >= 512ull * 1024ull * 1024ull && num_threads > 1) { + parsed_parallel = + parse_rows_section_parallel_impl(state, state.cursor.ptr, rows_end, num_threads); + if (!parsed_parallel) { + state.row_names_sv.clear(); + state.problem.row_types_.clear(); + state.row_dense_candidate = true; + state.row_dense_prefix = {}; + state.row_dense_min_id = 0; + state.row_dense_max_id = 0; + state.row_dense_base_id = 0; + state.row_dense_pad_width = 0; + state.row_dense_zero_padded = false; + state.cursor.ptr = rows_start; + parse_rows_section_serial_impl(state, rows_end); + } + } else { + parse_rows_section_serial_impl(state, rows_end); + } + state.cursor.ptr = rows_end; + } + + state.problem.n_constraints_ = (i_t)state.row_names_sv.size(); + state.problem.b_.resize((size_t)state.problem.n_constraints_); + + { + scoped_timer_t timer("parse_rows_hash_init"); + state.init_row_hash_table(); + } +} + +// ============================================================================= +// Parallel COLUMNS parser +// ============================================================================= + +struct MarkerInfo { + enum Type { INTORG, 
// Per-chunk bookkeeping for detecting "dense" column names of the form
// <prefix><consecutive number>. Updated by observe_dense_col_name() once per new column.
struct DenseColChunkStats {
  // Stays true while every observed name fits the pattern; cleared when a name has no
  // numeric suffix, a different prefix, a non-consecutive number or an unexpected width.
  bool candidate = true;
  // Prefix (text before the trailing digits) of the first observed name.
  std::string_view prefix;
  // Number carried by the first observed name.
  uint64_t first_id = 0;
  // Number carried by the most recently accepted name.
  uint64_t last_id = 0;
  // Digit count of the first observed name's numeric suffix.
  size_t pad_width = 0;
  // Whether the first observed name's suffix is zero padded
  // (as reported by dense_suffix_is_zero_padded()).
  bool zero_padded = false;
  // Number of names accepted so far.
  size_t count = 0;
};
+ std::vector row_count_storage; + std::vector row_count_blocks; + std::vector row_count_block_dir; + std::string_view first_var_name; + std::string_view last_var_name; + DenseColChunkStats dense_col_stats; +}; + +struct ChunkBoundary { + const char* start; + const char* end; +}; + +struct BoundsChunkBoundary { + const char* start; + const char* end; +}; + +static inline int64_t& column_row_count_slot(ChunkResult& result, size_t row_idx) +{ + size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t local = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + int32_t block_pos = result.row_count_block_dir[block_id]; + if (__unlikely(block_pos < 0)) { + block_pos = (int32_t)result.row_count_blocks.size(); + result.row_count_block_dir[block_id] = block_pos; + RowCountBlock block; + block.block_id = block_id; + block.storage_offset = result.row_count_storage.size(); + result.row_count_storage.resize(block.storage_offset + COLUMN_ROW_COUNT_BLOCK_ROWS, 0); + result.row_count_blocks.push_back(std::move(block)); + } + return result + .row_count_storage[result.row_count_blocks[(size_t)block_pos].storage_offset + local]; +} + +static void observe_dense_col_name(DenseColChunkStats& stats, std::string_view name) +{ + if (!stats.candidate) { return; } + + std::string_view prefix; + uint64_t value = 0; + size_t suffix_width = 0; + if (!parse_trailing_u64(name, prefix, value, suffix_width)) { + stats.candidate = false; + return; + } + + if (stats.count == 0) { + stats.prefix = prefix; + stats.first_id = value; + stats.last_id = value; + stats.pad_width = suffix_width; + stats.zero_padded = dense_suffix_is_zero_padded(name, suffix_width); + stats.count = 1; + return; + } + + if (prefix != stats.prefix) { + stats.candidate = false; + return; + } + if (stats.last_id == std::numeric_limits::max() || value != stats.last_id + 1) { + stats.candidate = false; + return; + } + if (!dense_suffix_width_ok(value, suffix_width, stats.zero_padded, stats.pad_width)) { + stats.candidate = 
false; + return; + } + stats.last_id = value; + stats.count++; +} + +static bool dense_col_chunk_padding_compatible(const DenseColChunkStats& stats, + bool global_zero_padded, + size_t global_pad_width) +{ + if (global_zero_padded) { + return stats.pad_width == global_pad_width || + (!stats.zero_padded && decimal_digits_u64(stats.first_id) >= global_pad_width); + } + return !stats.zero_padded; +} + +// Read first field (column name) from a line without modifying any state +static std::string_view peek_line_column_name(const char* line_start, const char* end) +{ + const char* p = line_start; + while (p < end && *p <= ' ' && *p != '\n') + p++; + const char* field_start = p; + while (p < end && *p > ' ') + p++; + return std::string_view(field_start, (size_t)(p - field_start)); +} + +// Find the start of the next line +static const char* find_next_line(const char* p, const char* end) +{ + while (p < end && *p != '\n') + p++; + if (p < end) p++; + return p; +} + +static const char* find_bounds_body_end(const char* bounds_body_start, const char* parse_end) +{ + const char* p = bounds_body_start; + while (p < parse_end) { + if ((*p == 'E' && parse_end - p >= 6 && std::memcmp(p, "ENDATA", 6) == 0 && p[6] <= ' ') || + (*p == 'Q' && is_quadratic_section_start(p, parse_end)) || + (*p == 'R' && parse_end - p >= 6 && std::memcmp(p, "RANGES", 6) == 0 && p[6] <= ' ')) { + return p; + } + p = find_next_line(p, parse_end); + } + return parse_end; +} + +static std::vector compute_line_chunk_boundaries(const char* section_start, + const char* section_end, + int num_threads) +{ + scoped_timer_t timer("bounds_compute_chunk_boundaries"); + + size_t total_size = (size_t)(section_end - section_start); + size_t chunk_size = total_size / (size_t)num_threads; + + std::vector boundaries((size_t)num_threads); + boundaries[0].start = section_start; + for (int t = 0; t < num_threads; ++t) { + if (t == num_threads - 1) { + boundaries[(size_t)t].end = section_end; + } else { + const char* boundary 
= section_start + (size_t)(t + 1) * chunk_size; + boundaries[(size_t)t].end = find_next_line(boundary, section_end); + boundaries[(size_t)t + 1].start = boundaries[(size_t)t].end; + } + } + return boundaries; +} + +static std::vector compute_chunk_boundaries(const char* columns_start, + const char* columns_end, + int num_threads) +{ + scoped_timer_t timer("compute_chunk_boundaries"); + + size_t total_size = (size_t)(columns_end - columns_start); + size_t chunk_size = total_size / (size_t)num_threads; + + std::vector boundaries(num_threads); + + // Parallel boundary finding - each thread finds its own end at a column transition + // #pragma omp parallel for + for (int t = 0; t < num_threads; t++) { + if (t == 0) { boundaries[t].start = columns_start; } + + if (t == num_threads - 1) { + boundaries[t].end = columns_end; + } else { + // Find estimated position and align to line boundary + const char* estimated_end = columns_start + (t + 1) * chunk_size; + const char* line_start = estimated_end; + while (line_start < columns_end && *line_start != '\n') + line_start++; + if (line_start < columns_end) line_start++; + + // Read column name at this line + std::string_view col_name = peek_line_column_name(line_start, columns_end); + + // Scan forward until column name changes (to avoid splitting a column) + const char* boundary = line_start; + while (boundary < columns_end) { + const char* next_line = find_next_line(boundary, columns_end); + if (next_line >= columns_end) break; + + std::string_view next_col = peek_line_column_name(next_line, columns_end); + if (next_col != col_name && !next_col.empty() && next_col[0] != '\'') { + // Found a column transition (and it's not a MARKER line) + boundary = next_line; + break; + } + boundary = next_line; + } + boundaries[t].end = boundary; + } + } + + // Fix up start pointers (each start is previous end) + for (int t = 1; t < num_threads; t++) { + boundaries[t].start = boundaries[t - 1].end; + } + + return boundaries; +} + +template 
+static ChunkResult parse_columns_chunk(const char* chunk_start, + const char* chunk_end, + const parse_state_t& state) +{ + ChunkResult result; + + if (chunk_start >= chunk_end) { + result.col_offsets.push_back(0); + return result; + } + + size_t chunk_size = (size_t)(chunk_end - chunk_start); + size_t estimated_nnz = chunk_size / 100; + size_t estimated_cols = estimated_nnz / 10; + if (__unlikely(state.problem.n_constraints_ > (i_t)std::numeric_limits::max())) { + state.cursor.error("fast COLUMNS path requires <= INT32_MAX rows for chunk row indices"); + } + result.values.reserve(estimated_nnz); + result.row_indices.reserve(estimated_nnz); + result.col_offsets.reserve(estimated_cols + 1); + result.var_names.reserve(estimated_cols); + result.objective_entries.reserve(estimated_cols); + size_t n_row_blocks = ((size_t)state.problem.n_constraints_ + COLUMN_ROW_COUNT_BLOCK_ROWS - 1) / + COLUMN_ROW_COUNT_BLOCK_ROWS; + result.row_count_block_dir.resize(n_row_blocks, -1); + size_t estimated_touched_blocks = std::min(n_row_blocks, std::max(16, estimated_nnz)); + result.row_count_blocks.reserve(estimated_touched_blocks); + result.row_count_storage.reserve(estimated_touched_blocks * COLUMN_ROW_COUNT_BLOCK_ROWS); + + cursor_t cursor(chunk_start, (size_t)(chunk_end - chunk_start)); + std::string_view prev_var_name = ""; + + cursor.skip_ws(); + + while (!cursor.done()) { + if (__unlikely(*cursor.ptr == 'R')) { + auto next = cursor.peek_field(); + // RHS section is mandatory right after COLUMNS section + if (next == "RHS") { break; } + } + + auto [var_name, field2] = cursor.read_two_fields(); + if (__unlikely(!field2.empty() && field2[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + + // Check for integer marker + if (__unlikely(field2[0] == '\'' && field2 == "'MARKER'")) { + auto marker_type = cursor.read_field(); + + MarkerInfo marker; + marker.after_local_var_idx = + result.var_names.empty() ? 
SIZE_MAX : result.var_names.size() - 1; + + if (marker_type == "'INTORG'") { + marker.type = MarkerInfo::INTORG; + } else { + marker.type = MarkerInfo::INTEND; + } + result.markers.push_back(marker); + + while (!cursor.done() && !cursor.eol()) + cursor.ptr++; + if (!cursor.done()) cursor.ptr++; + cursor.skip_ws(); + continue; + } + + auto row_name = field2; + // quite often in MIPs the coefficient is just a single-digit integer + double value; + double sign = 1.0; + if (cursor.ptr[0] == '-') { + sign = -1.0; + cursor.advance(1); + } + if (cursor.ptr + 1 < cursor.end && is_digit_byte(cursor.ptr[0]) && cursor.ptr[1] == '\n') { + value = sign * (cursor.ptr[0] - '0'); + cursor.advance(1); + } else { + value = sign * fast_atof_advance(cursor.ptr, cursor.end); + } + // usually EOL directly follows + if (__unlikely(!cursor.eol())) { cursor.skip_ws(); } + accept_comment(cursor); + + if (result.first_var_name.empty()) { result.first_var_name = var_name; } + result.last_var_name = var_name; + + if (prev_var_name != var_name) { + result.var_names.push_back(var_name); + observe_dense_col_name(result.dense_col_stats, var_name); + result.col_offsets.push_back(result.values.size()); + prev_var_name = var_name; + } + + auto add_entry = [&](std::string_view rn, double val) { + size_t row_idx = state.row_lookup(rn); + if (__likely(row_idx != SIZE_MAX)) { + assert(row_idx <= (size_t)std::numeric_limits::max()); + result.values.push_back(val); + result.row_indices.push_back((uint32_t)row_idx); + column_row_count_slot(result, row_idx)++; + } else if (__likely(rn == state.objective_name_sv)) { + result.objective_entries.push_back({result.var_names.size() - 1, val}); + } else if (state.is_ignored_objective_name(rn)) { + return; + } else { + state.cursor.error("unknown row name in COLUMNS: %.*s", (int)rn.size(), rn.data()); + } + }; + + add_entry(row_name, value); + + // Optional second entry on same line + if (!cursor.eol()) { + auto row_name2 = cursor.read_field(); + if 
(__unlikely(!row_name2.empty() && row_name2[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + double value2 = fast_atof_advance(cursor.ptr, cursor.end); + cursor.skip_ws(); + accept_comment(cursor); + + add_entry(row_name2, value2); + } + + expect_eol(cursor); + } + + result.col_offsets.push_back(result.values.size()); + + return result; +} + +// Fused merge + CSR construction: directly builds CSR from chunks without intermediate global CSC +template +static void merge_chunk_results_to_csr(parse_state_t& state, + std::vector& chunks, + int num_threads) +{ + scoped_timer_t timer("merge_chunks_to_csr"); + + int num_chunks = (int)chunks.size(); + if (num_chunks == 0) return; + + i_t n_rows = state.problem.n_constraints_; + + std::vector global_col_offset(num_chunks + 1); + global_col_offset[0] = 0; + size_t total_nnz = 0; + { + scoped_timer_t timer("columns_global_offsets"); + for (int t = 0; t < num_chunks; t++) { + global_col_offset[t + 1] = global_col_offset[t] + chunks[t].var_names.size(); + total_nnz += chunks[t].values.size(); + } + } + size_t total_cols = global_col_offset[num_chunks]; + { + scoped_timer_t timer("columns_dense_metadata"); + bool dense_ok = total_cols > 0; + bool have_first = false; + std::string_view dense_prefix; + uint64_t expected_next_id = 0; + uint64_t dense_min_id = 0; + uint64_t dense_max_id = 0; + size_t dense_pad_width = 0; + bool dense_zero_padded = false; + + for (int t = 0; t < num_chunks && dense_ok; ++t) { + const auto& stats = chunks[t].dense_col_stats; + if (stats.count == 0) { continue; } + if (!stats.candidate || stats.count != chunks[t].var_names.size()) { + dense_ok = false; + break; + } + if (!have_first) { + have_first = true; + dense_prefix = stats.prefix; + expected_next_id = stats.first_id; + dense_min_id = stats.first_id; + dense_pad_width = stats.pad_width; + dense_zero_padded = stats.zero_padded; + } + if (stats.prefix != dense_prefix || stats.first_id != expected_next_id || + 
!dense_col_chunk_padding_compatible(stats, dense_zero_padded, dense_pad_width)) { + dense_ok = false; + break; + } + if (stats.last_id < stats.first_id || stats.last_id - stats.first_id + 1 != stats.count) { + dense_ok = false; + break; + } + dense_max_id = stats.last_id; + if (stats.last_id == std::numeric_limits::max()) { + expected_next_id = stats.last_id; + dense_ok = false; + break; + } + expected_next_id = stats.last_id + 1; + } + + if (!have_first || dense_max_id < dense_min_id || + dense_max_id - dense_min_id + 1 != total_cols) { + dense_ok = false; + } + + state.col_dense_ordered = dense_ok; + if (dense_ok) { + state.col_dense_prefix = dense_prefix; + state.col_dense_min_id = dense_min_id; + state.col_dense_max_id = dense_max_id; + state.col_dense_pad_width = dense_pad_width; + state.col_dense_zero_padded = dense_zero_padded; + } + } + + // Step 2: Sum row counts (already computed during parsing) and build CSR row_offsets + std::vector global_row_counts((size_t)n_rows, 0); + { + scoped_timer_t timer("columns_sum_row_counts"); + for (int t = 0; t < num_chunks; t++) { + for (const auto& block : chunks[t].row_count_blocks) { + const int64_t* block_counts = chunks[t].row_count_storage.data() + block.storage_offset; + size_t row_base = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)n_rows - row_base); + for (size_t local = 0; local < block_limit; ++local) { + global_row_counts[row_base + local] += (i_t)block_counts[local]; + } + } + } + } + { + scoped_timer_t timer("columns_build_row_offsets"); + state.problem.A_offsets_.resize((size_t)n_rows + 1); + state.problem.A_offsets_[0] = 0; + for (i_t r = 0; r < n_rows; r++) { + state.problem.A_offsets_[(size_t)r + 1] = + state.problem.A_offsets_[(size_t)r] + global_row_counts[(size_t)r]; + } + } + + { + scoped_timer_t timer("columns_counts_to_write_positions"); + std::fill(global_row_counts.begin(), global_row_counts.end(), i_t{0}); + for (int t = 0; 
t < num_chunks; t++) { + for (auto& block : chunks[t].row_count_blocks) { + int64_t* block_counts = chunks[t].row_count_storage.data() + block.storage_offset; + size_t row_base = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)n_rows - row_base); + for (size_t local = 0; local < block_limit; ++local) { + int64_t count = block_counts[local]; + if (count == 0) continue; + size_t row = row_base + local; + i_t pos = state.problem.A_offsets_[row] + global_row_counts[row]; + block_counts[local] = (int64_t)pos; + global_row_counts[row] += (i_t)count; + } + } + } + } + + { + scoped_timer_t timer("columns_row_count_storage_hugepages"); +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < num_chunks; ++t) { + materialize_vector_hugepages( + "column_row_count_storage", chunks[t].row_count_storage, materialize_touch_t::write_2mb); + } + } + + // Step 6: Allocate CSR arrays + { + scoped_timer_t timer("allocate_csr_arrays"); + + // May be unexpectedly slow, even if already reserved() to good fit. 
+ // I assume the cause is probably that the pages aren't actually backed when reserve() is called + // and the actual physical allocation only happens now + + // evil tweak until we can refactior problem_t + // run the zero-init resize() calls in parallel + +#pragma omp parallel sections num_threads(4) + { +#pragma omp section + { + state.problem.A_.resize(total_nnz); + } +#pragma omp section + { + state.problem.A_indices_.resize(total_nnz); + } +#pragma omp section + { + if (!state.col_dense_ordered) { state.var_names_sv.resize(total_cols); } + } +#pragma omp section + { + state.problem.var_types_.resize(total_cols); + } + } + } + + // Step 6: Parallel scatter into CSR + copy var_names + { + scoped_timer_t timer("scatter_into_csr"); + { + scoped_timer_t matrix_timer("scatter_matrix_entries"); +#ifdef MPS_FAST_PERF_COUNTERS + std::vector perf_snapshots((size_t)num_chunks); +#endif +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < num_chunks; t++) { +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; +#endif + auto& chunk = chunks[t]; + + for (size_t local_col = 0; local_col < chunks[t].var_names.size(); local_col++) { + i_t global_col = (i_t)(global_col_offset[t] + local_col); + + size_t col_start = chunks[t].col_offsets[local_col]; + size_t col_end = chunks[t].col_offsets[local_col + 1]; + for (size_t idx = col_start; idx < col_end; idx++) { + i_t row = (i_t)chunks[t].row_indices[idx]; + size_t row_idx = (size_t)row; + size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t local = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + int32_t block_pos = chunk.row_count_block_dir[block_id]; + RowCountBlock& block = chunk.row_count_blocks[(size_t)block_pos]; + int64_t& write_pos = chunk.row_count_storage[block.storage_offset + local]; + i_t dest = (i_t)write_pos++; + state.problem.A_[dest] = (f_t)chunks[t].values[idx]; + state.problem.A_indices_[dest] = global_col; + } + } +#ifdef MPS_FAST_PERF_COUNTERS + 
perf_snapshots[(size_t)t] = perf_counters.stop(); +#endif + } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("scatter_matrix_entries", perf_snapshots); +#endif + } + + if (!state.col_dense_ordered) { + { + scoped_timer_t names_timer("scatter_var_names"); +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < num_chunks; t++) { + for (size_t i = 0; i < chunks[t].var_names.size(); i++) { + state.var_names_sv[global_col_offset[t] + i] = chunks[t].var_names[i]; + } + } + } + } else { + scoped_timer_t names_timer("scatter_var_names"); + } + } + + // Step 7: Apply integer markers + struct GlobalMarker { + MarkerInfo::Type type; + size_t global_var_idx; + }; + { + scoped_timer_t timer("columns_apply_markers"); + std::vector all_markers; + + for (int t = 0; t < num_chunks; t++) { + for (const auto& m : chunks[t].markers) { + GlobalMarker gm; + gm.type = m.type; + + if (m.after_local_var_idx == SIZE_MAX) { + // Marker before any variable in this chunk + gm.global_var_idx = (global_col_offset[t] > 0) ? global_col_offset[t] - 1 : SIZE_MAX; + } else { + gm.global_var_idx = global_col_offset[t] + m.after_local_var_idx; + } + all_markers.push_back(gm); + } + } + + std::sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) { + // SIZE_MAX means "before all variables" - should sort first + if (a.global_var_idx == SIZE_MAX && b.global_var_idx != SIZE_MAX) return true; + if (b.global_var_idx == SIZE_MAX && a.global_var_idx != SIZE_MAX) return false; + return a.global_var_idx < b.global_var_idx; + }); + + bool is_integer = false; + size_t marker_idx = 0; + + for (size_t v = 0; v < total_cols; v++) { + while (marker_idx < all_markers.size() && + (all_markers[marker_idx].global_var_idx == SIZE_MAX || + all_markers[marker_idx].global_var_idx < v)) { + if (all_markers[marker_idx].type == MarkerInfo::INTORG) { + is_integer = true; + } else { + is_integer = false; + } + marker_idx++; + } + state.problem.var_types_[v] = is_integer ? 
'I' : 'C'; + } + } + + // Step 8: Handle objective entries + { + scoped_timer_t timer("columns_objective_entries"); + state.problem.c_.resize(total_cols, f_t{0}); + for (int t = 0; t < num_chunks; t++) { + for (const auto& [local_col, coeff] : chunks[t].objective_entries) { + size_t global_col = global_col_offset[t] + local_col; + if (global_col < total_cols) { state.problem.c_[global_col] = (f_t)coeff; } + } + } + } + + // Store final dimensions; CSR and objective coefficients are already complete. + state.problem.n_vars_ = (i_t)total_cols; + state.problem.nnz_ = (i_t)total_nnz; +} + +template +static void parse_columns_section_parallel(parse_state_t& state, + int num_threads, + const char* columns_end) +{ + scoped_timer_t timer("parse_columns_parallel"); + + if (num_threads <= 0) { num_threads = phase_thread_count(MPS_COLUMNS_THREAD_CAP); } + + // Skip the "COLUMNS" header + expect_section(state.cursor, "COLUMNS"); + + const char* columns_start = state.cursor.ptr; + size_t columns_bytes = (size_t)(columns_end - columns_start); + size_t chunk_limited_threads = std::max(1, columns_bytes / MPS_COLUMNS_MIN_CHUNK_BYTES); + num_threads = std::max(1, std::min(num_threads, (int)chunk_limited_threads)); + + // Compute chunk boundaries + auto chunk_bounds = compute_chunk_boundaries(columns_start, columns_end, num_threads); + + // Parse chunks in parallel + std::vector results(num_threads); + + { + scoped_timer_t timer("parse_columns_chunk_parallel"); +#ifdef MPS_FAST_PERF_COUNTERS + std::vector perf_snapshots((size_t)num_threads); +#endif + { +#pragma omp parallel for num_threads(num_threads) + for (int t = 0; t < num_threads; t++) { + MPS_NVTX_RANGE(std::string("columns_chunk ") + std::to_string(t), nvtx::colors::columns); +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; +#endif + results[t] = + parse_columns_chunk(chunk_bounds[t].start, chunk_bounds[t].end, state); +#ifdef MPS_FAST_PERF_COUNTERS + perf_snapshots[(size_t)t] = perf_counters.stop(); 
+#endif + } + } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("parse_columns_chunk_parallel", perf_snapshots); +#endif + } + + // Merge results directly into CSR format + merge_chunk_results_to_csr(state, results, num_threads); + + // Update cursor to RHS section + state.cursor.ptr = columns_end; + state.cursor.skip_ws(); +} + +template +static void parse_rhs_section(parse_state_t& state, cursor_t& cursor) +{ + scoped_timer_t timer("parse_rhs"); + expect_section(cursor, "RHS"); + + auto field_from_start = [](const char* start, const char* end) { + const char* p = start; + while (p < end && *p > ' ') { + p++; + } + return std::string_view(start, (size_t)(p - start)); + }; + + auto apply_rhs = [&](const char* row_start, size_t row_idx, f_t value) { + if (row_idx != SIZE_MAX) { + state.problem.b_[row_idx] = value; + return; + } + std::string_view row_name = field_from_start(row_start, cursor.end); + if (row_name == state.objective_name_sv) { + state.problem.objective_offset_ = -value; + return; + } + if (state.is_ignored_objective_name(row_name)) { return; } + error_unknown_row(cursor, row_start, "RHS"); + }; + + while (cursor.ptr < cursor.end && !is_rhs_section_end(cursor.ptr, cursor.end)) { + auto rhs_name = cursor.read_field(); + (void)rhs_name; + if (accept_comment(cursor)) { + expect_eol(cursor); + continue; + } + const char* row_start = cursor.ptr; + size_t row_idx = state.read_row_lookup(cursor); + auto value = expect_number_fast_pm_one(cursor); + apply_rhs(row_start, row_idx, (f_t)value); + + accept_comment(cursor); + if (!cursor.eol()) { + const char* row_start2 = cursor.ptr; + size_t row_idx2 = state.read_row_lookup(cursor); + auto value2 = expect_number_fast_pm_one(cursor); + apply_rhs(row_start2, row_idx2, (f_t)value2); + accept_comment(cursor); + } + expect_eol(cursor); + } +} + +template +static bool parse_bounds_section_parallel_dense(parse_state_t& state, + cursor_t& cursor, + const char* bounds_body_start, + const char* bounds_body_end, + size_t 
n_vars) +{ + const size_t bounds_bytes = (size_t)(bounds_body_end - bounds_body_start); + const int num_threads = phase_thread_count(MPS_BOUNDS_THREAD_CAP); + if (!state.col_dense_ordered || bounds_bytes < MPS_BOUNDS_PARALLEL_MIN_BYTES || num_threads < 2) { + return false; + } + + MPS_NVTX_RANGE("parse_bounds_parallel_dense", nvtx::colors::bounds); + + struct BoundsParallelStats { + size_t lines = 0; + size_t dense_hits = 0; + size_t dense_misses = 0; + size_t comments = 0; + size_t min_var = SIZE_MAX; + size_t max_var = 0; + size_t non_strict_order = 0; + bool saw_integer_type = false; + bool saw_negative_upper = false; + const char* error_ptr = nullptr; + char error_msg[192] = {}; + }; + + std::vector stats((size_t)num_threads); + auto boundaries = compute_line_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads); + + std::vector bound_seen; + { + scoped_timer_t timer("bounds_parallel_seen_alloc"); + bound_seen.resize(n_vars, 0); + } + + { + scoped_timer_t timer("parse_bounds_parallel_dense"); + // Duplicate or non-monotone BOUNDS updates are file-order dependent. Parse + // optimistically, then accept only if chunk summaries prove strict order. 
+#pragma omp parallel for schedule(static) num_threads(num_threads) + for (int t = 0; t < num_threads; ++t) { + auto& local = stats[(size_t)t]; + cursor_t cursor(boundaries[(size_t)t].start, + (size_t)(boundaries[(size_t)t].end - boundaries[(size_t)t].start)); + cursor.skip_ws(); + size_t prev_var = SIZE_MAX; + try { + while (cursor.ptr < cursor.end) { + if (__unlikely(*cursor.ptr == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + local.comments++; + continue; + } + + auto bound_type = cursor.read_field(); + if (__unlikely(bound_type.empty())) { break; } + if (__unlikely(bound_type[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + local.comments++; + continue; + } + + auto bound_name = cursor.read_field(); + (void)bound_name; + auto var_name = cursor.read_field(); + if (__unlikely(!var_name.empty() && var_name[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + local.comments++; + continue; + } + + size_t var_idx = state.col_lookup_dense_ordered(var_name); + if (__unlikely(var_idx == SIZE_MAX)) { + local.dense_misses++; + std::snprintf(local.error_msg, + sizeof(local.error_msg), + "unknown variable name in BOUNDS: %.*s", + (int)var_name.size(), + var_name.data()); + local.error_ptr = cursor.ptr; + break; + } + local.dense_hits++; + local.lines++; + local.min_var = std::min(local.min_var, var_idx); + local.max_var = std::max(local.max_var, var_idx); + if (prev_var != SIZE_MAX && var_idx <= prev_var) { local.non_strict_order++; } + prev_var = var_idx; + + bool first_bound_for_var = bound_seen[var_idx] == 0; + bound_seen[var_idx] = 1; + + f_t value = 0; + accept_comment(cursor); + if (!cursor.eol()) { + value = (f_t)expect_number_fast_pm_one(cursor); + accept_comment(cursor); + } + + if (bound_type == "LO") { + state.problem.variable_lower_bounds_[var_idx] = value; + } else if (bound_type == "UP") { + state.problem.variable_upper_bounds_[var_idx] = value; + if (first_bound_for_var && value < f_t{0}) { + 
state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + local.saw_negative_upper = true; + } + } else if (bound_type == "FX") { + state.problem.variable_lower_bounds_[var_idx] = value; + state.problem.variable_upper_bounds_[var_idx] = value; + } else if (bound_type == "FR") { + state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits::infinity(); + } else if (bound_type == "MI") { + state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + } else if (bound_type == "PL") { + state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits::infinity(); + } else if (bound_type == "BV") { + state.problem.variable_lower_bounds_[var_idx] = 0; + state.problem.variable_upper_bounds_[var_idx] = 1; + state.problem.var_types_[var_idx] = 'I'; + local.saw_integer_type = true; + } else if (bound_type == "LI") { + state.problem.variable_lower_bounds_[var_idx] = value; + state.problem.var_types_[var_idx] = 'I'; + local.saw_integer_type = true; + } else if (bound_type == "UI") { + state.problem.variable_upper_bounds_[var_idx] = value; + if (first_bound_for_var && value < f_t{0}) { + state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + local.saw_negative_upper = true; + } + state.problem.var_types_[var_idx] = 'I'; + local.saw_integer_type = true; + } else { + std::snprintf(local.error_msg, + sizeof(local.error_msg), + "unknown bound type: %.*s", + (int)bound_type.size(), + bound_type.data()); + local.error_ptr = cursor.ptr; + break; + } + + expect_eol(cursor); + } + } catch (const std::exception& e) { + std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", e.what()); + local.error_ptr = cursor.ptr; + } + } + } + + size_t dense_misses = 0; + size_t non_strict_order = 0; + size_t overlap_chunks = 0; + size_t prev_max = SIZE_MAX; + for (int t = 0; t < num_threads; ++t) { + const auto& local = 
stats[(size_t)t]; + if (local.error_ptr != nullptr) { + cursor.ptr = local.error_ptr; + cursor.error("%s", local.error_msg); + } + dense_misses += local.dense_misses; + non_strict_order += local.non_strict_order; + if (local.lines > 0) { + if (prev_max != SIZE_MAX && local.min_var <= prev_max) { overlap_chunks++; } + prev_max = local.max_var; + } + } + + const bool order_safe = dense_misses == 0 && non_strict_order == 0 && overlap_chunks == 0; + + if (!order_safe) { + cursor.ptr = bounds_body_start; + return false; + } + + { + scoped_timer_t timer("bounds_integer_defaults"); + for (size_t i = 0; i < n_vars; ++i) { + if (!bound_seen[i] && state.problem.var_types_[i] == 'I') { + state.problem.variable_lower_bounds_[i] = f_t{0}; + state.problem.variable_upper_bounds_[i] = f_t{1}; + } + } + } + + cursor.ptr = bounds_body_end; + return true; +} + +template +static void parse_bounds_section(parse_state_t& state, + cursor_t& cursor, + bool allow_parallel_dense = false) +{ + size_t n_vars = (size_t)state.problem.n_vars_; + + // Initialize bounds with defaults + { + scoped_timer_t timer("bounds_init_defaults"); + const bool parallel_init = + n_vars >= MPS_BOUNDS_PARALLEL_INIT_MIN_VARS && omp_get_max_threads() >= 2; + + if (parallel_init) { +#pragma omp parallel sections num_threads(2) + { +#pragma omp section + { + state.problem.variable_lower_bounds_.resize(n_vars, f_t{0}); + } +#pragma omp section + { + state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits::infinity()); + } + } + } else { + state.problem.variable_lower_bounds_.resize(n_vars, f_t{0}); + state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits::infinity()); + } + } + + { + scoped_timer_t timer("bounds_madvise_pretouch"); + materialize_vector_hugepages("variable_lower_bounds", + state.problem.variable_lower_bounds_, + materialize_touch_t::write_4kb); + materialize_vector_hugepages("variable_upper_bounds", + state.problem.variable_upper_bounds_, + 
materialize_touch_t::write_4kb); + } + + std::vector bound_seen((n_vars + 63) / 64, 0); + auto has_bound = [&](size_t var_idx) { + return (bound_seen[var_idx >> 6] & (uint64_t{1} << (var_idx & 63))) != 0; + }; + auto mark_bound = [&](size_t var_idx) { + bound_seen[var_idx >> 6] |= uint64_t{1} << (var_idx & 63); + }; + auto apply_unspecified_integer_bounds = [&]() { + scoped_timer_t timer("bounds_integer_defaults"); + for (size_t i = 0; i < n_vars; ++i) { + if (!has_bound(i) && state.problem.var_types_[i] == 'I') { + state.problem.variable_lower_bounds_[i] = f_t{0}; + state.problem.variable_upper_bounds_[i] = f_t{1}; + } + } + }; + + if (!accept_section(cursor, "BOUNDS")) { + apply_unspecified_integer_bounds(); + return; + } + + const char* bounds_body_start = cursor.ptr; + const char* bounds_body_end = + allow_parallel_dense ? find_bounds_body_end(bounds_body_start, cursor.end) : cursor.end; + if (allow_parallel_dense) { + if (parse_bounds_section_parallel_dense( + state, cursor, bounds_body_start, bounds_body_end, n_vars)) { + return; + } + { + scoped_timer_t timer("bounds_parallel_fallback_reset"); + std::fill(state.problem.variable_lower_bounds_.begin(), + state.problem.variable_lower_bounds_.end(), + f_t{0}); + std::fill(state.problem.variable_upper_bounds_.begin(), + state.problem.variable_upper_bounds_.end(), + std::numeric_limits::infinity()); + } + } + + size_t hint_idx = 0; + { + scoped_timer_t timer("parse_bounds"); + for (;;) { + bool done = cursor.done() || peek(cursor) == "RANGES" || peek(cursor) == "ENDATA" || + is_quadratic_section_start(cursor.ptr, cursor.end); + if (done) break; + + auto bound_type = cursor.read_field(); + auto bound_name = cursor.read_field(); + (void)bound_name; + auto var_name = cursor.read_field(); + if (__unlikely(!var_name.empty() && var_name[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + + // optimized lookup using hint (bounds often in same order as columns) + size_t var_idx = SIZE_MAX; + if 
(__likely(state.col_dense_ordered)) { + var_idx = state.col_lookup_dense_ordered(var_name); + if (var_idx == SIZE_MAX) { + cursor.error( + "unknown variable name in BOUNDS: %.*s", (int)var_name.size(), var_name.data()); + } + } else if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) { + var_idx = hint_idx + 1; + } else if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) { + var_idx = hint_idx; + } else { + size_t search_start = hint_idx + 2; + size_t search_end = n_vars; + + search_loop: + for (size_t i = search_start; i < search_end; ++i) { + if (state.var_names_sv[i] == var_name) { + var_idx = i; + goto found; + } + } + if (search_start != 0) { + search_end = hint_idx; + search_start = 0; + goto search_loop; + } + cursor.error( + "unknown variable name in BOUNDS: %.*s", (int)var_name.size(), var_name.data()); + } + found: + hint_idx = var_idx; + bool first_bound_for_var = !has_bound(var_idx); + + f_t value = 0; + accept_comment(cursor); + if (!cursor.eol()) { + // bounds are often just set to 0 or 1 + if (false && isdigit(cursor.ptr[0]) && cursor.ptr[1] == '\n' && cursor.ptr[2] == ' ') { + value = cursor.ptr[0] - '0'; + cursor.ptr += 1; + } else { + value = (f_t)expect_number(cursor); + } + accept_comment(cursor); + } + + if (bound_type == "LO") { + state.problem.variable_lower_bounds_[var_idx] = value; + } else if (bound_type == "UP") { + state.problem.variable_upper_bounds_[var_idx] = value; + if (first_bound_for_var && value < f_t{0}) { + state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + } + } else if (bound_type == "FX") { + state.problem.variable_lower_bounds_[var_idx] = value; + state.problem.variable_upper_bounds_[var_idx] = value; + } else if (bound_type == "FR") { + state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits::infinity(); + } else if (bound_type == "MI") { + 
state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + } else if (bound_type == "PL") { + state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits::infinity(); + } else if (bound_type == "BV") { + state.problem.variable_lower_bounds_[var_idx] = 0; + state.problem.variable_upper_bounds_[var_idx] = 1; + state.problem.var_types_[var_idx] = 'I'; + } else if (bound_type == "LI") { + state.problem.variable_lower_bounds_[var_idx] = value; + state.problem.var_types_[var_idx] = 'I'; + } else if (bound_type == "UI") { + state.problem.variable_upper_bounds_[var_idx] = value; + if (first_bound_for_var && value < f_t{0}) { + state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + } + state.problem.var_types_[var_idx] = 'I'; + } else { + cursor.error("unknown bound type: %.*s", (int)bound_type.size(), bound_type.data()); + } + mark_bound(var_idx); + + expect_eol(cursor); + } + } + apply_unspecified_integer_bounds(); +} + +template +static void parse_ranges_section(parse_state_t& state, cursor_t& cursor) +{ + scoped_timer_t timer("parse_ranges"); + + // Initialize constraint bounds from row_types and b_ + state.problem.constraint_lower_bounds_.resize((size_t)state.problem.n_constraints_); + state.problem.constraint_upper_bounds_.resize((size_t)state.problem.n_constraints_); + + for (i_t i = 0; i < state.problem.n_constraints_; ++i) { + char row_type = state.problem.row_types_[i]; + f_t b = state.problem.b_[i]; + if (row_type == 'E') { + state.problem.constraint_lower_bounds_[i] = b; + state.problem.constraint_upper_bounds_[i] = b; + } else if (row_type == 'L') { + state.problem.constraint_lower_bounds_[i] = -std::numeric_limits::infinity(); + state.problem.constraint_upper_bounds_[i] = b; + } else if (row_type == 'G') { + state.problem.constraint_lower_bounds_[i] = b; + state.problem.constraint_upper_bounds_[i] = std::numeric_limits::infinity(); + } + } + + if (!accept_section(cursor, "RANGES")) { return; } + + 
auto apply_range = [&](std::string_view row_name, f_t range_val) { + size_t row_idx = state.row_lookup(row_name); + if (row_idx == SIZE_MAX) { + cursor.error("unknown row name in RANGES: %.*s", (int)row_name.size(), row_name.data()); + } + char row_type = state.problem.row_types_[row_idx]; + f_t abs_range = std::abs(range_val); + + if (row_type == 'E') { + if (range_val >= 0) { + state.problem.constraint_upper_bounds_[row_idx] = + state.problem.constraint_lower_bounds_[row_idx] + abs_range; + } else { + state.problem.constraint_lower_bounds_[row_idx] = + state.problem.constraint_upper_bounds_[row_idx] - abs_range; + } + } else if (row_type == 'L') { + state.problem.constraint_lower_bounds_[row_idx] = + state.problem.constraint_upper_bounds_[row_idx] - abs_range; + } else if (row_type == 'G') { + state.problem.constraint_upper_bounds_[row_idx] = + state.problem.constraint_lower_bounds_[row_idx] + abs_range; + } + }; + + while (cursor.ptr < cursor.end && peek(cursor) != "BOUNDS" && peek(cursor) != "ENDATA" && + !is_quadratic_section_start(cursor.ptr, cursor.end)) { + auto range_name = cursor.read_field(); + (void)range_name; + if (accept_comment(cursor)) { + expect_eol(cursor); + continue; + } + auto row_name = cursor.read_field(); + auto value = (f_t)expect_number(cursor); + apply_range(row_name, value); + + accept_comment(cursor); + if (!cursor.eol()) { + auto row_name2 = cursor.read_field(); + if (__unlikely(!row_name2.empty() && row_name2[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + auto value2 = (f_t)expect_number(cursor); + apply_range(row_name2, value2); + accept_comment(cursor); + } + expect_eol(cursor); + } +} + +template +static void build_var_name_map_if_needed(parse_state_t& state) +{ + if (state.col_dense_ordered || !state.var_names_map.empty()) { return; } + scoped_timer_t timer("quadratic_build_var_name_map"); + state.var_names_map.reserve((size_t)state.problem.n_vars_ * 2); + for (size_t i = 0; i < 
state.var_names_sv.size(); ++i) { + state.var_names_map.emplace(state.var_names_sv[i], i); + } +} + +template +static size_t lookup_quadratic_var(parse_state_t& state, std::string_view name) +{ + if (state.col_dense_ordered) { return state.col_lookup_dense_ordered(name); } + auto it = state.var_names_map.find(name); + return it == state.var_names_map.end() ? SIZE_MAX : it->second; +} + +template +static void build_quadratic_csr(parse_state_t& state, + const std::vector>& entries, + bool symmetric_upper_triangular) +{ + scoped_timer_t timer("build_quadratic_csr"); + const size_t n_vars = (size_t)state.problem.n_vars_; + if (entries.empty()) { return; } + + struct ExpandedEntry { + size_t row; + size_t col; + size_t seq; + f_t value; + }; + + std::vector expanded; + expanded.reserve(symmetric_upper_triangular ? entries.size() * 2 : entries.size()); + size_t seq = 0; + for (const auto& [row_i, col_i, value] : entries) { + size_t row = (size_t)row_i; + size_t col = (size_t)col_i; + expanded.push_back({row, col, seq++, value}); + if (symmetric_upper_triangular && row != col) { expanded.push_back({col, row, seq++, value}); } + } + + std::stable_sort(expanded.begin(), expanded.end(), [](const auto& a, const auto& b) { + if (a.row != b.row) return a.row < b.row; + if (a.col != b.col) return a.col < b.col; + return a.seq < b.seq; + }); + + auto& values = state.problem.Q_objective_values_; + auto& indices = state.problem.Q_objective_indices_; + auto& offsets = state.problem.Q_objective_offsets_; + values.clear(); + indices.clear(); + offsets.assign(n_vars + 1, i_t{0}); + values.reserve(expanded.size()); + indices.reserve(expanded.size()); + + size_t current_row = 0; + offsets[0] = 0; + for (const auto& entry : expanded) { + while (current_row < entry.row) { + offsets[++current_row] = (i_t)values.size(); + } + values.push_back(entry.value * f_t{0.5}); + indices.push_back((i_t)entry.col); + } + while (current_row < n_vars) { + offsets[++current_row] = (i_t)values.size(); + } 
+} + +template +[[maybe_unused]] static void parse_quadratic_sections(parse_state_t& state, + cursor_t& cursor) +{ + scoped_timer_t timer("parse_quadratic_sections"); + if (cursor.done() || peek(cursor) == "ENDATA") { return; } + if (!is_quadratic_section_start(cursor.ptr, cursor.end)) { return; } + + build_var_name_map_if_needed(state); + std::vector> quadobj_entries; + std::vector> qmatrix_entries; + std::vector>* active_entries = nullptr; + + auto add_entry = [&](std::string_view var1, std::string_view var2, f_t value) { + size_t var1_idx = lookup_quadratic_var(state, var1); + if (var1_idx == SIZE_MAX) { + cursor.error("unknown variable name in QUADOBJ/QMATRIX: %.*s", (int)var1.size(), var1.data()); + } + size_t var2_idx = lookup_quadratic_var(state, var2); + if (var2_idx == SIZE_MAX) { + cursor.error("unknown variable name in QUADOBJ/QMATRIX: %.*s", (int)var2.size(), var2.data()); + } + active_entries->emplace_back((i_t)var1_idx, (i_t)var2_idx, value); + }; + + while (cursor.ptr < cursor.end) { + if (peek(cursor) == "ENDATA") { break; } + if (accept_section(cursor, "QUADOBJ")) { + active_entries = &quadobj_entries; + continue; + } + if (accept_section(cursor, "QMATRIX")) { + active_entries = &qmatrix_entries; + continue; + } + if (active_entries == nullptr) { break; } + + auto var1 = cursor.read_field(); + if (__unlikely(var1.empty())) { break; } + if (__unlikely(var1[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + auto var2 = cursor.read_field(); + if (__unlikely(!var2.empty() && var2[0] == '$')) { + cursor.skip_to_eol(); + expect_eol(cursor); + continue; + } + f_t value = (f_t)expect_number(cursor); + add_entry(var1, var2, value); + accept_comment(cursor); + expect_eol(cursor); + } + + if (!quadobj_entries.empty()) { + build_quadratic_csr(state, quadobj_entries, true); + } else if (!qmatrix_entries.empty()) { + build_quadratic_csr(state, qmatrix_entries, false); + } +} + +template +static void set_cursor_range(parse_state_t& 
state, mps_phase_range_t range) +{ + state.cursor.ptr = range.begin; + state.cursor.end = range.end; +} + +template +static void parse_header_range(parse_state_t& state, mps_phase_range_t range) +{ + set_cursor_range(state, range); + accept_comment_line(state.cursor); + if (state.cursor.done()) { return; } + parse_name_section(state); + parse_objsense_section(state); + parse_objname_section(state); +} + +template +static void parse_rows_range(parse_state_t& state, mps_phase_range_t range) +{ + set_cursor_range(state, range); + parse_rows_section(state, range.end); +} + +template +static void parse_columns_range(parse_state_t& state, + mps_phase_range_t range, + int num_threads = 0) +{ + set_cursor_range(state, range); + parse_columns_section_parallel(state, num_threads, range.end); +} + +template +static void parse_rhs_range(parse_state_t& state, mps_phase_range_t range) +{ + if (!range.present) { return; } + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_rhs_section(state, cursor); +} + +template +static void parse_bounds_range(parse_state_t& state, + mps_phase_range_t range, + const char* fallback_ptr) +{ + if (range.present) { + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_bounds_section(state, cursor, range.present); + } else { + cursor_t cursor(fallback_ptr, 16); + parse_bounds_section(state, cursor, range.present); + } +} + +template +static void parse_ranges_range(parse_state_t& state, + mps_phase_range_t range, + const char* fallback_ptr) +{ + if (range.present) { + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_ranges_section(state, cursor); + } else { + cursor_t cursor(fallback_ptr, 16); + parse_ranges_section(state, cursor); + } +} + +template +static void parse_quadratic_range(parse_state_t& state, + mps_phase_range_t range, + const char* fallback_ptr) +{ + (void)state; + if (range.present) { + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + if 
(!cursor.done() && is_quadratic_section_start(cursor.ptr, cursor.end)) { + throw std::logic_error( + "experimental fast MPS reader currently supports LP/MIP MPS files only; " + "quadratic MPS sections are not supported"); + } + } else { + cursor_t cursor(fallback_ptr, 16); + if (!cursor.done() && is_quadratic_section_start(cursor.ptr, cursor.end)) { + throw std::logic_error( + "experimental fast MPS reader currently supports LP/MIP MPS files only; " + "quadratic MPS sections are not supported"); + } + } +} + +template +static void materialize_problem_names(parse_state_t& state) +{ + scoped_timer_t timer("materialize_problem_names"); + int num_threads = phase_thread_count(MPS_NAMES_THREAD_CAP); + // Copy string_views to actual strings (this is where allocation happens) + { + scoped_timer_t timer("materialize_problem_scalar_names"); + state.problem.problem_name_ = std::string(state.problem_name_sv); + state.problem.objective_name_ = std::string(state.objective_name_sv); + } + + { + scoped_timer_t timer("materialize_problem_row_names"); + size_t n = state.row_names_sv.size(); + state.problem.row_names_.resize(n); + // row names are usually small enough for SSO - parallel assigns mostly don't touch the heap and + // as such may help a lot ideally we could just allocate an arena and store non-owning string + // views but that'd require a refactor of the problem representation + if (n >= 1'000'000 && num_threads > 1) { +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (size_t i = 0; i < n; ++i) { + state.problem.row_names_[i].assign(state.row_names_sv[i]); + } + } else { + for (size_t i = 0; i < n; ++i) { + state.problem.row_names_[i].assign(state.row_names_sv[i]); + } + } + } + + { + scoped_timer_t timer("materialize_problem_var_names"); + size_t n = state.col_dense_ordered ? 
(size_t)state.problem.n_vars_ : state.var_names_sv.size(); + state.problem.var_names_.resize(n); + if (state.col_dense_ordered && n >= 1'000'000 && num_threads > 1) { +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (size_t i = 0; i < n; ++i) { + state.dense_col_name(i, state.problem.var_names_[i]); + } + } else if (state.col_dense_ordered) { + for (size_t i = 0; i < n; ++i) { + state.dense_col_name(i, state.problem.var_names_[i]); + } + } else if (n >= 1'000'000 && num_threads > 1) { +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (size_t i = 0; i < n; ++i) { + state.problem.var_names_[i].assign(state.var_names_sv[i]); + } + } else { + for (size_t i = 0; i < n; ++i) { + state.problem.var_names_[i].assign(state.var_names_sv[i]); + } + } + } +} + +template +static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_stream( + Stream& stream, const char* total_timer_name, const char* producer_task_name) +{ + auto total_timer = std::make_unique(total_timer_name); + omp_set_max_active_levels(2); + + input_stream_view_t input = stream.view(); + cuopt::linear_programming::io::mps_data_model_t problem; + problem.n_vars_ = 0; + problem.n_constraints_ = 0; + problem.nnz_ = 0; + problem.maximize_ = false; + problem.objective_scaling_factor_ = f_t{1}; + problem.objective_offset_ = f_t{0}; + + std::size_t reserve_size = std::max(stream.reserve_size_hint(), 1024 * 1024); + std::size_t reserve_dim = std::max((size_t)1000, reserve_size / 1000); + problem.A_offsets_.reserve(reserve_dim); + problem.b_.reserve(reserve_dim); + problem.variable_lower_bounds_.reserve(reserve_dim); + problem.variable_upper_bounds_.reserve(reserve_dim); + problem.var_types_.reserve(reserve_dim); + problem.row_types_.reserve(reserve_dim); + problem.row_names_.reserve(reserve_dim); + problem.var_names_.reserve(reserve_dim); + problem.constraint_lower_bounds_.reserve(reserve_dim); + problem.constraint_upper_bounds_.reserve(reserve_dim); + + 
cursor_t cursor(input.data, 0); + parse_state_t state(problem, cursor); + state.row_names_sv.reserve(reserve_dim); + + auto phase_end = [](const char*) { flush_timers(); }; + + std::mutex task_error_mutex; + std::exception_ptr first_task_error = nullptr; + std::atomic task_failed{false}; + + auto mark_task_error = [&](std::exception_ptr eptr) { + { + std::lock_guard lock(task_error_mutex); + if (!first_task_error) { first_task_error = eptr; } + } + task_failed.store(true, std::memory_order_release); + }; + + auto run_parser_task = [&](auto&& fn) { + if (task_failed.load(std::memory_order_acquire)) { return; } + try { + fn(); + } catch (...) { + mark_task_error(std::current_exception()); + } + }; + + auto unblock_phase_waiters_after_error = [&]() { + mps_phase_range_t empty{input.data, input.data, false}; + input.registry->publish(mps_phase_kind::header, empty); + input.registry->publish(mps_phase_kind::rows, empty); + input.registry->publish(mps_phase_kind::columns, empty); + input.registry->publish(mps_phase_kind::rhs, empty); + input.registry->publish(mps_phase_kind::bounds, empty); + input.registry->publish(mps_phase_kind::ranges, empty); + input.registry->publish(mps_phase_kind::quadratic, empty); + }; + + int header_ready = 0, rows_ready = 0, columns_ready = 0; + int rhs_ready = 0, bounds_ready = 0, ranges_ready = 0, quadratic_ready = 0; + int header_done = 0, rows_done = 0, columns_done = 0; + int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0; + +#pragma omp parallel num_threads(omp_get_max_threads()) + { + std::string thread_name = "omp-parser-" + std::to_string(omp_get_thread_num()); + nvtx::name_current_thread(thread_name.c_str()); + +#pragma omp single + { + omp_event_handle_t ev_header; +#pragma omp task detach(ev_header) depend(out : header_ready) + { + input.registry->attach_event(mps_phase_kind::header, ev_header); + } + omp_event_handle_t ev_rows; +#pragma omp task detach(ev_rows) depend(out : rows_ready) + { + 
input.registry->attach_event(mps_phase_kind::rows, ev_rows); + } + omp_event_handle_t ev_columns; +#pragma omp task detach(ev_columns) depend(out : columns_ready) + { + input.registry->attach_event(mps_phase_kind::columns, ev_columns); + } + omp_event_handle_t ev_rhs; +#pragma omp task detach(ev_rhs) depend(out : rhs_ready) + { + input.registry->attach_event(mps_phase_kind::rhs, ev_rhs); + } + omp_event_handle_t ev_bounds; +#pragma omp task detach(ev_bounds) depend(out : bounds_ready) + { + input.registry->attach_event(mps_phase_kind::bounds, ev_bounds); + } + omp_event_handle_t ev_ranges; +#pragma omp task detach(ev_ranges) depend(out : ranges_ready) + { + input.registry->attach_event(mps_phase_kind::ranges, ev_ranges); + } + omp_event_handle_t ev_quadratic; +#pragma omp task detach(ev_quadratic) depend(out : quadratic_ready) + { + input.registry->attach_event(mps_phase_kind::quadratic, ev_quadratic); + } + +#pragma omp task + { + MPS_NVTX_RANGE(producer_task_name, nvtx::colors::io); + try { + stream.run_decode_tasks(); + } catch (...) 
{ + mark_task_error(std::current_exception()); + unblock_phase_waiters_after_error(); + } + } + +#pragma omp task depend(in : header_ready) depend(out : header_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_header", nvtx::colors::generic); + parse_header_range(state, input.registry->range(mps_phase_kind::header)); + phase_end("header"); + }); + } + +#pragma omp task depend(in : rows_ready, header_done) depend(out : rows_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_rows", nvtx::colors::rows); + parse_rows_range(state, input.registry->range(mps_phase_kind::rows)); + phase_end("rows"); + }); + } + +#pragma omp task depend(in : columns_ready, rows_done) depend(out : columns_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_columns", nvtx::colors::columns); + parse_columns_range(state, input.registry->range(mps_phase_kind::columns)); + phase_end("columns"); + }); + } + +#pragma omp task depend(in : columns_done) depend(out : names_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_materialize_names", nvtx::colors::names); + scoped_timer_t timer("materialize_problem_names_task"); + materialize_problem_names(state); + }); + } + +#pragma omp task depend(in : rhs_ready, columns_done) depend(out : rhs_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_rhs", nvtx::colors::rhs); + parse_rhs_range(state, input.registry->range(mps_phase_kind::rhs)); + phase_end("rhs"); + }); + } + +#pragma omp task depend(in : ranges_ready, rhs_done) depend(out : ranges_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_ranges", nvtx::colors::ranges); + parse_ranges_range(state, input.registry->range(mps_phase_kind::ranges), input.data); + phase_end("ranges"); + }); + } + +#pragma omp task depend(in : bounds_ready, columns_done) depend(out : bounds_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_bounds", nvtx::colors::bounds); + parse_bounds_range(state, input.registry->range(mps_phase_kind::bounds), input.data); + phase_end("bounds"); + 
}); + } + +#pragma omp task depend(in : quadratic_ready, columns_done) depend(out : quadratic_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_quadratic", nvtx::colors::generic); + parse_quadratic_range( + state, input.registry->range(mps_phase_kind::quadratic), input.data); + phase_end("quadratic"); + }); + } + +#pragma omp taskwait + } + } + + if (first_task_error) { std::rethrow_exception(first_task_error); } + + input.size = stream.size(); + cursor.ptr = input.registry->range(mps_phase_kind::quadratic).present + ? input.registry->range(mps_phase_kind::quadratic).end + : (input.registry->range(mps_phase_kind::bounds).present + ? input.registry->range(mps_phase_kind::bounds).end + : (input.registry->range(mps_phase_kind::ranges).present + ? input.registry->range(mps_phase_kind::ranges).end + : input.registry->range(mps_phase_kind::rhs).end)); + cursor.end = input.data + input.size; + if (!cursor.done()) { expect(cursor, "ENDATA"); } + + total_timer.reset(); + flush_timers(); + return problem; +} + +template +cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( + const std::string& path, FileReadMethod read_method) +{ + FileReadMethod effective_method = effective_file_read_method(path, read_method); + if (effective_method == FileReadMethod::Lz4) { + Lz4InputStream stream(path); + return parse_mps_fast_stream( + stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode"); + } + if (effective_method == FileReadMethod::Read) { + RawInputStream stream(path); + return parse_mps_fast_stream( + stream, "parse_mps_fast_file_raw (total)", "task_raw_read"); + } + throw std::runtime_error("experimental fast MPS reader supports raw and LZ4 inputs only"); +} + +template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( + const std::string& path, FileReadMethod read_method); +template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( + const std::string& path, FileReadMethod read_method); +template 
cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( + const std::string& path, FileReadMethod read_method); +template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( + const std::string& path, FileReadMethod read_method); + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_parser.hpp new file mode 100644 index 0000000000..20e9901024 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/fast_parser.hpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "file_reader.hpp" + +#include + +#include +#include + +namespace mps_fast { + +template +cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( + const std::string& path, FileReadMethod read_method = FileReadMethod::Read); + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp new file mode 100644 index 0000000000..9e5777efc2 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp @@ -0,0 +1,23 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
/**
 * @brief Test whether @p path ends with @p suffix (byte-wise, case-sensitive).
 *
 * An empty suffix matches every path. A null suffix matches nothing; the function is noexcept,
 * so it must not hand a null pointer to std::strlen.
 *
 * @param path   Path to inspect.
 * @param suffix NUL-terminated suffix, e.g. ".lz4".
 * @return true when the last strlen(suffix) bytes of @p path equal @p suffix.
 */
bool path_has_suffix(const std::string& path, const char* suffix) noexcept
{
  if (suffix == nullptr) { return false; }
  const std::size_t suffix_len = std::strlen(suffix);
  if (path.size() < suffix_len) { return false; }
  return path.compare(path.size() - suffix_len, suffix_len, suffix) == 0;
}
+ + private: + int fd_; +}; + +std::size_t get_file_size(int fd, const std::string& path) +{ + struct stat st; + if (::fstat(fd, &st) != 0) { + throw std::runtime_error("Failed to stat file '" + path + "': " + std::strerror(errno)); + } + return static_cast(st.st_size); +} + +std::size_t system_page_size() +{ + static std::size_t page_size = [] { + long value = ::sysconf(_SC_PAGESIZE); + return value > 0 ? static_cast(value) : static_cast(4096); + }(); + return page_size; +} + +std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) +{ + if (alignment == 0) { return value; } + std::size_t remainder = value % alignment; + if (remainder == 0) { return value; } + std::size_t increment = alignment - remainder; + if (value > std::numeric_limits::max() - increment) { + throw std::runtime_error("allocation size overflow"); + } + return value + increment; +} + +} // namespace + +RawInputStream::RawInputStream(const std::string& path) : path_(path) +{ + MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io); + fd_ = ::open(path.c_str(), O_RDONLY); + if (fd_ < 0) { + throw std::runtime_error("Failed to open raw MPS file '" + path + "': " + std::strerror(errno)); + } + + file_size_ = get_file_size(fd_, path); + window_bytes_ = raw_input_window_bytes; + window_count_ = std::max(1, (file_size_ + window_bytes_ - 1) / window_bytes_); + + output_mapped_size_ = + round_up_to_multiple(std::max(file_size_, 1), system_page_size()); + output_region_ = mmap_region_t::anonymous( + output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer"); + output_data_ = output_region_.char_data(); + output_region_.advise(MADV_HUGEPAGE); + + block_done_.resize(window_count_, 0); + block_end_.resize(window_count_, 0); + section_scanner_ = + std::make_unique(output_data_, window_count_, registry_); +} + +RawInputStream::~RawInputStream() +{ + if (fd_ >= 0) { ::close(fd_); } +} + +const char* RawInputStream::data() const noexcept { return output_data_; } +char* 
RawInputStream::mutable_data() noexcept { return output_data_; } +std::size_t RawInputStream::size() const noexcept { return output_view_size_; } +std::size_t RawInputStream::compressed_size() const noexcept { return file_size_; } +std::size_t RawInputStream::reserve_size_hint() const noexcept { return file_size_; } +mps_phase_registry_t& RawInputStream::registry() noexcept { return registry_; } +input_stream_view_t RawInputStream::view() noexcept +{ + return {output_data_, output_data_, output_view_size_, file_size_, ®istry_}; +} + +void RawInputStream::run_decode_tasks() +{ + MPS_NVTX_RANGE("raw_input_run_read_tasks", nvtx::colors::io); + if (file_size_ == 0) { + output_view_size_ = 0; + section_scanner_->publish_ready(0); + return; + } + + std::size_t hw_threads = + std::max(1, static_cast(std::thread::hardware_concurrency())); + std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads); + thread_count = std::max(1, std::min(thread_count, window_count_)); + + std::atomic_size_t next_window{0}; + std::exception_ptr first_error = nullptr; + std::mutex error_mutex; + std::atomic_bool stop{false}; + + auto mark_error = [&](std::exception_ptr eptr) { + std::lock_guard lock(error_mutex); + if (!first_error) { + first_error = eptr; + stop.store(true, std::memory_order_release); + } + }; + + auto read_window = [&](std::size_t index) { + std::size_t offset = index * window_bytes_; + std::size_t size = std::min(window_bytes_, file_size_ - offset); + std::size_t done = 0; + while (done < size) { + ssize_t got = + ::pread(fd_, output_data_ + offset + done, size - done, static_cast(offset + done)); + if (got < 0) { + if (errno == EINTR) { continue; } + throw std::runtime_error("Failed to pread raw MPS file '" + path_ + + "': " + std::strerror(errno)); + } + if (got == 0) { + throw std::runtime_error("Unexpected EOF while reading raw MPS file '" + path_ + "'"); + } + done += static_cast(got); + } + + section_scanner_->observe_block(index, output_data_ + 
offset, output_data_ + offset + size); + frontier_mutex_.lock(); + block_done_[index] = 1; + block_end_[index] = offset + size; + std::size_t before = ready_bytes_; + while (next_block_ < block_done_.size() && block_done_[next_block_]) { + ready_bytes_ = block_end_[next_block_]; + ++next_block_; + } + std::size_t after = ready_bytes_; + frontier_mutex_.unlock(); + if (after > before) { section_scanner_->publish_ready(after); } + }; + + std::vector workers; + workers.reserve(thread_count); + for (std::size_t t = 0; t < thread_count; ++t) { + workers.emplace_back([&, t] { + std::string thread_name = "raw-input-read-" + std::to_string(t); + nvtx::name_current_thread(thread_name.c_str()); + while (!stop.load(std::memory_order_acquire)) { + std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed); + if (index >= window_count_) { break; } + try { + read_window(index); + } catch (...) { + mark_error(std::current_exception()); + return; + } + } + }); + } + for (auto& worker : workers) { + worker.join(); + } + if (first_error) { std::rethrow_exception(first_error); } + + output_view_size_ = ready_bytes_; + section_scanner_->publish_ready(output_view_size_); +} + +bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); } + +void drop_file_cache(const std::string& path) +{ + MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io); + FileDescriptor fd(::open(path.c_str(), O_RDONLY)); + if (!fd.valid()) { return; } + + ::posix_fadvise(fd.get(), 0, 0, POSIX_FADV_DONTNEED); +} + +FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method) +{ + if (has_lz4_extension(path)) { return FileReadMethod::Lz4; } + if (method == FileReadMethod::Lz4) { + throw std::runtime_error("lz4 read method requires a .lz4 input: " + path); + } + return method; +} + +const char* file_read_method_name(FileReadMethod method) noexcept +{ + switch (method) { + case FileReadMethod::Read: return "read"; + case 
/**
 * @brief File reading method selection
 *
 * The value a caller requests is not necessarily the one used: see
 * effective_file_read_method(), declared below.
 */
enum class FileReadMethod {
  Read,  // raw input reads
  Lz4    // LZ4-compressed input
};
/**
 * @brief Plain snapshot of an input stream's buffer, returned by the stream classes' view().
 *
 * All members default to null / zero. The struct holds raw pointers only.
 * NOTE(review): presumably the view must not outlive the stream that produced it - confirm.
 */
struct input_stream_view_t {
  const char* data               = nullptr;  // read-only pointer to the start of the text buffer
  char* mutable_data             = nullptr;  // writable pointer to the text buffer
  std::size_t size               = 0;        // number of text bytes in the buffer
  std::size_t compressed_size    = 0;        // size of the input before any decompression
  mps_phase_registry_t* registry = nullptr;  // registry through which section ranges are exchanged
};
+ std::size_t decompressed_bytes = 0; + double read_ms = 0.0; + double decode_ms = 0.0; + double commit_ms = 0.0; + double frontier_lock_wait_ms = 0.0; + double frontier_update_ms = 0.0; + double section_scan_ms = 0.0; + std::size_t ready_bytes_delta = 0; + std::size_t frontier_blocks_advanced = 0; + double total_ms = 0.0; + }; + std::vector batch_metrics_; +}; + +class RawInputStream { + public: + explicit RawInputStream(const std::string& path); + ~RawInputStream(); + + RawInputStream(const RawInputStream&) = delete; + RawInputStream& operator=(const RawInputStream&) = delete; + + const char* data() const noexcept; + char* mutable_data() noexcept; + std::size_t size() const noexcept; + std::size_t compressed_size() const noexcept; + std::size_t reserve_size_hint() const noexcept; + mps_phase_registry_t& registry() noexcept; + input_stream_view_t view() noexcept; + + void run_decode_tasks(); + + private: + std::string path_; + int fd_ = -1; + mmap_region_t output_region_; + char* output_data_ = nullptr; + std::size_t output_mapped_size_ = 0; + std::size_t output_view_size_ = 0; + std::size_t file_size_ = 0; + std::size_t window_bytes_ = 0; + std::size_t window_count_ = 0; + mps_phase_registry_t registry_; + std::mutex frontier_mutex_; + std::vector block_done_; + std::vector block_end_; + std::unique_ptr section_scanner_; + std::size_t next_block_ = 0; + std::size_t ready_bytes_ = 0; +}; + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp new file mode 100644 index 0000000000..7aa302da23 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp @@ -0,0 +1,330 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "simd_compat.hpp" + +#include +#include + +#define __assume(cond) \ + do { \ + if (!(cond)) __builtin_unreachable(); \ + } while (0) + +#define BUCKET_COUNT (4194304 * 2 * 2 * 4) // 2^22 + +// Set to 1 for 32-byte keys, 0 for 16-byte keys +#ifndef USE_32B_HASH_KEYS +#define USE_32B_HASH_KEYS 1 +#endif + +namespace mps_fast { + +static inline uint32_t crcHash(const uint8_t* key, int64_t len) +{ + __assume(len < 256); + + uint64_t crc = 0; + while (len > 8) { + uint64_t val = *(const uint64_t*)key; + crc = simde_mm_crc32_u64(crc, val); + len -= 8; + key += 8; + } + + // CRC the final 1-7 bytes + uint64_t val = *(const uint64_t*)key; + val &= ~(~0ULL << len * 8); // Compiles to a bzhi instruction (also UB) + crc = simde_mm_crc32_u64(crc, val); + + return crc; +} + +static const simde__m128i aes_seed_128 = + simde_mm_set_epi64x(0x9E3779B97F4A7C15ULL, 0xBB67AE8584CAA73BULL); +static const simde__m256i aes_seed_256 = simde_mm256_set_epi64x( + 0x9E3779B97F4A7C15ULL, 0xBB67AE8584CAA73BULL, 0x3C6EF372FE94F82BULL, 0xA54FF53A5F1D36F1ULL); + +static inline uint32_t aes_hash(simde__m128i key) +{ + simde__m128i h = simde_mm_aesenc_si128(key, aes_seed_128); + h = simde_mm_aesenc_si128(h, aes_seed_128); + simde__m128i folded = simde_mm_xor_si128(h, simde_mm_srli_si128(h, 8)); + return (uint32_t)simde_mm_cvtsi128_si32(folded); +} + +static inline uint32_t aes_hash(simde__m256i key) +{ + simde__m128i lo = simde_mm256_castsi256_si128(key); + simde__m128i hi = simde_mm256_extracti128_si256(key, 1); + simde__m128i h = simde_mm_xor_si128(lo, hi); + h = simde_mm_aesenc_si128(h, aes_seed_128); + h = simde_mm_aesenc_si128(h, aes_seed_128); + simde__m128i folded = simde_mm_xor_si128(h, simde_mm_srli_si128(h, 8)); + return (uint32_t)simde_mm_cvtsi128_si32(folded); +} + +static inline uint32_t crcHash32B(uint64_t q0, uint64_t q1, uint64_t q2, uint64_t q3) +{ + uint64_t crc = 0; + crc = simde_mm_crc32_u64(crc, q0); + crc 
= simde_mm_crc32_u64(crc, q1); + crc = simde_mm_crc32_u64(crc, q2); + crc = simde_mm_crc32_u64(crc, q3); + + return crc; +} + +// FNV-1a hash, processes bytes in reverse to better handle common-prefix strings +static inline uint32_t fnv1a_hash(const char* ptr, size_t len) +{ + constexpr uint32_t FNV_OFFSET = 2166136261u; + constexpr uint32_t FNV_PRIME = 16777619u; + + uint32_t h = FNV_OFFSET; + const char* p = ptr + len; + while (p > ptr) { + --p; + h ^= (uint8_t)*p; + h *= FNV_PRIME; + } + return h; +} + +struct __attribute__((packed)) hash_slot_32_t { + uint32_t count; + simde__m256i node; +}; + +struct alignas(16) hash_slot_16_t { + char key[16]; + uint32_t count; +}; + +static inline bool key_cmpeq_16(const char* slot_key, simde__m128i key) +{ + simde__m128i slot_vec = simde_mm_loadu_si128((const simde__m128i*)slot_key); + int mask = simde_mm_movemask_epi8(simde_mm_cmpeq_epi8(slot_vec, key)); + return mask == 0xFFFF; +} + +// 32-byte aligned slot: 28-byte key + 4-byte count = 32 bytes total (one cache line half) +struct alignas(32) hash_slot_28_t { + char key[28]; + uint32_t count; +}; + +static inline simde__m256i make_key_28(const char* ptr, size_t len) +{ + alignas(32) char buf[32] = {0}; + size_t copy_len = len < 28 ? 
len : 28; + std::memcpy(buf, ptr, copy_len); + return simde_mm256_load_si256((const simde__m256i*)buf); +} + +// Compare 28-byte keys stored in simde__m256i (ignore last 4 bytes) +static inline bool key_cmpeq_28(const char* slot_key, simde__m256i key) +{ + simde__m256i slot_vec = simde_mm256_loadu_si256((const simde__m256i*)slot_key); + int mask = simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot_vec, key)); + return (mask & 0x0FFFFFFF) == 0x0FFFFFFF; // Only check first 28 bytes +} + +#if USE_32B_HASH_KEYS +using hash_key_t = simde__m256i; +using hash_slot_var_t = hash_slot_28_t; +constexpr size_t HASH_KEY_BYTES = 28; +constexpr int HASH_KEY_CMP_MASK = 0x0FFFFFFF; +#define make_key make_key_28 +#define key_cmpeq(slot_key, key) key_cmpeq_28(slot_key, key) +#define key_store(slot_key, key) simde_mm256_store_si256((simde__m256i*)(slot_key), key) +#else +using hash_key_t = simde__m128i; +using hash_slot_var_t = hash_slot_16_t; +constexpr size_t HASH_KEY_BYTES = 16; +constexpr int HASH_KEY_CMP_MASK = 0xFFFF; +#define make_key make_key_16 +#define key_cmpeq(slot_key, key) key_cmpeq_16(slot_key, key) +#define key_store(slot_key, key) simde_mm_store_si128((simde__m128i*)(slot_key), key) +#endif + +// Legacy alias +using hash_slot_t = hash_slot_32_t; + +struct hash_table_t { + hash_slot_t slots[BUCKET_COUNT]; +}; + +static inline void hash_table_push( + hash_table_t* table, uint32_t hash, simde__m256i val, int len, const uint8_t* ptr) +{ + hash %= BUCKET_COUNT; + + hash_slot_t* slot = &table->slots[hash]; + + if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == 0xFFFFFFFF) { + ++slot->count; + return; + } + + bool relooped = false; + +loop: + for (; slot < &table->slots[BUCKET_COUNT]; ++slot) { + if (slot->count == 0) { + slot->count = 1; + slot->node = val; + return; + } + + if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == 0xFFFFFFFF) { + ++slot->count; + return; + } + } + + if (!relooped) { + relooped = true; + slot = 
&table->slots[0]; + goto loop; + } else { + __builtin_trap(); + } +} + +extern char* string_buffer; +extern char* string_buffer_ptr; + +// Lookup: returns the stored value (count-1) or SIZE_MAX if not found +// For small strings <= 32 bytes stored inline in node +static inline size_t hash_table_lookup(const hash_table_t* table, uint32_t hash, simde__m256i val) +{ + hash %= BUCKET_COUNT; + const hash_slot_t* slot = &table->slots[hash]; + + for (size_t i = 0; i < BUCKET_COUNT; ++i, ++slot) { + if (slot >= &table->slots[BUCKET_COUNT]) { slot = &table->slots[0]; } + + if (slot->count == 0) { + return SIZE_MAX; // Not found + } + + if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == (int)0xFFFFFFFF) { + return slot->count - 1; // Found, return index + } + } + + return SIZE_MAX; // Not found +} + +// Insert with index: stores index+1 in count field (0 means empty) +static inline void hash_table_insert(hash_table_t* table, + uint32_t hash, + simde__m256i val, + size_t index) +{ + hash %= BUCKET_COUNT; + hash_slot_t* slot = &table->slots[hash]; + + for (size_t i = 0; i < BUCKET_COUNT; ++i, ++slot) { + if (slot >= &table->slots[BUCKET_COUNT]) { slot = &table->slots[0]; } + + if (slot->count == 0) { + slot->count = (uint32_t)(index + 1); + slot->node = val; + return; + } + + if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == (int)0xFFFFFFFF) { + // Already exists, update index + slot->count = (uint32_t)(index + 1); + return; + } + } + + __builtin_trap(); +} + +// Create simde__m256i key from string_view (zero-padded) +static inline simde__m256i make_key_32(const char* ptr, size_t len) +{ + alignas(32) char buf[32] = {0}; + if (len > 32) len = 32; + memcpy(buf, ptr, len); + return simde_mm256_load_si256((const simde__m256i*)buf); +} + +// Create simde__m128i key from string_view (zero-padded, for strings <= 16 bytes) +static inline simde__m128i make_key_16(const char* ptr, size_t len) +{ + alignas(16) char buf[16] = {0}; + if (len 
> 16) len = 16; + memcpy(buf, ptr, len); + return simde_mm_load_si128((const simde__m128i*)buf); +} + +static inline uint64_t m256_u64_lane(simde__m256i value, size_t lane) +{ + simde__m256i_private private_value = simde__m256i_to_private(value); + return private_value.u64[lane]; +} + +static inline void hash_table_push_ptr(hash_table_t* table, + uint32_t hash, + int len, + const uint8_t* ptr) +{ + hash %= BUCKET_COUNT; + + hash_slot_t* slot = &table->slots[hash]; + bool relooped = false; + + uint32_t len_in_qwords = (len / 8) + (len % 8 ? 1 : 0); + +loop: + do { + uint64_t node_len = m256_u64_lane(slot->node, 3); + uint64_t node_tag = m256_u64_lane(slot->node, 0); + // nonzero, it's not a pointer of the same length, skip + if (__builtin_expect(node_len != (uint64_t)len, 0)) { + if (__builtin_expect(node_tag == 0, 1)) { + slot->count = 1; + slot->node = simde_mm256_set_epi64x(len, + ((uint64_t*)ptr)[0], + (uint64_t)string_buffer_ptr, + 0u | ((uint64_t)len_in_qwords << 32u)); + + memcpy(string_buffer_ptr, ptr, len); + string_buffer_ptr += len; + // Pad + string_buffer_ptr += (8 - len % 8) + 8; + + return; + } else + continue; + } + if (m256_u64_lane(slot->node, 2) != ((uint64_t*)ptr)[0]) // First 8 bytes differ + continue; + + uint8_t* other_ptr = reinterpret_cast(m256_u64_lane(slot->node, 1)); + if (__builtin_expect(memcmp(ptr + 16, other_ptr + 16, len - 16) == 0, 1)) { + ++slot->count; + + return; + } + } while (++slot < &table->slots[BUCKET_COUNT]); + + if (!relooped) { + relooped = true; + slot = &table->slots[0]; + goto loop; + } else { + __builtin_trap(); + } +} + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp new file mode 100644 index 0000000000..fbe18768af --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -0,0 +1,759 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. 
// LZ4 frame constants.
constexpr uint32_t lz4_frame_magic        = 0x184D2204u;  // little-endian magic at file offset 0
constexpr uint32_t lz4_uncompressed_block = 0x80000000u;  // block-size flag: payload stored raw
constexpr uint32_t lz4_block_size_mask    = 0x7FFFFFFFu;  // remaining bits: payload size
// Size of one resident read window of the compressed file.
constexpr std::size_t lz4_pipeline_batch_bytes = 64ull * 1024ull * 1024ull;
// Upper bound on reader threads (and on decode threads).
constexpr std::size_t lz4_input_max_io_threads = 8;
// Output reservation, as a multiple of the compressed size, when the frame has no content size.
constexpr std::size_t lz4_no_content_size_reserve_ratio = 16;

#if defined(MPS_PARSER_WITH_LZ4)
using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int);

// Loads liblz4 at run time and resolves LZ4_decompress_safe.
struct lz4_runtime_t {
  void* handle                          = nullptr;
  LZ4_decompress_safe_t decompress_safe = nullptr;

  // Throws std::logic_error when the library or the symbol cannot be found.
  lz4_runtime_t()
  {
    for (const char* soname : {"liblz4.so.1", "liblz4.so"}) {
      handle = dlopen(soname, RTLD_LAZY);
      if (handle != nullptr) { break; }
    }
    if (handle == nullptr) {
      throw std::logic_error(
        "Could not open .mps.lz4 file since liblz4 was not found "
        "(tried liblz4.so.1, liblz4.so). In order to open .mps.lz4 files "
        "directly, please ensure liblz4 is installed. Alternatively, decompress "
        "the .lz4 file manually and open the uncompressed .mps file.");
    }

    decompress_safe =
      reinterpret_cast<LZ4_decompress_safe_t>(dlsym(handle, "LZ4_decompress_safe"));
    if (decompress_safe == nullptr) {
      // The destructor does not run for a constructor that throws, so release the handle here;
      // otherwise every retry of the function-local static below would leak one.
      dlclose(handle);
      handle = nullptr;
      throw std::logic_error(
        "Error loading liblz4! Library version might be incompatible. Please decompress "
        "the .lz4 file manually and open the uncompressed .mps file.");
    }
  }

  ~lz4_runtime_t()
  {
    if (handle != nullptr) { dlclose(handle); }
  }

  lz4_runtime_t(const lz4_runtime_t&)            = delete;
  lz4_runtime_t& operator=(const lz4_runtime_t&) = delete;
};

// Process-wide loader instance, created on first use.
const lz4_runtime_t& lz4_runtime()
{
  static const lz4_runtime_t runtime;
  return runtime;
}
#endif

// Forwards to the runtime-loaded LZ4_decompress_safe. Throws std::logic_error when the library
// cannot be loaded or the build has no LZ4 support.
int lz4_decompress_safe_runtime(const char* src, char* dst, int compressed_size, int dst_capacity)
{
#if defined(MPS_PARSER_WITH_LZ4)
  return lz4_runtime().decompress_safe(src, dst, compressed_size, dst_capacity);
#else
  (void)src;
  (void)dst;
  (void)compressed_size;
  (void)dst_capacity;
  throw std::logic_error(
    "Experimental fast MPS parser was built without LZ4 decompression support. "
    "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
#endif
}

// Fails early (std::logic_error) when LZ4 decoding cannot work in this build/environment.
void ensure_lz4_runtime_available()
{
#if defined(MPS_PARSER_WITH_LZ4)
  (void)lz4_runtime();
#else
  throw std::logic_error(
    "Experimental fast MPS parser was built without LZ4 decompression support. "
    "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
#endif
}

// Opens `path` read-only and returns the descriptor; throws std::runtime_error on failure.
int open_lz4_fd(const std::string& path)
{
  int fd = ::open(path.c_str(), O_RDONLY);
  if (fd < 0) {
    throw std::runtime_error("Failed to open LZ4 file '" + path + "': " + std::strerror(errno));
  }
  return fd;
}

std::size_t system_page_size();
std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment);

// Scope guard that closes a POSIX file descriptor on destruction.
class FileDescriptor {
 public:
  explicit FileDescriptor(int fd) : fd_(fd) {}
  ~FileDescriptor()
  {
    if (fd_ >= 0) { ::close(fd_); }
  }

  FileDescriptor(const FileDescriptor&)            = delete;
  FileDescriptor& operator=(const FileDescriptor&) = delete;

  int get() const noexcept { return fd_; }
  bool valid() const noexcept { return fd_ >= 0; }

 private:
  int fd_;
};

// Decodes a little-endian 32-bit value from the 4 bytes at `ptr`.
uint32_t read_le32(const char* ptr)
{
  const auto* p = reinterpret_cast<const unsigned char*>(ptr);
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

// Decodes a little-endian 64-bit value from the 8 bytes at `ptr`.
uint64_t read_le64(const char* ptr)
{
  const auto* p  = reinterpret_cast<const unsigned char*>(ptr);
  uint64_t value = 0;
  for (int i = 7; i >= 0; --i) {
    value = (value << 8) | p[i];
  }
  return value;
}

// Maps the block-size ID (bits 4-6 of the frame's BD byte) to the maximum block size in bytes;
// throws std::runtime_error for IDs other than 4..7.
std::size_t block_max_size_from_bd(unsigned char bd)
{
  unsigned block_size_id = (bd >> 4) & 0x7u;
  switch (block_size_id) {
    case 4: return 64ull * 1024ull;
    case 5: return 256ull * 1024ull;
    case 6: return 1024ull * 1024ull;
    case 7: return 4ull * 1024ull * 1024ull;
    default: throw std::runtime_error("unsupported LZ4 frame block size ID");
  }
}

// Narrows a 64-bit value to size_t; throws std::runtime_error (mentioning `label`) if it does not
// fit.
std::size_t checked_size(uint64_t value, const char* label)
{
  if (value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max())) {
    throw std::runtime_error(std::string("LZ4 ") + label + " exceeds size_t");
  }
  return static_cast<std::size_t>(value);
}

// Returns the size of the file behind `fd`; `path` is only used in error messages.
std::size_t get_file_size(int fd, const std::string& path)
{
  struct stat st;
  if (::fstat(fd, &st) != 0) {
    throw std::runtime_error("Failed to stat file '" + path + "': " + std::strerror(errno));
  }
  if (st.st_size < 0) { throw std::runtime_error("Invalid negative file size for '" + path + "'"); }
  return static_cast<std::size_t>(st.st_size);
}
// Page size reported by the OS, queried once; 4096 is used when sysconf gives no positive value.
std::size_t system_page_size()
{
  static std::size_t cached = [] {
    const long reported = ::sysconf(_SC_PAGESIZE);
    if (reported > 0) { return static_cast<std::size_t>(reported); }
    return static_cast<std::size_t>(4096);
  }();
  return cached;
}

// Smallest multiple of `alignment` that is >= `value`. An alignment of 0 returns `value`
// unchanged; throws std::runtime_error if the result would not fit in size_t.
std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment)
{
  if (alignment == 0) { return value; }
  const std::size_t excess = value % alignment;
  if (excess == 0) { return value; }
  const std::size_t padding = alignment - excess;
  if (std::numeric_limits<std::size_t>::max() - padding < value) {
    throw std::runtime_error("allocation size overflow");
  }
  return value + padding;
}

// a * b with overflow detection; `label` prefixes the error message.
std::size_t checked_mul(std::size_t a, std::size_t b, const char* label)
{
  const bool overflows = (a != 0) && (b > std::numeric_limits<std::size_t>::max() / a);
  if (overflows) { throw std::runtime_error(std::string(label) + " size overflow"); }
  return a * b;
}

// Reads exactly `bytes` bytes from `fd` at `offset` into `dst`, retrying on EINTR and on short
// reads. Returns false with errno set on failure; hitting end-of-file early is reported as EIO.
bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset)
{
  constexpr std::size_t max_request =
    static_cast<std::size_t>(std::numeric_limits<ssize_t>::max());
  for (std::size_t filled = 0; filled < bytes;) {
    const std::size_t request = std::min(bytes - filled, max_request);
    const ssize_t result = ::pread(fd, dst + filled, request, static_cast<off_t>(offset + filled));
    if (result > 0) {
      filled += static_cast<std::size_t>(result);
    } else if (result == 0) {
      errno = EIO;
      return false;
    } else if (errno != EINTR) {
      return false;
    }
  }
  return true;
}

// One chunk of the compressed file held in memory.
struct lz4_resident_window_t {
  std::size_t index       = 0;  // position of the window in the window list
  std::size_t file_offset = 0;  // file offset of data[0]
  std::size_t size        = 0;  // number of bytes in data
  std::unique_ptr<char[]> data;
};

// Read-only accessor that addresses a list of windows by file offset. The windows are expected to
// be sorted by file_offset; lookups outside every window throw std::runtime_error.
class lz4_resident_windows_t {
 public:
  explicit lz4_resident_windows_t(std::vector<lz4_resident_window_t>& windows) : windows_(windows)
  {
  }

  // Pointer to [offset, offset + size) when that range lies inside a single window; nullptr when
  // it straddles windows or size is 0.
  const char* ptr_if_contiguous(std::size_t offset, std::size_t size) const
  {
    if (size == 0) { return nullptr; }
    const lz4_resident_window_t& win = window_for_offset(offset);
    const std::size_t within         = offset - win.file_offset;
    const bool fits                  = within <= win.size && size <= win.size - within;
    return fits ? win.data.get() + within : nullptr;
  }

  // Copies `size` bytes starting at file offset `offset` into `dst`, crossing windows as needed.
  void copy_to(std::size_t offset, char* dst, std::size_t size) const
  {
    for (std::size_t done = 0; done < size;) {
      const std::size_t position       = offset + done;
      const lz4_resident_window_t& win = window_for_offset(position);
      const std::size_t within         = position - win.file_offset;
      const std::size_t chunk          = std::min(win.size - within, size - done);
      std::memcpy(dst + done, win.data.get() + within, chunk);
      done += chunk;
    }
  }

  uint8_t read_u8(std::size_t offset) const { return read_little_endian<uint8_t>(offset); }

  uint32_t read_u32(std::size_t offset) const { return read_little_endian<uint32_t>(offset); }

  uint64_t read_u64(std::size_t offset) const { return read_little_endian<uint64_t>(offset); }

 private:
  // Gathers sizeof(T) bytes at `offset` and assembles them least-significant byte first.
  template <typename T>
  T read_little_endian(std::size_t offset) const
  {
    unsigned char raw[sizeof(T)];
    copy_to(offset, reinterpret_cast<char*>(raw), sizeof(raw));
    uint64_t value = 0;
    for (std::size_t i = sizeof(T); i-- > 0;) {
      value = (value << 8) | raw[i];
    }
    return static_cast<T>(value);
  }

  // Window whose byte range contains `offset`.
  const lz4_resident_window_t& window_for_offset(std::size_t offset) const
  {
    if (windows_.empty()) {
      throw std::runtime_error("LZ4 resident window lookup with no windows");
    }
    // First window that starts after `offset`; the one before it is the only candidate.
    const auto after = std::upper_bound(
      windows_.begin(),
      windows_.end(),
      offset,
      [](std::size_t wanted, const lz4_resident_window_t& w) { return wanted < w.file_offset; });
    if (after != windows_.begin()) {
      const lz4_resident_window_t& candidate = *(after - 1);
      if (offset < candidate.file_offset + candidate.size) { return candidate; }
    }
    throw std::runtime_error("LZ4 offset outside resident windows");
  }

  std::vector<lz4_resident_window_t>& windows_;
};
local) { return w.data.get() + local; } + return nullptr; + } + + void copy_to(std::size_t offset, char* dst, std::size_t size) const + { + std::size_t copied = 0; + while (copied < size) { + const auto& w = window_for_offset(offset + copied); + std::size_t local = offset + copied - w.file_offset; + std::size_t take = std::min(w.size - local, size - copied); + std::memcpy(dst + copied, w.data.get() + local, take); + copied += take; + } + } + + uint8_t read_u8(std::size_t offset) const + { + uint8_t value = 0; + copy_to(offset, reinterpret_cast(&value), sizeof(value)); + return value; + } + + uint32_t read_u32(std::size_t offset) const + { + char bytes[4]; + copy_to(offset, bytes, sizeof(bytes)); + return read_le32(bytes); + } + + uint64_t read_u64(std::size_t offset) const + { + char bytes[8]; + copy_to(offset, bytes, sizeof(bytes)); + return read_le64(bytes); + } + + private: + const lz4_resident_window_t& window_for_offset(std::size_t offset) const + { + if (windows_.empty()) { + throw std::runtime_error("LZ4 resident window lookup with no windows"); + } + std::size_t lo = 0; + std::size_t hi = windows_.size(); + while (lo < hi) { + std::size_t mid = lo + (hi - lo) / 2; + const auto& w = windows_[mid]; + if (offset < w.file_offset) { + hi = mid; + } else if (offset >= w.file_offset + w.size) { + lo = mid + 1; + } else { + return w; + } + } + throw std::runtime_error("LZ4 offset outside resident windows"); + } + + std::vector& windows_; +}; + +} // namespace + +Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path) +{ + MPS_NVTX_RANGE("lz4_input_construct", nvtx::colors::io); + ensure_lz4_runtime_available(); + + fd_ = open_lz4_fd(path); + ::posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + + compressed_size_ = get_file_size(fd_, path); + + char header[32]; + if (compressed_size_ < 7) { + throw std::runtime_error("LZ4 input is too small to contain a frame header"); + } + std::size_t header_bytes = std::min(sizeof(header), compressed_size_); + if 
(!pread_full_plain(fd_, header, header_bytes, 0)) { + throw std::runtime_error("Failed to read LZ4 frame header '" + path + + "': " + std::strerror(errno)); + } + + std::size_t offset = 0; + uint32_t magic = read_le32(header + offset); + if (magic != lz4_frame_magic) { + throw std::runtime_error("unsupported LZ4 input: expected standard LZ4 frame magic"); + } + offset += 4; + unsigned char flg = static_cast(header[offset++]); + unsigned char bd = static_cast(header[offset++]); + unsigned version = (flg >> 6) & 0x3u; + if (version != 1) { throw std::runtime_error("unsupported LZ4 frame version"); } + bool block_independent = (flg & 0x20u) != 0; + block_checksum_ = (flg & 0x10u) != 0; + content_size_present_ = (flg & 0x08u) != 0; + content_checksum_ = (flg & 0x04u) != 0; + dict_id_ = (flg & 0x01u) != 0; + if (!block_independent) { + throw std::runtime_error("parallel LZ4 reader requires independent blocks; compress with -BI"); + } + block_max_size_ = block_max_size_from_bd(bd); + if (content_size_present_) { + if (offset + 8 > header_bytes) { + throw std::runtime_error("truncated LZ4 frame while reading content size"); + } + content_size_ = checked_size(read_le64(header + offset), "content size"); + offset += 8; + } + if (dict_id_) { + if (offset + 4 > header_bytes) { + throw std::runtime_error("truncated LZ4 frame while reading dictionary id"); + } + offset += 4; + } + if (offset + 1 > header_bytes) { + throw std::runtime_error("truncated LZ4 frame while reading header checksum"); + } + offset += 1; + header_size_ = offset; + + std::size_t reserve_size = content_size_; + if (!content_size_present_) { + reserve_size = + checked_mul(compressed_size_, lz4_no_content_size_reserve_ratio, "LZ4 output reserve"); + reserve_size = std::max(reserve_size, block_max_size_); + } + + constexpr std::size_t huge_alignment = 2 * 1024 * 1024; + output_mapped_size_ = round_up_to_multiple(reserve_size, system_page_size()); + output_region_ = 
mmap_region_t::anonymous_aligned(output_mapped_size_, + huge_alignment, + PROT_NONE, + MAP_PRIVATE | MAP_NORESERVE, + "LZ4 output buffer"); + output_data_ = output_region_.char_data(); + + std::size_t block_slots = + std::max(1, (reserve_size + block_max_size_ - 1) / block_max_size_ + 1); + block_done_.resize(block_slots, 0); + block_end_.resize(block_slots, 0); + + section_scanner_ = + std::make_unique(output_data_, block_slots, registry_); +} + +Lz4InputStream::~Lz4InputStream() +{ + if (fd_ >= 0) { ::close(fd_); } +} + +const char* Lz4InputStream::data() const noexcept { return output_data_; } +char* Lz4InputStream::mutable_data() noexcept { return output_data_; } +std::size_t Lz4InputStream::size() const noexcept { return output_view_size_; } +std::size_t Lz4InputStream::compressed_size() const noexcept { return compressed_size_; } +std::size_t Lz4InputStream::reserve_size_hint() const noexcept +{ + return content_size_present_ ? content_size_ + : std::max(compressed_size_ * 6, 1024 * 1024); +} +mps_phase_registry_t& Lz4InputStream::registry() noexcept { return registry_; } +input_stream_view_t Lz4InputStream::view() noexcept +{ + return {output_data_, output_data_, output_view_size_, compressed_size_, ®istry_}; +} + +void Lz4InputStream::commit_up_to(std::size_t bytes) +{ + MPS_NVTX_RANGE("lz4_commit_output", nvtx::colors::alloc); + std::lock_guard lock(commit_mutex_); + if (bytes <= output_committed_size_) return; + if (bytes > output_mapped_size_) { + throw std::runtime_error("LZ4 output exceeded reserved virtual mapping"); + } + std::size_t new_committed = round_up_to_multiple(bytes, system_page_size()); + if (new_committed > output_mapped_size_) new_committed = output_mapped_size_; + std::size_t add = new_committed - output_committed_size_; + void* target = output_data_ + output_committed_size_; + mmap_region_t::map_fixed_or_throw( + target, add, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, "LZ4 output commit"); + ::madvise(target, add, 
MADV_HUGEPAGE); + output_committed_size_ = new_committed; +} + +void Lz4InputStream::run_decode_tasks() +{ + MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io); + std::exception_ptr first_error = nullptr; + std::mutex error_mutex; + std::atomic_bool stop_workers{false}; + auto mark_error = [&](std::exception_ptr eptr) { + std::lock_guard lock(error_mutex); + if (!first_error) { + first_error = eptr; + stop_workers.store(true, std::memory_order_release); + } + }; + + const std::size_t window_bytes = lz4_pipeline_batch_bytes; + const std::size_t window_count = (compressed_size_ + window_bytes - 1) / window_bytes; + std::vector windows(window_count); + for (std::size_t i = 0; i < window_count; ++i) { + std::size_t offset = i * window_bytes; + std::size_t size = std::min(window_bytes, compressed_size_ - offset); + windows[i].index = i; + windows[i].file_offset = offset; + windows[i].size = size; + windows[i].data.reset(new char[size]); + } + + const std::size_t io_threads = std::min(lz4_input_max_io_threads, window_count); + + struct resident_block_desc_t { + const char* src = nullptr; + std::size_t compressed_size = 0; + std::size_t decompressed_offset = 0; + std::size_t decompressed_size = 0; + std::size_t index = 0; + bool uncompressed = false; + }; + + std::atomic_size_t next_window{0}; + std::vector window_done(window_count, 0); + std::mutex window_mutex; + std::condition_variable window_cv; + + std::deque> desc_queue; + bool scanner_done = false; + std::mutex desc_mutex; + std::condition_variable desc_cv; + + auto fail_and_notify = [&](std::exception_ptr eptr) { + mark_error(eptr); + window_cv.notify_all(); + desc_cv.notify_all(); + }; + + auto decode_worker = [&](std::size_t tid) { + try { + std::string thread_name = "lz4-window-decode-" + std::to_string(tid); + nvtx::name_current_thread(thread_name.c_str()); + while (true) { + std::vector batch; + { + MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io); + std::unique_lock lock(desc_mutex); + 
desc_cv.wait(lock, [&] { + return stop_workers.load(std::memory_order_acquire) || scanner_done || + !desc_queue.empty(); + }); + if (stop_workers.load(std::memory_order_acquire)) { return; } + if (desc_queue.empty()) { + if (scanner_done) return; + continue; + } + batch = std::move(desc_queue.front()); + desc_queue.pop_front(); + } + + MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode); + for (const auto& block : batch) { + char* dst = output_data_ + block.decompressed_offset; + int actual = 0; + { + MPS_NVTX_RANGE("lz4_decode_block_payload", nvtx::colors::decode); + if (block.uncompressed) { + std::memcpy(dst, block.src, block.decompressed_size); + actual = static_cast(block.decompressed_size); + } else if (block.compressed_size > + static_cast(std::numeric_limits::max()) || + block.decompressed_size > + static_cast(std::numeric_limits::max())) { + actual = -1; + } else { + actual = lz4_decompress_safe_runtime(block.src, + dst, + static_cast(block.compressed_size), + static_cast(block.decompressed_size)); + } + } + if (actual < 0 || static_cast(actual) > block.decompressed_size) { + throw std::runtime_error("LZ4 input block decompressed to invalid size"); + } + + std::size_t actual_size = static_cast(actual); + { + MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic); + section_scanner_->observe_block(block.index, dst, dst + actual_size); + } + std::size_t before = 0; + std::size_t after = 0; + { + MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic); + frontier_mutex_.lock(); + block_done_[block.index] = 1; + block_end_[block.index] = block.decompressed_offset + actual_size; + before = ready_bytes_; + while (next_block_ < block_done_.size() && block_done_[next_block_]) { + ready_bytes_ = block_end_[next_block_]; + ++next_block_; + } + after = ready_bytes_; + frontier_mutex_.unlock(); + } + if (after > before) { + MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic); + section_scanner_->publish_ready(after); + } + } + } + } catch 
(...) { + fail_and_notify(std::current_exception()); + } + }; + + std::vector readers; + readers.reserve(io_threads); + for (std::size_t t = 0; t < io_threads; ++t) { + readers.emplace_back([&, t] { + std::string thread_name = "lz4-window-read-" + std::to_string(t); + nvtx::name_current_thread(thread_name.c_str()); + while (!stop_workers.load(std::memory_order_acquire)) { + std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed); + if (index >= windows.size()) { break; } + auto& w = windows[index]; + bool ok = false; + { + MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io); + ok = pread_full_plain(fd_, w.data.get(), w.size, w.file_offset); + } + if (!ok) { + fail_and_notify(std::make_exception_ptr(std::runtime_error( + "Failed to pread LZ4 resident window: " + std::string(std::strerror(errno))))); + return; + } + { + MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic); + std::lock_guard lock(window_mutex); + window_done[index] = 1; + } + window_cv.notify_all(); + } + }); + } + + std::atomic_size_t blocks_scanned{0}; + std::vector> crossing_payloads; + std::thread scanner([&] { + try { + nvtx::name_current_thread("lz4-metadata-scan"); + lz4_resident_windows_t resident(windows); + auto wait_range_ready = [&](std::size_t begin, std::size_t size) { + if (size == 0) return; + std::size_t first = begin / window_bytes; + std::size_t last = (begin + size - 1) / window_bytes; + for (std::size_t wi = first; wi <= last; ++wi) { + MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io); + std::unique_lock lock(window_mutex); + window_cv.wait(lock, [&] { + return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0; + }); + if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) { + throw std::runtime_error( + "LZ4 metadata scanner stopped before required window was ready"); + } + } + }; + auto push_batch = [&](std::vector& batch) { + if (batch.empty()) return; + { + MPS_NVTX_RANGE("lz4_metadata_commit_batch", 
nvtx::colors::alloc); + commit_up_to(batch.back().decompressed_offset + batch.back().decompressed_size); + } + { + MPS_NVTX_RANGE("lz4_metadata_enqueue_batch", nvtx::colors::generic); + std::lock_guard lock(desc_mutex); + desc_queue.push_back(std::move(batch)); + } + batch.clear(); + desc_cv.notify_one(); + }; + + std::vector batch; + batch.reserve(1024); + std::size_t offset = header_size_; + std::size_t decompressed_offset = 0; + while (true) { + MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic); + wait_range_ready(offset, 4); + if (offset + 4 > compressed_size_) { + throw std::runtime_error("truncated LZ4 frame while reading block header"); + } + uint32_t raw_block_size = resident.read_u32(offset); + offset += 4; + if (raw_block_size == 0) { break; } + + bool uncompressed = (raw_block_size & lz4_uncompressed_block) != 0; + std::size_t block_payload_size = raw_block_size & lz4_block_size_mask; + if (block_payload_size == 0) { + throw std::runtime_error("invalid zero-sized LZ4 data block"); + } + if (block_payload_size > block_max_size_ && uncompressed) { + throw std::runtime_error("LZ4 uncompressed block exceeds frame block maximum"); + } + if (content_size_present_ && decompressed_offset >= content_size_) { + throw std::runtime_error("LZ4 frame contains more blocks than content size allows"); + } + wait_range_ready(offset, block_payload_size); + if (offset + block_payload_size > compressed_size_) { + throw std::runtime_error("truncated LZ4 frame while reading block payload"); + } + + std::size_t decompressed_size = block_payload_size; + if (!uncompressed) { + if (content_size_present_) { + decompressed_size = std::min(block_max_size_, content_size_ - decompressed_offset); + } else { + decompressed_size = block_max_size_; + } + } + if (content_size_present_ && decompressed_size > content_size_ - decompressed_offset) { + throw std::runtime_error("LZ4 block exceeds declared content size"); + } + + const char* src = resident.ptr_if_contiguous(offset, 
block_payload_size); + if (src == nullptr) { + crossing_payloads.emplace_back(block_payload_size); + resident.copy_to(offset, crossing_payloads.back().data(), block_payload_size); + src = crossing_payloads.back().data(); + } + batch.push_back({src, + block_payload_size, + decompressed_offset, + decompressed_size, + blocks_scanned.load(std::memory_order_relaxed), + uncompressed}); + blocks_scanned.fetch_add(1, std::memory_order_relaxed); + decompressed_offset += decompressed_size; + offset += block_payload_size; + if (block_checksum_) { + wait_range_ready(offset, 4); + if (offset + 4 > compressed_size_) { + throw std::runtime_error("truncated LZ4 frame while reading block checksum"); + } + offset += 4; + } + if (blocks_scanned.load(std::memory_order_relaxed) > block_done_.size()) { + throw std::runtime_error("LZ4 input block count exceeded reserved metadata slots"); + } + if (batch.size() >= 1024) { push_batch(batch); } + } + if (content_checksum_) { + wait_range_ready(offset, 4); + if (offset + 4 > compressed_size_) { + throw std::runtime_error("truncated LZ4 frame while reading content checksum"); + } + offset += 4; + } + if (content_size_present_ && decompressed_offset != content_size_) { + throw std::runtime_error("LZ4 frame ended before declared content size was reached"); + } + if (offset != compressed_size_) { + throw std::runtime_error("LZ4 input contains trailing data after the first frame"); + } + push_batch(batch); + { + std::lock_guard lock(desc_mutex); + scanner_done = true; + } + desc_cv.notify_all(); + } catch (...) 
{ + { + std::lock_guard lock(desc_mutex); + scanner_done = true; + } + fail_and_notify(std::current_exception()); + } + }); + + std::vector io_workers; + io_workers.reserve(io_threads); + for (std::size_t t = 0; t < io_threads; ++t) { + io_workers.emplace_back(decode_worker, t); + } + for (auto& reader : readers) { + reader.join(); + } + scanner.join(); + for (auto& worker : io_workers) { + worker.join(); + } + if (first_error) std::rethrow_exception(first_error); + output_view_size_ = ready_bytes_; + section_scanner_->publish_ready(output_view_size_); +} + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp new file mode 100644 index 0000000000..c1f411111a --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp @@ -0,0 +1,141 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace mps_fast { + +// Move-only owner for a Linux mmap range. Fixed sub-maps inside a reserved range +// are still released by unmapping the owning outer range. 
+class mmap_region_t { + public: + mmap_region_t() = default; + mmap_region_t(void* ptr, std::size_t size) noexcept : ptr_(ptr), size_(size) {} + + mmap_region_t(const mmap_region_t&) = delete; + mmap_region_t& operator=(const mmap_region_t&) = delete; + + mmap_region_t(mmap_region_t&& other) noexcept : ptr_(other.ptr_), size_(other.size_) + { + other.ptr_ = nullptr; + other.size_ = 0; + } + + mmap_region_t& operator=(mmap_region_t&& other) noexcept + { + if (this != &other) { + reset(); + ptr_ = other.ptr_; + size_ = other.size_; + other.ptr_ = nullptr; + other.size_ = 0; + } + return *this; + } + + ~mmap_region_t() { reset(); } + + static mmap_region_t map( + void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context) + { + void* ptr = ::mmap(address, size, prot, flags, fd, offset); + if (ptr == MAP_FAILED) { + throw std::runtime_error(std::string("mmap failed for ") + context + ": " + + std::strerror(errno)); + } + return mmap_region_t(ptr, size); + } + + static mmap_region_t anonymous(std::size_t size, int prot, int flags, const char* context) + { + return map(nullptr, size, prot, flags | MAP_ANONYMOUS, -1, 0, context); + } + + static mmap_region_t anonymous_aligned( + std::size_t size, std::size_t alignment, int prot, int flags, const char* context) + { + if (alignment == 0 || (alignment & (alignment - 1)) != 0) { + throw std::runtime_error("mmap aligned allocation requires power-of-two alignment"); + } + if (size > std::numeric_limits::max() - alignment) { + throw std::runtime_error("mmap aligned allocation size overflow"); + } + + std::size_t raw_size = size + alignment; + void* raw = ::mmap(nullptr, raw_size, prot, flags | MAP_ANONYMOUS, -1, 0); + if (raw == MAP_FAILED) { + throw std::runtime_error(std::string("mmap failed for ") + context + ": " + + std::strerror(errno)); + } + + uintptr_t raw_addr = reinterpret_cast(raw); + uintptr_t aligned_addr = (raw_addr + alignment - 1) & ~(uintptr_t)(alignment - 1); + std::size_t 
prefix = static_cast(aligned_addr - raw_addr); + std::size_t suffix = raw_size - prefix - size; + if (prefix > 0) { ::munmap(raw, prefix); } + if (suffix > 0) { ::munmap(reinterpret_cast(aligned_addr + size), suffix); } + return mmap_region_t(reinterpret_cast(aligned_addr), size); + } + + static void map_fixed_or_throw( + void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context) + { + void* ptr = ::mmap(address, size, prot, flags | MAP_FIXED, fd, offset); + if (ptr == MAP_FAILED) { + throw std::runtime_error(std::string("mmap failed for ") + context + ": " + + std::strerror(errno)); + } + } + + void reset() noexcept + { + if (ptr_ != nullptr && size_ != 0) { ::munmap(ptr_, size_); } + ptr_ = nullptr; + size_ = 0; + } + + void reset(void* ptr, std::size_t size) noexcept + { + reset(); + ptr_ = ptr; + size_ = size; + } + + void* release() noexcept + { + void* ptr = ptr_; + ptr_ = nullptr; + size_ = 0; + return ptr; + } + + void advise(int advice) const noexcept + { + if (ptr_ != nullptr && size_ != 0) { ::madvise(ptr_, size_, advice); } + } + + void* data() noexcept { return ptr_; } + const void* data() const noexcept { return ptr_; } + char* char_data() noexcept { return static_cast(ptr_); } + const char* char_data() const noexcept { return static_cast(ptr_); } + std::size_t size() const noexcept { return size_; } + bool empty() const noexcept { return ptr_ == nullptr || size_ == 0; } + explicit operator bool() const noexcept { return !empty(); } + + private: + void* ptr_ = nullptr; + std::size_t size_ = 0; +}; + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp new file mode 100644 index 0000000000..3ed8763428 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp @@ -0,0 +1,413 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. 
SPDX-License-Identifier: Apache-2.0 + +#include "mps_section_scanner.hpp" +#include "simd_compat.hpp" + +#include +#include +#include +#include +#include + +namespace mps_fast { + +namespace { + +bool is_nonblank_column1(unsigned char c) noexcept { return c > ' '; } + +simde__m256i nonblank_column1_mask(simde__m256i bytes) +{ + return simde_mm256_cmpgt_epi8(bytes, simde_mm256_set1_epi8(' ')); +} + +const char* section_name(mps_section_kind kind) +{ + switch (kind) { + case mps_section_kind::rows: return "ROWS"; + case mps_section_kind::columns: return "COLUMNS"; + case mps_section_kind::rhs: return "RHS"; + case mps_section_kind::bounds: return "BOUNDS"; + case mps_section_kind::ranges: return "RANGES"; + case mps_section_kind::quadobj: return "QUADOBJ"; + case mps_section_kind::qmatrix: return "QMATRIX"; + case mps_section_kind::qcmatrix: return "QCMATRIX"; + case mps_section_kind::endata: return "ENDATA"; + } + return ""; +} + +std::size_t section_name_len(mps_section_kind kind) { return std::strlen(section_name(kind)); } + +} // namespace + +std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase) +{ + switch (phase) { + case mps_phase_kind::header: return 0; + case mps_phase_kind::rows: return 1; + case mps_phase_kind::columns: return 2; + case mps_phase_kind::rhs: return 3; + case mps_phase_kind::bounds: return 4; + case mps_phase_kind::ranges: return 5; + case mps_phase_kind::quadratic: return 6; + } + throw std::runtime_error("invalid MPS phase kind"); +} + +void mps_phase_registry_t::publish(mps_phase_kind phase, mps_phase_range_t range) +{ + std::size_t idx = phase_index(phase); + omp_event_handle_t event{}; + bool fulfill = false; + { + std::lock_guard lock(mutex_); + if (ready_[idx].load(std::memory_order_acquire)) { return; } + ranges_[idx] = range; + ready_[idx].store(true, std::memory_order_release); + if (has_event_[idx] && !event_fulfilled_[idx]) { + event = events_[idx]; + event_fulfilled_[idx] = true; + fulfill = true; + } + } + if 
(fulfill) { omp_fulfill_event(event); } +} + +void mps_phase_registry_t::attach_event(mps_phase_kind phase, omp_event_handle_t event) +{ + std::size_t idx = phase_index(phase); + bool fulfill = false; + { + std::lock_guard lock(mutex_); + events_[idx] = event; + has_event_[idx] = true; + if (ready_[idx].load(std::memory_order_acquire) && !event_fulfilled_[idx]) { + event_fulfilled_[idx] = true; + fulfill = true; + } + } + if (fulfill) { omp_fulfill_event(event); } +} + +bool mps_phase_registry_t::ready(mps_phase_kind phase) const +{ + return ready_[phase_index(phase)].load(std::memory_order_acquire); +} + +mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const +{ + return ranges_[phase_index(phase)]; +} + +bool line_is_section(const char* line_start, const char* line_end, mps_section_kind* kind) +{ + if (line_start >= line_end) { return false; } + + mps_section_kind candidate; + switch (*line_start) { + case 'R': + if (line_end - line_start >= 3 && std::memcmp(line_start, "RHS", 3) == 0) { + candidate = mps_section_kind::rhs; + } else if (line_end - line_start >= 4 && std::memcmp(line_start, "ROWS", 4) == 0) { + candidate = mps_section_kind::rows; + } else if (line_end - line_start >= 6 && std::memcmp(line_start, "RANGES", 6) == 0) { + candidate = mps_section_kind::ranges; + } else { + return false; + } + break; + case 'C': + if (line_end - line_start >= 7 && std::memcmp(line_start, "COLUMNS", 7) == 0) { + candidate = mps_section_kind::columns; + } else { + return false; + } + break; + case 'B': + if (line_end - line_start >= 6 && std::memcmp(line_start, "BOUNDS", 6) == 0) { + candidate = mps_section_kind::bounds; + } else { + return false; + } + break; + case 'E': + if (line_end - line_start >= 6 && std::memcmp(line_start, "ENDATA", 6) == 0) { + candidate = mps_section_kind::endata; + } else { + return false; + } + break; + case 'Q': + if (line_end - line_start >= 7 && std::memcmp(line_start, "QUADOBJ", 7) == 0) { + candidate = 
mps_section_kind::quadobj; + } else if (line_end - line_start >= 7 && std::memcmp(line_start, "QMATRIX", 7) == 0) { + candidate = mps_section_kind::qmatrix; + } else if (line_end - line_start >= 8 && std::memcmp(line_start, "QCMATRIX", 8) == 0) { + candidate = mps_section_kind::qcmatrix; + } else { + return false; + } + break; + default: return false; + } + + const char* after = line_start + section_name_len(candidate); + while (after < line_end && (*after == ' ' || *after == '\t' || *after == '\r')) { + ++after; + } + if (after != line_end) { return false; } + *kind = candidate; + return true; +} + +mps_section_block_scanner_t::mps_section_block_scanner_t(const char* data, + std::size_t block_count, + mps_phase_registry_t& registry) + : data_(data), + block_count_(block_count), + registry_(registry), + block_decoded_(std::make_unique[]>(block_count)), + block_begin_offsets_(std::make_unique(block_count)), + block_end_offsets_(std::make_unique(block_count)) +{ + for (std::size_t i = 0; i < block_count_; ++i) { + block_decoded_[i].store(0, std::memory_order_relaxed); + block_begin_offsets_[i].store(0, std::memory_order_relaxed); + block_end_offsets_[i].store(0, std::memory_order_relaxed); + } +} + +std::size_t mps_section_block_scanner_t::section_hit_index(mps_section_kind kind) +{ + switch (kind) { + case mps_section_kind::rows: return 0; + case mps_section_kind::columns: return 1; + case mps_section_kind::rhs: return 2; + case mps_section_kind::bounds: return 3; + case mps_section_kind::ranges: return 4; + case mps_section_kind::quadobj: return 5; + case mps_section_kind::qmatrix: return 6; + case mps_section_kind::qcmatrix: return 7; + case mps_section_kind::endata: return 8; + } + return 0; +} + +void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, const char* ptr) +{ + std::atomic& slot = section_hits_[section_hit_index(kind)]; + const char* expected = nullptr; + if (slot.compare_exchange_strong( + expected, ptr, 
std::memory_order_release, std::memory_order_acquire)) { + publish_section_ranges(); + } +} + +void mps_section_block_scanner_t::scan_section_range(const char* begin, + const char* end, + bool boundary_scan) +{ + (void)boundary_scan; + if (begin >= end) return; + const char* p = begin; + + // Interior scans that start inside a decoded block skip the leading partial + // line. A separate boundary scan covers section titles whose newline/title + // bytes straddle adjacent LZ4 blocks. + if (p != data_) { + const void* nl = __builtin_memchr(p, '\n', static_cast(end - p)); + if (nl == nullptr) { return; } + p = static_cast(nl) + 1; + } + + auto try_candidate = [&](const char* line_start) { + const void* nl = __builtin_memchr(line_start, '\n', static_cast(end - line_start)); + const char* line_end = nl == nullptr ? end : static_cast(nl); + mps_section_kind kind; + if (line_is_section(line_start, line_end, &kind)) { record_section_hit(kind, line_start); } + }; + + // Handle the very first line of a file (NAME indicator, usually) + if (p == data_) { + if (p < end && is_nonblank_column1(static_cast(*p))) { try_candidate(p); } + ++p; + } + + // In compliant MPS, indicator records begin in column 1 while data records + // begin in column 2+. Treat start-of-file or "\n[nonblank]" as the cheap + // candidate signal, then run the exact section matcher only for candidates. 
+ const simde__m256i newline = simde_mm256_set1_epi8('\n'); + while (static_cast(end - p) >= 32) { + simde__m256i current = simde_mm256_loadu_si256(reinterpret_cast(p)); + simde__m256i previous = simde_mm256_loadu_si256(reinterpret_cast(p - 1)); + std::uint32_t mask = static_cast(simde_mm256_movemask_epi8(simde_mm256_and_si256( + simde_mm256_cmpeq_epi8(previous, newline), nonblank_column1_mask(current)))); + while (mask != 0) { + int bit = __builtin_ctz(mask); + try_candidate(p + bit); + mask &= mask - 1; + } + p += 32; + } + + // scalar tail + while (p < end) { + if (*(p - 1) == '\n' && is_nonblank_column1(static_cast(*p))) { + try_candidate(p); + } + ++p; + } +} + +void mps_section_block_scanner_t::scan_boundary(std::size_t left_index, std::size_t right_index) +{ + std::size_t left_begin = block_begin_offsets_[left_index].load(std::memory_order_acquire); + std::size_t boundary = block_begin_offsets_[right_index].load(std::memory_order_acquire); + std::size_t right_end = block_end_offsets_[right_index].load(std::memory_order_acquire); + std::size_t begin = + boundary - left_begin > boundary_overlap ? boundary - boundary_overlap : left_begin; + std::size_t end = + right_end - boundary > boundary_overlap ? 
boundary + boundary_overlap : right_end; + scan_section_range(data_ + begin, data_ + end, true); +} + +void mps_section_block_scanner_t::observe_block(std::size_t block_index, + const char* begin, + const char* end) +{ + if (block_index >= block_count_) { + throw std::runtime_error("MPS section scanner observed invalid LZ4 block index"); + } + + scan_section_range(begin, end, false); + block_begin_offsets_[block_index].store(static_cast(begin - data_), + std::memory_order_relaxed); + block_end_offsets_[block_index].store(static_cast(end - data_), + std::memory_order_relaxed); + block_decoded_[block_index].store(1, std::memory_order_release); + + if (block_index > 0 && block_decoded_[block_index - 1].load(std::memory_order_acquire)) { + scan_boundary(block_index - 1, block_index); + } + if (block_index + 1 < block_count_ && + block_decoded_[block_index + 1].load(std::memory_order_acquire)) { + scan_boundary(block_index, block_index + 1); + } +} + +void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes) +{ + ready_bytes_.store(ready_bytes, std::memory_order_release); + publish_section_ranges(); +} + +void mps_section_block_scanner_t::publish_section_ranges() +{ + std::lock_guard lock(publish_mutex_); + std::size_t ready = ready_bytes_.load(std::memory_order_acquire); + const char* ready_ptr = data_ + ready; + const char* rows = + section_hits_[section_hit_index(mps_section_kind::rows)].load(std::memory_order_acquire); + const char* columns = + section_hits_[section_hit_index(mps_section_kind::columns)].load(std::memory_order_acquire); + const char* rhs = + section_hits_[section_hit_index(mps_section_kind::rhs)].load(std::memory_order_acquire); + const char* bounds = + section_hits_[section_hit_index(mps_section_kind::bounds)].load(std::memory_order_acquire); + const char* ranges = + section_hits_[section_hit_index(mps_section_kind::ranges)].load(std::memory_order_acquire); + const char* quadobj = + 
section_hits_[section_hit_index(mps_section_kind::quadobj)].load(std::memory_order_acquire); + const char* qmatrix = + section_hits_[section_hit_index(mps_section_kind::qmatrix)].load(std::memory_order_acquire); + const char* qcmatrix = + section_hits_[section_hit_index(mps_section_kind::qcmatrix)].load(std::memory_order_acquire); + const char* endata = + section_hits_[section_hit_index(mps_section_kind::endata)].load(std::memory_order_acquire); + auto available = [&](const char* p) { return p != nullptr && p <= ready_ptr; }; + bool final_ready = + block_count_ == 0 || + (block_decoded_[block_count_ - 1].load(std::memory_order_acquire) && + ready == block_end_offsets_[block_count_ - 1].load(std::memory_order_acquire)); + const char* final_boundary = available(endata) ? endata : (final_ready ? ready_ptr : nullptr); + auto earliest_available_after = [&](const char* after, + std::initializer_list candidates) { + const char* best = nullptr; + for (const char* p : candidates) { + if (!available(p) || (after != nullptr && p <= after)) { continue; } + if (best == nullptr || p < best) { best = p; } + } + return best; + }; + + if (available(rows) && !registry_.ready(mps_phase_kind::header)) { + registry_.publish(mps_phase_kind::header, {data_, rows, true}); + } + if (available(rows) && available(columns) && !registry_.ready(mps_phase_kind::rows)) { + registry_.publish(mps_phase_kind::rows, {rows, columns, true}); + } + if (available(columns) && !registry_.ready(mps_phase_kind::columns)) { + const char* columns_end = earliest_available_after( + columns, {rhs, ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + if (columns_end != nullptr) { + registry_.publish(mps_phase_kind::columns, {columns, columns_end, true}); + } + } + + if (!registry_.ready(mps_phase_kind::rhs)) { + if (available(rhs)) { + const char* rhs_end = + earliest_available_after(rhs, {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + if (rhs_end != nullptr) { 
registry_.publish(mps_phase_kind::rhs, {rhs, rhs_end, true}); } + } else { + const char* after_columns = earliest_available_after( + columns, {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + if (after_columns != nullptr) { + registry_.publish(mps_phase_kind::rhs, {nullptr, nullptr, false}); + } + } + } + + if (!registry_.ready(mps_phase_kind::ranges)) { + const char* ranges_end = + earliest_available_after(ranges, {bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + const char* after_rhs = earliest_available_after( + rhs ? rhs : columns, {bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + if (available(ranges) && ranges_end != nullptr) { + registry_.publish(mps_phase_kind::ranges, {ranges, ranges_end, true}); + } else if (!ranges && after_rhs != nullptr) { + registry_.publish(mps_phase_kind::ranges, {nullptr, nullptr, false}); + } + } + + if (!registry_.ready(mps_phase_kind::bounds)) { + const char* bounds_end = + earliest_available_after(bounds, {quadobj, qmatrix, qcmatrix, final_boundary}); + const char* after_ranges = earliest_available_after( + ranges ? ranges : (rhs ? 
rhs : columns), {quadobj, qmatrix, qcmatrix, final_boundary}); + if (available(bounds) && bounds_end != nullptr) { + registry_.publish(mps_phase_kind::bounds, {bounds, bounds_end, true}); + } else if (!bounds && after_ranges != nullptr) { + registry_.publish(mps_phase_kind::bounds, {nullptr, nullptr, false}); + } + } + + if (!registry_.ready(mps_phase_kind::quadratic)) { + const char* quadratic_begin = nullptr; + if (available(quadobj)) { quadratic_begin = quadobj; } + if (available(qmatrix) && (quadratic_begin == nullptr || qmatrix < quadratic_begin)) { + quadratic_begin = qmatrix; + } + if (available(qcmatrix) && (quadratic_begin == nullptr || qcmatrix < quadratic_begin)) { + quadratic_begin = qcmatrix; + } + if (quadratic_begin != nullptr && final_boundary != nullptr) { + registry_.publish(mps_phase_kind::quadratic, {quadratic_begin, final_boundary, true}); + } else if (quadratic_begin == nullptr && final_boundary != nullptr) { + registry_.publish(mps_phase_kind::quadratic, {nullptr, nullptr, false}); + } + } +} + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp new file mode 100644 index 0000000000..0c492b0074 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp @@ -0,0 +1,98 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace mps_fast { + +enum class mps_section_kind { + rows, + columns, + rhs, + bounds, + ranges, + quadobj, + qmatrix, + qcmatrix, + endata, +}; + +enum class mps_phase_kind { + header, + rows, + columns, + rhs, + bounds, + ranges, + quadratic, +}; + +struct mps_phase_range_t { + const char* begin = nullptr; + const char* end = nullptr; + bool present = false; +}; + +class mps_phase_registry_t { + public: + void publish(mps_phase_kind phase, mps_phase_range_t range); + void attach_event(mps_phase_kind phase, omp_event_handle_t event); + + bool ready(mps_phase_kind phase) const; + mps_phase_range_t range(mps_phase_kind phase) const; + + private: + static constexpr std::size_t phase_count = 7; + + static std::size_t phase_index(mps_phase_kind phase); + + mps_phase_range_t ranges_[phase_count]{}; + std::atomic ready_[phase_count]{}; + omp_event_handle_t events_[phase_count]{}; + bool has_event_[phase_count]{}; + bool event_fulfilled_[phase_count]{}; + mutable std::mutex mutex_; +}; + +bool line_is_section(const char* line_start, const char* line_end, mps_section_kind* kind); + +class mps_section_block_scanner_t { + public: + mps_section_block_scanner_t(const char* data, + std::size_t block_count, + mps_phase_registry_t& registry); + + void observe_block(std::size_t block_index, const char* begin, const char* end); + void publish_ready(std::size_t ready_bytes); + + private: + static constexpr std::size_t section_count = 9; + static constexpr std::size_t boundary_overlap = 128; + + static std::size_t section_hit_index(mps_section_kind kind); + + void scan_section_range(const char* begin, const char* end, bool boundary_scan); + void scan_boundary(std::size_t left_index, std::size_t right_index); + void record_section_hit(mps_section_kind kind, const char* ptr); + void publish_section_ranges(); + + const char* data_ = nullptr; + std::size_t 
block_count_ = 0; + mps_phase_registry_t& registry_; + std::mutex publish_mutex_; + std::unique_ptr[]> block_decoded_; + std::unique_ptr block_begin_offsets_; + std::unique_ptr block_end_offsets_; + std::atomic_size_t ready_bytes_{0}; + std::atomic section_hits_[section_count]{}; +}; + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp new file mode 100644 index 0000000000..650d28dbc2 --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp @@ -0,0 +1,135 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#ifdef MPS_FAST_NVTX +#include +#include +#include +#endif + +namespace mps_fast::nvtx { + +namespace colors { +constexpr std::uint32_t generic = 0xff8b949e; +constexpr std::uint32_t io = 0xff58a6ff; +constexpr std::uint32_t decode = 0xff3fb950; +constexpr std::uint32_t rows = 0xffd29922; +constexpr std::uint32_t columns = 0xffff7b72; +constexpr std::uint32_t rhs = 0xffa371f7; +constexpr std::uint32_t bounds = 0xfff0883e; +constexpr std::uint32_t ranges = 0xff79c0ff; +constexpr std::uint32_t names = 0xff56d364; +constexpr std::uint32_t alloc = 0xffdb61a2; +constexpr std::uint32_t finalize = 0xffc9d1d9; +} // namespace colors + +inline std::uint32_t color_for_name(std::string_view name) noexcept +{ + if (name.find("lz4") != std::string_view::npos || name.find("read") != std::string_view::npos) { + return colors::io; + } + if (name.find("decode") != std::string_view::npos || + name.find("decompress") != std::string_view::npos) { + return colors::decode; + } + if (name.find("row") != std::string_view::npos) { return colors::rows; } + if (name.find("column") != std::string_view::npos || name.find("csr") != std::string_view::npos) { + return colors::columns; + } + if (name.find("rhs") != std::string_view::npos) { 
return colors::rhs; } + if (name.find("bound") != std::string_view::npos) { return colors::bounds; } + if (name.find("range") != std::string_view::npos) { return colors::ranges; } + if (name.find("name") != std::string_view::npos || + name.find("materialize") != std::string_view::npos) { + return colors::names; + } + if (name.find("alloc") != std::string_view::npos || + name.find("resize") != std::string_view::npos || + name.find("mmap") != std::string_view::npos) { + return colors::alloc; + } + if (name.find("finalize") != std::string_view::npos) { return colors::finalize; } + return colors::generic; +} + +class scoped_range { + public: + explicit scoped_range(const char* name, + std::uint32_t color = colors::generic, + std::uint32_t category = 0) + { + push(name, color, category); + } + + explicit scoped_range(std::string name, + std::uint32_t color = colors::generic, + std::uint32_t category = 0) + : owned_name_(std::move(name)) + { + push(owned_name_.c_str(), color, category); + } + + ~scoped_range() { end(); } + + void end() + { +#ifdef MPS_FAST_NVTX + if (active_) { + nvtxRangePop(); + active_ = false; + } +#endif + } + + scoped_range(const scoped_range&) = delete; + scoped_range& operator=(const scoped_range&) = delete; + + private: + void push(const char* name, std::uint32_t color, std::uint32_t category) + { +#ifdef MPS_FAST_NVTX + nvtxEventAttributes_t event{}; + event.version = NVTX_VERSION; + event.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + event.colorType = NVTX_COLOR_ARGB; + event.color = color; + event.messageType = NVTX_MESSAGE_TYPE_ASCII; + event.message.ascii = name; + event.category = category; + nvtxRangePushEx(&event); + active_ = true; +#else + (void)name; + (void)color; + (void)category; +#endif + } + + std::string owned_name_; +#ifdef MPS_FAST_NVTX + bool active_ = false; +#endif +}; + +inline void name_current_thread(const char* name) +{ +#ifdef MPS_FAST_NVTX + nvtxNameOsThreadA(static_cast(::syscall(SYS_gettid)), name); +#else + (void)name; 
+#endif +} + +} // namespace mps_fast::nvtx + +#define MPS_FAST_NVTX_CONCAT_INNER(a, b) a##b +#define MPS_FAST_NVTX_CONCAT(a, b) MPS_FAST_NVTX_CONCAT_INNER(a, b) +#define MPS_NVTX_RANGE(name, color) \ + ::mps_fast::nvtx::scoped_range MPS_FAST_NVTX_CONCAT(_mps_nvtx_range_, __LINE__)(name, color) diff --git a/cpp/src/io/experimental_mps_fast/simd_compat.hpp b/cpp/src/io/experimental_mps_fast/simd_compat.hpp new file mode 100644 index 0000000000..d81af7a2eb --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/simd_compat.hpp @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#pragma once + +// Use SIMDe's explicit simde_* API. On x86 it can still lower to native +// intrinsics; on other targets it provides the portable implementation. +#include +#include +#include diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp index af1368865d..12f9ed488a 100644 --- a/cpp/tests/linear_programming/parser_test.cpp +++ b/cpp/tests/linear_programming/parser_test.cpp @@ -56,6 +56,21 @@ bool file_exists(const std::string& file) namespace { +struct mps_reader_param_t { + const char* name; + mps_reader_type_t reader; +}; + +constexpr mps_reader_param_t default_mps_reader_param{"default_reader", + mps_reader_type_t::default_reader}; +constexpr mps_reader_param_t fast_mps_reader_param{"fast_experimental", + mps_reader_type_t::fast_experimental}; + +std::string mps_reader_param_name(const ::testing::TestParamInfo& info) +{ + return info.param.name; +} + // Non-template forwarding wrapper around read_lp_from_string. 
// Exists only so EXPECT_THROW(read_lp_string(R"LP(...)LP"), exc) is parsed // correctly — gtest's macro splits its args on top-level commas, and the @@ -115,14 +130,14 @@ double q_entry(const mps_data_model_t& m, int row, int col) // =========================================================================== // Per-fixture test classes. Each class describes one named problem fixture // and owns the checker for that problem's expected parsed data model. The -// MPS and LP TEST_F cases within a fixture share the same `check_model` +// MPS TEST_P and LP TEST_F cases within a fixture share the same `check_model` // method, so the expected values live in exactly one place per fixture. // // All fixtures inherit a common base that supplies read_mps_file and // read_lp_file helpers. // =========================================================================== -class parser_fixture_base : public ::testing::Test { +class parser_fixture_base : public ::testing::TestWithParam { protected: static mps_data_model_t read_mps_file(const std::string& file, bool fixed_format = true) @@ -131,6 +146,18 @@ class parser_fixture_base : public ::testing::Test { return read_mps(root + "/" + file, fixed_format); } + mps_data_model_t read_param_mps_file(const std::string& file, + bool fixed_format = true) const + { + const std::string& root = cuopt::test::get_rapids_dataset_root_dir(); + const auto reader = GetParam().reader; + // The experimental reader has no fixed/free parser mode. Use the same file but do not force + // fixed-format dispatch for that reader. + const bool reader_fixed_format = + reader == mps_reader_type_t::default_reader ? 
fixed_format : false; + return read(root + "/" + file, reader, reader_fixed_format); + } + static mps_data_model_t read_lp_file(const std::string& file) { const std::string& root = cuopt::test::get_rapids_dataset_root_dir(); @@ -357,9 +384,13 @@ TEST(mps_parser, bad_mps_files) } } -TEST_F(good_mps_1_test, mps) +TEST_P(good_mps_1_test, mps) +{ + check_model(read_param_mps_file("linear_programming/good-mps-1.mps", false)); +} + +TEST_F(good_mps_1_test, mps_parser_internals) { - check_model(read_mps_file("linear_programming/good-mps-1.mps")); // Parser-struct fields that are MPS-only (not exposed via the data model). auto mps = read_from_mps("linear_programming/good-mps-1.mps"); EXPECT_EQ("good-1", mps.problem_name); @@ -592,9 +623,13 @@ TEST(mps_parser_free_format, bad_mps_files_free_format) } } -TEST_F(up_low_bounds_test, mps) +TEST_P(up_low_bounds_test, mps) +{ + check_model(read_param_mps_file("linear_programming/lp_model_with_var_bounds.mps", false)); +} + +TEST_F(up_low_bounds_test, mps_parser_internals) { - check_model(read_mps_file("linear_programming/lp_model_with_var_bounds.mps", false)); auto mps = read_from_mps("linear_programming/lp_model_with_var_bounds.mps", false); EXPECT_EQ("lp_model_with_var_bounds", mps.problem_name); EXPECT_EQ("OBJ", mps.objective_name); @@ -607,16 +642,16 @@ TEST_F(up_low_bounds_test, lp) check_model(read_lp_file("linear_programming/lp_model_with_var_bounds.lp")); } -TEST_F(good_mps_1_test, mps_free_format) +TEST_P(good_mps_1_test, mps_free_format) { // free-format-mps-1.mps encodes the same problem as good-mps-1 with default // [0, +inf) bounds (no BOUNDS section), so it satisfies the same checker. 
- check_model(read_mps_file("linear_programming/free-format-mps-1.mps", false)); + check_model(read_param_mps_file("linear_programming/free-format-mps-1.mps", false)); } -TEST_F(some_var_bounds_test, mps) +TEST_P(some_var_bounds_test, mps) { - check_model(read_mps_file("linear_programming/good-mps-some-var-bounds.mps")); + check_model(read_param_mps_file("linear_programming/good-mps-some-var-bounds.mps")); } TEST_F(some_var_bounds_test, lp) @@ -624,9 +659,9 @@ TEST_F(some_var_bounds_test, lp) check_model(read_lp_file("linear_programming/good-mps-some-var-bounds.lp")); } -TEST_F(fixed_var_bound_test, mps) +TEST_P(fixed_var_bound_test, mps) { - check_model(read_mps_file("linear_programming/good-mps-fixed-var.mps")); + check_model(read_param_mps_file("linear_programming/good-mps-fixed-var.mps")); } TEST_F(fixed_var_bound_test, lp) @@ -634,9 +669,9 @@ TEST_F(fixed_var_bound_test, lp) check_model(read_lp_file("linear_programming/good-mps-fixed-var.lp")); } -TEST_F(free_var_bound_test, mps) +TEST_P(free_var_bound_test, mps) { - check_model(read_mps_file("linear_programming/good-mps-free-var.mps")); + check_model(read_param_mps_file("linear_programming/good-mps-free-var.mps")); } TEST_F(free_var_bound_test, lp) @@ -644,9 +679,9 @@ TEST_F(free_var_bound_test, lp) check_model(read_lp_file("linear_programming/good-mps-free-var.lp")); } -TEST_F(lower_inf_var_bound_test, mps) +TEST_P(lower_inf_var_bound_test, mps) { - check_model(read_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps")); + check_model(read_param_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps")); } TEST_F(lower_inf_var_bound_test, lp) @@ -662,9 +697,9 @@ TEST(mps_bounds, rhs_cost) EXPECT_EQ(int(-5), mps.objective_offset_value); } -TEST_F(upper_inf_var_bound_test, mps) +TEST_P(upper_inf_var_bound_test, mps) { - check_model(read_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps")); + check_model(read_param_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps")); } 
TEST_F(upper_inf_var_bound_test, lp) @@ -817,9 +852,13 @@ TEST(mps_bounds, unsupported_or_invalid_mps_types) }; } -TEST_F(mip_with_bounds_test, mps) +TEST_P(mip_with_bounds_test, mps) +{ + check_model(read_param_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false)); +} + +TEST_F(mip_with_bounds_test, mps_parser_internals) { - check_model(read_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false)); auto mps = read_from_mps("mixed_integer_programming/good-mip-mps-1.mps", false); EXPECT_EQ("COST", mps.objective_name); ASSERT_EQ(int(2), mps.row_types.size()); @@ -877,9 +916,9 @@ TEST(mps_parser, good_mps_file_mip_no_marker) EXPECT_EQ(10., mps.variable_upper_bounds[1]); } -TEST_F(mip_no_bounds_test, mps) +TEST_P(mip_no_bounds_test, mps) { - check_model(read_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false)); + check_model(read_param_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false)); } TEST_F(mip_no_bounds_test, lp) @@ -887,9 +926,10 @@ TEST_F(mip_no_bounds_test, lp) check_model(read_lp_file("mixed_integer_programming/good-mip-mps-no-bounds.lp")); } -TEST_F(mip_partial_bounds_test, mps) +TEST_P(mip_partial_bounds_test, mps) { - check_model(read_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false)); + check_model( + read_param_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false)); } TEST_F(mip_partial_bounds_test, lp) @@ -897,6 +937,25 @@ TEST_F(mip_partial_bounds_test, lp) check_model(read_lp_file("mixed_integer_programming/good-mip-mps-partial-bounds.lp")); } +#define INSTANTIATE_MPS_READER_TEST(Fixture) \ + INSTANTIATE_TEST_SUITE_P(mps_readers, \ + Fixture, \ + ::testing::Values(default_mps_reader_param, fast_mps_reader_param), \ + mps_reader_param_name) + +INSTANTIATE_MPS_READER_TEST(good_mps_1_test); +INSTANTIATE_MPS_READER_TEST(up_low_bounds_test); +INSTANTIATE_MPS_READER_TEST(some_var_bounds_test); +INSTANTIATE_MPS_READER_TEST(fixed_var_bound_test); 
+INSTANTIATE_MPS_READER_TEST(free_var_bound_test); +INSTANTIATE_MPS_READER_TEST(lower_inf_var_bound_test); +INSTANTIATE_MPS_READER_TEST(upper_inf_var_bound_test); +INSTANTIATE_MPS_READER_TEST(mip_with_bounds_test); +INSTANTIATE_MPS_READER_TEST(mip_no_bounds_test); +INSTANTIATE_MPS_READER_TEST(mip_partial_bounds_test); + +#undef INSTANTIATE_MPS_READER_TEST + #ifdef MPS_PARSER_WITH_BZIP2 TEST(mps_parser, good_mps_file_bzip2_compressed) { From 68daf3d31af007cb36fb44fe7a45da67b962ee23 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Wed, 3 Jun 2026 04:17:59 -0700 Subject: [PATCH 02/22] thread count cap --- cpp/src/io/experimental_mps_fast/fast_parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index bce17a435f..ae881bebe2 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -2591,7 +2591,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ int header_done = 0, rows_done = 0, columns_done = 0; int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0; -#pragma omp parallel num_threads(omp_get_max_threads()) +#pragma omp parallel num_threads(std::min(32, omp_get_max_threads())) { std::string thread_name = "omp-parser-" + std::to_string(omp_get_thread_num()); nvtx::name_current_thread(thread_name.c_str()); From eb0e285da92169aff0cf94f9978d1fd8b96bc7e2 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 5 Jun 2026 02:42:06 -0700 Subject: [PATCH 03/22] fix crashes, more opti --- .../fast_parse_primitives.hpp | 231 ++---- .../io/experimental_mps_fast/fast_parser.cpp | 705 +++++++++++++----- .../fast_parser_adapter.cpp | 6 + .../io/experimental_mps_fast/file_reader.cpp | 52 +- .../experimental_mps_fast/lz4_file_reader.cpp | 33 +- .../experimental_mps_fast/perf_counters.hpp | 163 ++++ 6 files changed, 810 insertions(+), 380 
deletions(-) create mode 100644 cpp/src/io/experimental_mps_fast/perf_counters.hpp diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp index 9da59e7b44..453687df01 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp @@ -26,124 +26,6 @@ namespace mps_fast { -// double values in MPS data rarely need more than this many fractional digits. -inline constexpr double decimals[16][10] = { - {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}, - {0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09}, - {0.000, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009}, - {0.0000, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009}, - {0.00000, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009}, - {0.000000, - 0.000001, - 0.000002, - 0.000003, - 0.000004, - 0.000005, - 0.000006, - 0.000007, - 0.000008, - 0.000009}, - {0.0000000, - 0.0000001, - 0.0000002, - 0.0000003, - 0.0000004, - 0.0000005, - 0.0000006, - 0.0000007, - 0.0000008, - 0.0000009}, - {0.00000000, - 0.00000001, - 0.00000002, - 0.00000003, - 0.00000004, - 0.00000005, - 0.00000006, - 0.00000007, - 0.00000008, - 0.00000009}, - {0.000000000, - 0.000000001, - 0.000000002, - 0.000000003, - 0.000000004, - 0.000000005, - 0.000000006, - 0.000000007, - 0.000000008, - 0.000000009}, - {0.0000000000, - 0.0000000001, - 0.0000000002, - 0.0000000003, - 0.0000000004, - 0.0000000005, - 0.0000000006, - 0.0000000007, - 0.0000000008, - 0.0000000009}, - {0.00000000000, - 0.00000000001, - 0.00000000002, - 0.00000000003, - 0.00000000004, - 0.00000000005, - 0.00000000006, - 0.00000000007, - 0.00000000008, - 0.00000000009}, - {0.000000000000, - 0.000000000001, - 0.000000000002, - 0.000000000003, - 0.000000000004, - 0.000000000005, - 0.000000000006, - 0.000000000007, - 0.000000000008, - 0.000000000009}, - 
{0.0000000000000, - 0.0000000000001, - 0.0000000000002, - 0.0000000000003, - 0.0000000000004, - 0.0000000000005, - 0.0000000000006, - 0.0000000000007, - 0.0000000000008, - 0.0000000000009}, - {0.00000000000000, - 0.00000000000001, - 0.00000000000002, - 0.00000000000003, - 0.00000000000004, - 0.00000000000005, - 0.00000000000006, - 0.00000000000007, - 0.00000000000008, - 0.00000000000009}, - {0.000000000000000, - 0.000000000000001, - 0.000000000000002, - 0.000000000000003, - 0.000000000000004, - 0.000000000000005, - 0.000000000000006, - 0.000000000000007, - 0.000000000000008, - 0.000000000000009}, - {0.0000000000000000, - 0.0000000000000001, - 0.0000000000000002, - 0.0000000000000003, - 0.0000000000000004, - 0.0000000000000005, - 0.0000000000000006, - 0.0000000000000007, - 0.0000000000000008, - 0.0000000000000009}}; - inline constexpr int EXP10_TABLE_MAX = 308; constexpr double constexpr_pow10(int exp) @@ -173,42 +55,9 @@ inline constexpr auto table_exp10 = make_exp10_table(); static inline bool is_digit_byte(char c) noexcept { return c >= '0' && c <= '9'; } -static inline double fast_frac_atoi(const char*& data, const char* end) -{ - double val = 0.0; - -#define MPS_FAST_FRAC_DIGIT(i) \ - do { \ - if (data >= end || !is_digit_byte(*data)) return val; \ - val += decimals[i][static_cast(*data) & 0xF]; \ - ++data; \ - } while (0) - - MPS_FAST_FRAC_DIGIT(0); - MPS_FAST_FRAC_DIGIT(1); - MPS_FAST_FRAC_DIGIT(2); - MPS_FAST_FRAC_DIGIT(3); - MPS_FAST_FRAC_DIGIT(4); - MPS_FAST_FRAC_DIGIT(5); - MPS_FAST_FRAC_DIGIT(6); - MPS_FAST_FRAC_DIGIT(7); - MPS_FAST_FRAC_DIGIT(8); - MPS_FAST_FRAC_DIGIT(9); - MPS_FAST_FRAC_DIGIT(10); - MPS_FAST_FRAC_DIGIT(11); - MPS_FAST_FRAC_DIGIT(12); - MPS_FAST_FRAC_DIGIT(13); - MPS_FAST_FRAC_DIGIT(14); - MPS_FAST_FRAC_DIGIT(15); - -#undef MPS_FAST_FRAC_DIGIT - - while (data < end && is_digit_byte(*data)) { - ++data; - } - return val; -} - +// Honestly, it's pretty bare bones as it is. 
It could take advantage of SIMD/SWAR +// or use the Eisel-Lemire trick. Would have to be validated through benchmarking +// but usually MPS files use simple enough coefficients static inline double fast_atof_core(const char*& data, const char* end) { double sign = 1.0; @@ -219,17 +68,32 @@ static inline double fast_atof_core(const char*& data, const char* end) ++data; } - uint64_t int_part = 0; - while (data < end && is_digit_byte(*data)) { - int_part = int_part * 10 + (*data - '0'); - ++data; - } - - double result = static_cast(int_part); - - if (data < end && *data == '.') { - ++data; - result += fast_frac_atoi(data, end); + uint64_t significand = 0; + int decimal_exponent = 0; + int significant_digits = 0; + bool seen_dot = false; + + while (data < end) { + char c = *data; + if (is_digit_byte(c)) { + int digit = c - '0'; + if (seen_dot) { --decimal_exponent; } + if (significand != 0 || digit != 0) { + // FP64 can't represent more than that + if (significant_digits < 19) { + significand = significand * 10 + static_cast(digit); + ++significant_digits; + } else if (!seen_dot) { + ++decimal_exponent; + } + } + ++data; + } else if (c == '.' 
&& !seen_dot) { + seen_dot = true; + ++data; + } else { + break; + } } if (data < end && (*data == 'e' || *data == 'E' || *data == 'd' || *data == 'D')) { @@ -249,11 +113,14 @@ static inline double fast_atof_core(const char*& data, const char* end) } exponent *= exp_sign; - if (exponent >= -EXP10_TABLE_MAX && exponent <= EXP10_TABLE_MAX) { - result *= table_exp10[static_cast(exponent + EXP10_TABLE_MAX)]; - } else { - result *= std::pow(10.0, exponent); - } + decimal_exponent += exponent; + } + + double result = static_cast(significand); + if (decimal_exponent >= -EXP10_TABLE_MAX && decimal_exponent <= EXP10_TABLE_MAX) { + result *= table_exp10[static_cast(decimal_exponent + EXP10_TABLE_MAX)]; + } else { + result *= std::pow(10.0, decimal_exponent); } return sign * result; @@ -352,17 +219,29 @@ struct cursor_t { void skip_ws() { ptr = simd_scan(ptr, end); } + bool eol() const { return ptr < end && (*ptr == '\n' || *ptr == '\r'); } + + void consume_eol() + { + if (ptr < end && *ptr == '\r') { + ptr++; + if (ptr < end && *ptr == '\n') { ptr++; } + return; + } + if (ptr < end && *ptr == '\n') { ptr++; } + } + void skip_comment_line() { - while (!done() && *ptr != '\n') { + while (!done() && *ptr != '\n' && *ptr != '\r') { ptr++; } - if (!done()) ptr++; + consume_eol(); } void skip_to_eol() { - while (!done() && *ptr != '\n') { + while (!done() && *ptr != '\n' && *ptr != '\r') { ptr++; } } @@ -480,8 +359,6 @@ struct cursor_t { return {std::string_view(field1_start, field1_end_off), std::string_view(field1_start + field2_start_off, field2_end_off - field2_start_off)}; } - - bool eol() const { return ptr < end && *ptr == '\n'; } }; static inline void expect(cursor_t& cursor, const char* field) @@ -494,7 +371,7 @@ static inline void accept_comment_line(cursor_t& cursor) { for (;;) { while (!cursor.done() && cursor.eol()) { - cursor.advance(1); + cursor.consume_eol(); } if (cursor.done() || (cursor.ptr[0] != '*' && cursor.ptr[0] != '$')) { return; } 
cursor.skip_comment_line(); @@ -507,7 +384,7 @@ static inline void expect_eol(cursor_t& cursor) for (;;) { while (cursor.eol()) { - cursor.advance(1); + cursor.consume_eol(); } if (__unlikely(cursor.done())) { return; } diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index ae881bebe2..35a67346c3 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -44,18 +45,39 @@ namespace mps_fast { -static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS = 4096; -static constexpr int MPS_ROWS_THREAD_CAP = 16; -static constexpr int MPS_COLUMNS_THREAD_CAP = 32; -static constexpr int MPS_BOUNDS_THREAD_CAP = 32; -static constexpr int MPS_NAMES_THREAD_CAP = 16; -static constexpr size_t MPS_BOUNDS_PARALLEL_INIT_MIN_VARS = 16 * 1024 * 1024; -static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES = 256ull * 1024ull * 1024ull; -static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * 1024 * 1024; +static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS = 4096; +static constexpr int MPS_ROWS_THREAD_CAP = 16; +static constexpr int MPS_COLUMNS_THREAD_CAP = 32; +static constexpr int MPS_BOUNDS_THREAD_CAP = 32; +static constexpr int MPS_NAMES_THREAD_CAP = 16; +static constexpr size_t MPS_BOUNDS_PARALLEL_INIT_MIN_VARS = 16 * 1024 * 1024; +static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES = 256ull * 1024ull * 1024ull; +static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8ull * 1024ull * 1024ull; +static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * 1024 * 1024; +static constexpr size_t MPS_SMALL_RAW_FILE_BYTES = 4ull * 1024ull * 1024ull; +static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES = 100ull * 1000ull * 1000ull; +static constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64ull * 1024ull; +static constexpr size_t MPS_ROW_HASH_PARTITIONS = 32; 
+static constexpr int MPS_ROW_HASH_PARTITION_BITS = 5; +static constexpr int MPS_SMALL_FILE_THREAD_CAP = 16; +static constexpr int MPS_LARGE_FILE_THREAD_CAP = 32; + +static int parser_thread_cap_for_size(size_t bytes) +{ + int size_cap = bytes < MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES ? MPS_SMALL_FILE_THREAD_CAP + : MPS_LARGE_FILE_THREAD_CAP; + return std::max(1, std::min(size_cap, omp_get_max_threads())); +} static int phase_thread_count(int phase_cap) { - return std::max(1, std::min(phase_cap, omp_get_max_threads())); + const int available_threads = omp_in_parallel() ? omp_get_num_threads() : omp_get_max_threads(); + return std::max(1, std::min(phase_cap, available_threads)); +} + +static inline size_t row_hash_partition_for(uint32_t hash) +{ + return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS)); } // ============================================================================= @@ -82,12 +104,14 @@ static std::mutex& get_timer_mutex() static void flush_timers() { +#ifdef MPS_FAST_TIMERS std::lock_guard lock(get_timer_mutex()); auto& buffer = get_timer_buffer(); for (const auto& entry : buffer) { std::fprintf(stderr, "[TIMER] %s: %.3f ms\n", entry.name, entry.elapsed_ms); } buffer.clear(); +#endif } static size_t system_page_size() @@ -144,60 +168,44 @@ static void materialize_vector_hugepages(const char* label, class scoped_timer_t { public: scoped_timer_t(const char* name, double* accumulator = nullptr) +#ifdef MPS_FAST_TIMERS : name_(name), accumulator_(accumulator), nvtx_(name, nvtx::color_for_name(name)), - start_(std::chrono::high_resolution_clock::now()) + start_(std::chrono::high_resolution_clock::now()){} +#else + : accumulator_(accumulator) { + (void)name; } +#endif - ~scoped_timer_t() + ~scoped_timer_t() { +#ifdef MPS_FAST_TIMERS auto end = std::chrono::high_resolution_clock::now(); double elapsed_ms = std::chrono::duration(end - start_).count(); nvtx_.end(); if (accumulator_) { *accumulator_ += elapsed_ms; } std::lock_guard 
lock(get_timer_mutex()); get_timer_buffer().push_back({name_, elapsed_ms}); +#endif } scoped_timer_t(const scoped_timer_t&) = delete; scoped_timer_t& operator=(const scoped_timer_t&) = delete; private: +#ifdef MPS_FAST_TIMERS const char* name_; +#endif double* accumulator_; +#ifdef MPS_FAST_TIMERS nvtx::scoped_range nvtx_; std::chrono::high_resolution_clock::time_point start_; +#endif }; -static inline bool section_token_matches(const char* p, - const char* end, - const char* token, - size_t len) -{ - return (size_t)(end - p) >= len && std::memcmp(p, token, len) == 0 && - ((size_t)(end - p) == len || p[len] <= ' '); -} - -static inline bool is_quadratic_section_start(const char* p, const char* end) -{ - return section_token_matches(p, end, "QUADOBJ", 7) || - section_token_matches(p, end, "QMATRIX", 7) || - section_token_matches(p, end, "QCMATRIX", 8); -} - -static inline bool is_rhs_section_end(const char* p, const char* end) -{ - switch (p[0]) { - case 'B': return std::memcmp(p, "BOUNDS", 6) == 0 && p[6] <= ' '; - case 'Q': return is_quadratic_section_start(p, end); - case 'R': return std::memcmp(p, "RANGES", 6) == 0 && p[6] <= ' '; - case 'E': return std::memcmp(p, "ENDATA", 6) == 0 && p[6] <= ' '; - default: return false; - } -} - static inline void error_unknown_row(cursor_t& cursor, const char* row_start, const char* section) { const char* row_end = row_start; @@ -287,6 +295,12 @@ static inline bool dense_suffix_width_ok(uint64_t value, template struct parse_state_t { + struct row_hash_partition_t { + hash_slot_var_t* slots = nullptr; + size_t buckets = 0; + size_t mask = 0; + }; + cuopt::linear_programming::io::mps_data_model_t& problem; cursor_t& cursor; @@ -309,7 +323,9 @@ struct parse_state_t { size_t row_hash_buckets = 0; size_t row_hash_mask = 0; // buckets - 1, for fast modulo via & mmap_region_t row_hash_region; - hash_slot_var_t* row_names_ht = nullptr; + hash_slot_var_t* row_names_ht = nullptr; + size_t row_hash_partition_count = 0; + std::array 
row_hash_partitions = {}; // Overflow map for row names longer than HASH_KEY_BYTES std::unordered_map row_names_long; @@ -326,6 +342,15 @@ struct parse_state_t { // var_names still uses STL (only used in parse_bounds, not as hot) std::unordered_map var_names_map; + struct bounds_only_var_t { + f_t lb = f_t{0}; + f_t ub = std::numeric_limits::infinity(); + char type = 'C'; + }; + + // Some writers introduce zero-column variables only in BOUNDS. + std::map bounds_only_vars; + parse_state_t(cuopt::linear_programming::io::mps_data_model_t& p, cursor_t& c) : problem(p), cursor(c) { @@ -423,13 +448,73 @@ struct parse_state_t { return true; } + size_t row_hash_bucket_count_for(size_t n_rows) const + { +#ifdef MPS_FAST_COMPACT_ROW_HASH + // Keep the row hash compact. Probe counts are usually low, and a smaller + // table reduces cache/TLB footprint on medium instances. + return next_power_of_2(std::max(n_rows + n_rows / 2, (size_t)64)); +#else + // Original conservative sizing policy. + return next_power_of_2(std::max((size_t)(n_rows * 2), (size_t)64)); +#endif + } + void init_row_hash_table_impl() { scoped_timer_t timer("row_hash_init_total"); - size_t n_rows = row_names_sv.size(); - // load factor 50% - row_hash_buckets = next_power_of_2(std::max((size_t)(n_rows * 2), (size_t)64)); - row_hash_mask = row_hash_buckets - 1; + size_t n_rows = row_names_sv.size(); + const int num_threads = phase_thread_count(MPS_ROWS_THREAD_CAP); + const bool use_partitioned = n_rows >= MPS_ROW_HASH_PARTITIONED_MIN_ROWS && num_threads > 1; + std::vector row_hashes; + std::vector row_order; + std::array partition_counts = {}; + std::array partition_offsets = {}; + + if (use_partitioned) { + scoped_timer_t timer("row_hash_partition_metadata"); + row_hashes.resize(n_rows); + size_t inline_rows = 0; + for (size_t idx = 0; idx < n_rows; ++idx) { + std::string_view name = row_names_sv[idx]; + if (__unlikely(name.size() > HASH_KEY_BYTES)) { + row_names_long[name] = idx; + continue; + } + uint32_t 
hash = fnv1a_hash(name.data(), name.size()); + row_hashes[idx] = hash; + ++partition_counts[row_hash_partition_for(hash)]; + ++inline_rows; + } + + for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { + partition_offsets[p + 1] = partition_offsets[p] + partition_counts[p]; + } + + row_order.resize(inline_rows); + auto next_offsets = partition_offsets; + for (size_t idx = 0; idx < n_rows; ++idx) { + if (__unlikely(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; } + size_t part = row_hash_partition_for(row_hashes[idx]); + row_order[next_offsets[part]++] = idx; + } + } + + if (use_partitioned) { + row_hash_partition_count = MPS_ROW_HASH_PARTITIONS; + size_t total_buckets = 0; + for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { + row_hash_partitions[p].buckets = row_hash_bucket_count_for(partition_counts[p]); + row_hash_partitions[p].mask = row_hash_partitions[p].buckets - 1; + total_buckets += row_hash_partitions[p].buckets; + } + row_hash_buckets = total_buckets; + row_hash_mask = row_hash_buckets - 1; + } else { + row_hash_partition_count = 0; + row_hash_buckets = row_hash_bucket_count_for(n_rows); + row_hash_mask = row_hash_buckets - 1; + } size_t row_hash_mmap_size = row_hash_buckets * sizeof(hash_slot_var_t); { @@ -438,6 +523,13 @@ struct parse_state_t { row_hash_region = mmap_region_t::anonymous( row_hash_mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, "row hash table"); row_names_ht = static_cast(row_hash_region.data()); + if (use_partitioned) { + hash_slot_var_t* next_slots = row_names_ht; + for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { + row_hash_partitions[p].slots = next_slots; + next_slots += row_hash_partitions[p].buckets; + } + } // Request huge pages to reduce TLB misses row_hash_region.advise(MADV_HUGEPAGE); } @@ -453,9 +545,86 @@ struct parse_state_t { { scoped_timer_t timer("row_hash_insert_all"); - for (size_t idx = 0; idx < n_rows; ++idx) { - row_insert(row_names_sv[idx], idx); +#ifdef MPS_FAST_PERF_COUNTERS + size_t 
total_probes = 0; + size_t max_probes = 0; + size_t long_names = row_names_long.size(); +#endif + if (use_partitioned) { + scoped_timer_t timer("row_hash_insert_partitioned"); +#ifdef MPS_FAST_PERF_COUNTERS + std::vector perf_snapshots(MPS_ROW_HASH_PARTITIONS); + std::vector partition_total_probes(MPS_ROW_HASH_PARTITIONS, 0); + std::vector partition_max_probes(MPS_ROW_HASH_PARTITIONS, 0); +#endif +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) { + size_t p = (size_t)part_id; +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; + size_t local_total_probes = 0; + size_t local_max_probes = 0; +#endif + const auto& part = row_hash_partitions[p]; + for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) { + size_t idx = row_order[pos]; +#ifdef MPS_FAST_PERF_COUNTERS + size_t probes = row_insert_into( + part.slots, part.buckets, part.mask, row_names_sv[idx], row_hashes[idx], idx); + local_total_probes += probes; + local_max_probes = std::max(local_max_probes, probes); +#else + row_insert_into( + part.slots, part.buckets, part.mask, row_names_sv[idx], row_hashes[idx], idx); +#endif + } +#ifdef MPS_FAST_PERF_COUNTERS + partition_total_probes[p] = local_total_probes; + partition_max_probes[p] = local_max_probes; + perf_snapshots[p] = perf_counters.stop(); +#endif + } +#ifdef MPS_FAST_PERF_COUNTERS + for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { + total_probes += partition_total_probes[p]; + max_probes = std::max(max_probes, partition_max_probes[p]); + } + print_perf_totals("row_hash_insert_partitioned", perf_snapshots); +#endif + } else { +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; +#endif + for (size_t idx = 0; idx < n_rows; ++idx) { +#ifdef MPS_FAST_PERF_COUNTERS + size_t probes = row_insert(row_names_sv[idx], idx); + if (probes == 0) { + ++long_names; + } else { + total_probes += probes; + max_probes = 
std::max(max_probes, probes); + } +#else + row_insert(row_names_sv[idx], idx); +#endif + } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("row_hash_insert_all", {perf_counters.stop()}); +#endif } +#ifdef MPS_FAST_PERF_COUNTERS + size_t probed_rows = n_rows - long_names; + double mean_probes = probed_rows == 0 ? 0.0 : (double)total_probes / (double)probed_rows; + double load_factor = row_hash_buckets == 0 ? 0.0 : (double)n_rows / (double)row_hash_buckets; + std::fprintf(stderr, + "[ROW_HASH_PROBES] rows=%zu buckets=%zu load=%.3f long=%zu mean=%.3f max=%zu\n", + n_rows, + row_hash_buckets, + load_factor, + long_names, + mean_probes, + max_probes); +#endif } // Force the kernel to please please collapse the page range into THP pages @@ -546,13 +715,21 @@ struct parse_state_t { auto it = row_names_long.find(name); return it != row_names_long.end() ? it->second : SIZE_MAX; } - hash_key_t key = make_key(name.data(), name.size()); - uint32_t hash = fnv1a_hash(name.data(), name.size()) & (uint32_t)row_hash_mask; - const hash_slot_var_t* slots = row_names_ht; - const hash_slot_var_t* slot = &slots[hash]; + hash_key_t key = make_key(name.data(), name.size()); + uint32_t hash = fnv1a_hash(name.data(), name.size()); + if (__likely(row_hash_partition_count != 0)) { + const auto& part = row_hash_partitions[row_hash_partition_for(hash)]; + return row_lookup_in(part.slots, part.buckets, part.mask, key, hash); + } + return row_lookup_in(row_names_ht, row_hash_buckets, row_hash_mask, key, hash); + } - for (size_t i = 0; i < row_hash_buckets; ++i, ++slot) { - if (slot >= &slots[row_hash_buckets]) { slot = &slots[0]; } + size_t row_lookup_in( + const hash_slot_var_t* slots, size_t buckets, size_t mask, hash_key_t key, uint32_t hash) const + { + const hash_slot_var_t* slot = &slots[hash & (uint32_t)mask]; + for (size_t i = 0; i < buckets; ++i, ++slot) { + if (slot >= &slots[buckets]) { slot = &slots[0]; } if (slot->count == 0) { return SIZE_MAX; } if (key_cmpeq(slot->key, key)) { 
return slot->count - 1; } } @@ -593,27 +770,39 @@ struct parse_state_t { std::memcpy(suffix, digits_buf, digits_len); } - void row_insert(std::string_view name, size_t index) + size_t row_insert(std::string_view name, size_t index) { if (__unlikely(name.size() > HASH_KEY_BYTES)) { row_names_long[name] = index; - return; - } - hash_key_t key = make_key(name.data(), name.size()); - uint32_t hash = fnv1a_hash(name.data(), name.size()) & (uint32_t)row_hash_mask; - hash_slot_var_t* slots = row_names_ht; - hash_slot_var_t* slot = &slots[hash]; - - for (size_t i = 0; i < row_hash_buckets; ++i, ++slot) { - if (slot >= &slots[row_hash_buckets]) { slot = &slots[0]; } + return 0; + } + return row_insert_into(row_names_ht, + row_hash_buckets, + row_hash_mask, + name, + fnv1a_hash(name.data(), name.size()), + index); + } + + size_t row_insert_into(hash_slot_var_t* slots, + size_t buckets, + size_t mask, + std::string_view name, + uint32_t hash, + size_t index) + { + hash_key_t key = make_key(name.data(), name.size()); + hash_slot_var_t* slot = &slots[hash & (uint32_t)mask]; + for (size_t i = 0; i < buckets; ++i, ++slot) { + if (slot >= &slots[buckets]) { slot = &slots[0]; } if (slot->count == 0) { key_store(slot->key, key); // Writes 32 bytes, including garbage in last 4 slot->count = (uint32_t)(index + 1); // Overwrite last 4 bytes with actual count - return; + return i + 1; } if (key_cmpeq(slot->key, key)) { slot->count = (uint32_t)(index + 1); - return; + return i + 1; } } __builtin_trap(); @@ -624,16 +813,31 @@ struct parse_state_t { // Section parsers // ============================================================================= +static std::string_view read_rest_of_line_trimmed(cursor_t& cursor) +{ + const char* begin = cursor.ptr; + const char* end = begin; + while (end < cursor.end && *end != '\n' && *end != '\r') { + ++end; + } + + while (begin < end && (*begin == ' ' || *begin == '\t')) { + ++begin; + } + while (end > begin && (end[-1] == ' ' || end[-1] == '\t')) { 
+ --end; + } + cursor.ptr = end; + return std::string_view(begin, (size_t)(end - begin)); +} + template static void parse_name_section(parse_state_t& state) { scoped_timer_t timer("parse_name"); if (peek(state.cursor) == "ROWS") { return; } expect(state.cursor, "NAME"); - if (!state.cursor.eol()) { - state.problem_name_sv = state.cursor.read_field(); - accept_comment(state.cursor); - } + if (!state.cursor.eol()) { state.problem_name_sv = read_rest_of_line_trimmed(state.cursor); } expect_eol(state.cursor); } @@ -643,12 +847,13 @@ static void parse_objsense_section(parse_state_t& state) scoped_timer_t timer("parse_objsense"); if (accept(state.cursor, "OBJSENSE")) { if (state.cursor.eol()) { expect_eol(state.cursor); } - if (accept(state.cursor, "MIN")) { + auto sense = state.cursor.read_field(); + if (sense == "MIN" || sense == "MINIMIZE") { state.problem.maximize_ = false; - } else if (accept(state.cursor, "MAX")) { + } else if (sense == "MAX" || sense == "MAXIMIZE") { state.problem.maximize_ = true; } else { - state.cursor.error("expected MIN or MAX, got '%s'", state.cursor.read_field().data()); + state.cursor.error("expected MIN/MAX or MINIMIZE/MAXIMIZE, got '%s'", sense.data()); } accept_comment(state.cursor); expect_eol(state.cursor); @@ -693,8 +898,7 @@ static bool parse_rows_line_fast(const char*& p, char& row_type, std::string_view& row_name) { - while (p < end && *p <= ' ' && *p != '\n') - p++; + p = cursor_t::simd_scan(p, end); if (p >= end) { return false; } if (*p == '\n') { p++; @@ -706,12 +910,10 @@ static bool parse_rows_line_fast(const char*& p, } row_type = *p++; - while (p < end && *p <= ' ' && *p != '\n') - p++; + p = cursor_t::simd_scan(p, end); const char* name_start = p; - while (p < end && *p > ' ') - p++; + p = cursor_t::simd_scan(p, end); if (name_start == p) { return false; } row_name = std::string_view(name_start, (size_t)(p - name_start)); @@ -1135,20 +1337,6 @@ static const char* find_next_line(const char* p, const char* end) return p; } 
-static const char* find_bounds_body_end(const char* bounds_body_start, const char* parse_end) -{ - const char* p = bounds_body_start; - while (p < parse_end) { - if ((*p == 'E' && parse_end - p >= 6 && std::memcmp(p, "ENDATA", 6) == 0 && p[6] <= ' ') || - (*p == 'Q' && is_quadratic_section_start(p, parse_end)) || - (*p == 'R' && parse_end - p >= 6 && std::memcmp(p, "RANGES", 6) == 0 && p[6] <= ' ')) { - return p; - } - p = find_next_line(p, parse_end); - } - return parse_end; -} - static std::vector compute_line_chunk_boundaries(const char* section_start, const char* section_end, int num_threads) @@ -1306,7 +1494,8 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, sign = -1.0; cursor.advance(1); } - if (cursor.ptr + 1 < cursor.end && is_digit_byte(cursor.ptr[0]) && cursor.ptr[1] == '\n') { + if (cursor.ptr + 1 < cursor.end && is_digit_byte(cursor.ptr[0]) && + (cursor.ptr[1] == '\n' || cursor.ptr[1] == '\r')) { value = sign * (cursor.ptr[0] - '0'); cursor.advance(1); } else { @@ -1720,7 +1909,8 @@ static void parse_rhs_section(parse_state_t& state, cursor_t& cursor) scoped_timer_t timer("parse_rhs"); expect_section(cursor, "RHS"); - auto field_from_start = [](const char* start, const char* end) { + // necessary on the cold path since we directly read and lookup on the hot path + auto reread_field_name = [](const char* start, const char* end) { const char* p = start; while (p < end && *p > ' ') { p++; @@ -1729,20 +1919,24 @@ static void parse_rhs_section(parse_state_t& state, cursor_t& cursor) }; auto apply_rhs = [&](const char* row_start, size_t row_idx, f_t value) { + // This is a regular non-obj row. if (row_idx != SIZE_MAX) { state.problem.b_[row_idx] = value; return; } - std::string_view row_name = field_from_start(row_start, cursor.end); + // This is the objective row. 
+ std::string_view row_name = reread_field_name(row_start, cursor.end); if (row_name == state.objective_name_sv) { state.problem.objective_offset_ = -value; return; } + // Other objectives, ignored currently. cold path if (state.is_ignored_objective_name(row_name)) { return; } + // Unexpected! error_unknown_row(cursor, row_start, "RHS"); }; - while (cursor.ptr < cursor.end && !is_rhs_section_end(cursor.ptr, cursor.end)) { + while (cursor.ptr < cursor.end) { auto rhs_name = cursor.read_field(); (void)rhs_name; if (accept_comment(cursor)) { @@ -1755,6 +1949,7 @@ static void parse_rhs_section(parse_state_t& state, cursor_t& cursor) apply_rhs(row_start, row_idx, (f_t)value); accept_comment(cursor); + // Optional second entry if (!cursor.eol()) { const char* row_start2 = cursor.ptr; size_t row_idx2 = state.read_row_lookup(cursor); @@ -1773,13 +1968,16 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, const char* bounds_body_end, size_t n_vars) { - const size_t bounds_bytes = (size_t)(bounds_body_end - bounds_body_start); - const int num_threads = phase_thread_count(MPS_BOUNDS_THREAD_CAP); - if (!state.col_dense_ordered || bounds_bytes < MPS_BOUNDS_PARALLEL_MIN_BYTES || num_threads < 2) { - return false; - } + const size_t bounds_bytes = (size_t)(bounds_body_end - bounds_body_start); + const int num_threads = phase_thread_count(MPS_BOUNDS_THREAD_CAP); + const bool use_dense_lookup = state.col_dense_ordered; + const size_t min_parallel_bytes = + use_dense_lookup ? MPS_BOUNDS_PARALLEL_MIN_BYTES : MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES; + if (bounds_bytes < min_parallel_bytes || num_threads < 2) { return false; } - MPS_NVTX_RANGE("parse_bounds_parallel_dense", nvtx::colors::bounds); + MPS_NVTX_RANGE( + use_dense_lookup ? 
"parse_bounds_parallel_dense" : "parse_bounds_parallel_ordered_hint", + nvtx::colors::bounds); struct BoundsParallelStats { size_t lines = 0; @@ -1805,7 +2003,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, } { - scoped_timer_t timer("parse_bounds_parallel_dense"); + scoped_timer_t timer(use_dense_lookup ? "parse_bounds_parallel_dense" + : "parse_bounds_parallel_ordered_hint"); // Duplicate or non-monotone BOUNDS updates are file-order dependent. Parse // optimistically, then accept only if chunk summaries prove strict order. #pragma omp parallel for schedule(static) num_threads(num_threads) @@ -1815,6 +2014,27 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, (size_t)(boundaries[(size_t)t].end - boundaries[(size_t)t].start)); cursor.skip_ws(); size_t prev_var = SIZE_MAX; + size_t hint_idx = 0; + auto lookup_var = [&](std::string_view var_name) { + if (use_dense_lookup) { return state.col_lookup_dense_ordered(var_name); } + if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) { + return hint_idx + 1; + } + if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) { return hint_idx; } + + size_t search_start = hint_idx + 2; + size_t search_end = n_vars; + search_loop: + for (size_t i = search_start; i < search_end; ++i) { + if (state.var_names_sv[i] == var_name) { return i; } + } + if (search_start != 0) { + search_end = hint_idx; + search_start = 0; + goto search_loop; + } + return SIZE_MAX; + }; try { while (cursor.ptr < cursor.end) { if (__unlikely(*cursor.ptr == '$')) { @@ -1843,17 +2063,12 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, continue; } - size_t var_idx = state.col_lookup_dense_ordered(var_name); + size_t var_idx = lookup_var(var_name); if (__unlikely(var_idx == SIZE_MAX)) { local.dense_misses++; - std::snprintf(local.error_msg, - sizeof(local.error_msg), - "unknown variable name in BOUNDS: %.*s", - (int)var_name.size(), - var_name.data()); - 
local.error_ptr = cursor.ptr; break; } + hint_idx = var_idx; local.dense_hits++; local.lines++; local.min_var = std::min(local.min_var, var_idx); @@ -1864,10 +2079,12 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, bool first_bound_for_var = bound_seen[var_idx] == 0; bound_seen[var_idx] = 1; - f_t value = 0; + f_t value = 0; + bool has_value = false; accept_comment(cursor); if (!cursor.eol()) { - value = (f_t)expect_number_fast_pm_one(cursor); + value = (f_t)expect_number_fast_pm_one(cursor); + has_value = true; accept_comment(cursor); } @@ -1906,6 +2123,15 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, } state.problem.var_types_[var_idx] = 'I'; local.saw_integer_type = true; + } else if (bound_type == "SC") { + if (__unlikely(!has_value)) { + std::snprintf( + local.error_msg, sizeof(local.error_msg), "SC bound requires an upper bound value"); + local.error_ptr = cursor.ptr; + break; + } + state.problem.variable_upper_bounds_[var_idx] = value; + state.problem.var_types_[var_idx] = 'S'; } else { std::snprintf(local.error_msg, sizeof(local.error_msg), @@ -1946,6 +2172,12 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, const bool order_safe = dense_misses == 0 && non_strict_order == 0 && overlap_chunks == 0; if (!order_safe) { + std::fprintf(stderr, + "[WARN] parallel BOUNDS fallback to serial: lookup_misses=%zu " + "non_strict_order=%zu overlap_chunks=%zu\n", + dense_misses, + non_strict_order, + overlap_chunks); cursor.ptr = bounds_body_start; return false; } @@ -2028,8 +2260,7 @@ static void parse_bounds_section(parse_state_t& state, } const char* bounds_body_start = cursor.ptr; - const char* bounds_body_end = - allow_parallel_dense ? 
find_bounds_body_end(bounds_body_start, cursor.end) : cursor.end; + const char* bounds_body_end = cursor.end; if (allow_parallel_dense) { if (parse_bounds_section_parallel_dense( state, cursor, bounds_body_start, bounds_body_end, n_vars)) { @@ -2049,11 +2280,7 @@ static void parse_bounds_section(parse_state_t& state, size_t hint_idx = 0; { scoped_timer_t timer("parse_bounds"); - for (;;) { - bool done = cursor.done() || peek(cursor) == "RANGES" || peek(cursor) == "ENDATA" || - is_quadratic_section_start(cursor.ptr, cursor.end); - if (done) break; - + while (!cursor.done()) { auto bound_type = cursor.read_field(); auto bound_name = cursor.read_field(); (void)bound_name; @@ -2065,13 +2292,11 @@ static void parse_bounds_section(parse_state_t& state, } // optimized lookup using hint (bounds often in same order as columns) - size_t var_idx = SIZE_MAX; + size_t var_idx = SIZE_MAX; + typename parse_state_t::bounds_only_var_t* aux_var = nullptr; if (__likely(state.col_dense_ordered)) { var_idx = state.col_lookup_dense_ordered(var_name); - if (var_idx == SIZE_MAX) { - cursor.error( - "unknown variable name in BOUNDS: %.*s", (int)var_name.size(), var_name.data()); - } + if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; } } else if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) { var_idx = hint_idx + 1; } else if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) { @@ -2092,60 +2317,88 @@ static void parse_bounds_section(parse_state_t& state, search_start = 0; goto search_loop; } - cursor.error( - "unknown variable name in BOUNDS: %.*s", (int)var_name.size(), var_name.data()); + aux_var = &state.bounds_only_vars[var_name]; } found: - hint_idx = var_idx; - bool first_bound_for_var = !has_bound(var_idx); + if (var_idx != SIZE_MAX) { hint_idx = var_idx; } + bool first_bound_for_var = aux_var == nullptr && !has_bound(var_idx); - f_t value = 0; + f_t value = 0; + bool has_value = false; accept_comment(cursor); if 
(!cursor.eol()) { // bounds are often just set to 0 or 1 if (false && isdigit(cursor.ptr[0]) && cursor.ptr[1] == '\n' && cursor.ptr[2] == ' ') { value = cursor.ptr[0] - '0'; cursor.ptr += 1; + has_value = true; } else { - value = (f_t)expect_number(cursor); + value = (f_t)expect_number(cursor); + has_value = true; } accept_comment(cursor); } + auto set_lb = [&](f_t x) { + if (aux_var) { + aux_var->lb = x; + } else { + state.problem.variable_lower_bounds_[var_idx] = x; + } + }; + auto set_ub = [&](f_t x) { + if (aux_var) { + aux_var->ub = x; + } else { + state.problem.variable_upper_bounds_[var_idx] = x; + } + }; + auto set_type = [&](char t) { + if (aux_var) { + aux_var->type = t; + } else { + state.problem.var_types_[var_idx] = t; + } + }; + if (bound_type == "LO") { - state.problem.variable_lower_bounds_[var_idx] = value; + set_lb(value); } else if (bound_type == "UP") { - state.problem.variable_upper_bounds_[var_idx] = value; + set_ub(value); if (first_bound_for_var && value < f_t{0}) { - state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + set_lb(-std::numeric_limits::infinity()); } } else if (bound_type == "FX") { - state.problem.variable_lower_bounds_[var_idx] = value; - state.problem.variable_upper_bounds_[var_idx] = value; + set_lb(value); + set_ub(value); } else if (bound_type == "FR") { - state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); - state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits::infinity(); + set_lb(-std::numeric_limits::infinity()); + set_ub(std::numeric_limits::infinity()); } else if (bound_type == "MI") { - state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + set_lb(-std::numeric_limits::infinity()); } else if (bound_type == "PL") { - state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits::infinity(); + set_ub(std::numeric_limits::infinity()); } else if (bound_type == "BV") { - 
state.problem.variable_lower_bounds_[var_idx] = 0; - state.problem.variable_upper_bounds_[var_idx] = 1; - state.problem.var_types_[var_idx] = 'I'; + set_lb(0); + set_ub(1); + set_type('I'); } else if (bound_type == "LI") { - state.problem.variable_lower_bounds_[var_idx] = value; - state.problem.var_types_[var_idx] = 'I'; + set_lb(value); + set_type('I'); } else if (bound_type == "UI") { - state.problem.variable_upper_bounds_[var_idx] = value; + set_ub(value); if (first_bound_for_var && value < f_t{0}) { - state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); + set_lb(-std::numeric_limits::infinity()); } - state.problem.var_types_[var_idx] = 'I'; + set_type('I'); + } else if (bound_type == "SC") { + if (__unlikely(!has_value)) { cursor.error("SC bound requires an upper bound value"); } + set_ub(value); + set_type('S'); } else { cursor.error("unknown bound type: %.*s", (int)bound_type.size(), bound_type.data()); } - mark_bound(var_idx); + if (aux_var == nullptr) { mark_bound(var_idx); } expect_eol(cursor); } @@ -2204,8 +2457,7 @@ static void parse_ranges_section(parse_state_t& state, cursor_t& curso } }; - while (cursor.ptr < cursor.end && peek(cursor) != "BOUNDS" && peek(cursor) != "ENDATA" && - !is_quadratic_section_start(cursor.ptr, cursor.end)) { + while (cursor.ptr < cursor.end) { auto range_name = cursor.read_field(); (void)range_name; if (accept_comment(cursor)) { @@ -2307,12 +2559,10 @@ static void build_quadratic_csr(parse_state_t& state, } template -[[maybe_unused]] static void parse_quadratic_sections(parse_state_t& state, - cursor_t& cursor) +static void parse_quadratic_sections(parse_state_t& state, cursor_t& cursor) { scoped_timer_t timer("parse_quadratic_sections"); - if (cursor.done() || peek(cursor) == "ENDATA") { return; } - if (!is_quadratic_section_start(cursor.ptr, cursor.end)) { return; } + if (cursor.done()) { return; } build_var_name_map_if_needed(state); std::vector> quadobj_entries; @@ -2332,7 +2582,6 @@ template 
}; while (cursor.ptr < cursor.end) { - if (peek(cursor) == "ENDATA") { break; } if (accept_section(cursor, "QUADOBJ")) { active_entries = &quadobj_entries; continue; @@ -2341,6 +2590,9 @@ template active_entries = &qmatrix_entries; continue; } + if (accept_section(cursor, "QCMATRIX")) { + cursor.error("QCMATRIX sections are not supported by the experimental fast MPS parser"); + } if (active_entries == nullptr) { break; } auto var1 = cursor.read_field(); @@ -2442,24 +2694,11 @@ static void parse_ranges_range(parse_state_t& state, template static void parse_quadratic_range(parse_state_t& state, mps_phase_range_t range, - const char* fallback_ptr) + const char*) { - (void)state; - if (range.present) { - cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); - if (!cursor.done() && is_quadratic_section_start(cursor.ptr, cursor.end)) { - throw std::logic_error( - "experimental fast MPS reader currently supports LP/MIP MPS files only; " - "quadratic MPS sections are not supported"); - } - } else { - cursor_t cursor(fallback_ptr, 16); - if (!cursor.done() && is_quadratic_section_start(cursor.ptr, cursor.end)) { - throw std::logic_error( - "experimental fast MPS reader currently supports LP/MIP MPS files only; " - "quadratic MPS sections are not supported"); - } - } + if (!range.present) { return; } + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_quadratic_sections(state, cursor); } template @@ -2519,6 +2758,23 @@ static void materialize_problem_names(parse_state_t& state) } } +template +static void append_bounds_only_variables(parse_state_t& state) +{ + if (state.bounds_only_vars.empty()) { return; } + scoped_timer_t timer("append_bounds_only_variables"); + + // BOUNDS-only variables have no matrix entries; append after COLUMNS vars. 
+ for (const auto& [name, aux] : state.bounds_only_vars) { + state.problem.var_names_.emplace_back(name); + state.problem.var_types_.push_back(aux.type); + state.problem.c_.push_back(f_t{0}); + state.problem.variable_lower_bounds_.push_back(aux.lb); + state.problem.variable_upper_bounds_.push_back(aux.ub); + } + state.problem.n_vars_ = (i_t)state.problem.var_names_.size(); +} + template static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_stream( Stream& stream, const char* total_timer_name, const char* producer_task_name) @@ -2591,7 +2847,10 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ int header_done = 0, rows_done = 0, columns_done = 0; int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0; -#pragma omp parallel num_threads(std::min(32, omp_get_max_threads())) + const std::size_t parser_size = std::max(stream.reserve_size_hint(), input.compressed_size); + const int parser_threads = parser_thread_cap_for_size(parser_size); + +#pragma omp parallel num_threads(parser_threads) { std::string thread_name = "omp-parser-" + std::to_string(omp_get_thread_num()); nvtx::name_current_thread(thread_name.c_str()); @@ -2724,6 +2983,8 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ if (first_task_error) { std::rethrow_exception(first_task_error); } + append_bounds_only_variables(state); + input.size = stream.size(); cursor.ptr = input.registry->range(mps_phase_kind::quadratic).present ? 
input.registry->range(mps_phase_kind::quadratic).end @@ -2740,6 +3001,102 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ return problem; } +struct small_raw_read_t { + bool use_small_path = false; + std::vector buffer; +}; + +static small_raw_read_t try_read_small_raw_file(const std::string& path) +{ + FILE* file = std::fopen(path.c_str(), "rb"); + if (file == nullptr) { + throw std::runtime_error("Failed to open raw MPS file '" + path + "': " + std::strerror(errno)); + } + std::unique_ptr file_guard(file, &std::fclose); + + if (std::fseek(file, 0, SEEK_END) != 0) { + throw std::runtime_error("Failed to seek raw MPS file '" + path + "'"); + } + long file_size_long = std::ftell(file); + if (file_size_long < 0) { + throw std::runtime_error("Failed to determine raw MPS file size '" + path + "'"); + } + std::size_t file_size = static_cast(file_size_long); + if (file_size > MPS_SMALL_RAW_FILE_BYTES) { return {}; } + if (std::fseek(file, 0, SEEK_SET) != 0) { + throw std::runtime_error("Failed to rewind raw MPS file '" + path + "'"); + } + + std::vector buffer(file_size); + if (file_size != 0 && std::fread(buffer.data(), 1, file_size, file) != file_size) { + throw std::runtime_error("Failed to read raw MPS file '" + path + "'"); + } + return {true, std::move(buffer)}; +} + +template +static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_small_raw_file( + std::vector buffer) +{ + auto total_timer = std::make_unique("parse_mps_fast_file_raw_small (total)"); + const char* data = buffer.data(); + const char* end = data + buffer.size(); + + mps_phase_registry_t registry; + mps_section_block_scanner_t scanner(data, 1, registry); + scanner.observe_block(0, data, end); + scanner.publish_ready(buffer.size()); + + cuopt::linear_programming::io::mps_data_model_t problem; + problem.n_vars_ = 0; + problem.n_constraints_ = 0; + problem.nnz_ = 0; + problem.maximize_ = false; + problem.objective_scaling_factor_ = f_t{1}; + 
problem.objective_offset_ = f_t{0}; + + std::size_t reserve_size = std::max(buffer.size(), 1024 * 1024); + std::size_t reserve_dim = std::max((size_t)1000, reserve_size / 1000); + problem.A_offsets_.reserve(reserve_dim); + problem.b_.reserve(reserve_dim); + problem.variable_lower_bounds_.reserve(reserve_dim); + problem.variable_upper_bounds_.reserve(reserve_dim); + problem.var_types_.reserve(reserve_dim); + problem.row_types_.reserve(reserve_dim); + problem.row_names_.reserve(reserve_dim); + problem.var_names_.reserve(reserve_dim); + problem.constraint_lower_bounds_.reserve(reserve_dim); + problem.constraint_upper_bounds_.reserve(reserve_dim); + + cursor_t cursor(data, buffer.size()); + parse_state_t state(problem, cursor); + state.row_names_sv.reserve(reserve_dim); + + parse_header_range(state, registry.range(mps_phase_kind::header)); + parse_rows_range(state, registry.range(mps_phase_kind::rows)); + parse_columns_range(state, registry.range(mps_phase_kind::columns), 1); + materialize_problem_names(state); + parse_rhs_range(state, registry.range(mps_phase_kind::rhs)); + parse_ranges_range(state, registry.range(mps_phase_kind::ranges), data); + parse_bounds_range(state, registry.range(mps_phase_kind::bounds), data); + parse_quadratic_range(state, registry.range(mps_phase_kind::quadratic), data); + append_bounds_only_variables(state); + + cursor.ptr = registry.range(mps_phase_kind::quadratic).present + ? registry.range(mps_phase_kind::quadratic).end + : (registry.range(mps_phase_kind::bounds).present + ? registry.range(mps_phase_kind::bounds).end + : (registry.range(mps_phase_kind::ranges).present + ? 
registry.range(mps_phase_kind::ranges).end + : registry.range(mps_phase_kind::rhs).end)); + cursor.end = end; + if (!cursor.done()) { expect(cursor, "ENDATA"); } + + total_timer.reset(); + flush_timers(); + return problem; +} + template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( const std::string& path, FileReadMethod read_method) @@ -2751,11 +3108,15 @@ cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode"); } if (effective_method == FileReadMethod::Read) { + small_raw_read_t small_raw = try_read_small_raw_file(path); + if (small_raw.use_small_path) { + return parse_mps_fast_small_raw_file(std::move(small_raw.buffer)); + } RawInputStream stream(path); return parse_mps_fast_stream( stream, "parse_mps_fast_file_raw (total)", "task_raw_read"); } - throw std::runtime_error("experimental fast MPS reader supports raw and LZ4 inputs only"); + throw std::runtime_error("single-path parser supports raw read and LZ4 inputs only"); } template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( diff --git a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp index 9e5777efc2..49a7602739 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp @@ -9,6 +9,8 @@ #include "fast_parser.hpp" +#include + namespace cuopt::linear_programming::io { template @@ -19,5 +21,9 @@ mps_data_model_t read_mps_fast_experimental(const std::string& mps_fil template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); +template mps_data_model_t read_mps_fast_experimental( + const std::string& mps_file_path); +template mps_data_model_t read_mps_fast_experimental( + const std::string& mps_file_path); } // namespace 
cuopt::linear_programming::io diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index 819b1948bf..97ef5c5cc4 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -162,35 +162,42 @@ void RawInputStream::run_decode_tasks() }; auto read_window = [&](std::size_t index) { + MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io); std::size_t offset = index * window_bytes_; std::size_t size = std::min(window_bytes_, file_size_ - offset); std::size_t done = 0; - while (done < size) { - ssize_t got = - ::pread(fd_, output_data_ + offset + done, size - done, static_cast(offset + done)); - if (got < 0) { - if (errno == EINTR) { continue; } - throw std::runtime_error("Failed to pread raw MPS file '" + path_ + - "': " + std::strerror(errno)); - } - if (got == 0) { - throw std::runtime_error("Unexpected EOF while reading raw MPS file '" + path_ + "'"); + { + MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io); + while (done < size) { + ssize_t got = ::pread( + fd_, output_data_ + offset + done, size - done, static_cast(offset + done)); + if (got < 0) { + if (errno == EINTR) { continue; } + throw std::runtime_error("Failed to pread raw MPS file '" + path_ + + "': " + std::strerror(errno)); + } + if (got == 0) { + throw std::runtime_error("Unexpected EOF while reading raw MPS file '" + path_ + "'"); + } + done += static_cast(got); } - done += static_cast(got); } - section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size); - frontier_mutex_.lock(); - block_done_[index] = 1; - block_end_[index] = offset + size; - std::size_t before = ready_bytes_; - while (next_block_ < block_done_.size() && block_done_[next_block_]) { - ready_bytes_ = block_end_[next_block_]; - ++next_block_; + { + MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io); + section_scanner_->observe_block(index, output_data_ + offset, output_data_ + 
offset + size); + frontier_mutex_.lock(); + block_done_[index] = 1; + block_end_[index] = offset + size; + std::size_t before = ready_bytes_; + while (next_block_ < block_done_.size() && block_done_[next_block_]) { + ready_bytes_ = block_end_[next_block_]; + ++next_block_; + } + std::size_t after = ready_bytes_; + frontier_mutex_.unlock(); + if (after > before) { section_scanner_->publish_ready(after); } } - std::size_t after = ready_bytes_; - frontier_mutex_.unlock(); - if (after > before) { section_scanner_->publish_ready(after); } }; std::vector workers; @@ -199,6 +206,7 @@ void RawInputStream::run_decode_tasks() workers.emplace_back([&, t] { std::string thread_name = "raw-input-read-" + std::to_string(t); nvtx::name_current_thread(thread_name.c_str()); + MPS_NVTX_RANGE("raw_worker_loop", nvtx::colors::io); while (!stop.load(std::memory_order_acquire)) { std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed); if (index >= window_count_) { break; } diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index fbe18768af..36c42ba79a 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -9,6 +9,7 @@ #include #endif +#ifndef _WIN32 #include #include #include @@ -16,6 +17,7 @@ #include #include #include +#endif #include #include @@ -44,9 +46,9 @@ constexpr std::size_t lz4_pipeline_batch_bytes = 64ull * 1024ull * 1024 constexpr std::size_t lz4_input_max_io_threads = 8; constexpr std::size_t lz4_no_content_size_reserve_ratio = 16; -#if defined(MPS_PARSER_WITH_LZ4) using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int); +#if defined(MPS_PARSER_WITH_LZ4) struct lz4_runtime_t { void* handle = nullptr; LZ4_decompress_safe_t decompress_safe = nullptr; @@ -54,28 +56,28 @@ struct lz4_runtime_t { lz4_runtime_t() { for (const char* soname : {"liblz4.so.1", "liblz4.so"}) { - handle = dlopen(soname, RTLD_LAZY); + 
handle = ::dlopen(soname, RTLD_LAZY); if (handle != nullptr) { break; } } if (handle == nullptr) { throw std::logic_error( "Could not open .mps.lz4 file since liblz4 was not found " - "(tried liblz4.so.1, liblz4.so). In order to open .mps.lz4 files " - "directly, please ensure liblz4 is installed. Alternatively, decompress " - "the .lz4 file manually and open the uncompressed .mps file."); + "(tried liblz4.so.1, liblz4.so). Decompress the .lz4 file manually " + "or install liblz4."); } - decompress_safe = reinterpret_cast(dlsym(handle, "LZ4_decompress_safe")); + decompress_safe = + reinterpret_cast(::dlsym(handle, "LZ4_decompress_safe")); if (decompress_safe == nullptr) { throw std::logic_error( - "Error loading liblz4! Library version might be incompatible. Please decompress " - "the .lz4 file manually and open the uncompressed .mps file."); + "Error loading LZ4_decompress_safe from liblz4. Decompress the .lz4 file manually " + "or install a compatible liblz4."); } } ~lz4_runtime_t() { - if (handle != nullptr) { dlclose(handle); } + if (handle != nullptr) { ::dlclose(handle); } } lz4_runtime_t(const lz4_runtime_t&) = delete; @@ -124,9 +126,12 @@ int open_lz4_fd(const std::string& path) return fd; } +#ifndef _WIN32 std::size_t system_page_size(); +#endif std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment); +#ifndef _WIN32 class FileDescriptor { public: explicit FileDescriptor(int fd) : fd_(fd) {} @@ -145,6 +150,8 @@ class FileDescriptor { int fd_; }; +#endif + uint32_t read_le32(const char* ptr) { const auto* p = reinterpret_cast(ptr); @@ -181,6 +188,7 @@ std::size_t checked_size(uint64_t value, const char* label) return static_cast(value); } +#ifndef _WIN32 std::size_t get_file_size(int fd, const std::string& path) { struct stat st; @@ -191,6 +199,9 @@ std::size_t get_file_size(int fd, const std::string& path) return static_cast(st.st_size); } +#endif + +#ifndef _WIN32 std::size_t system_page_size() { static std::size_t page_size = [] { @@ 
-199,6 +210,7 @@ std::size_t system_page_size() }(); return page_size; } +#endif std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) { @@ -212,6 +224,7 @@ std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) return value + increment; } +#ifndef _WIN32 std::size_t checked_mul(std::size_t a, std::size_t b, const char* label) { if (a != 0 && b > std::numeric_limits::max() / a) { @@ -320,12 +333,14 @@ class lz4_resident_windows_t { std::vector& windows_; }; +#endif } // namespace Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path) { MPS_NVTX_RANGE("lz4_input_construct", nvtx::colors::io); + ensure_lz4_runtime_available(); fd_ = open_lz4_fd(path); diff --git a/cpp/src/io/experimental_mps_fast/perf_counters.hpp b/cpp/src/io/experimental_mps_fast/perf_counters.hpp new file mode 100644 index 0000000000..147a7ae7bb --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/perf_counters.hpp @@ -0,0 +1,163 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace mps_fast { + +struct perf_counter_spec_t { + const char* name; + uint32_t type; + uint64_t config; +}; + +static constexpr uint64_t perf_cache_config(uint64_t cache, uint64_t op, uint64_t result) +{ + return cache | (op << 8) | (result << 16); +} + +// Small scoped Linux perf_event_open wrapper for coarse phase diagnostics. +// +// Important limitations: +// - Counters are per-thread: construct one instance inside each worker whose +// work should be measured, then aggregate snapshots. +// - These are generic perf events; exact mappings vary by CPU. Some events may +// be unavailable or unhelpful, e.g. store-side DTLB misses on this node. 
+// - This deliberately does not use event groups or time_enabled/time_running +// scaling, so counts are approximate if the kernel multiplexes counters. +static constexpr std::array PERF_COUNTER_SPECS = {{ + {"cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES}, + {"instructions", PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS}, + {"cache_refs", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES}, + {"cache_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES}, + {"branch_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES}, + {"backend_stall_cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND}, + {"dtlb_load_misses", + PERF_TYPE_HW_CACHE, + perf_cache_config( + PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_RESULT_MISS)}, + {"dtlb_store_misses", + PERF_TYPE_HW_CACHE, + perf_cache_config( + PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_WRITE, PERF_COUNT_HW_CACHE_RESULT_MISS)}, +}}; + +struct perf_counter_snapshot_t { + bool active = false; + int open_errno = 0; + std::array values = {}; +}; + +class thread_perf_counters_t { + public: + thread_perf_counters_t() + { + fds_.fill(-1); + for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) { + perf_event_attr attr = {}; + attr.type = PERF_COUNTER_SPECS[i].type; + attr.size = sizeof(attr); + attr.config = PERF_COUNTER_SPECS[i].config; + attr.disabled = 1; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + + int fd = (int)syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); + if (fd < 0) { + if (first_errno_ == 0) { first_errno_ = errno; } + continue; + } + fds_[i] = fd; + active_ = true; + } + + if (active_) { + for (int fd : fds_) { + if (fd >= 0) { + ioctl(fd, PERF_EVENT_IOC_RESET, 0); + ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); + } + } + } + } + + thread_perf_counters_t(const thread_perf_counters_t&) = delete; + thread_perf_counters_t& operator=(const thread_perf_counters_t&) = delete; + + ~thread_perf_counters_t() { close_all(); } + + perf_counter_snapshot_t 
stop() + { + perf_counter_snapshot_t snapshot; + snapshot.active = active_; + snapshot.open_errno = first_errno_; + + for (size_t i = 0; i < fds_.size(); ++i) { + int fd = fds_[i]; + if (fd < 0) continue; + ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + uint64_t value = 0; + if (read(fd, &value, sizeof(value)) == (ssize_t)sizeof(value)) { snapshot.values[i] = value; } + } + close_all(); + active_ = false; + return snapshot; + } + + private: + void close_all() + { + for (int& fd : fds_) { + if (fd >= 0) { + close(fd); + fd = -1; + } + } + } + + bool active_ = false; + int first_errno_ = 0; + std::array fds_; +}; + +static inline void print_perf_totals(const char* label, + const std::vector& snapshots) +{ + std::array totals = {}; + bool any_active = false; + int first_errno = 0; + for (const auto& snapshot : snapshots) { + if (snapshot.open_errno != 0 && first_errno == 0) { first_errno = snapshot.open_errno; } + if (!snapshot.active) continue; + any_active = true; + for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) { + totals[i] += snapshot.values[i]; + } + } + + if (!any_active) { + std::fprintf(stderr, "[PERF] %s unavailable errno=%d\n", label, first_errno); + return; + } + + double ipc = totals[0] == 0 ? 0.0 : (double)totals[1] / (double)totals[0]; + double miss_rate = totals[2] == 0 ? 
0.0 : (double)totals[3] / (double)totals[2]; + std::fprintf(stderr, "[PERF] %s", label); + for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) { + std::fprintf(stderr, " %s=%llu", PERF_COUNTER_SPECS[i].name, totals[i]); + } + std::fprintf(stderr, " ipc=%.3f cache_miss_rate=%.6f\n", ipc, miss_rate); +} + +} // namespace mps_fast From 91742cd0d5a1b01d72f49e5a65adc40ded0b50f5 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 5 Jun 2026 09:39:48 -0700 Subject: [PATCH 04/22] improved iee754 compliant float parsing, warn on nnz > INT_MAX --- cpp/CMakeLists.txt | 2 +- .../fast_fp64_parser.hpp | 400 ++++++++++++++++++ .../fast_parse_primitives.hpp | 116 +---- .../io/experimental_mps_fast/fast_parser.cpp | 111 +++-- .../io/experimental_mps_fast/fast_parser.hpp | 11 +- .../fast_parser_adapter.cpp | 3 + .../io/experimental_mps_fast/file_reader.cpp | 33 +- .../io/experimental_mps_fast/file_reader.hpp | 2 +- .../hash_table_smallstr.hpp | 27 +- .../experimental_mps_fast/lz4_file_reader.cpp | 168 ++++---- .../io/experimental_mps_fast/mmap_region.hpp | 26 +- .../mps_section_scanner.cpp | 17 +- .../mps_section_scanner.hpp | 2 +- .../io/experimental_mps_fast/nvtx_ranges.hpp | 2 +- .../experimental_mps_fast/perf_counters.hpp | 2 +- .../io/experimental_mps_fast/simd_compat.hpp | 2 +- cpp/src/io/utilities/error.hpp | 28 +- 17 files changed, 685 insertions(+), 267 deletions(-) create mode 100644 cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 60227547b4..712a132fc0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -465,7 +465,7 @@ endif () if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$") set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} - APPEND PROPERTY COMPILE_OPTIONS "-mavx2;-maes;-msse4.2") + APPEND PROPERTY COMPILE_OPTIONS "-mbmi2;-mavx2;-msse4.2") endif () # Apply -UNDEBUG only to solver source files (not 
gRPC infrastructure). diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp new file mode 100644 index 0000000000..605c6adc5b --- /dev/null +++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp @@ -0,0 +1,400 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace mps_fast { + +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_fail; + +namespace fp64 { + +#define FASTP64_MIN_EXP_10 (-307) +#define FASTP64_MAX_EXP_10 288 +#define FASTP64_POWER_COUNT (FASTP64_MAX_EXP_10 - FASTP64_MIN_EXP_10 + 1) +#define FASTP64_MANTISSA_MASK ((uint64_t{1} << 52) - 1) + +// Fast FP64 parser optimized for the <=19digits case, based on the Eisel-Lemire algorithm +// see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 +// (8), 2021. 
+ +struct power_10_lut_entry_t { + uint64_t high; + uint64_t low; + int biased_e2; +}; + +struct cuopt_uint256_t { + std::array limb{}; + + constexpr uint32_t mul_u32(uint32_t m) + { + unsigned __int128 carry = 0; + for (uint64_t& v : limb) { + unsigned __int128 x = static_cast(v) * m + carry; + v = static_cast(x); + carry = x >> 64; + } + return static_cast(carry); + } + + constexpr cuopt_uint256_t shl_small(int bits) const + { + cuopt_uint256_t out; + if (bits == 0) return *this; + for (int i = 3; i >= 0; --i) { + uint64_t v = limb[i] << bits; + if (i > 0) v |= limb[i - 1] >> (64 - bits); + out.limb[i] = v; + } + return out; + } +}; + +struct cuopt_normalized_uint256_t { + cuopt_uint256_t sig; + int exp2 = 0; + + static constexpr cuopt_normalized_uint256_t one() + { + cuopt_normalized_uint256_t x; + x.sig.limb[3] = uint64_t{1} << 63; + x.exp2 = -255; + return x; + } + + constexpr void mul10() + { + uint32_t carry = sig.mul_u32(10); + int shift = 32 - std::countl_zero(carry); + cuopt_uint256_t out; + for (int i = 0; i < 4; ++i) { + uint64_t lower = sig.limb[i] >> shift; + uint64_t upper = 0; + if (i + 1 < 4) { + upper = sig.limb[i + 1] << (64 - shift); + } else { + upper = static_cast(carry) << (64 - shift); + } + out.limb[i] = lower | upper; + } + sig = out; + exp2 += shift; + } + + constexpr void div10() + { + constexpr uint64_t div10_shift_4_threshold = 0xA000000000000000ULL; + int shift = sig.limb[3] < div10_shift_4_threshold ? 
4 : 3; + uint64_t extra = sig.limb[3] >> (64 - shift); + cuopt_uint256_t shifted = sig.shl_small(shift); + + cuopt_uint256_t quotient; + unsigned __int128 rem = extra; + for (int i = 3; i >= 0; --i) { + unsigned __int128 cur = (rem << 64) | shifted.limb[i]; + quotient.limb[i] = static_cast(cur / 10); + rem = cur % 10; + } + sig = quotient; + exp2 -= shift; + } +}; + +constexpr power_10_lut_entry_t make_power(const cuopt_normalized_uint256_t& p) +{ + int e2 = p.exp2 + 192; + return {p.sig.limb[3], p.sig.limb[2], 1150 + e2}; +} + +// build time LUT for the lemire trick +constexpr std::array make_power_table() +{ + std::array table{}; + cuopt_normalized_uint256_t p = cuopt_normalized_uint256_t::one(); + table[-FASTP64_MIN_EXP_10] = make_power(p); + + for (int e = 1; e <= FASTP64_MAX_EXP_10; ++e) { + p.mul10(); + table[e - FASTP64_MIN_EXP_10] = make_power(p); + } + + p = cuopt_normalized_uint256_t::one(); + for (int e = -1; e >= FASTP64_MIN_EXP_10; --e) { + p.div10(); + table[e - FASTP64_MIN_EXP_10] = make_power(p); + } + return table; +} + +inline constexpr auto fast_fp64_parse_lut = make_power_table(); + +inline constexpr std::array small_powers = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, + 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; + +inline constexpr std::array small_integer_powers = {1ULL, + 10ULL, + 100ULL, + 1000ULL, + 10000ULL, + 100000ULL, + 1000000ULL, + 10000000ULL, + 100000000ULL, + 1000000000ULL, + 10000000000ULL, + 100000000000ULL, + 1000000000000ULL, + 10000000000000ULL, + 100000000000000ULL, + 1000000000000000ULL}; + +struct ParsedDecimal { + bool negative = false; + bool fast_eligible = false; + uint64_t mantissa = 0; + int exp10 = 0; +}; + +static inline bool is_digit(char c) noexcept { return c >= '0' && c <= '9'; } + +// SWAR 8char run of digits -> integer representation +static inline bool parse_8_digits(const char* p, uint32_t& out) +{ + std::array bytes{}; + std::memcpy(bytes.data(), p, 
bytes.size()); + uint64_t raw = std::bit_cast(bytes); + uint64_t high = raw & 0xF0F0F0F0F0F0F0F0ULL; + uint64_t low_check = (raw + 0x0606060606060606ULL) & 0xF0F0F0F0F0F0F0F0ULL; + if (high != 0x3030303030303030ULL || low_check != 0x3030303030303030ULL) { return false; } + + uint64_t v = raw - 0x3030303030303030ULL; + uint64_t pairs = (v * 10 + (v >> 8)) & 0x00FF00FF00FF00FFULL; + uint64_t quads = (pairs * 100 + (pairs >> 16)) & 0x0000FFFF0000FFFFULL; + out = static_cast((quads * 10000 + (quads >> 32)) & 0xFFFFFFFFULL); + return true; +} + +static inline void scan_digit_run(const char*& p, + const char* end, + bool after_dot, + ParsedDecimal& out, + bool& saw_digit, + int& frac_digits, + int& sig_digits, + bool& too_many_digits) +{ + while (p < end) { + uint32_t chunk = 0; + if (end - p >= 8 && parse_8_digits(p, chunk)) { + saw_digit = true; + if (after_dot) frac_digits += 8; + + if (!too_many_digits) { + if (sig_digits == 0 && chunk == 0) { + p += 8; + continue; + } + + if (sig_digits + 8 <= 19) { + out.mantissa = out.mantissa * 100000000ULL + chunk; + sig_digits += 8; + } else { + too_many_digits = true; + } + } + + p += 8; + continue; + } + + if (!is_digit(*p)) return; + saw_digit = true; + int digit = *p - '0'; + if (after_dot) ++frac_digits; + if (!too_many_digits && (digit != 0 || sig_digits != 0)) { + if (sig_digits < 19) { + out.mantissa = (out.mantissa * 10) + static_cast(digit); + ++sig_digits; + } else { + too_many_digits = true; + } + } + ++p; + } +} + +static inline bool parse_decimal_advance(const char*& p, const char* end, ParsedDecimal& out) +{ + if (p < end && (*p == '-' || *p == '+')) { + out.negative = *p == '-'; + ++p; + } + + bool saw_digit = false; + int frac_digits = 0; + int sig_digits = 0; + bool too_many_digits = false; + + scan_digit_run(p, end, false, out, saw_digit, frac_digits, sig_digits, too_many_digits); + if (p < end && *p == '.') { + ++p; + scan_digit_run(p, end, true, out, saw_digit, frac_digits, sig_digits, too_many_digits); + } 
+ + if (!saw_digit) return false; + + int explicit_exp = 0; + if (p < end && (*p == 'e' || *p == 'E' || *p == 'd' || *p == 'D')) { + const char* exp_start = p; + ++p; + bool exp_negative = false; + if (p < end && (*p == '-' || *p == '+')) { + exp_negative = *p == '-'; + ++p; + } + if (p == end || !is_digit(*p)) { + p = exp_start; + } else { + int exp_value = 0; + while (p < end && is_digit(*p)) { + if (exp_value < 1000000) exp_value = exp_value * 10 + (*p - '0'); + ++p; + } + explicit_exp = exp_negative ? -exp_value : exp_value; + } + } + + out.exp10 = explicit_exp - frac_digits; + out.fast_eligible = !too_many_digits; + return true; +} + +// fallback to stdlib for edge case or ambiguous roundings (very rare) +static inline double fallback_strtod(std::string_view s) +{ + char stack_buf[32]; + if (s.size() >= sizeof(stack_buf)) { + mps_parser_fail(error_type_t::ValidationError, "MPS numeric token exceeds supported length"); + } + std::memcpy(stack_buf, s.data(), s.size()); + stack_buf[s.size()] = '\0'; + for (size_t i = 0; i < s.size(); ++i) { + if (stack_buf[i] == 'd' || stack_buf[i] == 'D') stack_buf[i] = 'e'; + } + + char* parse_end = nullptr; + errno = 0; + return std::strtod(stack_buf, &parse_end); +} + +// see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 +// (8), 2021. 
+static inline bool eisel_lemire(uint64_t man, int exp10, uint64_t& bits) +{ + if (exp10 < FASTP64_MIN_EXP_10 || exp10 > FASTP64_MAX_EXP_10) { return false; } + + const power_10_lut_entry_t p = fast_fp64_parse_lut[exp10 - FASTP64_MIN_EXP_10]; + int lz = std::countl_zero(man); + uint64_t norm = man << lz; + int adj_e2 = p.biased_e2 - lz; + + unsigned __int128 product = static_cast(norm) * p.high; + uint64_t hi = static_cast(product >> 64); + uint64_t lo = static_cast(product); + + if ((hi & 0x1FF) == 0x1FF && lo + norm < norm) { + unsigned __int128 low_product = static_cast(norm) * p.low; + uint64_t low_hi = static_cast(low_product >> 64); + uint64_t low_lo = static_cast(low_product); + uint64_t old_lo = lo; + lo += low_hi; + hi += lo < old_lo ? 1 : 0; + if ((hi & 0x1FF) == 0x1FF && lo == std::numeric_limits::max() && + low_lo + norm < low_lo) { + return false; + } + } + + uint64_t hi_msb = hi >> 63; + uint64_t x54 = hi >> (9 + hi_msb); + adj_e2 -= static_cast(1 - hi_msb); + + // half-way ambiguity, fallback + if (lo == 0 && (hi & 0x1FF) == 0 && (x54 & 3) == 1) { return false; } + + // exponent overflow, fallback + uint64_t x53 = (x54 + (x54 & 1)) >> 1; + uint64_t overflow = x53 >> 53; + uint64_t ret_man = (x53 >> overflow) & FASTP64_MANTISSA_MASK; + int ret_exp = adj_e2 + static_cast(overflow); + if (ret_exp <= 0 || ret_exp >= 0x7FF) { return false; } + + bits = (static_cast(ret_exp) << 52) | ret_man; + return true; +} + +static inline double assemble_fp64(const ParsedDecimal& dec) +{ + uint64_t bits = dec.negative ? 
(uint64_t{1} << 63) : 0; + if (dec.mantissa == 0) { return std::bit_cast(bits); } + + if (dec.fast_eligible) { + double small = 0.0; + bool used_small = false; + if (dec.exp10 >= 0 && dec.exp10 < static_cast(small_integer_powers.size())) { + uint64_t limit = (uint64_t{1} << 53) / small_integer_powers[dec.exp10]; + if (dec.mantissa <= limit) { + small = static_cast(dec.mantissa) * small_powers[dec.exp10]; + used_small = true; + } + } else if (dec.exp10 < 0 && dec.exp10 >= -22 && dec.mantissa < (uint64_t{1} << 53)) { + small = static_cast(dec.mantissa) / small_powers[-dec.exp10]; + used_small = true; + } + if (used_small) { return dec.negative ? -small : small; } + + uint64_t mag_bits = 0; + if (eisel_lemire(dec.mantissa, dec.exp10, mag_bits)) { + return std::bit_cast(bits | mag_bits); + } + } + + return std::numeric_limits::quiet_NaN(); +} + +static inline double parse_fp64_advance(const char*& p, const char* end) +{ + const char* start = p; + ParsedDecimal dec; + if (!parse_decimal_advance(p, end, dec)) { + return fallback_strtod(std::string_view(start, static_cast(p - start))); + } + + double v = assemble_fp64(dec); + if (v == v) return v; + return fallback_strtod(std::string_view(start, static_cast(p - start))); +} + +static inline double parse_fp64_token(const char* p, const char* end) +{ + return parse_fp64_advance(p, end); +} + +} // namespace fp64 +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp index 453687df01..bd4ee4669a 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp @@ -1,21 +1,21 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights -// reserved. SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 #pragma once -#include "simd_compat.hpp" +#include "fast_fp64_parser.hpp" -#include #include -#include #include #include #include -#include #include #include #include +#include +#include + #ifndef __likely #define __likely(x) __builtin_expect(!!(x), 1) #endif @@ -26,104 +26,14 @@ namespace mps_fast { -inline constexpr int EXP10_TABLE_MAX = 308; - -constexpr double constexpr_pow10(int exp) -{ - if (exp == 0) return 1.0; - double result = 1.0; - if (exp > 0) { - for (int i = 0; i < exp; ++i) - result *= 10.0; - } else { - for (int i = 0; i > exp; --i) - result /= 10.0; - } - return result; -} - -constexpr auto make_exp10_table() -{ - std::array table{}; - for (int i = -EXP10_TABLE_MAX; i <= EXP10_TABLE_MAX; ++i) { - table[(size_t)(i + EXP10_TABLE_MAX)] = constexpr_pow10(i); - } - return table; -} - -inline constexpr auto table_exp10 = make_exp10_table(); +static inline void reset_number_parse_stats() {} +static inline void print_number_parse_stats() {} static inline bool is_digit_byte(char c) noexcept { return c >= '0' && c <= '9'; } -// Honestly, it's pretty bare bones as it is. It could take advantage of SIMD/SWAR -// or use the Eisel-Lemire trick. 
Would have to be validated through benchmarking -// but usually MPS files use simple enough coefficients static inline double fast_atof_core(const char*& data, const char* end) { - double sign = 1.0; - if (data < end && *data == '-') { - sign = -1.0; - ++data; - } else if (data < end && *data == '+') { - ++data; - } - - uint64_t significand = 0; - int decimal_exponent = 0; - int significant_digits = 0; - bool seen_dot = false; - - while (data < end) { - char c = *data; - if (is_digit_byte(c)) { - int digit = c - '0'; - if (seen_dot) { --decimal_exponent; } - if (significand != 0 || digit != 0) { - // FP64 can't represent more than that - if (significant_digits < 19) { - significand = significand * 10 + static_cast(digit); - ++significant_digits; - } else if (!seen_dot) { - ++decimal_exponent; - } - } - ++data; - } else if (c == '.' && !seen_dot) { - seen_dot = true; - ++data; - } else { - break; - } - } - - if (data < end && (*data == 'e' || *data == 'E' || *data == 'd' || *data == 'D')) { - ++data; - int exp_sign = 1; - if (data < end && *data == '-') { - exp_sign = -1; - ++data; - } else if (data < end && *data == '+') { - ++data; - } - - int exponent = 0; - while (data < end && is_digit_byte(*data)) { - exponent = exponent * 10 + (*data - '0'); - ++data; - } - - exponent *= exp_sign; - decimal_exponent += exponent; - } - - double result = static_cast(significand); - if (decimal_exponent >= -EXP10_TABLE_MAX && decimal_exponent <= EXP10_TABLE_MAX) { - result *= table_exp10[static_cast(decimal_exponent + EXP10_TABLE_MAX)]; - } else { - result *= std::pow(10.0, decimal_exponent); - } - - return sign * result; + return fp64::parse_fp64_advance(data, end); } static inline double fast_atof(const char* data, const char* end) @@ -167,14 +77,14 @@ struct cursor_t { char msg_buf[512]; std::vsnprintf(msg_buf, sizeof(msg_buf), msg, args); va_end(args); - char buf[1024]; - std::snprintf(buf, sizeof(buf), "%zu:%zu: %s", line, col, msg_buf); - throw std::runtime_error(buf); + 
mps_parser_fail(error_type_t::ValidationError, "%zu:%zu: %s", line, col, msg_buf); } void advance(std::size_t n) { - if (ptr + n > end) { throw std::runtime_error("cursor advanced past end of file"); } + if (ptr + n > end) { + mps_parser_fail(error_type_t::ValidationError, "cursor advanced past end of file"); + } ptr += n; } diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index 35a67346c3..73f50c5341 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -1,6 +1,8 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. SPDX-License-Identifier: Apache-2.0 +#define MPS_FAST_TIMERS + #include "fast_parser.hpp" #include "fast_parse_primitives.hpp" #include "file_reader.hpp" @@ -1337,14 +1339,38 @@ static const char* find_next_line(const char* p, const char* end) return p; } -static std::vector compute_line_chunk_boundaries(const char* section_start, - const char* section_end, - int num_threads) +static std::string_view peek_bounds_line_var_name(const char* line_start, const char* end) +{ + const char* p = line_start; + for (int field = 0; field < 2; ++field) { + while (p < end && *p <= ' ' && *p != '\n') + p++; + while (p < end && *p > ' ') + p++; + } + while (p < end && *p <= ' ' && *p != '\n') + p++; + const char* var_start = p; + while (p < end && *p > ' ') + p++; + return std::string_view(var_start, (size_t)(p - var_start)); +} + +static const char* find_line_start(const char* section_start, const char* p) +{ + while (p > section_start && p[-1] != '\n') + --p; + return p; +} + +static std::vector compute_bounds_chunk_boundaries(const char* section_start, + const char* section_end, + int num_threads) { scoped_timer_t timer("bounds_compute_chunk_boundaries"); - size_t total_size = 
(size_t)(section_end - section_start); - size_t chunk_size = total_size / (size_t)num_threads; + const size_t total_size = (size_t)(section_end - section_start); + const size_t chunk_size = total_size / (size_t)num_threads; std::vector boundaries((size_t)num_threads); boundaries[0].start = section_start; @@ -1352,9 +1378,21 @@ static std::vector compute_line_chunk_boundaries(const char if (t == num_threads - 1) { boundaries[(size_t)t].end = section_end; } else { - const char* boundary = section_start + (size_t)(t + 1) * chunk_size; - boundaries[(size_t)t].end = find_next_line(boundary, section_end); - boundaries[(size_t)t + 1].start = boundaries[(size_t)t].end; + const char* boundary = + find_next_line(section_start + (size_t)(t + 1) * chunk_size, section_end); + + // Keep consecutive BOUNDS records for the same variable in one chunk. + // Then each thread owns full LO/UP-style groups and can apply file order locally. + while (boundary < section_end) { + const char* prev_line = find_line_start(section_start, boundary - 1); + const auto prev_var = peek_bounds_line_var_name(prev_line, section_end); + const auto next_var = peek_bounds_line_var_name(boundary, section_end); + if (prev_var.empty() || next_var.empty() || prev_var != next_var) { break; } + boundary = find_next_line(boundary, section_end); + } + + boundaries[(size_t)t].end = boundary; + boundaries[(size_t)t + 1].start = boundary; } } return boundaries; @@ -1580,6 +1618,23 @@ static void merge_chunk_results_to_csr(parse_state_t& state, } } size_t total_cols = global_col_offset[num_chunks]; + if constexpr (std::numeric_limits::max() < std::numeric_limits::max()) { + const size_t index_max = (size_t)std::numeric_limits::max(); + if (total_nnz > index_max) { + mps_parser_fail(error_type_t::RuntimeError, + "fast MPS parser requires 64-bit indices: nnz=%zu exceeds index max=%zu", + total_nnz, + index_max); + } + if (total_cols > index_max || (size_t)n_rows > index_max) { + 
mps_parser_fail(error_type_t::RuntimeError, + "fast MPS parser requires 64-bit indices: rows=%zu cols=%zu exceed index " + "max=%zu", + (size_t)n_rows, + total_cols, + index_max); + } + } { scoped_timer_t timer("columns_dense_metadata"); bool dense_ok = total_cols > 0; @@ -1986,7 +2041,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, size_t comments = 0; size_t min_var = SIZE_MAX; size_t max_var = 0; - size_t non_strict_order = 0; + size_t decreasing_order = 0; bool saw_integer_type = false; bool saw_negative_upper = false; const char* error_ptr = nullptr; @@ -1994,7 +2049,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, }; std::vector stats((size_t)num_threads); - auto boundaries = compute_line_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads); + auto boundaries = + compute_bounds_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads); std::vector bound_seen; { @@ -2005,8 +2061,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, { scoped_timer_t timer(use_dense_lookup ? "parse_bounds_parallel_dense" : "parse_bounds_parallel_ordered_hint"); - // Duplicate or non-monotone BOUNDS updates are file-order dependent. Parse - // optimistically, then accept only if chunk summaries prove strict order. + // Repeated BOUNDS for the same variable are safe inside a group-owned chunk. + // Parse optimistically, then accept only if chunk summaries prove no backward jumps. 
#pragma omp parallel for schedule(static) num_threads(num_threads) for (int t = 0; t < num_threads; ++t) { auto& local = stats[(size_t)t]; @@ -2073,7 +2129,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, local.lines++; local.min_var = std::min(local.min_var, var_idx); local.max_var = std::max(local.max_var, var_idx); - if (prev_var != SIZE_MAX && var_idx <= prev_var) { local.non_strict_order++; } + if (prev_var != SIZE_MAX && var_idx < prev_var) { local.decreasing_order++; } prev_var = var_idx; bool first_bound_for_var = bound_seen[var_idx] == 0; @@ -2152,7 +2208,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, } size_t dense_misses = 0; - size_t non_strict_order = 0; + size_t decreasing_order = 0; size_t overlap_chunks = 0; size_t prev_max = SIZE_MAX; for (int t = 0; t < num_threads; ++t) { @@ -2162,21 +2218,21 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, cursor.error("%s", local.error_msg); } dense_misses += local.dense_misses; - non_strict_order += local.non_strict_order; + decreasing_order += local.decreasing_order; if (local.lines > 0) { if (prev_max != SIZE_MAX && local.min_var <= prev_max) { overlap_chunks++; } prev_max = local.max_var; } } - const bool order_safe = dense_misses == 0 && non_strict_order == 0 && overlap_chunks == 0; + const bool order_safe = dense_misses == 0 && decreasing_order == 0 && overlap_chunks == 0; if (!order_safe) { std::fprintf(stderr, "[WARN] parallel BOUNDS fallback to serial: lookup_misses=%zu " - "non_strict_order=%zu overlap_chunks=%zu\n", + "decreasing_order=%zu overlap_chunks=%zu\n", dense_misses, - non_strict_order, + decreasing_order, overlap_chunks); cursor.ptr = bounds_body_start; return false; @@ -3010,26 +3066,30 @@ static small_raw_read_t try_read_small_raw_file(const std::string& path) { FILE* file = std::fopen(path.c_str(), "rb"); if (file == nullptr) { - throw std::runtime_error("Failed to open raw MPS file '" + path + "': " + 
std::strerror(errno)); + mps_parser_fail(error_type_t::RuntimeError, + "Failed to open raw MPS file '%s': %s", + path.c_str(), + std::strerror(errno)); } std::unique_ptr file_guard(file, &std::fclose); if (std::fseek(file, 0, SEEK_END) != 0) { - throw std::runtime_error("Failed to seek raw MPS file '" + path + "'"); + mps_parser_fail(error_type_t::RuntimeError, "Failed to seek raw MPS file '%s'", path.c_str()); } long file_size_long = std::ftell(file); if (file_size_long < 0) { - throw std::runtime_error("Failed to determine raw MPS file size '" + path + "'"); + mps_parser_fail( + error_type_t::RuntimeError, "Failed to determine raw MPS file size '%s'", path.c_str()); } std::size_t file_size = static_cast(file_size_long); if (file_size > MPS_SMALL_RAW_FILE_BYTES) { return {}; } if (std::fseek(file, 0, SEEK_SET) != 0) { - throw std::runtime_error("Failed to rewind raw MPS file '" + path + "'"); + mps_parser_fail(error_type_t::RuntimeError, "Failed to rewind raw MPS file '%s'", path.c_str()); } std::vector buffer(file_size); if (file_size != 0 && std::fread(buffer.data(), 1, file_size, file) != file_size) { - throw std::runtime_error("Failed to read raw MPS file '" + path + "'"); + mps_parser_fail(error_type_t::RuntimeError, "Failed to read raw MPS file '%s'", path.c_str()); } return {true, std::move(buffer)}; } @@ -3116,7 +3176,8 @@ cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( return parse_mps_fast_stream( stream, "parse_mps_fast_file_raw (total)", "task_raw_read"); } - throw std::runtime_error("single-path parser supports raw read and LZ4 inputs only"); + mps_parser_fail(error_type_t::RuntimeError, + "single-path parser supports raw read and LZ4 inputs only"); } template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_parser.hpp index 20e9901024..9f6f0f107b 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.hpp 
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.hpp @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights -// reserved. SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 #pragma once @@ -13,7 +13,10 @@ namespace mps_fast { template -cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( - const std::string& path, FileReadMethod read_method = FileReadMethod::Read); +using parser_model_t = cuopt::linear_programming::io::mps_data_model_t; + +template +parser_model_t parse_mps_fast_file(const std::string& path, + FileReadMethod read_method = FileReadMethod::Read); } // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp index 49a7602739..0d14f059bc 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp @@ -9,6 +9,8 @@ #include "fast_parser.hpp" +#include + #include namespace cuopt::linear_programming::io { @@ -16,6 +18,7 @@ namespace cuopt::linear_programming::io { template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path) { + CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str()); return mps_fast::parse_mps_fast_file(mps_file_path); } diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index 97ef5c5cc4..08521eafc0 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -1,9 +1,11 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. 
SPDX-License-Identifier: Apache-2.0 #include "file_reader.hpp" #include "nvtx_ranges.hpp" +#include + #include #include #include @@ -24,6 +26,10 @@ namespace mps_fast { +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_fail; + char* string_buffer; char* string_buffer_ptr; @@ -65,7 +71,10 @@ std::size_t get_file_size(int fd, const std::string& path) { struct stat st; if (::fstat(fd, &st) != 0) { - throw std::runtime_error("Failed to stat file '" + path + "': " + std::strerror(errno)); + mps_parser_fail(error_type_t::RuntimeError, + "Failed to stat file '%s': %s", + path.c_str(), + std::strerror(errno)); } return static_cast(st.st_size); } @@ -86,7 +95,7 @@ std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) if (remainder == 0) { return value; } std::size_t increment = alignment - remainder; if (value > std::numeric_limits::max() - increment) { - throw std::runtime_error("allocation size overflow"); + mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow"); } return value + increment; } @@ -98,7 +107,10 @@ RawInputStream::RawInputStream(const std::string& path) : path_(path) MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io); fd_ = ::open(path.c_str(), O_RDONLY); if (fd_ < 0) { - throw std::runtime_error("Failed to open raw MPS file '" + path + "': " + std::strerror(errno)); + mps_parser_fail(error_type_t::RuntimeError, + "Failed to open raw MPS file '%s': %s", + path.c_str(), + std::strerror(errno)); } file_size_ = get_file_size(fd_, path); @@ -173,11 +185,15 @@ void RawInputStream::run_decode_tasks() fd_, output_data_ + offset + done, size - done, static_cast(offset + done)); if (got < 0) { if (errno == EINTR) { continue; } - throw std::runtime_error("Failed to pread raw MPS file '" + path_ + - "': " + std::strerror(errno)); + mps_parser_fail(error_type_t::RuntimeError, + "Failed to pread raw MPS file '%s': 
%s", + path_.c_str(), + std::strerror(errno)); } if (got == 0) { - throw std::runtime_error("Unexpected EOF while reading raw MPS file '" + path_ + "'"); + mps_parser_fail(error_type_t::RuntimeError, + "Unexpected EOF while reading raw MPS file '%s'", + path_.c_str()); } done += static_cast(got); } @@ -243,7 +259,8 @@ FileReadMethod effective_file_read_method(const std::string& path, FileReadMetho { if (has_lz4_extension(path)) { return FileReadMethod::Lz4; } if (method == FileReadMethod::Lz4) { - throw std::runtime_error("lz4 read method requires a .lz4 input: " + path); + mps_parser_fail( + error_type_t::ValidationError, "lz4 read method requires a .lz4 input: %s", path.c_str()); } return method; } diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp index 3232a23e84..cc603e35d8 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.hpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. 
SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp index 7aa302da23..ab0d4c2c78 100644 --- a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp +++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp @@ -5,7 +5,8 @@ #pragma once -#include "simd_compat.hpp" +#include +#include #include #include @@ -44,30 +45,6 @@ static inline uint32_t crcHash(const uint8_t* key, int64_t len) return crc; } -static const simde__m128i aes_seed_128 = - simde_mm_set_epi64x(0x9E3779B97F4A7C15ULL, 0xBB67AE8584CAA73BULL); -static const simde__m256i aes_seed_256 = simde_mm256_set_epi64x( - 0x9E3779B97F4A7C15ULL, 0xBB67AE8584CAA73BULL, 0x3C6EF372FE94F82BULL, 0xA54FF53A5F1D36F1ULL); - -static inline uint32_t aes_hash(simde__m128i key) -{ - simde__m128i h = simde_mm_aesenc_si128(key, aes_seed_128); - h = simde_mm_aesenc_si128(h, aes_seed_128); - simde__m128i folded = simde_mm_xor_si128(h, simde_mm_srli_si128(h, 8)); - return (uint32_t)simde_mm_cvtsi128_si32(folded); -} - -static inline uint32_t aes_hash(simde__m256i key) -{ - simde__m128i lo = simde_mm256_castsi256_si128(key); - simde__m128i hi = simde_mm256_extracti128_si256(key, 1); - simde__m128i h = simde_mm_xor_si128(lo, hi); - h = simde_mm_aesenc_si128(h, aes_seed_128); - h = simde_mm_aesenc_si128(h, aes_seed_128); - simde__m128i folded = simde_mm_xor_si128(h, simde_mm_srli_si128(h, 8)); - return (uint32_t)simde_mm_cvtsi128_si32(folded); -} - static inline uint32_t crcHash32B(uint64_t q0, uint64_t q1, uint64_t q2, uint64_t q3) { uint64_t crc = 0; diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index 36c42ba79a..010e890058 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -1,26 +1,25 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA 
CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. SPDX-License-Identifier: Apache-2.0 #include "file_reader.hpp" #include "mps_section_scanner.hpp" #include "nvtx_ranges.hpp" +#include + #ifdef _OPENMP #include #endif -#ifndef _WIN32 #include #include #include #include #include -#include -#include -#endif #include #include +#include #include #include #include @@ -37,6 +36,10 @@ namespace mps_fast { +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_fail; + namespace { constexpr uint32_t lz4_frame_magic = 0x184D2204u; @@ -60,18 +63,18 @@ struct lz4_runtime_t { if (handle != nullptr) { break; } } if (handle == nullptr) { - throw std::logic_error( - "Could not open .mps.lz4 file since liblz4 was not found " - "(tried liblz4.so.1, liblz4.so). Decompress the .lz4 file manually " - "or install liblz4."); + mps_parser_fail(error_type_t::RuntimeError, + "Could not open .mps.lz4 file since liblz4 was not found " + "(tried liblz4.so.1, liblz4.so). Decompress the .lz4 file manually " + "or install liblz4."); } decompress_safe = reinterpret_cast(::dlsym(handle, "LZ4_decompress_safe")); if (decompress_safe == nullptr) { - throw std::logic_error( - "Error loading LZ4_decompress_safe from liblz4. Decompress the .lz4 file manually " - "or install a compatible liblz4."); + mps_parser_fail(error_type_t::RuntimeError, + "Error loading LZ4_decompress_safe from liblz4. Decompress the .lz4 file " + "manually or install a compatible liblz4."); } } @@ -100,7 +103,8 @@ int lz4_decompress_safe_runtime(const char* src, char* dst, int compressed_size, (void)dst; (void)compressed_size; (void)dst_capacity; - throw std::logic_error( + mps_parser_fail( + error_type_t::RuntimeError, "Experimental fast MPS parser was built without LZ4 decompression support. 
" "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually."); #endif @@ -111,7 +115,8 @@ void ensure_lz4_runtime_available() #if defined(MPS_PARSER_WITH_LZ4) (void)lz4_runtime(); #else - throw std::logic_error( + mps_parser_fail( + error_type_t::RuntimeError, "Experimental fast MPS parser was built without LZ4 decompression support. " "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually."); #endif @@ -121,37 +126,16 @@ int open_lz4_fd(const std::string& path) { int fd = ::open(path.c_str(), O_RDONLY); if (fd < 0) { - throw std::runtime_error("Failed to open LZ4 file '" + path + "': " + std::strerror(errno)); + mps_parser_fail(error_type_t::RuntimeError, + "Failed to open LZ4 file '%s': %s", + path.c_str(), + std::strerror(errno)); } return fd; } -#ifndef _WIN32 -std::size_t system_page_size(); -#endif std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment); -#ifndef _WIN32 -class FileDescriptor { - public: - explicit FileDescriptor(int fd) : fd_(fd) {} - ~FileDescriptor() - { - if (fd_ >= 0) { ::close(fd_); } - } - - FileDescriptor(const FileDescriptor&) = delete; - FileDescriptor& operator=(const FileDescriptor&) = delete; - - int get() const noexcept { return fd_; } - bool valid() const noexcept { return fd_ >= 0; } - - private: - int fd_; -}; - -#endif - uint32_t read_le32(const char* ptr) { const auto* p = reinterpret_cast(ptr); @@ -176,32 +160,34 @@ std::size_t block_max_size_from_bd(unsigned char bd) case 5: return 256ull * 1024ull; case 6: return 1024ull * 1024ull; case 7: return 4ull * 1024ull * 1024ull; - default: throw std::runtime_error("unsupported LZ4 frame block size ID"); + default: mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame block size ID"); } } std::size_t checked_size(uint64_t value, const char* label) { if (value > static_cast(std::numeric_limits::max())) { - throw std::runtime_error(std::string("LZ4 ") + label + " exceeds size_t"); + 
mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 %s exceeds size_t", label); } return static_cast(value); } -#ifndef _WIN32 std::size_t get_file_size(int fd, const std::string& path) { struct stat st; if (::fstat(fd, &st) != 0) { - throw std::runtime_error("Failed to stat file '" + path + "': " + std::strerror(errno)); + mps_parser_fail(error_type_t::RuntimeError, + "Failed to stat file '%s': %s", + path.c_str(), + std::strerror(errno)); + } + if (st.st_size < 0) { + mps_parser_fail( + error_type_t::RuntimeError, "Invalid negative file size for '%s'", path.c_str()); } - if (st.st_size < 0) { throw std::runtime_error("Invalid negative file size for '" + path + "'"); } return static_cast(st.st_size); } -#endif - -#ifndef _WIN32 std::size_t system_page_size() { static std::size_t page_size = [] { @@ -210,7 +196,6 @@ std::size_t system_page_size() }(); return page_size; } -#endif std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) { @@ -219,16 +204,15 @@ std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) if (remainder == 0) { return value; } std::size_t increment = alignment - remainder; if (value > std::numeric_limits::max() - increment) { - throw std::runtime_error("allocation size overflow"); + mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow"); } return value + increment; } -#ifndef _WIN32 std::size_t checked_mul(std::size_t a, std::size_t b, const char* label) { if (a != 0 && b > std::numeric_limits::max() / a) { - throw std::runtime_error(std::string(label) + " size overflow"); + mps_parser_fail(error_type_t::OutOfMemoryError, "%s size overflow", label); } return a * b; } @@ -313,7 +297,7 @@ class lz4_resident_windows_t { const lz4_resident_window_t& window_for_offset(std::size_t offset) const { if (windows_.empty()) { - throw std::runtime_error("LZ4 resident window lookup with no windows"); + mps_parser_fail(error_type_t::RuntimeError, "LZ4 resident window lookup with no windows"); } 
std::size_t lo = 0; std::size_t hi = windows_.size(); @@ -328,12 +312,11 @@ class lz4_resident_windows_t { return w; } } - throw std::runtime_error("LZ4 offset outside resident windows"); + mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows"); } std::vector& windows_; }; -#endif } // namespace @@ -350,48 +333,58 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path) char header[32]; if (compressed_size_ < 7) { - throw std::runtime_error("LZ4 input is too small to contain a frame header"); + mps_parser_fail(error_type_t::ValidationError, + "LZ4 input is too small to contain a frame header"); } std::size_t header_bytes = std::min(sizeof(header), compressed_size_); if (!pread_full_plain(fd_, header, header_bytes, 0)) { - throw std::runtime_error("Failed to read LZ4 frame header '" + path + - "': " + std::strerror(errno)); + mps_parser_fail(error_type_t::RuntimeError, + "Failed to read LZ4 frame header '%s': %s", + path.c_str(), + std::strerror(errno)); } std::size_t offset = 0; uint32_t magic = read_le32(header + offset); if (magic != lz4_frame_magic) { - throw std::runtime_error("unsupported LZ4 input: expected standard LZ4 frame magic"); + mps_parser_fail(error_type_t::ValidationError, + "unsupported LZ4 input: expected standard LZ4 frame magic"); } offset += 4; unsigned char flg = static_cast(header[offset++]); unsigned char bd = static_cast(header[offset++]); unsigned version = (flg >> 6) & 0x3u; - if (version != 1) { throw std::runtime_error("unsupported LZ4 frame version"); } + if (version != 1) { + mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame version"); + } bool block_independent = (flg & 0x20u) != 0; block_checksum_ = (flg & 0x10u) != 0; content_size_present_ = (flg & 0x08u) != 0; content_checksum_ = (flg & 0x04u) != 0; dict_id_ = (flg & 0x01u) != 0; if (!block_independent) { - throw std::runtime_error("parallel LZ4 reader requires independent blocks; compress with -BI"); + 
mps_parser_fail(error_type_t::ValidationError, + "parallel LZ4 reader requires independent blocks; compress with -BI"); } block_max_size_ = block_max_size_from_bd(bd); if (content_size_present_) { if (offset + 8 > header_bytes) { - throw std::runtime_error("truncated LZ4 frame while reading content size"); + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading content size"); } content_size_ = checked_size(read_le64(header + offset), "content size"); offset += 8; } if (dict_id_) { if (offset + 4 > header_bytes) { - throw std::runtime_error("truncated LZ4 frame while reading dictionary id"); + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading dictionary id"); } offset += 4; } if (offset + 1 > header_bytes) { - throw std::runtime_error("truncated LZ4 frame while reading header checksum"); + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading header checksum"); } offset += 1; header_size_ = offset; @@ -447,7 +440,7 @@ void Lz4InputStream::commit_up_to(std::size_t bytes) std::lock_guard lock(commit_mutex_); if (bytes <= output_committed_size_) return; if (bytes > output_mapped_size_) { - throw std::runtime_error("LZ4 output exceeded reserved virtual mapping"); + mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 output exceeded reserved virtual mapping"); } std::size_t new_committed = round_up_to_multiple(bytes, system_page_size()); if (new_committed > output_mapped_size_) new_committed = output_mapped_size_; @@ -556,7 +549,8 @@ void Lz4InputStream::run_decode_tasks() } } if (actual < 0 || static_cast(actual) > block.decompressed_size) { - throw std::runtime_error("LZ4 input block decompressed to invalid size"); + mps_parser_fail(error_type_t::ValidationError, + "LZ4 input block decompressed to invalid size"); } std::size_t actual_size = static_cast(actual); @@ -606,8 +600,13 @@ void Lz4InputStream::run_decode_tasks() ok = pread_full_plain(fd_, w.data.get(), w.size, 
w.file_offset); } if (!ok) { - fail_and_notify(std::make_exception_ptr(std::runtime_error( - "Failed to pread LZ4 resident window: " + std::string(std::strerror(errno))))); + try { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to pread LZ4 resident window: %s", + std::strerror(errno)); + } catch (...) { + fail_and_notify(std::current_exception()); + } return; } { @@ -637,8 +636,8 @@ void Lz4InputStream::run_decode_tasks() return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0; }); if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) { - throw std::runtime_error( - "LZ4 metadata scanner stopped before required window was ready"); + mps_parser_fail(error_type_t::RuntimeError, + "LZ4 metadata scanner stopped before required window was ready"); } } }; @@ -665,7 +664,8 @@ void Lz4InputStream::run_decode_tasks() MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic); wait_range_ready(offset, 4); if (offset + 4 > compressed_size_) { - throw std::runtime_error("truncated LZ4 frame while reading block header"); + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading block header"); } uint32_t raw_block_size = resident.read_u32(offset); offset += 4; @@ -674,17 +674,20 @@ void Lz4InputStream::run_decode_tasks() bool uncompressed = (raw_block_size & lz4_uncompressed_block) != 0; std::size_t block_payload_size = raw_block_size & lz4_block_size_mask; if (block_payload_size == 0) { - throw std::runtime_error("invalid zero-sized LZ4 data block"); + mps_parser_fail(error_type_t::ValidationError, "invalid zero-sized LZ4 data block"); } if (block_payload_size > block_max_size_ && uncompressed) { - throw std::runtime_error("LZ4 uncompressed block exceeds frame block maximum"); + mps_parser_fail(error_type_t::ValidationError, + "LZ4 uncompressed block exceeds frame block maximum"); } if (content_size_present_ && decompressed_offset >= content_size_) { - throw std::runtime_error("LZ4 frame 
contains more blocks than content size allows"); + mps_parser_fail(error_type_t::ValidationError, + "LZ4 frame contains more blocks than content size allows"); } wait_range_ready(offset, block_payload_size); if (offset + block_payload_size > compressed_size_) { - throw std::runtime_error("truncated LZ4 frame while reading block payload"); + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading block payload"); } std::size_t decompressed_size = block_payload_size; @@ -696,7 +699,7 @@ void Lz4InputStream::run_decode_tasks() } } if (content_size_present_ && decompressed_size > content_size_ - decompressed_offset) { - throw std::runtime_error("LZ4 block exceeds declared content size"); + mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size"); } const char* src = resident.ptr_if_contiguous(offset, block_payload_size); @@ -717,27 +720,32 @@ void Lz4InputStream::run_decode_tasks() if (block_checksum_) { wait_range_ready(offset, 4); if (offset + 4 > compressed_size_) { - throw std::runtime_error("truncated LZ4 frame while reading block checksum"); + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading block checksum"); } offset += 4; } if (blocks_scanned.load(std::memory_order_relaxed) > block_done_.size()) { - throw std::runtime_error("LZ4 input block count exceeded reserved metadata slots"); + mps_parser_fail(error_type_t::OutOfMemoryError, + "LZ4 input block count exceeded reserved metadata slots"); } if (batch.size() >= 1024) { push_batch(batch); } } if (content_checksum_) { wait_range_ready(offset, 4); if (offset + 4 > compressed_size_) { - throw std::runtime_error("truncated LZ4 frame while reading content checksum"); + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading content checksum"); } offset += 4; } if (content_size_present_ && decompressed_offset != content_size_) { - throw std::runtime_error("LZ4 frame ended before declared content 
size was reached"); + mps_parser_fail(error_type_t::ValidationError, + "LZ4 frame ended before declared content size was reached"); } if (offset != compressed_size_) { - throw std::runtime_error("LZ4 input contains trailing data after the first frame"); + mps_parser_fail(error_type_t::ValidationError, + "LZ4 input contains trailing data after the first frame"); } push_batch(batch); { diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp index c1f411111a..98c6e4885d 100644 --- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp +++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. SPDX-License-Identifier: Apache-2.0 #pragma once @@ -10,12 +10,19 @@ #include #include #include + +#include + #include #include #include namespace mps_fast { +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_fail; + // Move-only owner for a Linux mmap range. Fixed sub-maps inside a reserved range // are still released by unmapping the owning outer range. 
class mmap_region_t { @@ -51,8 +58,8 @@ class mmap_region_t { { void* ptr = ::mmap(address, size, prot, flags, fd, offset); if (ptr == MAP_FAILED) { - throw std::runtime_error(std::string("mmap failed for ") + context + ": " + - std::strerror(errno)); + mps_parser_fail( + error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno)); } return mmap_region_t(ptr, size); } @@ -66,17 +73,18 @@ class mmap_region_t { std::size_t size, std::size_t alignment, int prot, int flags, const char* context) { if (alignment == 0 || (alignment & (alignment - 1)) != 0) { - throw std::runtime_error("mmap aligned allocation requires power-of-two alignment"); + mps_parser_fail(error_type_t::RuntimeError, + "mmap aligned allocation requires power-of-two alignment"); } if (size > std::numeric_limits::max() - alignment) { - throw std::runtime_error("mmap aligned allocation size overflow"); + mps_parser_fail(error_type_t::OutOfMemoryError, "mmap aligned allocation size overflow"); } std::size_t raw_size = size + alignment; void* raw = ::mmap(nullptr, raw_size, prot, flags | MAP_ANONYMOUS, -1, 0); if (raw == MAP_FAILED) { - throw std::runtime_error(std::string("mmap failed for ") + context + ": " + - std::strerror(errno)); + mps_parser_fail( + error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno)); } uintptr_t raw_addr = reinterpret_cast(raw); @@ -93,8 +101,8 @@ class mmap_region_t { { void* ptr = ::mmap(address, size, prot, flags | MAP_FIXED, fd, offset); if (ptr == MAP_FAILED) { - throw std::runtime_error(std::string("mmap failed for ") + context + ": " + - std::strerror(errno)); + mps_parser_fail( + error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno)); } } diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp index 3ed8763428..8581921173 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp +++ 
b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp @@ -1,8 +1,9 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. SPDX-License-Identifier: Apache-2.0 #include "mps_section_scanner.hpp" -#include "simd_compat.hpp" + +#include #include #include @@ -10,8 +11,15 @@ #include #include +#include +#include + namespace mps_fast { +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_fail; + namespace { bool is_nonblank_column1(unsigned char c) noexcept { return c > ' '; } @@ -52,7 +60,7 @@ std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase) case mps_phase_kind::ranges: return 5; case mps_phase_kind::quadratic: return 6; } - throw std::runtime_error("invalid MPS phase kind"); + mps_parser_fail(error_type_t::RuntimeError, "invalid MPS phase kind"); } void mps_phase_registry_t::publish(mps_phase_kind phase, mps_phase_range_t range) @@ -277,7 +285,8 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index, const char* end) { if (block_index >= block_count_) { - throw std::runtime_error("MPS section scanner observed invalid LZ4 block index"); + mps_parser_fail(error_type_t::RuntimeError, + "MPS section scanner observed invalid LZ4 block index"); } scan_section_range(begin, end, false); diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp index 0c492b0074..cc287368fb 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. 
All rights // reserved. SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp index 650d28dbc2..23f4b4b8c1 100644 --- a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp +++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/cpp/src/io/experimental_mps_fast/perf_counters.hpp b/cpp/src/io/experimental_mps_fast/perf_counters.hpp index 147a7ae7bb..1baaf011e5 100644 --- a/cpp/src/io/experimental_mps_fast/perf_counters.hpp +++ b/cpp/src/io/experimental_mps_fast/perf_counters.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/cpp/src/io/experimental_mps_fast/simd_compat.hpp b/cpp/src/io/experimental_mps_fast/simd_compat.hpp index d81af7a2eb..fb849fcff0 100644 --- a/cpp/src/io/experimental_mps_fast/simd_compat.hpp +++ b/cpp/src/io/experimental_mps_fast/simd_compat.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. 
SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/cpp/src/io/utilities/error.hpp b/cpp/src/io/utilities/error.hpp index 58ac3891e1..c1b28fc7ff 100644 --- a/cpp/src/io/utilities/error.hpp +++ b/cpp/src/io/utilities/error.hpp @@ -34,6 +34,30 @@ inline std::string error_to_string(error_type_t error) return std::string("UnAccountedError"); } +[[noreturn]] inline void mps_parser_throw(error_type_t error_type, const char* msg) +{ + throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + error_to_string(error_type) + + "\", \"msg\": " + "\"" + std::string(msg) + "\"}"); +} + +/** + * @brief Report an unrecoverable parser error. + * + * @param[error_type_t] error enum error type + * @param[const char *] fmt String format for error message + * @param variable set of arguments used for fmt + * @throw std::logic_error always + */ +[[noreturn]] inline void mps_parser_fail(error_type_t error_type, const char* fmt, ...) +{ + va_list args; + va_start(args, fmt); + char msg[2048]; + vsnprintf(msg, sizeof(msg), fmt, args); + va_end(args); + mps_parser_throw(error_type, msg); +} + /** * @brief Function for checking (pre-)conditions that throws an exception when a * condition is false @@ -52,9 +76,7 @@ inline void mps_parser_expects(bool cond, error_type_t error_type, const char* f char msg[2048]; vsnprintf(msg, sizeof(msg), fmt, args); va_end(args); - - throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + error_to_string(error_type) + - "\", \"msg\": " + "\"" + std::string(msg) + "\"}"); + mps_parser_throw(error_type, msg); } } From be97a050f1fee09385f6ca60db03d6596aba5bee Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Mon, 8 Jun 2026 05:38:55 -0700 Subject: [PATCH 05/22] decode performance metrics --- .../experimental_mps_fast/lz4_file_reader.cpp | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index 010e890058..a0be7daaf0 100644 
--- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -20,9 +20,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -217,6 +219,12 @@ std::size_t checked_mul(std::size_t a, std::size_t b, const char* label) return a * b; } +double elapsed_ms_since(std::chrono::steady_clock::time_point start) +{ + return std::chrono::duration(std::chrono::steady_clock::now() - start) + .count(); +} + bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset) { std::size_t done = 0; @@ -479,6 +487,8 @@ void Lz4InputStream::run_decode_tasks() } const std::size_t io_threads = std::min(lz4_input_max_io_threads, window_count); + std::atomic decoder_wait_batch_ms{0.0}; + std::atomic decoder_active_batch_ms{0.0}; struct resident_block_desc_t { const char* src = nullptr; @@ -514,10 +524,12 @@ void Lz4InputStream::run_decode_tasks() { MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io); std::unique_lock lock(desc_mutex); + const auto wait_start = std::chrono::steady_clock::now(); desc_cv.wait(lock, [&] { return stop_workers.load(std::memory_order_acquire) || scanner_done || !desc_queue.empty(); }); + decoder_wait_batch_ms.fetch_add(elapsed_ms_since(wait_start), std::memory_order_relaxed); if (stop_workers.load(std::memory_order_acquire)) { return; } if (desc_queue.empty()) { if (scanner_done) return; @@ -527,6 +539,7 @@ void Lz4InputStream::run_decode_tasks() desc_queue.pop_front(); } + const auto decode_start = std::chrono::steady_clock::now(); MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode); for (const auto& block : batch) { char* dst = output_data_ + block.decompressed_offset; @@ -578,6 +591,8 @@ void Lz4InputStream::run_decode_tasks() section_scanner_->publish_ready(after); } } + decoder_active_batch_ms.fetch_add(elapsed_ms_since(decode_start), + std::memory_order_relaxed); } } catch (...) 
{ fail_and_notify(std::current_exception()); @@ -621,6 +636,7 @@ void Lz4InputStream::run_decode_tasks() std::atomic_size_t blocks_scanned{0}; std::vector> crossing_payloads; + const auto read_wall_start = std::chrono::steady_clock::now(); std::thread scanner([&] { try { nvtx::name_current_thread("lz4-metadata-scan"); @@ -770,6 +786,7 @@ void Lz4InputStream::run_decode_tasks() for (auto& reader : readers) { reader.join(); } + const double read_wall_ms = elapsed_ms_since(read_wall_start); scanner.join(); for (auto& worker : io_workers) { worker.join(); @@ -777,6 +794,19 @@ void Lz4InputStream::run_decode_tasks() if (first_error) std::rethrow_exception(first_error); output_view_size_ = ready_bytes_; section_scanner_->publish_ready(output_view_size_); + + const double compressed_mb = static_cast(compressed_size_) / (1024.0 * 1024.0); + const double read_effective_mbps = + read_wall_ms > 0.0 ? compressed_mb / (read_wall_ms / 1000.0) : 0.0; + const double decoder_wait_ms = decoder_wait_batch_ms.load(std::memory_order_relaxed); + const double decoder_active_ms = decoder_active_batch_ms.load(std::memory_order_relaxed); + const double decoder_total_ms = decoder_wait_ms + decoder_active_ms; + const double decoder_wait_ratio = + decoder_total_ms > 0.0 ? 
decoder_wait_ms / decoder_total_ms : 0.0; + std::fprintf(stderr, + "[LZ4_IO] read_effective_MBps=%.3f decoder_wait_ratio=%.6f\n", + read_effective_mbps, + decoder_wait_ratio); } } // namespace mps_fast From 1e4d7c991c4392728db2d0a1e7fbde87d652ba0f Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Wed, 10 Jun 2026 07:51:22 -0700 Subject: [PATCH 06/22] lots of cleanup --- cpp/CMakeLists.txt | 6 + .../fast_fp64_parser.hpp | 72 +- .../fast_parse_primitives.hpp | 83 +- .../io/experimental_mps_fast/fast_parser.cpp | 994 ++++++++++-------- .../io/experimental_mps_fast/file_reader.cpp | 93 +- .../io/experimental_mps_fast/file_reader.hpp | 37 +- .../hash_table_smallstr.hpp | 293 +----- .../experimental_mps_fast/lz4_file_reader.cpp | 786 ++++++++------ .../io/experimental_mps_fast/mmap_region.hpp | 25 +- .../mps_section_scanner.cpp | 290 +++-- .../mps_section_scanner.hpp | 10 +- .../io/experimental_mps_fast/nvtx_ranges.hpp | 2 +- .../io/experimental_mps_fast/simd_compat.hpp | 10 - cpp/tests/linear_programming/CMakeLists.txt | 38 + .../fast_fp64_parser_test.cpp | 231 ++++ .../fast_parser_edge_test.cpp | 871 +++++++++++++++ 16 files changed, 2463 insertions(+), 1378 deletions(-) delete mode 100644 cpp/src/io/experimental_mps_fast/simd_compat.hpp create mode 100644 cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp create mode 100644 cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 712a132fc0..e134d49d02 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -54,6 +54,7 @@ option(SKIP_ROUTING_BUILD "Skip building routing components" OFF) option(SKIP_GRPC_BUILD "Skip building gRPC and protobuf components" OFF) option(WRITE_FATBIN "Enable fatbin writing" ON) option(HOST_LINEINFO "Build with debug line information for host code" OFF) +option(MPS_FAST_TIMERS "Enable experimental fast MPS parser phase timer printouts" OFF) message(VERBOSE "cuOpt: Enable nvcc 
-lineinfo: ${CMAKE_CUDA_LINEINFO}") message(VERBOSE "cuOpt: Build cuOpt unit-tests: ${BUILD_TESTS}") @@ -64,6 +65,7 @@ message(VERBOSE "cuOpt: Skip C/Python adapters: ${SKIP_C_PYTHON_ADAPTERS}") message(VERBOSE "cuOpt: Skip routing build: ${SKIP_ROUTING_BUILD}") message(VERBOSE "cuOpt: Build with debug line information for host code: ${HOST_LINEINFO}") message(VERBOSE "cuOpt: fatbin: ${WRITE_FATBIN}") +message(VERBOSE "cuOpt: Fast MPS parser timers: ${MPS_FAST_TIMERS}") # ################################################################################################## # - compiler options ------------------------------------------------------------------------------ @@ -517,6 +519,10 @@ target_compile_definitions(cuopt PUBLIC CUSPARSE_ENABLE_EXPERIMENTAL_API ) +if (MPS_FAST_TIMERS) + target_compile_definitions(cuopt PRIVATE MPS_FAST_TIMERS=1) +endif () + target_compile_options(cuopt PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" "$<$:${CUOPT_CUDA_FLAGS}>" diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp index 605c6adc5b..905dcc9e7b 100644 --- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp @@ -27,6 +27,8 @@ namespace fp64 { #define FASTP64_MAX_EXP_10 288 #define FASTP64_POWER_COUNT (FASTP64_MAX_EXP_10 - FASTP64_MIN_EXP_10 + 1) #define FASTP64_MANTISSA_MASK ((uint64_t{1} << 52) - 1) +#define FASTP64_EXPONENT_MASK 0x7FF +#define FASTP64_HALF_MASK 0x1FF // Fast FP64 parser optimized for the <=19digits case, based on the Eisel-Lemire algorithm // see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 @@ -45,11 +47,11 @@ struct cuopt_uint256_t { { unsigned __int128 carry = 0; for (uint64_t& v : limb) { - unsigned __int128 x = static_cast(v) * m + carry; - v = static_cast(x); + unsigned __int128 x = (unsigned __int128)v * m + carry; + v = (uint64_t)x; carry = x >> 64; } - return static_cast(carry); + 
return (uint32_t)carry; } constexpr cuopt_uint256_t shl_small(int bits) const @@ -81,6 +83,9 @@ struct cuopt_normalized_uint256_t { { uint32_t carry = sig.mul_u32(10); int shift = 32 - std::countl_zero(carry); + // The normalized 256-bit value always overflows into carry after *10; keep + // the guard explicit because the cross-limb path shifts by 64 - shift. + if (shift == 0) { return; } cuopt_uint256_t out; for (int i = 0; i < 4; ++i) { uint64_t lower = sig.limb[i] >> shift; @@ -88,7 +93,7 @@ struct cuopt_normalized_uint256_t { if (i + 1 < 4) { upper = sig.limb[i + 1] << (64 - shift); } else { - upper = static_cast(carry) << (64 - shift); + upper = (uint64_t)carry << (64 - shift); } out.limb[i] = lower | upper; } @@ -107,7 +112,7 @@ struct cuopt_normalized_uint256_t { unsigned __int128 rem = extra; for (int i = 3; i >= 0; --i) { unsigned __int128 cur = (rem << 64) | shifted.limb[i]; - quotient.limb[i] = static_cast(cur / 10); + quotient.limb[i] = (uint64_t)(cur / 10); rem = cur % 10; } sig = quotient; @@ -186,7 +191,7 @@ static inline bool parse_8_digits(const char* p, uint32_t& out) uint64_t v = raw - 0x3030303030303030ULL; uint64_t pairs = (v * 10 + (v >> 8)) & 0x00FF00FF00FF00FFULL; uint64_t quads = (pairs * 100 + (pairs >> 16)) & 0x0000FFFF0000FFFFULL; - out = static_cast((quads * 10000 + (quads >> 32)) & 0xFFFFFFFFULL); + out = (uint32_t)((quads * 10000 + (quads >> 32)) & 0xFFFFFFFFULL); return true; } @@ -229,7 +234,7 @@ static inline void scan_digit_run(const char*& p, if (after_dot) ++frac_digits; if (!too_many_digits && (digit != 0 || sig_digits != 0)) { if (sig_digits < 19) { - out.mantissa = (out.mantissa * 10) + static_cast(digit); + out.mantissa = (out.mantissa * 10) + (uint64_t)digit; ++sig_digits; } else { too_many_digits = true; @@ -314,38 +319,42 @@ static inline bool eisel_lemire(uint64_t man, int exp10, uint64_t& bits) uint64_t norm = man << lz; int adj_e2 = p.biased_e2 - lz; - unsigned __int128 product = static_cast(norm) * p.high; - uint64_t 
hi = static_cast(product >> 64); - uint64_t lo = static_cast(product); + unsigned __int128 product = (unsigned __int128)norm * p.high; + uint64_t hi = (uint64_t)(product >> 64); + uint64_t lo = (uint64_t)product; - if ((hi & 0x1FF) == 0x1FF && lo + norm < norm) { - unsigned __int128 low_product = static_cast(norm) * p.low; - uint64_t low_hi = static_cast(low_product >> 64); - uint64_t low_lo = static_cast(low_product); + // If the high product lands near the 9-bit halfway window, include the low + // 64x64 product to disambiguate rounding before deciding whether to fallback. + if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK && lo + norm < norm) { + unsigned __int128 low_product = (unsigned __int128)norm * p.low; + uint64_t low_hi = (uint64_t)(low_product >> 64); + uint64_t low_lo = (uint64_t)low_product; uint64_t old_lo = lo; lo += low_hi; hi += lo < old_lo ? 1 : 0; - if ((hi & 0x1FF) == 0x1FF && lo == std::numeric_limits::max() && - low_lo + norm < low_lo) { + if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK && + lo == std::numeric_limits::max() && low_lo + norm < low_lo) { return false; } } uint64_t hi_msb = hi >> 63; - uint64_t x54 = hi >> (9 + hi_msb); - adj_e2 -= static_cast(1 - hi_msb); + // Extract 54 bits: 53 significand bits plus one rounding bit. The product + // may be shifted by one depending on whether hi already has its top bit set. + uint64_t x54 = hi >> (9 + hi_msb); + adj_e2 -= (int)(1 - hi_msb); - // half-way ambiguity, fallback - if (lo == 0 && (hi & 0x1FF) == 0 && (x54 & 3) == 1) { return false; } + // Exact halfway with round-to-even ambiguity; let strtod handle the rare tie. + if (lo == 0 && (hi & FASTP64_HALF_MASK) == 0 && (x54 & 3) == 1) { return false; } - // exponent overflow, fallback + // Round 54 -> 53 bits, carry into the exponent if rounding overflows. 
uint64_t x53 = (x54 + (x54 & 1)) >> 1; uint64_t overflow = x53 >> 53; uint64_t ret_man = (x53 >> overflow) & FASTP64_MANTISSA_MASK; - int ret_exp = adj_e2 + static_cast(overflow); - if (ret_exp <= 0 || ret_exp >= 0x7FF) { return false; } + int ret_exp = adj_e2 + (int)overflow; + if (ret_exp <= 0 || ret_exp >= FASTP64_EXPONENT_MASK) { return false; } - bits = (static_cast(ret_exp) << 52) | ret_man; + bits = ((uint64_t)ret_exp << 52) | ret_man; return true; } @@ -357,14 +366,14 @@ static inline double assemble_fp64(const ParsedDecimal& dec) if (dec.fast_eligible) { double small = 0.0; bool used_small = false; - if (dec.exp10 >= 0 && dec.exp10 < static_cast(small_integer_powers.size())) { + if (dec.exp10 >= 0 && dec.exp10 < (int)small_integer_powers.size()) { uint64_t limit = (uint64_t{1} << 53) / small_integer_powers[dec.exp10]; if (dec.mantissa <= limit) { - small = static_cast(dec.mantissa) * small_powers[dec.exp10]; + small = (double)dec.mantissa * small_powers[dec.exp10]; used_small = true; } } else if (dec.exp10 < 0 && dec.exp10 >= -22 && dec.mantissa < (uint64_t{1} << 53)) { - small = static_cast(dec.mantissa) / small_powers[-dec.exp10]; + small = (double)dec.mantissa / small_powers[-dec.exp10]; used_small = true; } if (used_small) { return dec.negative ? 
-small : small; } @@ -383,17 +392,12 @@ static inline double parse_fp64_advance(const char*& p, const char* end) const char* start = p; ParsedDecimal dec; if (!parse_decimal_advance(p, end, dec)) { - return fallback_strtod(std::string_view(start, static_cast(p - start))); + return fallback_strtod(std::string_view(start, (size_t)(p - start))); } double v = assemble_fp64(dec); if (v == v) return v; - return fallback_strtod(std::string_view(start, static_cast(p - start))); -} - -static inline double parse_fp64_token(const char* p, const char* end) -{ - return parse_fp64_advance(p, end); + return fallback_strtod(std::string_view(start, (size_t)(p - start))); } } // namespace fp64 diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp index bd4ee4669a..70ed3283c3 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp @@ -5,7 +5,6 @@ #include "fast_fp64_parser.hpp" -#include #include #include #include @@ -26,26 +25,6 @@ namespace mps_fast { -static inline void reset_number_parse_stats() {} -static inline void print_number_parse_stats() {} - -static inline bool is_digit_byte(char c) noexcept { return c >= '0' && c <= '9'; } - -static inline double fast_atof_core(const char*& data, const char* end) -{ - return fp64::parse_fp64_advance(data, end); -} - -static inline double fast_atof(const char* data, const char* end) -{ - return fast_atof_core(data, end); -} - -static inline double fast_atof_advance(const char*& ptr, const char* end) -{ - return fast_atof_core(ptr, end); -} - struct cursor_t { const char* start; const char* ptr; @@ -65,7 +44,7 @@ struct cursor_t { line_start = p + 1; } } - std::size_t column = static_cast(ptr - line_start) + 1; + std::size_t column = (std::size_t)(ptr - line_start) + 1; return {line, column}; } @@ -92,7 +71,7 @@ struct cursor_t { static const char* scalar_scan(const char* p, const 
char* end) { while (p < end) { - unsigned char c = static_cast(*p); + unsigned char c = (unsigned char)*p; if constexpr (skip_ws_mode) { if (c > 32 || c == '\n') return p; } else { @@ -171,6 +150,8 @@ struct cursor_t { const simde__m256i v32 = simde_mm256_set1_epi8(32); const simde__m256i vnl = simde_mm256_set1_epi8('\n'); + // Input buffers are padded by file_reader/lz4_file_reader/small_raw_read, + // so this unaligned 32-byte load is valid whenever end - ptr >= 32. simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr); simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32); @@ -210,16 +191,19 @@ struct cursor_t { inline __attribute__((always_inline)) std::pair read_two_fields() { - if (__unlikely(end - ptr < 32)) { + auto slow = [&] { auto f1 = read_field(); auto f2 = read_field(); - return {f1, f2}; - } + return std::pair{f1, f2}; + }; + + if (__unlikely(end - ptr < 32)) { return slow(); } const char* field1_start = ptr; const simde__m256i v32 = simde_mm256_set1_epi8(32); const simde__m256i vnl = simde_mm256_set1_epi8('\n'); + // Same padded-buffer contract as read_field(). 
simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr); simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); @@ -229,33 +213,17 @@ struct cursor_t { unsigned int nl_mask = (unsigned int)simde_mm256_movemask_epi8(is_nl); unsigned int stop_mask = printable_mask | nl_mask; - if (__unlikely(ws_mask == 0)) { - auto f1 = read_field(); - auto f2 = read_field(); - return {f1, f2}; - } + if (__unlikely(ws_mask == 0)) { return slow(); } int field1_end_off = __builtin_ctz(ws_mask); unsigned int after_field1 = stop_mask & ~((1u << field1_end_off) - 1); - if (__unlikely(after_field1 == 0)) { - auto f1 = read_field(); - auto f2 = read_field(); - return {f1, f2}; - } + if (__unlikely(after_field1 == 0)) { return slow(); } int field2_start_off = __builtin_ctz(after_field1); - if (__unlikely(ptr[field2_start_off] == '\n')) { - auto f1 = read_field(); - auto f2 = read_field(); - return {f1, f2}; - } + if (__unlikely(ptr[field2_start_off] == '\n')) { return slow(); } unsigned int ws_after_field2_start = ws_mask & ~((1u << field2_start_off) - 1); - if (__unlikely(ws_after_field2_start == 0)) { - auto f1 = read_field(); - auto f2 = read_field(); - return {f1, f2}; - } + if (__unlikely(ws_after_field2_start == 0)) { return slow(); } int field2_end_off = __builtin_ctz(ws_after_field2_start); unsigned int after_field2 = stop_mask & ~((1u << field2_end_off) - 1); @@ -274,7 +242,9 @@ struct cursor_t { static inline void expect(cursor_t& cursor, const char* field) { auto id = cursor.read_field(); - if (__unlikely(id != field)) { cursor.error("expected '%s', got '%s'", field, id.data()); } + if (__unlikely(id != field)) { + cursor.error("expected '%s', got '%.*s'", field, (int)id.size(), id.data()); + } } static inline void accept_comment_line(cursor_t& cursor) @@ -290,7 +260,10 @@ static inline void accept_comment_line(cursor_t& cursor) static inline void expect_eol(cursor_t& cursor) { - if (__unlikely(!cursor.eol())) { 
cursor.error("expected end of line, got '%s'", cursor.ptr); } + if (__unlikely(!cursor.eol())) { + auto got = cursor.peek_field(); + cursor.error("expected end of line, got '%.*s'", (int)got.size(), got.data()); + } for (;;) { while (cursor.eol()) { @@ -308,7 +281,8 @@ static inline void expect_eol(cursor_t& cursor) } if (__unlikely(cursor.done())) { return; } - if (__unlikely(!std::isalpha(static_cast(cursor.ptr[0])))) { + char c = cursor.ptr[0]; + if (__unlikely(!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))) { cursor.skip_ws(); if (cursor.eol()) { continue; } } @@ -336,19 +310,22 @@ static inline void expect_section(cursor_t& cursor, const char* section) static inline double expect_number(cursor_t& cursor) { auto num = cursor.read_field(); - if (num.empty()) { cursor.error("expected number, got '%s'", num.data()); } - return fast_atof(num.data(), num.data() + num.size()); + if (num.empty()) { cursor.error("expected number, got empty field"); } + const char* p = num.data(); + return fp64::parse_fp64_advance(p, p + num.size()); } static inline double expect_number_fast_pm_one(cursor_t& cursor) { const char* p = cursor.ptr; - if (p[0] == '-' && p[1] == '1' && p[2] <= ' ') { + // Kept bounded despite the global padding invariant: this path is also used + // on section-local cursors whose logical end may precede the physical buffer. + if (cursor.end - p >= 3 && p[0] == '-' && p[1] == '1' && p[2] <= ' ') { cursor.ptr = p + 2; cursor.skip_ws(); return -1.0; } - if (p[0] == '1' && p[1] <= ' ') { + if (cursor.end - p >= 2 && p[0] == '1' && p[1] <= ' ') { cursor.ptr = p + 1; cursor.skip_ws(); return 1.0; diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index 73f50c5341..de1b3ea84c 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -1,8 +1,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. 
All rights // reserved. SPDX-License-Identifier: Apache-2.0 -#define MPS_FAST_TIMERS - #include "fast_parser.hpp" #include "fast_parse_primitives.hpp" #include "file_reader.hpp" @@ -77,6 +75,48 @@ static int phase_thread_count(int phase_cap) return std::max(1, std::min(phase_cap, available_threads)); } +class chunk_name_arena_t { + public: + void reserve(size_t bytes) + { + if (bytes > next_slab_size_) { next_slab_size_ = bytes; } + } + + std::string_view copy(std::string_view name) + { + char* dst = allocate(name.size() + 1); + std::memcpy(dst, name.data(), name.size()); + dst[name.size()] = '\0'; + return std::string_view(dst, name.size()); + } + + private: + struct slab_t { + std::unique_ptr data; + size_t capacity = 0; + size_t used = 0; + }; + + char* allocate(size_t bytes) + { + if (slabs_.empty() || slabs_.back().used + bytes > slabs_.back().capacity) { + size_t capacity = std::max(bytes, next_slab_size_); + slab_t slab; + slab.data = std::make_unique(capacity); + slab.capacity = capacity; + slabs_.push_back(std::move(slab)); + next_slab_size_ = std::max(next_slab_size_ * 2, capacity); + } + slab_t& slab = slabs_.back(); + char* ptr = slab.data.get() + slab.used; + slab.used += bytes; + return ptr; + } + + std::vector slabs_; + size_t next_slab_size_ = 64 * 1024; +}; + static inline size_t row_hash_partition_for(uint32_t hash) { return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS)); @@ -89,8 +129,70 @@ static inline size_t row_hash_partition_for(uint32_t hash) struct TimerEntry { const char* name; double elapsed_ms; + size_t rss_kb; + size_t hwm_kb; + size_t compressed_bytes; }; +static std::atomic_size_t& get_timer_compressed_bytes() +{ + static std::atomic_size_t compressed_bytes{0}; + return compressed_bytes; +} + +class timer_io_context_t { + public: + explicit timer_io_context_t(size_t compressed_bytes) + : old_compressed_bytes_( + get_timer_compressed_bytes().exchange(compressed_bytes, std::memory_order_acq_rel)) + { + } + + ~timer_io_context_t() 
+ { + get_timer_compressed_bytes().store(old_compressed_bytes_, std::memory_order_release); + } + + timer_io_context_t(const timer_io_context_t&) = delete; + timer_io_context_t& operator=(const timer_io_context_t&) = delete; + + private: + size_t old_compressed_bytes_ = 0; +}; + +static size_t parse_status_kb_line(const char* line, const char* key) +{ + size_t key_len = std::strlen(key); + if (std::strncmp(line, key, key_len) != 0) { return 0; } + const char* p = line + key_len; + while (*p == ' ' || *p == '\t') { + ++p; + } + size_t value = 0; + while (*p >= '0' && *p <= '9') { + value = value * 10 + (size_t)(*p - '0'); + ++p; + } + return value; +} + +static std::pair current_process_rss_kb() +{ + FILE* file = std::fopen("/proc/self/status", "r"); + if (file == nullptr) { return {0, 0}; } + + size_t rss_kb = 0; + size_t hwm_kb = 0; + char line[256]; + while (std::fgets(line, sizeof(line), file) != nullptr) { + if (rss_kb == 0) { rss_kb = parse_status_kb_line(line, "VmRSS:"); } + if (hwm_kb == 0) { hwm_kb = parse_status_kb_line(line, "VmHWM:"); } + if (rss_kb != 0 && hwm_kb != 0) { break; } + } + std::fclose(file); + return {rss_kb, hwm_kb}; +} + static std::vector& get_timer_buffer() { static std::vector buffer; @@ -110,7 +212,13 @@ static void flush_timers() std::lock_guard lock(get_timer_mutex()); auto& buffer = get_timer_buffer(); for (const auto& entry : buffer) { - std::fprintf(stderr, "[TIMER] %s: %.3f ms\n", entry.name, entry.elapsed_ms); + std::fprintf(stderr, + "[TIMER] %s: %.3f ms rss_GB=%.3f hwm_GB=%.3f compressed_GB=%.3f\n", + entry.name, + entry.elapsed_ms, + (double)entry.rss_kb / (1024.0 * 1024.0), + (double)entry.hwm_kb / (1024.0 * 1024.0), + (double)entry.compressed_bytes / (1024.0 * 1024.0 * 1024.0)); } buffer.clear(); #endif @@ -189,8 +297,10 @@ class scoped_timer_t { double elapsed_ms = std::chrono::duration(end - start_).count(); nvtx_.end(); if (accumulator_) { *accumulator_ += elapsed_ms; } + auto [rss_kb, hwm_kb] = 
current_process_rss_kb(); + size_t compressed_bytes = get_timer_compressed_bytes().load(std::memory_order_acquire); std::lock_guard lock(get_timer_mutex()); - get_timer_buffer().push_back({name_, elapsed_ms}); + get_timer_buffer().push_back({name_, elapsed_ms, rss_kb, hwm_kb, compressed_bytes}); #endif } @@ -221,11 +331,6 @@ static inline void error_unknown_row(cursor_t& cursor, const char* row_start, co // Parsing state shared across section parsers // ============================================================================= -// Hash and equality for string_view keys in unordered_map -struct string_view_hash { - size_t operator()(std::string_view sv) const { return std::hash{}(sv); } -}; - static inline size_t next_power_of_2(size_t n) { if (n == 0) return 1; @@ -309,12 +414,14 @@ struct parse_state_t { // Temporary string_view storage (points into input buffer, no allocation) std::vector row_names_sv; std::vector var_names_sv; + std::vector var_name_arenas; std::string_view problem_name_sv; std::string_view objective_name_sv; std::vector ignored_objective_names_sv; // Optional dense ordered column index for labels like V0, V1, ... bool col_dense_ordered = false; + std::string col_dense_prefix_storage; std::string_view col_dense_prefix; uint64_t col_dense_min_id = 0; uint64_t col_dense_max_id = 0; @@ -329,7 +436,7 @@ struct parse_state_t { size_t row_hash_partition_count = 0; std::array row_hash_partitions = {}; // Overflow map for row names longer than HASH_KEY_BYTES - std::unordered_map row_names_long; + std::unordered_map row_names_long; // Optional dense ordered row index for labels like R0001, R0002, ... 
row_index_mode_t row_index_mode = row_index_mode_t::hash; @@ -342,7 +449,7 @@ struct parse_state_t { bool row_dense_zero_padded = false; // var_names still uses STL (only used in parse_bounds, not as hot) - std::unordered_map var_names_map; + std::unordered_map var_names_map; struct bounds_only_var_t { f_t lb = f_t{0}; @@ -524,7 +631,7 @@ struct parse_state_t { // Use mmap for allocation - the OS provides zero'd pages row_hash_region = mmap_region_t::anonymous( row_hash_mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, "row hash table"); - row_names_ht = static_cast(row_hash_region.data()); + row_names_ht = (hash_slot_var_t*)row_hash_region.data(); if (use_partitioned) { hash_slot_var_t* next_slots = row_names_ht; for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { @@ -1226,6 +1333,7 @@ struct ChunkResult { std::vector row_indices; std::vector col_offsets; std::vector var_names; + chunk_name_arena_t var_name_arena; std::vector markers; std::vector> objective_entries; // local_col_idx -> coefficient // Sparse per-row scratch: each touched 4096-row block stores counts after parsing, @@ -1475,6 +1583,7 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, result.row_indices.reserve(estimated_nnz); result.col_offsets.reserve(estimated_cols + 1); result.var_names.reserve(estimated_cols); + result.var_name_arena.reserve(std::max(4096, estimated_cols * 16)); result.objective_entries.reserve(estimated_cols); size_t n_row_blocks = ((size_t)state.problem.n_constraints_ + COLUMN_ROW_COUNT_BLOCK_ROWS - 1) / COLUMN_ROW_COUNT_BLOCK_ROWS; @@ -1532,25 +1641,25 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, sign = -1.0; cursor.advance(1); } - if (cursor.ptr + 1 < cursor.end && is_digit_byte(cursor.ptr[0]) && + if (cursor.ptr + 1 < cursor.end && fp64::is_digit(cursor.ptr[0]) && (cursor.ptr[1] == '\n' || cursor.ptr[1] == '\r')) { value = sign * (cursor.ptr[0] - '0'); cursor.advance(1); } else { - value = sign * fast_atof_advance(cursor.ptr, 
cursor.end); + value = sign * fp64::parse_fp64_advance(cursor.ptr, cursor.end); } // usually EOL directly follows if (__unlikely(!cursor.eol())) { cursor.skip_ws(); } accept_comment(cursor); - if (result.first_var_name.empty()) { result.first_var_name = var_name; } - result.last_var_name = var_name; - if (prev_var_name != var_name) { - result.var_names.push_back(var_name); - observe_dense_col_name(result.dense_col_stats, var_name); + std::string_view owned_var_name = result.var_name_arena.copy(var_name); + result.var_names.push_back(owned_var_name); + observe_dense_col_name(result.dense_col_stats, owned_var_name); result.col_offsets.push_back(result.values.size()); - prev_var_name = var_name; + prev_var_name = owned_var_name; + if (result.first_var_name.empty()) { result.first_var_name = owned_var_name; } + result.last_var_name = owned_var_name; } auto add_entry = [&](std::string_view rn, double val) { @@ -1579,7 +1688,7 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, expect_eol(cursor); continue; } - double value2 = fast_atof_advance(cursor.ptr, cursor.end); + double value2 = fp64::parse_fp64_advance(cursor.ptr, cursor.end); cursor.skip_ws(); accept_comment(cursor); @@ -1595,114 +1704,129 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, } // Fused merge + CSR construction: directly builds CSR from chunks without intermediate global CSC -template -static void merge_chunk_results_to_csr(parse_state_t& state, - std::vector& chunks, - int num_threads) -{ - scoped_timer_t timer("merge_chunks_to_csr"); - - int num_chunks = (int)chunks.size(); - if (num_chunks == 0) return; - - i_t n_rows = state.problem.n_constraints_; +template +struct column_merge_shape_t { + int num_chunks = 0; + i_t n_rows = 0; + std::vector global_col_offset; + size_t total_cols = 0; + size_t total_nnz = 0; +}; - std::vector global_col_offset(num_chunks + 1); - global_col_offset[0] = 0; - size_t total_nnz = 0; +template +static column_merge_shape_t 
compute_column_merge_shape(const std::vector& chunks, + i_t n_rows) +{ + column_merge_shape_t shape; + shape.num_chunks = (int)chunks.size(); + shape.n_rows = n_rows; + shape.global_col_offset.resize((size_t)shape.num_chunks + 1); { scoped_timer_t timer("columns_global_offsets"); - for (int t = 0; t < num_chunks; t++) { - global_col_offset[t + 1] = global_col_offset[t] + chunks[t].var_names.size(); - total_nnz += chunks[t].values.size(); + for (int t = 0; t < shape.num_chunks; t++) { + shape.global_col_offset[(size_t)t + 1] = + shape.global_col_offset[(size_t)t] + chunks[(size_t)t].var_names.size(); + shape.total_nnz += chunks[(size_t)t].values.size(); } } - size_t total_cols = global_col_offset[num_chunks]; + shape.total_cols = shape.global_col_offset[(size_t)shape.num_chunks]; if constexpr (std::numeric_limits::max() < std::numeric_limits::max()) { const size_t index_max = (size_t)std::numeric_limits::max(); - if (total_nnz > index_max) { + if (shape.total_nnz > index_max) { mps_parser_fail(error_type_t::RuntimeError, "fast MPS parser requires 64-bit indices: nnz=%zu exceeds index max=%zu", - total_nnz, + shape.total_nnz, index_max); } - if (total_cols > index_max || (size_t)n_rows > index_max) { + if (shape.total_cols > index_max || (size_t)n_rows > index_max) { mps_parser_fail(error_type_t::RuntimeError, "fast MPS parser requires 64-bit indices: rows=%zu cols=%zu exceed index " "max=%zu", (size_t)n_rows, - total_cols, + shape.total_cols, index_max); } } - { - scoped_timer_t timer("columns_dense_metadata"); - bool dense_ok = total_cols > 0; - bool have_first = false; - std::string_view dense_prefix; - uint64_t expected_next_id = 0; - uint64_t dense_min_id = 0; - uint64_t dense_max_id = 0; - size_t dense_pad_width = 0; - bool dense_zero_padded = false; - - for (int t = 0; t < num_chunks && dense_ok; ++t) { - const auto& stats = chunks[t].dense_col_stats; - if (stats.count == 0) { continue; } - if (!stats.candidate || stats.count != chunks[t].var_names.size()) { - 
dense_ok = false; - break; - } - if (!have_first) { - have_first = true; - dense_prefix = stats.prefix; - expected_next_id = stats.first_id; - dense_min_id = stats.first_id; - dense_pad_width = stats.pad_width; - dense_zero_padded = stats.zero_padded; - } - if (stats.prefix != dense_prefix || stats.first_id != expected_next_id || - !dense_col_chunk_padding_compatible(stats, dense_zero_padded, dense_pad_width)) { - dense_ok = false; - break; - } - if (stats.last_id < stats.first_id || stats.last_id - stats.first_id + 1 != stats.count) { - dense_ok = false; - break; - } - dense_max_id = stats.last_id; - if (stats.last_id == std::numeric_limits::max()) { - expected_next_id = stats.last_id; - dense_ok = false; - break; - } - expected_next_id = stats.last_id + 1; - } + return shape; +} - if (!have_first || dense_max_id < dense_min_id || - dense_max_id - dense_min_id + 1 != total_cols) { +template +static void detect_dense_column_metadata(parse_state_t& state, + const std::vector& chunks, + const column_merge_shape_t& shape) +{ + scoped_timer_t timer("columns_dense_metadata"); + bool dense_ok = shape.total_cols > 0; + bool have_first = false; + std::string_view dense_prefix; + uint64_t expected_next_id = 0; + uint64_t dense_min_id = 0; + uint64_t dense_max_id = 0; + size_t dense_pad_width = 0; + bool dense_zero_padded = false; + + for (int t = 0; t < shape.num_chunks && dense_ok; ++t) { + const auto& stats = chunks[(size_t)t].dense_col_stats; + if (stats.count == 0) { continue; } + if (!stats.candidate || stats.count != chunks[(size_t)t].var_names.size()) { + dense_ok = false; + break; + } + if (!have_first) { + have_first = true; + dense_prefix = stats.prefix; + expected_next_id = stats.first_id; + dense_min_id = stats.first_id; + dense_pad_width = stats.pad_width; + dense_zero_padded = stats.zero_padded; + } + if (stats.prefix != dense_prefix || stats.first_id != expected_next_id || + !dense_col_chunk_padding_compatible(stats, dense_zero_padded, dense_pad_width)) { 
dense_ok = false; + break; } - - state.col_dense_ordered = dense_ok; - if (dense_ok) { - state.col_dense_prefix = dense_prefix; - state.col_dense_min_id = dense_min_id; - state.col_dense_max_id = dense_max_id; - state.col_dense_pad_width = dense_pad_width; - state.col_dense_zero_padded = dense_zero_padded; + if (stats.last_id < stats.first_id || stats.last_id - stats.first_id + 1 != stats.count) { + dense_ok = false; + break; } + dense_max_id = stats.last_id; + if (stats.last_id == std::numeric_limits::max()) { + dense_ok = false; + break; + } + expected_next_id = stats.last_id + 1; + } + + if (!have_first || dense_max_id < dense_min_id || + dense_max_id - dense_min_id + 1 != shape.total_cols) { + dense_ok = false; } - // Step 2: Sum row counts (already computed during parsing) and build CSR row_offsets - std::vector global_row_counts((size_t)n_rows, 0); + state.col_dense_ordered = dense_ok; + if (dense_ok) { + state.col_dense_prefix_storage.assign(dense_prefix); + state.col_dense_prefix = state.col_dense_prefix_storage; + state.col_dense_min_id = dense_min_id; + state.col_dense_max_id = dense_max_id; + state.col_dense_pad_width = dense_pad_width; + state.col_dense_zero_padded = dense_zero_padded; + } +} + +template +static std::vector build_csr_row_offsets(parse_state_t& state, + const std::vector& chunks, + const column_merge_shape_t& shape) +{ + std::vector global_row_counts((size_t)shape.n_rows, 0); { scoped_timer_t timer("columns_sum_row_counts"); - for (int t = 0; t < num_chunks; t++) { - for (const auto& block : chunks[t].row_count_blocks) { - const int64_t* block_counts = chunks[t].row_count_storage.data() + block.storage_offset; - size_t row_base = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; - size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)n_rows - row_base); + for (int t = 0; t < shape.num_chunks; t++) { + for (const auto& block : chunks[(size_t)t].row_count_blocks) { + const int64_t* block_counts = + 
chunks[(size_t)t].row_count_storage.data() + block.storage_offset; + size_t row_base = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)shape.n_rows - row_base); for (size_t local = 0; local < block_limit; ++local) { global_row_counts[row_base + local] += (i_t)block_counts[local]; } @@ -1711,196 +1835,223 @@ static void merge_chunk_results_to_csr(parse_state_t& state, } { scoped_timer_t timer("columns_build_row_offsets"); - state.problem.A_offsets_.resize((size_t)n_rows + 1); + state.problem.A_offsets_.resize((size_t)shape.n_rows + 1); state.problem.A_offsets_[0] = 0; - for (i_t r = 0; r < n_rows; r++) { + for (i_t r = 0; r < shape.n_rows; r++) { state.problem.A_offsets_[(size_t)r + 1] = state.problem.A_offsets_[(size_t)r] + global_row_counts[(size_t)r]; } } + return global_row_counts; +} - { - scoped_timer_t timer("columns_counts_to_write_positions"); - std::fill(global_row_counts.begin(), global_row_counts.end(), i_t{0}); - for (int t = 0; t < num_chunks; t++) { - for (auto& block : chunks[t].row_count_blocks) { - int64_t* block_counts = chunks[t].row_count_storage.data() + block.storage_offset; - size_t row_base = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; - size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)n_rows - row_base); - for (size_t local = 0; local < block_limit; ++local) { - int64_t count = block_counts[local]; - if (count == 0) continue; - size_t row = row_base + local; - i_t pos = state.problem.A_offsets_[row] + global_row_counts[row]; - block_counts[local] = (int64_t)pos; - global_row_counts[row] += (i_t)count; - } +template +static void convert_counts_to_write_positions(std::vector& chunks, + const column_merge_shape_t& shape, + const std::vector& row_offsets, + std::vector& global_row_counts) +{ + scoped_timer_t timer("columns_counts_to_write_positions"); + std::fill(global_row_counts.begin(), global_row_counts.end(), i_t{0}); + for (int t = 0; t < 
shape.num_chunks; t++) { + for (auto& block : chunks[(size_t)t].row_count_blocks) { + int64_t* block_counts = chunks[(size_t)t].row_count_storage.data() + block.storage_offset; + size_t row_base = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t block_limit = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, (size_t)shape.n_rows - row_base); + for (size_t local = 0; local < block_limit; ++local) { + int64_t count = block_counts[local]; + if (count == 0) continue; + size_t row = row_base + local; + i_t pos = row_offsets[row] + global_row_counts[row]; + block_counts[local] = (int64_t)pos; + global_row_counts[row] += (i_t)count; } } } +} - { - scoped_timer_t timer("columns_row_count_storage_hugepages"); +static void materialize_chunk_row_count_storage(std::vector& chunks, int num_threads) +{ + scoped_timer_t timer("columns_row_count_storage_hugepages"); #pragma omp parallel for num_threads(num_threads) - for (int t = 0; t < num_chunks; ++t) { - materialize_vector_hugepages( - "column_row_count_storage", chunks[t].row_count_storage, materialize_touch_t::write_2mb); - } + for (int t = 0; t < (int)chunks.size(); ++t) { + materialize_vector_hugepages("column_row_count_storage", + chunks[(size_t)t].row_count_storage, + materialize_touch_t::write_2mb); } +} - // Step 6: Allocate CSR arrays - { - scoped_timer_t timer("allocate_csr_arrays"); - - // May be unexpectedly slow, even if already reserved() to good fit. - // I assume the cause is probably that the pages aren't actually backed when reserve() is called - // and the actual physical allocation only happens now - - // evil tweak until we can refactior problem_t - // run the zero-init resize() calls in parallel +template +static void allocate_column_outputs(parse_state_t& state, + const column_merge_shape_t& shape) +{ + scoped_timer_t timer("allocate_csr_arrays"); + // problem_t uses std::vector, so these resize() calls zero-initialize large arrays. 
+ // Running them in parallel hides part of that page-fault and initialization cost. #pragma omp parallel sections num_threads(4) - { + { #pragma omp section - { - state.problem.A_.resize(total_nnz); - } + { + state.problem.A_.resize(shape.total_nnz); + } #pragma omp section - { - state.problem.A_indices_.resize(total_nnz); - } + { + state.problem.A_indices_.resize(shape.total_nnz); + } #pragma omp section - { - if (!state.col_dense_ordered) { state.var_names_sv.resize(total_cols); } + { + if (!state.col_dense_ordered) { + state.var_name_arenas.clear(); + state.var_name_arenas.resize((size_t)shape.num_chunks); + state.var_names_sv.resize(shape.total_cols); } + } #pragma omp section - { - state.problem.var_types_.resize(total_cols); - } + { + state.problem.var_types_.resize(shape.total_cols); } } +} - // Step 6: Parallel scatter into CSR + copy var_names +template +static void scatter_column_chunks_to_csr(parse_state_t& state, + std::vector& chunks, + const column_merge_shape_t& shape, + int num_threads) +{ + scoped_timer_t timer("scatter_into_csr"); { - scoped_timer_t timer("scatter_into_csr"); - { - scoped_timer_t matrix_timer("scatter_matrix_entries"); + scoped_timer_t matrix_timer("scatter_matrix_entries"); #ifdef MPS_FAST_PERF_COUNTERS - std::vector perf_snapshots((size_t)num_chunks); + std::vector perf_snapshots((size_t)shape.num_chunks); #endif #pragma omp parallel for num_threads(num_threads) - for (int t = 0; t < num_chunks; t++) { + for (int t = 0; t < shape.num_chunks; t++) { #ifdef MPS_FAST_PERF_COUNTERS - thread_perf_counters_t perf_counters; + thread_perf_counters_t perf_counters; #endif - auto& chunk = chunks[t]; - - for (size_t local_col = 0; local_col < chunks[t].var_names.size(); local_col++) { - i_t global_col = (i_t)(global_col_offset[t] + local_col); - - size_t col_start = chunks[t].col_offsets[local_col]; - size_t col_end = chunks[t].col_offsets[local_col + 1]; - for (size_t idx = col_start; idx < col_end; idx++) { - i_t row = 
(i_t)chunks[t].row_indices[idx]; - size_t row_idx = (size_t)row; - size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; - size_t local = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; - int32_t block_pos = chunk.row_count_block_dir[block_id]; - RowCountBlock& block = chunk.row_count_blocks[(size_t)block_pos]; - int64_t& write_pos = chunk.row_count_storage[block.storage_offset + local]; - i_t dest = (i_t)write_pos++; - state.problem.A_[dest] = (f_t)chunks[t].values[idx]; - state.problem.A_indices_[dest] = global_col; - } + auto& chunk = chunks[(size_t)t]; + for (size_t local_col = 0; local_col < chunk.var_names.size(); local_col++) { + i_t global_col = (i_t)(shape.global_col_offset[(size_t)t] + local_col); + size_t col_start = chunk.col_offsets[local_col]; + size_t col_end = chunk.col_offsets[local_col + 1]; + for (size_t idx = col_start; idx < col_end; idx++) { + i_t row = (i_t)chunk.row_indices[idx]; + size_t row_idx = (size_t)row; + size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t local = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + int32_t block_pos = chunk.row_count_block_dir[block_id]; + RowCountBlock& block = chunk.row_count_blocks[(size_t)block_pos]; + int64_t& write_pos = chunk.row_count_storage[block.storage_offset + local]; + i_t dest = (i_t)write_pos++; + state.problem.A_[dest] = (f_t)chunk.values[idx]; + state.problem.A_indices_[dest] = global_col; } -#ifdef MPS_FAST_PERF_COUNTERS - perf_snapshots[(size_t)t] = perf_counters.stop(); -#endif } #ifdef MPS_FAST_PERF_COUNTERS - print_perf_totals("scatter_matrix_entries", perf_snapshots); + perf_snapshots[(size_t)t] = perf_counters.stop(); #endif } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("scatter_matrix_entries", perf_snapshots); +#endif + } - if (!state.col_dense_ordered) { - { - scoped_timer_t names_timer("scatter_var_names"); + if (!state.col_dense_ordered) { + scoped_timer_t names_timer("scatter_var_names"); #pragma omp parallel for num_threads(num_threads) - for 
(int t = 0; t < num_chunks; t++) { - for (size_t i = 0; i < chunks[t].var_names.size(); i++) { - state.var_names_sv[global_col_offset[t] + i] = chunks[t].var_names[i]; - } - } + for (int t = 0; t < shape.num_chunks; t++) { + chunk_name_arena_t& arena = state.var_name_arenas[(size_t)t]; + arena.reserve(std::max(4096, chunks[(size_t)t].var_names.size() * 16)); + for (size_t i = 0; i < chunks[(size_t)t].var_names.size(); i++) { + state.var_names_sv[shape.global_col_offset[(size_t)t] + i] = + arena.copy(chunks[(size_t)t].var_names[i]); } - } else { - scoped_timer_t names_timer("scatter_var_names"); } + } else { + scoped_timer_t names_timer("scatter_var_names"); } +} - // Step 7: Apply integer markers - struct GlobalMarker { - MarkerInfo::Type type; - size_t global_var_idx; - }; - { - scoped_timer_t timer("columns_apply_markers"); - std::vector all_markers; - - for (int t = 0; t < num_chunks; t++) { - for (const auto& m : chunks[t].markers) { - GlobalMarker gm; - gm.type = m.type; - - if (m.after_local_var_idx == SIZE_MAX) { - // Marker before any variable in this chunk - gm.global_var_idx = (global_col_offset[t] > 0) ? 
global_col_offset[t] - 1 : SIZE_MAX; - } else { - gm.global_var_idx = global_col_offset[t] + m.after_local_var_idx; - } - all_markers.push_back(gm); - } - } - - std::sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) { - // SIZE_MAX means "before all variables" - should sort first - if (a.global_var_idx == SIZE_MAX && b.global_var_idx != SIZE_MAX) return true; - if (b.global_var_idx == SIZE_MAX && a.global_var_idx != SIZE_MAX) return false; - return a.global_var_idx < b.global_var_idx; - }); +struct global_marker_t { + MarkerInfo::Type type; + size_t global_var_idx; +}; - bool is_integer = false; - size_t marker_idx = 0; +template +static void apply_column_integer_markers(parse_state_t& state, + const std::vector& chunks, + const column_merge_shape_t& shape) +{ + scoped_timer_t timer("columns_apply_markers"); + std::vector all_markers; + for (int t = 0; t < shape.num_chunks; t++) { + for (const auto& m : chunks[(size_t)t].markers) { + global_marker_t gm; + gm.type = m.type; + gm.global_var_idx = + m.after_local_var_idx == SIZE_MAX + ? (shape.global_col_offset[(size_t)t] > 0 ? shape.global_col_offset[(size_t)t] - 1 + : SIZE_MAX) + : shape.global_col_offset[(size_t)t] + m.after_local_var_idx; + all_markers.push_back(gm); + } + } + + std::sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) { + if (a.global_var_idx == SIZE_MAX && b.global_var_idx != SIZE_MAX) return true; + if (b.global_var_idx == SIZE_MAX && a.global_var_idx != SIZE_MAX) return false; + return a.global_var_idx < b.global_var_idx; + }); - for (size_t v = 0; v < total_cols; v++) { - while (marker_idx < all_markers.size() && - (all_markers[marker_idx].global_var_idx == SIZE_MAX || - all_markers[marker_idx].global_var_idx < v)) { - if (all_markers[marker_idx].type == MarkerInfo::INTORG) { - is_integer = true; - } else { - is_integer = false; - } - marker_idx++; - } - state.problem.var_types_[v] = is_integer ? 
'I' : 'C'; + bool is_integer = false; + size_t marker_idx = 0; + for (size_t v = 0; v < shape.total_cols; v++) { + while (marker_idx < all_markers.size() && (all_markers[marker_idx].global_var_idx == SIZE_MAX || + all_markers[marker_idx].global_var_idx < v)) { + is_integer = all_markers[marker_idx].type == MarkerInfo::INTORG; + marker_idx++; } + state.problem.var_types_[v] = is_integer ? 'I' : 'C'; } +} - // Step 8: Handle objective entries - { - scoped_timer_t timer("columns_objective_entries"); - state.problem.c_.resize(total_cols, f_t{0}); - for (int t = 0; t < num_chunks; t++) { - for (const auto& [local_col, coeff] : chunks[t].objective_entries) { - size_t global_col = global_col_offset[t] + local_col; - if (global_col < total_cols) { state.problem.c_[global_col] = (f_t)coeff; } - } +template +static void assign_column_objective_entries(parse_state_t& state, + const std::vector& chunks, + const column_merge_shape_t& shape) +{ + scoped_timer_t timer("columns_objective_entries"); + state.problem.c_.resize(shape.total_cols, f_t{0}); + for (int t = 0; t < shape.num_chunks; t++) { + for (const auto& [local_col, coeff] : chunks[(size_t)t].objective_entries) { + size_t global_col = shape.global_col_offset[(size_t)t] + local_col; + if (global_col < shape.total_cols) { state.problem.c_[global_col] = (f_t)coeff; } } } +} - // Store final dimensions; CSR and objective coefficients are already complete. 
- state.problem.n_vars_ = (i_t)total_cols; - state.problem.nnz_ = (i_t)total_nnz; +template +static void merge_chunk_results_to_csr(parse_state_t& state, + std::vector& chunks, + int num_threads) +{ + scoped_timer_t timer("merge_chunks_to_csr"); + if (chunks.empty()) return; + + auto shape = compute_column_merge_shape(chunks, state.problem.n_constraints_); + detect_dense_column_metadata(state, chunks, shape); + auto global_row_counts = build_csr_row_offsets(state, chunks, shape); + convert_counts_to_write_positions(chunks, shape, state.problem.A_offsets_, global_row_counts); + materialize_chunk_row_count_storage(chunks, num_threads); + allocate_column_outputs(state, shape); + scatter_column_chunks_to_csr(state, chunks, shape, num_threads); + apply_column_integer_markers(state, chunks, shape); + assign_column_objective_entries(state, chunks, shape); + + state.problem.n_vars_ = (i_t)shape.total_cols; + state.problem.nnz_ = (i_t)shape.total_nnz; } template @@ -1931,20 +2082,28 @@ static void parse_columns_section_parallel(parse_state_t& state, #ifdef MPS_FAST_PERF_COUNTERS std::vector perf_snapshots((size_t)num_threads); #endif + std::exception_ptr first_error = nullptr; + std::mutex error_mutex; { #pragma omp parallel for num_threads(num_threads) for (int t = 0; t < num_threads; t++) { - MPS_NVTX_RANGE(std::string("columns_chunk ") + std::to_string(t), nvtx::colors::columns); + try { + MPS_NVTX_RANGE(std::string("columns_chunk ") + std::to_string(t), nvtx::colors::columns); #ifdef MPS_FAST_PERF_COUNTERS - thread_perf_counters_t perf_counters; + thread_perf_counters_t perf_counters; #endif - results[t] = - parse_columns_chunk(chunk_bounds[t].start, chunk_bounds[t].end, state); + results[t] = + parse_columns_chunk(chunk_bounds[t].start, chunk_bounds[t].end, state); #ifdef MPS_FAST_PERF_COUNTERS - perf_snapshots[(size_t)t] = perf_counters.stop(); + perf_snapshots[(size_t)t] = perf_counters.stop(); #endif + } catch (...) 
{ + std::lock_guard lock(error_mutex); + if (!first_error) { first_error = std::current_exception(); } + } } } + if (first_error) { std::rethrow_exception(first_error); } #ifdef MPS_FAST_PERF_COUNTERS print_perf_totals("parse_columns_chunk_parallel", perf_snapshots); #endif @@ -2016,6 +2175,74 @@ static void parse_rhs_section(parse_state_t& state, cursor_t& cursor) } } +static size_t find_var_after_hint(const std::vector& var_names, + std::string_view var_name, + size_t hint_idx) +{ + const size_t n_vars = var_names.size(); + if (hint_idx + 1 < n_vars && var_names[hint_idx + 1] == var_name) { return hint_idx + 1; } + if (hint_idx < n_vars && var_names[hint_idx] == var_name) { return hint_idx; } + + const size_t first_begin = std::min(hint_idx + 2, n_vars); + for (size_t i = first_begin; i < n_vars; ++i) { + if (var_names[i] == var_name) { return i; } + } + for (size_t i = 0; i < hint_idx && i < n_vars; ++i) { + if (var_names[i] == var_name) { return i; } + } + return SIZE_MAX; +} + +template +static bool apply_bound_record(std::string_view bound_type, + f_t value, + bool has_value, + bool first_bound_for_var, + SetLb&& set_lb, + SetUb&& set_ub, + SetType&& set_type, + Error&& error) +{ + if (bound_type == "LO") { + set_lb(value); + } else if (bound_type == "UP") { + set_ub(value); + if (first_bound_for_var && value < f_t{0}) { set_lb(-std::numeric_limits::infinity()); } + } else if (bound_type == "FX") { + set_lb(value); + set_ub(value); + } else if (bound_type == "FR") { + set_lb(-std::numeric_limits::infinity()); + set_ub(std::numeric_limits::infinity()); + } else if (bound_type == "MI") { + set_lb(-std::numeric_limits::infinity()); + } else if (bound_type == "PL") { + set_ub(std::numeric_limits::infinity()); + } else if (bound_type == "BV") { + set_lb(f_t{0}); + set_ub(f_t{1}); + set_type('I'); + } else if (bound_type == "LI") { + set_lb(value); + set_type('I'); + } else if (bound_type == "UI") { + set_ub(value); + if (first_bound_for_var && value < f_t{0}) { 
set_lb(-std::numeric_limits::infinity()); } + set_type('I'); + } else if (bound_type == "SC") { + if (__unlikely(!has_value)) { + error("SC bound requires an upper bound value", bound_type); + return false; + } + set_ub(value); + set_type('S'); + } else { + error("unknown bound type", bound_type); + return false; + } + return true; +} + template static bool parse_bounds_section_parallel_dense(parse_state_t& state, cursor_t& cursor, @@ -2042,8 +2269,6 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, size_t min_var = SIZE_MAX; size_t max_var = 0; size_t decreasing_order = 0; - bool saw_integer_type = false; - bool saw_negative_upper = false; const char* error_ptr = nullptr; char error_msg[192] = {}; }; @@ -2073,23 +2298,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, size_t hint_idx = 0; auto lookup_var = [&](std::string_view var_name) { if (use_dense_lookup) { return state.col_lookup_dense_ordered(var_name); } - if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) { - return hint_idx + 1; - } - if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) { return hint_idx; } - - size_t search_start = hint_idx + 2; - size_t search_end = n_vars; - search_loop: - for (size_t i = search_start; i < search_end; ++i) { - if (state.var_names_sv[i] == var_name) { return i; } - } - if (search_start != 0) { - search_end = hint_idx; - search_start = 0; - goto search_loop; - } - return SIZE_MAX; + return find_var_after_hint(state.var_names_sv, var_name, hint_idx); }; try { while (cursor.ptr < cursor.end) { @@ -2144,57 +2353,30 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, accept_comment(cursor); } - if (bound_type == "LO") { - state.problem.variable_lower_bounds_[var_idx] = value; - } else if (bound_type == "UP") { - state.problem.variable_upper_bounds_[var_idx] = value; - if (first_bound_for_var && value < f_t{0}) { - state.problem.variable_lower_bounds_[var_idx] = 
-std::numeric_limits::infinity(); - local.saw_negative_upper = true; - } - } else if (bound_type == "FX") { - state.problem.variable_lower_bounds_[var_idx] = value; - state.problem.variable_upper_bounds_[var_idx] = value; - } else if (bound_type == "FR") { - state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); - state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits::infinity(); - } else if (bound_type == "MI") { - state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); - } else if (bound_type == "PL") { - state.problem.variable_upper_bounds_[var_idx] = std::numeric_limits::infinity(); - } else if (bound_type == "BV") { - state.problem.variable_lower_bounds_[var_idx] = 0; - state.problem.variable_upper_bounds_[var_idx] = 1; - state.problem.var_types_[var_idx] = 'I'; - local.saw_integer_type = true; - } else if (bound_type == "LI") { - state.problem.variable_lower_bounds_[var_idx] = value; - state.problem.var_types_[var_idx] = 'I'; - local.saw_integer_type = true; - } else if (bound_type == "UI") { - state.problem.variable_upper_bounds_[var_idx] = value; - if (first_bound_for_var && value < f_t{0}) { - state.problem.variable_lower_bounds_[var_idx] = -std::numeric_limits::infinity(); - local.saw_negative_upper = true; - } - state.problem.var_types_[var_idx] = 'I'; - local.saw_integer_type = true; - } else if (bound_type == "SC") { - if (__unlikely(!has_value)) { - std::snprintf( - local.error_msg, sizeof(local.error_msg), "SC bound requires an upper bound value"); - local.error_ptr = cursor.ptr; - break; + auto set_lb = [&](f_t x) { state.problem.variable_lower_bounds_[var_idx] = x; }; + auto set_ub = [&](f_t x) { state.problem.variable_upper_bounds_[var_idx] = x; }; + auto set_type = [&](char t) { state.problem.var_types_[var_idx] = t; }; + auto set_error = [&](const char* msg, std::string_view type) { + if (type.empty() || std::strcmp(msg, "unknown bound type") != 0) { + 
std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", msg); + } else { + std::snprintf(local.error_msg, + sizeof(local.error_msg), + "%s: %.*s", + msg, + (int)type.size(), + type.data()); } - state.problem.variable_upper_bounds_[var_idx] = value; - state.problem.var_types_[var_idx] = 'S'; - } else { - std::snprintf(local.error_msg, - sizeof(local.error_msg), - "unknown bound type: %.*s", - (int)bound_type.size(), - bound_type.data()); local.error_ptr = cursor.ptr; + }; + if (!apply_bound_record(bound_type, + value, + has_value, + first_bound_for_var, + set_lb, + set_ub, + set_type, + set_error)) { break; } @@ -2353,29 +2535,10 @@ static void parse_bounds_section(parse_state_t& state, if (__likely(state.col_dense_ordered)) { var_idx = state.col_lookup_dense_ordered(var_name); if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; } - } else if (hint_idx + 1 < n_vars && state.var_names_sv[hint_idx + 1] == var_name) { - var_idx = hint_idx + 1; - } else if (hint_idx < n_vars && state.var_names_sv[hint_idx] == var_name) { - var_idx = hint_idx; } else { - size_t search_start = hint_idx + 2; - size_t search_end = n_vars; - - search_loop: - for (size_t i = search_start; i < search_end; ++i) { - if (state.var_names_sv[i] == var_name) { - var_idx = i; - goto found; - } - } - if (search_start != 0) { - search_end = hint_idx; - search_start = 0; - goto search_loop; - } - aux_var = &state.bounds_only_vars[var_name]; + var_idx = find_var_after_hint(state.var_names_sv, var_name, hint_idx); + if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; } } - found: if (var_idx != SIZE_MAX) { hint_idx = var_idx; } bool first_bound_for_var = aux_var == nullptr && !has_bound(var_idx); @@ -2383,15 +2546,8 @@ static void parse_bounds_section(parse_state_t& state, bool has_value = false; accept_comment(cursor); if (!cursor.eol()) { - // bounds are often just set to 0 or 1 - if (false && isdigit(cursor.ptr[0]) && cursor.ptr[1] == '\n' && cursor.ptr[2] 
== ' ') { - value = cursor.ptr[0] - '0'; - cursor.ptr += 1; - has_value = true; - } else { - value = (f_t)expect_number(cursor); - has_value = true; - } + value = (f_t)expect_number(cursor); + has_value = true; accept_comment(cursor); } @@ -2417,43 +2573,14 @@ static void parse_bounds_section(parse_state_t& state, } }; - if (bound_type == "LO") { - set_lb(value); - } else if (bound_type == "UP") { - set_ub(value); - if (first_bound_for_var && value < f_t{0}) { - set_lb(-std::numeric_limits::infinity()); - } - } else if (bound_type == "FX") { - set_lb(value); - set_ub(value); - } else if (bound_type == "FR") { - set_lb(-std::numeric_limits::infinity()); - set_ub(std::numeric_limits::infinity()); - } else if (bound_type == "MI") { - set_lb(-std::numeric_limits::infinity()); - } else if (bound_type == "PL") { - set_ub(std::numeric_limits::infinity()); - } else if (bound_type == "BV") { - set_lb(0); - set_ub(1); - set_type('I'); - } else if (bound_type == "LI") { - set_lb(value); - set_type('I'); - } else if (bound_type == "UI") { - set_ub(value); - if (first_bound_for_var && value < f_t{0}) { - set_lb(-std::numeric_limits::infinity()); + auto set_error = [&](const char* msg, std::string_view type) { + if (std::strcmp(msg, "unknown bound type") == 0) { + cursor.error("%s: %.*s", msg, (int)type.size(), type.data()); } - set_type('I'); - } else if (bound_type == "SC") { - if (__unlikely(!has_value)) { cursor.error("SC bound requires an upper bound value"); } - set_ub(value); - set_type('S'); - } else { - cursor.error("unknown bound type: %.*s", (int)bound_type.size(), bound_type.data()); - } + cursor.error("%s", msg); + }; + (void)apply_bound_record( + bound_type, value, has_value, first_bound_for_var, set_lb, set_ub, set_type, set_error); if (aux_var == nullptr) { mark_bound(var_idx); } expect_eol(cursor); @@ -2831,15 +2958,10 @@ static void append_bounds_only_variables(parse_state_t& state) state.problem.n_vars_ = (i_t)state.problem.var_names_.size(); } -template 
-static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_stream( - Stream& stream, const char* total_timer_name, const char* producer_task_name) +template +static std::size_t init_problem_storage( + cuopt::linear_programming::io::mps_data_model_t& problem, std::size_t reserve_hint) { - auto total_timer = std::make_unique(total_timer_name); - omp_set_max_active_levels(2); - - input_stream_view_t input = stream.view(); - cuopt::linear_programming::io::mps_data_model_t problem; problem.n_vars_ = 0; problem.n_constraints_ = 0; problem.nnz_ = 0; @@ -2847,7 +2969,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ problem.objective_scaling_factor_ = f_t{1}; problem.objective_offset_ = f_t{0}; - std::size_t reserve_size = std::max(stream.reserve_size_hint(), 1024 * 1024); + std::size_t reserve_size = std::max(reserve_hint, 1024 * 1024); std::size_t reserve_dim = std::max((size_t)1000, reserve_size / 1000); problem.A_offsets_.reserve(reserve_dim); problem.b_.reserve(reserve_dim); @@ -2859,6 +2981,31 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ problem.var_names_.reserve(reserve_dim); problem.constraint_lower_bounds_.reserve(reserve_dim); problem.constraint_upper_bounds_.reserve(reserve_dim); + return reserve_dim; +} + +static const char* trailing_endata_cursor_end(mps_phase_registry_t& registry) +{ + mps_phase_range_t quadratic = registry.range(mps_phase_kind::quadratic); + if (quadratic.present) { return quadratic.end; } + mps_phase_range_t bounds = registry.range(mps_phase_kind::bounds); + if (bounds.present) { return bounds.end; } + mps_phase_range_t ranges = registry.range(mps_phase_kind::ranges); + if (ranges.present) { return ranges.end; } + return registry.range(mps_phase_kind::rhs).end; +} + +template +static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_stream( + Stream& stream, const char* total_timer_name, const char* producer_task_name) +{ + omp_set_max_active_levels(2); + + 
input_stream_view_t input = stream.view(); + timer_io_context_t timer_io_context(input.compressed_size); + auto total_timer = std::make_unique(total_timer_name); + cuopt::linear_programming::io::mps_data_model_t problem; + std::size_t reserve_dim = init_problem_storage(problem, stream.reserve_size_hint()); cursor_t cursor(input.data, 0); parse_state_t state(problem, cursor); @@ -2949,6 +3096,9 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ input.registry->attach_event(mps_phase_kind::quadratic, ev_quadratic); } + // We intentionally keep LZ4/raw input as a stable full-buffer producer here. The + // progressive decoded-page lifetime prototype saved RSS, but made COLUMNS/merge slower + // and really wants a separate memory-limited parser pipeline instead of this fast path. #pragma omp task { MPS_NVTX_RANGE(producer_task_name, nvtx::colors::io); @@ -2978,7 +3128,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ }); } -#pragma omp task depend(in : columns_ready, rows_done) depend(out : columns_done) +#pragma omp task depend(in : rows_done, columns_ready) depend(out : columns_done) { run_parser_task([&] { MPS_NVTX_RANGE("task_columns", nvtx::colors::columns); @@ -3042,13 +3192,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ append_bounds_only_variables(state); input.size = stream.size(); - cursor.ptr = input.registry->range(mps_phase_kind::quadratic).present - ? input.registry->range(mps_phase_kind::quadratic).end - : (input.registry->range(mps_phase_kind::bounds).present - ? input.registry->range(mps_phase_kind::bounds).end - : (input.registry->range(mps_phase_kind::ranges).present - ? 
input.registry->range(mps_phase_kind::ranges).end - : input.registry->range(mps_phase_kind::rhs).end)); + cursor.ptr = trailing_endata_cursor_end(*input.registry); cursor.end = input.data + input.size; if (!cursor.done()) { expect(cursor, "ENDATA"); } @@ -3060,6 +3204,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ struct small_raw_read_t { bool use_small_path = false; std::vector buffer; + std::size_t size = 0; }; static small_raw_read_t try_read_small_raw_file(const std::string& path) @@ -3081,54 +3226,39 @@ static small_raw_read_t try_read_small_raw_file(const std::string& path) mps_parser_fail( error_type_t::RuntimeError, "Failed to determine raw MPS file size '%s'", path.c_str()); } - std::size_t file_size = static_cast(file_size_long); + std::size_t file_size = (std::size_t)file_size_long; if (file_size > MPS_SMALL_RAW_FILE_BYTES) { return {}; } if (std::fseek(file, 0, SEEK_SET) != 0) { mps_parser_fail(error_type_t::RuntimeError, "Failed to rewind raw MPS file '%s'", path.c_str()); } - std::vector buffer(file_size); + if (file_size > std::numeric_limits::max() - input_buffer_padding_bytes) { + mps_parser_fail(error_type_t::OutOfMemoryError, "small raw input padding size overflow"); + } + std::vector buffer(file_size + input_buffer_padding_bytes); if (file_size != 0 && std::fread(buffer.data(), 1, file_size, file) != file_size) { mps_parser_fail(error_type_t::RuntimeError, "Failed to read raw MPS file '%s'", path.c_str()); } - return {true, std::move(buffer)}; + return {true, std::move(buffer), file_size}; } template static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_small_raw_file( - std::vector buffer) + std::vector buffer, std::size_t input_size) { auto total_timer = std::make_unique("parse_mps_fast_file_raw_small (total)"); const char* data = buffer.data(); - const char* end = data + buffer.size(); + const char* end = data + input_size; mps_phase_registry_t registry; mps_section_block_scanner_t scanner(data, 
1, registry); scanner.observe_block(0, data, end); - scanner.publish_ready(buffer.size()); + scanner.publish_ready(input_size); cuopt::linear_programming::io::mps_data_model_t problem; - problem.n_vars_ = 0; - problem.n_constraints_ = 0; - problem.nnz_ = 0; - problem.maximize_ = false; - problem.objective_scaling_factor_ = f_t{1}; - problem.objective_offset_ = f_t{0}; - - std::size_t reserve_size = std::max(buffer.size(), 1024 * 1024); - std::size_t reserve_dim = std::max((size_t)1000, reserve_size / 1000); - problem.A_offsets_.reserve(reserve_dim); - problem.b_.reserve(reserve_dim); - problem.variable_lower_bounds_.reserve(reserve_dim); - problem.variable_upper_bounds_.reserve(reserve_dim); - problem.var_types_.reserve(reserve_dim); - problem.row_types_.reserve(reserve_dim); - problem.row_names_.reserve(reserve_dim); - problem.var_names_.reserve(reserve_dim); - problem.constraint_lower_bounds_.reserve(reserve_dim); - problem.constraint_upper_bounds_.reserve(reserve_dim); + std::size_t reserve_dim = init_problem_storage(problem, input_size); - cursor_t cursor(data, buffer.size()); + cursor_t cursor(data, input_size); parse_state_t state(problem, cursor); state.row_names_sv.reserve(reserve_dim); @@ -3142,13 +3272,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ parse_quadratic_range(state, registry.range(mps_phase_kind::quadratic), data); append_bounds_only_variables(state); - cursor.ptr = registry.range(mps_phase_kind::quadratic).present - ? registry.range(mps_phase_kind::quadratic).end - : (registry.range(mps_phase_kind::bounds).present - ? registry.range(mps_phase_kind::bounds).end - : (registry.range(mps_phase_kind::ranges).present - ? 
registry.range(mps_phase_kind::ranges).end - : registry.range(mps_phase_kind::rhs).end)); + cursor.ptr = trailing_endata_cursor_end(registry); cursor.end = end; if (!cursor.done()) { expect(cursor, "ENDATA"); } @@ -3170,7 +3294,7 @@ cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( if (effective_method == FileReadMethod::Read) { small_raw_read_t small_raw = try_read_small_raw_file(path); if (small_raw.use_small_path) { - return parse_mps_fast_small_raw_file(std::move(small_raw.buffer)); + return parse_mps_fast_small_raw_file(std::move(small_raw.buffer), small_raw.size); } RawInputStream stream(path); return parse_mps_fast_stream( diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index 08521eafc0..dc9ae86abc 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -30,13 +32,11 @@ using cuopt::linear_programming::io::error_type_t; using cuopt::linear_programming::io::mps_parser_expects; using cuopt::linear_programming::io::mps_parser_fail; -char* string_buffer; -char* string_buffer_ptr; - namespace { -constexpr std::size_t raw_input_window_bytes = 64ull * 1024ull * 1024ull; -constexpr std::size_t raw_input_max_read_threads = 8; +constexpr std::size_t raw_input_window_bytes = 64ull * 1024ull * 1024ull; +constexpr std::size_t raw_input_max_read_threads = 8; +constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull; bool path_has_suffix(const std::string& path, const char* suffix) noexcept { @@ -45,28 +45,6 @@ bool path_has_suffix(const std::string& path, const char* suffix) noexcept path.compare(path.size() - suffix_len, suffix_len, suffix) == 0; } -} // namespace - -namespace { - -class FileDescriptor { - public: - explicit FileDescriptor(int fd) : fd_(fd) {} - ~FileDescriptor() - { - if (fd_ >= 
0) { ::close(fd_); } - } - - FileDescriptor(const FileDescriptor&) = delete; - FileDescriptor& operator=(const FileDescriptor&) = delete; - - int get() const noexcept { return fd_; } - bool valid() const noexcept { return fd_ >= 0; } - - private: - int fd_; -}; - std::size_t get_file_size(int fd, const std::string& path) { struct stat st; @@ -76,14 +54,14 @@ std::size_t get_file_size(int fd, const std::string& path) path.c_str(), std::strerror(errno)); } - return static_cast(st.st_size); + return (std::size_t)st.st_size; } std::size_t system_page_size() { static std::size_t page_size = [] { long value = ::sysconf(_SC_PAGESIZE); - return value > 0 ? static_cast(value) : static_cast(4096); + return value > 0 ? (std::size_t)value : (std::size_t)4096; }(); return page_size; } @@ -100,25 +78,47 @@ std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) return value + increment; } +std::size_t add_input_padding(std::size_t size) +{ + if (size > std::numeric_limits::max() - input_buffer_padding_bytes) { + mps_parser_fail(error_type_t::OutOfMemoryError, "input padding size overflow"); + } + return size + input_buffer_padding_bytes; +} + } // namespace RawInputStream::RawInputStream(const std::string& path) : path_(path) { MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io); - fd_ = ::open(path.c_str(), O_RDONLY); - if (fd_ < 0) { + buffered_fd_ = ::open(path.c_str(), O_RDONLY); + if (buffered_fd_ < 0) { mps_parser_fail(error_type_t::RuntimeError, "Failed to open raw MPS file '%s': %s", path.c_str(), std::strerror(errno)); } - file_size_ = get_file_size(fd_, path); + file_size_ = get_file_size(buffered_fd_, path); + fd_ = buffered_fd_; + bool use_direct_io = file_size_ > raw_input_direct_io_threshold_bytes; + if (const char* raw_direct = std::getenv("MPS_FAST_RAW_DIRECT_IO")) { + use_direct_io = raw_direct[0] != '0'; + } + if (use_direct_io) { +#ifdef O_DIRECT + int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT); + if (direct_fd >= 0) { + fd_ = 
direct_fd; + direct_io_ = true; + } +#endif + } window_bytes_ = raw_input_window_bytes; window_count_ = std::max(1, (file_size_ + window_bytes_ - 1) / window_bytes_); - output_mapped_size_ = - round_up_to_multiple(std::max(file_size_, 1), system_page_size()); + output_mapped_size_ = round_up_to_multiple( + std::max(add_input_padding(file_size_), 1), system_page_size()); output_region_ = mmap_region_t::anonymous( output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer"); output_data_ = output_region_.char_data(); @@ -133,6 +133,7 @@ RawInputStream::RawInputStream(const std::string& path) : path_(path) RawInputStream::~RawInputStream() { if (fd_ >= 0) { ::close(fd_); } + if (buffered_fd_ >= 0 && buffered_fd_ != fd_) { ::close(buffered_fd_); } } const char* RawInputStream::data() const noexcept { return output_data_; } @@ -156,7 +157,7 @@ void RawInputStream::run_decode_tasks() } std::size_t hw_threads = - std::max(1, static_cast(std::thread::hardware_concurrency())); + std::max(1, (std::size_t)std::thread::hardware_concurrency()); std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads); thread_count = std::max(1, std::min(thread_count, window_count_)); @@ -181,10 +182,19 @@ void RawInputStream::run_decode_tasks() { MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io); while (done < size) { - ssize_t got = ::pread( - fd_, output_data_ + offset + done, size - done, static_cast(offset + done)); + ssize_t got = + ::pread(fd_, output_data_ + offset + done, size - done, (off_t)(offset + done)); if (got < 0) { if (errno == EINTR) { continue; } + if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0) { + got = ::pread( + buffered_fd_, output_data_ + offset + done, size - done, (off_t)(offset + done)); + if (got >= 0) { + done += (std::size_t)got; + continue; + } + if (errno == EINTR) { continue; } + } mps_parser_fail(error_type_t::RuntimeError, "Failed to pread raw MPS file '%s': %s", path_.c_str(), @@ -195,7 +205,7 @@ void 
RawInputStream::run_decode_tasks() "Unexpected EOF while reading raw MPS file '%s'", path_.c_str()); } - done += static_cast(got); + done += (std::size_t)got; } } @@ -249,10 +259,11 @@ bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffi void drop_file_cache(const std::string& path) { MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io); - FileDescriptor fd(::open(path.c_str(), O_RDONLY)); - if (!fd.valid()) { return; } + int fd = ::open(path.c_str(), O_RDONLY); + if (fd < 0) { return; } - ::posix_fadvise(fd.get(), 0, 0, POSIX_FADV_DONTNEED); + ::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); + ::close(fd); } FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method) diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp index cc603e35d8..bab63c76cf 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.hpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp @@ -16,6 +16,10 @@ namespace mps_fast { +inline constexpr std::size_t input_buffer_padding_bytes = 64; + +struct lz4_pipeline_t; + /** * @brief File reading method selection */ @@ -72,15 +76,7 @@ class Lz4InputStream { void run_decode_tasks(); private: - struct Block { - std::size_t compressed_offset = 0; - std::size_t compressed_size = 0; - std::size_t read_end_offset = 0; - std::size_t decompressed_offset = 0; - std::size_t decompressed_size = 0; - std::size_t index = 0; - bool uncompressed = false; - }; + friend struct lz4_pipeline_t; void commit_up_to(std::size_t bytes); @@ -99,7 +95,6 @@ class Lz4InputStream { bool block_checksum_ = false; bool content_checksum_ = false; bool dict_id_ = false; - std::vector blocks_; mps_phase_registry_t registry_; std::mutex commit_mutex_; std::mutex frontier_mutex_; @@ -108,24 +103,6 @@ class Lz4InputStream { std::unique_ptr section_scanner_; std::size_t next_block_ = 0; std::size_t ready_bytes_ = 0; - - struct BatchMetric { - std::size_t index = 0; - 
std::size_t first_block = 0; - std::size_t blocks = 0; - std::size_t file_bytes = 0; - std::size_t decompressed_bytes = 0; - double read_ms = 0.0; - double decode_ms = 0.0; - double commit_ms = 0.0; - double frontier_lock_wait_ms = 0.0; - double frontier_update_ms = 0.0; - double section_scan_ms = 0.0; - std::size_t ready_bytes_delta = 0; - std::size_t frontier_blocks_advanced = 0; - double total_ms = 0.0; - }; - std::vector batch_metrics_; }; class RawInputStream { @@ -148,7 +125,9 @@ class RawInputStream { private: std::string path_; - int fd_ = -1; + int fd_ = -1; + int buffered_fd_ = -1; + bool direct_io_ = false; mmap_region_t output_region_; char* output_data_ = nullptr; std::size_t output_mapped_size_ = 0; diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp index ab0d4c2c78..7d367db941 100644 --- a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp +++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp @@ -6,302 +6,63 @@ #pragma once #include -#include +#include #include #include -#define __assume(cond) \ - do { \ - if (!(cond)) __builtin_unreachable(); \ - } while (0) - -#define BUCKET_COUNT (4194304 * 2 * 2 * 4) // 2^22 - -// Set to 1 for 32-byte keys, 0 for 16-byte keys -#ifndef USE_32B_HASH_KEYS -#define USE_32B_HASH_KEYS 1 -#endif - namespace mps_fast { -static inline uint32_t crcHash(const uint8_t* key, int64_t len) -{ - __assume(len < 256); - - uint64_t crc = 0; - while (len > 8) { - uint64_t val = *(const uint64_t*)key; - crc = simde_mm_crc32_u64(crc, val); - len -= 8; - key += 8; - } - - // CRC the final 1-7 bytes - uint64_t val = *(const uint64_t*)key; - val &= ~(~0ULL << len * 8); // Compiles to a bzhi instruction (also UB) - crc = simde_mm_crc32_u64(crc, val); - - return crc; -} - -static inline uint32_t crcHash32B(uint64_t q0, uint64_t q1, uint64_t q2, uint64_t q3) +// FNV-1a over bytes in reverse order; row names commonly share long prefixes. 
+static inline uint32_t fnv1a_hash(const char* ptr, std::size_t len) { - uint64_t crc = 0; - crc = simde_mm_crc32_u64(crc, q0); - crc = simde_mm_crc32_u64(crc, q1); - crc = simde_mm_crc32_u64(crc, q2); - crc = simde_mm_crc32_u64(crc, q3); + constexpr uint32_t fnv_offset = 2166136261u; + constexpr uint32_t fnv_prime = 16777619u; - return crc; -} - -// FNV-1a hash, processes bytes in reverse to better handle common-prefix strings -static inline uint32_t fnv1a_hash(const char* ptr, size_t len) -{ - constexpr uint32_t FNV_OFFSET = 2166136261u; - constexpr uint32_t FNV_PRIME = 16777619u; - - uint32_t h = FNV_OFFSET; + uint32_t h = fnv_offset; const char* p = ptr + len; while (p > ptr) { --p; h ^= (uint8_t)*p; - h *= FNV_PRIME; + h *= fnv_prime; } return h; } -struct __attribute__((packed)) hash_slot_32_t { - uint32_t count; - simde__m256i node; -}; - -struct alignas(16) hash_slot_16_t { - char key[16]; - uint32_t count; -}; - -static inline bool key_cmpeq_16(const char* slot_key, simde__m128i key) -{ - simde__m128i slot_vec = simde_mm_loadu_si128((const simde__m128i*)slot_key); - int mask = simde_mm_movemask_epi8(simde_mm_cmpeq_epi8(slot_vec, key)); - return mask == 0xFFFF; -} - -// 32-byte aligned slot: 28-byte key + 4-byte count = 32 bytes total (one cache line half) +// 28-byte inline key + uint32 payload: two slots per 64-byte cache line. +// key_store writes a full 32-byte vector starting at key[0], so callers must +// publish the payload after storing the key. key_cmpeq masks those payload lanes +// away, leaving the trailing uint32 free for the row index + 1 sentinel. struct alignas(32) hash_slot_28_t { char key[28]; uint32_t count; }; -static inline simde__m256i make_key_28(const char* ptr, size_t len) -{ - alignas(32) char buf[32] = {0}; - size_t copy_len = len < 28 ? 
len : 28; - std::memcpy(buf, ptr, copy_len); - return simde_mm256_load_si256((const simde__m256i*)buf); -} - -// Compare 28-byte keys stored in simde__m256i (ignore last 4 bytes) -static inline bool key_cmpeq_28(const char* slot_key, simde__m256i key) -{ - simde__m256i slot_vec = simde_mm256_loadu_si256((const simde__m256i*)slot_key); - int mask = simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot_vec, key)); - return (mask & 0x0FFFFFFF) == 0x0FFFFFFF; // Only check first 28 bytes -} - -#if USE_32B_HASH_KEYS -using hash_key_t = simde__m256i; -using hash_slot_var_t = hash_slot_28_t; -constexpr size_t HASH_KEY_BYTES = 28; -constexpr int HASH_KEY_CMP_MASK = 0x0FFFFFFF; -#define make_key make_key_28 -#define key_cmpeq(slot_key, key) key_cmpeq_28(slot_key, key) -#define key_store(slot_key, key) simde_mm256_store_si256((simde__m256i*)(slot_key), key) -#else -using hash_key_t = simde__m128i; -using hash_slot_var_t = hash_slot_16_t; -constexpr size_t HASH_KEY_BYTES = 16; -constexpr int HASH_KEY_CMP_MASK = 0xFFFF; -#define make_key make_key_16 -#define key_cmpeq(slot_key, key) key_cmpeq_16(slot_key, key) -#define key_store(slot_key, key) simde_mm_store_si128((simde__m128i*)(slot_key), key) -#endif - -// Legacy alias -using hash_slot_t = hash_slot_32_t; - -struct hash_table_t { - hash_slot_t slots[BUCKET_COUNT]; -}; - -static inline void hash_table_push( - hash_table_t* table, uint32_t hash, simde__m256i val, int len, const uint8_t* ptr) -{ - hash %= BUCKET_COUNT; - - hash_slot_t* slot = &table->slots[hash]; - - if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == 0xFFFFFFFF) { - ++slot->count; - return; - } - - bool relooped = false; - -loop: - for (; slot < &table->slots[BUCKET_COUNT]; ++slot) { - if (slot->count == 0) { - slot->count = 1; - slot->node = val; - return; - } - - if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == 0xFFFFFFFF) { - ++slot->count; - return; - } - } +using hash_key_t = simde__m256i; +using 
hash_slot_var_t = hash_slot_28_t; +constexpr std::size_t HASH_KEY_BYTES = 28; - if (!relooped) { - relooped = true; - slot = &table->slots[0]; - goto loop; - } else { - __builtin_trap(); - } -} - -extern char* string_buffer; -extern char* string_buffer_ptr; +static_assert(sizeof(hash_slot_28_t) == 32); +static_assert(alignof(hash_slot_28_t) == 32); +static_assert(offsetof(hash_slot_28_t, count) == HASH_KEY_BYTES); -// Lookup: returns the stored value (count-1) or SIZE_MAX if not found -// For small strings <= 32 bytes stored inline in node -static inline size_t hash_table_lookup(const hash_table_t* table, uint32_t hash, simde__m256i val) +static inline hash_key_t make_key(const char* ptr, std::size_t len) { - hash %= BUCKET_COUNT; - const hash_slot_t* slot = &table->slots[hash]; - - for (size_t i = 0; i < BUCKET_COUNT; ++i, ++slot) { - if (slot >= &table->slots[BUCKET_COUNT]) { slot = &table->slots[0]; } - - if (slot->count == 0) { - return SIZE_MAX; // Not found - } - - if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == (int)0xFFFFFFFF) { - return slot->count - 1; // Found, return index - } - } - - return SIZE_MAX; // Not found + alignas(32) char buf[32] = {}; + std::memcpy(buf, ptr, len < HASH_KEY_BYTES ? 
len : HASH_KEY_BYTES); + return simde_mm256_load_si256(reinterpret_cast(buf)); } -// Insert with index: stores index+1 in count field (0 means empty) -static inline void hash_table_insert(hash_table_t* table, - uint32_t hash, - simde__m256i val, - size_t index) +static inline bool key_cmpeq(const char* slot_key, hash_key_t key) { - hash %= BUCKET_COUNT; - hash_slot_t* slot = &table->slots[hash]; - - for (size_t i = 0; i < BUCKET_COUNT; ++i, ++slot) { - if (slot >= &table->slots[BUCKET_COUNT]) { slot = &table->slots[0]; } - - if (slot->count == 0) { - slot->count = (uint32_t)(index + 1); - slot->node = val; - return; - } - - if (simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot->node, val)) == (int)0xFFFFFFFF) { - // Already exists, update index - slot->count = (uint32_t)(index + 1); - return; - } - } - - __builtin_trap(); -} - -// Create simde__m256i key from string_view (zero-padded) -static inline simde__m256i make_key_32(const char* ptr, size_t len) -{ - alignas(32) char buf[32] = {0}; - if (len > 32) len = 32; - memcpy(buf, ptr, len); - return simde_mm256_load_si256((const simde__m256i*)buf); -} - -// Create simde__m128i key from string_view (zero-padded, for strings <= 16 bytes) -static inline simde__m128i make_key_16(const char* ptr, size_t len) -{ - alignas(16) char buf[16] = {0}; - if (len > 16) len = 16; - memcpy(buf, ptr, len); - return simde_mm_load_si128((const simde__m128i*)buf); -} - -static inline uint64_t m256_u64_lane(simde__m256i value, size_t lane) -{ - simde__m256i_private private_value = simde__m256i_to_private(value); - return private_value.u64[lane]; + simde__m256i slot_vec = simde_mm256_loadu_si256(reinterpret_cast(slot_key)); + int mask = simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot_vec, key)); + return (mask & 0x0fffffff) == 0x0fffffff; } -static inline void hash_table_push_ptr(hash_table_t* table, - uint32_t hash, - int len, - const uint8_t* ptr) +static inline void key_store(char* slot_key, hash_key_t key) { - hash %= 
BUCKET_COUNT; - - hash_slot_t* slot = &table->slots[hash]; - bool relooped = false; - - uint32_t len_in_qwords = (len / 8) + (len % 8 ? 1 : 0); - -loop: - do { - uint64_t node_len = m256_u64_lane(slot->node, 3); - uint64_t node_tag = m256_u64_lane(slot->node, 0); - // nonzero, it's not a pointer of the same length, skip - if (__builtin_expect(node_len != (uint64_t)len, 0)) { - if (__builtin_expect(node_tag == 0, 1)) { - slot->count = 1; - slot->node = simde_mm256_set_epi64x(len, - ((uint64_t*)ptr)[0], - (uint64_t)string_buffer_ptr, - 0u | ((uint64_t)len_in_qwords << 32u)); - - memcpy(string_buffer_ptr, ptr, len); - string_buffer_ptr += len; - // Pad - string_buffer_ptr += (8 - len % 8) + 8; - - return; - } else - continue; - } - if (m256_u64_lane(slot->node, 2) != ((uint64_t*)ptr)[0]) // First 8 bytes differ - continue; - - uint8_t* other_ptr = reinterpret_cast(m256_u64_lane(slot->node, 1)); - if (__builtin_expect(memcmp(ptr + 16, other_ptr + 16, len - 16) == 0, 1)) { - ++slot->count; - - return; - } - } while (++slot < &table->slots[BUCKET_COUNT]); - - if (!relooped) { - relooped = true; - slot = &table->slots[0]; - goto loop; - } else { - __builtin_trap(); - } + simde_mm256_store_si256(reinterpret_cast(slot_key), key); } } // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index a0be7daaf0..bb6657e303 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -19,12 +19,13 @@ #include #include +#include #include -#include #include #include #include #include +#include #include #include #include @@ -44,12 +45,13 @@ using cuopt::linear_programming::io::mps_parser_fail; namespace { -constexpr uint32_t lz4_frame_magic = 0x184D2204u; -constexpr uint32_t lz4_uncompressed_block = 0x80000000u; -constexpr uint32_t lz4_block_size_mask = 0x7FFFFFFFu; -constexpr std::size_t lz4_pipeline_batch_bytes = 64ull * 1024ull * 
1024ull; -constexpr std::size_t lz4_input_max_io_threads = 8; -constexpr std::size_t lz4_no_content_size_reserve_ratio = 16; +constexpr uint32_t lz4_frame_magic = 0x184D2204u; +constexpr uint32_t lz4_uncompressed_block = 0x80000000u; +constexpr uint32_t lz4_block_size_mask = 0x7FFFFFFFu; +constexpr std::size_t lz4_pipeline_batch_bytes = 64ull * 1024ull * 1024ull; +constexpr std::size_t lz4_decode_batch_decompressed_bytes = 256ull * 1024ull * 1024ull; +constexpr std::size_t lz4_input_max_io_threads = 8; +constexpr std::size_t lz4_no_content_size_reserve_ratio = 16; using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int); @@ -168,10 +170,10 @@ std::size_t block_max_size_from_bd(unsigned char bd) std::size_t checked_size(uint64_t value, const char* label) { - if (value > static_cast(std::numeric_limits::max())) { + if (value > (uint64_t)std::numeric_limits::max()) { mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 %s exceeds size_t", label); } - return static_cast(value); + return (std::size_t)value; } std::size_t get_file_size(int fd, const std::string& path) @@ -187,14 +189,14 @@ std::size_t get_file_size(int fd, const std::string& path) mps_parser_fail( error_type_t::RuntimeError, "Invalid negative file size for '%s'", path.c_str()); } - return static_cast(st.st_size); + return (std::size_t)st.st_size; } std::size_t system_page_size() { static std::size_t page_size = [] { long value = ::sysconf(_SC_PAGESIZE); - return value > 0 ? static_cast(value) : static_cast(4096); + return value > 0 ? 
(std::size_t)value : (std::size_t)4096; }(); return page_size; } @@ -219,10 +221,12 @@ std::size_t checked_mul(std::size_t a, std::size_t b, const char* label) return a * b; } -double elapsed_ms_since(std::chrono::steady_clock::time_point start) +std::size_t checked_add(std::size_t a, std::size_t b, const char* label) { - return std::chrono::duration(std::chrono::steady_clock::now() - start) - .count(); + if (a > std::numeric_limits::max() - b) { + mps_parser_fail(error_type_t::OutOfMemoryError, "%s size overflow", label); + } + return a + b; } bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset) @@ -230,9 +234,9 @@ bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset) std::size_t done = 0; while (done < bytes) { std::size_t remaining = bytes - done; - std::size_t chunk = std::min( - remaining, static_cast(std::numeric_limits::max())); - ssize_t got = ::pread(fd, dst + done, chunk, static_cast(offset + done)); + std::size_t chunk = + std::min(remaining, (std::size_t)std::numeric_limits::max()); + ssize_t got = ::pread(fd, dst + done, chunk, (off_t)(offset + done)); if (got < 0) { if (errno == EINTR) { continue; } return false; @@ -241,7 +245,7 @@ bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset) errno = EIO; return false; } - done += static_cast(got); + done += (std::size_t)got; } return true; } @@ -359,8 +363,8 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path) "unsupported LZ4 input: expected standard LZ4 frame magic"); } offset += 4; - unsigned char flg = static_cast(header[offset++]); - unsigned char bd = static_cast(header[offset++]); + unsigned char flg = (unsigned char)header[offset++]; + unsigned char bd = (unsigned char)header[offset++]; unsigned version = (flg >> 6) & 0x3u; if (version != 1) { mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame version"); @@ -403,6 +407,7 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : 
path_(path) checked_mul(compressed_size_, lz4_no_content_size_reserve_ratio, "LZ4 output reserve"); reserve_size = std::max(reserve_size, block_max_size_); } + reserve_size = checked_add(reserve_size, input_buffer_padding_bytes, "LZ4 output padding"); constexpr std::size_t huge_alignment = 2 * 1024 * 1024; output_mapped_size_ = round_up_to_multiple(reserve_size, system_page_size()); @@ -460,310 +465,293 @@ void Lz4InputStream::commit_up_to(std::size_t bytes) output_committed_size_ = new_committed; } -void Lz4InputStream::run_decode_tasks() -{ - MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io); - std::exception_ptr first_error = nullptr; - std::mutex error_mutex; - std::atomic_bool stop_workers{false}; - auto mark_error = [&](std::exception_ptr eptr) { +struct resident_block_desc_t { + const char* src = nullptr; + std::size_t compressed_size = 0; + std::size_t decompressed_offset = 0; + std::size_t decompressed_size = 0; + std::size_t index = 0; + std::size_t window_index = std::numeric_limits::max(); + bool uncompressed = false; +}; + +struct lz4_pipeline_t { + explicit lz4_pipeline_t(Lz4InputStream& input_) + : input(input_), + window_count((input.compressed_size_ + window_bytes - 1) / window_bytes), + windows(window_count), + io_threads(std::min(lz4_input_max_io_threads, window_count)), + window_done(window_count, 0), + window_refs(window_count), + window_scanned(window_count), + window_released(window_count) + { + for (std::size_t i = 0; i < window_count; ++i) { + std::size_t offset = i * window_bytes; + std::size_t size = std::min(window_bytes, input.compressed_size_ - offset); + windows[i].index = i; + windows[i].file_offset = offset; + windows[i].size = size; + window_refs[i].store(0, std::memory_order_relaxed); + window_scanned[i].store(0, std::memory_order_relaxed); + window_released[i].store(0, std::memory_order_relaxed); + } + } + + void run() + { + start_readers(); + std::thread scanner(&lz4_pipeline_t::run_scanner_stage, this); + 
start_decoders(); + + for (auto& reader : readers) { + reader.join(); + } + scanner.join(); + for (auto& worker : decoders) { + worker.join(); + } + if (first_error) { std::rethrow_exception(first_error); } + } + + void finalize() + { + input.output_view_size_ = input.ready_bytes_; + input.commit_up_to( + checked_add(input.output_view_size_, input_buffer_padding_bytes, "LZ4 output padding")); + input.section_scanner_->publish_ready(input.output_view_size_); + } + + void mark_error(std::exception_ptr eptr) + { std::lock_guard lock(error_mutex); if (!first_error) { first_error = eptr; stop_workers.store(true, std::memory_order_release); } - }; - - const std::size_t window_bytes = lz4_pipeline_batch_bytes; - const std::size_t window_count = (compressed_size_ + window_bytes - 1) / window_bytes; - std::vector windows(window_count); - for (std::size_t i = 0; i < window_count; ++i) { - std::size_t offset = i * window_bytes; - std::size_t size = std::min(window_bytes, compressed_size_ - offset); - windows[i].index = i; - windows[i].file_offset = offset; - windows[i].size = size; - windows[i].data.reset(new char[size]); - } - - const std::size_t io_threads = std::min(lz4_input_max_io_threads, window_count); - std::atomic decoder_wait_batch_ms{0.0}; - std::atomic decoder_active_batch_ms{0.0}; - - struct resident_block_desc_t { - const char* src = nullptr; - std::size_t compressed_size = 0; - std::size_t decompressed_offset = 0; - std::size_t decompressed_size = 0; - std::size_t index = 0; - bool uncompressed = false; - }; - - std::atomic_size_t next_window{0}; - std::vector window_done(window_count, 0); - std::mutex window_mutex; - std::condition_variable window_cv; - - std::deque> desc_queue; - bool scanner_done = false; - std::mutex desc_mutex; - std::condition_variable desc_cv; + } - auto fail_and_notify = [&](std::exception_ptr eptr) { + void fail_and_notify(std::exception_ptr eptr) + { mark_error(eptr); window_cv.notify_all(); desc_cv.notify_all(); - }; + } + + void 
add_compressed_resident(std::size_t bytes) + { + compressed_resident_bytes.fetch_add(bytes, std::memory_order_relaxed); + } + + void try_release_window(std::size_t index) + { + if (index >= window_count) { return; } + if (window_scanned[index].load(std::memory_order_acquire) == 0) { return; } + if (window_refs[index].load(std::memory_order_acquire) != 0) { return; } + uint8_t expected = 0; + if (!window_released[index].compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) { + return; + } + std::lock_guard lock(window_release_mutex); + if (windows[index].data) { + windows[index].data.reset(); + compressed_resident_bytes.fetch_sub(windows[index].size, std::memory_order_relaxed); + } + } + + void mark_windows_scanned_before(std::size_t offset) + { + std::size_t last_excl = std::min(window_count, offset / window_bytes); + for (std::size_t wi = 0; wi < last_excl; ++wi) { + window_scanned[wi].store(1, std::memory_order_release); + try_release_window(wi); + } + } - auto decode_worker = [&](std::size_t tid) { + void start_readers() + { + readers.reserve(io_threads); + for (std::size_t t = 0; t < io_threads; ++t) { + readers.emplace_back(&lz4_pipeline_t::run_reader_stage, this, t); + } + } + + void run_reader_stage(std::size_t tid) + { + std::string thread_name = "lz4-window-read-" + std::to_string(tid); + nvtx::name_current_thread(thread_name.c_str()); + while (!stop_workers.load(std::memory_order_acquire)) { + std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed); + if (index >= windows.size()) { break; } + auto& w = windows[index]; + w.data.reset(new char[w.size]); + add_compressed_resident(w.size); + bool ok = false; + { + MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io); + ok = pread_full_plain(input.fd_, w.data.get(), w.size, w.file_offset); + } + if (!ok) { + try { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to pread LZ4 resident window: %s", + std::strerror(errno)); + } catch (...) 
{ + fail_and_notify(std::current_exception()); + } + return; + } + { + MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic); + std::lock_guard lock(window_mutex); + window_done[index] = 1; + } + window_cv.notify_all(); + } + } + + void start_decoders() + { + decoders.reserve(io_threads); + for (std::size_t t = 0; t < io_threads; ++t) { + decoders.emplace_back(&lz4_pipeline_t::run_decoder_stage, this, t); + } + } + + void run_decoder_stage(std::size_t tid) + { try { std::string thread_name = "lz4-window-decode-" + std::to_string(tid); nvtx::name_current_thread(thread_name.c_str()); while (true) { - std::vector batch; - { - MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io); - std::unique_lock lock(desc_mutex); - const auto wait_start = std::chrono::steady_clock::now(); - desc_cv.wait(lock, [&] { - return stop_workers.load(std::memory_order_acquire) || scanner_done || - !desc_queue.empty(); - }); - decoder_wait_batch_ms.fetch_add(elapsed_ms_since(wait_start), std::memory_order_relaxed); - if (stop_workers.load(std::memory_order_acquire)) { return; } - if (desc_queue.empty()) { - if (scanner_done) return; - continue; - } - batch = std::move(desc_queue.front()); - desc_queue.pop_front(); - } - - const auto decode_start = std::chrono::steady_clock::now(); - MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode); - for (const auto& block : batch) { - char* dst = output_data_ + block.decompressed_offset; - int actual = 0; - { - MPS_NVTX_RANGE("lz4_decode_block_payload", nvtx::colors::decode); - if (block.uncompressed) { - std::memcpy(dst, block.src, block.decompressed_size); - actual = static_cast(block.decompressed_size); - } else if (block.compressed_size > - static_cast(std::numeric_limits::max()) || - block.decompressed_size > - static_cast(std::numeric_limits::max())) { - actual = -1; - } else { - actual = lz4_decompress_safe_runtime(block.src, - dst, - static_cast(block.compressed_size), - static_cast(block.decompressed_size)); - } - } - if (actual < 
0 || static_cast(actual) > block.decompressed_size) { - mps_parser_fail(error_type_t::ValidationError, - "LZ4 input block decompressed to invalid size"); - } - - std::size_t actual_size = static_cast(actual); - { - MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic); - section_scanner_->observe_block(block.index, dst, dst + actual_size); - } - std::size_t before = 0; - std::size_t after = 0; - { - MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic); - frontier_mutex_.lock(); - block_done_[block.index] = 1; - block_end_[block.index] = block.decompressed_offset + actual_size; - before = ready_bytes_; - while (next_block_ < block_done_.size() && block_done_[next_block_]) { - ready_bytes_ = block_end_[next_block_]; - ++next_block_; - } - after = ready_bytes_; - frontier_mutex_.unlock(); - } - if (after > before) { - MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic); - section_scanner_->publish_ready(after); - } - } - decoder_active_batch_ms.fetch_add(elapsed_ms_since(decode_start), - std::memory_order_relaxed); + std::vector batch = wait_for_decode_batch(); + if (batch.empty()) { return; } + decode_batch(batch); } } catch (...) { fail_and_notify(std::current_exception()); } - }; + } - std::vector readers; - readers.reserve(io_threads); - for (std::size_t t = 0; t < io_threads; ++t) { - readers.emplace_back([&, t] { - std::string thread_name = "lz4-window-read-" + std::to_string(t); - nvtx::name_current_thread(thread_name.c_str()); - while (!stop_workers.load(std::memory_order_acquire)) { - std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed); - if (index >= windows.size()) { break; } - auto& w = windows[index]; - bool ok = false; - { - MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io); - ok = pread_full_plain(fd_, w.data.get(), w.size, w.file_offset); - } - if (!ok) { - try { - mps_parser_fail(error_type_t::RuntimeError, - "Failed to pread LZ4 resident window: %s", - std::strerror(errno)); - } catch (...) 
{ - fail_and_notify(std::current_exception()); - } - return; - } - { - MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic); - std::lock_guard lock(window_mutex); - window_done[index] = 1; - } - window_cv.notify_all(); - } + std::vector wait_for_decode_batch() + { + MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io); + std::unique_lock lock(desc_mutex); + desc_cv.wait(lock, [&] { + return stop_workers.load(std::memory_order_acquire) || scanner_done || !desc_queue.empty(); }); + if (stop_workers.load(std::memory_order_acquire) || desc_queue.empty()) { return {}; } + std::vector batch = std::move(desc_queue.front()); + desc_queue.pop_front(); + return batch; } - std::atomic_size_t blocks_scanned{0}; - std::vector> crossing_payloads; - const auto read_wall_start = std::chrono::steady_clock::now(); - std::thread scanner([&] { - try { - nvtx::name_current_thread("lz4-metadata-scan"); - lz4_resident_windows_t resident(windows); - auto wait_range_ready = [&](std::size_t begin, std::size_t size) { - if (size == 0) return; - std::size_t first = begin / window_bytes; - std::size_t last = (begin + size - 1) / window_bytes; - for (std::size_t wi = first; wi <= last; ++wi) { - MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io); - std::unique_lock lock(window_mutex); - window_cv.wait(lock, [&] { - return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0; - }); - if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) { - mps_parser_fail(error_type_t::RuntimeError, - "LZ4 metadata scanner stopped before required window was ready"); - } - } - }; - auto push_batch = [&](std::vector& batch) { - if (batch.empty()) return; - { - MPS_NVTX_RANGE("lz4_metadata_commit_batch", nvtx::colors::alloc); - commit_up_to(batch.back().decompressed_offset + batch.back().decompressed_size); - } - { - MPS_NVTX_RANGE("lz4_metadata_enqueue_batch", nvtx::colors::generic); - std::lock_guard lock(desc_mutex); - 
desc_queue.push_back(std::move(batch)); - } - batch.clear(); - desc_cv.notify_one(); - }; - - std::vector batch; - batch.reserve(1024); - std::size_t offset = header_size_; - std::size_t decompressed_offset = 0; - while (true) { - MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic); - wait_range_ready(offset, 4); - if (offset + 4 > compressed_size_) { - mps_parser_fail(error_type_t::ValidationError, - "truncated LZ4 frame while reading block header"); - } - uint32_t raw_block_size = resident.read_u32(offset); - offset += 4; - if (raw_block_size == 0) { break; } - - bool uncompressed = (raw_block_size & lz4_uncompressed_block) != 0; - std::size_t block_payload_size = raw_block_size & lz4_block_size_mask; - if (block_payload_size == 0) { - mps_parser_fail(error_type_t::ValidationError, "invalid zero-sized LZ4 data block"); - } - if (block_payload_size > block_max_size_ && uncompressed) { - mps_parser_fail(error_type_t::ValidationError, - "LZ4 uncompressed block exceeds frame block maximum"); - } - if (content_size_present_ && decompressed_offset >= content_size_) { - mps_parser_fail(error_type_t::ValidationError, - "LZ4 frame contains more blocks than content size allows"); - } - wait_range_ready(offset, block_payload_size); - if (offset + block_payload_size > compressed_size_) { - mps_parser_fail(error_type_t::ValidationError, - "truncated LZ4 frame while reading block payload"); - } - - std::size_t decompressed_size = block_payload_size; - if (!uncompressed) { - if (content_size_present_) { - decompressed_size = std::min(block_max_size_, content_size_ - decompressed_offset); - } else { - decompressed_size = block_max_size_; - } - } - if (content_size_present_ && decompressed_size > content_size_ - decompressed_offset) { - mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size"); - } + void decode_batch(const std::vector& batch) + { + MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode); + for (const auto& block : 
batch) { + decode_block(block); + } + } - const char* src = resident.ptr_if_contiguous(offset, block_payload_size); - if (src == nullptr) { - crossing_payloads.emplace_back(block_payload_size); - resident.copy_to(offset, crossing_payloads.back().data(), block_payload_size); - src = crossing_payloads.back().data(); - } - batch.push_back({src, - block_payload_size, - decompressed_offset, - decompressed_size, - blocks_scanned.load(std::memory_order_relaxed), - uncompressed}); - blocks_scanned.fetch_add(1, std::memory_order_relaxed); - decompressed_offset += decompressed_size; - offset += block_payload_size; - if (block_checksum_) { - wait_range_ready(offset, 4); - if (offset + 4 > compressed_size_) { - mps_parser_fail(error_type_t::ValidationError, - "truncated LZ4 frame while reading block checksum"); - } - offset += 4; - } - if (blocks_scanned.load(std::memory_order_relaxed) > block_done_.size()) { - mps_parser_fail(error_type_t::OutOfMemoryError, - "LZ4 input block count exceeded reserved metadata slots"); - } - if (batch.size() >= 1024) { push_batch(batch); } - } - if (content_checksum_) { - wait_range_ready(offset, 4); - if (offset + 4 > compressed_size_) { - mps_parser_fail(error_type_t::ValidationError, - "truncated LZ4 frame while reading content checksum"); - } - offset += 4; + void decode_block(const resident_block_desc_t& block) + { + char* dst = input.output_data_ + block.decompressed_offset; + int actual = 0; + { + MPS_NVTX_RANGE("lz4_decode_block_payload", nvtx::colors::decode); + if (block.uncompressed) { + std::memcpy(dst, block.src, block.decompressed_size); + actual = (int)block.decompressed_size; + } else if (block.compressed_size > (std::size_t)std::numeric_limits::max() || + block.decompressed_size > (std::size_t)std::numeric_limits::max()) { + actual = -1; + } else { + actual = lz4_decompress_safe_runtime( + block.src, dst, (int)block.compressed_size, (int)block.decompressed_size); } - if (content_size_present_ && decompressed_offset != 
content_size_) { - mps_parser_fail(error_type_t::ValidationError, - "LZ4 frame ended before declared content size was reached"); + } + if (actual < 0 || (std::size_t)actual > block.decompressed_size) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 input block decompressed to invalid size"); + } + release_block_window_ref(block); + publish_decoded_block(block, dst, (std::size_t)actual); + } + + void release_block_window_ref(const resident_block_desc_t& block) + { + if (block.window_index == std::numeric_limits::max()) { return; } + uint32_t old = window_refs[block.window_index].fetch_sub(1, std::memory_order_acq_rel); + (void)old; + assert(old > 0); + if (old == 1) { try_release_window(block.window_index); } + } + + void publish_decoded_block(const resident_block_desc_t& block, char* dst, std::size_t actual_size) + { + { + MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic); + input.section_scanner_->observe_block(block.index, dst, dst + actual_size); + } + std::size_t before = 0; + std::size_t after = 0; + { + MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic); + std::lock_guard lock(input.frontier_mutex_); + input.block_done_[block.index] = 1; + input.block_end_[block.index] = block.decompressed_offset + actual_size; + before = input.ready_bytes_; + while (input.next_block_ < input.block_done_.size() && input.block_done_[input.next_block_]) { + input.ready_bytes_ = input.block_end_[input.next_block_]; + ++input.next_block_; } - if (offset != compressed_size_) { - mps_parser_fail(error_type_t::ValidationError, - "LZ4 input contains trailing data after the first frame"); + after = input.ready_bytes_; + } + if (after > before) { + MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic); + input.section_scanner_->publish_ready(after); + } + } + + void wait_range_ready(std::size_t begin, std::size_t size) + { + if (size == 0) return; + std::size_t first = begin / window_bytes; + std::size_t last = (begin + size - 1) / window_bytes; + 
for (std::size_t wi = first; wi <= last; ++wi) { + MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io); + std::unique_lock lock(window_mutex); + window_cv.wait( + lock, [&] { return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0; }); + if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) { + mps_parser_fail(error_type_t::RuntimeError, + "LZ4 metadata scanner stopped before required window was ready"); } - push_batch(batch); + } + } + + void push_batch(std::vector& batch) + { + if (batch.empty()) return; + { + MPS_NVTX_RANGE("lz4_metadata_commit_batch", nvtx::colors::alloc); + input.commit_up_to(batch.back().decompressed_offset + batch.back().decompressed_size); + } + { + MPS_NVTX_RANGE("lz4_metadata_enqueue_batch", nvtx::colors::generic); + std::lock_guard lock(desc_mutex); + desc_queue.push_back(std::move(batch)); + } + batch.clear(); + desc_cv.notify_one(); + } + + void run_scanner_stage() + { + try { + nvtx::name_current_thread("lz4-metadata-scan"); + scan_lz4_metadata(); { std::lock_guard lock(desc_mutex); scanner_done = true; @@ -776,37 +764,177 @@ void Lz4InputStream::run_decode_tasks() } fail_and_notify(std::current_exception()); } - }); - - std::vector io_workers; - io_workers.reserve(io_threads); - for (std::size_t t = 0; t < io_threads; ++t) { - io_workers.emplace_back(decode_worker, t); - } - for (auto& reader : readers) { - reader.join(); - } - const double read_wall_ms = elapsed_ms_since(read_wall_start); - scanner.join(); - for (auto& worker : io_workers) { - worker.join(); - } - if (first_error) std::rethrow_exception(first_error); - output_view_size_ = ready_bytes_; - section_scanner_->publish_ready(output_view_size_); - - const double compressed_mb = static_cast(compressed_size_) / (1024.0 * 1024.0); - const double read_effective_mbps = - read_wall_ms > 0.0 ? 
compressed_mb / (read_wall_ms / 1000.0) : 0.0; - const double decoder_wait_ms = decoder_wait_batch_ms.load(std::memory_order_relaxed); - const double decoder_active_ms = decoder_active_batch_ms.load(std::memory_order_relaxed); - const double decoder_total_ms = decoder_wait_ms + decoder_active_ms; - const double decoder_wait_ratio = - decoder_total_ms > 0.0 ? decoder_wait_ms / decoder_total_ms : 0.0; - std::fprintf(stderr, - "[LZ4_IO] read_effective_MBps=%.3f decoder_wait_ratio=%.6f\n", - read_effective_mbps, - decoder_wait_ratio); + } + + void scan_lz4_metadata() + { + lz4_resident_windows_t resident(windows); + std::vector batch; + batch.reserve(lz4_decode_batch_decompressed_bytes / input.block_max_size_ + 1); + std::size_t batch_decoded_bytes = 0; + std::size_t offset = input.header_size_; + std::size_t decompressed_offset = 0; + blocks_scanned.store(0, std::memory_order_relaxed); + + while (true) { + MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic); + wait_range_ready(offset, 4); + if (offset + 4 > input.compressed_size_) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading block header"); + } + uint32_t raw_block_size = resident.read_u32(offset); + offset += 4; + if (raw_block_size == 0) { break; } + + resident_block_desc_t block = + scan_one_block(resident, raw_block_size, offset, decompressed_offset); + batch_decoded_bytes += block.decompressed_size; + batch.push_back(block); + blocks_scanned.fetch_add(1, std::memory_order_relaxed); + if (blocks_scanned.load(std::memory_order_relaxed) > input.block_done_.size()) { + mps_parser_fail(error_type_t::OutOfMemoryError, + "LZ4 input block count exceeded reserved metadata slots"); + } + if (batch_decoded_bytes >= lz4_decode_batch_decompressed_bytes) { + push_batch(batch); + batch_decoded_bytes = 0; + } + } + + scan_frame_footer(offset, decompressed_offset); + push_batch(batch); + mark_windows_scanned_before(input.compressed_size_); + } + + resident_block_desc_t 
scan_one_block(lz4_resident_windows_t& resident, + uint32_t raw_block_size, + std::size_t& offset, + std::size_t& decompressed_offset) + { + bool uncompressed = (raw_block_size & lz4_uncompressed_block) != 0; + std::size_t block_payload_size = raw_block_size & lz4_block_size_mask; + if (block_payload_size == 0) { + mps_parser_fail(error_type_t::ValidationError, "invalid zero-sized LZ4 data block"); + } + if (block_payload_size > input.block_max_size_ && uncompressed) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 uncompressed block exceeds frame block maximum"); + } + if (input.content_size_present_ && decompressed_offset >= input.content_size_) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 frame contains more blocks than content size allows"); + } + + wait_range_ready(offset, block_payload_size); + if (offset + block_payload_size > input.compressed_size_) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading block payload"); + } + + std::size_t decompressed_size = block_payload_size; + if (!uncompressed) { + decompressed_size = + input.content_size_present_ + ? 
std::min(input.block_max_size_, input.content_size_ - decompressed_offset) + : input.block_max_size_; + } + if (input.content_size_present_ && + decompressed_size > input.content_size_ - decompressed_offset) { + mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size"); + } + + const char* src = resident.ptr_if_contiguous(offset, block_payload_size); + std::size_t window_index = std::numeric_limits::max(); + if (src == nullptr) { + crossing_payloads.emplace_back(block_payload_size); + resident.copy_to(offset, crossing_payloads.back().data(), block_payload_size); + src = crossing_payloads.back().data(); + } else { + window_index = offset / window_bytes; + window_refs[window_index].fetch_add(1, std::memory_order_acq_rel); + } + + resident_block_desc_t block{src, + block_payload_size, + decompressed_offset, + decompressed_size, + blocks_scanned.load(std::memory_order_relaxed), + window_index, + uncompressed}; + decompressed_offset += decompressed_size; + offset += block_payload_size; + mark_windows_scanned_before(offset); + if (input.block_checksum_) { + wait_range_ready(offset, 4); + if (offset + 4 > input.compressed_size_) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading block checksum"); + } + offset += 4; + mark_windows_scanned_before(offset); + } + return block; + } + + void scan_frame_footer(std::size_t& offset, std::size_t decompressed_offset) + { + if (input.content_checksum_) { + wait_range_ready(offset, 4); + if (offset + 4 > input.compressed_size_) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading content checksum"); + } + offset += 4; + mark_windows_scanned_before(offset); + } + if (input.content_size_present_ && decompressed_offset != input.content_size_) { + mps_parser_fail(error_type_t::ValidationError, + "LZ4 frame ended before declared content size was reached"); + } + if (offset != input.compressed_size_) { + 
mps_parser_fail(error_type_t::ValidationError, + "LZ4 input contains trailing data after the first frame"); + } + } + + Lz4InputStream& input; + const std::size_t window_bytes = lz4_pipeline_batch_bytes; + const std::size_t window_count; + std::vector windows; + const std::size_t io_threads; + + std::exception_ptr first_error = nullptr; + std::mutex error_mutex; + std::atomic_bool stop_workers{false}; + + std::atomic_size_t next_window{0}; + std::vector window_done; + std::vector> window_refs; + std::vector> window_scanned; + std::vector> window_released; + std::mutex window_mutex; + std::condition_variable window_cv; + std::mutex window_release_mutex; + std::atomic_size_t compressed_resident_bytes{0}; + + std::deque> desc_queue; + bool scanner_done = false; + std::mutex desc_mutex; + std::condition_variable desc_cv; + + std::atomic_size_t blocks_scanned{0}; + std::vector> crossing_payloads; + std::vector readers; + std::vector decoders; +}; + +void Lz4InputStream::run_decode_tasks() +{ + MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io); + lz4_pipeline_t pipeline(*this); + pipeline.run(); + pipeline.finalize(); } } // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp index 98c6e4885d..d7b299917b 100644 --- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp +++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp @@ -53,6 +53,7 @@ class mmap_region_t { ~mmap_region_t() { reset(); } + private: static mmap_region_t map( void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context) { @@ -64,6 +65,7 @@ class mmap_region_t { return mmap_region_t(ptr, size); } + public: static mmap_region_t anonymous(std::size_t size, int prot, int flags, const char* context) { return map(nullptr, size, prot, flags | MAP_ANONYMOUS, -1, 0, context); @@ -89,7 +91,7 @@ class mmap_region_t { uintptr_t raw_addr = reinterpret_cast(raw); uintptr_t aligned_addr = 
(raw_addr + alignment - 1) & ~(uintptr_t)(alignment - 1); - std::size_t prefix = static_cast(aligned_addr - raw_addr); + std::size_t prefix = (std::size_t)(aligned_addr - raw_addr); std::size_t suffix = raw_size - prefix - size; if (prefix > 0) { ::munmap(raw, prefix); } if (suffix > 0) { ::munmap(reinterpret_cast(aligned_addr + size), suffix); } @@ -113,33 +115,14 @@ class mmap_region_t { size_ = 0; } - void reset(void* ptr, std::size_t size) noexcept - { - reset(); - ptr_ = ptr; - size_ = size; - } - - void* release() noexcept - { - void* ptr = ptr_; - ptr_ = nullptr; - size_ = 0; - return ptr; - } - void advise(int advice) const noexcept { if (ptr_ != nullptr && size_ != 0) { ::madvise(ptr_, size_, advice); } } void* data() noexcept { return ptr_; } - const void* data() const noexcept { return ptr_; } - char* char_data() noexcept { return static_cast(ptr_); } - const char* char_data() const noexcept { return static_cast(ptr_); } + char* char_data() noexcept { return (char*)ptr_; } std::size_t size() const noexcept { return size_; } - bool empty() const noexcept { return ptr_ == nullptr || size_ == 0; } - explicit operator bool() const noexcept { return !empty(); } private: void* ptr_ = nullptr; diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp index 8581921173..498b106955 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include #include @@ -22,6 +24,33 @@ using cuopt::linear_programming::io::mps_parser_fail; namespace { +struct section_record_t { + mps_section_kind kind; + const char* name; + std::size_t len; +}; + +constexpr section_record_t section_records[] = { + {mps_section_kind::rows, "ROWS", 4}, + {mps_section_kind::columns, "COLUMNS", 7}, + {mps_section_kind::rhs, "RHS", 3}, + {mps_section_kind::bounds, "BOUNDS", 6}, + 
{mps_section_kind::ranges, "RANGES", 6}, + {mps_section_kind::quadobj, "QUADOBJ", 7}, + {mps_section_kind::qmatrix, "QMATRIX", 7}, + {mps_section_kind::qcmatrix, "QCMATRIX", 8}, + {mps_section_kind::endata, "ENDATA", 6}, +}; + +constexpr const char* header_records[] = {"NAME", "OBJSENSE", "OBJNAME"}; + +constexpr std::size_t kSimdWidth = sizeof(simde__m256i); +static_assert(kSimdWidth == 32); +static_assert((std::size_t)mps_section_kind::rows == 0); +static_assert((std::size_t)mps_section_kind::endata + 1 == std::size(section_records)); +static_assert((std::size_t)mps_phase_kind::header == 0); +static_assert((std::size_t)mps_phase_kind::quadratic + 1 == 7); + bool is_nonblank_column1(unsigned char c) noexcept { return c > ' '; } simde__m256i nonblank_column1_mask(simde__m256i bytes) @@ -29,39 +58,21 @@ simde__m256i nonblank_column1_mask(simde__m256i bytes) return simde_mm256_cmpgt_epi8(bytes, simde_mm256_set1_epi8(' ')); } -const char* section_name(mps_section_kind kind) +enum class section_record_match_t { invalid, header, section }; + +bool line_has_record_prefix(const char* line_start, const char* line_end, const char* name) { - switch (kind) { - case mps_section_kind::rows: return "ROWS"; - case mps_section_kind::columns: return "COLUMNS"; - case mps_section_kind::rhs: return "RHS"; - case mps_section_kind::bounds: return "BOUNDS"; - case mps_section_kind::ranges: return "RANGES"; - case mps_section_kind::quadobj: return "QUADOBJ"; - case mps_section_kind::qmatrix: return "QMATRIX"; - case mps_section_kind::qcmatrix: return "QCMATRIX"; - case mps_section_kind::endata: return "ENDATA"; + std::size_t len = std::strlen(name); + if ((std::size_t)(line_end - line_start) < len || std::memcmp(line_start, name, len) != 0) { + return false; } - return ""; + const char* after = line_start + len; + return after == line_end || *after <= ' '; } -std::size_t section_name_len(mps_section_kind kind) { return std::strlen(section_name(kind)); } - } // namespace -std::size_t 
mps_phase_registry_t::phase_index(mps_phase_kind phase) -{ - switch (phase) { - case mps_phase_kind::header: return 0; - case mps_phase_kind::rows: return 1; - case mps_phase_kind::columns: return 2; - case mps_phase_kind::rhs: return 3; - case mps_phase_kind::bounds: return 4; - case mps_phase_kind::ranges: return 5; - case mps_phase_kind::quadratic: return 6; - } - mps_parser_fail(error_type_t::RuntimeError, "invalid MPS phase kind"); -} +std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase) { return (std::size_t)phase; } void mps_phase_registry_t::publish(mps_phase_kind phase, mps_phase_range_t range) { @@ -105,68 +116,37 @@ bool mps_phase_registry_t::ready(mps_phase_kind phase) const mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const { - return ranges_[phase_index(phase)]; + std::size_t idx = phase_index(phase); + assert(ready_[idx].load(std::memory_order_acquire)); + return ranges_[idx]; } -bool line_is_section(const char* line_start, const char* line_end, mps_section_kind* kind) +static section_record_match_t is_section_record(const char* line_start, + const char* line_end, + mps_section_kind* kind) { - if (line_start >= line_end) { return false; } - - mps_section_kind candidate; - switch (*line_start) { - case 'R': - if (line_end - line_start >= 3 && std::memcmp(line_start, "RHS", 3) == 0) { - candidate = mps_section_kind::rhs; - } else if (line_end - line_start >= 4 && std::memcmp(line_start, "ROWS", 4) == 0) { - candidate = mps_section_kind::rows; - } else if (line_end - line_start >= 6 && std::memcmp(line_start, "RANGES", 6) == 0) { - candidate = mps_section_kind::ranges; - } else { - return false; - } - break; - case 'C': - if (line_end - line_start >= 7 && std::memcmp(line_start, "COLUMNS", 7) == 0) { - candidate = mps_section_kind::columns; - } else { - return false; - } - break; - case 'B': - if (line_end - line_start >= 6 && std::memcmp(line_start, "BOUNDS", 6) == 0) { - candidate = mps_section_kind::bounds; - } 
else { - return false; - } - break; - case 'E': - if (line_end - line_start >= 6 && std::memcmp(line_start, "ENDATA", 6) == 0) { - candidate = mps_section_kind::endata; - } else { - return false; - } - break; - case 'Q': - if (line_end - line_start >= 7 && std::memcmp(line_start, "QUADOBJ", 7) == 0) { - candidate = mps_section_kind::quadobj; - } else if (line_end - line_start >= 7 && std::memcmp(line_start, "QMATRIX", 7) == 0) { - candidate = mps_section_kind::qmatrix; - } else if (line_end - line_start >= 8 && std::memcmp(line_start, "QCMATRIX", 8) == 0) { - candidate = mps_section_kind::qcmatrix; - } else { - return false; - } - break; - default: return false; + if (line_start >= line_end) { return section_record_match_t::invalid; } + + for (const char* name : header_records) { + if (line_has_record_prefix(line_start, line_end, name)) { + return section_record_match_t::header; + } } - const char* after = line_start + section_name_len(candidate); - while (after < line_end && (*after == ' ' || *after == '\t' || *after == '\r')) { - ++after; + for (const section_record_t& record : section_records) { + if ((std::size_t)(line_end - line_start) < record.len || + std::memcmp(line_start, record.name, record.len) != 0) { + continue; + } + const char* after = line_start + record.len; + while (after < line_end && (*after == ' ' || *after == '\t' || *after == '\r')) { + ++after; + } + if (after != line_end) { return section_record_match_t::invalid; } + *kind = record.kind; + return section_record_match_t::section; } - if (after != line_end) { return false; } - *kind = candidate; - return true; + return section_record_match_t::invalid; } mps_section_block_scanner_t::mps_section_block_scanner_t(const char* data, @@ -188,18 +168,7 @@ mps_section_block_scanner_t::mps_section_block_scanner_t(const char* data, std::size_t mps_section_block_scanner_t::section_hit_index(mps_section_kind kind) { - switch (kind) { - case mps_section_kind::rows: return 0; - case 
mps_section_kind::columns: return 1; - case mps_section_kind::rhs: return 2; - case mps_section_kind::bounds: return 3; - case mps_section_kind::ranges: return 4; - case mps_section_kind::quadobj: return 5; - case mps_section_kind::qmatrix: return 6; - case mps_section_kind::qcmatrix: return 7; - case mps_section_kind::endata: return 8; - } - return 0; + return (std::size_t)kind; } void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, const char* ptr) @@ -212,11 +181,8 @@ void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, cons } } -void mps_section_block_scanner_t::scan_section_range(const char* begin, - const char* end, - bool boundary_scan) +void mps_section_block_scanner_t::scan_section_range(const char* begin, const char* end) { - (void)boundary_scan; if (begin >= end) return; const char* p = begin; @@ -224,21 +190,39 @@ void mps_section_block_scanner_t::scan_section_range(const char* begin, // line. A separate boundary scan covers section titles whose newline/title // bytes straddle adjacent LZ4 blocks. if (p != data_) { - const void* nl = __builtin_memchr(p, '\n', static_cast(end - p)); + const void* nl = __builtin_memchr(p, '\n', (std::size_t)(end - p)); if (nl == nullptr) { return; } - p = static_cast(nl) + 1; + p = (const char*)nl + 1; } auto try_candidate = [&](const char* line_start) { - const void* nl = __builtin_memchr(line_start, '\n', static_cast(end - line_start)); - const char* line_end = nl == nullptr ? 
end : static_cast(nl); + const void* nl = __builtin_memchr(line_start, '\n', (std::size_t)(end - line_start)); + const char* line_end = nullptr; + if (nl == nullptr) { + const char* ready_ptr = data_ + ready_bytes_.load(std::memory_order_acquire); + if (end != ready_ptr) { return; } + line_end = end; + } else { + line_end = (const char*)nl; + } + if (*line_start == '*' || *line_start == '$') { return; } mps_section_kind kind; - if (line_is_section(line_start, line_end, &kind)) { record_section_hit(kind, line_start); } + section_record_match_t match = is_section_record(line_start, line_end, &kind); + if (match == section_record_match_t::section) { + record_section_hit(kind, line_start); + return; + } + if (match == section_record_match_t::invalid) { + mps_parser_fail(error_type_t::ValidationError, + "unknown section record: %.*s", + (int)(line_end - line_start), + line_start); + } }; // Handle the very first line of a file (NAME indicator, usually) if (p == data_) { - if (p < end && is_nonblank_column1(static_cast(*p))) { try_candidate(p); } + if (p < end && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); } ++p; } @@ -246,24 +230,25 @@ void mps_section_block_scanner_t::scan_section_range(const char* begin, // begin in column 2+. Treat start-of-file or "\n[nonblank]" as the cheap // candidate signal, then run the exact section matcher only for candidates. const simde__m256i newline = simde_mm256_set1_epi8('\n'); - while (static_cast(end - p) >= 32) { + while ((std::size_t)(end - p) >= kSimdWidth) { + // The first-line path above increments p when p == data_, so p - 1 is + // in-bounds here. Loading the previous vector lets us test "\nX" for all + // 32 candidate column-1 bytes with one AVX2 mask. 
simde__m256i current = simde_mm256_loadu_si256(reinterpret_cast(p)); simde__m256i previous = simde_mm256_loadu_si256(reinterpret_cast(p - 1)); - std::uint32_t mask = static_cast(simde_mm256_movemask_epi8(simde_mm256_and_si256( - simde_mm256_cmpeq_epi8(previous, newline), nonblank_column1_mask(current)))); + std::uint32_t mask = (std::uint32_t)simde_mm256_movemask_epi8(simde_mm256_and_si256( + simde_mm256_cmpeq_epi8(previous, newline), nonblank_column1_mask(current))); while (mask != 0) { int bit = __builtin_ctz(mask); try_candidate(p + bit); mask &= mask - 1; } - p += 32; + p += kSimdWidth; } // scalar tail while (p < end) { - if (*(p - 1) == '\n' && is_nonblank_column1(static_cast(*p))) { - try_candidate(p); - } + if (*(p - 1) == '\n' && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); } ++p; } } @@ -277,7 +262,7 @@ void mps_section_block_scanner_t::scan_boundary(std::size_t left_index, std::siz boundary - left_begin > boundary_overlap ? boundary - boundary_overlap : left_begin; std::size_t end = right_end - boundary > boundary_overlap ? 
boundary + boundary_overlap : right_end; - scan_section_range(data_ + begin, data_ + end, true); + scan_section_range(data_ + begin, data_ + end); } void mps_section_block_scanner_t::observe_block(std::size_t block_index, @@ -289,11 +274,9 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index, "MPS section scanner observed invalid LZ4 block index"); } - scan_section_range(begin, end, false); - block_begin_offsets_[block_index].store(static_cast(begin - data_), - std::memory_order_relaxed); - block_end_offsets_[block_index].store(static_cast(end - data_), - std::memory_order_relaxed); + scan_section_range(begin, end); + block_begin_offsets_[block_index].store((std::size_t)(begin - data_), std::memory_order_relaxed); + block_end_offsets_[block_index].store((std::size_t)(end - data_), std::memory_order_relaxed); block_decoded_[block_index].store(1, std::memory_order_release); if (block_index > 0 && block_decoded_[block_index - 1].load(std::memory_order_acquire)) { @@ -308,11 +291,18 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index, void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes) { ready_bytes_.store(ready_bytes, std::memory_order_release); + std::size_t begin = ready_bytes > boundary_overlap ? ready_bytes - boundary_overlap : 0; + scan_section_range(data_ + begin, data_ + ready_bytes); publish_section_ranges(); } void mps_section_block_scanner_t::publish_section_ranges() { + // Publication model: each present phase runs from its own section header to + // the first later section header that has been discovered. Optional sections + // publish present=false once a later boundary proves they cannot still appear. + // ENDATA, or final ready bytes for truncated/non-newline files, is the final + // boundary for the trailing optional/quadratic phases. 
std::lock_guard lock(publish_mutex_); std::size_t ready = ready_bytes_.load(std::memory_order_acquire); const char* ready_ptr = data_ + ready; @@ -349,6 +339,21 @@ void mps_section_block_scanner_t::publish_section_ranges() } return best; }; + auto publish_optional = [&](mps_phase_kind phase, + const char* self, + const char* predecessor, + std::initializer_list later_candidates) { + if (registry_.ready(phase)) { return; } + if (available(self)) { + const char* end = earliest_available_after(self, later_candidates); + if (end != nullptr) { registry_.publish(phase, {self, end, true}); } + return; + } + if (predecessor != nullptr && + earliest_available_after(predecessor, later_candidates) != nullptr) { + registry_.publish(phase, {nullptr, nullptr, false}); + } + }; if (available(rows) && !registry_.ready(mps_phase_kind::header)) { registry_.publish(mps_phase_kind::header, {data_, rows, true}); @@ -364,43 +369,18 @@ void mps_section_block_scanner_t::publish_section_ranges() } } - if (!registry_.ready(mps_phase_kind::rhs)) { - if (available(rhs)) { - const char* rhs_end = - earliest_available_after(rhs, {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary}); - if (rhs_end != nullptr) { registry_.publish(mps_phase_kind::rhs, {rhs, rhs_end, true}); } - } else { - const char* after_columns = earliest_available_after( - columns, {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary}); - if (after_columns != nullptr) { - registry_.publish(mps_phase_kind::rhs, {nullptr, nullptr, false}); - } - } - } - - if (!registry_.ready(mps_phase_kind::ranges)) { - const char* ranges_end = - earliest_available_after(ranges, {bounds, quadobj, qmatrix, qcmatrix, final_boundary}); - const char* after_rhs = earliest_available_after( - rhs ? 
rhs : columns, {bounds, quadobj, qmatrix, qcmatrix, final_boundary}); - if (available(ranges) && ranges_end != nullptr) { - registry_.publish(mps_phase_kind::ranges, {ranges, ranges_end, true}); - } else if (!ranges && after_rhs != nullptr) { - registry_.publish(mps_phase_kind::ranges, {nullptr, nullptr, false}); - } - } - - if (!registry_.ready(mps_phase_kind::bounds)) { - const char* bounds_end = - earliest_available_after(bounds, {quadobj, qmatrix, qcmatrix, final_boundary}); - const char* after_ranges = earliest_available_after( - ranges ? ranges : (rhs ? rhs : columns), {quadobj, qmatrix, qcmatrix, final_boundary}); - if (available(bounds) && bounds_end != nullptr) { - registry_.publish(mps_phase_kind::bounds, {bounds, bounds_end, true}); - } else if (!bounds && after_ranges != nullptr) { - registry_.publish(mps_phase_kind::bounds, {nullptr, nullptr, false}); - } - } + publish_optional(mps_phase_kind::rhs, + rhs, + columns, + {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + publish_optional(mps_phase_kind::ranges, + ranges, + rhs ? rhs : columns, + {bounds, quadobj, qmatrix, qcmatrix, final_boundary}); + publish_optional(mps_phase_kind::bounds, + bounds, + ranges ? ranges : (rhs ? rhs : columns), + {quadobj, qmatrix, qcmatrix, final_boundary}); if (!registry_.ready(mps_phase_kind::quadratic)) { const char* quadratic_begin = nullptr; diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp index cc287368fb..74bf89da7f 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp @@ -47,6 +47,8 @@ class mps_phase_registry_t { void attach_event(mps_phase_kind phase, omp_event_handle_t event); bool ready(mps_phase_kind phase) const; + // range() is lock-free: callers must observe ready(phase)==true first. The + // acquire load in ready() pairs with publish()'s release store before ranges_. 
mps_phase_range_t range(mps_phase_kind phase) const; private: @@ -62,8 +64,6 @@ class mps_phase_registry_t { mutable std::mutex mutex_; }; -bool line_is_section(const char* line_start, const char* line_end, mps_section_kind* kind); - class mps_section_block_scanner_t { public: mps_section_block_scanner_t(const char* data, @@ -74,12 +74,14 @@ class mps_section_block_scanner_t { void publish_ready(std::size_t ready_bytes); private: - static constexpr std::size_t section_count = 9; + static constexpr std::size_t section_count = 9; + // Section titles are short; 128 bytes is enough to rescan around a decoded + // block boundary and catch a newline/title pair split across adjacent blocks. static constexpr std::size_t boundary_overlap = 128; static std::size_t section_hit_index(mps_section_kind kind); - void scan_section_range(const char* begin, const char* end, bool boundary_scan); + void scan_section_range(const char* begin, const char* end); void scan_boundary(std::size_t left_index, std::size_t right_index); void record_section_hit(mps_section_kind kind, const char* ptr); void publish_section_ranges(); diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp index 23f4b4b8c1..f8a6d04d1e 100644 --- a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp +++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp @@ -121,7 +121,7 @@ class scoped_range { inline void name_current_thread(const char* name) { #ifdef MPS_FAST_NVTX - nvtxNameOsThreadA(static_cast(::syscall(SYS_gettid)), name); + nvtxNameOsThreadA((std::uint32_t)::syscall(SYS_gettid), name); #else (void)name; #endif diff --git a/cpp/src/io/experimental_mps_fast/simd_compat.hpp b/cpp/src/io/experimental_mps_fast/simd_compat.hpp deleted file mode 100644 index fb849fcff0..0000000000 --- a/cpp/src/io/experimental_mps_fast/simd_compat.hpp +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. 
All rights -// reserved. SPDX-License-Identifier: Apache-2.0 - -#pragma once - -// Use SIMDe's explicit simde_* API. On x86 it can still lower to native -// intrinsics; on other targets it provides the portable implementation. -#include -#include -#include diff --git a/cpp/tests/linear_programming/CMakeLists.txt b/cpp/tests/linear_programming/CMakeLists.txt index bc057db1e2..fcceb4af56 100644 --- a/cpp/tests/linear_programming/CMakeLists.txt +++ b/cpp/tests/linear_programming/CMakeLists.txt @@ -21,6 +21,44 @@ ConfigureTest(MPS_PARSER_TEST ${CMAKE_CURRENT_SOURCE_DIR}/parser_test.cpp LABELS numopt) +function(ConfigureStandaloneMpsFastTest CMAKE_TEST_NAME TEST_SOURCE) + add_executable(${CMAKE_TEST_NAME} ${TEST_SOURCE}) + target_include_directories(${CMAKE_TEST_NAME} + PRIVATE + "${CUOPT_TEST_DIR}/../src" + "${CUOPT_TEST_DIR}/../src/io" + "${CUOPT_TEST_DIR}/../src/io/experimental_mps_fast" + ) + target_compile_features(${CMAKE_TEST_NAME} PRIVATE cxx_std_20) + target_compile_options(${CMAKE_TEST_NAME} + PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" + ) + target_link_libraries(${CMAKE_TEST_NAME} + PRIVATE + cuopt + simde::simde + ${CUOPT_PRIVATE_CUDA_LIBS} + ) + if(NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") + target_link_options(${CMAKE_TEST_NAME} PRIVATE -Wl,--enable-new-dtags) + endif() + + add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) + set_tests_properties(${CMAKE_TEST_NAME} PROPERTIES LABELS "numopt") + + install( + TARGETS ${CMAKE_TEST_NAME} + COMPONENT testing + DESTINATION bin/gtests/libcuopt + EXCLUDE_FROM_ALL + ) +endfunction() + +ConfigureStandaloneMpsFastTest(MPS_FAST_FP64_PARSER_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_fp64_parser_test.cpp) +ConfigureStandaloneMpsFastTest(MPS_FAST_PARSER_EDGE_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_edge_test.cpp) + # ################################################################################################## # - C API 
Tests---------------------------------------------------------------------- # The C API tests require a separate library to be linked against. So we don't use the ConfigureTest macro. diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp new file mode 100644 index 0000000000..36171267cf --- /dev/null +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp @@ -0,0 +1,231 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fast_fp64_parser.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +uint64_t bits(double value) { return std::bit_cast(value); } + +[[noreturn]] void fail(const std::string& message) { throw std::runtime_error(message); } + +void expect_true(bool condition, const std::string& message) +{ + if (!condition) { fail(message); } +} + +void expect_eq_ptr(const char* got, const char* expected, std::string_view context) +{ + if (got != expected) { + std::ostringstream out; + out << context << ": pointer mismatch got_delta=" << (got - expected); + fail(out.str()); + } +} + +double reference_strtod(std::string_view token) +{ + std::string normalized(token); + for (char& c : normalized) { + if (c == 'd' || c == 'D') { c = 'e'; } + } + char* end = nullptr; + errno = 0; + double value = std::strtod(normalized.c_str(), &end); + expect_eq_ptr(end, normalized.c_str() + normalized.size(), token); + return value; +} + +double parse_token(std::string_view token) +{ + const char* p = token.data(); + return mps_fast::fp64::parse_fp64_advance(p, token.data() + token.size()); +} + +double parse_padded_token(std::string_view token) +{ + std::string padded(token); + padded.append(40, ' '); + const 
char* p = padded.data(); + double value = mps_fast::fp64::parse_fp64_advance(p, padded.data() + padded.size()); + expect_eq_ptr(p, padded.data() + token.size(), token); + return value; +} + +void expect_bitwise_strtod(std::string_view token) +{ + double ref = reference_strtod(token); + uint64_t token_bits = bits(parse_token(token)); + uint64_t padded_bits = bits(parse_padded_token(token)); + uint64_t ref_bits = bits(ref); + if (token_bits != ref_bits || padded_bits != ref_bits) { + std::ostringstream out; + out << "bitwise mismatch for '" << token << "' ref=0x" << std::hex << ref_bits << " token=0x" + << token_bits << " padded=0x" << padded_bits; + fail(out.str()); + } +} + +std::string random_token(std::mt19937_64& rng) +{ + std::uniform_int_distribution sign_dist(0, 4); + std::uniform_int_distribution digit_dist(0, 9); + std::uniform_int_distribution shape_dist(0, 5); + std::uniform_int_distribution len_dist(1, 19); + std::uniform_int_distribution exp_dist(-30, 30); + + std::string token; + int sign = sign_dist(rng); + if (sign == 0) { + token.push_back('-'); + } else if (sign == 1) { + token.push_back('+'); + } + + int shape = shape_dist(rng); + if (shape == 0) { + token.append("0."); + int frac_len = std::uniform_int_distribution(1, 19)(rng); + for (int i = 0; i < frac_len; ++i) { + token.push_back(static_cast('0' + digit_dist(rng))); + } + } else { + int int_len = len_dist(rng); + token.push_back(static_cast('1' + std::uniform_int_distribution(0, 8)(rng))); + for (int i = 1; i < int_len; ++i) { + token.push_back(static_cast('0' + digit_dist(rng))); + } + if (shape >= 2) { + token.push_back('.'); + int remaining = 24 - static_cast(token.size()); + int max_frac = std::max(0, std::min(19, remaining)); + int frac_len = max_frac == 0 ? 
0 : std::uniform_int_distribution(0, max_frac)(rng); + for (int i = 0; i < frac_len; ++i) { + token.push_back(static_cast('0' + digit_dist(rng))); + } + } + } + + if (shape == 5) { + int exp = exp_dist(rng); + std::string suffix = "e" + std::to_string(exp); + if (token.size() + suffix.size() <= 25) { token += suffix; } + } + + if (token.size() > 25) { token.resize(25); } + return token; +} + +void common_table_matches_strtod_bitwise() +{ + std::setlocale(LC_NUMERIC, "C"); + const std::vector cases = { + "0", + "-0", + "1", + "-1", + "+1", + "2", + "42", + "123456789", + "57.", + "-57.", + "0.1", + "0.01", + "0.12345678901234", + "0.1234567890123456", + "0.3333333333333333", + "0.6508282938248958", + "3.14159", + "3130000", + "8594600.16", + "2344.55", + "0.000000000000001", + "9999999999999999", + "1844674407370955161", + "1e0", + "1e-9", + "1E12", + "-2.5e3", + "3.125D-2", + }; + + for (std::string_view token : cases) { + expect_bitwise_strtod(token); + } +} + +void cursor_advances_to_token_end() +{ + std::setlocale(LC_NUMERIC, "C"); + std::string text = "123.45 ABC"; + const char* p = text.data(); + double value = mps_fast::fp64::parse_fp64_advance(p, text.data() + text.size()); + + expect_true(bits(value) == bits(reference_strtod("123.45")), "parsed value mismatch"); + expect_eq_ptr(p, text.data() + 6, "cursor_advances_to_token_end"); + expect_true(std::string_view(p, 5) == " ABC", "cursor did not stop before trailing field"); +} + +void fixed_seed_random_differential() +{ + std::setlocale(LC_NUMERIC, "C"); + std::mt19937_64 rng(0x4d50535f46415354ULL); + for (int i = 0; i < 100000; ++i) { + std::string token = random_token(rng); + expect_true(token.size() <= 25U, "generated token exceeds MPS numeric token length"); + expect_bitwise_strtod(token); + } +} + +} // namespace + +int main() +{ + struct TestCase { + const char* name; + void (*fn)(); + }; + + const TestCase tests[] = { + {"CommonTableMatchesStrtodBitwise", common_table_matches_strtod_bitwise}, + 
{"CursorAdvancesToTokenEnd", cursor_advances_to_token_end}, + {"FixedSeedRandomDifferential", fixed_seed_random_differential}, + }; + + int failed = 0; + for (const TestCase& test : tests) { + std::cout << "[ RUN ] " << test.name << '\n'; + try { + test.fn(); + std::cout << "[ OK ] " << test.name << '\n'; + } catch (const std::exception& e) { + ++failed; + std::cerr << "[ FAILED ] " << test.name << ": " << e.what() << '\n'; + } + } + + if (failed != 0) { + std::cerr << failed << " test(s) failed\n"; + return 1; + } + std::cout << "[ PASSED ] " << std::size(tests) << " test(s)\n"; + return 0; +} diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp new file mode 100644 index 0000000000..2e087ec4ee --- /dev/null +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp @@ -0,0 +1,871 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "fast_parser.hpp" +#include "mps_section_scanner.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace { + +struct skip_test : std::runtime_error { + using std::runtime_error::runtime_error; +}; + +[[noreturn]] void fail(const std::string& message) { throw std::runtime_error(message); } + +void expect_true(bool condition, const std::string& message) +{ + if (!condition) { fail(message); } +} + +template +void expect_eq(const A& got, const B& expected, std::string_view context) +{ + if (!(got == expected)) { + std::ostringstream out; + out << context << ": got=" << got << " expected=" << expected; + fail(out.str()); + } +} + +template +void expect_vector_eq(const VecA& got, const VecB& expected, std::string_view context) +{ + if (got.size() != expected.size()) { + std::ostringstream out; + out << context << ": size got=" << got.size() << " expected=" << expected.size(); + fail(out.str()); + } + for (size_t i = 0; i < got.size(); ++i) { + if (!(got[i] == expected[i])) { + std::ostringstream out; + out << context << ": first mismatch at " << i; + fail(out.str()); + } + } +} + +void expect_near_inf(double value, int sign, std::string_view context) +{ + expect_true(std::isinf(value), std::string(context) + ": expected infinity"); + expect_true(std::signbit(value) == (sign < 0), std::string(context) + ": wrong infinity sign"); +} + +struct TempMpsFile { + explicit TempMpsFile(std::string contents) + { + char path_template[128]; + std::snprintf(path_template, + sizeof(path_template), + "/tmp/mps_fast_parser_edge_%ld_XXXXXX.mps", + static_cast(getpid())); + int fd = mkstemps(path_template, 4); + if (fd < 0) { fail(std::string("mkstemps failed: ") + std::strerror(errno)); } + path = path_template; + FILE* file = fdopen(fd, "wb"); + if (file == nullptr) { + 
close(fd); + fail(std::string("fdopen failed: ") + std::strerror(errno)); + } + if (!contents.empty() && + std::fwrite(contents.data(), 1, contents.size(), file) != contents.size()) { + std::fclose(file); + fail(std::string("failed to write temporary MPS file: ") + std::strerror(errno)); + } + if (std::fclose(file) != 0) { + fail(std::string("failed to close temporary MPS file: ") + std::strerror(errno)); + } + } + + TempMpsFile(const TempMpsFile&) = delete; + TempMpsFile& operator=(const TempMpsFile&) = delete; + + ~TempMpsFile() + { + if (!path.empty()) { std::remove(path.c_str()); } + } + + std::string path; +}; + +struct TempOwnedPath { + explicit TempOwnedPath(std::string p) : path(std::move(p)) {} + TempOwnedPath(const TempOwnedPath&) = delete; + TempOwnedPath& operator=(const TempOwnedPath&) = delete; + + ~TempOwnedPath() + { + if (!path.empty()) { std::remove(path.c_str()); } + } + + std::string path; +}; + +template +void expect_throws(Fn&& fn, std::string_view context) +{ + try { + fn(); + } catch (const std::exception&) { + return; + } + fail(std::string(context) + ": expected exception"); +} + +void expect_fast_parse_error(std::string_view fixture_name, std::string contents) +{ + TempMpsFile file(std::move(contents)); + expect_throws( + [&] { + (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + }, + fixture_name); +} + +std::string_view range_text(const mps_fast::mps_phase_range_t& range) +{ + if (!range.present) { return {}; } + return std::string_view(range.begin, static_cast(range.end - range.begin)); +} + +void scanner_finds_section_split_across_blocks() +{ + const std::string mps = + "NAME EDGE\n" + "ROWS\n" + " N OBJ\n" + " L rowA\n" + "COLUMNS\n" + " x1 OBJ 1\n" + " x1 rowA 2\n" + "RHS\n" + " rhs rowA 3\n" + "ENDATA\n"; + + const size_t columns_pos = mps.find("COLUMNS"); + expect_true(columns_pos != std::string::npos, "failed to place COLUMNS split"); + const size_t split = columns_pos + 3; + + 
mps_fast::mps_phase_registry_t registry; + mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry); + + scanner.observe_block(1, mps.data() + split, mps.data() + mps.size()); + scanner.publish_ready(0); + scanner.observe_block(0, mps.data(), mps.data() + split); + scanner.publish_ready(mps.size()); + + expect_true(registry.ready(mps_fast::mps_phase_kind::header), "header not ready"); + expect_true(registry.ready(mps_fast::mps_phase_kind::rows), "rows not ready"); + expect_true(registry.ready(mps_fast::mps_phase_kind::columns), "columns not ready"); + expect_true(registry.ready(mps_fast::mps_phase_kind::rhs), "rhs not ready"); + expect_true(registry.ready(mps_fast::mps_phase_kind::quadratic), "quadratic sentinel not ready"); + + expect_true(range_text(registry.range(mps_fast::mps_phase_kind::columns)).starts_with("COLUMNS"), + "columns range begins at wrong boundary"); + expect_true(range_text(registry.range(mps_fast::mps_phase_kind::rhs)).starts_with("RHS"), + "rhs range begins at wrong boundary"); +} + +void scanner_rejects_unknown_column_one_records_after_rows() +{ + const std::string mps = + "NAME BAD\n" + "ROWS\n" + " N OBJ\n" + "FOO\n" + "COLUMNS\n" + " x OBJ 1\n" + "ENDATA\n"; + + expect_throws( + [&] { + mps_fast::mps_phase_registry_t registry; + mps_fast::mps_section_block_scanner_t scanner(mps.data(), 1, registry); + scanner.observe_block(0, mps.data(), mps.data() + mps.size()); + scanner.publish_ready(mps.size()); + }, + "unknown column-1 record after ROWS"); +} + +uint64_t bits(double value) { return std::bit_cast(value); } + +void expect_double_bitwise_eq(double got, double expected, std::string_view context) +{ + if (bits(got) != bits(expected)) { + std::ostringstream out; + out << context << ": got=0x" << std::hex << bits(got) << " expected=0x" << bits(expected); + fail(out.str()); + } +} + +template +void expect_double_vector_bitwise_eq(const VecA& got, + const VecB& expected, + std::string_view context) +{ + if (got.size() != 
expected.size()) { + std::ostringstream out; + out << context << ": size got=" << got.size() << " expected=" << expected.size(); + fail(out.str()); + } + for (size_t i = 0; i < got.size(); ++i) { + if (bits(got[i]) != bits(expected[i])) { + std::ostringstream out; + out << context << ": first bitwise mismatch at " << i << " got=0x" << std::hex << bits(got[i]) + << " expected=0x" << bits(expected[i]); + fail(out.str()); + } + } +} + +void expect_models_match_reference_bitwise( + const mps_fast::parser_model_t& fast, + const cuopt::linear_programming::io::mps_data_model_t& reference, + std::string_view context) +{ + expect_eq(fast.n_vars_, reference.n_vars_, std::string(context) + " n_vars"); + expect_eq(fast.n_constraints_, reference.n_constraints_, std::string(context) + " n_constraints"); + expect_eq(fast.nnz_, reference.nnz_, std::string(context) + " nnz"); + expect_eq(fast.maximize_, reference.maximize_, std::string(context) + " maximize"); + expect_eq(fast.problem_name_, reference.problem_name_, std::string(context) + " problem_name"); + expect_eq( + fast.objective_name_, reference.objective_name_, std::string(context) + " objective_name"); + + expect_double_bitwise_eq(fast.objective_scaling_factor_, + reference.objective_scaling_factor_, + std::string(context) + " objective_scaling_factor"); + expect_double_bitwise_eq(fast.objective_offset_, + reference.objective_offset_, + std::string(context) + " objective_offset"); + + expect_double_vector_bitwise_eq(fast.A_, reference.A_, std::string(context) + " A"); + expect_vector_eq(fast.A_indices_, reference.A_indices_, std::string(context) + " A_indices"); + expect_vector_eq(fast.A_offsets_, reference.A_offsets_, std::string(context) + " A_offsets"); + expect_double_vector_bitwise_eq(fast.b_, reference.b_, std::string(context) + " b"); + expect_double_vector_bitwise_eq(fast.c_, reference.c_, std::string(context) + " c"); + expect_double_vector_bitwise_eq(fast.variable_lower_bounds_, + 
reference.variable_lower_bounds_, + std::string(context) + " variable_lower_bounds"); + expect_double_vector_bitwise_eq(fast.variable_upper_bounds_, + reference.variable_upper_bounds_, + std::string(context) + " variable_upper_bounds"); + expect_double_vector_bitwise_eq(fast.constraint_lower_bounds_, + reference.constraint_lower_bounds_, + std::string(context) + " constraint_lower_bounds"); + expect_double_vector_bitwise_eq(fast.constraint_upper_bounds_, + reference.constraint_upper_bounds_, + std::string(context) + " constraint_upper_bounds"); + expect_vector_eq(fast.var_types_, reference.var_types_, std::string(context) + " var_types"); + expect_vector_eq(fast.row_types_, reference.row_types_, std::string(context) + " row_types"); + expect_vector_eq(fast.var_names_, reference.var_names_, std::string(context) + " var_names"); + expect_vector_eq(fast.row_names_, reference.row_names_, std::string(context) + " row_names"); +} + +void verify_fixture_bitwise(std::string_view fixture_name, std::string contents) +{ + TempMpsFile file(std::move(contents)); + auto fast = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + auto reference = cuopt::linear_programming::io::read_mps(file.path, false); + expect_models_match_reference_bitwise(fast, reference, fixture_name); +} + +std::string row_name(size_t i) +{ + std::ostringstream out; + out << 'R' << std::setw(6) << std::setfill('0') << i; + return out.str(); +} + +size_t find_var(const mps_fast::parser_model_t& model, std::string_view name) +{ + for (size_t i = 0; i < model.var_names_.size(); ++i) { + if (model.var_names_[i] == name) { return i; } + } + fail("variable not found: " + std::string(name)); +} + +void expect_model_shapes(const mps_fast::parser_model_t& model, + int rows, + int vars, + int nnz, + std::string_view context) +{ + expect_eq(model.n_constraints_, rows, std::string(context) + " rows"); + expect_eq(model.n_vars_, vars, std::string(context) + " vars"); + expect_eq(model.nnz_, nnz, 
std::string(context) + " nnz"); + expect_eq( + model.A_offsets_.size(), static_cast(rows + 1), std::string(context) + " offsets"); + expect_eq(model.A_.size(), static_cast(nnz), std::string(context) + " values"); + expect_eq(model.A_indices_.size(), static_cast(nnz), std::string(context) + " indices"); +} + +std::string section_split_fixture() +{ + return "NAME SPLITS\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 3\n" + "BOUNDS\n" + " UP BND X1 4\n" + "ENDATA\n"; +} + +void scanner_finds_headers_split_at_every_byte() +{ + const std::string mps = section_split_fixture(); + const std::vector headers = {"ROWS", "COLUMNS", "RHS", "BOUNDS", "ENDATA"}; + + for (std::string_view header : headers) { + const size_t pos = mps.find(header); + expect_true(pos != std::string::npos, "missing header in split fixture"); + for (size_t offset = 1; offset < header.size(); ++offset) { + const size_t split = pos + offset; + mps_fast::mps_phase_registry_t registry; + mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry); + + scanner.observe_block(1, mps.data() + split, mps.data() + mps.size()); + scanner.observe_block(0, mps.data(), mps.data() + split); + scanner.publish_ready(mps.size()); + + expect_true(registry.ready(mps_fast::mps_phase_kind::rows), "rows not ready after split"); + expect_true(registry.ready(mps_fast::mps_phase_kind::columns), + "columns not ready after split"); + expect_true(registry.ready(mps_fast::mps_phase_kind::rhs), "rhs not ready after split"); + expect_true(registry.ready(mps_fast::mps_phase_kind::bounds), "bounds not ready after split"); + expect_true(registry.ready(mps_fast::mps_phase_kind::quadratic), + "quadratic sentinel not ready after split"); + } + } +} + +void bounds_defaults_and_types_match_reference() +{ + verify_fixture_bitwise("bounds_defaults_and_types", + "NAME BOUNDS_EDGE\n" + "ROWS\n" + " N OBJ\n" + " L rowA\n" + "COLUMNS\n" + " XFREE rowA 1\n" + " XUP0 rowA 1\n" + " XNEG rowA 
1\n" + " XBV rowA 1\n" + " XFX rowA 1\n" + " XLI rowA 1\n" + "RHS\n" + " RHS1 rowA 10\n" + "BOUNDS\n" + " FR BND XFREE\n" + " UP BND XUP0 0\n" + " UP BND XNEG -1\n" + " BV BND XBV\n" + " FX BND XFX 7\n" + " LI BND XLI 2\n" + " UI BND XLI 9\n" + "ENDATA\n"); +} + +void duplicate_bounds_last_statement_wins() +{ + const std::string contents = + "NAME BOUNDS_DUP\n" + "ROWS\n" + " N OBJ\n" + " L rowA\n" + "COLUMNS\n" + " X1 rowA 1\n" + "RHS\n" + " RHS1 rowA 10\n" + "BOUNDS\n" + " LO BND X1 0\n" + " UP BND X1 5\n" + " UP BND X1 3\n" + " LO BND X1 2\n" + "ENDATA\n"; + + verify_fixture_bitwise("duplicate_bounds_last_statement_wins", contents); + TempMpsFile file(contents); + auto model = + mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + expect_eq(model.n_vars_, 1, "n_vars"); + expect_eq(model.variable_lower_bounds_.at(0), 2.0, "duplicate lower bound"); + expect_eq(model.variable_upper_bounds_.at(0), 3.0, "duplicate upper bound"); +} + +void nondense_row_and_column_names_use_hash_path() +{ + verify_fixture_bitwise("nondense_row_and_column_names", + "NAME HASH_NAMES\n" + "ROWS\n" + " N obj.row\n" + " G demand-east\n" + " L capacity-west\n" + " E balance.17\n" + "COLUMNS\n" + " alpha obj.row 4.5 demand-east 1\n" + " beta_two capacity-west -2 balance.17 3\n" + " z-last demand-east 7 balance.17 -1\n" + "RHS\n" + " rhs demand-east 2 capacity-west 9\n" + " rhs balance.17 0\n" + "BOUNDS\n" + " LO b alpha -5\n" + " UP b beta_two 6\n" + " FR b z-last\n" + "ENDATA\n"); +} + +void missing_optional_bounds_fast_path() +{ + TempMpsFile file( + "NAME OPTIONALS\n" + "ROWS\n" + " N OBJ\n" + " L rowA\n" + "COLUMNS\n" + " X1 OBJ 1 rowA 2\n" + "RHS\n" + " RHS1 rowA 0\n" + "ENDATA\n"); + + auto model = + mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + expect_eq(model.n_vars_, 1, "missing optional n_vars"); + expect_eq(model.n_constraints_, 1, "missing optional n_constraints"); + expect_eq(model.variable_lower_bounds_.at(0), 0.0, "missing 
BOUNDS lower default"); + expect_near_inf(model.variable_upper_bounds_.at(0), 1, "missing BOUNDS upper default"); +} + +void bounds_only_variables_are_appended_deterministically() +{ + TempMpsFile file( + "NAME BOUNDS_ONLY\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " XMAIN OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "BOUNDS\n" + " UP B AUX_Z 9\n" + " LO B AUX_Z -3\n" + " BV B AUX_A\n" + " SC B AUX_S 5\n" + "ENDATA\n"); + + auto model = + mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + expect_model_shapes(model, 1, 4, 1, "bounds-only"); + expect_eq(model.var_names_.at(0), std::string("XMAIN"), "main var name"); + expect_eq(model.var_names_.at(1), std::string("AUX_A"), "bounds-only sorted name 1"); + expect_eq(model.var_names_.at(2), std::string("AUX_S"), "bounds-only sorted name 2"); + expect_eq(model.var_names_.at(3), std::string("AUX_Z"), "bounds-only sorted name 3"); + + size_t aux_a = find_var(model, "AUX_A"); + size_t aux_s = find_var(model, "AUX_S"); + size_t aux_z = find_var(model, "AUX_Z"); + expect_eq(model.var_types_.at(aux_a), 'I', "bounds-only BV type"); + expect_eq(model.variable_lower_bounds_.at(aux_a), 0.0, "bounds-only BV lb"); + expect_eq(model.variable_upper_bounds_.at(aux_a), 1.0, "bounds-only BV ub"); + expect_eq(model.var_types_.at(aux_s), 'S', "bounds-only SC type"); + expect_eq(model.variable_upper_bounds_.at(aux_s), 5.0, "bounds-only SC ub"); + expect_eq(model.variable_lower_bounds_.at(aux_z), -3.0, "bounds-only duplicate lb"); + expect_eq(model.variable_upper_bounds_.at(aux_z), 9.0, "bounds-only duplicate ub"); +} + +void integer_markers_assign_types_and_default_bounds() +{ + TempMpsFile file( + "NAME MARKERS\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " MARK000 'MARKER' 'INTORG'\n" + " XINT OBJ 1 R1 1\n" + " MARK001 'MARKER' 'INTEND'\n" + " XCONT OBJ 2 R1 2\n" + " MARK002 'MARKER' 'INTORG'\n" + " XBIN OBJ 3 R1 3\n" + " MARK003 'MARKER' 'INTEND'\n" + "RHS\n" + " RHS1 R1 10\n" + "ENDATA\n"); + 
+ auto model = + mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + expect_model_shapes(model, 1, 3, 3, "integer markers"); + size_t xint = find_var(model, "XINT"); + size_t xcont = find_var(model, "XCONT"); + size_t xbin = find_var(model, "XBIN"); + expect_eq(model.var_types_.at(xint), 'I', "XINT type"); + expect_eq(model.var_types_.at(xcont), 'C', "XCONT type"); + expect_eq(model.var_types_.at(xbin), 'I', "XBIN type"); + expect_eq(model.variable_lower_bounds_.at(xint), 0.0, "XINT default lb"); + expect_eq(model.variable_upper_bounds_.at(xint), 1.0, "XINT default ub"); + expect_eq(model.variable_lower_bounds_.at(xbin), 0.0, "XBIN default lb"); + expect_eq(model.variable_upper_bounds_.at(xbin), 1.0, "XBIN default ub"); +} + +void numeric_parsing_integration_matches_reference_bitwise() +{ + verify_fixture_bitwise("numeric_parsing_integration", + "NAME NUMBERS\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + " G R2\n" + " E R3\n" + "COLUMNS\n" + " X0 OBJ 0.12345678901234 R1 1e-9\n" + " X1 OBJ -2.5E3 R2 0.12345678901234567890123\n" + " X2 R3 9999999999999999\n" + "RHS\n" + " RHS1 R1 3.14159 R2 -0.000000000000001\n" + " RHS1 R3 42\n" + "RANGES\n" + " RNG R1 0.25 R2 1E2\n" + "BOUNDS\n" + " LO B X0 -123456789\n" + " UP B X0 123456789\n" + " FX B X1 0.3333333333333333\n" + " FR B X2\n" + "ENDATA\n"); +} + +std::string to_crlf(std::string text) +{ + std::string converted; + converted.reserve(text.size() + text.size() / 8); + for (char c : text) { + if (c == '\n') { + converted += "\r\n"; + } else { + converted.push_back(c); + } + } + return converted; +} + +void crlf_line_endings_match_reference_bitwise() +{ + verify_fixture_bitwise("crlf_line_endings", + to_crlf("NAME CRLF_EDGE\n" + "OBJSENSE\n" + " MAX\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 3\n" + "BOUNDS\n" + " UP B X1 4\n" + "ENDATA\n")); +} + +void comment_placement_supported_cases_match_reference_bitwise() +{ + 
verify_fixture_bitwise("comment_placement_supported_cases", + "* leading star comment\n" + "$ leading dollar comment\n" + "NAME COMMENTS\n" + "$ comment between NAME and ROWS\n" + "ROWS\n" + "* comment after ROWS header\n" + " N OBJ $ row objective comment\n" + "$ comment between ROW records\n" + " L R1 $ row constraint comment\n" + "COLUMNS\n" + "* comment after COLUMNS header\n" + " X1 OBJ 1 R1 2 $ inline column comment\n" + "$ comment before next column\n" + " X2 OBJ -1 R1 3\n" + "RHS\n" + "$ comment after RHS header\n" + " RHS1 R1 5 $ inline rhs comment\n" + "BOUNDS\n" + "* comment after BOUNDS header\n" + " LO B X1 0 $ inline bound comment\n" + "$ comment before ENDATA\n" + "ENDATA\n"); +} + +void objective_metadata_selects_named_objective() +{ + TempMpsFile file( + "NAME OBJMETA\n" + "OBJSENSE\n" + " MAX\n" + "OBJNAME\n" + " COST\n" + "ROWS\n" + " N ALT\n" + " N COST\n" + " L R1\n" + "COLUMNS\n" + " X1 ALT 100 COST 5\n" + " X1 R1 1\n" + " X2 COST -2 R1 3\n" + "RHS\n" + " RHS1 COST 7 R1 11\n" + "ENDATA\n"); + + auto model = + mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + expect_true(model.maximize_, "OBJSENSE MAX not applied"); + expect_eq(model.problem_name_, std::string("OBJMETA"), "problem name"); + expect_eq(model.objective_name_, std::string("COST"), "objective name"); + expect_eq(model.objective_offset_, -7.0, "objective RHS offset"); + size_t x1 = find_var(model, "X1"); + size_t x2 = find_var(model, "X2"); + expect_eq(model.c_.at(x1), 5.0, "named objective coefficient X1"); + expect_eq(model.c_.at(x2), -2.0, "named objective coefficient X2"); +} + +void malformed_inputs_report_errors() +{ + expect_fast_parse_error("bad objsense", + "NAME BADOBJ\n" + "OBJSENSE\n" + " SIDEWAYS\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "ENDATA\n"); + + expect_fast_parse_error("unknown row in columns", + "NAME BADCOLROW\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 
MISSING 1\n" + "RHS\n" + " RHS1 R1 0\n" + "ENDATA\n"); + + expect_fast_parse_error("unknown row in rhs", + "NAME BADRHSROW\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 MISSING 1\n" + "ENDATA\n"); + + expect_fast_parse_error("unknown bound type", + "NAME BADBOUND\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "BOUNDS\n" + " XX B X1 1\n" + "ENDATA\n"); + + expect_fast_parse_error("semi-continuous bound without value", + "NAME BADSC\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "BOUNDS\n" + " SC B X1\n" + "ENDATA\n"); +} + +void large_columns_repeated_column_chunk_boundary() +{ + constexpr size_t row_count = 180000; + std::string mps; + mps.reserve(8 * 1024 * 1024); + mps += "NAME BIGCOLS\nROWS\n N OBJ\n"; + for (size_t i = 1; i <= row_count; ++i) { + mps += " L "; + mps += row_name(i); + mps += '\n'; + } + mps += "COLUMNS\n"; + for (size_t i = 1; i <= row_count; ++i) { + mps += " XBIG "; + mps += row_name(i); + mps += " 1\n"; + } + mps += " XTAIL "; + mps += row_name(1); + mps += " 2\nRHS\n RHS1 "; + mps += row_name(1); + mps += " 0\nENDATA\n"; + + TempMpsFile file(std::move(mps)); + auto model = + mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + expect_model_shapes( + model, static_cast(row_count), 2, static_cast(row_count + 1), "large columns"); + expect_eq(model.var_names_.at(0), std::string("XBIG"), "large repeated column name"); + expect_eq(model.var_names_.at(1), std::string("XTAIL"), "large tail column name"); +} + +void large_bounds_repeated_var_stays_ordered() +{ + constexpr size_t repeat_count = 700000; + std::string mps; + mps.reserve(12 * 1024 * 1024); + mps += + "NAME BIGBOUNDS\nROWS\n N OBJ\n L R1\nCOLUMNS\n alpha OBJ 1 R1 1\nRHS\n RHS1 R1 0\nBOUNDS\n"; + for (size_t i = 0; i < repeat_count; ++i) { + mps += " UP B alpha "; + mps += std::to_string(i % 1000); + mps += 
'\n'; + } + mps += "ENDATA\n"; + + TempMpsFile file(std::move(mps)); + auto model = + mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + expect_model_shapes(model, 1, 1, 1, "large bounds"); + expect_eq(model.variable_upper_bounds_.at(0), + static_cast((repeat_count - 1) % 1000), + "large repeated bounds last value"); +} + +void lz4_and_raw_paths_match_on_multiblock_input() +{ + constexpr size_t row_count = 70000; + std::string mps; + mps.reserve(4 * 1024 * 1024); + mps += "NAME LZ4PARITY\nROWS\n N OBJ\n"; + for (size_t i = 1; i <= row_count; ++i) { + mps += " L "; + mps += row_name(i); + mps += '\n'; + } + mps += "COLUMNS\n"; + for (size_t i = 1; i <= row_count; ++i) { + mps += " X"; + mps += std::to_string(i); + mps += ' '; + mps += row_name(i); + mps += " 0.125\n"; + } + mps += "RHS\n RHS1 "; + mps += row_name(1); + mps += " 1\nENDATA\n"; + + TempMpsFile raw_file(std::move(mps)); + TempOwnedPath lz4_file(raw_file.path + ".lz4"); + const std::string cmd = "lz4 -f -q " + raw_file.path + " " + lz4_file.path; + if (std::system(cmd.c_str()) != 0) { throw skip_test("lz4 CLI unavailable"); } + + auto raw = + mps_fast::parse_mps_fast_file(raw_file.path, mps_fast::FileReadMethod::Read); + auto lz4 = + mps_fast::parse_mps_fast_file(lz4_file.path, mps_fast::FileReadMethod::Read); + + expect_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity"); + expect_eq(lz4.var_names_.size(), raw.var_names_.size(), "lz4 var name count"); + expect_eq(lz4.row_names_.size(), raw.row_names_.size(), "lz4 row name count"); + expect_vector_eq(lz4.A_, raw.A_, "lz4 A values"); + expect_vector_eq(lz4.A_indices_, raw.A_indices_, "lz4 A indices"); + expect_vector_eq(lz4.A_offsets_, raw.A_offsets_, "lz4 A offsets"); + expect_vector_eq(lz4.c_, raw.c_, "lz4 objective"); + expect_vector_eq(lz4.b_, raw.b_, "lz4 rhs"); + expect_vector_eq(lz4.var_types_, raw.var_types_, "lz4 var types"); + expect_vector_eq(lz4.variable_lower_bounds_, raw.variable_lower_bounds_, 
"lz4 lower bounds"); + expect_vector_eq(lz4.variable_upper_bounds_, raw.variable_upper_bounds_, "lz4 upper bounds"); +} + +} // namespace + +int main() +{ + struct TestCase { + const char* name; + void (*fn)(); + }; + + const TestCase tests[] = { + {"ScannerFindsSectionSplitAcrossBlocks", scanner_finds_section_split_across_blocks}, + {"ScannerFindsHeadersSplitAtEveryByte", scanner_finds_headers_split_at_every_byte}, + {"ScannerRejectsUnknownColumnOneRecordsAfterRows", + scanner_rejects_unknown_column_one_records_after_rows}, + {"BoundsDefaultsAndTypesMatchReference", bounds_defaults_and_types_match_reference}, + {"DuplicateBoundsLastStatementWins", duplicate_bounds_last_statement_wins}, + {"NondenseRowAndColumnNamesUseHashPath", nondense_row_and_column_names_use_hash_path}, + {"MissingOptionalBoundsFastPath", missing_optional_bounds_fast_path}, + {"BoundsOnlyVariablesAreAppendedDeterministically", + bounds_only_variables_are_appended_deterministically}, + {"IntegerMarkersAssignTypesAndDefaultBounds", integer_markers_assign_types_and_default_bounds}, + {"NumericParsingIntegrationMatchesReferenceBitwise", + numeric_parsing_integration_matches_reference_bitwise}, + {"CrlfLineEndingsMatchReferenceBitwise", crlf_line_endings_match_reference_bitwise}, + {"CommentPlacementSupportedCasesMatchReferenceBitwise", + comment_placement_supported_cases_match_reference_bitwise}, + {"ObjectiveMetadataSelectsNamedObjective", objective_metadata_selects_named_objective}, + {"MalformedInputsReportErrors", malformed_inputs_report_errors}, + {"LargeColumnsRepeatedColumnChunkBoundary", large_columns_repeated_column_chunk_boundary}, + {"LargeBoundsRepeatedVarStaysOrdered", large_bounds_repeated_var_stays_ordered}, + {"Lz4AndRawPathsMatchOnMultiblockInput", lz4_and_raw_paths_match_on_multiblock_input}, + }; + + int failed = 0; + for (const TestCase& test : tests) { + std::cout << "[ RUN ] " << test.name << '\n'; + try { + test.fn(); + std::cout << "[ OK ] " << test.name << '\n'; + } catch 
(const skip_test& e) { + std::cout << "[ SKIPPED ] " << test.name << ": " << e.what() << '\n'; + } catch (const std::exception& e) { + ++failed; + std::cerr << "[ FAILED ] " << test.name << ": " << e.what() << '\n'; + } + } + + if (failed != 0) { + std::cerr << failed << " test(s) failed\n"; + return 1; + } + std::cout << "[ PASSED ] " << std::size(tests) << " test(s)\n"; + return 0; +} From 8e01e28b61d928a89776f9f3032a3057c2acd60f Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Wed, 10 Jun 2026 09:40:33 -0700 Subject: [PATCH 07/22] moved perf counters --- cpp/src/io/experimental_mps_fast/fast_parser.cpp | 2 +- .../{io/experimental_mps_fast => utilities}/perf_counters.hpp | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/src/{io/experimental_mps_fast => utilities}/perf_counters.hpp (100%) diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index de1b3ea84c..bd83ef2088 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -9,7 +9,7 @@ #include "mps_section_scanner.hpp" #include "nvtx_ranges.hpp" #ifdef MPS_FAST_PERF_COUNTERS -#include "perf_counters.hpp" +#include #endif #include diff --git a/cpp/src/io/experimental_mps_fast/perf_counters.hpp b/cpp/src/utilities/perf_counters.hpp similarity index 100% rename from cpp/src/io/experimental_mps_fast/perf_counters.hpp rename to cpp/src/utilities/perf_counters.hpp From 94bfbc78448ddae9cb85f12aee19f296871fc2a6 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Thu, 11 Jun 2026 06:17:57 -0700 Subject: [PATCH 08/22] extend the lz4 decompression to the regular parser, more cleanup and refactor --- cpp/cuopt_cli.cpp | 5 +- .../cuopt/linear_programming/io/parser.hpp | 47 +- .../fast_fp64_parser.hpp | 30 +- .../fast_parse_primitives.hpp | 100 +- .../io/experimental_mps_fast/fast_parser.cpp | 1274 +++++++---------- .../io/experimental_mps_fast/file_reader.cpp | 288 ---- 
.../io/experimental_mps_fast/file_reader.hpp | 70 +- .../experimental_mps_fast/lz4_file_reader.cpp | 162 +-- .../io/experimental_mps_fast/mmap_region.hpp | 4 +- .../mps_section_scanner.cpp | 31 + .../mps_section_scanner.hpp | 8 + .../io/experimental_mps_fast/nvtx_ranges.hpp | 22 +- cpp/src/io/file_to_string.cpp | 167 ++- cpp/src/io/file_to_string.hpp | 1 + cpp/src/utilities/perf_counters.hpp | 31 + .../fast_parser_edge_test.cpp | 44 + 16 files changed, 1052 insertions(+), 1232 deletions(-) diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index e99462091e..714d76dbf5 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -286,9 +286,8 @@ int main(int argc, char* argv[]) program.add_argument("filename") .help( "input problem file; format dispatched by extension (case-insensitive). " - "Supported: .lp, .mps, .qps and their .gz / .bz2 compressed variants " - "(e.g. .lp.gz, .mps.bz2, .qps.gz). Experimental .mps.lz4 inputs require " - "--mps-reader fast") + "Supported: .lp, .mps, .qps and their .gz / .bz2 / .lz4 compressed variants " + "(e.g. .lp.gz, .mps.bz2, .qps.lz4).") .nargs(1) .required(); diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp index 1d47590287..08254f84b3 100644 --- a/cpp/include/cuopt/linear_programming/io/parser.hpp +++ b/cpp/include/cuopt/linear_programming/io/parser.hpp @@ -21,7 +21,7 @@ namespace cuopt::linear_programming::io { * @brief Selects which MPS reader implementation should be used by dispatching entry points. * * The experimental fast reader is intentionally opt-in. It currently supports LP/MIP problems - * from raw .mps and .mps.lz4 files only. + * from raw .mps, .mps.lz4, .mps.gz, and .mps.bz2 files. */ enum class mps_reader_type_t { default_reader, fast_experimental }; @@ -29,7 +29,7 @@ enum class mps_reader_type_t { default_reader, fast_experimental }; * @brief Reads the equation from an MPS or QPS file. 
* * The input file can be a plain text file in MPS-/QPS-format or a compressed MPS/QPS - * file (.mps.gz or .mps.bz2). + * file (.mps.gz, .mps.bz2, or .mps.lz4). * * Read this link http://lpsolve.sourceforge.net/5.5/mps-format.htm for more * details on both free and fixed MPS format. @@ -40,8 +40,8 @@ enum class mps_reader_type_t { default_reader, fast_experimental }; * - QMATRIX: Full symmetric quadratic objective matrix (alternative to QUADOBJ) * - QCMATRIX: Symmetric quadratic terms for a named constraint row (QCQP) * - * Note: Compressed MPS files .mps.gz, .mps.bz2 can only be read if the compression - * libraries zlib or libbzip2 are installed, respectively. + * Note: Compressed MPS files .mps.gz, .mps.bz2, and .mps.lz4 can only be read if + * zlib, libbzip2, or liblz4 are installed, respectively. * * @param[in] mps_file_path Path to MPS/QPSfile. * @param[in] fixed_mps_format If MPS/QPS file should be parsed as fixed, false by default @@ -54,10 +54,10 @@ mps_data_model_t read_mps(const std::string& mps_file_path, /** * @brief Reads a raw LP/MIP MPS problem with the experimental SIMD-optimized reader. * - * This prototype reader supports raw .mps and .mps.lz4 files only. It does not support LP, QPS, - * quadratic MPS sections, fixed-format forcing, or .gz/.bz2 compressed inputs. + * This prototype reader supports raw .mps plus .mps.lz4/.mps.gz/.mps.bz2 files. It does not + * support LP, QPS, quadratic constraint sections, or fixed-format forcing. * - * @param[in] mps_file_path Path to a raw .mps or .mps.lz4 file. + * @param[in] mps_file_path Path to a raw or compressed .mps file. * @return mps_data_model_t A fully formed LP/MIP problem which represents the given file. */ template @@ -137,9 +137,9 @@ inline mps_data_model_t read(const std::string& path, * extension. Extension matching is case-insensitive. 
* * Routing: - * - .mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2 → read_mps() - * - .mps.lz4 → experimental fast MPS reader only - * - .lp, .lp.gz, .lp.bz2 → read_lp() + * - .mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4 + * → read_mps() + * - .lp, .lp.gz, .lp.bz2, .lp.lz4 → read_lp() * - anything else → std::logic_error * * This is the entry point of choice for user-facing tools (CLI, C API) that @@ -165,33 +165,36 @@ inline mps_data_model_t read(const std::string& path, std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); - const bool is_mps_lz4 = lower.ends_with(".mps.lz4"); - if (lower.ends_with(".mps") || is_mps_lz4 || lower.ends_with(".mps.gz") || - lower.ends_with(".mps.bz2") || lower.ends_with(".qps") || lower.ends_with(".qps.gz") || - lower.ends_with(".qps.bz2")) { + const bool is_mps_lz4 = lower.ends_with(".mps.lz4"); + const bool is_mps_gzip = lower.ends_with(".mps.gz"); + const bool is_mps_bzip = lower.ends_with(".mps.bz2"); + const bool is_qps_lz4 = lower.ends_with(".qps.lz4"); + const bool is_lp_lz4 = lower.ends_with(".lp.lz4"); + if (lower.ends_with(".mps") || is_mps_lz4 || is_mps_gzip || is_mps_bzip || + lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2") || + is_qps_lz4) { if (mps_reader == mps_reader_type_t::fast_experimental) { if (fixed_mps_format) { throw std::logic_error( "experimental fast MPS reader does not support fixed MPS format forcing"); } - if (!lower.ends_with(".mps") && !is_mps_lz4) { + if (!lower.ends_with(".mps") && !is_mps_lz4 && !is_mps_gzip && !is_mps_bzip) { throw std::logic_error( - "experimental fast MPS reader supports raw .mps and .mps.lz4 LP/MIP files only"); + "experimental fast MPS reader supports .mps, .mps.lz4, .mps.gz, and .mps.bz2 " + "LP/MIP files only"); } return read_mps_fast_experimental(path); } - if (is_mps_lz4) { - throw std::logic_error(".mps.lz4 inputs require the experimental fast 
MPS reader"); - } return read_mps(path, fixed_mps_format); } - if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2")) { + if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2") || + is_lp_lz4) { return read_lp(path); } throw std::logic_error( "read: unrecognized input file extension. Supported (case-insensitive): " - ".mps, .mps.lz4, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2, .lp, .lp.gz, " - ".lp.bz2. " + ".mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4, " + ".lp, .lp.gz, .lp.bz2, .lp.lz4. " "Given path: " + path); } diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp index 905dcc9e7b..0f947aa644 100644 --- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp @@ -40,6 +40,8 @@ struct power_10_lut_entry_t { int biased_e2; }; +// util class to perform 256bit precision arithmetic in constexpr to build the eisel-lemire lookup +// table struct cuopt_uint256_t { std::array limb{}; @@ -169,7 +171,7 @@ inline constexpr std::array small_integer_powers = {1ULL, 100000000000000ULL, 1000000000000000ULL}; -struct ParsedDecimal { +struct parsed_decimal_t { bool negative = false; bool fast_eligible = false; uint64_t mantissa = 0; @@ -181,6 +183,7 @@ static inline bool is_digit(char c) noexcept { return c >= '0' && c <= '9'; } // SWAR 8char run of digits -> integer representation static inline bool parse_8_digits(const char* p, uint32_t& out) { + // comply with strict aliasing rules std::array bytes{}; std::memcpy(bytes.data(), p, bytes.size()); uint64_t raw = std::bit_cast(bytes); @@ -195,10 +198,26 @@ static inline bool parse_8_digits(const char* p, uint32_t& out) return true; } +static inline void parse_u64_digits_advance(const char*& p, const char* end, uint64_t& out) +{ + while (p < end && is_digit(*p)) { + if (end - p >= 8) { + uint32_t chunk = 0; + if 
(parse_8_digits(p, chunk)) { + out = out * 100000000ULL + (uint64_t)chunk; + p += 8; + continue; + } + } + out = out * 10 + (uint64_t)(*p - '0'); + ++p; + } +} + static inline void scan_digit_run(const char*& p, const char* end, bool after_dot, - ParsedDecimal& out, + parsed_decimal_t& out, bool& saw_digit, int& frac_digits, int& sig_digits, @@ -244,7 +263,7 @@ static inline void scan_digit_run(const char*& p, } } -static inline bool parse_decimal_advance(const char*& p, const char* end, ParsedDecimal& out) +static inline bool parse_decimal_advance(const char*& p, const char* end, parsed_decimal_t& out) { if (p < end && (*p == '-' || *p == '+')) { out.negative = *p == '-'; @@ -294,6 +313,7 @@ static inline bool parse_decimal_advance(const char*& p, const char* end, Parsed static inline double fallback_strtod(std::string_view s) { char stack_buf[32]; + // The MPS specs mandate that numeric tokens are no longer than 25 characters if (s.size() >= sizeof(stack_buf)) { mps_parser_fail(error_type_t::ValidationError, "MPS numeric token exceeds supported length"); } @@ -358,7 +378,7 @@ static inline bool eisel_lemire(uint64_t man, int exp10, uint64_t& bits) return true; } -static inline double assemble_fp64(const ParsedDecimal& dec) +static inline double assemble_fp64(const parsed_decimal_t& dec) { uint64_t bits = dec.negative ? 
(uint64_t{1} << 63) : 0; if (dec.mantissa == 0) { return std::bit_cast(bits); } @@ -390,7 +410,7 @@ static inline double assemble_fp64(const ParsedDecimal& dec) static inline double parse_fp64_advance(const char*& p, const char* end) { const char* start = p; - ParsedDecimal dec; + parsed_decimal_t dec; if (!parse_decimal_advance(p, end, dec)) { return fallback_strtod(std::string_view(start, (size_t)(p - start))); } diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp index 70ed3283c3..d3317c50e1 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp @@ -15,16 +15,21 @@ #include #include -#ifndef __likely -#define __likely(x) __builtin_expect(!!(x), 1) +#ifndef LIKELY +#define LIKELY(x) __builtin_expect(!!(x), 1) #endif -#ifndef __unlikely -#define __unlikely(x) __builtin_expect(!!(x), 0) +#ifndef UNLIKELY +#define UNLIKELY(x) __builtin_expect(!!(x), 0) #endif namespace mps_fast { +enum scan_mode { + skip_whitespace, + until_whitespace, +}; + struct cursor_t { const char* start; const char* ptr; @@ -67,12 +72,12 @@ struct cursor_t { ptr += n; } - template + template static const char* scalar_scan(const char* p, const char* end) { while (p < end) { unsigned char c = (unsigned char)*p; - if constexpr (skip_ws_mode) { + if constexpr (mode == skip_whitespace) { if (c > 32 || c == '\n') return p; } else { if (c <= 32) return p; @@ -82,7 +87,7 @@ struct cursor_t { return end; } - template + template static const char* simd_scan(const char* p, const char* end) { const simde__m256i v32 = simde_mm256_set1_epi8(32); @@ -93,7 +98,7 @@ struct cursor_t { simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); unsigned int mask; - if (skip_ws_mode) { + if constexpr (mode == skip_whitespace) { simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); mask = (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, 
is_nl)); } else { @@ -103,10 +108,10 @@ struct cursor_t { if (mask != 0) { return p + __builtin_ctz(mask); } p += 32; } - return scalar_scan(p, end); + return scalar_scan(p, end); } - void skip_ws() { ptr = simd_scan(ptr, end); } + void skip_ws() { ptr = simd_scan(ptr, end); } bool eol() const { return ptr < end && (*ptr == '\n' || *ptr == '\r'); } @@ -135,13 +140,31 @@ struct cursor_t { } } + std::string_view read_rest_of_line_trimmed() + { + const char* begin = ptr; + const char* line_end = begin; + while (line_end < end && *line_end != '\n' && *line_end != '\r') { + ++line_end; + } + + while (begin < line_end && (*begin == ' ' || *begin == '\t')) { + ++begin; + } + while (line_end > begin && (line_end[-1] == ' ' || line_end[-1] == '\t')) { + --line_end; + } + ptr = line_end; + return std::string_view(begin, (std::size_t)(line_end - begin)); + } + inline __attribute__((always_inline)) std::string_view read_field() { - if (__unlikely(done())) { return {}; } + if (UNLIKELY(done())) { return {}; } const char* field_start = ptr; - if (__unlikely(end - ptr < 32)) { - ptr = scalar_scan(ptr, end); + if (UNLIKELY(end - ptr < 32)) { + ptr = scalar_scan(ptr, end); const char* field_end = ptr; if (ptr < end) { skip_ws(); } return std::string_view(field_start, field_end - field_start); @@ -150,14 +173,14 @@ struct cursor_t { const simde__m256i v32 = simde_mm256_set1_epi8(32); const simde__m256i vnl = simde_mm256_set1_epi8('\n'); - // Input buffers are padded by file_reader/lz4_file_reader/small_raw_read, - // so this unaligned 32-byte load is valid whenever end - ptr >= 32. + // All input streams provide trailing padding, so this unaligned 32-byte load is valid + // whenever end - ptr >= 32. 
simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr); simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32); - if (__unlikely(ws_mask == 0)) { - ptr = simd_scan(ptr + 32, end); + if (UNLIKELY(ws_mask == 0)) { + ptr = simd_scan(ptr + 32, end); const char* field_end = ptr; if (ptr < end) { skip_ws(); } return std::string_view(field_start, field_end - field_start); @@ -171,7 +194,7 @@ struct cursor_t { (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl)); unsigned int after_field = stop_mask & ~((1u << field_end_off) - 1); - if (__likely(after_field != 0)) { + if (LIKELY(after_field != 0)) { ptr = ptr + __builtin_ctz(after_field); } else { ptr = field_end; @@ -183,11 +206,18 @@ struct cursor_t { inline __attribute__((always_inline)) std::string_view peek_field() { - if (__unlikely(done())) { return {}; } - const char* field_end = simd_scan(ptr, end); + if (UNLIKELY(done())) { return {}; } + const char* field_end = simd_scan(ptr, end); return std::string_view(ptr, field_end - ptr); } + static inline std::string_view peek_field_at(const char* line_start, const char* section_end) + { + cursor_t cursor(line_start, (std::size_t)(section_end - line_start)); + cursor.skip_ws(); + return cursor.peek_field(); + } + inline __attribute__((always_inline)) std::pair read_two_fields() { @@ -197,7 +227,7 @@ struct cursor_t { return std::pair{f1, f2}; }; - if (__unlikely(end - ptr < 32)) { return slow(); } + if (UNLIKELY(end - ptr < 32)) { return slow(); } const char* field1_start = ptr; const simde__m256i v32 = simde_mm256_set1_epi8(32); @@ -213,21 +243,21 @@ struct cursor_t { unsigned int nl_mask = (unsigned int)simde_mm256_movemask_epi8(is_nl); unsigned int stop_mask = printable_mask | nl_mask; - if (__unlikely(ws_mask == 0)) { return slow(); } + if (UNLIKELY(ws_mask == 0)) { return slow(); } int field1_end_off = __builtin_ctz(ws_mask); unsigned int after_field1 = 
stop_mask & ~((1u << field1_end_off) - 1); - if (__unlikely(after_field1 == 0)) { return slow(); } + if (UNLIKELY(after_field1 == 0)) { return slow(); } int field2_start_off = __builtin_ctz(after_field1); - if (__unlikely(ptr[field2_start_off] == '\n')) { return slow(); } + if (UNLIKELY(ptr[field2_start_off] == '\n')) { return slow(); } unsigned int ws_after_field2_start = ws_mask & ~((1u << field2_start_off) - 1); - if (__unlikely(ws_after_field2_start == 0)) { return slow(); } + if (UNLIKELY(ws_after_field2_start == 0)) { return slow(); } int field2_end_off = __builtin_ctz(ws_after_field2_start); unsigned int after_field2 = stop_mask & ~((1u << field2_end_off) - 1); - if (__likely(after_field2 != 0)) { + if (LIKELY(after_field2 != 0)) { ptr = ptr + __builtin_ctz(after_field2); } else { ptr = ptr + field2_end_off; @@ -242,7 +272,7 @@ struct cursor_t { static inline void expect(cursor_t& cursor, const char* field) { auto id = cursor.read_field(); - if (__unlikely(id != field)) { + if (UNLIKELY(id != field)) { cursor.error("expected '%s', got '%.*s'", field, (int)id.size(), id.data()); } } @@ -260,7 +290,7 @@ static inline void accept_comment_line(cursor_t& cursor) static inline void expect_eol(cursor_t& cursor) { - if (__unlikely(!cursor.eol())) { + if (UNLIKELY(!cursor.eol())) { auto got = cursor.peek_field(); cursor.error("expected end of line, got '%.*s'", (int)got.size(), got.data()); } @@ -269,20 +299,18 @@ static inline void expect_eol(cursor_t& cursor) while (cursor.eol()) { cursor.consume_eol(); } - if (__unlikely(cursor.done())) { return; } + if (UNLIKELY(cursor.done())) { return; } - if (__unlikely(cursor.ptr[0] == '*' || cursor.ptr[0] == '$')) { + if (UNLIKELY(cursor.ptr[0] == '*' || cursor.ptr[0] == '$')) { cursor.skip_comment_line(); continue; } - if (__likely(cursor.ptr[0] == ' ') && __likely(cursor.ptr + 1 < cursor.end)) { - cursor.ptr += 1; - } + if (LIKELY(cursor.ptr[0] == ' ') && LIKELY(cursor.ptr + 1 < cursor.end)) { cursor.ptr += 1; } - if 
(__unlikely(cursor.done())) { return; } + if (UNLIKELY(cursor.done())) { return; } char c = cursor.ptr[0]; - if (__unlikely(!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))) { + if (UNLIKELY(!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))) { cursor.skip_ws(); if (cursor.eol()) { continue; } } @@ -344,7 +372,7 @@ static inline bool accept_section(cursor_t& cursor, const char* section) static inline bool accept_comment(cursor_t& cursor) { - if (__unlikely(!cursor.done() && cursor.ptr[0] == '$')) { + if (UNLIKELY(!cursor.done() && cursor.ptr[0] == '$')) { cursor.skip_to_eol(); return true; } diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index bd83ef2088..33bf916e05 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -2,12 +2,15 @@ // reserved. SPDX-License-Identifier: Apache-2.0 #include "fast_parser.hpp" +#include #include "fast_parse_primitives.hpp" #include "file_reader.hpp" #include "hash_table_smallstr.hpp" #include "mmap_region.hpp" #include "mps_section_scanner.hpp" #include "nvtx_ranges.hpp" + +#include #ifdef MPS_FAST_PERF_COUNTERS #include #endif @@ -36,31 +39,43 @@ #include #include #include +#include #include #include -#ifndef MADV_COLLAPSE -#define MADV_COLLAPSE 25 -#endif +#define MPS_FAST_COMPACT_ROW_HASH +#define MPS_FAST_THP_PREFAULT namespace mps_fast { -static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS = 4096; -static constexpr int MPS_ROWS_THREAD_CAP = 16; -static constexpr int MPS_COLUMNS_THREAD_CAP = 32; -static constexpr int MPS_BOUNDS_THREAD_CAP = 32; -static constexpr int MPS_NAMES_THREAD_CAP = 16; -static constexpr size_t MPS_BOUNDS_PARALLEL_INIT_MIN_VARS = 16 * 1024 * 1024; -static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES = 256ull * 1024ull * 1024ull; -static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8ull * 1024ull * 1024ull; -static constexpr size_t 
MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * 1024 * 1024; -static constexpr size_t MPS_SMALL_RAW_FILE_BYTES = 4ull * 1024ull * 1024ull; -static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES = 100ull * 1000ull * 1000ull; -static constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64ull * 1024ull; -static constexpr size_t MPS_ROW_HASH_PARTITIONS = 32; -static constexpr int MPS_ROW_HASH_PARTITION_BITS = 5; -static constexpr int MPS_SMALL_FILE_THREAD_CAP = 16; -static constexpr int MPS_LARGE_FILE_THREAD_CAP = 32; +static constexpr size_t KiB = 1024; +static constexpr size_t MiB = 1024 * KiB; +static constexpr size_t GiB = 1024 * MiB; + +// per-chunk row-count scratch tile for the column parsing workers +// small enough to remain warm in L1 +static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS = 4096; +static constexpr int MPS_ROWS_THREAD_CAP = 16; +static constexpr int MPS_COLUMNS_THREAD_CAP = 32; +static constexpr int MPS_BOUNDS_THREAD_CAP = 32; +static constexpr int MPS_NAMES_THREAD_CAP = 16; +// avoid openmp setup for small bounds sections +static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES = 256 * MiB; +// ordered-name fallback is cheap enough to parallelize on smaller bounds sections +static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8 * MiB; +// lower bound on columns chunk size to avoid tiny parser tasks +static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * MiB; +// parser-wide thread cap switch; very small files lose to scheduling overhead +static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES = 100ull * 1000ull * 1000ull; +// below this, the serial row-hash build is usually cheaper than partition setup +static constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * KiB; +// number of partitions for the row hash table, used to avoid races and atomics during row hash +// table initialization +static constexpr int MPS_ROW_HASH_PARTITION_BITS = 5; +static constexpr size_t MPS_ROW_HASH_PARTITIONS = (size_t{1} << 
MPS_ROW_HASH_PARTITION_BITS); +// thread caps for small and large files +static constexpr int MPS_SMALL_FILE_THREAD_CAP = 16; +static constexpr int MPS_LARGE_FILE_THREAD_CAP = 32; static int parser_thread_cap_for_size(size_t bytes) { @@ -75,6 +90,8 @@ static int phase_thread_count(int phase_cap) return std::max(1, std::min(phase_cap, available_threads)); } +// Arena allocator for the strings (row names, column names) to avoid the dreadful overheads of +// glibc's malloc and std::vector class chunk_name_arena_t { public: void reserve(size_t bytes) @@ -92,110 +109,45 @@ class chunk_name_arena_t { private: struct slab_t { - std::unique_ptr data; - size_t capacity = 0; - size_t used = 0; + std::vector data; + size_t used = 0; }; char* allocate(size_t bytes) { - if (slabs_.empty() || slabs_.back().used + bytes > slabs_.back().capacity) { + if (slabs_.empty() || slabs_.back().used + bytes > slabs_.back().data.size()) { size_t capacity = std::max(bytes, next_slab_size_); slab_t slab; - slab.data = std::make_unique(capacity); - slab.capacity = capacity; + slab.data.resize(capacity); slabs_.push_back(std::move(slab)); next_slab_size_ = std::max(next_slab_size_ * 2, capacity); } slab_t& slab = slabs_.back(); - char* ptr = slab.data.get() + slab.used; + char* ptr = slab.data.data() + slab.used; slab.used += bytes; return ptr; } std::vector slabs_; - size_t next_slab_size_ = 64 * 1024; + size_t next_slab_size_ = 64 * KiB; }; +// returns the hash table partition to use for a given hash static inline size_t row_hash_partition_for(uint32_t hash) { return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS)); } -// ============================================================================= -// RAII Timer for profiling with deferred output -// ============================================================================= - -struct TimerEntry { +struct timer_entry_t { const char* name; double elapsed_ms; size_t rss_kb; size_t hwm_kb; - size_t compressed_bytes; -}; - -static 
std::atomic_size_t& get_timer_compressed_bytes() -{ - static std::atomic_size_t compressed_bytes{0}; - return compressed_bytes; -} - -class timer_io_context_t { - public: - explicit timer_io_context_t(size_t compressed_bytes) - : old_compressed_bytes_( - get_timer_compressed_bytes().exchange(compressed_bytes, std::memory_order_acq_rel)) - { - } - - ~timer_io_context_t() - { - get_timer_compressed_bytes().store(old_compressed_bytes_, std::memory_order_release); - } - - timer_io_context_t(const timer_io_context_t&) = delete; - timer_io_context_t& operator=(const timer_io_context_t&) = delete; - - private: - size_t old_compressed_bytes_ = 0; }; -static size_t parse_status_kb_line(const char* line, const char* key) +static std::vector& get_timer_buffer() { - size_t key_len = std::strlen(key); - if (std::strncmp(line, key, key_len) != 0) { return 0; } - const char* p = line + key_len; - while (*p == ' ' || *p == '\t') { - ++p; - } - size_t value = 0; - while (*p >= '0' && *p <= '9') { - value = value * 10 + (size_t)(*p - '0'); - ++p; - } - return value; -} - -static std::pair current_process_rss_kb() -{ - FILE* file = std::fopen("/proc/self/status", "r"); - if (file == nullptr) { return {0, 0}; } - - size_t rss_kb = 0; - size_t hwm_kb = 0; - char line[256]; - while (std::fgets(line, sizeof(line), file) != nullptr) { - if (rss_kb == 0) { rss_kb = parse_status_kb_line(line, "VmRSS:"); } - if (hwm_kb == 0) { hwm_kb = parse_status_kb_line(line, "VmHWM:"); } - if (rss_kb != 0 && hwm_kb != 0) { break; } - } - std::fclose(file); - return {rss_kb, hwm_kb}; -} - -static std::vector& get_timer_buffer() -{ - static std::vector buffer; + static std::vector buffer; buffer.reserve(100); return buffer; } @@ -213,26 +165,16 @@ static void flush_timers() auto& buffer = get_timer_buffer(); for (const auto& entry : buffer) { std::fprintf(stderr, - "[TIMER] %s: %.3f ms rss_GB=%.3f hwm_GB=%.3f compressed_GB=%.3f\n", + "[TIMER] %s: %.3f ms rss_GB=%.3f hwm_GB=%.3f\n", entry.name, 
entry.elapsed_ms, - (double)entry.rss_kb / (1024.0 * 1024.0), - (double)entry.hwm_kb / (1024.0 * 1024.0), - (double)entry.compressed_bytes / (1024.0 * 1024.0 * 1024.0)); + (double)entry.rss_kb / (double)(GiB / KiB), + (double)entry.hwm_kb / (double)(GiB / KiB)); } buffer.clear(); #endif } -static size_t system_page_size() -{ - static size_t page_size = [] { - long value = sysconf(_SC_PAGESIZE); - return value > 0 ? (size_t)value : (size_t)4096; - }(); - return page_size; -} - enum class materialize_touch_t { write_2mb, write_4kb, @@ -248,7 +190,7 @@ static void materialize_hugepages(const char* label, (void)label; if (data == nullptr || bytes == 0) return; - constexpr size_t two_mb = 2 * 1024 * 1024; + constexpr size_t two_mb = 2 * MiB; size_t page_size = system_page_size(); uintptr_t start = reinterpret_cast(data); uintptr_t end = start + bytes; @@ -257,10 +199,10 @@ static void materialize_hugepages(const char* label, size_t aligned_bytes = (size_t)(aligned_end - aligned_start); errno = 0; - madvise(reinterpret_cast(aligned_start), aligned_bytes, MADV_HUGEPAGE); + madvise((void*)(aligned_start), aligned_bytes, MADV_HUGEPAGE); size_t step = touch == materialize_touch_t::write_2mb ? 
two_mb : page_size; - volatile char* ptr = reinterpret_cast(data); + volatile char* ptr = (volatile char*)(data); for (size_t offset = 0; offset < bytes; offset += step) { ptr[offset] = ptr[offset]; } @@ -297,10 +239,9 @@ class scoped_timer_t { double elapsed_ms = std::chrono::duration(end - start_).count(); nvtx_.end(); if (accumulator_) { *accumulator_ += elapsed_ms; } - auto [rss_kb, hwm_kb] = current_process_rss_kb(); - size_t compressed_bytes = get_timer_compressed_bytes().load(std::memory_order_acquire); + auto [rss_kb, hwm_kb] = current_process_rss_kb(); std::lock_guard lock(get_timer_mutex()); - get_timer_buffer().push_back({name_, elapsed_ms, rss_kb, hwm_kb, compressed_bytes}); + get_timer_buffer().push_back({name_, elapsed_ms, rss_kb, hwm_kb}); #endif } @@ -313,11 +254,27 @@ class scoped_timer_t { #endif double* accumulator_; #ifdef MPS_FAST_TIMERS - nvtx::scoped_range nvtx_; + nvtx::scoped_range_t nvtx_; std::chrono::high_resolution_clock::time_point start_; #endif }; +class omp_max_active_levels_guard_t { + public: + explicit omp_max_active_levels_guard_t(int value) : old_value_(omp_get_max_active_levels()) + { + omp_set_max_active_levels(value); + } + + ~omp_max_active_levels_guard_t() { omp_set_max_active_levels(old_value_); } + + omp_max_active_levels_guard_t(const omp_max_active_levels_guard_t&) = delete; + omp_max_active_levels_guard_t& operator=(const omp_max_active_levels_guard_t&) = delete; + + private: + int old_value_ = 0; +}; + static inline void error_unknown_row(cursor_t& cursor, const char* row_start, const char* section) { const char* row_end = row_start; @@ -327,29 +284,17 @@ static inline void error_unknown_row(cursor_t& cursor, const char* row_start, co cursor.error("unknown row name in %s: %.*s", section, (int)(row_end - row_start), row_start); } -// ============================================================================= -// Parsing state shared across section parsers -// 
============================================================================= - -static inline size_t next_power_of_2(size_t n) -{ - if (n == 0) return 1; - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - n |= n >> 32; - return n + 1; -} - -enum class row_index_mode_t { +// Two modes for row/column name lookup: +// - hash: arbitrary names via hash table (rows) or var_names_map (columns) +// - dense_ordered: sequential numeric suffixes like R0001/R0002 or V0/V1 +enum class index_mode_t { hash, dense_ordered, }; -static inline bool is_decimal_digit(char c) { return (unsigned)(c - '0') <= 9; } +// Every 19-digit decimal string fits in uint64_t; 20+ digits may not and are wildly unlikely in the +// context of dense MPS rows/cols +static constexpr size_t dense_suffix_max_digits = 19; static inline size_t decimal_digits_u64(uint64_t value) { @@ -367,39 +312,144 @@ static inline bool parse_trailing_u64(std::string_view name, size_t& suffix_width) { size_t pos = name.size(); - while (pos > 0 && is_decimal_digit(name[pos - 1])) { + while (pos > 0 && fp64::is_digit(name[pos - 1])) { pos--; } if (pos == name.size()) { return false; } + suffix_width = name.size() - pos; + if (suffix_width > dense_suffix_max_digits) { return false; } + uint64_t parsed = 0; for (size_t i = pos; i < name.size(); ++i) { - uint64_t digit = (uint64_t)(name[i] - '0'); - if (parsed > (std::numeric_limits::max() - digit) / 10) { return false; } - parsed = parsed * 10 + digit; + parsed = parsed * 10 + (uint64_t)(name[i] - '0'); } - prefix = std::string_view(name.data(), pos); - value = parsed; - suffix_width = name.size() - pos; + prefix = std::string_view(name.data(), pos); + value = parsed; return true; } +// necessary to handle cases like R0001, ..., R2000, ... 
static inline bool dense_suffix_is_zero_padded(std::string_view name, size_t suffix_width) { return suffix_width > 1 && name[name.size() - suffix_width] == '0'; } -static inline bool dense_suffix_width_ok(uint64_t value, - size_t suffix_width, - bool zero_padded, - size_t pad_width) +static inline size_t dense_initial_pad_width(std::string_view name, size_t suffix_width) +{ + return dense_suffix_is_zero_padded(name, suffix_width) ? suffix_width : 0; +} + +static inline bool dense_suffix_width_ok(uint64_t value, size_t suffix_width, size_t pad_width) { size_t digits = decimal_digits_u64(value); - size_t expected_width = zero_padded ? std::max(pad_width, digits) : digits; + size_t expected_width = std::max(pad_width, digits); return suffix_width == expected_width; } +struct dense_name_index_t { + std::string prefix; + uint64_t min_id = 0; + uint64_t max_id = 0; + size_t pad_width = 0; + + void reset() + { + prefix.clear(); + min_id = 0; + max_id = 0; + pad_width = 0; + } + + bool suffix_width_ok(uint64_t value, size_t suffix_width) const + { + return dense_suffix_width_ok(value, suffix_width, pad_width); + } + + size_t lookup(std::string_view name) const + { + std::string_view parsed_prefix; + uint64_t value = 0; + size_t suffix_width = 0; + if (!parse_trailing_u64(name, parsed_prefix, value, suffix_width)) { return SIZE_MAX; } + if (parsed_prefix != prefix || !suffix_width_ok(value, suffix_width)) { return SIZE_MAX; } + if (value < min_id || value > max_id) { return SIZE_MAX; } + return (size_t)(value - min_id); + } + + void format_name(size_t idx, std::string& out) const + { + uint64_t value = min_id + idx; + char digits_buf[32]; + auto [digits_end, ec] = std::to_chars(digits_buf, digits_buf + sizeof(digits_buf), value); + if (ec != std::errc()) { + out.assign(prefix); + return; + } + size_t digits_len = (size_t)(digits_end - digits_buf); + size_t width = std::max(pad_width, digits_len); + out.resize(prefix.size() + width); + std::memcpy(out.data(), prefix.data(), 
prefix.size()); + char* suffix = out.data() + prefix.size(); + if (width > digits_len) { + std::memset(suffix, '0', width - digits_len); + suffix += width - digits_len; + } + std::memcpy(suffix, digits_buf, digits_len); + } +}; + +struct dense_observe_state_t { + bool candidate = true; + dense_name_index_t index; + size_t count = 0; +}; + +static inline void observe_dense_name(bool& candidate, + dense_name_index_t& index, + size_t& observed_count, + std::string_view name, + uint64_t expected_id = std::numeric_limits::max()) +{ + if (!candidate) { return; } + + std::string_view prefix; + uint64_t value = 0; + size_t suffix_width = 0; + if (!parse_trailing_u64(name, prefix, value, suffix_width)) { + candidate = false; + return; + } + + if (observed_count == 0) { + index.prefix.assign(prefix); + index.min_id = value; + index.max_id = value; + index.pad_width = dense_initial_pad_width(name, suffix_width); + observed_count = 1; + return; + } + + if (prefix != index.prefix) { + candidate = false; + return; + } + + if (expected_id != std::numeric_limits::max() && value != expected_id) { + candidate = false; + return; + } + + if (!index.suffix_width_ok(value, suffix_width)) { + candidate = false; + return; + } + + index.max_id = value; + observed_count++; +} + template struct parse_state_t { struct row_hash_partition_t { @@ -411,42 +461,36 @@ struct parse_state_t { cuopt::linear_programming::io::mps_data_model_t& problem; cursor_t& cursor; - // Temporary string_view storage (points into input buffer, no allocation) + // backed by the input buffer std::vector row_names_sv; + // backed by the arena allocator std::vector var_names_sv; std::vector var_name_arenas; std::string_view problem_name_sv; std::string_view objective_name_sv; - std::vector ignored_objective_names_sv; + // secondary 'N' rows in ROWS — rare; membership distinguishes them from unknown row names + std::unordered_set ignored_objective_names; - // Optional dense ordered column index for labels like V0, V1, 
... - bool col_dense_ordered = false; - std::string col_dense_prefix_storage; - std::string_view col_dense_prefix; - uint64_t col_dense_min_id = 0; - uint64_t col_dense_max_id = 0; - size_t col_dense_pad_width = 0; - bool col_dense_zero_padded = false; + // Column name lookup for labels like V0, V1, ... + index_mode_t col_index_mode = index_mode_t::hash; + dense_name_index_t col_dense; // Row name hash table - sized at runtime based on row count size_t row_hash_buckets = 0; size_t row_hash_mask = 0; // buckets - 1, for fast modulo via & mmap_region_t row_hash_region; - hash_slot_var_t* row_names_ht = nullptr; + hash_slot_var_t* row_names_ht = nullptr; + // compute hash, select the subtable from high hash bits, + // then run the same open-addressing probe loop inside that subtable. size_t row_hash_partition_count = 0; std::array row_hash_partitions = {}; - // Overflow map for row names longer than HASH_KEY_BYTES + // Overflow map for row names longer than HASH_KEY_BYTES (usually very rare) std::unordered_map row_names_long; - // Optional dense ordered row index for labels like R0001, R0002, ... - row_index_mode_t row_index_mode = row_index_mode_t::hash; - bool row_dense_candidate = true; - std::string_view row_dense_prefix; - uint64_t row_dense_min_id = 0; - uint64_t row_dense_max_id = 0; - uint64_t row_dense_base_id = 0; - size_t row_dense_pad_width = 0; - bool row_dense_zero_padded = false; + // Row name lookup for labels like R0001, R0002, ... + index_mode_t row_index_mode = index_mode_t::hash; + bool row_dense_candidate = true; + dense_name_index_t row_dense; // var_names still uses STL (only used in parse_bounds, not as hot) std::unordered_map var_names_map; @@ -457,7 +501,7 @@ struct parse_state_t { char type = 'C'; }; - // Some writers introduce zero-column variables only in BOUNDS. + // some writers introduce zero-column variables only in BOUNDS. 
std::map bounds_only_vars; parse_state_t(cuopt::linear_programming::io::mps_data_model_t& p, cursor_t& c) @@ -471,77 +515,13 @@ struct parse_state_t { init_row_hash_table_impl(); } - bool row_dense_has_expected_width(uint64_t value, size_t suffix_width) const - { - return dense_suffix_width_ok(value, suffix_width, row_dense_zero_padded, row_dense_pad_width); - } - - bool col_dense_has_expected_width(uint64_t value, size_t suffix_width) const - { - return dense_suffix_width_ok(value, suffix_width, col_dense_zero_padded, col_dense_pad_width); - } - - bool is_ignored_objective_name(std::string_view name) const - { - return std::find(ignored_objective_names_sv.begin(), ignored_objective_names_sv.end(), name) != - ignored_objective_names_sv.end(); - } - - void add_ignored_objective_name(std::string_view name) - { - if (name == objective_name_sv || is_ignored_objective_name(name)) { return; } - ignored_objective_names_sv.push_back(name); - } - void observe_objective_row_name(std::string_view name) { if (objective_name_sv.empty()) { objective_name_sv = name; - } else { - add_ignored_objective_name(name); - } - } - - void observe_row_name_for_dense_index(std::string_view name, size_t row_index) - { - if (!row_dense_candidate) { return; } - - std::string_view prefix; - uint64_t value = 0; - size_t suffix_width = 0; - if (!parse_trailing_u64(name, prefix, value, suffix_width)) { - row_dense_candidate = false; - return; - } - - if (row_index == 0) { - row_dense_prefix = prefix; - row_dense_min_id = value; - row_dense_max_id = value; - row_dense_base_id = value; - row_dense_pad_width = suffix_width; - row_dense_zero_padded = dense_suffix_is_zero_padded(name, suffix_width); - return; - } - - if (prefix != row_dense_prefix) { - row_dense_candidate = false; - return; + } else if (name != objective_name_sv) { + ignored_objective_names.insert(name); } - - if (row_dense_base_id > std::numeric_limits::max() - row_index) { - row_dense_candidate = false; - return; - } - - uint64_t 
expected = row_dense_base_id + row_index; - if (value != expected || !row_dense_has_expected_width(value, suffix_width)) { - row_dense_candidate = false; - return; - } - - row_dense_min_id = std::min(row_dense_min_id, value); - row_dense_max_id = std::max(row_dense_max_id, value); } bool init_row_dense_ordered_table() @@ -549,23 +529,22 @@ struct parse_state_t { scoped_timer_t timer("row_dense_finalize"); size_t n_rows = row_names_sv.size(); if (!row_dense_candidate || n_rows == 0) { return false; } - if (row_dense_max_id < row_dense_min_id) { return false; } - uint64_t dense_count = row_dense_max_id - row_dense_min_id + 1; + if (row_dense.max_id < row_dense.min_id) { return false; } + uint64_t dense_count = row_dense.max_id - row_dense.min_id + 1; if (dense_count != n_rows) { return false; } - row_index_mode = row_index_mode_t::dense_ordered; + row_index_mode = index_mode_t::dense_ordered; return true; } size_t row_hash_bucket_count_for(size_t n_rows) const { #ifdef MPS_FAST_COMPACT_ROW_HASH - // Keep the row hash compact. Probe counts are usually low, and a smaller + // probe counts are usually low, and a smaller // table reduces cache/TLB footprint on medium instances. - return next_power_of_2(std::max(n_rows + n_rows / 2, (size_t)64)); + return cuda::next_power_of_two(std::max(n_rows + n_rows / 2, (size_t)64)); #else - // Original conservative sizing policy. - return next_power_of_2(std::max((size_t)(n_rows * 2), (size_t)64)); + return cuda::next_power_of_two(std::max((size_t)(n_rows * 2), (size_t)64)); #endif } @@ -582,11 +561,13 @@ struct parse_state_t { if (use_partitioned) { scoped_timer_t timer("row_hash_partition_metadata"); + // Pre-hash once, count rows per partition, then pack row indices by partition. + // This turns the build into disjoint single-writer table fills. 
row_hashes.resize(n_rows); size_t inline_rows = 0; for (size_t idx = 0; idx < n_rows; ++idx) { std::string_view name = row_names_sv[idx]; - if (__unlikely(name.size() > HASH_KEY_BYTES)) { + if (UNLIKELY(name.size() > HASH_KEY_BYTES)) { row_names_long[name] = idx; continue; } @@ -603,7 +584,7 @@ struct parse_state_t { row_order.resize(inline_rows); auto next_offsets = partition_offsets; for (size_t idx = 0; idx < n_rows; ++idx) { - if (__unlikely(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; } + if (UNLIKELY(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; } size_t part = row_hash_partition_for(row_hashes[idx]); row_order[next_offsets[part]++] = idx; } @@ -639,7 +620,7 @@ struct parse_state_t { next_slots += row_hash_partitions[p].buckets; } } - // Request huge pages to reduce TLB misses + // request huge pages to reduce TLB misses row_hash_region.advise(MADV_HUGEPAGE); } @@ -666,6 +647,7 @@ struct parse_state_t { std::vector partition_total_probes(MPS_ROW_HASH_PARTITIONS, 0); std::vector partition_max_probes(MPS_ROW_HASH_PARTITIONS, 0); #endif +// initialize the row hash tables in parallel #pragma omp parallel for schedule(static) num_threads(num_threads) for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) { size_t p = (size_t)part_id; @@ -675,6 +657,7 @@ struct parse_state_t { size_t local_max_probes = 0; #endif const auto& part = row_hash_partitions[p]; + // Each worker owns its subtable, so row_insert_into remains the plain serial probe loop. 
for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) { size_t idx = row_order[pos]; #ifdef MPS_FAST_PERF_COUNTERS @@ -745,24 +728,9 @@ struct parse_state_t { #endif } - size_t row_lookup_dense_ordered(std::string_view name) const - { - std::string_view prefix; - uint64_t value = 0; - size_t suffix_width = 0; - if (!parse_trailing_u64(name, prefix, value, suffix_width)) { return SIZE_MAX; } - if (prefix != row_dense_prefix || !row_dense_has_expected_width(value, suffix_width)) { - return SIZE_MAX; - } - if (value < row_dense_min_id || value > row_dense_max_id) { return SIZE_MAX; } - return (size_t)(value - row_dense_min_id); - } - size_t row_lookup(std::string_view name) const { - if (__likely(row_index_mode == row_index_mode_t::dense_ordered)) { - return row_lookup_dense_ordered(name); - } + if (LIKELY(row_index_mode == index_mode_t::dense_ordered)) { return row_dense.lookup(name); } return row_lookup_hash(name); } @@ -771,10 +739,10 @@ struct parse_state_t { const char* start = cursor.ptr; const char* p = start; - size_t prefix_len = row_dense_prefix.size(); + size_t prefix_len = row_dense.prefix.size(); if (prefix_len > 0) { if ((size_t)(cursor.end - p) < prefix_len || - std::memcmp(p, row_dense_prefix.data(), prefix_len) != 0) { + std::memcmp(p, row_dense.prefix.data(), prefix_len) != 0) { cursor.read_field(); return SIZE_MAX; } @@ -783,21 +751,12 @@ struct parse_state_t { const char* digits_start = p; uint64_t value = 0; - while (p < cursor.end && is_decimal_digit(*p)) { - uint64_t digit = (uint64_t)(*p - '0'); - if (value > (std::numeric_limits::max() - digit) / 10) { - cursor.ptr = start; - cursor.read_field(); - return SIZE_MAX; - } - value = value * 10 + digit; - p++; - } + fp64::parse_u64_digits_advance(p, cursor.end, value); size_t suffix_width = (size_t)(p - digits_start); - if (suffix_width == 0 || p >= cursor.end || *p > ' ' || - !row_dense_has_expected_width(value, suffix_width) || value < row_dense_min_id || - value > 
row_dense_max_id) { + if (suffix_width == 0 || suffix_width > dense_suffix_max_digits || p >= cursor.end || + *p > ' ' || !row_dense.suffix_width_ok(value, suffix_width) || value < row_dense.min_id || + value > row_dense.max_id) { cursor.ptr = start; cursor.read_field(); return SIZE_MAX; @@ -805,12 +764,12 @@ struct parse_state_t { cursor.ptr = p; cursor.skip_ws(); - return (size_t)(value - row_dense_min_id); + return (size_t)(value - row_dense.min_id); } size_t read_row_lookup(cursor_t& cursor) const { - if (__likely(row_index_mode == row_index_mode_t::dense_ordered)) { + if (LIKELY(row_index_mode == index_mode_t::dense_ordered)) { return read_row_lookup_dense_ordered(cursor); } @@ -820,13 +779,14 @@ struct parse_state_t { size_t row_lookup_hash(std::string_view name) const { - if (__unlikely(name.size() > HASH_KEY_BYTES)) { + if (UNLIKELY(name.size() > HASH_KEY_BYTES)) { auto it = row_names_long.find(name); return it != row_names_long.end() ? it->second : SIZE_MAX; } hash_key_t key = make_key(name.data(), name.size()); uint32_t hash = fnv1a_hash(name.data(), name.size()); - if (__likely(row_hash_partition_count != 0)) { + if (LIKELY(row_hash_partition_count != 0)) { + // Lookups mirror the build routing and probe only the selected subtable. 
const auto& part = row_hash_partitions[row_hash_partition_for(hash)]; return row_lookup_in(part.slots, part.buckets, part.mask, key, hash); } @@ -845,43 +805,9 @@ struct parse_state_t { return SIZE_MAX; } - size_t col_lookup_dense_ordered(std::string_view name) const - { - std::string_view prefix; - uint64_t value = 0; - size_t suffix_width = 0; - if (!parse_trailing_u64(name, prefix, value, suffix_width)) { return SIZE_MAX; } - if (prefix != col_dense_prefix || !col_dense_has_expected_width(value, suffix_width)) { - return SIZE_MAX; - } - if (value < col_dense_min_id || value > col_dense_max_id) { return SIZE_MAX; } - return (size_t)(value - col_dense_min_id); - } - - void dense_col_name(size_t idx, std::string& out) const - { - uint64_t value = col_dense_min_id + idx; - char digits_buf[32]; - auto [digits_end, ec] = std::to_chars(digits_buf, digits_buf + sizeof(digits_buf), value); - if (ec != std::errc()) { - out.assign(col_dense_prefix); - return; - } - size_t digits_len = (size_t)(digits_end - digits_buf); - size_t width = col_dense_zero_padded ? 
std::max(col_dense_pad_width, digits_len) : digits_len; - out.resize(col_dense_prefix.size() + width); - std::memcpy(out.data(), col_dense_prefix.data(), col_dense_prefix.size()); - char* suffix = out.data() + col_dense_prefix.size(); - if (width > digits_len) { - std::memset(suffix, '0', width - digits_len); - suffix += width - digits_len; - } - std::memcpy(suffix, digits_buf, digits_len); - } - size_t row_insert(std::string_view name, size_t index) { - if (__unlikely(name.size() > HASH_KEY_BYTES)) { + if (UNLIKELY(name.size() > HASH_KEY_BYTES)) { row_names_long[name] = index; return 0; } @@ -906,7 +832,8 @@ struct parse_state_t { if (slot >= &slots[buckets]) { slot = &slots[0]; } if (slot->count == 0) { key_store(slot->key, key); // Writes 32 bytes, including garbage in last 4 - slot->count = (uint32_t)(index + 1); // Overwrite last 4 bytes with actual count + slot->count = (uint32_t)(index + 1); // Overwrite last 4 bytes with actual count. i trust + // the compiler to optimize this return i + 1; } if (key_cmpeq(slot->key, key)) { @@ -914,7 +841,8 @@ struct parse_state_t { return i + 1; } } - __builtin_trap(); + // can't happen, the table is properly sized to fit all rows + __builtin_unreachable(); } }; @@ -922,31 +850,13 @@ struct parse_state_t { // Section parsers // ============================================================================= -static std::string_view read_rest_of_line_trimmed(cursor_t& cursor) -{ - const char* begin = cursor.ptr; - const char* end = begin; - while (end < cursor.end && *end != '\n' && *end != '\r') { - ++end; - } - - while (begin < end && (*begin == ' ' || *begin == '\t')) { - ++begin; - } - while (end > begin && (end[-1] == ' ' || end[-1] == '\t')) { - --end; - } - cursor.ptr = end; - return std::string_view(begin, (size_t)(end - begin)); -} - template static void parse_name_section(parse_state_t& state) { scoped_timer_t timer("parse_name"); if (peek(state.cursor) == "ROWS") { return; } expect(state.cursor, "NAME"); - if 
(!state.cursor.eol()) { state.problem_name_sv = read_rest_of_line_trimmed(state.cursor); } + if (!state.cursor.eol()) { state.problem_name_sv = state.cursor.read_rest_of_line_trimmed(); } expect_eol(state.cursor); } @@ -974,19 +884,18 @@ static void parse_objname_section(parse_state_t& state) { scoped_timer_t timer("parse_objname"); if (accept(state.cursor, "OBJNAME")) { - if (state.cursor.eol()) { expect_eol(state.cursor); } - state.objective_name_sv = state.cursor.read_field(); + if (!state.cursor.eol()) { state.objective_name_sv = state.cursor.read_rest_of_line_trimmed(); } accept_comment(state.cursor); expect_eol(state.cursor); } } -struct RowChunkBoundary { +struct row_chunk_boundary_t { const char* start; const char* end; }; -struct RowChunkInfo { +struct row_chunk_info_t { size_t constraints = 0; bool malformed = false; std::vector objective_names; @@ -1007,7 +916,7 @@ static bool parse_rows_line_fast(const char*& p, char& row_type, std::string_view& row_name) { - p = cursor_t::simd_scan(p, end); + p = cursor_t::simd_scan(p, end); if (p >= end) { return false; } if (*p == '\n') { p++; @@ -1019,26 +928,29 @@ static bool parse_rows_line_fast(const char*& p, } row_type = *p++; - p = cursor_t::simd_scan(p, end); + p = cursor_t::simd_scan(p, end); const char* name_start = p; - p = cursor_t::simd_scan(p, end); + p = cursor_t::simd_scan(p, end); if (name_start == p) { return false; } row_name = std::string_view(name_start, (size_t)(p - name_start)); // ROWS only uses fields 1-2. Fields 3-6 are ignored by the MPS spec, and // field 3 may start with '$' to comment the rest of the record. 
+ // could be SIMD'd, but in practice the newline is right after the row name p = rows_find_next_line(p, end); return true; } -static std::vector compute_row_chunk_boundaries(const char* rows_start, - const char* rows_end, - int num_threads) +// row chunks are established based on byte count, thus boundaries can land in the middle of a row +// this cleans up chunks to have row line boundaries +static std::vector compute_row_chunk_boundaries(const char* rows_start, + const char* rows_end, + int num_threads) { scoped_timer_t timer("rows_compute_chunk_boundaries"); - std::vector boundaries((size_t)num_threads); + std::vector boundaries((size_t)num_threads); size_t total_size = (size_t)(rows_end - rows_start); size_t chunk_size = total_size / (size_t)num_threads; @@ -1057,6 +969,7 @@ static std::vector compute_row_chunk_boundaries(const char* ro return boundaries; } +// reads the row section in chunks and inserts into the worker's hash table partition template static bool parse_rows_section_parallel_impl(parse_state_t& state, const char* rows_start, @@ -1066,7 +979,7 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, scoped_timer_t timer("parse_rows_parallel"); auto boundaries = compute_row_chunk_boundaries(rows_start, rows_end, num_threads); - std::vector infos((size_t)num_threads); + std::vector infos((size_t)num_threads); { scoped_timer_t timer("rows_count_parallel"); @@ -1075,7 +988,7 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, MPS_NVTX_RANGE(std::string("rows_count_chunk ") + std::to_string(t), nvtx::colors::rows); const char* p = boundaries[(size_t)t].start; const char* end = boundaries[(size_t)t].end; - RowChunkInfo info; + row_chunk_info_t info; while (p < end) { char row_type = 0; @@ -1104,10 +1017,12 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, } } - for (const auto& info : infos) { - if (info.malformed) { return false; } + if (std::any_of( + infos.begin(), infos.end(), [](const 
row_chunk_info_t& info) { return info.malformed; })) { + return false; } + // prefix sum to do a parallel scatter of every row entry into the global output arrays std::vector offsets((size_t)num_threads + 1, 0); { scoped_timer_t timer("rows_prefix_sum"); @@ -1133,7 +1048,7 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, } for (const auto& info : infos) { for (std::string_view name : info.objective_names) { - state.add_ignored_objective_name(name); + if (name != state.objective_name_sv) { state.ignored_objective_names.insert(name); } } } @@ -1141,7 +1056,6 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, std::string_view dense_prefix; uint64_t dense_base_id = 0; size_t dense_pad_width = 0; - bool dense_zero_padded = false; if (dense_candidate) { std::string_view first_name; @@ -1157,9 +1071,8 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, if (!parse_trailing_u64(first_name, dense_prefix, first_value, first_suffix_width)) { dense_candidate = false; } else { - dense_base_id = first_value; - dense_pad_width = first_suffix_width; - dense_zero_padded = dense_suffix_is_zero_padded(first_name, first_suffix_width); + dense_base_id = first_value; + dense_pad_width = dense_initial_pad_width(first_name, first_suffix_width); } } @@ -1175,6 +1088,13 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, size_t out = offsets[(size_t)t]; bool local_dense_ok = dense_candidate; + dense_name_index_t dense_index; + if (local_dense_ok) { + dense_index.prefix.assign(dense_prefix); + dense_index.min_id = dense_base_id; + dense_index.max_id = dense_base_id; + dense_index.pad_width = dense_pad_width; + } while (p < end) { char row_type = 0; @@ -1194,14 +1114,9 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, state.problem.row_types_[out] = row_type; if (local_dense_ok) { - std::string_view prefix; - uint64_t value = 0; - size_t suffix_width = 0; - uint64_t expected = dense_base_id + out; 
- local_dense_ok = - parse_trailing_u64(row_name, prefix, value, suffix_width) && prefix == dense_prefix && - value == expected && - dense_suffix_width_ok(value, suffix_width, dense_zero_padded, dense_pad_width); + size_t observed_count = out; + observe_dense_name( + local_dense_ok, dense_index, observed_count, row_name, dense_base_id + out); } out++; } @@ -1217,12 +1132,10 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, } state.row_dense_candidate = dense_candidate; if (dense_candidate) { - state.row_dense_prefix = dense_prefix; - state.row_dense_min_id = dense_base_id; - state.row_dense_max_id = dense_base_id + total_rows - 1; - state.row_dense_base_id = dense_base_id; - state.row_dense_pad_width = dense_pad_width; - state.row_dense_zero_padded = dense_zero_padded; + state.row_dense.prefix.assign(dense_prefix); + state.row_dense.min_id = dense_base_id; + state.row_dense.max_id = dense_base_id + total_rows - 1; + state.row_dense.pad_width = dense_pad_width; } } @@ -1238,9 +1151,6 @@ static void parse_rows_section_serial_impl(parse_state_t& state, const auto row_type = state.cursor.ptr[0]; state.cursor.advance(1); state.cursor.skip_ws(); - // if (row_type != "E" && row_type != "L" && row_type != "G" && row_type != "N") { - // state.cursor.error("expected E, L, G, or N, got '%s'", row_type.data()); - // } auto row_name = state.cursor.read_field(); // ROWS fields after the row name are unused; tolerate annotations/comments there. @@ -1252,7 +1162,12 @@ static void parse_rows_section_serial_impl(parse_state_t& state, const } else { size_t row_idx = state.row_names_sv.size(); state.row_names_sv.push_back(row_name); - state.observe_row_name_for_dense_index(row_name, row_idx); + observe_dense_name( + state.row_dense_candidate, + state.row_dense, + row_idx, + row_name, + row_idx == 0 ? 
std::numeric_limits::max() : state.row_dense.min_id + row_idx); state.problem.row_types_.push_back(row_type); } expect_eol(state.cursor); @@ -1272,20 +1187,17 @@ static void parse_rows_section(parse_state_t& state, const char* rows_ size_t rows_bytes = (size_t)(rows_end - state.cursor.ptr); int num_threads = phase_thread_count(MPS_ROWS_THREAD_CAP); bool parsed_parallel = false; - if (rows_bytes >= 512ull * 1024ull * 1024ull && num_threads > 1) { + if (rows_bytes >= 512 * MiB && num_threads > 1) { parsed_parallel = parse_rows_section_parallel_impl(state, state.cursor.ptr, rows_end, num_threads); + // serial fallback in case a likely malformed chunk has been encountered + // makes error reporting much easier if (!parsed_parallel) { state.row_names_sv.clear(); state.problem.row_types_.clear(); - state.row_dense_candidate = true; - state.row_dense_prefix = {}; - state.row_dense_min_id = 0; - state.row_dense_max_id = 0; - state.row_dense_base_id = 0; - state.row_dense_pad_width = 0; - state.row_dense_zero_padded = false; - state.cursor.ptr = rows_start; + state.row_dense_candidate = true; + state.row_dense.reset(); + state.cursor.ptr = rows_start; parse_rows_section_serial_impl(state, rows_end); } } else { @@ -1303,71 +1215,61 @@ static void parse_rows_section(parse_state_t& state, const char* rows_ } } -// ============================================================================= -// Parallel COLUMNS parser -// ============================================================================= +// Columns parser -struct MarkerInfo { +// integer variable markers +struct marker_info_t { enum Type { INTORG, INTEND }; Type type; size_t after_local_var_idx; // SIZE_MAX means "before first variable" }; -struct RowCountBlock { +struct row_count_block_t { size_t block_id = 0; size_t storage_offset = 0; }; -struct DenseColChunkStats { - bool candidate = true; - std::string_view prefix; - uint64_t first_id = 0; - uint64_t last_id = 0; - size_t pad_width = 0; - bool zero_padded = 
false; - size_t count = 0; -}; - -struct ChunkResult { +// Each column parsing worker owns chunks of the global CSC which are parsed in parallel and then +// later scattered into the final CSR +struct chunk_result_t { std::vector values; std::vector row_indices; std::vector col_offsets; std::vector var_names; chunk_name_arena_t var_name_arena; - std::vector markers; + std::vector markers; std::vector> objective_entries; // local_col_idx -> coefficient - // Sparse per-row scratch: each touched 4096-row block stores counts after parsing, - // then the same slots become CSR write cursors. This avoids scanning/allocating - // chunks*n_rows entries when a chunk only touches clustered row ranges. The - // block payloads live in one arena per chunk so scatter has hugepage-friendly - // write-position metadata instead of many independent 32 KiB allocations. + // COLUMNS is parsed as chunk-local CSC. To build the global CSR, each chunk needs row counts + // first, then row-local write cursors for scatter. 
Store those counts only for touched + // 4096-row blocks instead of allocating a dense chunks*n_rows matrix + // The same slots are rewritten as write cursors after the global CSR row offsets are known std::vector row_count_storage; - std::vector row_count_blocks; + std::vector row_count_blocks; std::vector row_count_block_dir; - std::string_view first_var_name; - std::string_view last_var_name; - DenseColChunkStats dense_col_stats; + dense_observe_state_t dense_col_stats; }; -struct ChunkBoundary { +struct chunk_boundary_t { const char* start; const char* end; }; -struct BoundsChunkBoundary { +struct bounds_chunk_boundary_t { const char* start; const char* end; }; -static inline int64_t& column_row_count_slot(ChunkResult& result, size_t row_idx) +// enables representing row counts per chunk as a sparse representation w/ 4096 granularity +// works well since nnzs are often clustered around the same matrix blocks +static inline int64_t& column_row_count_slot(chunk_result_t& result, size_t row_idx) { size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; size_t local = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; int32_t block_pos = result.row_count_block_dir[block_id]; - if (__unlikely(block_pos < 0)) { + if (UNLIKELY(block_pos < 0)) { block_pos = (int32_t)result.row_count_blocks.size(); result.row_count_block_dir[block_id] = block_pos; - RowCountBlock block; + row_count_block_t block; block.block_id = block_id; block.storage_offset = result.row_count_storage.size(); result.row_count_storage.resize(block.storage_offset + COLUMN_ROW_COUNT_BLOCK_ROWS, 0); @@ -1377,68 +1279,17 @@ static inline int64_t& column_row_count_slot(ChunkResult& result, size_t row_idx .row_count_storage[result.row_count_blocks[(size_t)block_pos].storage_offset + local]; } -static void observe_dense_col_name(DenseColChunkStats& stats, std::string_view name) -{ - if (!stats.candidate) { return; } - - std::string_view prefix; - uint64_t value = 0; - size_t suffix_width = 0; - if 
(!parse_trailing_u64(name, prefix, value, suffix_width)) { - stats.candidate = false; - return; - } - - if (stats.count == 0) { - stats.prefix = prefix; - stats.first_id = value; - stats.last_id = value; - stats.pad_width = suffix_width; - stats.zero_padded = dense_suffix_is_zero_padded(name, suffix_width); - stats.count = 1; - return; - } - - if (prefix != stats.prefix) { - stats.candidate = false; - return; - } - if (stats.last_id == std::numeric_limits::max() || value != stats.last_id + 1) { - stats.candidate = false; - return; - } - if (!dense_suffix_width_ok(value, suffix_width, stats.zero_padded, stats.pad_width)) { - stats.candidate = false; - return; - } - stats.last_id = value; - stats.count++; -} - -static bool dense_col_chunk_padding_compatible(const DenseColChunkStats& stats, - bool global_zero_padded, +static bool dense_col_chunk_padding_compatible(const dense_observe_state_t& stats, size_t global_pad_width) { - if (global_zero_padded) { - return stats.pad_width == global_pad_width || - (!stats.zero_padded && decimal_digits_u64(stats.first_id) >= global_pad_width); + if (global_pad_width > 0) { + return stats.index.pad_width == global_pad_width || + (stats.index.pad_width == 0 && + decimal_digits_u64(stats.index.min_id) >= global_pad_width); } - return !stats.zero_padded; -} - -// Read first field (column name) from a line without modifying any state -static std::string_view peek_line_column_name(const char* line_start, const char* end) -{ - const char* p = line_start; - while (p < end && *p <= ' ' && *p != '\n') - p++; - const char* field_start = p; - while (p < end && *p > ' ') - p++; - return std::string_view(field_start, (size_t)(p - field_start)); + return stats.index.pad_width == 0; } -// Find the start of the next line static const char* find_next_line(const char* p, const char* end) { while (p < end && *p != '\n') @@ -1471,16 +1322,15 @@ static const char* find_line_start(const char* section_start, const char* p) return p; } -static std::vector 
compute_bounds_chunk_boundaries(const char* section_start, - const char* section_end, - int num_threads) +static std::vector compute_bounds_chunk_boundaries( + const char* section_start, const char* section_end, int num_threads) { scoped_timer_t timer("bounds_compute_chunk_boundaries"); const size_t total_size = (size_t)(section_end - section_start); const size_t chunk_size = total_size / (size_t)num_threads; - std::vector boundaries((size_t)num_threads); + std::vector boundaries((size_t)num_threads); boundaries[0].start = section_start; for (int t = 0; t < num_threads; ++t) { if (t == num_threads - 1) { @@ -1506,19 +1356,17 @@ static std::vector compute_bounds_chunk_boundaries(const ch return boundaries; } -static std::vector compute_chunk_boundaries(const char* columns_start, - const char* columns_end, - int num_threads) +static std::vector compute_chunk_boundaries(const char* columns_start, + const char* columns_end, + int num_threads) { scoped_timer_t timer("compute_chunk_boundaries"); size_t total_size = (size_t)(columns_end - columns_start); size_t chunk_size = total_size / (size_t)num_threads; - std::vector boundaries(num_threads); + std::vector boundaries(num_threads); - // Parallel boundary finding - each thread finds its own end at a column transition - // #pragma omp parallel for for (int t = 0; t < num_threads; t++) { if (t == 0) { boundaries[t].start = columns_start; } @@ -1533,7 +1381,7 @@ static std::vector compute_chunk_boundaries(const char* columns_s if (line_start < columns_end) line_start++; // Read column name at this line - std::string_view col_name = peek_line_column_name(line_start, columns_end); + std::string_view col_name = cursor_t::peek_field_at(line_start, columns_end); // Scan forward until column name changes (to avoid splitting a column) const char* boundary = line_start; @@ -1541,9 +1389,9 @@ static std::vector compute_chunk_boundaries(const char* columns_s const char* next_line = find_next_line(boundary, columns_end); if (next_line 
>= columns_end) break; - std::string_view next_col = peek_line_column_name(next_line, columns_end); + std::string_view next_col = cursor_t::peek_field_at(next_line, columns_end); if (next_col != col_name && !next_col.empty() && next_col[0] != '\'') { - // Found a column transition (and it's not a MARKER line) + // Found a column transition. Marker-state fixup later handles any split near markers. boundary = next_line; break; } @@ -1562,11 +1410,11 @@ static std::vector compute_chunk_boundaries(const char* columns_s } template -static ChunkResult parse_columns_chunk(const char* chunk_start, - const char* chunk_end, - const parse_state_t& state) +static chunk_result_t parse_columns_chunk(const char* chunk_start, + const char* chunk_end, + const parse_state_t& state) { - ChunkResult result; + chunk_result_t result; if (chunk_start >= chunk_end) { result.col_offsets.push_back(0); @@ -1576,7 +1424,7 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, size_t chunk_size = (size_t)(chunk_end - chunk_start); size_t estimated_nnz = chunk_size / 100; size_t estimated_cols = estimated_nnz / 10; - if (__unlikely(state.problem.n_constraints_ > (i_t)std::numeric_limits::max())) { + if (UNLIKELY(state.problem.n_constraints_ > (i_t)std::numeric_limits::max())) { state.cursor.error("fast COLUMNS path requires <= INT32_MAX rows for chunk row indices"); } result.values.reserve(estimated_nnz); @@ -1585,8 +1433,8 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, result.var_names.reserve(estimated_cols); result.var_name_arena.reserve(std::max(4096, estimated_cols * 16)); result.objective_entries.reserve(estimated_cols); - size_t n_row_blocks = ((size_t)state.problem.n_constraints_ + COLUMN_ROW_COUNT_BLOCK_ROWS - 1) / - COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t n_row_blocks = + cuda::ceil_div((size_t)state.problem.n_constraints_, COLUMN_ROW_COUNT_BLOCK_ROWS); result.row_count_block_dir.resize(n_row_blocks, -1); size_t estimated_touched_blocks = 
std::min(n_row_blocks, std::max(16, estimated_nnz)); result.row_count_blocks.reserve(estimated_touched_blocks); @@ -1598,31 +1446,35 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, cursor.skip_ws(); while (!cursor.done()) { - if (__unlikely(*cursor.ptr == 'R')) { + if (UNLIKELY(*cursor.ptr == 'R')) { auto next = cursor.peek_field(); // RHS section is mandatory right after COLUMNS section if (next == "RHS") { break; } } auto [var_name, field2] = cursor.read_two_fields(); - if (__unlikely(!field2.empty() && field2[0] == '$')) { + if (UNLIKELY(!field2.empty() && field2[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); continue; } // Check for integer marker - if (__unlikely(field2[0] == '\'' && field2 == "'MARKER'")) { + if (UNLIKELY(field2[0] == '\'' && field2 == "'MARKER'")) { auto marker_type = cursor.read_field(); - MarkerInfo marker; + marker_info_t marker; marker.after_local_var_idx = result.var_names.empty() ? SIZE_MAX : result.var_names.size() - 1; if (marker_type == "'INTORG'") { - marker.type = MarkerInfo::INTORG; + marker.type = marker_info_t::INTORG; + } else if (marker_type == "'INTEND'") { + marker.type = marker_info_t::INTEND; } else { - marker.type = MarkerInfo::INTEND; + cursor.error("unknown integer marker type in COLUMNS: %.*s", + (int)marker_type.size(), + marker_type.data()); } result.markers.push_back(marker); @@ -1649,32 +1501,33 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, value = sign * fp64::parse_fp64_advance(cursor.ptr, cursor.end); } // usually EOL directly follows - if (__unlikely(!cursor.eol())) { cursor.skip_ws(); } + if (UNLIKELY(!cursor.eol())) { cursor.skip_ws(); } accept_comment(cursor); if (prev_var_name != var_name) { std::string_view owned_var_name = result.var_name_arena.copy(var_name); result.var_names.push_back(owned_var_name); - observe_dense_col_name(result.dense_col_stats, owned_var_name); + observe_dense_name(result.dense_col_stats.candidate, + result.dense_col_stats.index, + 
result.dense_col_stats.count, + owned_var_name); result.col_offsets.push_back(result.values.size()); prev_var_name = owned_var_name; - if (result.first_var_name.empty()) { result.first_var_name = owned_var_name; } - result.last_var_name = owned_var_name; } auto add_entry = [&](std::string_view rn, double val) { size_t row_idx = state.row_lookup(rn); - if (__likely(row_idx != SIZE_MAX)) { + if (LIKELY(row_idx != SIZE_MAX)) { assert(row_idx <= (size_t)std::numeric_limits::max()); result.values.push_back(val); result.row_indices.push_back((uint32_t)row_idx); column_row_count_slot(result, row_idx)++; - } else if (__likely(rn == state.objective_name_sv)) { + } else if (LIKELY(rn == state.objective_name_sv)) { result.objective_entries.push_back({result.var_names.size() - 1, val}); - } else if (state.is_ignored_objective_name(rn)) { + } else if (state.ignored_objective_names.count(rn)) { return; } else { - state.cursor.error("unknown row name in COLUMNS: %.*s", (int)rn.size(), rn.data()); + cursor.error("unknown row name in COLUMNS: %.*s", (int)rn.size(), rn.data()); } }; @@ -1683,7 +1536,7 @@ static ChunkResult parse_columns_chunk(const char* chunk_start, // Optional second entry on same line if (!cursor.eol()) { auto row_name2 = cursor.read_field(); - if (__unlikely(!row_name2.empty() && row_name2[0] == '$')) { + if (UNLIKELY(!row_name2.empty() && row_name2[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); continue; @@ -1714,8 +1567,8 @@ struct column_merge_shape_t { }; template -static column_merge_shape_t compute_column_merge_shape(const std::vector& chunks, - i_t n_rows) +static column_merge_shape_t compute_column_merge_shape( + const std::vector& chunks, i_t n_rows) { column_merge_shape_t shape; shape.num_chunks = (int)chunks.size(); @@ -1752,7 +1605,7 @@ static column_merge_shape_t compute_column_merge_shape(const std::vector static void detect_dense_column_metadata(parse_state_t& state, - const std::vector& chunks, + const std::vector& chunks, const 
column_merge_shape_t& shape) { scoped_timer_t timer("columns_dense_metadata"); @@ -1763,7 +1616,6 @@ static void detect_dense_column_metadata(parse_state_t& state, uint64_t dense_min_id = 0; uint64_t dense_max_id = 0; size_t dense_pad_width = 0; - bool dense_zero_padded = false; for (int t = 0; t < shape.num_chunks && dense_ok; ++t) { const auto& stats = chunks[(size_t)t].dense_col_stats; @@ -1773,28 +1625,28 @@ static void detect_dense_column_metadata(parse_state_t& state, break; } if (!have_first) { - have_first = true; - dense_prefix = stats.prefix; - expected_next_id = stats.first_id; - dense_min_id = stats.first_id; - dense_pad_width = stats.pad_width; - dense_zero_padded = stats.zero_padded; - } - if (stats.prefix != dense_prefix || stats.first_id != expected_next_id || - !dense_col_chunk_padding_compatible(stats, dense_zero_padded, dense_pad_width)) { + have_first = true; + dense_prefix = stats.index.prefix; + expected_next_id = stats.index.min_id; + dense_min_id = stats.index.min_id; + dense_pad_width = stats.index.pad_width; + } + if (stats.index.prefix != dense_prefix || stats.index.min_id != expected_next_id || + !dense_col_chunk_padding_compatible(stats, dense_pad_width)) { dense_ok = false; break; } - if (stats.last_id < stats.first_id || stats.last_id - stats.first_id + 1 != stats.count) { + if (stats.index.max_id < stats.index.min_id || + stats.index.max_id - stats.index.min_id + 1 != stats.count) { dense_ok = false; break; } - dense_max_id = stats.last_id; - if (stats.last_id == std::numeric_limits::max()) { + dense_max_id = stats.index.max_id; + if (stats.index.max_id == std::numeric_limits::max()) { dense_ok = false; break; } - expected_next_id = stats.last_id + 1; + expected_next_id = stats.index.max_id + 1; } if (!have_first || dense_max_id < dense_min_id || @@ -1802,20 +1654,18 @@ static void detect_dense_column_metadata(parse_state_t& state, dense_ok = false; } - state.col_dense_ordered = dense_ok; + state.col_index_mode = dense_ok ? 
index_mode_t::dense_ordered : index_mode_t::hash; if (dense_ok) { - state.col_dense_prefix_storage.assign(dense_prefix); - state.col_dense_prefix = state.col_dense_prefix_storage; - state.col_dense_min_id = dense_min_id; - state.col_dense_max_id = dense_max_id; - state.col_dense_pad_width = dense_pad_width; - state.col_dense_zero_padded = dense_zero_padded; + state.col_dense.prefix.assign(dense_prefix); + state.col_dense.min_id = dense_min_id; + state.col_dense.max_id = dense_max_id; + state.col_dense.pad_width = dense_pad_width; } } template static std::vector build_csr_row_offsets(parse_state_t& state, - const std::vector& chunks, + const std::vector& chunks, const column_merge_shape_t& shape) { std::vector global_row_counts((size_t)shape.n_rows, 0); @@ -1846,7 +1696,7 @@ static std::vector build_csr_row_offsets(parse_state_t& state, } template -static void convert_counts_to_write_positions(std::vector& chunks, +static void convert_counts_to_write_positions(std::vector& chunks, const column_merge_shape_t& shape, const std::vector& row_offsets, std::vector& global_row_counts) @@ -1870,7 +1720,8 @@ static void convert_counts_to_write_positions(std::vector& chunks, } } -static void materialize_chunk_row_count_storage(std::vector& chunks, int num_threads) +static void materialize_chunk_row_count_storage(std::vector& chunks, + int num_threads) { scoped_timer_t timer("columns_row_count_storage_hugepages"); #pragma omp parallel for num_threads(num_threads) @@ -1901,7 +1752,7 @@ static void allocate_column_outputs(parse_state_t& state, } #pragma omp section { - if (!state.col_dense_ordered) { + if (state.col_index_mode != index_mode_t::dense_ordered) { state.var_name_arenas.clear(); state.var_name_arenas.resize((size_t)shape.num_chunks); state.var_names_sv.resize(shape.total_cols); @@ -1916,7 +1767,7 @@ static void allocate_column_outputs(parse_state_t& state, template static void scatter_column_chunks_to_csr(parse_state_t& state, - std::vector& chunks, + std::vector& 
chunks, const column_merge_shape_t& shape, int num_threads) { @@ -1942,7 +1793,7 @@ static void scatter_column_chunks_to_csr(parse_state_t& state, size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; size_t local = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; int32_t block_pos = chunk.row_count_block_dir[block_id]; - RowCountBlock& block = chunk.row_count_blocks[(size_t)block_pos]; + row_count_block_t& block = chunk.row_count_blocks[(size_t)block_pos]; int64_t& write_pos = chunk.row_count_storage[block.storage_offset + local]; i_t dest = (i_t)write_pos++; state.problem.A_[dest] = (f_t)chunk.values[idx]; @@ -1958,7 +1809,7 @@ static void scatter_column_chunks_to_csr(parse_state_t& state, #endif } - if (!state.col_dense_ordered) { + if (state.col_index_mode != index_mode_t::dense_ordered) { scoped_timer_t names_timer("scatter_var_names"); #pragma omp parallel for num_threads(num_threads) for (int t = 0; t < shape.num_chunks; t++) { @@ -1975,13 +1826,13 @@ static void scatter_column_chunks_to_csr(parse_state_t& state, } struct global_marker_t { - MarkerInfo::Type type; + marker_info_t::Type type; size_t global_var_idx; }; template static void apply_column_integer_markers(parse_state_t& state, - const std::vector& chunks, + const std::vector& chunks, const column_merge_shape_t& shape) { scoped_timer_t timer("columns_apply_markers"); @@ -1999,7 +1850,7 @@ static void apply_column_integer_markers(parse_state_t& state, } } - std::sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) { + std::stable_sort(all_markers.begin(), all_markers.end(), [](const auto& a, const auto& b) { if (a.global_var_idx == SIZE_MAX && b.global_var_idx != SIZE_MAX) return true; if (b.global_var_idx == SIZE_MAX && a.global_var_idx != SIZE_MAX) return false; return a.global_var_idx < b.global_var_idx; @@ -2010,7 +1861,7 @@ static void apply_column_integer_markers(parse_state_t& state, for (size_t v = 0; v < shape.total_cols; v++) { while (marker_idx < 
all_markers.size() && (all_markers[marker_idx].global_var_idx == SIZE_MAX || all_markers[marker_idx].global_var_idx < v)) { - is_integer = all_markers[marker_idx].type == MarkerInfo::INTORG; + is_integer = all_markers[marker_idx].type == marker_info_t::INTORG; marker_idx++; } state.problem.var_types_[v] = is_integer ? 'I' : 'C'; @@ -2019,7 +1870,7 @@ static void apply_column_integer_markers(parse_state_t& state, template static void assign_column_objective_entries(parse_state_t& state, - const std::vector& chunks, + const std::vector& chunks, const column_merge_shape_t& shape) { scoped_timer_t timer("columns_objective_entries"); @@ -2034,7 +1885,7 @@ static void assign_column_objective_entries(parse_state_t& state, template static void merge_chunk_results_to_csr(parse_state_t& state, - std::vector& chunks, + std::vector& chunks, int num_threads) { scoped_timer_t timer("merge_chunks_to_csr"); @@ -2071,11 +1922,10 @@ static void parse_columns_section_parallel(parse_state_t& state, size_t chunk_limited_threads = std::max(1, columns_bytes / MPS_COLUMNS_MIN_CHUNK_BYTES); num_threads = std::max(1, std::min(num_threads, (int)chunk_limited_threads)); - // Compute chunk boundaries auto chunk_bounds = compute_chunk_boundaries(columns_start, columns_end, num_threads); // Parse chunks in parallel - std::vector results(num_threads); + std::vector results(num_threads); { scoped_timer_t timer("parse_columns_chunk_parallel"); @@ -2145,7 +1995,7 @@ static void parse_rhs_section(parse_state_t& state, cursor_t& cursor) return; } // Other objectives, ignored currently. cold path - if (state.is_ignored_objective_name(row_name)) { return; } + if (state.ignored_objective_names.count(row_name)) { return; } // Unexpected! 
error_unknown_row(cursor, row_start, "RHS"); }; @@ -2175,6 +2025,8 @@ static void parse_rhs_section(parse_state_t& state, cursor_t& cursor) } } +// does the job on 99% of instances, in the vast majority of cases bound names are sequential with +// occasional sparsity static size_t find_var_after_hint(const std::vector& var_names, std::string_view var_name, size_t hint_idx) @@ -2230,7 +2082,7 @@ static bool apply_bound_record(std::string_view bound_type, if (first_bound_for_var && value < f_t{0}) { set_lb(-std::numeric_limits::infinity()); } set_type('I'); } else if (bound_type == "SC") { - if (__unlikely(!has_value)) { + if (UNLIKELY(!has_value)) { error("SC bound requires an upper bound value", bound_type); return false; } @@ -2252,7 +2104,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, { const size_t bounds_bytes = (size_t)(bounds_body_end - bounds_body_start); const int num_threads = phase_thread_count(MPS_BOUNDS_THREAD_CAP); - const bool use_dense_lookup = state.col_dense_ordered; + const bool use_dense_lookup = state.col_index_mode == index_mode_t::dense_ordered; const size_t min_parallel_bytes = use_dense_lookup ? MPS_BOUNDS_PARALLEL_MIN_BYTES : MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES; if (bounds_bytes < min_parallel_bytes || num_threads < 2) { return false; } @@ -2261,7 +2113,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, use_dense_lookup ? 
"parse_bounds_parallel_dense" : "parse_bounds_parallel_ordered_hint", nvtx::colors::bounds); - struct BoundsParallelStats { + struct bounds_parallel_stats_t { size_t lines = 0; size_t dense_hits = 0; size_t dense_misses = 0; @@ -2273,7 +2125,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, char error_msg[192] = {}; }; - std::vector stats((size_t)num_threads); + std::vector stats((size_t)num_threads); auto boundaries = compute_bounds_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads); @@ -2297,12 +2149,14 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, size_t prev_var = SIZE_MAX; size_t hint_idx = 0; auto lookup_var = [&](std::string_view var_name) { - if (use_dense_lookup) { return state.col_lookup_dense_ordered(var_name); } + if (use_dense_lookup) { return state.col_dense.lookup(var_name); } + // quite often variables are in order, so a cheap lookup trick is to look for the variable + // right after this one return find_var_after_hint(state.var_names_sv, var_name, hint_idx); }; try { while (cursor.ptr < cursor.end) { - if (__unlikely(*cursor.ptr == '$')) { + if (UNLIKELY(*cursor.ptr == '$')) { cursor.skip_to_eol(); expect_eol(cursor); local.comments++; @@ -2310,8 +2164,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, } auto bound_type = cursor.read_field(); - if (__unlikely(bound_type.empty())) { break; } - if (__unlikely(bound_type[0] == '$')) { + if (UNLIKELY(bound_type.empty())) { break; } + if (UNLIKELY(bound_type[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); local.comments++; @@ -2321,7 +2175,7 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, auto bound_name = cursor.read_field(); (void)bound_name; auto var_name = cursor.read_field(); - if (__unlikely(!var_name.empty() && var_name[0] == '$')) { + if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); local.comments++; @@ -2329,7 +2183,7 @@ 
static bool parse_bounds_section_parallel_dense(parse_state_t& state, } size_t var_idx = lookup_var(var_name); - if (__unlikely(var_idx == SIZE_MAX)) { + if (UNLIKELY(var_idx == SIZE_MAX)) { local.dense_misses++; break; } @@ -2435,36 +2289,14 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, } template -static void parse_bounds_section(parse_state_t& state, - cursor_t& cursor, - bool allow_parallel_dense = false) +static void init_variable_bounds_defaults(parse_state_t& state) { size_t n_vars = (size_t)state.problem.n_vars_; - - // Initialize bounds with defaults { scoped_timer_t timer("bounds_init_defaults"); - const bool parallel_init = - n_vars >= MPS_BOUNDS_PARALLEL_INIT_MIN_VARS && omp_get_max_threads() >= 2; - - if (parallel_init) { -#pragma omp parallel sections num_threads(2) - { -#pragma omp section - { - state.problem.variable_lower_bounds_.resize(n_vars, f_t{0}); - } -#pragma omp section - { - state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits::infinity()); - } - } - } else { - state.problem.variable_lower_bounds_.resize(n_vars, f_t{0}); - state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits::infinity()); - } + state.problem.variable_lower_bounds_.resize(n_vars, f_t{0}); + state.problem.variable_upper_bounds_.resize(n_vars, std::numeric_limits::infinity()); } - { scoped_timer_t timer("bounds_madvise_pretouch"); materialize_vector_hugepages("variable_lower_bounds", @@ -2474,6 +2306,35 @@ static void parse_bounds_section(parse_state_t& state, state.problem.variable_upper_bounds_, materialize_touch_t::write_4kb); } +} + +template +static void apply_unspecified_integer_bounds(parse_state_t& state, HasBound&& has_bound) +{ + scoped_timer_t timer("bounds_integer_defaults"); + size_t n_vars = (size_t)state.problem.n_vars_; + for (size_t i = 0; i < n_vars; ++i) { + if (!has_bound(i) && state.problem.var_types_[i] == 'I') { + state.problem.variable_lower_bounds_[i] = f_t{0}; + 
state.problem.variable_upper_bounds_[i] = f_t{1}; + } + } +} + +template +static void init_variable_bounds_without_bounds_section(parse_state_t& state) +{ + init_variable_bounds_defaults(state); + apply_unspecified_integer_bounds(state, [](size_t) { return false; }); +} + +template +static void parse_bounds_section(parse_state_t& state, + cursor_t& cursor, + bool allow_parallel_dense = false) +{ + size_t n_vars = (size_t)state.problem.n_vars_; + init_variable_bounds_defaults(state); std::vector bound_seen((n_vars + 63) / 64, 0); auto has_bound = [&](size_t var_idx) { @@ -2482,18 +2343,9 @@ static void parse_bounds_section(parse_state_t& state, auto mark_bound = [&](size_t var_idx) { bound_seen[var_idx >> 6] |= uint64_t{1} << (var_idx & 63); }; - auto apply_unspecified_integer_bounds = [&]() { - scoped_timer_t timer("bounds_integer_defaults"); - for (size_t i = 0; i < n_vars; ++i) { - if (!has_bound(i) && state.problem.var_types_[i] == 'I') { - state.problem.variable_lower_bounds_[i] = f_t{0}; - state.problem.variable_upper_bounds_[i] = f_t{1}; - } - } - }; if (!accept_section(cursor, "BOUNDS")) { - apply_unspecified_integer_bounds(); + apply_unspecified_integer_bounds(state, has_bound); return; } @@ -2523,17 +2375,18 @@ static void parse_bounds_section(parse_state_t& state, auto bound_name = cursor.read_field(); (void)bound_name; auto var_name = cursor.read_field(); - if (__unlikely(!var_name.empty() && var_name[0] == '$')) { + if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); continue; } // optimized lookup using hint (bounds often in same order as columns) - size_t var_idx = SIZE_MAX; + size_t var_idx = SIZE_MAX; + // handle annoying bounds-only vars that weren't declared in COLUMNS typename parse_state_t::bounds_only_var_t* aux_var = nullptr; - if (__likely(state.col_dense_ordered)) { - var_idx = state.col_lookup_dense_ordered(var_name); + if (LIKELY(state.col_index_mode == index_mode_t::dense_ordered)) { + 
var_idx = state.col_dense.lookup(var_name); if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; } } else { var_idx = find_var_after_hint(state.var_names_sv, var_name, hint_idx); @@ -2586,15 +2439,12 @@ static void parse_bounds_section(parse_state_t& state, expect_eol(cursor); } } - apply_unspecified_integer_bounds(); + apply_unspecified_integer_bounds(state, has_bound); } template -static void parse_ranges_section(parse_state_t& state, cursor_t& cursor) +static void init_constraint_bounds_from_rows(parse_state_t& state) { - scoped_timer_t timer("parse_ranges"); - - // Initialize constraint bounds from row_types and b_ state.problem.constraint_lower_bounds_.resize((size_t)state.problem.n_constraints_); state.problem.constraint_upper_bounds_.resize((size_t)state.problem.n_constraints_); @@ -2612,6 +2462,13 @@ static void parse_ranges_section(parse_state_t& state, cursor_t& curso state.problem.constraint_upper_bounds_[i] = std::numeric_limits::infinity(); } } +} + +template +static void parse_ranges_section(parse_state_t& state, cursor_t& cursor) +{ + scoped_timer_t timer("parse_ranges"); + init_constraint_bounds_from_rows(state); if (!accept_section(cursor, "RANGES")) { return; } @@ -2654,7 +2511,7 @@ static void parse_ranges_section(parse_state_t& state, cursor_t& curso accept_comment(cursor); if (!cursor.eol()) { auto row_name2 = cursor.read_field(); - if (__unlikely(!row_name2.empty() && row_name2[0] == '$')) { + if (UNLIKELY(!row_name2.empty() && row_name2[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); continue; @@ -2667,10 +2524,14 @@ static void parse_ranges_section(parse_state_t& state, cursor_t& curso } } +// quadratric stuff is bare bones for now, optimize if needed + template static void build_var_name_map_if_needed(parse_state_t& state) { - if (state.col_dense_ordered || !state.var_names_map.empty()) { return; } + if (state.col_index_mode == index_mode_t::dense_ordered || !state.var_names_map.empty()) { + return; + } 
scoped_timer_t timer("quadratic_build_var_name_map"); state.var_names_map.reserve((size_t)state.problem.n_vars_ * 2); for (size_t i = 0; i < state.var_names_sv.size(); ++i) { @@ -2681,7 +2542,7 @@ static void build_var_name_map_if_needed(parse_state_t& state) template static size_t lookup_quadratic_var(parse_state_t& state, std::string_view name) { - if (state.col_dense_ordered) { return state.col_lookup_dense_ordered(name); } + if (state.col_index_mode == index_mode_t::dense_ordered) { return state.col_dense.lookup(name); } auto it = state.var_names_map.find(name); return it == state.var_names_map.end() ? SIZE_MAX : it->second; } @@ -2695,14 +2556,14 @@ static void build_quadratic_csr(parse_state_t& state, const size_t n_vars = (size_t)state.problem.n_vars_; if (entries.empty()) { return; } - struct ExpandedEntry { + struct expanded_entry_t { size_t row; size_t col; size_t seq; f_t value; }; - std::vector expanded; + std::vector expanded; expanded.reserve(symmetric_upper_triangular ? entries.size() * 2 : entries.size()); size_t seq = 0; for (const auto& [row_i, col_i, value] : entries) { @@ -2779,14 +2640,14 @@ static void parse_quadratic_sections(parse_state_t& state, cursor_t& c if (active_entries == nullptr) { break; } auto var1 = cursor.read_field(); - if (__unlikely(var1.empty())) { break; } - if (__unlikely(var1[0] == '$')) { + if (UNLIKELY(var1.empty())) { break; } + if (UNLIKELY(var1[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); continue; } auto var2 = cursor.read_field(); - if (__unlikely(!var2.empty() && var2[0] == '$')) { + if (UNLIKELY(!var2.empty() && var2[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); continue; @@ -2847,37 +2708,29 @@ static void parse_rhs_range(parse_state_t& state, mps_phase_range_t ra } template -static void parse_bounds_range(parse_state_t& state, - mps_phase_range_t range, - const char* fallback_ptr) +static void parse_bounds_range(parse_state_t& state, mps_phase_range_t range) { - if (range.present) { - 
cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); - parse_bounds_section(state, cursor, range.present); - } else { - cursor_t cursor(fallback_ptr, 16); - parse_bounds_section(state, cursor, range.present); + if (!range.present) { + init_variable_bounds_without_bounds_section(state); + return; } + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_bounds_section(state, cursor, true); } template -static void parse_ranges_range(parse_state_t& state, - mps_phase_range_t range, - const char* fallback_ptr) +static void parse_ranges_range(parse_state_t& state, mps_phase_range_t range) { - if (range.present) { - cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); - parse_ranges_section(state, cursor); - } else { - cursor_t cursor(fallback_ptr, 16); - parse_ranges_section(state, cursor); + if (!range.present) { + init_constraint_bounds_from_rows(state); + return; } + cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); + parse_ranges_section(state, cursor); } template -static void parse_quadratic_range(parse_state_t& state, - mps_phase_range_t range, - const char*) +static void parse_quadratic_range(parse_state_t& state, mps_phase_range_t range) { if (!range.present) { return; } cursor_t cursor(range.begin, (size_t)(range.end - range.begin)); @@ -2917,16 +2770,17 @@ static void materialize_problem_names(parse_state_t& state) { scoped_timer_t timer("materialize_problem_var_names"); - size_t n = state.col_dense_ordered ? (size_t)state.problem.n_vars_ : state.var_names_sv.size(); + const bool col_dense_ordered = state.col_index_mode == index_mode_t::dense_ordered; + size_t n = col_dense_ordered ? 
(size_t)state.problem.n_vars_ : state.var_names_sv.size(); state.problem.var_names_.resize(n); - if (state.col_dense_ordered && n >= 1'000'000 && num_threads > 1) { + if (col_dense_ordered && n >= 1'000'000 && num_threads > 1) { #pragma omp parallel for schedule(static) num_threads(num_threads) for (size_t i = 0; i < n; ++i) { - state.dense_col_name(i, state.problem.var_names_[i]); + state.col_dense.format_name(i, state.problem.var_names_[i]); } - } else if (state.col_dense_ordered) { + } else if (col_dense_ordered) { for (size_t i = 0; i < n; ++i) { - state.dense_col_name(i, state.problem.var_names_[i]); + state.col_dense.format_name(i, state.problem.var_names_[i]); } } else if (n >= 1'000'000 && num_threads > 1) { #pragma omp parallel for schedule(static) num_threads(num_threads) @@ -2969,7 +2823,7 @@ static std::size_t init_problem_storage( problem.objective_scaling_factor_ = f_t{1}; problem.objective_offset_ = f_t{0}; - std::size_t reserve_size = std::max(reserve_hint, 1024 * 1024); + std::size_t reserve_size = std::max(reserve_hint, 1 * MiB); std::size_t reserve_dim = std::max((size_t)1000, reserve_size / 1000); problem.A_offsets_.reserve(reserve_dim); problem.b_.reserve(reserve_dim); @@ -2984,26 +2838,14 @@ static std::size_t init_problem_storage( return reserve_dim; } -static const char* trailing_endata_cursor_end(mps_phase_registry_t& registry) -{ - mps_phase_range_t quadratic = registry.range(mps_phase_kind::quadratic); - if (quadratic.present) { return quadratic.end; } - mps_phase_range_t bounds = registry.range(mps_phase_kind::bounds); - if (bounds.present) { return bounds.end; } - mps_phase_range_t ranges = registry.range(mps_phase_kind::ranges); - if (ranges.present) { return ranges.end; } - return registry.range(mps_phase_kind::rhs).end; -} - template static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_stream( Stream& stream, const char* total_timer_name, const char* producer_task_name) { - omp_set_max_active_levels(2); + 
omp_max_active_levels_guard_t omp_active_levels(2); input_stream_view_t input = stream.view(); - timer_io_context_t timer_io_context(input.compressed_size); - auto total_timer = std::make_unique(total_timer_name); + auto total_timer = std::make_unique(total_timer_name); cuopt::linear_programming::io::mps_data_model_t problem; std::size_t reserve_dim = init_problem_storage(problem, stream.reserve_size_hint()); @@ -3159,7 +3001,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ { run_parser_task([&] { MPS_NVTX_RANGE("task_ranges", nvtx::colors::ranges); - parse_ranges_range(state, input.registry->range(mps_phase_kind::ranges), input.data); + parse_ranges_range(state, input.registry->range(mps_phase_kind::ranges)); phase_end("ranges"); }); } @@ -3168,7 +3010,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ { run_parser_task([&] { MPS_NVTX_RANGE("task_bounds", nvtx::colors::bounds); - parse_bounds_range(state, input.registry->range(mps_phase_kind::bounds), input.data); + parse_bounds_range(state, input.registry->range(mps_phase_kind::bounds)); phase_end("bounds"); }); } @@ -3177,8 +3019,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ { run_parser_task([&] { MPS_NVTX_RANGE("task_quadratic", nvtx::colors::generic); - parse_quadratic_range( - state, input.registry->range(mps_phase_kind::quadratic), input.data); + parse_quadratic_range(state, input.registry->range(mps_phase_kind::quadratic)); phase_end("quadratic"); }); } @@ -3192,93 +3033,34 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ append_bounds_only_variables(state); input.size = stream.size(); - cursor.ptr = trailing_endata_cursor_end(*input.registry); cursor.end = input.data + input.size; - if (!cursor.done()) { expect(cursor, "ENDATA"); } + if (!input.registry->endata_ready() || !input.registry->endata_present()) { + cursor.ptr = + input.registry->endata_ready() ? 
input.registry->endata_begin() : input.data + input.size; + cursor.error("missing ENDATA"); + } + cursor.ptr = input.registry->endata_begin(); + expect(cursor, "ENDATA"); total_timer.reset(); flush_timers(); return problem; } -struct small_raw_read_t { - bool use_small_path = false; +struct padded_memory_input_t { std::vector buffer; - std::size_t size = 0; + std::size_t input_size = 0; + std::size_t compressed_size = 0; }; -static small_raw_read_t try_read_small_raw_file(const std::string& path) -{ - FILE* file = std::fopen(path.c_str(), "rb"); - if (file == nullptr) { - mps_parser_fail(error_type_t::RuntimeError, - "Failed to open raw MPS file '%s': %s", - path.c_str(), - std::strerror(errno)); - } - std::unique_ptr file_guard(file, &std::fclose); - - if (std::fseek(file, 0, SEEK_END) != 0) { - mps_parser_fail(error_type_t::RuntimeError, "Failed to seek raw MPS file '%s'", path.c_str()); - } - long file_size_long = std::ftell(file); - if (file_size_long < 0) { - mps_parser_fail( - error_type_t::RuntimeError, "Failed to determine raw MPS file size '%s'", path.c_str()); - } - std::size_t file_size = (std::size_t)file_size_long; - if (file_size > MPS_SMALL_RAW_FILE_BYTES) { return {}; } - if (std::fseek(file, 0, SEEK_SET) != 0) { - mps_parser_fail(error_type_t::RuntimeError, "Failed to rewind raw MPS file '%s'", path.c_str()); - } - - if (file_size > std::numeric_limits::max() - input_buffer_padding_bytes) { - mps_parser_fail(error_type_t::OutOfMemoryError, "small raw input padding size overflow"); - } - std::vector buffer(file_size + input_buffer_padding_bytes); - if (file_size != 0 && std::fread(buffer.data(), 1, file_size, file) != file_size) { - mps_parser_fail(error_type_t::RuntimeError, "Failed to read raw MPS file '%s'", path.c_str()); - } - return {true, std::move(buffer), file_size}; -} - -template -static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_small_raw_file( - std::vector buffer, std::size_t input_size) +static 
padded_memory_input_t read_compressed_mps_file(const std::string& path) { - auto total_timer = std::make_unique("parse_mps_fast_file_raw_small (total)"); - const char* data = buffer.data(); - const char* end = data + input_size; - - mps_phase_registry_t registry; - mps_section_block_scanner_t scanner(data, 1, registry); - scanner.observe_block(0, data, end); - scanner.publish_ready(input_size); - - cuopt::linear_programming::io::mps_data_model_t problem; - std::size_t reserve_dim = init_problem_storage(problem, input_size); + std::vector buffer = cuopt::linear_programming::io::detail::file_to_string(path); + if (buffer.empty()) { buffer.push_back('\0'); } - cursor_t cursor(data, input_size); - parse_state_t state(problem, cursor); - state.row_names_sv.reserve(reserve_dim); - - parse_header_range(state, registry.range(mps_phase_kind::header)); - parse_rows_range(state, registry.range(mps_phase_kind::rows)); - parse_columns_range(state, registry.range(mps_phase_kind::columns), 1); - materialize_problem_names(state); - parse_rhs_range(state, registry.range(mps_phase_kind::rhs)); - parse_ranges_range(state, registry.range(mps_phase_kind::ranges), data); - parse_bounds_range(state, registry.range(mps_phase_kind::bounds), data); - parse_quadratic_range(state, registry.range(mps_phase_kind::quadratic), data); - append_bounds_only_variables(state); - - cursor.ptr = trailing_endata_cursor_end(registry); - cursor.end = end; - if (!cursor.done()) { expect(cursor, "ENDATA"); } - - total_timer.reset(); - flush_timers(); - return problem; + std::size_t input_size = buffer.size() - 1; + buffer.resize(input_size + input_buffer_padding_bytes, '\0'); + return {std::move(buffer), input_size, get_file_size(path)}; } template @@ -3286,22 +3068,30 @@ cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( const std::string& path, FileReadMethod read_method) { FileReadMethod effective_method = effective_file_read_method(path, read_method); - if (effective_method == 
FileReadMethod::Lz4) { - Lz4InputStream stream(path); - return parse_mps_fast_stream( - stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode"); - } - if (effective_method == FileReadMethod::Read) { - small_raw_read_t small_raw = try_read_small_raw_file(path); - if (small_raw.use_small_path) { - return parse_mps_fast_small_raw_file(std::move(small_raw.buffer), small_raw.size); - } - RawInputStream stream(path); - return parse_mps_fast_stream( - stream, "parse_mps_fast_file_raw (total)", "task_raw_read"); - } - mps_parser_fail(error_type_t::RuntimeError, - "single-path parser supports raw read and LZ4 inputs only"); + switch (effective_method) { + case FileReadMethod::Lz4: { + lz4_input_stream_t stream(path); + return parse_mps_fast_stream( + stream, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode"); + } + case FileReadMethod::Gzip: + case FileReadMethod::Bzip2: { + padded_memory_input_t input = read_compressed_mps_file(path); + memory_input_stream_t stream( + std::move(input.buffer), input.input_size, input.compressed_size); + const char* timer_name = effective_method == FileReadMethod::Gzip + ? "parse_mps_fast_file_gzip (total)" + : "parse_mps_fast_file_bzip2 (total)"; + return parse_mps_fast_stream( + stream, timer_name, "task_memory_scan"); + } + case FileReadMethod::Read: { + raw_input_stream_t stream(path); + return parse_mps_fast_stream( + stream, "parse_mps_fast_file_raw (total)", "task_raw_read"); + } + } + __builtin_unreachable(); } template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index dc9ae86abc..e69de29bb2 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -1,288 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights -// reserved. 
SPDX-License-Identifier: Apache-2.0 - -#include "file_reader.hpp" -#include "nvtx_ranges.hpp" - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace mps_fast { - -using cuopt::linear_programming::io::error_type_t; -using cuopt::linear_programming::io::mps_parser_expects; -using cuopt::linear_programming::io::mps_parser_fail; - -namespace { - -constexpr std::size_t raw_input_window_bytes = 64ull * 1024ull * 1024ull; -constexpr std::size_t raw_input_max_read_threads = 8; -constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull; - -bool path_has_suffix(const std::string& path, const char* suffix) noexcept -{ - std::size_t suffix_len = std::strlen(suffix); - return path.size() >= suffix_len && - path.compare(path.size() - suffix_len, suffix_len, suffix) == 0; -} - -std::size_t get_file_size(int fd, const std::string& path) -{ - struct stat st; - if (::fstat(fd, &st) != 0) { - mps_parser_fail(error_type_t::RuntimeError, - "Failed to stat file '%s': %s", - path.c_str(), - std::strerror(errno)); - } - return (std::size_t)st.st_size; -} - -std::size_t system_page_size() -{ - static std::size_t page_size = [] { - long value = ::sysconf(_SC_PAGESIZE); - return value > 0 ? 
(std::size_t)value : (std::size_t)4096; - }(); - return page_size; -} - -std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) -{ - if (alignment == 0) { return value; } - std::size_t remainder = value % alignment; - if (remainder == 0) { return value; } - std::size_t increment = alignment - remainder; - if (value > std::numeric_limits::max() - increment) { - mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow"); - } - return value + increment; -} - -std::size_t add_input_padding(std::size_t size) -{ - if (size > std::numeric_limits::max() - input_buffer_padding_bytes) { - mps_parser_fail(error_type_t::OutOfMemoryError, "input padding size overflow"); - } - return size + input_buffer_padding_bytes; -} - -} // namespace - -RawInputStream::RawInputStream(const std::string& path) : path_(path) -{ - MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io); - buffered_fd_ = ::open(path.c_str(), O_RDONLY); - if (buffered_fd_ < 0) { - mps_parser_fail(error_type_t::RuntimeError, - "Failed to open raw MPS file '%s': %s", - path.c_str(), - std::strerror(errno)); - } - - file_size_ = get_file_size(buffered_fd_, path); - fd_ = buffered_fd_; - bool use_direct_io = file_size_ > raw_input_direct_io_threshold_bytes; - if (const char* raw_direct = std::getenv("MPS_FAST_RAW_DIRECT_IO")) { - use_direct_io = raw_direct[0] != '0'; - } - if (use_direct_io) { -#ifdef O_DIRECT - int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT); - if (direct_fd >= 0) { - fd_ = direct_fd; - direct_io_ = true; - } -#endif - } - window_bytes_ = raw_input_window_bytes; - window_count_ = std::max(1, (file_size_ + window_bytes_ - 1) / window_bytes_); - - output_mapped_size_ = round_up_to_multiple( - std::max(add_input_padding(file_size_), 1), system_page_size()); - output_region_ = mmap_region_t::anonymous( - output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer"); - output_data_ = output_region_.char_data(); - 
output_region_.advise(MADV_HUGEPAGE); - - block_done_.resize(window_count_, 0); - block_end_.resize(window_count_, 0); - section_scanner_ = - std::make_unique(output_data_, window_count_, registry_); -} - -RawInputStream::~RawInputStream() -{ - if (fd_ >= 0) { ::close(fd_); } - if (buffered_fd_ >= 0 && buffered_fd_ != fd_) { ::close(buffered_fd_); } -} - -const char* RawInputStream::data() const noexcept { return output_data_; } -char* RawInputStream::mutable_data() noexcept { return output_data_; } -std::size_t RawInputStream::size() const noexcept { return output_view_size_; } -std::size_t RawInputStream::compressed_size() const noexcept { return file_size_; } -std::size_t RawInputStream::reserve_size_hint() const noexcept { return file_size_; } -mps_phase_registry_t& RawInputStream::registry() noexcept { return registry_; } -input_stream_view_t RawInputStream::view() noexcept -{ - return {output_data_, output_data_, output_view_size_, file_size_, ®istry_}; -} - -void RawInputStream::run_decode_tasks() -{ - MPS_NVTX_RANGE("raw_input_run_read_tasks", nvtx::colors::io); - if (file_size_ == 0) { - output_view_size_ = 0; - section_scanner_->publish_ready(0); - return; - } - - std::size_t hw_threads = - std::max(1, (std::size_t)std::thread::hardware_concurrency()); - std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads); - thread_count = std::max(1, std::min(thread_count, window_count_)); - - std::atomic_size_t next_window{0}; - std::exception_ptr first_error = nullptr; - std::mutex error_mutex; - std::atomic_bool stop{false}; - - auto mark_error = [&](std::exception_ptr eptr) { - std::lock_guard lock(error_mutex); - if (!first_error) { - first_error = eptr; - stop.store(true, std::memory_order_release); - } - }; - - auto read_window = [&](std::size_t index) { - MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io); - std::size_t offset = index * window_bytes_; - std::size_t size = std::min(window_bytes_, file_size_ - offset); - std::size_t done = 
0; - { - MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io); - while (done < size) { - ssize_t got = - ::pread(fd_, output_data_ + offset + done, size - done, (off_t)(offset + done)); - if (got < 0) { - if (errno == EINTR) { continue; } - if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0) { - got = ::pread( - buffered_fd_, output_data_ + offset + done, size - done, (off_t)(offset + done)); - if (got >= 0) { - done += (std::size_t)got; - continue; - } - if (errno == EINTR) { continue; } - } - mps_parser_fail(error_type_t::RuntimeError, - "Failed to pread raw MPS file '%s': %s", - path_.c_str(), - std::strerror(errno)); - } - if (got == 0) { - mps_parser_fail(error_type_t::RuntimeError, - "Unexpected EOF while reading raw MPS file '%s'", - path_.c_str()); - } - done += (std::size_t)got; - } - } - - { - MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io); - section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size); - frontier_mutex_.lock(); - block_done_[index] = 1; - block_end_[index] = offset + size; - std::size_t before = ready_bytes_; - while (next_block_ < block_done_.size() && block_done_[next_block_]) { - ready_bytes_ = block_end_[next_block_]; - ++next_block_; - } - std::size_t after = ready_bytes_; - frontier_mutex_.unlock(); - if (after > before) { section_scanner_->publish_ready(after); } - } - }; - - std::vector workers; - workers.reserve(thread_count); - for (std::size_t t = 0; t < thread_count; ++t) { - workers.emplace_back([&, t] { - std::string thread_name = "raw-input-read-" + std::to_string(t); - nvtx::name_current_thread(thread_name.c_str()); - MPS_NVTX_RANGE("raw_worker_loop", nvtx::colors::io); - while (!stop.load(std::memory_order_acquire)) { - std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed); - if (index >= window_count_) { break; } - try { - read_window(index); - } catch (...) 
{ - mark_error(std::current_exception()); - return; - } - } - }); - } - for (auto& worker : workers) { - worker.join(); - } - if (first_error) { std::rethrow_exception(first_error); } - - output_view_size_ = ready_bytes_; - section_scanner_->publish_ready(output_view_size_); -} - -bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); } - -void drop_file_cache(const std::string& path) -{ - MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io); - int fd = ::open(path.c_str(), O_RDONLY); - if (fd < 0) { return; } - - ::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); - ::close(fd); -} - -FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method) -{ - if (has_lz4_extension(path)) { return FileReadMethod::Lz4; } - if (method == FileReadMethod::Lz4) { - mps_parser_fail( - error_type_t::ValidationError, "lz4 read method requires a .lz4 input: %s", path.c_str()); - } - return method; -} - -const char* file_read_method_name(FileReadMethod method) noexcept -{ - switch (method) { - case FileReadMethod::Read: return "read"; - case FileReadMethod::Lz4: return "lz4"; - default: return "unknown"; - } -} - -} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp index bab63c76cf..b0089be257 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.hpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp @@ -23,12 +23,12 @@ struct lz4_pipeline_t; /** * @brief File reading method selection */ -enum class FileReadMethod { Read, Lz4 }; +enum class FileReadMethod { Read, Lz4, Gzip, Bzip2 }; /** * @brief Return the effective method for a path. * - * .lz4 inputs are decompressed; all other inputs use raw input reads. + * Compressed inputs are auto-detected by extension; all other inputs use raw input reads. 
*/ FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method); @@ -41,6 +41,8 @@ const char* file_read_method_name(FileReadMethod method) noexcept; * @brief True when the file name has an lz4 extension. */ bool has_lz4_extension(const std::string& path) noexcept; +bool has_gzip_extension(const std::string& path) noexcept; +bool has_bzip2_extension(const std::string& path) noexcept; /** * @brief Ask the OS to evict clean cached pages for this file. @@ -49,6 +51,17 @@ bool has_lz4_extension(const std::string& path) noexcept; */ void drop_file_cache(const std::string& path); +/** + * @brief OS memory page size, queried once and cached. + */ +std::size_t system_page_size(); + +/** + * @brief File size in bytes; fails with a parser error if it cannot be determined. + */ +std::size_t get_file_size(int fd, const std::string& path); +std::size_t get_file_size(const std::string& path); + struct input_stream_view_t { const char* data = nullptr; char* mutable_data = nullptr; @@ -57,13 +70,13 @@ struct input_stream_view_t { mps_phase_registry_t* registry = nullptr; }; -class Lz4InputStream { +class lz4_input_stream_t { public: - explicit Lz4InputStream(const std::string& path); - ~Lz4InputStream(); + explicit lz4_input_stream_t(const std::string& path); + ~lz4_input_stream_t(); - Lz4InputStream(const Lz4InputStream&) = delete; - Lz4InputStream& operator=(const Lz4InputStream&) = delete; + lz4_input_stream_t(const lz4_input_stream_t&) = delete; + lz4_input_stream_t& operator=(const lz4_input_stream_t&) = delete; const char* data() const noexcept; char* mutable_data() noexcept; @@ -97,21 +110,17 @@ class Lz4InputStream { bool dict_id_ = false; mps_phase_registry_t registry_; std::mutex commit_mutex_; - std::mutex frontier_mutex_; - std::vector block_done_; - std::vector block_end_; std::unique_ptr section_scanner_; - std::size_t next_block_ = 0; - std::size_t ready_bytes_ = 0; + std::size_t block_slot_count_ = 0; }; -class RawInputStream { +class 
raw_input_stream_t { public: - explicit RawInputStream(const std::string& path); - ~RawInputStream(); + explicit raw_input_stream_t(const std::string& path); + ~raw_input_stream_t(); - RawInputStream(const RawInputStream&) = delete; - RawInputStream& operator=(const RawInputStream&) = delete; + raw_input_stream_t(const raw_input_stream_t&) = delete; + raw_input_stream_t& operator=(const raw_input_stream_t&) = delete; const char* data() const noexcept; char* mutable_data() noexcept; @@ -144,4 +153,31 @@ class RawInputStream { std::size_t ready_bytes_ = 0; }; +class memory_input_stream_t { + public: + memory_input_stream_t(std::vector buffer, + std::size_t input_size, + std::size_t compressed_size); + + memory_input_stream_t(const memory_input_stream_t&) = delete; + memory_input_stream_t& operator=(const memory_input_stream_t&) = delete; + + const char* data() const noexcept; + char* mutable_data() noexcept; + std::size_t size() const noexcept; + std::size_t compressed_size() const noexcept; + std::size_t reserve_size_hint() const noexcept; + mps_phase_registry_t& registry() noexcept; + input_stream_view_t view() noexcept; + + void run_decode_tasks(); + + private: + std::vector buffer_; + std::size_t input_size_ = 0; + std::size_t compressed_size_ = 0; + mps_phase_registry_t registry_; + std::unique_ptr section_scanner_; +}; + } // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index bb6657e303..b25e330999 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -7,6 +7,8 @@ #include +#include + #ifdef _OPENMP #include #endif @@ -14,7 +16,6 @@ #include #include #include -#include #include #include @@ -51,10 +52,19 @@ constexpr uint32_t lz4_block_size_mask = 0x7FFFFFFFu; constexpr std::size_t lz4_pipeline_batch_bytes = 64ull * 1024ull * 1024ull; constexpr std::size_t lz4_decode_batch_decompressed_bytes = 
256ull * 1024ull * 1024ull; constexpr std::size_t lz4_input_max_io_threads = 8; -constexpr std::size_t lz4_no_content_size_reserve_ratio = 16; +constexpr std::size_t lz4_no_content_size_reserve_ratio = 128; using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int); +std::size_t estimate_lz4_no_content_size(std::size_t compressed_size) +{ + constexpr std::size_t max_size = std::numeric_limits::max(); + if (compressed_size > max_size / lz4_no_content_size_reserve_ratio) { + return max_size - input_buffer_padding_bytes; + } + return compressed_size * lz4_no_content_size_reserve_ratio; +} + #if defined(MPS_PARSER_WITH_LZ4) struct lz4_runtime_t { void* handle = nullptr; @@ -138,8 +148,6 @@ int open_lz4_fd(const std::string& path) return fd; } -std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment); - uint32_t read_le32(const char* ptr) { const auto* p = reinterpret_cast(ptr); @@ -168,67 +176,6 @@ std::size_t block_max_size_from_bd(unsigned char bd) } } -std::size_t checked_size(uint64_t value, const char* label) -{ - if (value > (uint64_t)std::numeric_limits::max()) { - mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 %s exceeds size_t", label); - } - return (std::size_t)value; -} - -std::size_t get_file_size(int fd, const std::string& path) -{ - struct stat st; - if (::fstat(fd, &st) != 0) { - mps_parser_fail(error_type_t::RuntimeError, - "Failed to stat file '%s': %s", - path.c_str(), - std::strerror(errno)); - } - if (st.st_size < 0) { - mps_parser_fail( - error_type_t::RuntimeError, "Invalid negative file size for '%s'", path.c_str()); - } - return (std::size_t)st.st_size; -} - -std::size_t system_page_size() -{ - static std::size_t page_size = [] { - long value = ::sysconf(_SC_PAGESIZE); - return value > 0 ? 
(std::size_t)value : (std::size_t)4096; - }(); - return page_size; -} - -std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) -{ - if (alignment == 0) { return value; } - std::size_t remainder = value % alignment; - if (remainder == 0) { return value; } - std::size_t increment = alignment - remainder; - if (value > std::numeric_limits::max() - increment) { - mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow"); - } - return value + increment; -} - -std::size_t checked_mul(std::size_t a, std::size_t b, const char* label) -{ - if (a != 0 && b > std::numeric_limits::max() / a) { - mps_parser_fail(error_type_t::OutOfMemoryError, "%s size overflow", label); - } - return a * b; -} - -std::size_t checked_add(std::size_t a, std::size_t b, const char* label) -{ - if (a > std::numeric_limits::max() - b) { - mps_parser_fail(error_type_t::OutOfMemoryError, "%s size overflow", label); - } - return a + b; -} - bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset) { std::size_t done = 0; @@ -332,7 +279,7 @@ class lz4_resident_windows_t { } // namespace -Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path) +lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path) { MPS_NVTX_RANGE("lz4_input_construct", nvtx::colors::io); @@ -384,7 +331,7 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path) mps_parser_fail(error_type_t::ValidationError, "truncated LZ4 frame while reading content size"); } - content_size_ = checked_size(read_le64(header + offset), "content size"); + content_size_ = (std::size_t)read_le64(header + offset); offset += 8; } if (dict_id_) { @@ -403,14 +350,13 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path) std::size_t reserve_size = content_size_; if (!content_size_present_) { - reserve_size = - checked_mul(compressed_size_, lz4_no_content_size_reserve_ratio, "LZ4 output reserve"); + reserve_size = 
estimate_lz4_no_content_size(compressed_size_); reserve_size = std::max(reserve_size, block_max_size_); } - reserve_size = checked_add(reserve_size, input_buffer_padding_bytes, "LZ4 output padding"); + reserve_size += input_buffer_padding_bytes; constexpr std::size_t huge_alignment = 2 * 1024 * 1024; - output_mapped_size_ = round_up_to_multiple(reserve_size, system_page_size()); + output_mapped_size_ = cuda::round_up(reserve_size, system_page_size()); output_region_ = mmap_region_t::anonymous_aligned(output_mapped_size_, huge_alignment, PROT_NONE, @@ -418,36 +364,34 @@ Lz4InputStream::Lz4InputStream(const std::string& path) : path_(path) "LZ4 output buffer"); output_data_ = output_region_.char_data(); - std::size_t block_slots = - std::max(1, (reserve_size + block_max_size_ - 1) / block_max_size_ + 1); - block_done_.resize(block_slots, 0); - block_end_.resize(block_slots, 0); + block_slot_count_ = std::max(1, cuda::ceil_div(reserve_size, block_max_size_) + 1); section_scanner_ = - std::make_unique(output_data_, block_slots, registry_); + std::make_unique(output_data_, block_slot_count_, registry_); } -Lz4InputStream::~Lz4InputStream() +lz4_input_stream_t::~lz4_input_stream_t() { if (fd_ >= 0) { ::close(fd_); } } -const char* Lz4InputStream::data() const noexcept { return output_data_; } -char* Lz4InputStream::mutable_data() noexcept { return output_data_; } -std::size_t Lz4InputStream::size() const noexcept { return output_view_size_; } -std::size_t Lz4InputStream::compressed_size() const noexcept { return compressed_size_; } -std::size_t Lz4InputStream::reserve_size_hint() const noexcept +const char* lz4_input_stream_t::data() const noexcept { return output_data_; } +char* lz4_input_stream_t::mutable_data() noexcept { return output_data_; } +std::size_t lz4_input_stream_t::size() const noexcept { return output_view_size_; } +std::size_t lz4_input_stream_t::compressed_size() const noexcept { return compressed_size_; } +std::size_t 
lz4_input_stream_t::reserve_size_hint() const noexcept { - return content_size_present_ ? content_size_ - : std::max(compressed_size_ * 6, 1024 * 1024); + return content_size_present_ + ? content_size_ + : std::max(estimate_lz4_no_content_size(compressed_size_), 1024 * 1024); } -mps_phase_registry_t& Lz4InputStream::registry() noexcept { return registry_; } -input_stream_view_t Lz4InputStream::view() noexcept +mps_phase_registry_t& lz4_input_stream_t::registry() noexcept { return registry_; } +input_stream_view_t lz4_input_stream_t::view() noexcept { return {output_data_, output_data_, output_view_size_, compressed_size_, ®istry_}; } -void Lz4InputStream::commit_up_to(std::size_t bytes) +void lz4_input_stream_t::commit_up_to(std::size_t bytes) { MPS_NVTX_RANGE("lz4_commit_output", nvtx::colors::alloc); std::lock_guard lock(commit_mutex_); @@ -455,7 +399,7 @@ void Lz4InputStream::commit_up_to(std::size_t bytes) if (bytes > output_mapped_size_) { mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 output exceeded reserved virtual mapping"); } - std::size_t new_committed = round_up_to_multiple(bytes, system_page_size()); + std::size_t new_committed = cuda::round_up(bytes, system_page_size()); if (new_committed > output_mapped_size_) new_committed = output_mapped_size_; std::size_t add = new_committed - output_committed_size_; void* target = output_data_ + output_committed_size_; @@ -476,15 +420,17 @@ struct resident_block_desc_t { }; struct lz4_pipeline_t { - explicit lz4_pipeline_t(Lz4InputStream& input_) + explicit lz4_pipeline_t(lz4_input_stream_t& input_) : input(input_), - window_count((input.compressed_size_ + window_bytes - 1) / window_bytes), + window_count(cuda::ceil_div(input.compressed_size_, window_bytes)), windows(window_count), io_threads(std::min(lz4_input_max_io_threads, window_count)), window_done(window_count, 0), window_refs(window_count), window_scanned(window_count), - window_released(window_count) + window_released(window_count), + 
block_done(input.block_slot_count_, 0), + block_end(input.block_slot_count_, 0) { for (std::size_t i = 0; i < window_count; ++i) { std::size_t offset = i * window_bytes; @@ -516,9 +462,8 @@ struct lz4_pipeline_t { void finalize() { - input.output_view_size_ = input.ready_bytes_; - input.commit_up_to( - checked_add(input.output_view_size_, input_buffer_padding_bytes, "LZ4 output padding")); + input.output_view_size_ = ready_bytes; + input.commit_up_to(input.output_view_size_ + input_buffer_padding_bytes); input.section_scanner_->publish_ready(input.output_view_size_); } @@ -698,15 +643,15 @@ struct lz4_pipeline_t { std::size_t after = 0; { MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic); - std::lock_guard lock(input.frontier_mutex_); - input.block_done_[block.index] = 1; - input.block_end_[block.index] = block.decompressed_offset + actual_size; - before = input.ready_bytes_; - while (input.next_block_ < input.block_done_.size() && input.block_done_[input.next_block_]) { - input.ready_bytes_ = input.block_end_[input.next_block_]; - ++input.next_block_; + std::lock_guard lock(frontier_mutex); + block_done[block.index] = 1; + block_end[block.index] = block.decompressed_offset + actual_size; + before = ready_bytes; + while (next_block < block_done.size() && block_done[next_block]) { + ready_bytes = block_end[next_block]; + ++next_block; } - after = input.ready_bytes_; + after = ready_bytes; } if (after > before) { MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic); @@ -792,7 +737,7 @@ struct lz4_pipeline_t { batch_decoded_bytes += block.decompressed_size; batch.push_back(block); blocks_scanned.fetch_add(1, std::memory_order_relaxed); - if (blocks_scanned.load(std::memory_order_relaxed) > input.block_done_.size()) { + if (blocks_scanned.load(std::memory_order_relaxed) > block_done.size()) { mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 input block count exceeded reserved metadata slots"); } @@ -898,7 +843,7 @@ struct lz4_pipeline_t { } } - 
Lz4InputStream& input; + lz4_input_stream_t& input; const std::size_t window_bytes = lz4_pipeline_batch_bytes; const std::size_t window_count; std::vector windows; @@ -927,9 +872,16 @@ struct lz4_pipeline_t { std::vector> crossing_payloads; std::vector readers; std::vector decoders; + + // Tracks the contiguous decoded-byte frontier across out-of-order block completions. + std::mutex frontier_mutex; + std::vector block_done; + std::vector block_end; + std::size_t next_block = 0; + std::size_t ready_bytes = 0; }; -void Lz4InputStream::run_decode_tasks() +void lz4_input_stream_t::run_decode_tasks() { MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io); lz4_pipeline_t pipeline(*this); diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp index d7b299917b..389f563efa 100644 --- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp +++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp @@ -6,6 +6,8 @@ #include #include +#include + #include #include #include @@ -74,7 +76,7 @@ class mmap_region_t { static mmap_region_t anonymous_aligned( std::size_t size, std::size_t alignment, int prot, int flags, const char* context) { - if (alignment == 0 || (alignment & (alignment - 1)) != 0) { + if (!cuda::is_power_of_two(alignment)) { mps_parser_fail(error_type_t::RuntimeError, "mmap aligned allocation requires power-of-two alignment"); } diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp index 498b106955..9eee8708e0 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp @@ -121,6 +121,31 @@ mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const return ranges_[idx]; } +void mps_phase_registry_t::publish_endata(const char* begin, bool present) +{ + std::lock_guard lock(mutex_); + endata_begin_ = begin; + endata_present_ = present; + 
endata_ready_.store(true, std::memory_order_release); +} + +bool mps_phase_registry_t::endata_ready() const +{ + return endata_ready_.load(std::memory_order_acquire); +} + +const char* mps_phase_registry_t::endata_begin() const +{ + assert(endata_ready()); + return endata_begin_; +} + +bool mps_phase_registry_t::endata_present() const +{ + assert(endata_ready()); + return endata_present_; +} + static section_record_match_t is_section_record(const char* line_start, const char* line_end, mps_section_kind* kind) @@ -397,6 +422,12 @@ void mps_section_block_scanner_t::publish_section_ranges() registry_.publish(mps_phase_kind::quadratic, {nullptr, nullptr, false}); } } + + if (available(endata)) { + registry_.publish_endata(endata, true); + } else if (final_ready && final_boundary != nullptr) { + registry_.publish_endata(final_boundary, false); + } } } // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp index 74bf89da7f..9fcffa6ea7 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp @@ -51,6 +51,11 @@ class mps_phase_registry_t { // acquire load in ready() pairs with publish()'s release store before ranges_. 
mps_phase_range_t range(mps_phase_kind phase) const; + void publish_endata(const char* begin, bool present); + bool endata_ready() const; + const char* endata_begin() const; + bool endata_present() const; + private: static constexpr std::size_t phase_count = 7; @@ -61,6 +66,9 @@ class mps_phase_registry_t { omp_event_handle_t events_[phase_count]{}; bool has_event_[phase_count]{}; bool event_fulfilled_[phase_count]{}; + const char* endata_begin_ = nullptr; + bool endata_present_ = false; + std::atomic endata_ready_{false}; mutable std::mutex mutex_; }; diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp index f8a6d04d1e..fac9e64d78 100644 --- a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp +++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp @@ -59,24 +59,24 @@ inline std::uint32_t color_for_name(std::string_view name) noexcept return colors::generic; } -class scoped_range { +class scoped_range_t { public: - explicit scoped_range(const char* name, - std::uint32_t color = colors::generic, - std::uint32_t category = 0) + explicit scoped_range_t(const char* name, + std::uint32_t color = colors::generic, + std::uint32_t category = 0) { push(name, color, category); } - explicit scoped_range(std::string name, - std::uint32_t color = colors::generic, - std::uint32_t category = 0) + explicit scoped_range_t(std::string name, + std::uint32_t color = colors::generic, + std::uint32_t category = 0) : owned_name_(std::move(name)) { push(owned_name_.c_str(), color, category); } - ~scoped_range() { end(); } + ~scoped_range_t() { end(); } void end() { @@ -88,8 +88,8 @@ class scoped_range { #endif } - scoped_range(const scoped_range&) = delete; - scoped_range& operator=(const scoped_range&) = delete; + scoped_range_t(const scoped_range_t&) = delete; + scoped_range_t& operator=(const scoped_range_t&) = delete; private: void push(const char* name, std::uint32_t color, std::uint32_t category) @@ -132,4 +132,4 @@ inline 
void name_current_thread(const char* name) #define MPS_FAST_NVTX_CONCAT_INNER(a, b) a##b #define MPS_FAST_NVTX_CONCAT(a, b) MPS_FAST_NVTX_CONCAT_INNER(a, b) #define MPS_NVTX_RANGE(name, color) \ - ::mps_fast::nvtx::scoped_range MPS_FAST_NVTX_CONCAT(_mps_nvtx_range_, __LINE__)(name, color) + ::mps_fast::nvtx::scoped_range_t MPS_FAST_NVTX_CONCAT(_mps_nvtx_range_, __LINE__)(name, color) diff --git a/cpp/src/io/file_to_string.cpp b/cpp/src/io/file_to_string.cpp index 77b92d90e9..5823381098 100644 --- a/cpp/src/io/file_to_string.cpp +++ b/cpp/src/io/file_to_string.cpp @@ -22,9 +22,9 @@ #include #endif // MPS_PARSER_WITH_ZLIB -#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB) +#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB) || defined(MPS_PARSER_WITH_LZ4) #include -#endif // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB +#endif // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB || MPS_PARSER_WITH_LZ4 namespace { using cuopt::linear_programming::io::error_type_t; @@ -207,6 +207,163 @@ std::vector zlib_file_to_string(const std::string& file) } // end namespace #endif // MPS_PARSER_WITH_ZLIB +#ifdef MPS_PARSER_WITH_LZ4 +namespace { +// Minimal liblz4 frame ABI declarations; keep in sync with lz4frame.h. 
+struct LZ4F_dctx; +using LZ4F_errorCode_t = size_t; +struct LZ4F_frameInfo_t { + int blockSizeID; + int blockMode; + int contentChecksumFlag; + int frameType; + unsigned long long contentSize; + unsigned dictID; + int blockChecksumFlag; +}; +using LZ4F_createDecompressionContext_t = LZ4F_errorCode_t (*)(LZ4F_dctx**, unsigned); +using LZ4F_freeDecompressionContext_t = LZ4F_errorCode_t (*)(LZ4F_dctx*); +using LZ4F_getFrameInfo_t = LZ4F_errorCode_t (*)(LZ4F_dctx*, + LZ4F_frameInfo_t*, + const void*, + size_t*); +using LZ4F_decompress_t = + LZ4F_errorCode_t (*)(LZ4F_dctx*, void*, size_t*, const void*, size_t*, const void*); +using LZ4F_isError_t = unsigned (*)(LZ4F_errorCode_t); +using LZ4F_getErrorName_t = const char* (*)(LZ4F_errorCode_t); + +std::vector lz4_file_to_string(const std::string& file) +{ + struct DlCloseDeleter { + void operator()(void* fp) + { + mps_parser_expects_fatal( + dlclose(fp) == 0, error_type_t::ValidationError, "Error closing liblz4.so!"); + } + }; + struct Lz4DctxDeleter { + void operator()(LZ4F_dctx* f) + { + if (f != nullptr) { + const LZ4F_errorCode_t err = fptr(f); + mps_parser_expects_fatal( + !is_error(err), error_type_t::ValidationError, "Error closing lz4 file!"); + } + } + LZ4F_freeDecompressionContext_t fptr = nullptr; + LZ4F_isError_t is_error = nullptr; + }; + + void* raw_lz4handle = nullptr; + for (const char* soname : {"liblz4.so.1", "liblz4.so"}) { + raw_lz4handle = dlopen(soname, RTLD_LAZY); + if (raw_lz4handle != nullptr) break; + } + std::unique_ptr lz4handle{raw_lz4handle}; + mps_parser_expects(lz4handle != nullptr, + error_type_t::ValidationError, + "Could not open .lz4 file since liblz4 was not found " + "(tried liblz4.so.1, liblz4.so). In order to open .lz4 files directly, " + "please ensure liblz4 is installed. Alternatively, decompress the .lz4 file " + "manually and open the uncompressed file. 
Given path: %s", + file.c_str()); + + LZ4F_createDecompressionContext_t LZ4F_createDecompressionContext = + reinterpret_cast( + dlsym(lz4handle.get(), "LZ4F_createDecompressionContext")); + LZ4F_freeDecompressionContext_t LZ4F_freeDecompressionContext = + reinterpret_cast( + dlsym(lz4handle.get(), "LZ4F_freeDecompressionContext")); + LZ4F_getFrameInfo_t LZ4F_getFrameInfo = + reinterpret_cast(dlsym(lz4handle.get(), "LZ4F_getFrameInfo")); + LZ4F_decompress_t LZ4F_decompress = + reinterpret_cast(dlsym(lz4handle.get(), "LZ4F_decompress")); + LZ4F_isError_t LZ4F_isError = + reinterpret_cast(dlsym(lz4handle.get(), "LZ4F_isError")); + LZ4F_getErrorName_t LZ4F_getErrorName = + reinterpret_cast(dlsym(lz4handle.get(), "LZ4F_getErrorName")); + mps_parser_expects( + LZ4F_createDecompressionContext != nullptr && LZ4F_freeDecompressionContext != nullptr && + LZ4F_getFrameInfo != nullptr && LZ4F_decompress != nullptr && LZ4F_isError != nullptr && + LZ4F_getErrorName != nullptr, + error_type_t::ValidationError, + "Error loading liblz4! Library version might be incompatible. Please decompress the .lz4 " + "file manually and open the uncompressed file. Given path: %s", + file.c_str()); + + std::unique_ptr fp{fopen(file.c_str(), "rb")}; + mps_parser_expects(fp != nullptr, + error_type_t::ValidationError, + "Error opening input file! Given path: %s", + file.c_str()); + mps_parser_expects(fseek(fp.get(), 0L, SEEK_END) == 0, + error_type_t::ValidationError, + "Error seeking input file! Given path: %s", + file.c_str()); + const long compressed_size = ftell(fp.get()); + mps_parser_expects(compressed_size != -1L, + error_type_t::ValidationError, + "Error sizing input file! Given path: %s", + file.c_str()); + std::vector compressed(compressed_size); + rewind(fp.get()); + mps_parser_expects(fread(compressed.data(), sizeof(char), compressed_size, fp.get()) == + static_cast(compressed_size), + error_type_t::ValidationError, + "Error reading input file! 
Given path: %s", + file.c_str()); + + constexpr unsigned lz4f_version = 100; + LZ4F_dctx* raw_dctx = nullptr; + LZ4F_errorCode_t lz4_status = LZ4F_createDecompressionContext(&raw_dctx, lz4f_version); + mps_parser_expects(!LZ4F_isError(lz4_status), + error_type_t::ValidationError, + "Could not open lz4 compressed file '%s': %s", + file.c_str(), + LZ4F_getErrorName(lz4_status)); + std::unique_ptr dctx{raw_dctx, + {LZ4F_freeDecompressionContext, LZ4F_isError}}; + + const char* src = compressed.data(); + size_t src_size = compressed.size(); + LZ4F_frameInfo_t frame_info{}; + size_t src_used = src_size; + lz4_status = LZ4F_getFrameInfo(dctx.get(), &frame_info, src, &src_used); + mps_parser_expects(!LZ4F_isError(lz4_status), + error_type_t::ValidationError, + "Error reading lz4 frame info for input file '%s': %s", + file.c_str(), + LZ4F_getErrorName(lz4_status)); + src += src_used; + src_size -= src_used; + + std::vector buf; + if (frame_info.contentSize > 0) { buf.reserve((size_t)frame_info.contentSize + 1); } + const size_t readbufsize = 1ull << 24; // 16MiB + std::vector readbuf(readbufsize); + while (lz4_status != 0) { + size_t dst_size = readbuf.size(); + src_used = src_size; + lz4_status = LZ4F_decompress(dctx.get(), readbuf.data(), &dst_size, src, &src_used, nullptr); + mps_parser_expects(!LZ4F_isError(lz4_status), + error_type_t::ValidationError, + "Error in lz4 decompression of input file '%s': %s", + file.c_str(), + LZ4F_getErrorName(lz4_status)); + if (dst_size > 0) { buf.insert(buf.end(), begin(readbuf), begin(readbuf) + dst_size); } + src += src_used; + src_size -= src_used; + mps_parser_expects(src_used != 0 || dst_size != 0 || lz4_status == 0, + error_type_t::ValidationError, + "Stalled lz4 decompression of input file! 
Given path: %s", + file.c_str()); + } + buf.push_back('\0'); + return buf; +} +} // end namespace +#endif // MPS_PARSER_WITH_LZ4 + namespace cuopt::linear_programming::io::detail { std::vector file_to_string(const std::string& file) @@ -223,6 +380,12 @@ std::vector file_to_string(const std::string& file) } #endif // MPS_PARSER_WITH_ZLIB +#ifdef MPS_PARSER_WITH_LZ4 + if (file.size() > 4 && file.substr(file.size() - 4, 4) == ".lz4") { + return lz4_file_to_string(file); + } +#endif // MPS_PARSER_WITH_LZ4 + // Faster than using C++ I/O std::unique_ptr fp{fopen(file.c_str(), "r")}; mps_parser_expects(fp != nullptr, diff --git a/cpp/src/io/file_to_string.hpp b/cpp/src/io/file_to_string.hpp index 94b2df821d..3b1924e12c 100644 --- a/cpp/src/io/file_to_string.hpp +++ b/cpp/src/io/file_to_string.hpp @@ -17,6 +17,7 @@ namespace cuopt::linear_programming::io::detail { // The dispatcher looks at the extension: // - ".bz2" → libbz2 (dlopen'd at runtime), if MPS_PARSER_WITH_BZIP2. // - ".gz" → libz (dlopen'd at runtime), if MPS_PARSER_WITH_ZLIB. +// - ".lz4" → liblz4 (dlopen'd at runtime), if MPS_PARSER_WITH_LZ4. // - otherwise → plain fopen. // The returned buffer's size includes the null terminator. 
std::vector file_to_string(const std::string& file); diff --git a/cpp/src/utilities/perf_counters.hpp b/cpp/src/utilities/perf_counters.hpp index 1baaf011e5..96a881c880 100644 --- a/cpp/src/utilities/perf_counters.hpp +++ b/cpp/src/utilities/perf_counters.hpp @@ -16,6 +16,37 @@ namespace mps_fast { +// Utils to return to total resident set size (used physical pages) +static size_t parse_status_kb_line(const char* line, const char* key) +{ + size_t key_len = std::strlen(key); + if (std::strncmp(line, key, key_len) != 0) { return 0; } + const char* p = line + key_len; + while (*p == ' ' || *p == '\t') { + ++p; + } + char* end_ptr = nullptr; + size_t value = std::strtol(p, &end_ptr, 10); + return value; +} + +static std::pair current_process_rss_kb() +{ + FILE* file = std::fopen("/proc/self/status", "r"); + if (file == nullptr) { return {0, 0}; } + + size_t rss_kb = 0; + size_t hwm_kb = 0; + char line[256]; + while (std::fgets(line, sizeof(line), file) != nullptr) { + if (rss_kb == 0) { rss_kb = parse_status_kb_line(line, "VmRSS:"); } + if (hwm_kb == 0) { hwm_kb = parse_status_kb_line(line, "VmHWM:"); } + if (rss_kb != 0 && hwm_kb != 0) { break; } + } + std::fclose(file); + return {rss_kb, hwm_kb}; +} + struct perf_counter_spec_t { const char* name; uint32_t type; diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp index 2e087ec4ee..ad6fab51fc 100644 --- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp @@ -815,6 +815,49 @@ void lz4_and_raw_paths_match_on_multiblock_input() expect_vector_eq(lz4.variable_upper_bounds_, raw.variable_upper_bounds_, "lz4 upper bounds"); } +void gzip_bzip2_and_raw_paths_match() +{ + std::string mps; + mps += "NAME COMPRESSED\nROWS\n N OBJ\n L R1\n G R2\nCOLUMNS\n"; + mps += " X1 OBJ 1 R1 2.5\n X2 R1 -3.25 R2 4\n"; 
+ mps += "RHS\n RHS1 R1 7 R2 8\nBOUNDS\n BV BND X1\n UP BND X2 10\nENDATA\n"; + + TempMpsFile raw_file(std::move(mps)); + TempOwnedPath gzip_file(raw_file.path + ".gz"); + TempOwnedPath bzip2_file(raw_file.path + ".bz2"); + + const std::string gzip_cmd = "gzip -c " + raw_file.path + " > " + gzip_file.path; + const std::string bzip2_cmd = "bzip2 -c " + raw_file.path + " > " + bzip2_file.path; + if (std::system(gzip_cmd.c_str()) != 0) { throw skip_test("gzip CLI unavailable"); } + if (std::system(bzip2_cmd.c_str()) != 0) { throw skip_test("bzip2 CLI unavailable"); } + + auto raw = + mps_fast::parse_mps_fast_file(raw_file.path, mps_fast::FileReadMethod::Read); + auto gzip = + mps_fast::parse_mps_fast_file(gzip_file.path, mps_fast::FileReadMethod::Read); + auto bzip2 = + mps_fast::parse_mps_fast_file(bzip2_file.path, mps_fast::FileReadMethod::Read); + + expect_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity"); + expect_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity"); + expect_vector_eq(gzip.A_, raw.A_, "gzip A values"); + expect_vector_eq(bzip2.A_, raw.A_, "bzip2 A values"); + expect_vector_eq(gzip.A_indices_, raw.A_indices_, "gzip A indices"); + expect_vector_eq(bzip2.A_indices_, raw.A_indices_, "bzip2 A indices"); + expect_vector_eq(gzip.A_offsets_, raw.A_offsets_, "gzip A offsets"); + expect_vector_eq(bzip2.A_offsets_, raw.A_offsets_, "bzip2 A offsets"); + expect_vector_eq(gzip.c_, raw.c_, "gzip objective"); + expect_vector_eq(bzip2.c_, raw.c_, "bzip2 objective"); + expect_vector_eq(gzip.b_, raw.b_, "gzip rhs"); + expect_vector_eq(bzip2.b_, raw.b_, "bzip2 rhs"); + expect_vector_eq(gzip.variable_lower_bounds_, raw.variable_lower_bounds_, "gzip lower bounds"); + expect_vector_eq(bzip2.variable_lower_bounds_, raw.variable_lower_bounds_, "bzip2 lower bounds"); + expect_vector_eq(gzip.variable_upper_bounds_, raw.variable_upper_bounds_, "gzip upper bounds"); + expect_vector_eq(bzip2.variable_upper_bounds_, 
raw.variable_upper_bounds_, "bzip2 upper bounds"); + expect_vector_eq(gzip.var_types_, raw.var_types_, "gzip var types"); + expect_vector_eq(bzip2.var_types_, raw.var_types_, "bzip2 var types"); +} + } // namespace int main() @@ -846,6 +889,7 @@ int main() {"LargeColumnsRepeatedColumnChunkBoundary", large_columns_repeated_column_chunk_boundary}, {"LargeBoundsRepeatedVarStaysOrdered", large_bounds_repeated_var_stays_ordered}, {"Lz4AndRawPathsMatchOnMultiblockInput", lz4_and_raw_paths_match_on_multiblock_input}, + {"GzipBzip2AndRawPathsMatch", gzip_bzip2_and_raw_paths_match}, }; int failed = 0; From 62c8dcda56b95d59e4495dd238c8ad051ec6257c Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Thu, 11 Jun 2026 08:52:39 -0700 Subject: [PATCH 09/22] further cleanup --- cpp/CMakeLists.txt | 13 +- cpp/cuopt_cli.cpp | 7 +- .../cuopt/linear_programming/io/parser.hpp | 59 +- .../fast_fp64_parser.hpp | 6 +- .../io/experimental_mps_fast/fast_parser.cpp | 2 +- .../io/experimental_mps_fast/file_reader.cpp | 335 ++++++++ .../experimental_mps_fast/lz4_file_reader.cpp | 8 + cpp/src/io/file_to_string.cpp | 19 +- cpp/tests/linear_programming/CMakeLists.txt | 46 +- .../fast_fp64_parser_test.cpp | 99 +-- .../fast_parser_edge_test.cpp | 794 ++++++++---------- cpp/tests/linear_programming/parser_test.cpp | 40 +- 12 files changed, 784 insertions(+), 644 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e134d49d02..4ecb1e9a46 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -54,7 +54,6 @@ option(SKIP_ROUTING_BUILD "Skip building routing components" OFF) option(SKIP_GRPC_BUILD "Skip building gRPC and protobuf components" OFF) option(WRITE_FATBIN "Enable fatbin writing" ON) option(HOST_LINEINFO "Build with debug line information for host code" OFF) -option(MPS_FAST_TIMERS "Enable experimental fast MPS parser phase timer printouts" OFF) message(VERBOSE "cuOpt: Enable nvcc -lineinfo: ${CMAKE_CUDA_LINEINFO}") message(VERBOSE "cuOpt: Build cuOpt unit-tests: 
${BUILD_TESTS}") @@ -65,7 +64,6 @@ message(VERBOSE "cuOpt: Skip C/Python adapters: ${SKIP_C_PYTHON_ADAPTERS}") message(VERBOSE "cuOpt: Skip routing build: ${SKIP_ROUTING_BUILD}") message(VERBOSE "cuOpt: Build with debug line information for host code: ${HOST_LINEINFO}") message(VERBOSE "cuOpt: fatbin: ${WRITE_FATBIN}") -message(VERBOSE "cuOpt: Fast MPS parser timers: ${MPS_FAST_TIMERS}") # ################################################################################################## # - compiler options ------------------------------------------------------------------------------ @@ -204,8 +202,7 @@ endif () find_package(OpenMP REQUIRED) message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") -# MPS/QPS parser supports compressed inputs via bzip2 and zlib; the experimental fast MPS parser -# supports LZ4 via runtime-loaded liblz4. +# MPS/QPS parser supports compressed inputs via bzip2, zlib and lz4 option(CUOPT_PARSER_WITH_BZIP2 "Build MPS parser with bzip2 decompression" ON) option(CUOPT_PARSER_WITH_ZLIB "Build MPS parser with zlib decompression" ON) option(CUOPT_PARSER_WITH_LZ4 "Build experimental fast MPS parser with LZ4 decompression" ON) @@ -464,12 +461,16 @@ if (HOST_LINEINFO) set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") endif () +# Needed for the fast MPS parser if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$") set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} APPEND PROPERTY COMPILE_OPTIONS "-mbmi2;-mavx2;-msse4.2") endif () +# TODO: figure out a set of flags for ARM that fits the range of CPUs we wish to support (neoverse?) +# NEON should be universal on aarch64 and enough for our purposes (parsing) though + # Apply -UNDEBUG only to solver source files (not gRPC infrastructure). # Must happen before gRPC files are appended to CUOPT_SRC_FILES. 
# Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO). @@ -519,10 +520,6 @@ target_compile_definitions(cuopt PUBLIC CUSPARSE_ENABLE_EXPERIMENTAL_API ) -if (MPS_FAST_TIMERS) - target_compile_definitions(cuopt PRIVATE MPS_FAST_TIMERS=1) -endif () - target_compile_options(cuopt PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" "$<$:${CUOPT_CUDA_FLAGS}>" diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index 714d76dbf5..55c506721a 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -307,10 +307,11 @@ int main(int argc, char* argv[]) program.add_argument("--mps-reader") .help( - "MPS reader implementation: default uses the production parser; fast uses the experimental " + "MPS reader implementation: default uses the production parser; experimental-fast uses the " + "experimental " "SIMD parser for LP/MIP .mps and .mps.lz4 files") .default_value(std::string("default")) - .choices("default", "fast"); + .choices("default", "experimental-fast"); program.add_argument("--dump-hyper-params") .help("print hyper-parameters only in config file format and exit") @@ -415,7 +416,7 @@ int main(int argc, char* argv[]) const auto mps_reader_arg = program.get("--mps-reader"); auto mps_reader = cuopt::linear_programming::io::mps_reader_type_t::default_reader; - if (mps_reader_arg == "fast") { + if (mps_reader_arg == "experimental-fast") { mps_reader = cuopt::linear_programming::io::mps_reader_type_t::fast_experimental; } diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp index 08254f84b3..4e46d43224 100644 --- a/cpp/include/cuopt/linear_programming/io/parser.hpp +++ b/cpp/include/cuopt/linear_programming/io/parser.hpp @@ -20,8 +20,7 @@ namespace cuopt::linear_programming::io { /** * @brief Selects which MPS reader implementation should be used by dispatching entry points. * - * The experimental fast reader is intentionally opt-in. 
It currently supports LP/MIP problems - * from raw .mps, .mps.lz4, .mps.gz, and .mps.bz2 files. + * The experimental fast reader is intentionally opt-in. It currently supports LP/MIP/QP problems. */ enum class mps_reader_type_t { default_reader, fast_experimental }; @@ -52,10 +51,8 @@ mps_data_model_t read_mps(const std::string& mps_file_path, bool fixed_mps_format = false); /** - * @brief Reads a raw LP/MIP MPS problem with the experimental SIMD-optimized reader. - * - * This prototype reader supports raw .mps plus .mps.lz4/.mps.gz/.mps.bz2 files. It does not - * support LP, QPS, quadratic constraint sections, or fixed-format forcing. + * @brief Reads a raw LP/MIP/QP MPS problem with the experimental SIMD-optimized reader. SOCP is + * unsupported for now. * * @param[in] mps_file_path Path to a raw or compressed .mps file. * @return mps_data_model_t A fully formed LP/MIP problem which represents the given file. @@ -127,11 +124,6 @@ mps_data_model_t read_lp(const std::string& lp_file_path); template mps_data_model_t read_lp_from_string(std::string_view lp_contents); -template -inline mps_data_model_t read(const std::string& path, - mps_reader_type_t mps_reader, - bool fixed_mps_format = false); - /** * @brief Reads an optimization problem from a file, dispatching on the file * extension. Extension matching is case-insensitive. @@ -146,39 +138,30 @@ inline mps_data_model_t read(const std::string& path, * want both formats to "just work" without an explicit format flag. * * @param[in] path Path to the input file. + * @param[in] mps_reader Selects the MPS reader implementation for MPS/QPS inputs. * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format; * ignored for LP inputs. False by default. * @return mps_data_model_t The parsed problem. 
*/ -template -inline mps_data_model_t read(const std::string& path, bool fixed_mps_format = false) -{ - return read(path, mps_reader_type_t::default_reader, fixed_mps_format); -} - template inline mps_data_model_t read(const std::string& path, mps_reader_type_t mps_reader, - bool fixed_mps_format) + bool fixed_mps_format = false) { std::string lower(path); std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); - const bool is_mps_lz4 = lower.ends_with(".mps.lz4"); - const bool is_mps_gzip = lower.ends_with(".mps.gz"); - const bool is_mps_bzip = lower.ends_with(".mps.bz2"); - const bool is_qps_lz4 = lower.ends_with(".qps.lz4"); - const bool is_lp_lz4 = lower.ends_with(".lp.lz4"); - if (lower.ends_with(".mps") || is_mps_lz4 || is_mps_gzip || is_mps_bzip || - lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2") || - is_qps_lz4) { + if (lower.ends_with(".mps.lz4") || lower.ends_with(".mps.bz2") || lower.ends_with(".mps.gz") || + lower.ends_with(".mps") || lower.ends_with(".qps.lz4") || lower.ends_with(".qps.bz2") || + lower.ends_with(".qps.gz") || lower.ends_with(".qps")) { if (mps_reader == mps_reader_type_t::fast_experimental) { if (fixed_mps_format) { throw std::logic_error( "experimental fast MPS reader does not support fixed MPS format forcing"); } - if (!lower.ends_with(".mps") && !is_mps_lz4 && !is_mps_gzip && !is_mps_bzip) { + if (lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2") || + lower.ends_with(".qps.lz4")) { throw std::logic_error( "experimental fast MPS reader supports .mps, .mps.lz4, .mps.gz, and .mps.bz2 " "LP/MIP files only"); @@ -187,8 +170,8 @@ inline mps_data_model_t read(const std::string& path, } return read_mps(path, fixed_mps_format); } - if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2") || - is_lp_lz4) { + if (lower.ends_with(".lp.lz4") || lower.ends_with(".lp.bz2") || 
lower.ends_with(".lp.gz") || + lower.ends_with(".lp")) { return read_lp(path); } throw std::logic_error( @@ -199,4 +182,22 @@ inline mps_data_model_t read(const std::string& path, path); } +/** + * @brief Reads an optimization problem from a file, dispatching on the file + * extension. Extension matching is case-insensitive. + * + * Uses the default MPS reader. See the 3-argument read() overload for routing + * details and supported extensions. + * + * @param[in] path Path to the input file. + * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format; + * ignored for LP inputs. False by default. + * @return mps_data_model_t The parsed problem. + */ +template +inline mps_data_model_t read(const std::string& path, bool fixed_mps_format = false) +{ + return read(path, mps_reader_type_t::default_reader, fixed_mps_format); +} + } // namespace cuopt::linear_programming::io diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp index 0f947aa644..e446494639 100644 --- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp @@ -325,7 +325,11 @@ static inline double fallback_strtod(std::string_view s) char* parse_end = nullptr; errno = 0; - return std::strtod(stack_buf, &parse_end); + double value = std::strtod(stack_buf, &parse_end); + if (parse_end != stack_buf + s.size() || errno == ERANGE) { + mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token"); + } + return value; } // see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index 33bf916e05..bc9000f8f3 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -11,7 +11,7 @@ #include "nvtx_ranges.hpp" #include -#ifdef 
MPS_FAST_PERF_COUNTERS +#if defined(MPS_FAST_PERF_COUNTERS) || defined(MPS_FAST_TIMERS) #include #endif diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index e69de29bb2..5eae15a46a 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -0,0 +1,335 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. SPDX-License-Identifier: Apache-2.0 + +#include "file_reader.hpp" +#include "nvtx_ranges.hpp" + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace mps_fast { + +using cuopt::linear_programming::io::error_type_t; +using cuopt::linear_programming::io::mps_parser_fail; + +namespace { + +constexpr std::size_t raw_input_window_bytes = 64ull * 1024ull * 1024ull; +constexpr std::size_t raw_input_max_read_threads = 8; +constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull; + +bool path_has_suffix(const std::string& path, const char* suffix) noexcept +{ + std::size_t suffix_len = std::strlen(suffix); + return path.size() >= suffix_len && + path.compare(path.size() - suffix_len, suffix_len, suffix) == 0; +} + +std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) +{ + if (alignment == 0) { return value; } + std::size_t remainder = value % alignment; + if (remainder == 0) { return value; } + std::size_t increment = alignment - remainder; + if (value > std::numeric_limits::max() - increment) { + mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow"); + } + return value + increment; +} + +std::size_t add_input_padding(std::size_t size) +{ + if (size > std::numeric_limits::max() - input_buffer_padding_bytes) { + mps_parser_fail(error_type_t::OutOfMemoryError, "input 
padding size overflow"); + } + return size + input_buffer_padding_bytes; +} + +} // namespace + +std::size_t get_file_size(int fd, const std::string& path) +{ + struct stat st; + if (::fstat(fd, &st) != 0) { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to stat file '%s': %s", + path.c_str(), + std::strerror(errno)); + } + if (st.st_size < 0) { + mps_parser_fail(error_type_t::RuntimeError, "Negative file size for '%s'", path.c_str()); + } + return (std::size_t)st.st_size; +} + +std::size_t get_file_size(const std::string& path) +{ + int fd = ::open(path.c_str(), O_RDONLY); + if (fd < 0) { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to open file '%s': %s", + path.c_str(), + std::strerror(errno)); + } + std::size_t size = get_file_size(fd, path); + ::close(fd); + return size; +} + +std::size_t system_page_size() +{ + static std::size_t page_size = [] { + long value = ::sysconf(_SC_PAGESIZE); + return value > 0 ? (std::size_t)value : (std::size_t)4096; + }(); + return page_size; +} + +raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) +{ + MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io); + buffered_fd_ = ::open(path.c_str(), O_RDONLY); + if (buffered_fd_ < 0) { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to open raw MPS file '%s': %s", + path.c_str(), + std::strerror(errno)); + } + + file_size_ = get_file_size(buffered_fd_, path); + fd_ = buffered_fd_; + bool use_direct_io = file_size_ > raw_input_direct_io_threshold_bytes; + if (const char* raw_direct = std::getenv("MPS_FAST_RAW_DIRECT_IO")) { + use_direct_io = raw_direct[0] != '0'; + } + if (use_direct_io) { +#ifdef O_DIRECT + int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT); + if (direct_fd >= 0) { + fd_ = direct_fd; + direct_io_ = true; + } +#endif + } + window_bytes_ = raw_input_window_bytes; + window_count_ = std::max(1, (file_size_ + window_bytes_ - 1) / window_bytes_); + + output_mapped_size_ = round_up_to_multiple( + 
std::max(add_input_padding(file_size_), 1), system_page_size()); + output_region_ = mmap_region_t::anonymous( + output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer"); + output_data_ = output_region_.char_data(); + output_region_.advise(MADV_HUGEPAGE); + + block_done_.resize(window_count_, 0); + block_end_.resize(window_count_, 0); + section_scanner_ = + std::make_unique(output_data_, window_count_, registry_); +} + +raw_input_stream_t::~raw_input_stream_t() +{ + if (fd_ >= 0) { ::close(fd_); } + if (buffered_fd_ >= 0 && buffered_fd_ != fd_) { ::close(buffered_fd_); } +} + +const char* raw_input_stream_t::data() const noexcept { return output_data_; } +char* raw_input_stream_t::mutable_data() noexcept { return output_data_; } +std::size_t raw_input_stream_t::size() const noexcept { return output_view_size_; } +std::size_t raw_input_stream_t::compressed_size() const noexcept { return file_size_; } +std::size_t raw_input_stream_t::reserve_size_hint() const noexcept { return file_size_; } +mps_phase_registry_t& raw_input_stream_t::registry() noexcept { return registry_; } +input_stream_view_t raw_input_stream_t::view() noexcept +{ + return {output_data_, output_data_, output_view_size_, file_size_, ®istry_}; +} + +void raw_input_stream_t::run_decode_tasks() +{ + MPS_NVTX_RANGE("raw_input_run_read_tasks", nvtx::colors::io); + if (file_size_ == 0) { + output_view_size_ = 0; + section_scanner_->publish_ready(0); + return; + } + + std::size_t hw_threads = + std::max(1, (std::size_t)std::thread::hardware_concurrency()); + std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads); + thread_count = std::max(1, std::min(thread_count, window_count_)); + + std::atomic_size_t next_window{0}; + std::exception_ptr first_error = nullptr; + std::mutex error_mutex; + std::atomic_bool stop{false}; + + auto mark_error = [&](std::exception_ptr eptr) { + std::lock_guard lock(error_mutex); + if (!first_error) { + first_error = eptr; + 
stop.store(true, std::memory_order_release); + } + }; + + auto read_window = [&](std::size_t index) { + MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io); + std::size_t offset = index * window_bytes_; + std::size_t size = std::min(window_bytes_, file_size_ - offset); + std::size_t done = 0; + { + MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io); + while (done < size) { + ssize_t got = + ::pread(fd_, output_data_ + offset + done, size - done, (off_t)(offset + done)); + if (got < 0) { + if (errno == EINTR) { continue; } + if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0) { + got = ::pread( + buffered_fd_, output_data_ + offset + done, size - done, (off_t)(offset + done)); + if (got >= 0) { + done += (std::size_t)got; + continue; + } + if (errno == EINTR) { continue; } + } + mps_parser_fail(error_type_t::RuntimeError, + "Failed to pread raw MPS file '%s': %s", + path_.c_str(), + std::strerror(errno)); + } + if (got == 0) { + mps_parser_fail(error_type_t::RuntimeError, + "Unexpected EOF while reading raw MPS file '%s'", + path_.c_str()); + } + done += (std::size_t)got; + } + } + + { + MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io); + section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size); + frontier_mutex_.lock(); + block_done_[index] = 1; + block_end_[index] = offset + size; + std::size_t before = ready_bytes_; + while (next_block_ < block_done_.size() && block_done_[next_block_]) { + ready_bytes_ = block_end_[next_block_]; + ++next_block_; + } + std::size_t after = ready_bytes_; + frontier_mutex_.unlock(); + if (after > before) { section_scanner_->publish_ready(after); } + } + }; + + std::vector workers; + workers.reserve(thread_count); + for (std::size_t t = 0; t < thread_count; ++t) { + workers.emplace_back([&, t] { + std::string thread_name = "raw-input-read-" + std::to_string(t); + nvtx::name_current_thread(thread_name.c_str()); + MPS_NVTX_RANGE("raw_worker_loop", nvtx::colors::io); + while 
(!stop.load(std::memory_order_acquire)) { + std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed); + if (index >= window_count_) { break; } + try { + read_window(index); + } catch (...) { + mark_error(std::current_exception()); + return; + } + } + }); + } + for (auto& worker : workers) { + worker.join(); + } + if (first_error) { std::rethrow_exception(first_error); } + + output_view_size_ = ready_bytes_; + section_scanner_->publish_ready(output_view_size_); +} + +memory_input_stream_t::memory_input_stream_t(std::vector buffer, + std::size_t input_size, + std::size_t compressed_size) + : buffer_(std::move(buffer)), input_size_(input_size), compressed_size_(compressed_size) +{ + section_scanner_ = std::make_unique(buffer_.data(), 1, registry_); +} + +const char* memory_input_stream_t::data() const noexcept { return buffer_.data(); } +char* memory_input_stream_t::mutable_data() noexcept { return buffer_.data(); } +std::size_t memory_input_stream_t::size() const noexcept { return input_size_; } +std::size_t memory_input_stream_t::compressed_size() const noexcept { return compressed_size_; } +std::size_t memory_input_stream_t::reserve_size_hint() const noexcept { return input_size_; } +mps_phase_registry_t& memory_input_stream_t::registry() noexcept { return registry_; } +input_stream_view_t memory_input_stream_t::view() noexcept +{ + return {buffer_.data(), buffer_.data(), input_size_, compressed_size_, ®istry_}; +} + +void memory_input_stream_t::run_decode_tasks() +{ + MPS_NVTX_RANGE("memory_input_scan", nvtx::colors::io); + section_scanner_->observe_block(0, buffer_.data(), buffer_.data() + input_size_); + section_scanner_->publish_ready(input_size_); +} + +bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); } +bool has_gzip_extension(const std::string& path) noexcept { return path_has_suffix(path, ".gz"); } +bool has_bzip2_extension(const std::string& path) noexcept { return path_has_suffix(path, 
".bz2"); } + +void drop_file_cache(const std::string& path) +{ + MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io); + int fd = ::open(path.c_str(), O_RDONLY); + if (fd < 0) { return; } + ::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); + ::close(fd); +} + +FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method) +{ + if (has_lz4_extension(path)) { return FileReadMethod::Lz4; } + if (has_gzip_extension(path)) { return FileReadMethod::Gzip; } + if (has_bzip2_extension(path)) { return FileReadMethod::Bzip2; } + if (method == FileReadMethod::Lz4) { + mps_parser_fail( + error_type_t::ValidationError, "lz4 read method requires a .lz4 input: %s", path.c_str()); + } + return method; +} + +const char* file_read_method_name(FileReadMethod method) noexcept +{ + switch (method) { + case FileReadMethod::Read: return "read"; + case FileReadMethod::Lz4: return "lz4"; + case FileReadMethod::Gzip: return "gzip"; + case FileReadMethod::Bzip2: return "bzip2"; + default: return "unknown"; + } +} + +} // namespace mps_fast diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index b25e330999..9c47ba63c7 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -662,8 +662,16 @@ struct lz4_pipeline_t { void wait_range_ready(std::size_t begin, std::size_t size) { if (size == 0) return; + if (begin > input.compressed_size_ || size > input.compressed_size_ - begin) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading resident window"); + } std::size_t first = begin / window_bytes; std::size_t last = (begin + size - 1) / window_bytes; + if (last >= window_done.size()) { + mps_parser_fail(error_type_t::ValidationError, + "truncated LZ4 frame while reading resident window"); + } for (std::size_t wi = first; wi <= last; ++wi) { MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io); 
std::unique_lock lock(window_mutex); diff --git a/cpp/src/io/file_to_string.cpp b/cpp/src/io/file_to_string.cpp index 5823381098..30d9c41f9f 100644 --- a/cpp/src/io/file_to_string.cpp +++ b/cpp/src/io/file_to_string.cpp @@ -9,6 +9,8 @@ #include +#include +#include #include #include #include @@ -368,22 +370,21 @@ namespace cuopt::linear_programming::io::detail { std::vector file_to_string(const std::string& file) { + std::string lower(file); + std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { + return (char)std::tolower(c); + }); + #ifdef MPS_PARSER_WITH_BZIP2 - if (file.size() > 4 && file.substr(file.size() - 4, 4) == ".bz2") { - return bz2_file_to_string(file); - } + if (lower.ends_with(".bz2")) { return bz2_file_to_string(file); } #endif // MPS_PARSER_WITH_BZIP2 #ifdef MPS_PARSER_WITH_ZLIB - if (file.size() > 3 && file.substr(file.size() - 3, 3) == ".gz") { - return zlib_file_to_string(file); - } + if (lower.ends_with(".gz")) { return zlib_file_to_string(file); } #endif // MPS_PARSER_WITH_ZLIB #ifdef MPS_PARSER_WITH_LZ4 - if (file.size() > 4 && file.substr(file.size() - 4, 4) == ".lz4") { - return lz4_file_to_string(file); - } + if (lower.ends_with(".lz4")) { return lz4_file_to_string(file); } #endif // MPS_PARSER_WITH_LZ4 // Faster than using C++ I/O diff --git a/cpp/tests/linear_programming/CMakeLists.txt b/cpp/tests/linear_programming/CMakeLists.txt index fcceb4af56..6db30755c3 100644 --- a/cpp/tests/linear_programming/CMakeLists.txt +++ b/cpp/tests/linear_programming/CMakeLists.txt @@ -21,43 +21,15 @@ ConfigureTest(MPS_PARSER_TEST ${CMAKE_CURRENT_SOURCE_DIR}/parser_test.cpp LABELS numopt) -function(ConfigureStandaloneMpsFastTest CMAKE_TEST_NAME TEST_SOURCE) - add_executable(${CMAKE_TEST_NAME} ${TEST_SOURCE}) - target_include_directories(${CMAKE_TEST_NAME} - PRIVATE - "${CUOPT_TEST_DIR}/../src" - "${CUOPT_TEST_DIR}/../src/io" - "${CUOPT_TEST_DIR}/../src/io/experimental_mps_fast" - ) - target_compile_features(${CMAKE_TEST_NAME} 
PRIVATE cxx_std_20) - target_compile_options(${CMAKE_TEST_NAME} - PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" - ) - target_link_libraries(${CMAKE_TEST_NAME} - PRIVATE - cuopt - simde::simde - ${CUOPT_PRIVATE_CUDA_LIBS} - ) - if(NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") - target_link_options(${CMAKE_TEST_NAME} PRIVATE -Wl,--enable-new-dtags) - endif() - - add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) - set_tests_properties(${CMAKE_TEST_NAME} PROPERTIES LABELS "numopt") - - install( - TARGETS ${CMAKE_TEST_NAME} - COMPONENT testing - DESTINATION bin/gtests/libcuopt - EXCLUDE_FROM_ALL - ) -endfunction() - -ConfigureStandaloneMpsFastTest(MPS_FAST_FP64_PARSER_TEST - ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_fp64_parser_test.cpp) -ConfigureStandaloneMpsFastTest(MPS_FAST_PARSER_EDGE_TEST - ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_edge_test.cpp) +ConfigureTest(MPS_FAST_PARSER_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_fp64_parser_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_edge_test.cpp + LABELS numopt) +target_include_directories(MPS_FAST_PARSER_TEST + PRIVATE + "${CUOPT_TEST_DIR}/../src/io/experimental_mps_fast" +) +target_link_libraries(MPS_FAST_PARSER_TEST PRIVATE simde::simde) # ################################################################################################## # - C API Tests---------------------------------------------------------------------- diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp index 36171267cf..f07d84ebde 100644 --- a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp @@ -3,19 +3,16 @@ #include "fast_fp64_parser.hpp" +#include + #include #include #include #include #include #include -#include -#include 
-#include #include #include -#include -#include #include #include #include @@ -24,22 +21,6 @@ namespace { uint64_t bits(double value) { return std::bit_cast(value); } -[[noreturn]] void fail(const std::string& message) { throw std::runtime_error(message); } - -void expect_true(bool condition, const std::string& message) -{ - if (!condition) { fail(message); } -} - -void expect_eq_ptr(const char* got, const char* expected, std::string_view context) -{ - if (got != expected) { - std::ostringstream out; - out << context << ": pointer mismatch got_delta=" << (got - expected); - fail(out.str()); - } -} - double reference_strtod(std::string_view token) { std::string normalized(token); @@ -49,7 +30,7 @@ double reference_strtod(std::string_view token) char* end = nullptr; errno = 0; double value = std::strtod(normalized.c_str(), &end); - expect_eq_ptr(end, normalized.c_str() + normalized.size(), token); + ASSERT_EQ(end, normalized.c_str() + normalized.size()); return value; } @@ -65,22 +46,17 @@ double parse_padded_token(std::string_view token) padded.append(40, ' '); const char* p = padded.data(); double value = mps_fast::fp64::parse_fp64_advance(p, padded.data() + padded.size()); - expect_eq_ptr(p, padded.data() + token.size(), token); + ASSERT_EQ(p, padded.data() + token.size()); return value; } -void expect_bitwise_strtod(std::string_view token) +void check_bitwise_strtod(std::string_view token) { - double ref = reference_strtod(token); - uint64_t token_bits = bits(parse_token(token)); - uint64_t padded_bits = bits(parse_padded_token(token)); - uint64_t ref_bits = bits(ref); - if (token_bits != ref_bits || padded_bits != ref_bits) { - std::ostringstream out; - out << "bitwise mismatch for '" << token << "' ref=0x" << std::hex << ref_bits << " token=0x" - << token_bits << " padded=0x" << padded_bits; - fail(out.str()); - } + const double ref = reference_strtod(token); + const uint64_t ref_bits = bits(ref); + EXPECT_EQ(ref_bits, bits(parse_token(token))) << "token parse 
mismatch for '" << token << "'"; + EXPECT_EQ(ref_bits, bits(parse_padded_token(token))) + << "padded parse mismatch for '" << token << "'"; } std::string random_token(std::mt19937_64& rng) @@ -133,7 +109,9 @@ std::string random_token(std::mt19937_64& rng) return token; } -void common_table_matches_strtod_bitwise() +} // namespace + +TEST(FastFp64ParserTest, CommonTableMatchesStrtodBitwise) { std::setlocale(LC_NUMERIC, "C"); const std::vector cases = { @@ -168,64 +146,29 @@ void common_table_matches_strtod_bitwise() }; for (std::string_view token : cases) { - expect_bitwise_strtod(token); + check_bitwise_strtod(token); } } -void cursor_advances_to_token_end() +TEST(FastFp64ParserTest, CursorAdvancesToTokenEnd) { std::setlocale(LC_NUMERIC, "C"); std::string text = "123.45 ABC"; const char* p = text.data(); double value = mps_fast::fp64::parse_fp64_advance(p, text.data() + text.size()); - expect_true(bits(value) == bits(reference_strtod("123.45")), "parsed value mismatch"); - expect_eq_ptr(p, text.data() + 6, "cursor_advances_to_token_end"); - expect_true(std::string_view(p, 5) == " ABC", "cursor did not stop before trailing field"); + EXPECT_EQ(bits(reference_strtod("123.45")), bits(value)); + EXPECT_EQ(text.data() + 6, p); + EXPECT_EQ(std::string_view(" ABC"), std::string_view(p, 5)); } -void fixed_seed_random_differential() +TEST(FastFp64ParserTest, FixedSeedRandomDifferential) { std::setlocale(LC_NUMERIC, "C"); std::mt19937_64 rng(0x4d50535f46415354ULL); for (int i = 0; i < 100000; ++i) { std::string token = random_token(rng); - expect_true(token.size() <= 25U, "generated token exceeds MPS numeric token length"); - expect_bitwise_strtod(token); - } -} - -} // namespace - -int main() -{ - struct TestCase { - const char* name; - void (*fn)(); - }; - - const TestCase tests[] = { - {"CommonTableMatchesStrtodBitwise", common_table_matches_strtod_bitwise}, - {"CursorAdvancesToTokenEnd", cursor_advances_to_token_end}, - {"FixedSeedRandomDifferential", 
fixed_seed_random_differential}, - }; - - int failed = 0; - for (const TestCase& test : tests) { - std::cout << "[ RUN ] " << test.name << '\n'; - try { - test.fn(); - std::cout << "[ OK ] " << test.name << '\n'; - } catch (const std::exception& e) { - ++failed; - std::cerr << "[ FAILED ] " << test.name << ": " << e.what() << '\n'; - } - } - - if (failed != 0) { - std::cerr << failed << " test(s) failed\n"; - return 1; + ASSERT_LE(token.size(), 25U); + check_bitwise_strtod(token); } - std::cout << "[ PASSED ] " << std::size(tests) << " test(s)\n"; - return 0; } diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp index ad6fab51fc..aa05736616 100644 --- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp @@ -6,6 +6,8 @@ #include +#include + #include #include #include @@ -13,10 +15,7 @@ #include #include #include -#include -#include #include -#include #include #include #include @@ -28,50 +27,6 @@ namespace { -struct skip_test : std::runtime_error { - using std::runtime_error::runtime_error; -}; - -[[noreturn]] void fail(const std::string& message) { throw std::runtime_error(message); } - -void expect_true(bool condition, const std::string& message) -{ - if (!condition) { fail(message); } -} - -template -void expect_eq(const A& got, const B& expected, std::string_view context) -{ - if (!(got == expected)) { - std::ostringstream out; - out << context << ": got=" << got << " expected=" << expected; - fail(out.str()); - } -} - -template -void expect_vector_eq(const VecA& got, const VecB& expected, std::string_view context) -{ - if (got.size() != expected.size()) { - std::ostringstream out; - out << context << ": size got=" << got.size() << " expected=" << expected.size(); - fail(out.str()); - } - for (size_t i = 0; i < got.size(); ++i) { - if 
(!(got[i] == expected[i])) { - std::ostringstream out; - out << context << ": first mismatch at " << i; - fail(out.str()); - } - } -} - -void expect_near_inf(double value, int sign, std::string_view context) -{ - expect_true(std::isinf(value), std::string(context) + ": expected infinity"); - expect_true(std::signbit(value) == (sign < 0), std::string(context) + ": wrong infinity sign"); -} - struct TempMpsFile { explicit TempMpsFile(std::string contents) { @@ -81,20 +36,20 @@ struct TempMpsFile { "/tmp/mps_fast_parser_edge_%ld_XXXXXX.mps", static_cast(getpid())); int fd = mkstemps(path_template, 4); - if (fd < 0) { fail(std::string("mkstemps failed: ") + std::strerror(errno)); } + if (fd < 0) { FAIL() << "mkstemps failed: " << std::strerror(errno); } path = path_template; FILE* file = fdopen(fd, "wb"); if (file == nullptr) { close(fd); - fail(std::string("fdopen failed: ") + std::strerror(errno)); + FAIL() << "fdopen failed: " << std::strerror(errno); } if (!contents.empty() && std::fwrite(contents.data(), 1, contents.size(), file) != contents.size()) { std::fclose(file); - fail(std::string("failed to write temporary MPS file: ") + std::strerror(errno)); + FAIL() << "failed to write temporary MPS file: " << std::strerror(errno); } if (std::fclose(file) != 0) { - fail(std::string("failed to close temporary MPS file: ") + std::strerror(errno)); + FAIL() << "failed to close temporary MPS file: " << std::strerror(errno); } } @@ -122,164 +77,50 @@ struct TempOwnedPath { std::string path; }; -template -void expect_throws(Fn&& fn, std::string_view context) -{ - try { - fn(); - } catch (const std::exception&) { - return; - } - fail(std::string(context) + ": expected exception"); -} - -void expect_fast_parse_error(std::string_view fixture_name, std::string contents) -{ - TempMpsFile file(std::move(contents)); - expect_throws( - [&] { - (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - }, - fixture_name); -} - std::string_view range_text(const 
mps_fast::mps_phase_range_t& range) { if (!range.present) { return {}; } return std::string_view(range.begin, static_cast(range.end - range.begin)); } -void scanner_finds_section_split_across_blocks() -{ - const std::string mps = - "NAME EDGE\n" - "ROWS\n" - " N OBJ\n" - " L rowA\n" - "COLUMNS\n" - " x1 OBJ 1\n" - " x1 rowA 2\n" - "RHS\n" - " rhs rowA 3\n" - "ENDATA\n"; - - const size_t columns_pos = mps.find("COLUMNS"); - expect_true(columns_pos != std::string::npos, "failed to place COLUMNS split"); - const size_t split = columns_pos + 3; - - mps_fast::mps_phase_registry_t registry; - mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry); - - scanner.observe_block(1, mps.data() + split, mps.data() + mps.size()); - scanner.publish_ready(0); - scanner.observe_block(0, mps.data(), mps.data() + split); - scanner.publish_ready(mps.size()); - - expect_true(registry.ready(mps_fast::mps_phase_kind::header), "header not ready"); - expect_true(registry.ready(mps_fast::mps_phase_kind::rows), "rows not ready"); - expect_true(registry.ready(mps_fast::mps_phase_kind::columns), "columns not ready"); - expect_true(registry.ready(mps_fast::mps_phase_kind::rhs), "rhs not ready"); - expect_true(registry.ready(mps_fast::mps_phase_kind::quadratic), "quadratic sentinel not ready"); - - expect_true(range_text(registry.range(mps_fast::mps_phase_kind::columns)).starts_with("COLUMNS"), - "columns range begins at wrong boundary"); - expect_true(range_text(registry.range(mps_fast::mps_phase_kind::rhs)).starts_with("RHS"), - "rhs range begins at wrong boundary"); -} - -void scanner_rejects_unknown_column_one_records_after_rows() -{ - const std::string mps = - "NAME BAD\n" - "ROWS\n" - " N OBJ\n" - "FOO\n" - "COLUMNS\n" - " x OBJ 1\n" - "ENDATA\n"; - - expect_throws( - [&] { - mps_fast::mps_phase_registry_t registry; - mps_fast::mps_section_block_scanner_t scanner(mps.data(), 1, registry); - scanner.observe_block(0, mps.data(), mps.data() + mps.size()); - 
scanner.publish_ready(mps.size()); - }, - "unknown column-1 record after ROWS"); -} - uint64_t bits(double value) { return std::bit_cast(value); } -void expect_double_bitwise_eq(double got, double expected, std::string_view context) -{ - if (bits(got) != bits(expected)) { - std::ostringstream out; - out << context << ": got=0x" << std::hex << bits(got) << " expected=0x" << bits(expected); - fail(out.str()); - } -} - -template -void expect_double_vector_bitwise_eq(const VecA& got, - const VecB& expected, - std::string_view context) -{ - if (got.size() != expected.size()) { - std::ostringstream out; - out << context << ": size got=" << got.size() << " expected=" << expected.size(); - fail(out.str()); - } - for (size_t i = 0; i < got.size(); ++i) { - if (bits(got[i]) != bits(expected[i])) { - std::ostringstream out; - out << context << ": first bitwise mismatch at " << i << " got=0x" << std::hex << bits(got[i]) - << " expected=0x" << bits(expected[i]); - fail(out.str()); - } - } -} - -void expect_models_match_reference_bitwise( +void check_models_match_reference_bitwise( const mps_fast::parser_model_t& fast, const cuopt::linear_programming::io::mps_data_model_t& reference, std::string_view context) { - expect_eq(fast.n_vars_, reference.n_vars_, std::string(context) + " n_vars"); - expect_eq(fast.n_constraints_, reference.n_constraints_, std::string(context) + " n_constraints"); - expect_eq(fast.nnz_, reference.nnz_, std::string(context) + " nnz"); - expect_eq(fast.maximize_, reference.maximize_, std::string(context) + " maximize"); - expect_eq(fast.problem_name_, reference.problem_name_, std::string(context) + " problem_name"); - expect_eq( - fast.objective_name_, reference.objective_name_, std::string(context) + " objective_name"); - - expect_double_bitwise_eq(fast.objective_scaling_factor_, - reference.objective_scaling_factor_, - std::string(context) + " objective_scaling_factor"); - expect_double_bitwise_eq(fast.objective_offset_, - reference.objective_offset_, - 
std::string(context) + " objective_offset"); - - expect_double_vector_bitwise_eq(fast.A_, reference.A_, std::string(context) + " A"); - expect_vector_eq(fast.A_indices_, reference.A_indices_, std::string(context) + " A_indices"); - expect_vector_eq(fast.A_offsets_, reference.A_offsets_, std::string(context) + " A_offsets"); - expect_double_vector_bitwise_eq(fast.b_, reference.b_, std::string(context) + " b"); - expect_double_vector_bitwise_eq(fast.c_, reference.c_, std::string(context) + " c"); - expect_double_vector_bitwise_eq(fast.variable_lower_bounds_, - reference.variable_lower_bounds_, - std::string(context) + " variable_lower_bounds"); - expect_double_vector_bitwise_eq(fast.variable_upper_bounds_, - reference.variable_upper_bounds_, - std::string(context) + " variable_upper_bounds"); - expect_double_vector_bitwise_eq(fast.constraint_lower_bounds_, - reference.constraint_lower_bounds_, - std::string(context) + " constraint_lower_bounds"); - expect_double_vector_bitwise_eq(fast.constraint_upper_bounds_, - reference.constraint_upper_bounds_, - std::string(context) + " constraint_upper_bounds"); - expect_vector_eq(fast.var_types_, reference.var_types_, std::string(context) + " var_types"); - expect_vector_eq(fast.row_types_, reference.row_types_, std::string(context) + " row_types"); - expect_vector_eq(fast.var_names_, reference.var_names_, std::string(context) + " var_names"); - expect_vector_eq(fast.row_names_, reference.row_names_, std::string(context) + " row_names"); + EXPECT_EQ(reference.n_vars_, fast.n_vars_) << std::string(context) + " n_vars"; + EXPECT_EQ(reference.n_constraints_, fast.n_constraints_) + << std::string(context) + " n_constraints"; + EXPECT_EQ(reference.nnz_, fast.nnz_) << std::string(context) + " nnz"; + EXPECT_EQ(reference.maximize_, fast.maximize_) << std::string(context) + " maximize"; + EXPECT_EQ(reference.problem_name_, fast.problem_name_) << std::string(context) + " problem_name"; + EXPECT_EQ(reference.objective_name_, 
fast.objective_name_) + << std::string(context) + " objective_name"; + + EXPECT_EQ(bits(reference.objective_scaling_factor_), bits(fast.objective_scaling_factor_)) + << std::string(context) + " objective_scaling_factor"; + EXPECT_EQ(bits(reference.objective_offset_), bits(fast.objective_offset_)) + << std::string(context) + " objective_offset"; + + EXPECT_EQ(reference.A_, fast.A_) << std::string(context) + " A"; + EXPECT_EQ(reference.A_indices_, fast.A_indices_) << std::string(context) + " A_indices"; + EXPECT_EQ(reference.A_offsets_, fast.A_offsets_) << std::string(context) + " A_offsets"; + EXPECT_EQ(reference.b_, fast.b_) << std::string(context) + " b"; + EXPECT_EQ(reference.c_, fast.c_) << std::string(context) + " c"; + EXPECT_EQ(reference.variable_lower_bounds_, fast.variable_lower_bounds_) + << std::string(context) + " variable_lower_bounds"; + EXPECT_EQ(reference.variable_upper_bounds_, fast.variable_upper_bounds_) + << std::string(context) + " variable_upper_bounds"; + EXPECT_EQ(reference.constraint_lower_bounds_, fast.constraint_lower_bounds_) + << std::string(context) + " constraint_lower_bounds"; + EXPECT_EQ(reference.constraint_upper_bounds_, fast.constraint_upper_bounds_) + << std::string(context) + " constraint_upper_bounds"; + EXPECT_EQ(reference.var_types_, fast.var_types_) << std::string(context) + " var_types"; + EXPECT_EQ(reference.row_types_, fast.row_types_) << std::string(context) + " row_types"; + EXPECT_EQ(reference.var_names_, fast.var_names_) << std::string(context) + " var_names"; + EXPECT_EQ(reference.row_names_, fast.row_names_) << std::string(context) + " row_names"; } void verify_fixture_bitwise(std::string_view fixture_name, std::string contents) @@ -287,7 +128,7 @@ void verify_fixture_bitwise(std::string_view fixture_name, std::string contents) TempMpsFile file(std::move(contents)); auto fast = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); auto reference = 
cuopt::linear_programming::io::read_mps(file.path, false); - expect_models_match_reference_bitwise(fast, reference, fixture_name); + check_models_match_reference_bitwise(fast, reference, fixture_name); } std::string row_name(size_t i) @@ -297,27 +138,27 @@ std::string row_name(size_t i) return out.str(); } -size_t find_var(const mps_fast::parser_model_t& model, std::string_view name) +int find_var_index(const mps_fast::parser_model_t& model, std::string_view name) { for (size_t i = 0; i < model.var_names_.size(); ++i) { - if (model.var_names_[i] == name) { return i; } + if (model.var_names_[i] == name) { return static_cast(i); } } - fail("variable not found: " + std::string(name)); + return -1; } -void expect_model_shapes(const mps_fast::parser_model_t& model, - int rows, - int vars, - int nnz, - std::string_view context) +void check_model_shapes(const mps_fast::parser_model_t& model, + int rows, + int vars, + int nnz, + std::string_view context) { - expect_eq(model.n_constraints_, rows, std::string(context) + " rows"); - expect_eq(model.n_vars_, vars, std::string(context) + " vars"); - expect_eq(model.nnz_, nnz, std::string(context) + " nnz"); - expect_eq( - model.A_offsets_.size(), static_cast(rows + 1), std::string(context) + " offsets"); - expect_eq(model.A_.size(), static_cast(nnz), std::string(context) + " values"); - expect_eq(model.A_indices_.size(), static_cast(nnz), std::string(context) + " indices"); + EXPECT_EQ(rows, model.n_constraints_) << std::string(context) + " rows"; + EXPECT_EQ(vars, model.n_vars_) << std::string(context) + " vars"; + EXPECT_EQ(nnz, model.nnz_) << std::string(context) + " nnz"; + EXPECT_EQ(static_cast(rows + 1), model.A_offsets_.size()) + << std::string(context) + " offsets"; + EXPECT_EQ(static_cast(nnz), model.A_.size()) << std::string(context) + " values"; + EXPECT_EQ(static_cast(nnz), model.A_indices_.size()) << std::string(context) + " indices"; } std::string section_split_fixture() @@ -335,14 +176,69 @@ std::string 
section_split_fixture() "ENDATA\n"; } -void scanner_finds_headers_split_at_every_byte() +std::string to_crlf(std::string text) +{ + std::string converted; + converted.reserve(text.size() + text.size() / 8); + for (char c : text) { + if (c == '\n') { + converted += "\r\n"; + } else { + converted.push_back(c); + } + } + return converted; +} + +} // namespace + +TEST(FastMpsParserEdgeTest, ScannerFindsSectionSplitAcrossBlocks) +{ + const std::string mps = + "NAME EDGE\n" + "ROWS\n" + " N OBJ\n" + " L rowA\n" + "COLUMNS\n" + " x1 OBJ 1\n" + " x1 rowA 2\n" + "RHS\n" + " rhs rowA 3\n" + "ENDATA\n"; + + const size_t columns_pos = mps.find("COLUMNS"); + EXPECT_TRUE(columns_pos != std::string::npos) << "failed to place COLUMNS split"; + const size_t split = columns_pos + 3; + + mps_fast::mps_phase_registry_t registry; + mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry); + + scanner.observe_block(1, mps.data() + split, mps.data() + mps.size()); + scanner.publish_ready(0); + scanner.observe_block(0, mps.data(), mps.data() + split); + scanner.publish_ready(mps.size()); + + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::header)) << "header not ready"; + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rows)) << "rows not ready"; + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::columns)) << "columns not ready"; + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rhs)) << "rhs not ready"; + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::quadratic)) + << "quadratic sentinel not ready"; + + EXPECT_TRUE(range_text(registry.range(mps_fast::mps_phase_kind::columns)).starts_with("COLUMNS")) + << "columns range begins at wrong boundary"; + EXPECT_TRUE(range_text(registry.range(mps_fast::mps_phase_kind::rhs)).starts_with("RHS")) + << "rhs range begins at wrong boundary"; +} + +TEST(FastMpsParserEdgeTest, ScannerFindsHeadersSplitAtEveryByte) { const std::string mps = section_split_fixture(); const std::vector headers = {"ROWS", "COLUMNS", 
"RHS", "BOUNDS", "ENDATA"}; for (std::string_view header : headers) { const size_t pos = mps.find(header); - expect_true(pos != std::string::npos, "missing header in split fixture"); + EXPECT_TRUE(pos != std::string::npos) << "missing header in split fixture"; for (size_t offset = 1; offset < header.size(); ++offset) { const size_t split = pos + offset; mps_fast::mps_phase_registry_t registry; @@ -352,18 +248,40 @@ void scanner_finds_headers_split_at_every_byte() scanner.observe_block(0, mps.data(), mps.data() + split); scanner.publish_ready(mps.size()); - expect_true(registry.ready(mps_fast::mps_phase_kind::rows), "rows not ready after split"); - expect_true(registry.ready(mps_fast::mps_phase_kind::columns), - "columns not ready after split"); - expect_true(registry.ready(mps_fast::mps_phase_kind::rhs), "rhs not ready after split"); - expect_true(registry.ready(mps_fast::mps_phase_kind::bounds), "bounds not ready after split"); - expect_true(registry.ready(mps_fast::mps_phase_kind::quadratic), - "quadratic sentinel not ready after split"); + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rows)) << "rows not ready after split"; + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::columns)) + << "columns not ready after split"; + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rhs)) << "rhs not ready after split"; + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::bounds)) + << "bounds not ready after split"; + EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::quadratic)) + << "quadratic sentinel not ready after split"; } } } -void bounds_defaults_and_types_match_reference() +TEST(FastMpsParserEdgeTest, ScannerRejectsUnknownColumnOneRecordsAfterRows) +{ + const std::string mps = + "NAME BAD\n" + "ROWS\n" + " N OBJ\n" + "FOO\n" + "COLUMNS\n" + " x OBJ 1\n" + "ENDATA\n"; + + EXPECT_THROW( + { + mps_fast::mps_phase_registry_t registry; + mps_fast::mps_section_block_scanner_t scanner(mps.data(), 1, registry); + scanner.observe_block(0, mps.data(), 
mps.data() + mps.size()); + scanner.publish_ready(mps.size()); + }, + std::logic_error); +} + +TEST(FastMpsParserEdgeTest, BoundsDefaultsAndTypesMatchReference) { verify_fixture_bitwise("bounds_defaults_and_types", "NAME BOUNDS_EDGE\n" @@ -390,7 +308,7 @@ void bounds_defaults_and_types_match_reference() "ENDATA\n"); } -void duplicate_bounds_last_statement_wins() +TEST(FastMpsParserEdgeTest, DuplicateBoundsLastStatementWins) { const std::string contents = "NAME BOUNDS_DUP\n" @@ -412,12 +330,12 @@ void duplicate_bounds_last_statement_wins() TempMpsFile file(contents); auto model = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - expect_eq(model.n_vars_, 1, "n_vars"); - expect_eq(model.variable_lower_bounds_.at(0), 2.0, "duplicate lower bound"); - expect_eq(model.variable_upper_bounds_.at(0), 3.0, "duplicate upper bound"); + EXPECT_EQ(1, model.n_vars_) << "n_vars"; + EXPECT_EQ(2.0, model.variable_lower_bounds_.at(0)) << "duplicate lower bound"; + EXPECT_EQ(3.0, model.variable_upper_bounds_.at(0)) << "duplicate upper bound"; } -void nondense_row_and_column_names_use_hash_path() +TEST(FastMpsParserEdgeTest, NondenseRowAndColumnNamesUseHashPath) { verify_fixture_bitwise("nondense_row_and_column_names", "NAME HASH_NAMES\n" @@ -440,7 +358,7 @@ void nondense_row_and_column_names_use_hash_path() "ENDATA\n"); } -void missing_optional_bounds_fast_path() +TEST(FastMpsParserEdgeTest, MissingOptionalBoundsFastPath) { TempMpsFile file( "NAME OPTIONALS\n" @@ -455,13 +373,13 @@ void missing_optional_bounds_fast_path() auto model = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - expect_eq(model.n_vars_, 1, "missing optional n_vars"); - expect_eq(model.n_constraints_, 1, "missing optional n_constraints"); - expect_eq(model.variable_lower_bounds_.at(0), 0.0, "missing BOUNDS lower default"); - expect_near_inf(model.variable_upper_bounds_.at(0), 1, "missing BOUNDS upper default"); + EXPECT_EQ(1, model.n_vars_) << "missing optional 
n_vars"; + EXPECT_EQ(1, model.n_constraints_) << "missing optional n_constraints"; + EXPECT_EQ(0.0, model.variable_lower_bounds_.at(0)) << "missing BOUNDS lower default"; + EXPECT_EQ(std::numeric_limits::infinity(), model.variable_upper_bounds_.at(0)); } -void bounds_only_variables_are_appended_deterministically() +TEST(FastMpsParserEdgeTest, BoundsOnlyVariablesAreAppendedDeterministically) { TempMpsFile file( "NAME BOUNDS_ONLY\n" @@ -481,25 +399,28 @@ void bounds_only_variables_are_appended_deterministically() auto model = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - expect_model_shapes(model, 1, 4, 1, "bounds-only"); - expect_eq(model.var_names_.at(0), std::string("XMAIN"), "main var name"); - expect_eq(model.var_names_.at(1), std::string("AUX_A"), "bounds-only sorted name 1"); - expect_eq(model.var_names_.at(2), std::string("AUX_S"), "bounds-only sorted name 2"); - expect_eq(model.var_names_.at(3), std::string("AUX_Z"), "bounds-only sorted name 3"); - - size_t aux_a = find_var(model, "AUX_A"); - size_t aux_s = find_var(model, "AUX_S"); - size_t aux_z = find_var(model, "AUX_Z"); - expect_eq(model.var_types_.at(aux_a), 'I', "bounds-only BV type"); - expect_eq(model.variable_lower_bounds_.at(aux_a), 0.0, "bounds-only BV lb"); - expect_eq(model.variable_upper_bounds_.at(aux_a), 1.0, "bounds-only BV ub"); - expect_eq(model.var_types_.at(aux_s), 'S', "bounds-only SC type"); - expect_eq(model.variable_upper_bounds_.at(aux_s), 5.0, "bounds-only SC ub"); - expect_eq(model.variable_lower_bounds_.at(aux_z), -3.0, "bounds-only duplicate lb"); - expect_eq(model.variable_upper_bounds_.at(aux_z), 9.0, "bounds-only duplicate ub"); -} - -void integer_markers_assign_types_and_default_bounds() + check_model_shapes(model, 1, 4, 1, "bounds-only"); + EXPECT_EQ(std::string("XMAIN"), model.var_names_.at(0)) << "main var name"; + EXPECT_EQ(std::string("AUX_A"), model.var_names_.at(1)) << "bounds-only sorted name 1"; + EXPECT_EQ(std::string("AUX_S"), 
model.var_names_.at(2)) << "bounds-only sorted name 2"; + EXPECT_EQ(std::string("AUX_Z"), model.var_names_.at(3)) << "bounds-only sorted name 3"; + + const int aux_a = find_var_index(model, "AUX_A"); + const int aux_s = find_var_index(model, "AUX_S"); + const int aux_z = find_var_index(model, "AUX_Z"); + ASSERT_GE(aux_a, 0); + ASSERT_GE(aux_s, 0); + ASSERT_GE(aux_z, 0); + EXPECT_EQ('I', model.var_types_.at(aux_a)) << "bounds-only BV type"; + EXPECT_EQ(0.0, model.variable_lower_bounds_.at(aux_a)) << "bounds-only BV lb"; + EXPECT_EQ(1.0, model.variable_upper_bounds_.at(aux_a)) << "bounds-only BV ub"; + EXPECT_EQ('S', model.var_types_.at(aux_s)) << "bounds-only SC type"; + EXPECT_EQ(5.0, model.variable_upper_bounds_.at(aux_s)) << "bounds-only SC ub"; + EXPECT_EQ(-3.0, model.variable_lower_bounds_.at(aux_z)) << "bounds-only duplicate lb"; + EXPECT_EQ(9.0, model.variable_upper_bounds_.at(aux_z)) << "bounds-only duplicate ub"; +} + +TEST(FastMpsParserEdgeTest, IntegerMarkersAssignTypesAndDefaultBounds) { TempMpsFile file( "NAME MARKERS\n" @@ -520,20 +441,23 @@ void integer_markers_assign_types_and_default_bounds() auto model = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - expect_model_shapes(model, 1, 3, 3, "integer markers"); - size_t xint = find_var(model, "XINT"); - size_t xcont = find_var(model, "XCONT"); - size_t xbin = find_var(model, "XBIN"); - expect_eq(model.var_types_.at(xint), 'I', "XINT type"); - expect_eq(model.var_types_.at(xcont), 'C', "XCONT type"); - expect_eq(model.var_types_.at(xbin), 'I', "XBIN type"); - expect_eq(model.variable_lower_bounds_.at(xint), 0.0, "XINT default lb"); - expect_eq(model.variable_upper_bounds_.at(xint), 1.0, "XINT default ub"); - expect_eq(model.variable_lower_bounds_.at(xbin), 0.0, "XBIN default lb"); - expect_eq(model.variable_upper_bounds_.at(xbin), 1.0, "XBIN default ub"); -} - -void numeric_parsing_integration_matches_reference_bitwise() + check_model_shapes(model, 1, 3, 3, "integer markers"); 
+ const int xint = find_var_index(model, "XINT"); + const int xcont = find_var_index(model, "XCONT"); + const int xbin = find_var_index(model, "XBIN"); + ASSERT_GE(xint, 0); + ASSERT_GE(xcont, 0); + ASSERT_GE(xbin, 0); + EXPECT_EQ('I', model.var_types_.at(xint)) << "XINT type"; + EXPECT_EQ('C', model.var_types_.at(xcont)) << "XCONT type"; + EXPECT_EQ('I', model.var_types_.at(xbin)) << "XBIN type"; + EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xint)) << "XINT default lb"; + EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xint)) << "XINT default ub"; + EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xbin)) << "XBIN default lb"; + EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xbin)) << "XBIN default ub"; +} + +TEST(FastMpsParserEdgeTest, NumericParsingIntegrationMatchesReferenceBitwise) { verify_fixture_bitwise("numeric_parsing_integration", "NAME NUMBERS\n" @@ -559,21 +483,7 @@ void numeric_parsing_integration_matches_reference_bitwise() "ENDATA\n"); } -std::string to_crlf(std::string text) -{ - std::string converted; - converted.reserve(text.size() + text.size() / 8); - for (char c : text) { - if (c == '\n') { - converted += "\r\n"; - } else { - converted.push_back(c); - } - } - return converted; -} - -void crlf_line_endings_match_reference_bitwise() +TEST(FastMpsParserEdgeTest, CrlfLineEndingsMatchReferenceBitwise) { verify_fixture_bitwise("crlf_line_endings", to_crlf("NAME CRLF_EDGE\n" @@ -591,7 +501,7 @@ void crlf_line_endings_match_reference_bitwise() "ENDATA\n")); } -void comment_placement_supported_cases_match_reference_bitwise() +TEST(FastMpsParserEdgeTest, CommentPlacementSupportedCasesMatchReferenceBitwise) { verify_fixture_bitwise("comment_placement_supported_cases", "* leading star comment\n" @@ -618,7 +528,7 @@ void comment_placement_supported_cases_match_reference_bitwise() "ENDATA\n"); } -void objective_metadata_selects_named_objective() +TEST(FastMpsParserEdgeTest, ObjectiveMetadataSelectsNamedObjective) { TempMpsFile file( "NAME OBJMETA\n" @@ 
-640,81 +550,118 @@ void objective_metadata_selects_named_objective() auto model = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - expect_true(model.maximize_, "OBJSENSE MAX not applied"); - expect_eq(model.problem_name_, std::string("OBJMETA"), "problem name"); - expect_eq(model.objective_name_, std::string("COST"), "objective name"); - expect_eq(model.objective_offset_, -7.0, "objective RHS offset"); - size_t x1 = find_var(model, "X1"); - size_t x2 = find_var(model, "X2"); - expect_eq(model.c_.at(x1), 5.0, "named objective coefficient X1"); - expect_eq(model.c_.at(x2), -2.0, "named objective coefficient X2"); + EXPECT_TRUE(model.maximize_) << "OBJSENSE MAX not applied"; + EXPECT_EQ(std::string("OBJMETA"), model.problem_name_) << "problem name"; + EXPECT_EQ(std::string("COST"), model.objective_name_) << "objective name"; + EXPECT_EQ(-7.0, model.objective_offset_) << "objective RHS offset"; + const int x1 = find_var_index(model, "X1"); + const int x2 = find_var_index(model, "X2"); + ASSERT_GE(x1, 0); + ASSERT_GE(x2, 0); + EXPECT_EQ(5.0, model.c_.at(x1)) << "named objective coefficient X1"; + EXPECT_EQ(-2.0, model.c_.at(x2)) << "named objective coefficient X2"; } -void malformed_inputs_report_errors() +TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors) { - expect_fast_parse_error("bad objsense", - "NAME BADOBJ\n" - "OBJSENSE\n" - " SIDEWAYS\n" - "ROWS\n" - " N OBJ\n" - " L R1\n" - "COLUMNS\n" - " X1 OBJ 1 R1 2\n" - "RHS\n" - " RHS1 R1 0\n" - "ENDATA\n"); - - expect_fast_parse_error("unknown row in columns", - "NAME BADCOLROW\n" - "ROWS\n" - " N OBJ\n" - " L R1\n" - "COLUMNS\n" - " X1 MISSING 1\n" - "RHS\n" - " RHS1 R1 0\n" - "ENDATA\n"); - - expect_fast_parse_error("unknown row in rhs", - "NAME BADRHSROW\n" - "ROWS\n" - " N OBJ\n" - " L R1\n" - "COLUMNS\n" - " X1 OBJ 1 R1 2\n" - "RHS\n" - " RHS1 MISSING 1\n" - "ENDATA\n"); - - expect_fast_parse_error("unknown bound type", - "NAME BADBOUND\n" - "ROWS\n" - " N OBJ\n" - " L R1\n" - 
"COLUMNS\n" - " X1 OBJ 1 R1 2\n" - "RHS\n" - " RHS1 R1 0\n" - "BOUNDS\n" - " XX B X1 1\n" - "ENDATA\n"); - - expect_fast_parse_error("semi-continuous bound without value", - "NAME BADSC\n" - "ROWS\n" - " N OBJ\n" - " L R1\n" - "COLUMNS\n" - " X1 OBJ 1 R1 2\n" - "RHS\n" - " RHS1 R1 0\n" - "BOUNDS\n" - " SC B X1\n" - "ENDATA\n"); + { + TempMpsFile file( + "NAME BADOBJ\n" + "OBJSENSE\n" + " SIDEWAYS\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "ENDATA\n"); + EXPECT_THROW( + { + (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + }, + std::logic_error); + } + + { + TempMpsFile file( + "NAME BADCOLROW\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 MISSING 1\n" + "RHS\n" + " RHS1 R1 0\n" + "ENDATA\n"); + EXPECT_THROW( + { + (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + }, + std::logic_error); + } + + { + TempMpsFile file( + "NAME BADRHSROW\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 MISSING 1\n" + "ENDATA\n"); + EXPECT_THROW( + { + (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + }, + std::logic_error); + } + + { + TempMpsFile file( + "NAME BADBOUND\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "BOUNDS\n" + " XX B X1 1\n" + "ENDATA\n"); + EXPECT_THROW( + { + (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + }, + std::logic_error); + } + + { + TempMpsFile file( + "NAME BADSC\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 0\n" + "BOUNDS\n" + " SC B X1\n" + "ENDATA\n"); + EXPECT_THROW( + { + (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + }, + std::logic_error); + } } -void large_columns_repeated_column_chunk_boundary() +TEST(FastMpsParserEdgeTest, LargeColumnsRepeatedColumnChunkBoundary) { 
constexpr size_t row_count = 180000; std::string mps; @@ -740,13 +687,13 @@ void large_columns_repeated_column_chunk_boundary() TempMpsFile file(std::move(mps)); auto model = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - expect_model_shapes( + check_model_shapes( model, static_cast(row_count), 2, static_cast(row_count + 1), "large columns"); - expect_eq(model.var_names_.at(0), std::string("XBIG"), "large repeated column name"); - expect_eq(model.var_names_.at(1), std::string("XTAIL"), "large tail column name"); + EXPECT_EQ(std::string("XBIG"), model.var_names_.at(0)) << "large repeated column name"; + EXPECT_EQ(std::string("XTAIL"), model.var_names_.at(1)) << "large tail column name"; } -void large_bounds_repeated_var_stays_ordered() +TEST(FastMpsParserEdgeTest, LargeBoundsRepeatedVarStaysOrdered) { constexpr size_t repeat_count = 700000; std::string mps; @@ -763,13 +710,12 @@ void large_bounds_repeated_var_stays_ordered() TempMpsFile file(std::move(mps)); auto model = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - expect_model_shapes(model, 1, 1, 1, "large bounds"); - expect_eq(model.variable_upper_bounds_.at(0), - static_cast((repeat_count - 1) % 1000), - "large repeated bounds last value"); + check_model_shapes(model, 1, 1, 1, "large bounds"); + EXPECT_EQ(static_cast((repeat_count - 1) % 1000), model.variable_upper_bounds_.at(0)) + << "large repeated bounds last value"; } -void lz4_and_raw_paths_match_on_multiblock_input() +TEST(FastMpsParserEdgeTest, Lz4AndRawPathsMatchOnMultiblockInput) { constexpr size_t row_count = 70000; std::string mps; @@ -795,27 +741,27 @@ void lz4_and_raw_paths_match_on_multiblock_input() TempMpsFile raw_file(std::move(mps)); TempOwnedPath lz4_file(raw_file.path + ".lz4"); const std::string cmd = "lz4 -f -q " + raw_file.path + " " + lz4_file.path; - if (std::system(cmd.c_str()) != 0) { throw skip_test("lz4 CLI unavailable"); } + if (std::system(cmd.c_str()) != 0) { GTEST_SKIP() 
<< "lz4 CLI unavailable"; } auto raw = mps_fast::parse_mps_fast_file(raw_file.path, mps_fast::FileReadMethod::Read); auto lz4 = mps_fast::parse_mps_fast_file(lz4_file.path, mps_fast::FileReadMethod::Read); - expect_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity"); - expect_eq(lz4.var_names_.size(), raw.var_names_.size(), "lz4 var name count"); - expect_eq(lz4.row_names_.size(), raw.row_names_.size(), "lz4 row name count"); - expect_vector_eq(lz4.A_, raw.A_, "lz4 A values"); - expect_vector_eq(lz4.A_indices_, raw.A_indices_, "lz4 A indices"); - expect_vector_eq(lz4.A_offsets_, raw.A_offsets_, "lz4 A offsets"); - expect_vector_eq(lz4.c_, raw.c_, "lz4 objective"); - expect_vector_eq(lz4.b_, raw.b_, "lz4 rhs"); - expect_vector_eq(lz4.var_types_, raw.var_types_, "lz4 var types"); - expect_vector_eq(lz4.variable_lower_bounds_, raw.variable_lower_bounds_, "lz4 lower bounds"); - expect_vector_eq(lz4.variable_upper_bounds_, raw.variable_upper_bounds_, "lz4 upper bounds"); + check_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity"); + EXPECT_EQ(raw.var_names_.size(), lz4.var_names_.size()) << "lz4 var name count"; + EXPECT_EQ(raw.row_names_.size(), lz4.row_names_.size()) << "lz4 row name count"; + EXPECT_EQ(raw.A_, lz4.A_) << "lz4 A values"; + EXPECT_EQ(raw.A_indices_, lz4.A_indices_) << "lz4 A indices"; + EXPECT_EQ(raw.A_offsets_, lz4.A_offsets_) << "lz4 A offsets"; + EXPECT_EQ(raw.c_, lz4.c_) << "lz4 objective"; + EXPECT_EQ(raw.b_, lz4.b_) << "lz4 rhs"; + EXPECT_EQ(raw.var_types_, lz4.var_types_) << "lz4 var types"; + EXPECT_EQ(raw.variable_lower_bounds_, lz4.variable_lower_bounds_) << "lz4 lower bounds"; + EXPECT_EQ(raw.variable_upper_bounds_, lz4.variable_upper_bounds_) << "lz4 upper bounds"; } -void gzip_bzip2_and_raw_paths_match() +TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch) { std::string mps; mps += "NAME COMPRESSED\nROWS\n N OBJ\n L R1\n G R2\nCOLUMNS\n"; @@ -828,8 +774,8 @@ void 
gzip_bzip2_and_raw_paths_match() const std::string gzip_cmd = "gzip -c " + raw_file.path + " > " + gzip_file.path; const std::string bzip2_cmd = "bzip2 -c " + raw_file.path + " > " + bzip2_file.path; - if (std::system(gzip_cmd.c_str()) != 0) { throw skip_test("gzip CLI unavailable"); } - if (std::system(bzip2_cmd.c_str()) != 0) { throw skip_test("bzip2 CLI unavailable"); } + if (std::system(gzip_cmd.c_str()) != 0) { GTEST_SKIP() << "gzip CLI unavailable"; } + if (std::system(bzip2_cmd.c_str()) != 0) { GTEST_SKIP() << "bzip2 CLI unavailable"; } auto raw = mps_fast::parse_mps_fast_file(raw_file.path, mps_fast::FileReadMethod::Read); @@ -838,78 +784,22 @@ void gzip_bzip2_and_raw_paths_match() auto bzip2 = mps_fast::parse_mps_fast_file(bzip2_file.path, mps_fast::FileReadMethod::Read); - expect_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity"); - expect_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity"); - expect_vector_eq(gzip.A_, raw.A_, "gzip A values"); - expect_vector_eq(bzip2.A_, raw.A_, "bzip2 A values"); - expect_vector_eq(gzip.A_indices_, raw.A_indices_, "gzip A indices"); - expect_vector_eq(bzip2.A_indices_, raw.A_indices_, "bzip2 A indices"); - expect_vector_eq(gzip.A_offsets_, raw.A_offsets_, "gzip A offsets"); - expect_vector_eq(bzip2.A_offsets_, raw.A_offsets_, "bzip2 A offsets"); - expect_vector_eq(gzip.c_, raw.c_, "gzip objective"); - expect_vector_eq(bzip2.c_, raw.c_, "bzip2 objective"); - expect_vector_eq(gzip.b_, raw.b_, "gzip rhs"); - expect_vector_eq(bzip2.b_, raw.b_, "bzip2 rhs"); - expect_vector_eq(gzip.variable_lower_bounds_, raw.variable_lower_bounds_, "gzip lower bounds"); - expect_vector_eq(bzip2.variable_lower_bounds_, raw.variable_lower_bounds_, "bzip2 lower bounds"); - expect_vector_eq(gzip.variable_upper_bounds_, raw.variable_upper_bounds_, "gzip upper bounds"); - expect_vector_eq(bzip2.variable_upper_bounds_, raw.variable_upper_bounds_, "bzip2 upper bounds"); - 
expect_vector_eq(gzip.var_types_, raw.var_types_, "gzip var types"); - expect_vector_eq(bzip2.var_types_, raw.var_types_, "bzip2 var types"); -} - -} // namespace - -int main() -{ - struct TestCase { - const char* name; - void (*fn)(); - }; - - const TestCase tests[] = { - {"ScannerFindsSectionSplitAcrossBlocks", scanner_finds_section_split_across_blocks}, - {"ScannerFindsHeadersSplitAtEveryByte", scanner_finds_headers_split_at_every_byte}, - {"ScannerRejectsUnknownColumnOneRecordsAfterRows", - scanner_rejects_unknown_column_one_records_after_rows}, - {"BoundsDefaultsAndTypesMatchReference", bounds_defaults_and_types_match_reference}, - {"DuplicateBoundsLastStatementWins", duplicate_bounds_last_statement_wins}, - {"NondenseRowAndColumnNamesUseHashPath", nondense_row_and_column_names_use_hash_path}, - {"MissingOptionalBoundsFastPath", missing_optional_bounds_fast_path}, - {"BoundsOnlyVariablesAreAppendedDeterministically", - bounds_only_variables_are_appended_deterministically}, - {"IntegerMarkersAssignTypesAndDefaultBounds", integer_markers_assign_types_and_default_bounds}, - {"NumericParsingIntegrationMatchesReferenceBitwise", - numeric_parsing_integration_matches_reference_bitwise}, - {"CrlfLineEndingsMatchReferenceBitwise", crlf_line_endings_match_reference_bitwise}, - {"CommentPlacementSupportedCasesMatchReferenceBitwise", - comment_placement_supported_cases_match_reference_bitwise}, - {"ObjectiveMetadataSelectsNamedObjective", objective_metadata_selects_named_objective}, - {"MalformedInputsReportErrors", malformed_inputs_report_errors}, - {"LargeColumnsRepeatedColumnChunkBoundary", large_columns_repeated_column_chunk_boundary}, - {"LargeBoundsRepeatedVarStaysOrdered", large_bounds_repeated_var_stays_ordered}, - {"Lz4AndRawPathsMatchOnMultiblockInput", lz4_and_raw_paths_match_on_multiblock_input}, - {"GzipBzip2AndRawPathsMatch", gzip_bzip2_and_raw_paths_match}, - }; - - int failed = 0; - for (const TestCase& test : tests) { - std::cout << "[ RUN ] " << 
test.name << '\n'; - try { - test.fn(); - std::cout << "[ OK ] " << test.name << '\n'; - } catch (const skip_test& e) { - std::cout << "[ SKIPPED ] " << test.name << ": " << e.what() << '\n'; - } catch (const std::exception& e) { - ++failed; - std::cerr << "[ FAILED ] " << test.name << ": " << e.what() << '\n'; - } - } - - if (failed != 0) { - std::cerr << failed << " test(s) failed\n"; - return 1; - } - std::cout << "[ PASSED ] " << std::size(tests) << " test(s)\n"; - return 0; + check_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity"); + check_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity"); + EXPECT_EQ(raw.A_, gzip.A_) << "gzip A values"; + EXPECT_EQ(raw.A_, bzip2.A_) << "bzip2 A values"; + EXPECT_EQ(raw.A_indices_, gzip.A_indices_) << "gzip A indices"; + EXPECT_EQ(raw.A_indices_, bzip2.A_indices_) << "bzip2 A indices"; + EXPECT_EQ(raw.A_offsets_, gzip.A_offsets_) << "gzip A offsets"; + EXPECT_EQ(raw.A_offsets_, bzip2.A_offsets_) << "bzip2 A offsets"; + EXPECT_EQ(raw.c_, gzip.c_) << "gzip objective"; + EXPECT_EQ(raw.c_, bzip2.c_) << "bzip2 objective"; + EXPECT_EQ(raw.b_, gzip.b_) << "gzip rhs"; + EXPECT_EQ(raw.b_, bzip2.b_) << "bzip2 rhs"; + EXPECT_EQ(raw.variable_lower_bounds_, gzip.variable_lower_bounds_) << "gzip lower bounds"; + EXPECT_EQ(raw.variable_lower_bounds_, bzip2.variable_lower_bounds_) << "bzip2 lower bounds"; + EXPECT_EQ(raw.variable_upper_bounds_, gzip.variable_upper_bounds_) << "gzip upper bounds"; + EXPECT_EQ(raw.variable_upper_bounds_, bzip2.variable_upper_bounds_) << "bzip2 upper bounds"; + EXPECT_EQ(raw.var_types_, gzip.var_types_) << "gzip var types"; + EXPECT_EQ(raw.var_types_, bzip2.var_types_) << "bzip2 var types"; } diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp index 12f9ed488a..3b01f10227 100644 --- a/cpp/tests/linear_programming/parser_test.cpp +++ b/cpp/tests/linear_programming/parser_test.cpp @@ -139,23 +139,12 @@ 
double q_entry(const mps_data_model_t& m, int row, int col) class parser_fixture_base : public ::testing::TestWithParam { protected: - static mps_data_model_t read_mps_file(const std::string& file, - bool fixed_format = true) - { - const std::string& root = cuopt::test::get_rapids_dataset_root_dir(); - return read_mps(root + "/" + file, fixed_format); - } - - mps_data_model_t read_param_mps_file(const std::string& file, - bool fixed_format = true) const + mps_data_model_t read_mps_file(const std::string& file, + bool fixed_format = true) const { const std::string& root = cuopt::test::get_rapids_dataset_root_dir(); const auto reader = GetParam().reader; - // The experimental reader has no fixed/free parser mode. Use the same file but do not force - // fixed-format dispatch for that reader. - const bool reader_fixed_format = - reader == mps_reader_type_t::default_reader ? fixed_format : false; - return read(root + "/" + file, reader, reader_fixed_format); + return read(root + "/" + file, reader, fixed_format); } static mps_data_model_t read_lp_file(const std::string& file) @@ -386,7 +375,7 @@ TEST(mps_parser, bad_mps_files) TEST_P(good_mps_1_test, mps) { - check_model(read_param_mps_file("linear_programming/good-mps-1.mps", false)); + check_model(read_mps_file("linear_programming/good-mps-1.mps", false)); } TEST_F(good_mps_1_test, mps_parser_internals) @@ -625,7 +614,7 @@ TEST(mps_parser_free_format, bad_mps_files_free_format) TEST_P(up_low_bounds_test, mps) { - check_model(read_param_mps_file("linear_programming/lp_model_with_var_bounds.mps", false)); + check_model(read_mps_file("linear_programming/lp_model_with_var_bounds.mps", false)); } TEST_F(up_low_bounds_test, mps_parser_internals) @@ -646,12 +635,12 @@ TEST_P(good_mps_1_test, mps_free_format) { // free-format-mps-1.mps encodes the same problem as good-mps-1 with default // [0, +inf) bounds (no BOUNDS section), so it satisfies the same checker. 
- check_model(read_param_mps_file("linear_programming/free-format-mps-1.mps", false)); + check_model(read_mps_file("linear_programming/free-format-mps-1.mps", false)); } TEST_P(some_var_bounds_test, mps) { - check_model(read_param_mps_file("linear_programming/good-mps-some-var-bounds.mps")); + check_model(read_mps_file("linear_programming/good-mps-some-var-bounds.mps")); } TEST_F(some_var_bounds_test, lp) @@ -661,7 +650,7 @@ TEST_F(some_var_bounds_test, lp) TEST_P(fixed_var_bound_test, mps) { - check_model(read_param_mps_file("linear_programming/good-mps-fixed-var.mps")); + check_model(read_mps_file("linear_programming/good-mps-fixed-var.mps")); } TEST_F(fixed_var_bound_test, lp) @@ -671,7 +660,7 @@ TEST_F(fixed_var_bound_test, lp) TEST_P(free_var_bound_test, mps) { - check_model(read_param_mps_file("linear_programming/good-mps-free-var.mps")); + check_model(read_mps_file("linear_programming/good-mps-free-var.mps")); } TEST_F(free_var_bound_test, lp) @@ -681,7 +670,7 @@ TEST_F(free_var_bound_test, lp) TEST_P(lower_inf_var_bound_test, mps) { - check_model(read_param_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps")); + check_model(read_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps")); } TEST_F(lower_inf_var_bound_test, lp) @@ -699,7 +688,7 @@ TEST(mps_bounds, rhs_cost) TEST_P(upper_inf_var_bound_test, mps) { - check_model(read_param_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps")); + check_model(read_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps")); } TEST_F(upper_inf_var_bound_test, lp) @@ -854,7 +843,7 @@ TEST(mps_bounds, unsupported_or_invalid_mps_types) TEST_P(mip_with_bounds_test, mps) { - check_model(read_param_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false)); + check_model(read_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false)); } TEST_F(mip_with_bounds_test, mps_parser_internals) @@ -918,7 +907,7 @@ TEST(mps_parser, good_mps_file_mip_no_marker) 
TEST_P(mip_no_bounds_test, mps) { - check_model(read_param_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false)); + check_model(read_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false)); } TEST_F(mip_no_bounds_test, lp) @@ -928,8 +917,7 @@ TEST_F(mip_no_bounds_test, lp) TEST_P(mip_partial_bounds_test, mps) { - check_model( - read_param_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false)); + check_model(read_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false)); } TEST_F(mip_partial_bounds_test, lp) From 79e958ed4ce2e672780b9fc8ca81c6d31010fb1a Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 12 Jun 2026 02:02:35 -0700 Subject: [PATCH 10/22] cleanup for clarity --- cpp/src/io/CMakeLists.txt | 1 - .../fast_fp64_parser.hpp | 5 +- .../fast_parse_primitives.hpp | 52 +-- .../io/experimental_mps_fast/fast_parser.cpp | 338 +++++++----------- .../fast_parser_adapter.cpp | 32 -- .../hash_table_smallstr.hpp | 242 ++++++++++++- cpp/src/io/parser.cpp | 19 + 7 files changed, 413 insertions(+), 276 deletions(-) delete mode 100644 cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp diff --git a/cpp/src/io/CMakeLists.txt b/cpp/src/io/CMakeLists.txt index 4c99b1848b..cafcffb23f 100644 --- a/cpp/src/io/CMakeLists.txt +++ b/cpp/src/io/CMakeLists.txt @@ -5,7 +5,6 @@ set(MPS_FAST_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_adapter.cpp ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/file_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/lz4_file_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/mps_section_scanner.cpp diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp index e446494639..b7987738fc 100644 --- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp +++ 
b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp @@ -33,6 +33,7 @@ namespace fp64 { // Fast FP64 parser optimized for the <=19digits case, based on the Eisel-Lemire algorithm // see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 // (8), 2021. +// verified on a large corpus of FP64 values: https://github.com/lemire/simple_fastfloat_benchmark struct power_10_lut_entry_t { uint64_t high; @@ -181,6 +182,8 @@ struct parsed_decimal_t { static inline bool is_digit(char c) noexcept { return c >= '0' && c <= '9'; } // SWAR 8char run of digits -> integer representation +// better and more portable than AVX2 stuff since AVX2 doesn't like swizzling across 16B lanes +// saw no real difference w/ 16B SSE static inline bool parse_8_digits(const char* p, uint32_t& out) { // comply with strict aliasing rules @@ -313,7 +316,7 @@ static inline bool parse_decimal_advance(const char*& p, const char* end, parsed static inline double fallback_strtod(std::string_view s) { char stack_buf[32]; - // The MPS specs mandate that numeric tokens are no longer than 25 characters + // The MPS specs mandate that numeric tokens are not longer than 25 characters if (s.size() >= sizeof(stack_buf)) { mps_parser_fail(error_type_t::ValidationError, "MPS numeric token exceeds supported length"); } diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp index d3317c50e1..f35726a118 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp @@ -7,9 +7,6 @@ #include #include -#include -#include -#include #include #include @@ -30,6 +27,8 @@ enum scan_mode { until_whitespace, }; +// util to serially scan along an in-memory input buffer +// contains optimized primitives for most parsing operations struct cursor_t { const char* start; const char* ptr; @@ -39,7 +38,8 @@ struct cursor_t { bool done() const { 
return ptr >= end; } - std::pair position() const + // used in error reporting + std::pair linecol_position() const { std::size_t line = 1; const char* line_start = start; @@ -55,7 +55,7 @@ struct cursor_t { [[noreturn]] void error(const char* msg, ...) { - auto [line, col] = position(); + auto [line, col] = linecol_position(); va_list args; va_start(args, msg); char msg_buf[512]; @@ -66,9 +66,7 @@ struct cursor_t { void advance(std::size_t n) { - if (ptr + n > end) { - mps_parser_fail(error_type_t::ValidationError, "cursor advanced past end of file"); - } + if (ptr + n > end) { mps_parser_fail(error_type_t::ValidationError, "Unexpected end of file"); } ptr += n; } @@ -87,10 +85,11 @@ struct cursor_t { return end; } + // scans for the first non-whitespace (or vice versa) template static const char* simd_scan(const char* p, const char* end) { - const simde__m256i v32 = simde_mm256_set1_epi8(32); + const simde__m256i v32 = simde_mm256_set1_epi8(32); // space/control characters const simde__m256i vnl = simde_mm256_set1_epi8('\n'); while (p + 32 <= end) { @@ -125,6 +124,7 @@ struct cursor_t { if (ptr < end && *ptr == '\n') { ptr++; } } + // could be SIMD but comments are usually rare void skip_comment_line() { while (!done() && *ptr != '\n' && *ptr != '\r') { @@ -140,6 +140,7 @@ struct cursor_t { } } + // useful for parsing NAME/OBJNAME which may span multiple "fields" according to the MPS spec std::string_view read_rest_of_line_trimmed() { const char* begin = ptr; @@ -173,8 +174,8 @@ struct cursor_t { const simde__m256i v32 = simde_mm256_set1_epi8(32); const simde__m256i vnl = simde_mm256_set1_epi8('\n'); - // All input streams provide trailing padding, so this unaligned 32-byte load is valid - // whenever end - ptr >= 32. 
+ // all input streams provide trailing padding, so this 32B load is valid + // whenever end - ptr >= 32 simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr); simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32); @@ -204,6 +205,7 @@ struct cursor_t { return std::string_view(field_start, field_end - field_start); } + // read but do not consume inline __attribute__((always_inline)) std::string_view peek_field() { if (UNLIKELY(done())) { return {}; } @@ -218,6 +220,7 @@ struct cursor_t { return cursor.peek_field(); } + // usually in MPS fields go in pair. these can usually be extracted in a single 32B load inline __attribute__((always_inline)) std::pair read_two_fields() { @@ -234,31 +237,30 @@ struct cursor_t { const simde__m256i vnl = simde_mm256_set1_epi8('\n'); // Same padded-buffer contract as read_field(). - simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr); - simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); - simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); + simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr); + simde__m256i gt32 = simde_mm256_cmpgt_epi8(data, v32); unsigned int printable_mask = (unsigned int)simde_mm256_movemask_epi8(gt32); unsigned int ws_mask = ~printable_mask; - unsigned int nl_mask = (unsigned int)simde_mm256_movemask_epi8(is_nl); - unsigned int stop_mask = printable_mask | nl_mask; if (UNLIKELY(ws_mask == 0)) { return slow(); } int field1_end_off = __builtin_ctz(ws_mask); - unsigned int after_field1 = stop_mask & ~((1u << field1_end_off) - 1); - if (UNLIKELY(after_field1 == 0)) { return slow(); } - int field2_start_off = __builtin_ctz(after_field1); + unsigned int printable_after_field1 = printable_mask >> field1_end_off; + if (UNLIKELY(printable_after_field1 == 0)) { return slow(); } + int field2_start_off = field1_end_off + __builtin_ctz(printable_after_field1); if (UNLIKELY(ptr[field2_start_off] 
== '\n')) { return slow(); } - unsigned int ws_after_field2_start = ws_mask & ~((1u << field2_start_off) - 1); + unsigned int ws_after_field2_start = ws_mask >> field2_start_off; if (UNLIKELY(ws_after_field2_start == 0)) { return slow(); } - int field2_end_off = __builtin_ctz(ws_after_field2_start); + int field2_end_off = field2_start_off + __builtin_ctz(ws_after_field2_start); - unsigned int after_field2 = stop_mask & ~((1u << field2_end_off) - 1); - if (LIKELY(after_field2 != 0)) { - ptr = ptr + __builtin_ctz(after_field2); + simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); + unsigned int stop_mask = printable_mask | (unsigned int)simde_mm256_movemask_epi8(is_nl); + unsigned int stop_after_field2 = stop_mask >> field2_end_off; + if (LIKELY(stop_after_field2 != 0)) { + ptr = ptr + field2_end_off + __builtin_ctz(stop_after_field2); } else { ptr = ptr + field2_end_off; skip_ws(); @@ -346,8 +348,6 @@ static inline double expect_number(cursor_t& cursor) static inline double expect_number_fast_pm_one(cursor_t& cursor) { const char* p = cursor.ptr; - // Kept bounded despite the global padding invariant: this path is also used - // on section-local cursors whose logical end may precede the physical buffer. 
if (cursor.end - p >= 3 && p[0] == '-' && p[1] == '1' && p[2] <= ' ') { cursor.ptr = p + 2; cursor.skip_ws(); diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index bc9000f8f3..3e47c7ee8c 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -67,12 +68,6 @@ static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8 * MiB; static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * MiB; // parser-wide thread cap switch; very small files lose to scheduling overhead static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES = 100ull * 1000ull * 1000ull; -// below this, the serial row-hash build is usually cheaper than partition setup -static constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * KiB; -// number of partitions for the row hash table, used to avoid races and atomics during row hash -// table initialization -static constexpr int MPS_ROW_HASH_PARTITION_BITS = 5; -static constexpr size_t MPS_ROW_HASH_PARTITIONS = (size_t{1} << MPS_ROW_HASH_PARTITION_BITS); // thread caps for small and large files static constexpr int MPS_SMALL_FILE_THREAD_CAP = 16; static constexpr int MPS_LARGE_FILE_THREAD_CAP = 32; @@ -132,12 +127,6 @@ class chunk_name_arena_t { size_t next_slab_size_ = 64 * KiB; }; -// returns the hash table partition to use for a given hash -static inline size_t row_hash_partition_for(uint32_t hash) -{ - return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS)); -} - struct timer_entry_t { const char* name; double elapsed_ms; @@ -452,12 +441,6 @@ static inline void observe_dense_name(bool& candidate, template struct parse_state_t { - struct row_hash_partition_t { - hash_slot_var_t* slots = nullptr; - size_t buckets = 0; - size_t mask = 0; - }; - cuopt::linear_programming::io::mps_data_model_t& problem; cursor_t& cursor; @@ 
-475,17 +458,7 @@ struct parse_state_t { index_mode_t col_index_mode = index_mode_t::hash; dense_name_index_t col_dense; - // Row name hash table - sized at runtime based on row count - size_t row_hash_buckets = 0; - size_t row_hash_mask = 0; // buckets - 1, for fast modulo via & - mmap_region_t row_hash_region; - hash_slot_var_t* row_names_ht = nullptr; - // compute hash, select the subtable from high hash bits, - // then run the same open-addressing probe loop inside that subtable. - size_t row_hash_partition_count = 0; - std::array row_hash_partitions = {}; - // Overflow map for row names longer than HASH_KEY_BYTES (usually very rare) - std::unordered_map row_names_long; + smallstr_hash_table_t row_hash_; // Row name lookup for labels like R0001, R0002, ... index_mode_t row_index_mode = index_mode_t::hash; @@ -495,6 +468,13 @@ struct parse_state_t { // var_names still uses STL (only used in parse_bounds, not as hot) std::unordered_map var_names_map; + mmap_region_t temp_A_region; + mmap_region_t temp_A_indices_region; + f_t* temp_A = nullptr; + i_t* temp_A_indices = nullptr; + size_t temp_csr_nnz = 0; + bool temp_csr_materialized = false; + struct bounds_only_var_t { f_t lb = f_t{0}; f_t ub = std::numeric_limits::infinity(); @@ -537,23 +517,17 @@ struct parse_state_t { return true; } - size_t row_hash_bucket_count_for(size_t n_rows) const - { -#ifdef MPS_FAST_COMPACT_ROW_HASH - // probe counts are usually low, and a smaller - // table reduces cache/TLB footprint on medium instances. 
- return cuda::next_power_of_two(std::max(n_rows + n_rows / 2, (size_t)64)); -#else - return cuda::next_power_of_two(std::max((size_t)(n_rows * 2), (size_t)64)); -#endif - } - void init_row_hash_table_impl() { scoped_timer_t timer("row_hash_init_total"); size_t n_rows = row_names_sv.size(); const int num_threads = phase_thread_count(MPS_ROWS_THREAD_CAP); const bool use_partitioned = n_rows >= MPS_ROW_HASH_PARTITIONED_MIN_ROWS && num_threads > 1; +#ifdef MPS_FAST_COMPACT_ROW_HASH + constexpr bool compact_row_hash = true; +#else + constexpr bool compact_row_hash = false; +#endif std::vector row_hashes; std::vector row_order; std::array partition_counts = {}; @@ -561,19 +535,17 @@ struct parse_state_t { if (use_partitioned) { scoped_timer_t timer("row_hash_partition_metadata"); - // Pre-hash once, count rows per partition, then pack row indices by partition. - // This turns the build into disjoint single-writer table fills. row_hashes.resize(n_rows); size_t inline_rows = 0; for (size_t idx = 0; idx < n_rows; ++idx) { std::string_view name = row_names_sv[idx]; if (UNLIKELY(name.size() > HASH_KEY_BYTES)) { - row_names_long[name] = idx; + row_hash_.note_long_name(name, idx); continue; } uint32_t hash = fnv1a_hash(name.data(), name.size()); row_hashes[idx] = hash; - ++partition_counts[row_hash_partition_for(hash)]; + ++partition_counts[hash_partition_for(hash)]; ++inline_rows; } @@ -585,102 +557,55 @@ struct parse_state_t { auto next_offsets = partition_offsets; for (size_t idx = 0; idx < n_rows; ++idx) { if (UNLIKELY(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; } - size_t part = row_hash_partition_for(row_hashes[idx]); + size_t part = hash_partition_for(row_hashes[idx]); row_order[next_offsets[part]++] = idx; } } if (use_partitioned) { - row_hash_partition_count = MPS_ROW_HASH_PARTITIONS; - size_t total_buckets = 0; - for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { - row_hash_partitions[p].buckets = row_hash_bucket_count_for(partition_counts[p]); - 
row_hash_partitions[p].mask = row_hash_partitions[p].buckets - 1; - total_buckets += row_hash_partitions[p].buckets; - } - row_hash_buckets = total_buckets; - row_hash_mask = row_hash_buckets - 1; + row_hash_.configure_partitioned_buckets(partition_counts, compact_row_hash); } else { - row_hash_partition_count = 0; - row_hash_buckets = row_hash_bucket_count_for(n_rows); - row_hash_mask = row_hash_buckets - 1; + row_hash_.configure_serial_buckets(n_rows, compact_row_hash); } - size_t row_hash_mmap_size = row_hash_buckets * sizeof(hash_slot_var_t); { scoped_timer_t timer("row_hash_mmap"); - // Use mmap for allocation - the OS provides zero'd pages - row_hash_region = mmap_region_t::anonymous( - row_hash_mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, "row hash table"); - row_names_ht = (hash_slot_var_t*)row_hash_region.data(); - if (use_partitioned) { - hash_slot_var_t* next_slots = row_names_ht; - for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { - row_hash_partitions[p].slots = next_slots; - next_slots += row_hash_partitions[p].buckets; - } - } - // request huge pages to reduce TLB misses - row_hash_region.advise(MADV_HUGEPAGE); + row_hash_.allocate_mmap("row hash table"); } - // pre-touch the 2MB huge pages to nudge the kernel into allocating them #ifdef MPS_FAST_THP_PREFAULT { scoped_timer_t timer("row_hash_thp_prefault"); - materialize_hugepages( - "row_names_ht", row_names_ht, row_hash_region.size(), materialize_touch_t::write_2mb); + materialize_hugepages("row_names_ht", + row_hash_.slots(), + row_hash_.region().size(), + materialize_touch_t::write_2mb); } #endif { scoped_timer_t timer("row_hash_insert_all"); -#ifdef MPS_FAST_PERF_COUNTERS - size_t total_probes = 0; - size_t max_probes = 0; - size_t long_names = row_names_long.size(); -#endif + row_hash_.reset_build_probe_stats(); if (use_partitioned) { scoped_timer_t timer("row_hash_insert_partitioned"); #ifdef MPS_FAST_PERF_COUNTERS std::vector perf_snapshots(MPS_ROW_HASH_PARTITIONS); - std::vector 
partition_total_probes(MPS_ROW_HASH_PARTITIONS, 0); - std::vector partition_max_probes(MPS_ROW_HASH_PARTITIONS, 0); #endif -// initialize the row hash tables in parallel #pragma omp parallel for schedule(static) num_threads(num_threads) for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) { size_t p = (size_t)part_id; #ifdef MPS_FAST_PERF_COUNTERS thread_perf_counters_t perf_counters; - size_t local_total_probes = 0; - size_t local_max_probes = 0; #endif - const auto& part = row_hash_partitions[p]; - // Each worker owns its subtable, so row_insert_into remains the plain serial probe loop. for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) { size_t idx = row_order[pos]; -#ifdef MPS_FAST_PERF_COUNTERS - size_t probes = row_insert_into( - part.slots, part.buckets, part.mask, row_names_sv[idx], row_hashes[idx], idx); - local_total_probes += probes; - local_max_probes = std::max(local_max_probes, probes); -#else - row_insert_into( - part.slots, part.buckets, part.mask, row_names_sv[idx], row_hashes[idx], idx); -#endif + row_hash_.insert_partition(p, row_names_sv[idx], row_hashes[idx], idx); } #ifdef MPS_FAST_PERF_COUNTERS - partition_total_probes[p] = local_total_probes; - partition_max_probes[p] = local_max_probes; - perf_snapshots[p] = perf_counters.stop(); + perf_snapshots[p] = perf_counters.stop(); #endif } #ifdef MPS_FAST_PERF_COUNTERS - for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { - total_probes += partition_total_probes[p]; - max_probes = std::max(max_probes, partition_max_probes[p]); - } print_perf_totals("row_hash_insert_partitioned", perf_snapshots); #endif } else { @@ -688,42 +613,19 @@ struct parse_state_t { thread_perf_counters_t perf_counters; #endif for (size_t idx = 0; idx < n_rows; ++idx) { -#ifdef MPS_FAST_PERF_COUNTERS - size_t probes = row_insert(row_names_sv[idx], idx); - if (probes == 0) { - ++long_names; - } else { - total_probes += probes; - max_probes = std::max(max_probes, probes); - } 
-#else - row_insert(row_names_sv[idx], idx); -#endif + row_hash_.insert_serial(row_names_sv[idx], idx); } #ifdef MPS_FAST_PERF_COUNTERS print_perf_totals("row_hash_insert_all", {perf_counters.stop()}); #endif } -#ifdef MPS_FAST_PERF_COUNTERS - size_t probed_rows = n_rows - long_names; - double mean_probes = probed_rows == 0 ? 0.0 : (double)total_probes / (double)probed_rows; - double load_factor = row_hash_buckets == 0 ? 0.0 : (double)n_rows / (double)row_hash_buckets; - std::fprintf(stderr, - "[ROW_HASH_PROBES] rows=%zu buckets=%zu load=%.3f long=%zu mean=%.3f max=%zu\n", - n_rows, - row_hash_buckets, - load_factor, - long_names, - mean_probes, - max_probes); -#endif + row_hash_.print_build_probe_report(n_rows); } - // Force the kernel to please please collapse the page range into THP pages #ifdef MPS_FAST_MADV_COLLAPSE { scoped_timer_t timer("row_hash_madv_collapse"); - row_hash_region.advise(MADV_COLLAPSE); + row_hash_.region().advise(MADV_COLLAPSE); } #endif } @@ -731,7 +633,7 @@ struct parse_state_t { size_t row_lookup(std::string_view name) const { if (LIKELY(row_index_mode == index_mode_t::dense_ordered)) { return row_dense.lookup(name); } - return row_lookup_hash(name); + return row_hash_.lookup(name); } size_t read_row_lookup_dense_ordered(cursor_t& cursor) const @@ -774,75 +676,7 @@ struct parse_state_t { } auto row_name = cursor.read_field(); - return row_lookup_hash(row_name); - } - - size_t row_lookup_hash(std::string_view name) const - { - if (UNLIKELY(name.size() > HASH_KEY_BYTES)) { - auto it = row_names_long.find(name); - return it != row_names_long.end() ? it->second : SIZE_MAX; - } - hash_key_t key = make_key(name.data(), name.size()); - uint32_t hash = fnv1a_hash(name.data(), name.size()); - if (LIKELY(row_hash_partition_count != 0)) { - // Lookups mirror the build routing and probe only the selected subtable. 
- const auto& part = row_hash_partitions[row_hash_partition_for(hash)]; - return row_lookup_in(part.slots, part.buckets, part.mask, key, hash); - } - return row_lookup_in(row_names_ht, row_hash_buckets, row_hash_mask, key, hash); - } - - size_t row_lookup_in( - const hash_slot_var_t* slots, size_t buckets, size_t mask, hash_key_t key, uint32_t hash) const - { - const hash_slot_var_t* slot = &slots[hash & (uint32_t)mask]; - for (size_t i = 0; i < buckets; ++i, ++slot) { - if (slot >= &slots[buckets]) { slot = &slots[0]; } - if (slot->count == 0) { return SIZE_MAX; } - if (key_cmpeq(slot->key, key)) { return slot->count - 1; } - } - return SIZE_MAX; - } - - size_t row_insert(std::string_view name, size_t index) - { - if (UNLIKELY(name.size() > HASH_KEY_BYTES)) { - row_names_long[name] = index; - return 0; - } - return row_insert_into(row_names_ht, - row_hash_buckets, - row_hash_mask, - name, - fnv1a_hash(name.data(), name.size()), - index); - } - - size_t row_insert_into(hash_slot_var_t* slots, - size_t buckets, - size_t mask, - std::string_view name, - uint32_t hash, - size_t index) - { - hash_key_t key = make_key(name.data(), name.size()); - hash_slot_var_t* slot = &slots[hash & (uint32_t)mask]; - for (size_t i = 0; i < buckets; ++i, ++slot) { - if (slot >= &slots[buckets]) { slot = &slots[0]; } - if (slot->count == 0) { - key_store(slot->key, key); // Writes 32 bytes, including garbage in last 4 - slot->count = (uint32_t)(index + 1); // Overwrite last 4 bytes with actual count. 
i trust - // the compiler to optimize this - return i + 1; - } - if (key_cmpeq(slot->key, key)) { - slot->count = (uint32_t)(index + 1); - return i + 1; - } - } - // can't happen, the table is properly sized to fit all rows - __builtin_unreachable(); + return row_hash_.lookup(row_name); } }; @@ -1736,19 +1570,28 @@ template static void allocate_column_outputs(parse_state_t& state, const column_merge_shape_t& shape) { - scoped_timer_t timer("allocate_csr_arrays"); + scoped_timer_t timer("allocate_temp_csr_arrays"); + size_t values_bytes = shape.total_nnz * sizeof(f_t); + size_t indices_bytes = shape.total_nnz * sizeof(i_t); + state.temp_csr_nnz = shape.total_nnz; - // problem_t uses std::vector, so these resize() calls zero-initialize large arrays. - // Running them in parallel hides part of that page-fault and initialization cost. #pragma omp parallel sections num_threads(4) { #pragma omp section { - state.problem.A_.resize(shape.total_nnz); + state.temp_A_region = mmap_region_t::anonymous( + std::max(values_bytes, 1), PROT_READ | PROT_WRITE, MAP_PRIVATE, "temp CSR values"); + state.temp_A = (f_t*)state.temp_A_region.data(); + state.temp_A_region.advise(MADV_HUGEPAGE); } #pragma omp section { - state.problem.A_indices_.resize(shape.total_nnz); + state.temp_A_indices_region = mmap_region_t::anonymous(std::max(indices_bytes, 1), + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + "temp CSR column indices"); + state.temp_A_indices = (i_t*)state.temp_A_indices_region.data(); + state.temp_A_indices_region.advise(MADV_HUGEPAGE); } #pragma omp section { @@ -1788,16 +1631,16 @@ static void scatter_column_chunks_to_csr(parse_state_t& state, size_t col_start = chunk.col_offsets[local_col]; size_t col_end = chunk.col_offsets[local_col + 1]; for (size_t idx = col_start; idx < col_end; idx++) { - i_t row = (i_t)chunk.row_indices[idx]; - size_t row_idx = (size_t)row; - size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; - size_t local = row_idx - block_id * 
COLUMN_ROW_COUNT_BLOCK_ROWS; - int32_t block_pos = chunk.row_count_block_dir[block_id]; - row_count_block_t& block = chunk.row_count_blocks[(size_t)block_pos]; - int64_t& write_pos = chunk.row_count_storage[block.storage_offset + local]; - i_t dest = (i_t)write_pos++; - state.problem.A_[dest] = (f_t)chunk.values[idx]; - state.problem.A_indices_[dest] = global_col; + i_t row = (i_t)chunk.row_indices[idx]; + size_t row_idx = (size_t)row; + size_t block_id = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS; + size_t local = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS; + int32_t block_pos = chunk.row_count_block_dir[block_id]; + row_count_block_t& block = chunk.row_count_blocks[(size_t)block_pos]; + int64_t& write_pos = chunk.row_count_storage[block.storage_offset + local]; + i_t dest = (i_t)write_pos++; + state.temp_A[dest] = (f_t)chunk.values[idx]; + state.temp_A_indices[dest] = global_col; } } #ifdef MPS_FAST_PERF_COUNTERS @@ -1905,6 +1748,66 @@ static void merge_chunk_results_to_csr(parse_state_t& state, state.problem.nnz_ = (i_t)shape.total_nnz; } +template +static void materialize_problem_csr(parse_state_t& state) +{ + scoped_timer_t timer("materialize_problem_csr"); + size_t nnz = state.temp_csr_nnz; + const char* env_threads = std::getenv("MPS_CSR_COPY_THREADS"); + int copy_threads = env_threads ? std::atoi(env_threads) : 2; + copy_threads = std::max(1, std::min(copy_threads, MPS_LARGE_FILE_THREAD_CAP)); + + int resize_threads = copy_threads > 1 ? 
2 : 1; +#pragma omp parallel sections num_threads(resize_threads) + { +#pragma omp section + { + state.problem.A_.resize(nnz); + } +#pragma omp section + { + state.problem.A_indices_.resize(nnz); + } + } + + size_t value_bytes = nnz * sizeof(f_t); + size_t index_bytes = nnz * sizeof(i_t); + size_t total_bytes = value_bytes + index_bytes; + // Copy A_ and A_indices overlapping with the other phases + // this hides the latency costs of heap alloc and default init with other parsing/IO + // instead of making it blocking for the column parse + // TODO: just have A_ and A_indices_ be mmap anon allocs directly in the mps_data_model_t + // but that'd require careful work around avoiding breaking changes and the API esp cython stuff + if (total_bytes != 0) { +#pragma omp parallel for num_threads(copy_threads) schedule(static) + for (int t = 0; t < copy_threads; ++t) { + size_t begin = (total_bytes * (size_t)t) / (size_t)copy_threads; + size_t end = (total_bytes * (size_t)(t + 1)) / (size_t)copy_threads; + if (begin < value_bytes) { + size_t value_end = std::min(end, value_bytes); + if (value_end > begin) { + std::memcpy((char*)state.problem.A_.data() + begin, + (const char*)state.temp_A + begin, + value_end - begin); + } + } + if (end > value_bytes) { + size_t index_begin = begin > value_bytes ? 
begin - value_bytes : 0; + size_t index_end = end - value_bytes; + std::memcpy((char*)state.problem.A_indices_.data() + index_begin, + (const char*)state.temp_A_indices + index_begin, + index_end - index_begin); + } + } + } + + state.temp_A = nullptr; + state.temp_A_indices = nullptr; + state.temp_csr_materialized = true; + state.temp_A_region.reset(); + state.temp_A_indices_region.reset(); +} + template static void parse_columns_section_parallel(parse_state_t& state, int num_threads, @@ -2891,6 +2794,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ int rhs_ready = 0, bounds_ready = 0, ranges_ready = 0, quadratic_ready = 0; int header_done = 0, rows_done = 0, columns_done = 0; int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0; + int csr_done = 0; const std::size_t parser_size = std::max(stream.reserve_size_hint(), input.compressed_size); const int parser_threads = parser_thread_cap_for_size(parser_size); @@ -2988,6 +2892,14 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ }); } +#pragma omp task depend(in : columns_done) depend(out : csr_done) + { + run_parser_task([&] { + MPS_NVTX_RANGE("task_materialize_csr", nvtx::colors::alloc); + materialize_problem_csr(state); + }); + } + #pragma omp task depend(in : rhs_ready, columns_done) depend(out : rhs_done) { run_parser_task([&] { diff --git a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp b/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp deleted file mode 100644 index 0d14f059bc..0000000000 --- a/cpp/src/io/experimental_mps_fast/fast_parser_adapter.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#include - -#include "fast_parser.hpp" - -#include - -#include - -namespace cuopt::linear_programming::io { - -template -mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path) -{ - CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str()); - return mps_fast::parse_mps_fast_file(mps_file_path); -} - -template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); -template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); -template mps_data_model_t read_mps_fast_experimental( - const std::string& mps_file_path); -template mps_data_model_t read_mps_fast_experimental( - const std::string& mps_file_path); - -} // namespace cuopt::linear_programming::io diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp index 7d367db941..4d47b23c10 100644 --- a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp +++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp @@ -5,14 +5,33 @@ #pragma once +#include "mmap_region.hpp" + +#include + #include +#include + +#include +#include #include #include #include +#ifdef MPS_FAST_PERF_COUNTERS +#include +#endif +#include +#include +#include namespace mps_fast { +// below this threshold, the serial row-hash build is usually cheaper than partition setup +inline constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * 1024; +inline constexpr int MPS_ROW_HASH_PARTITION_BITS = 5; +inline constexpr size_t MPS_ROW_HASH_PARTITIONS = (1 << MPS_ROW_HASH_PARTITION_BITS); + // FNV-1a over bytes in reverse order; row names commonly share long prefixes. static inline uint32_t fnv1a_hash(const char* ptr, std::size_t len) { @@ -30,9 +49,6 @@ static inline uint32_t fnv1a_hash(const char* ptr, std::size_t len) } // 28-byte inline key + uint32 payload: two slots per 64-byte cache line. 
-// key_store writes a full 32-byte vector starting at key[0], so callers must -// publish the payload after storing the key. key_cmpeq masks those payload lanes -// away, leaving the trailing uint32 free for the row index + 1 sentinel. struct alignas(32) hash_slot_28_t { char key[28]; uint32_t count; @@ -65,4 +81,224 @@ static inline void key_store(char* slot_key, hash_key_t key) simde_mm256_store_si256(reinterpret_cast(slot_key), key); } +struct hash_partition_t { + hash_slot_var_t* slots = nullptr; + size_t buckets = 0; + size_t mask = 0; +}; + +static inline size_t hash_partition_for(uint32_t hash) +{ + return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS)); +} + +static inline size_t hash_bucket_count_for(size_t n_rows, bool compact) +{ + if (compact) { return cuda::next_power_of_two(std::max(n_rows + n_rows / 2, (size_t)64)); } + return cuda::next_power_of_two(std::max(n_rows * 2, (size_t)64)); +} + +static inline size_t hash_lookup_in( + const hash_slot_var_t* slots, size_t buckets, size_t mask, hash_key_t key, uint32_t hash) +{ + const hash_slot_var_t* slot = &slots[hash & (uint32_t)mask]; + for (size_t i = 0; i < buckets; ++i, ++slot) { + if (slot >= &slots[buckets]) { slot = &slots[0]; } + if (slot->count == 0) { return std::numeric_limits::max(); } + if (key_cmpeq(slot->key, key)) { return slot->count - 1; } + } + return std::numeric_limits::max(); +} + +static inline size_t hash_insert_into(hash_slot_var_t* slots, + size_t buckets, + size_t mask, + std::string_view name, + uint32_t hash, + size_t index) +{ + hash_key_t key = make_key(name.data(), name.size()); + hash_slot_var_t* slot = &slots[hash & (uint32_t)mask]; + for (size_t i = 0; i < buckets; ++i, ++slot) { + if (slot >= &slots[buckets]) { slot = &slots[0]; } + if (slot->count == 0) { + key_store(slot->key, key); + slot->count = (uint32_t)(index + 1); + return i + 1; + } + if (key_cmpeq(slot->key, key)) { + slot->count = (uint32_t)(index + 1); + return i + 1; + } + } + 
__builtin_unreachable(); +} + +#ifdef MPS_FAST_PERF_COUNTERS +struct hash_build_probe_stats_t { + size_t total_probes = 0; + size_t max_probes = 0; + size_t long_names = 0; + + void seed_long_names(size_t n) { long_names = n; } + + void record_insert(size_t probes) + { + if (probes == 0) { + ++long_names; + } else { + total_probes += probes; + max_probes = std::max(max_probes, probes); + } + } + + void merge(const hash_build_probe_stats_t& other) + { + total_probes += other.total_probes; + max_probes = std::max(max_probes, other.max_probes); + long_names += other.long_names; + } +}; +#endif + +class smallstr_hash_table_t { + public: + void note_long_name(std::string_view name, size_t index) { long_names_[name] = index; } + + size_t long_name_count() const { return long_names_.size(); } + + void reset_build_probe_stats() + { +#ifdef MPS_FAST_PERF_COUNTERS + build_probe_stats_ = {}; + build_probe_stats_.seed_long_names(long_names_.size()); + partition_probe_stats_ = {}; +#endif + } + + void print_build_probe_report(size_t n_rows) const + { +#ifdef MPS_FAST_PERF_COUNTERS + hash_build_probe_stats_t stats = build_probe_stats_; + if (partition_count_ != 0) { + for (size_t p = 0; p < partition_count_; ++p) { + stats.merge(partition_probe_stats_[p]); + } + } + size_t probed_rows = n_rows - stats.long_names; + double mean_probes = probed_rows == 0 ? 0.0 : (double)stats.total_probes / (double)probed_rows; + double load_factor = buckets_ == 0 ? 
0.0 : (double)n_rows / (double)buckets_; + std::fprintf(stderr, + "[ROW_HASH_PROBES] rows=%zu buckets=%zu load=%.3f long=%zu mean=%.3f max=%zu\n", + n_rows, + buckets_, + load_factor, + stats.long_names, + mean_probes, + stats.max_probes); +#endif + } + + void configure_serial_buckets(size_t n_rows, bool compact) + { + partition_count_ = 0; + buckets_ = hash_bucket_count_for(n_rows, compact); + mask_ = buckets_ - 1; + } + + void configure_partitioned_buckets( + const std::array& partition_counts, bool compact) + { + partition_count_ = MPS_ROW_HASH_PARTITIONS; + buckets_ = 0; + for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) { + partitions_[p].buckets = hash_bucket_count_for(partition_counts[p], compact); + partitions_[p].mask = partitions_[p].buckets - 1; + buckets_ += partitions_[p].buckets; + } + mask_ = buckets_ - 1; + } + + void allocate_mmap(const char* label) + { + size_t mmap_size = buckets_ * sizeof(hash_slot_var_t); + region_ = mmap_region_t::anonymous(mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, label); + slots_ = (hash_slot_var_t*)region_.data(); + if (partition_count_ != 0) { + hash_slot_var_t* next_slots = slots_; + for (size_t p = 0; p < partition_count_; ++p) { + partitions_[p].slots = next_slots; + next_slots += partitions_[p].buckets; + } + } + region_.advise(MADV_HUGEPAGE); + } + + mmap_region_t& region() noexcept { return region_; } + const mmap_region_t& region() const noexcept { return region_; } + + hash_slot_var_t* slots() noexcept { return slots_; } + const hash_slot_var_t* slots() const noexcept { return slots_; } + + size_t buckets() const noexcept { return buckets_; } + size_t mask() const noexcept { return mask_; } + size_t partition_count() const noexcept { return partition_count_; } + + const hash_partition_t& partition(size_t p) const noexcept { return partitions_[p]; } + + size_t lookup(std::string_view name) const + { + if (name.size() > HASH_KEY_BYTES) { + auto it = long_names_.find(name); + return it != long_names_.end() ? 
it->second : std::numeric_limits::max(); + } + hash_key_t key = make_key(name.data(), name.size()); + uint32_t hash = fnv1a_hash(name.data(), name.size()); + if (partition_count_ != 0) { + const auto& part = partitions_[hash_partition_for(hash)]; + return hash_lookup_in(part.slots, part.buckets, part.mask, key, hash); + } + return hash_lookup_in(slots_, buckets_, mask_, key, hash); + } + + size_t insert_serial(std::string_view name, size_t index) + { + size_t probes; + if (name.size() > HASH_KEY_BYTES) { + note_long_name(name, index); + probes = 0; + } else { + probes = hash_insert_into( + slots_, buckets_, mask_, name, fnv1a_hash(name.data(), name.size()), index); + } +#ifdef MPS_FAST_PERF_COUNTERS + build_probe_stats_.record_insert(probes); +#endif + return probes; + } + + size_t insert_partition(size_t partition, std::string_view name, uint32_t hash, size_t index) + { + const auto& part = partitions_[partition]; + size_t probes = hash_insert_into(part.slots, part.buckets, part.mask, name, hash, index); +#ifdef MPS_FAST_PERF_COUNTERS + partition_probe_stats_[partition].record_insert(probes); +#endif + return probes; + } + + private: + mmap_region_t region_; + hash_slot_var_t* slots_ = nullptr; + size_t buckets_ = 0; + size_t mask_ = 0; + size_t partition_count_ = 0; + std::array partitions_{}; + std::unordered_map long_names_{}; +#ifdef MPS_FAST_PERF_COUNTERS + hash_build_probe_stats_t build_probe_stats_{}; + std::array partition_probe_stats_{}; +#endif +}; + } // namespace mps_fast diff --git a/cpp/src/io/parser.cpp b/cpp/src/io/parser.cpp index 93d9d9c73c..6392833ce3 100644 --- a/cpp/src/io/parser.cpp +++ b/cpp/src/io/parser.cpp @@ -7,8 +7,13 @@ #include +#include #include +#include + +#include + namespace cuopt::linear_programming::io { template @@ -35,4 +40,18 @@ template mps_data_model_t read_mps_from_string(std::string_view mps_ template mps_data_model_t read_mps_from_string(std::string_view mps_contents, bool fixed_mps_format); +template +mps_data_model_t 
read_mps_fast_experimental(const std::string& mps_file_path) +{ + CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str()); + return mps_fast::parse_mps_fast_file(mps_file_path); +} + +template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); +template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); +template mps_data_model_t read_mps_fast_experimental( + const std::string& mps_file_path); +template mps_data_model_t read_mps_fast_experimental( + const std::string& mps_file_path); + } // namespace cuopt::linear_programming::io From 26141370ec40acf7a80065f18c935a7269251a3e Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 12 Jun 2026 03:53:25 -0700 Subject: [PATCH 11/22] more cleanup, fix some edge case failures --- .../fast_fp64_parser.hpp | 4 +- .../fast_parse_primitives.hpp | 4 +- .../io/experimental_mps_fast/fast_parser.cpp | 129 ++++----- .../io/experimental_mps_fast/fast_parser.hpp | 6 +- .../io/experimental_mps_fast/file_reader.cpp | 151 ++++------ .../io/experimental_mps_fast/file_reader.hpp | 148 ++++++++-- .../hash_table_smallstr.hpp | 4 +- .../experimental_mps_fast/lz4_file_reader.cpp | 270 ++++++++---------- .../io/experimental_mps_fast/mmap_region.hpp | 4 +- .../mps_section_scanner.cpp | 42 ++- .../mps_section_scanner.hpp | 21 +- .../io/experimental_mps_fast/nvtx_ranges.hpp | 21 +- cpp/src/io/mps_parser.cpp | 6 +- cpp/src/io/parser.cpp | 2 +- cpp/src/utilities/perf_counters.hpp | 4 +- .../fast_fp64_parser_test.cpp | 41 +-- .../fast_parser_edge_test.cpp | 194 +++++-------- cpp/tests/linear_programming/parser_test.cpp | 16 +- 18 files changed, 548 insertions(+), 519 deletions(-) diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp index b7987738fc..f007c0f707 100644 --- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp @@ -15,7
+15,7 @@ #include #include -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { using cuopt::linear_programming::io::error_type_t; using cuopt::linear_programming::io::mps_parser_expects; @@ -428,4 +428,4 @@ static inline double parse_fp64_advance(const char*& p, const char* end) } } // namespace fp64 -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp index f35726a118..f77e14a410 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp @@ -20,7 +20,7 @@ #define UNLIKELY(x) __builtin_expect(!!(x), 0) #endif -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { enum scan_mode { skip_whitespace, @@ -379,4 +379,4 @@ static inline bool accept_comment(cursor_t& cursor) return false; } -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index 3e47c7ee8c..35e83a01aa 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -47,7 +48,7 @@ #define MPS_FAST_COMPACT_ROW_HASH #define MPS_FAST_THP_PREFAULT -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { static constexpr size_t KiB = 1024; static constexpr size_t MiB = 1024 * KiB; @@ -171,12 +172,11 @@ enum class materialize_touch_t { // instanciate a range using mmap anon pages with hugepage hints, and materialize them // by touching each to nudge the kernel into invoking its THP mechanism -static void materialize_hugepages(const char* label, +static void materialize_hugepages([[maybe_unused]] const char* label, void* data, size_t bytes, 
materialize_touch_t touch) { - (void)label; if (data == nullptr || bytes == 0) return; constexpr size_t two_mb = 2 * MiB; @@ -208,7 +208,7 @@ static void materialize_vector_hugepages(const char* label, class scoped_timer_t { public: - scoped_timer_t(const char* name, double* accumulator = nullptr) + scoped_timer_t([[maybe_unused]] const char* name, double* accumulator = nullptr) #ifdef MPS_FAST_TIMERS : name_(name), accumulator_(accumulator), @@ -217,7 +217,6 @@ class scoped_timer_t { #else : accumulator_(accumulator) { - (void)name; } #endif @@ -441,7 +440,7 @@ static inline void observe_dense_name(bool& candidate, template struct parse_state_t { - cuopt::linear_programming::io::mps_data_model_t& problem; + mps_data_model_t& problem; cursor_t& cursor; // backed by the input buffer @@ -484,10 +483,7 @@ struct parse_state_t { // some writers introduce zero-column variables only in BOUNDS. std::map bounds_only_vars; - parse_state_t(cuopt::linear_programming::io::mps_data_model_t& p, cursor_t& c) - : problem(p), cursor(c) - { - } + parse_state_t(mps_data_model_t& p, cursor_t& c) : problem(p), cursor(c) {} void init_row_hash_table() { @@ -718,7 +714,8 @@ static void parse_objname_section(parse_state_t& state) { scoped_timer_t timer("parse_objname"); if (accept(state.cursor, "OBJNAME")) { - if (!state.cursor.eol()) { state.objective_name_sv = state.cursor.read_rest_of_line_trimmed(); } + if (state.cursor.eol()) { expect_eol(state.cursor); } + state.objective_name_sv = state.cursor.read_field(); accept_comment(state.cursor); expect_eol(state.cursor); } @@ -1752,10 +1749,9 @@ template static void materialize_problem_csr(parse_state_t& state) { scoped_timer_t timer("materialize_problem_csr"); - size_t nnz = state.temp_csr_nnz; - const char* env_threads = std::getenv("MPS_CSR_COPY_THREADS"); - int copy_threads = env_threads ? 
std::atoi(env_threads) : 2; - copy_threads = std::max(1, std::min(copy_threads, MPS_LARGE_FILE_THREAD_CAP)); + size_t nnz = state.temp_csr_nnz; + int copy_threads = 2; + copy_threads = std::max(1, std::min(copy_threads, MPS_LARGE_FILE_THREAD_CAP)); int resize_threads = copy_threads > 1 ? 2 : 1; #pragma omp parallel sections num_threads(resize_threads) @@ -1904,8 +1900,7 @@ static void parse_rhs_section(parse_state_t& state, cursor_t& cursor) }; while (cursor.ptr < cursor.end) { - auto rhs_name = cursor.read_field(); - (void)rhs_name; + [[maybe_unused]] auto rhs_name = cursor.read_field(); if (accept_comment(cursor)) { expect_eol(cursor); continue; @@ -2075,9 +2070,8 @@ static bool parse_bounds_section_parallel_dense(parse_state_t& state, continue; } - auto bound_name = cursor.read_field(); - (void)bound_name; - auto var_name = cursor.read_field(); + [[maybe_unused]] auto bound_name = cursor.read_field(); + auto var_name = cursor.read_field(); if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); @@ -2274,10 +2268,9 @@ static void parse_bounds_section(parse_state_t& state, { scoped_timer_t timer("parse_bounds"); while (!cursor.done()) { - auto bound_type = cursor.read_field(); - auto bound_name = cursor.read_field(); - (void)bound_name; - auto var_name = cursor.read_field(); + auto bound_type = cursor.read_field(); + [[maybe_unused]] auto bound_name = cursor.read_field(); + auto var_name = cursor.read_field(); if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) { cursor.skip_to_eol(); expect_eol(cursor); @@ -2335,7 +2328,7 @@ static void parse_bounds_section(parse_state_t& state, } cursor.error("%s", msg); }; - (void)apply_bound_record( + [[maybe_unused]] bool bound_applied = apply_bound_record( bound_type, value, has_value, first_bound_for_var, set_lb, set_ub, set_type, set_error); if (aux_var == nullptr) { mark_bound(var_idx); } @@ -2401,8 +2394,7 @@ static void parse_ranges_section(parse_state_t& state, cursor_t& 
curso }; while (cursor.ptr < cursor.end) { - auto range_name = cursor.read_field(); - (void)range_name; + [[maybe_unused]] auto range_name = cursor.read_field(); if (accept_comment(cursor)) { expect_eol(cursor); continue; @@ -2716,8 +2708,8 @@ static void append_bounds_only_variables(parse_state_t& state) } template -static std::size_t init_problem_storage( - cuopt::linear_programming::io::mps_data_model_t& problem, std::size_t reserve_hint) +static std::size_t init_problem_storage(mps_data_model_t& problem, + std::size_t reserve_hint) { problem.n_vars_ = 0; problem.n_constraints_ = 0; @@ -2741,15 +2733,30 @@ static std::size_t init_problem_storage( return reserve_dim; } -template -static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_stream( - Stream& stream, const char* total_timer_name, const char* producer_task_name) +// Contract every input stream fed to parse_mps_fast_stream must satisfy. +template +concept InputStream = requires(Stream stream) +{ + {stream.data()}->std::convertible_to; + {stream.mutable_data()}->std::convertible_to; + {stream.size()}->std::convertible_to; + {stream.compressed_size()}->std::convertible_to; + {stream.reserve_size_hint()}->std::convertible_to; + {stream.registry()}->std::same_as; + {stream.view()}->std::same_as; + {stream.run_decode_tasks()}->std::same_as; +}; + +template +static mps_data_model_t parse_mps_fast_stream(Stream& stream, + const char* total_timer_name, + const char* producer_task_name) { omp_max_active_levels_guard_t omp_active_levels(2); input_stream_view_t input = stream.view(); auto total_timer = std::make_unique(total_timer_name); - cuopt::linear_programming::io::mps_data_model_t problem; + mps_data_model_t problem; std::size_t reserve_dim = init_problem_storage(problem, stream.reserve_size_hint()); cursor_t cursor(input.data, 0); @@ -2758,24 +2765,14 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ auto phase_end = [](const char*) { flush_timers(); }; - std::mutex 
task_error_mutex; - std::exception_ptr first_task_error = nullptr; - std::atomic task_failed{false}; - - auto mark_task_error = [&](std::exception_ptr eptr) { - { - std::lock_guard lock(task_error_mutex); - if (!first_task_error) { first_task_error = eptr; } - } - task_failed.store(true, std::memory_order_release); - }; + parallel_error_latch_t parser_tasks; auto run_parser_task = [&](auto&& fn) { - if (task_failed.load(std::memory_order_acquire)) { return; } + if (parser_tasks.stopped()) { return; } try { fn(); } catch (...) { - mark_task_error(std::current_exception()); + parser_tasks.capture(std::current_exception()); } }; @@ -2851,7 +2848,7 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ try { stream.run_decode_tasks(); } catch (...) { - mark_task_error(std::current_exception()); + parser_tasks.capture(std::current_exception()); unblock_phase_waiters_after_error(); } } @@ -2940,19 +2937,20 @@ static cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_ } } - if (first_task_error) { std::rethrow_exception(first_task_error); } + parser_tasks.rethrow_if_error(); append_bounds_only_variables(state); input.size = stream.size(); cursor.end = input.data + input.size; - if (!input.registry->endata_ready() || !input.registry->endata_present()) { - cursor.ptr = - input.registry->endata_ready() ? 
input.registry->endata_begin() : input.data + input.size; - cursor.error("missing ENDATA"); + if (!input.registry->endata_ready()) { + cursor.ptr = input.data + input.size; + cursor.error("input ended before ENDATA boundary was resolved"); + } + if (input.registry->endata_present()) { + cursor.ptr = input.registry->endata_begin(); + expect(cursor, "ENDATA"); } - cursor.ptr = input.registry->endata_begin(); - expect(cursor, "ENDATA"); total_timer.reset(); flush_timers(); @@ -2967,7 +2965,7 @@ struct padded_memory_input_t { static padded_memory_input_t read_compressed_mps_file(const std::string& path) { - std::vector buffer = cuopt::linear_programming::io::detail::file_to_string(path); + std::vector buffer = file_to_string(path); if (buffer.empty()) { buffer.push_back('\0'); } std::size_t input_size = buffer.size() - 1; @@ -2976,8 +2974,7 @@ static padded_memory_input_t read_compressed_mps_file(const std::string& path) } template -cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( - const std::string& path, FileReadMethod read_method) +mps_data_model_t parse_mps_fast_file(const std::string& path, FileReadMethod read_method) { FileReadMethod effective_method = effective_file_read_method(path, read_method); switch (effective_method) { @@ -3006,13 +3003,13 @@ cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( __builtin_unreachable(); } -template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( - const std::string& path, FileReadMethod read_method); -template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( - const std::string& path, FileReadMethod read_method); -template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( - const std::string& path, FileReadMethod read_method); -template cuopt::linear_programming::io::mps_data_model_t parse_mps_fast_file( - const std::string& path, FileReadMethod read_method); +template mps_data_model_t parse_mps_fast_file(const std::string& 
path, + FileReadMethod read_method); +template mps_data_model_t parse_mps_fast_file(const std::string& path, + FileReadMethod read_method); +template mps_data_model_t parse_mps_fast_file(const std::string& path, + FileReadMethod read_method); +template mps_data_model_t parse_mps_fast_file(const std::string& path, + FileReadMethod read_method); -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_parser.hpp index 9f6f0f107b..6047a55f05 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.hpp @@ -10,13 +10,13 @@ #include #include -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { template -using parser_model_t = cuopt::linear_programming::io::mps_data_model_t; +using parser_model_t = mps_data_model_t; template parser_model_t parse_mps_fast_file(const std::string& path, FileReadMethod read_method = FileReadMethod::Read); -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index 5eae15a46a..e874011db8 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -26,7 +26,7 @@ #include #include -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { using cuopt::linear_programming::io::error_type_t; using cuopt::linear_programming::io::mps_parser_fail; @@ -104,6 +104,27 @@ std::size_t system_page_size() return page_size; } +bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset) +{ + std::size_t done = 0; + while (done < bytes) { + std::size_t remaining = bytes - done; + std::size_t chunk = + std::min(remaining, (std::size_t)std::numeric_limits::max()); + ssize_t got = ::pread(fd, dst + done, chunk, (off_t)(offset + done)); + if (got 
< 0) { + if (errno == EINTR) { continue; } + return false; + } + if (got == 0) { + errno = EIO; + return false; + } + done += (std::size_t)got; + } + return true; +} + raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) { MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io); @@ -118,9 +139,6 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) file_size_ = get_file_size(buffered_fd_, path); fd_ = buffered_fd_; bool use_direct_io = file_size_ > raw_input_direct_io_threshold_bytes; - if (const char* raw_direct = std::getenv("MPS_FAST_RAW_DIRECT_IO")) { - use_direct_io = raw_direct[0] != '0'; - } if (use_direct_io) { #ifdef O_DIRECT int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT); @@ -140,8 +158,6 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) output_data_ = output_region_.char_data(); output_region_.advise(MADV_HUGEPAGE); - block_done_.resize(window_count_, 0); - block_end_.resize(window_count_, 0); section_scanner_ = std::make_unique(output_data_, window_count_, registry_); } @@ -157,10 +173,20 @@ char* raw_input_stream_t::mutable_data() noexcept { return output_data_; } std::size_t raw_input_stream_t::size() const noexcept { return output_view_size_; } std::size_t raw_input_stream_t::compressed_size() const noexcept { return file_size_; } std::size_t raw_input_stream_t::reserve_size_hint() const noexcept { return file_size_; } -mps_phase_registry_t& raw_input_stream_t::registry() noexcept { return registry_; } -input_stream_view_t raw_input_stream_t::view() noexcept + +void raw_input_stream_t::read_window_payload(std::size_t offset, std::size_t size) { - return {output_data_, output_data_, output_view_size_, file_size_, ®istry_}; + if (pread_full(fd_, output_data_ + offset, size, offset)) { return; } + // O_DIRECT can reject an unaligned request with EINVAL; fall back to the + // buffered descriptor for this window when that happens. 
+ if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0 && + pread_full(buffered_fd_, output_data_ + offset, size, offset)) { + return; + } + mps_parser_fail(error_type_t::RuntimeError, + "Failed to pread raw MPS file '%s': %s", + path_.c_str(), + std::strerror(errno)); } void raw_input_stream_t::run_decode_tasks() @@ -177,96 +203,24 @@ void raw_input_stream_t::run_decode_tasks() std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads); thread_count = std::max(1, std::min(thread_count, window_count_)); - std::atomic_size_t next_window{0}; - std::exception_ptr first_error = nullptr; - std::mutex error_mutex; - std::atomic_bool stop{false}; - - auto mark_error = [&](std::exception_ptr eptr) { - std::lock_guard lock(error_mutex); - if (!first_error) { - first_error = eptr; - stop.store(true, std::memory_order_release); - } - }; - - auto read_window = [&](std::size_t index) { - MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io); - std::size_t offset = index * window_bytes_; - std::size_t size = std::min(window_bytes_, file_size_ - offset); - std::size_t done = 0; - { - MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io); - while (done < size) { - ssize_t got = - ::pread(fd_, output_data_ + offset + done, size - done, (off_t)(offset + done)); - if (got < 0) { - if (errno == EINTR) { continue; } - if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0) { - got = ::pread( - buffered_fd_, output_data_ + offset + done, size - done, (off_t)(offset + done)); - if (got >= 0) { - done += (std::size_t)got; - continue; - } - if (errno == EINTR) { continue; } - } - mps_parser_fail(error_type_t::RuntimeError, - "Failed to pread raw MPS file '%s': %s", - path_.c_str(), - std::strerror(errno)); - } - if (got == 0) { - mps_parser_fail(error_type_t::RuntimeError, - "Unexpected EOF while reading raw MPS file '%s'", - path_.c_str()); - } - done += (std::size_t)got; + // Each window is read independently and handed to the scanner, which owns the + // contiguous 
decoded-byte frontier and the parallel section publication. + parallel_error_latch_t latch; + parallel_for_indexed( + window_count_, thread_count, latch, "raw-input-read-", [&](std::size_t index) { + MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io); + std::size_t offset = index * window_bytes_; + std::size_t size = std::min(window_bytes_, file_size_ - offset); + { + MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io); + read_window_payload(offset, size); } - } - - { MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io); section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size); - frontier_mutex_.lock(); - block_done_[index] = 1; - block_end_[index] = offset + size; - std::size_t before = ready_bytes_; - while (next_block_ < block_done_.size() && block_done_[next_block_]) { - ready_bytes_ = block_end_[next_block_]; - ++next_block_; - } - std::size_t after = ready_bytes_; - frontier_mutex_.unlock(); - if (after > before) { section_scanner_->publish_ready(after); } - } - }; - - std::vector workers; - workers.reserve(thread_count); - for (std::size_t t = 0; t < thread_count; ++t) { - workers.emplace_back([&, t] { - std::string thread_name = "raw-input-read-" + std::to_string(t); - nvtx::name_current_thread(thread_name.c_str()); - MPS_NVTX_RANGE("raw_worker_loop", nvtx::colors::io); - while (!stop.load(std::memory_order_acquire)) { - std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed); - if (index >= window_count_) { break; } - try { - read_window(index); - } catch (...) 
{ - mark_error(std::current_exception()); - return; - } - } }); - } - for (auto& worker : workers) { - worker.join(); - } - if (first_error) { std::rethrow_exception(first_error); } + latch.rethrow_if_error(); - output_view_size_ = ready_bytes_; + output_view_size_ = section_scanner_->ready_bytes(); section_scanner_->publish_ready(output_view_size_); } @@ -283,17 +237,12 @@ char* memory_input_stream_t::mutable_data() noexcept { return buffer_.data(); } std::size_t memory_input_stream_t::size() const noexcept { return input_size_; } std::size_t memory_input_stream_t::compressed_size() const noexcept { return compressed_size_; } std::size_t memory_input_stream_t::reserve_size_hint() const noexcept { return input_size_; } -mps_phase_registry_t& memory_input_stream_t::registry() noexcept { return registry_; } -input_stream_view_t memory_input_stream_t::view() noexcept -{ - return {buffer_.data(), buffer_.data(), input_size_, compressed_size_, ®istry_}; -} void memory_input_stream_t::run_decode_tasks() { MPS_NVTX_RANGE("memory_input_scan", nvtx::colors::io); + // Single block: observe_block advances the frontier and publishes. section_scanner_->observe_block(0, buffer_.data(), buffer_.data() + input_size_); - section_scanner_->publish_ready(input_size_); } bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); } @@ -332,4 +281,4 @@ const char* file_read_method_name(FileReadMethod method) noexcept } } -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp index b0089be257..802d6fe191 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.hpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp @@ -1,20 +1,45 @@ // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights // reserved. 
SPDX-License-Identifier: Apache-2.0 +// Input layer for the fast MPS parser: turns on-disk bytes (plain or .lz4) into one +// contiguous parse buffer and publishes MPS section boundaries as data becomes available. +// +// Model: +// - Output is an anonymous mmap'd buffer (THP-hinted, tail-padded for SIMD/cursor safety). +// Raw inputs pread directly into fixed slots; LZ4 decodes into the same layout. +// - Work is split into windows (fixed spans of compressed/raw file bytes). Workers use +// parallel_for_indexed() — std::thread + shared-index dispatch, not OpenMP — because +// blocking pread()/decode does not compose cleanly with OMP team barriers. +// - Each completed window/block is handed to mps_section_block_scanner_t::observe_block(). +// Blocks may finish out of order; the scanner advances a contiguous ready_bytes_ +// frontier and publishes section ranges into mps_phase_registry_t only once the prefix +// up to a section title is contiguous and scannable. +// - The parser runs as OpenMP tasks on those published phases while run_decode_tasks() +// (raw parallel pread, or the LZ4 reader → metadata scanner → decoder pipeline) fills +// the buffer on separate threads. parallel_error_latch_t propagates the first worker +// failure and stops the rest. +// +// LZ4 adds a resident-window pool (parallel pread of compressed spans), block metadata +// scanning with ptr_if_contiguous()/copy_to for window-boundary payloads, parallel decode +// workers, window ref-counting/release, and lazy commit_up_to() of decoded output pages. 
+ #pragma once #include "mmap_region.hpp" #include "mps_section_scanner.hpp" +#include "nvtx_ranges.hpp" #include #include #include +#include #include #include #include +#include #include -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { inline constexpr std::size_t input_buffer_padding_bytes = 64; @@ -62,6 +87,80 @@ std::size_t system_page_size(); std::size_t get_file_size(int fd, const std::string& path); std::size_t get_file_size(const std::string& path); +/** + * @brief Read exactly @p bytes at @p offset into @p dst, retrying on EINTR. + * + * Returns false and leaves errno set on error or unexpected EOF. + */ +bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset); + +// First-error-wins latch shared by the parallel reader/decoder pipelines. The +// first captured exception is retained and a stop flag is raised so cooperating +// workers can unwind promptly. The retained exception is rethrown by the +// orchestrating thread once all workers have joined. +class parallel_error_latch_t { + public: + void capture(std::exception_ptr eptr) + { + std::lock_guard lock(mutex_); + if (!first_error_) { + first_error_ = eptr; + stopped_.store(true, std::memory_order_release); + } + } + + bool stopped() const noexcept { return stopped_.load(std::memory_order_acquire); } + + void rethrow_if_error() const + { + if (first_error_) { std::rethrow_exception(first_error_); } + } + + private: + std::mutex mutex_; + std::exception_ptr first_error_ = nullptr; + std::atomic_bool stopped_{false}; +}; + +// Work-stealing parallel loop over [0, count). Each of thread_count workers pulls +// the next index from a shared counter and invokes body(index). An exception +// escaping body is captured into the latch and stops the loop; the caller is +// responsible for calling latch.rethrow_if_error() after this returns. Workers +// are named "" when a prefix is supplied. 
+// OMP just doesn't really play well with blocking pread() +template +void parallel_for_indexed(std::size_t count, + std::size_t thread_count, + parallel_error_latch_t& latch, + const char* thread_name_prefix, + Body body) +{ + std::atomic_size_t next{0}; + std::vector workers; + workers.reserve(thread_count); + for (std::size_t t = 0; t < thread_count; ++t) { + workers.emplace_back([&, t] { + if (thread_name_prefix != nullptr) { + std::string name = thread_name_prefix + std::to_string(t); + nvtx::name_current_thread(name.c_str()); + } + while (!latch.stopped()) { + std::size_t index = next.fetch_add(1, std::memory_order_relaxed); + if (index >= count) { break; } + try { + body(index); + } catch (...) { + latch.capture(std::current_exception()); + return; + } + } + }); + } + for (auto& worker : workers) { + worker.join(); + } +} + struct input_stream_view_t { const char* data = nullptr; char* mutable_data = nullptr; @@ -70,7 +169,28 @@ struct input_stream_view_t { mps_phase_registry_t* registry = nullptr; }; -class lz4_input_stream_t { +/** + * @brief CRTP base supplying the registry and view() shared by every input + * stream. Derived classes provide data()/mutable_data()/size()/compressed_size(). 
+ */ +template +class input_stream_base_t { + public: + mps_phase_registry_t& registry() noexcept { return registry_; } + + input_stream_view_t view() noexcept + { + auto* self = static_cast(this); + return {self->data(), self->mutable_data(), self->size(), self->compressed_size(), ®istry_}; + } + + protected: + mps_phase_registry_t registry_; +}; + +// Handles lz4 compressed files (useful since lz4 is very fast, works well for MPS, and makes +// parallel decompression trivial) +class lz4_input_stream_t : public input_stream_base_t { public: explicit lz4_input_stream_t(const std::string& path); ~lz4_input_stream_t(); @@ -83,8 +203,6 @@ class lz4_input_stream_t { std::size_t size() const noexcept; std::size_t compressed_size() const noexcept; std::size_t reserve_size_hint() const noexcept; - mps_phase_registry_t& registry() noexcept; - input_stream_view_t view() noexcept; void run_decode_tasks(); @@ -108,13 +226,13 @@ class lz4_input_stream_t { bool block_checksum_ = false; bool content_checksum_ = false; bool dict_id_ = false; - mps_phase_registry_t registry_; std::mutex commit_mutex_; std::unique_ptr section_scanner_; std::size_t block_slot_count_ = 0; }; -class raw_input_stream_t { +// Takes a file path +class raw_input_stream_t : public input_stream_base_t { public: explicit raw_input_stream_t(const std::string& path); ~raw_input_stream_t(); @@ -127,12 +245,12 @@ class raw_input_stream_t { std::size_t size() const noexcept; std::size_t compressed_size() const noexcept; std::size_t reserve_size_hint() const noexcept; - mps_phase_registry_t& registry() noexcept; - input_stream_view_t view() noexcept; void run_decode_tasks(); private: + void read_window_payload(std::size_t offset, std::size_t size); + std::string path_; int fd_ = -1; int buffered_fd_ = -1; @@ -144,16 +262,11 @@ class raw_input_stream_t { std::size_t file_size_ = 0; std::size_t window_bytes_ = 0; std::size_t window_count_ = 0; - mps_phase_registry_t registry_; - std::mutex frontier_mutex_; - 
std::vector block_done_; - std::vector block_end_; std::unique_ptr section_scanner_; - std::size_t next_block_ = 0; - std::size_t ready_bytes_ = 0; }; -class memory_input_stream_t { +// Takes an in-memory buffer +class memory_input_stream_t : public input_stream_base_t { public: memory_input_stream_t(std::vector buffer, std::size_t input_size, @@ -167,8 +280,6 @@ class memory_input_stream_t { std::size_t size() const noexcept; std::size_t compressed_size() const noexcept; std::size_t reserve_size_hint() const noexcept; - mps_phase_registry_t& registry() noexcept; - input_stream_view_t view() noexcept; void run_decode_tasks(); @@ -176,8 +287,7 @@ class memory_input_stream_t { std::vector buffer_; std::size_t input_size_ = 0; std::size_t compressed_size_ = 0; - mps_phase_registry_t registry_; std::unique_ptr section_scanner_; }; -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp index 4d47b23c10..b7138fedb6 100644 --- a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp +++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp @@ -25,7 +25,7 @@ #include #include -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { // below this threshold, the serial row-hash build is usually cheaper than partition setup inline constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * 1024; @@ -301,4 +301,4 @@ class smallstr_hash_table_t { #endif }; -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index 9c47ba63c7..2c40d6745b 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -38,7 +38,7 @@ #include #include -namespace mps_fast { +namespace 
cuopt::linear_programming::io::detail { using cuopt::linear_programming::io::error_type_t; using cuopt::linear_programming::io::mps_parser_expects; @@ -108,15 +108,14 @@ const lz4_runtime_t& lz4_runtime() } #endif -int lz4_decompress_safe_runtime(const char* src, char* dst, int compressed_size, int dst_capacity) +int lz4_decompress_safe_runtime([[maybe_unused]] const char* src, + [[maybe_unused]] char* dst, + [[maybe_unused]] int compressed_size, + [[maybe_unused]] int dst_capacity) { #if defined(MPS_PARSER_WITH_LZ4) return lz4_runtime().decompress_safe(src, dst, compressed_size, dst_capacity); #else - (void)src; - (void)dst; - (void)compressed_size; - (void)dst_capacity; mps_parser_fail( error_type_t::RuntimeError, "Experimental fast MPS parser was built without LZ4 decompression support. " @@ -127,7 +126,7 @@ int lz4_decompress_safe_runtime(const char* src, char* dst, int compressed_size, void ensure_lz4_runtime_available() { #if defined(MPS_PARSER_WITH_LZ4) - (void)lz4_runtime(); + [[maybe_unused]] auto& runtime = lz4_runtime(); #else mps_parser_fail( error_type_t::RuntimeError, @@ -176,27 +175,6 @@ std::size_t block_max_size_from_bd(unsigned char bd) } } -bool pread_full_plain(int fd, char* dst, std::size_t bytes, std::size_t offset) -{ - std::size_t done = 0; - while (done < bytes) { - std::size_t remaining = bytes - done; - std::size_t chunk = - std::min(remaining, (std::size_t)std::numeric_limits::max()); - ssize_t got = ::pread(fd, dst + done, chunk, (off_t)(offset + done)); - if (got < 0) { - if (errno == EINTR) { continue; } - return false; - } - if (got == 0) { - errno = EIO; - return false; - } - done += (std::size_t)got; - } - return true; -} - struct lz4_resident_window_t { std::size_t index = 0; std::size_t file_offset = 0; @@ -210,6 +188,9 @@ class lz4_resident_windows_t { { } + // Compressed file bytes arrive in fixed resident windows; block payloads may span a boundary. 
+ // Return a direct pointer when the whole payload sits in one window (LZ4 decompress + pin); + // otherwise nullptr and the caller stages via copy_to. const char* ptr_if_contiguous(std::size_t offset, std::size_t size) const { if (size == 0) return nullptr; @@ -277,26 +258,29 @@ class lz4_resident_windows_t { std::vector& windows_; }; -} // namespace +// Parsed fields of the leading LZ4 frame descriptor (RFC: magic, FLG, BD, and +// optional content size / dictionary id / header checksum). +struct lz4_frame_header_t { + std::size_t block_max_size = 0; + std::size_t content_size = 0; + std::size_t header_size = 0; + bool content_size_present = false; + bool block_checksum = false; + bool content_checksum = false; + bool dict_id = false; +}; -lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path) +lz4_frame_header_t parse_lz4_frame_header(int fd, + const std::string& path, + std::size_t compressed_size) { - MPS_NVTX_RANGE("lz4_input_construct", nvtx::colors::io); - - ensure_lz4_runtime_available(); - - fd_ = open_lz4_fd(path); - ::posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); - - compressed_size_ = get_file_size(fd_, path); - - char header[32]; - if (compressed_size_ < 7) { + if (compressed_size < 7) { mps_parser_fail(error_type_t::ValidationError, "LZ4 input is too small to contain a frame header"); } - std::size_t header_bytes = std::min(sizeof(header), compressed_size_); - if (!pread_full_plain(fd_, header, header_bytes, 0)) { + char header[32]; + std::size_t header_bytes = std::min(sizeof(header), compressed_size); + if (!pread_full(fd, header, header_bytes, 0)) { mps_parser_fail(error_type_t::RuntimeError, "Failed to read LZ4 frame header '%s': %s", path.c_str(), @@ -317,24 +301,26 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path) mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame version"); } bool block_independent = (flg & 0x20u) != 0; - block_checksum_ = (flg & 0x10u) != 0; - 
content_size_present_ = (flg & 0x08u) != 0; - content_checksum_ = (flg & 0x04u) != 0; - dict_id_ = (flg & 0x01u) != 0; if (!block_independent) { mps_parser_fail(error_type_t::ValidationError, "parallel LZ4 reader requires independent blocks; compress with -BI"); } - block_max_size_ = block_max_size_from_bd(bd); - if (content_size_present_) { + + lz4_frame_header_t info; + info.block_checksum = (flg & 0x10u) != 0; + info.content_size_present = (flg & 0x08u) != 0; + info.content_checksum = (flg & 0x04u) != 0; + info.dict_id = (flg & 0x01u) != 0; + info.block_max_size = block_max_size_from_bd(bd); + if (info.content_size_present) { if (offset + 8 > header_bytes) { mps_parser_fail(error_type_t::ValidationError, "truncated LZ4 frame while reading content size"); } - content_size_ = (std::size_t)read_le64(header + offset); + info.content_size = (std::size_t)read_le64(header + offset); offset += 8; } - if (dict_id_) { + if (info.dict_id) { if (offset + 4 > header_bytes) { mps_parser_fail(error_type_t::ValidationError, "truncated LZ4 frame while reading dictionary id"); @@ -346,7 +332,31 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path) "truncated LZ4 frame while reading header checksum"); } offset += 1; - header_size_ = offset; + info.header_size = offset; + return info; +} + +} // namespace + +lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path) +{ + MPS_NVTX_RANGE("lz4_input_constructor", nvtx::colors::io); + + ensure_lz4_runtime_available(); + + fd_ = open_lz4_fd(path); + ::posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + + compressed_size_ = get_file_size(fd_, path); + + lz4_frame_header_t header = parse_lz4_frame_header(fd_, path, compressed_size_); + block_max_size_ = header.block_max_size; + content_size_ = header.content_size; + header_size_ = header.header_size; + content_size_present_ = header.content_size_present; + block_checksum_ = header.block_checksum; + content_checksum_ = header.content_checksum; + 
dict_id_ = header.dict_id; std::size_t reserve_size = content_size_; if (!content_size_present_) { @@ -355,7 +365,7 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path) } reserve_size += input_buffer_padding_bytes; - constexpr std::size_t huge_alignment = 2 * 1024 * 1024; + constexpr std::size_t huge_alignment = 2 * 1024 * 1024; // 2MiB output_mapped_size_ = cuda::round_up(reserve_size, system_page_size()); output_region_ = mmap_region_t::anonymous_aligned(output_mapped_size_, huge_alignment, @@ -385,11 +395,6 @@ std::size_t lz4_input_stream_t::reserve_size_hint() const noexcept ? content_size_ : std::max(estimate_lz4_no_content_size(compressed_size_), 1024 * 1024); } -mps_phase_registry_t& lz4_input_stream_t::registry() noexcept { return registry_; } -input_stream_view_t lz4_input_stream_t::view() noexcept -{ - return {output_data_, output_data_, output_view_size_, compressed_size_, ®istry_}; -} void lz4_input_stream_t::commit_up_to(std::size_t bytes) { @@ -419,6 +424,12 @@ struct resident_block_desc_t { bool uncompressed = false; }; +// Two distinct units flow through this pipeline: +// * window - a fixed-size span of the compressed file read by the I/O stage. +// * block - a single independent LZ4 data block (decompressed unit) that the +// metadata scanner discovers inside the resident windows. +// Windows feed blocks; the decoded blocks are handed to the section scanner, +// which owns the contiguous decoded-byte frontier and section publication. 
struct lz4_pipeline_t { explicit lz4_pipeline_t(lz4_input_stream_t& input_) : input(input_), @@ -428,9 +439,7 @@ struct lz4_pipeline_t { window_done(window_count, 0), window_refs(window_count), window_scanned(window_count), - window_released(window_count), - block_done(input.block_slot_count_, 0), - block_end(input.block_slot_count_, 0) + window_released(window_count) { for (std::size_t i = 0; i < window_count; ++i) { std::size_t offset = i * window_bytes; @@ -446,39 +455,27 @@ struct lz4_pipeline_t { void run() { - start_readers(); std::thread scanner(&lz4_pipeline_t::run_scanner_stage, this); start_decoders(); + run_readers(); - for (auto& reader : readers) { - reader.join(); - } scanner.join(); for (auto& worker : decoders) { worker.join(); } - if (first_error) { std::rethrow_exception(first_error); } + latch.rethrow_if_error(); } void finalize() { - input.output_view_size_ = ready_bytes; + input.output_view_size_ = input.section_scanner_->ready_bytes(); input.commit_up_to(input.output_view_size_ + input_buffer_padding_bytes); input.section_scanner_->publish_ready(input.output_view_size_); } - void mark_error(std::exception_ptr eptr) - { - std::lock_guard lock(error_mutex); - if (!first_error) { - first_error = eptr; - stop_workers.store(true, std::memory_order_release); - } - } - void fail_and_notify(std::exception_ptr eptr) { - mark_error(eptr); + latch.capture(eptr); window_cv.notify_all(); desc_cv.notify_all(); } @@ -513,46 +510,42 @@ struct lz4_pipeline_t { } } - void start_readers() + void run_readers() { - readers.reserve(io_threads); - for (std::size_t t = 0; t < io_threads; ++t) { - readers.emplace_back(&lz4_pipeline_t::run_reader_stage, this, t); - } + parallel_for_indexed( + window_count, io_threads, latch, "lz4-window-read-", [this](std::size_t index) { + read_window(index); + }); } - void run_reader_stage(std::size_t tid) + void read_window(std::size_t index) { - std::string thread_name = "lz4-window-read-" + std::to_string(tid); - 
nvtx::name_current_thread(thread_name.c_str()); - while (!stop_workers.load(std::memory_order_acquire)) { - std::size_t index = next_window.fetch_add(1, std::memory_order_relaxed); - if (index >= windows.size()) { break; } - auto& w = windows[index]; - w.data.reset(new char[w.size]); - add_compressed_resident(w.size); - bool ok = false; - { - MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io); - ok = pread_full_plain(input.fd_, w.data.get(), w.size, w.file_offset); - } - if (!ok) { - try { - mps_parser_fail(error_type_t::RuntimeError, - "Failed to pread LZ4 resident window: %s", - std::strerror(errno)); - } catch (...) { - fail_and_notify(std::current_exception()); - } - return; - } - { - MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic); - std::lock_guard lock(window_mutex); - window_done[index] = 1; + auto& w = windows[index]; + w.data.reset(new char[w.size]); + add_compressed_resident(w.size); + bool ok = false; + { + MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io); + ok = pread_full(input.fd_, w.data.get(), w.size, w.file_offset); + } + if (!ok) { + // Capture-and-notify locally so scanner/decoder waiters wake; do not let + // the exception escape to parallel_for_indexed without the cv notify. + try { + mps_parser_fail(error_type_t::RuntimeError, + "Failed to pread LZ4 resident window: %s", + std::strerror(errno)); + } catch (...) 
{ + fail_and_notify(std::current_exception()); } - window_cv.notify_all(); + return; } + { + MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic); + std::lock_guard lock(window_mutex); + window_done[index] = 1; + } + window_cv.notify_all(); } void start_decoders() @@ -582,10 +575,8 @@ struct lz4_pipeline_t { { MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io); std::unique_lock lock(desc_mutex); - desc_cv.wait(lock, [&] { - return stop_workers.load(std::memory_order_acquire) || scanner_done || !desc_queue.empty(); - }); - if (stop_workers.load(std::memory_order_acquire) || desc_queue.empty()) { return {}; } + desc_cv.wait(lock, [&] { return latch.stopped() || scanner_done || !desc_queue.empty(); }); + if (latch.stopped() || desc_queue.empty()) { return {}; } std::vector batch = std::move(desc_queue.front()); desc_queue.pop_front(); return batch; @@ -628,35 +619,16 @@ struct lz4_pipeline_t { { if (block.window_index == std::numeric_limits::max()) { return; } uint32_t old = window_refs[block.window_index].fetch_sub(1, std::memory_order_acq_rel); - (void)old; assert(old > 0); if (old == 1) { try_release_window(block.window_index); } } void publish_decoded_block(const resident_block_desc_t& block, char* dst, std::size_t actual_size) { - { - MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic); - input.section_scanner_->observe_block(block.index, dst, dst + actual_size); - } - std::size_t before = 0; - std::size_t after = 0; - { - MPS_NVTX_RANGE("lz4_frontier_update", nvtx::colors::generic); - std::lock_guard lock(frontier_mutex); - block_done[block.index] = 1; - block_end[block.index] = block.decompressed_offset + actual_size; - before = ready_bytes; - while (next_block < block_done.size() && block_done[next_block]) { - ready_bytes = block_end[next_block]; - ++next_block; - } - after = ready_bytes; - } - if (after > before) { - MPS_NVTX_RANGE("lz4_publish_ready", nvtx::colors::generic); - input.section_scanner_->publish_ready(after); - } + 
MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic); + // The scanner advances the contiguous decoded-byte frontier and publishes + // section ranges as blocks complete, regardless of decode order. + input.section_scanner_->observe_block(block.index, dst, dst + actual_size); } void wait_range_ready(std::size_t begin, std::size_t size) @@ -675,9 +647,8 @@ struct lz4_pipeline_t { for (std::size_t wi = first; wi <= last; ++wi) { MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io); std::unique_lock lock(window_mutex); - window_cv.wait( - lock, [&] { return stop_workers.load(std::memory_order_acquire) || window_done[wi] != 0; }); - if (stop_workers.load(std::memory_order_acquire) && window_done[wi] == 0) { + window_cv.wait(lock, [&] { return latch.stopped() || window_done[wi] != 0; }); + if (latch.stopped() && window_done[wi] == 0) { mps_parser_fail(error_type_t::RuntimeError, "LZ4 metadata scanner stopped before required window was ready"); } @@ -745,7 +716,7 @@ struct lz4_pipeline_t { batch_decoded_bytes += block.decompressed_size; batch.push_back(block); blocks_scanned.fetch_add(1, std::memory_order_relaxed); - if (blocks_scanned.load(std::memory_order_relaxed) > block_done.size()) { + if (blocks_scanned.load(std::memory_order_relaxed) > input.block_slot_count_) { mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 input block count exceeded reserved metadata slots"); } @@ -857,11 +828,8 @@ struct lz4_pipeline_t { std::vector windows; const std::size_t io_threads; - std::exception_ptr first_error = nullptr; - std::mutex error_mutex; - std::atomic_bool stop_workers{false}; + parallel_error_latch_t latch; - std::atomic_size_t next_window{0}; std::vector window_done; std::vector> window_refs; std::vector> window_scanned; @@ -878,15 +846,7 @@ struct lz4_pipeline_t { std::atomic_size_t blocks_scanned{0}; std::vector> crossing_payloads; - std::vector readers; std::vector decoders; - - // Tracks the contiguous decoded-byte frontier across out-of-order 
block completions. - std::mutex frontier_mutex; - std::vector block_done; - std::vector block_end; - std::size_t next_block = 0; - std::size_t ready_bytes = 0; }; void lz4_input_stream_t::run_decode_tasks() @@ -897,4 +857,4 @@ void lz4_input_stream_t::run_decode_tasks() pipeline.finalize(); } -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp index 389f563efa..7727e0d2f7 100644 --- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp +++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp @@ -19,7 +19,7 @@ #include #include -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { using cuopt::linear_programming::io::error_type_t; using cuopt::linear_programming::io::mps_parser_expects; @@ -131,4 +131,4 @@ class mmap_region_t { std::size_t size_ = 0; }; -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp index 9eee8708e0..a3c9fe87a3 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp @@ -16,7 +16,7 @@ #include #include -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { using cuopt::linear_programming::io::error_type_t; using cuopt::linear_programming::io::mps_parser_expects; @@ -202,7 +202,7 @@ void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, cons const char* expected = nullptr; if (slot.compare_exchange_strong( expected, ptr, std::memory_order_release, std::memory_order_acquire)) { - publish_section_ranges(); + notify_ready_phases(); } } @@ -252,13 +252,14 @@ void mps_section_block_scanner_t::scan_section_range(const char* begin, const ch } // In compliant MPS, indicator records begin in column 1 while data records - // begin in 
column 2+. Treat start-of-file or "\n[nonblank]" as the cheap - // candidate signal, then run the exact section matcher only for candidates. + // begin in column 2+. use "\n[nonblank]" as a needle for the SIMD scan const simde__m256i newline = simde_mm256_set1_epi8('\n'); while ((std::size_t)(end - p) >= kSimdWidth) { // The first-line path above increments p when p == data_, so p - 1 is // in-bounds here. Loading the previous vector lets us test "\nX" for all // 32 candidate column-1 bytes with one AVX2 mask. + // loadu is comparable to aligned reads on modern SSE/AVX. + // might warrant some checks on ARM though simde__m256i current = simde_mm256_loadu_si256(reinterpret_cast(p)); simde__m256i previous = simde_mm256_loadu_si256(reinterpret_cast(p - 1)); std::uint32_t mask = (std::uint32_t)simde_mm256_movemask_epi8(simde_mm256_and_si256( @@ -290,6 +291,8 @@ void mps_section_block_scanner_t::scan_boundary(std::size_t left_index, std::siz scan_section_range(data_ + begin, data_ + end); } +// scans a freshly decoded block for section titles, along with the start/end boundaries if a +// section title straddles blocks void mps_section_block_scanner_t::observe_block(std::size_t block_index, const char* begin, const char* end) @@ -311,6 +314,26 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index, block_decoded_[block_index + 1].load(std::memory_order_acquire)) { scan_boundary(block_index, block_index + 1); } + + advance_ready_frontier(); +} + +void mps_section_block_scanner_t::advance_ready_frontier() +{ + std::size_t new_ready = 0; + bool grew = false; + { + // block_decoded_ is stored with release after the begin/end offsets, so an + // acquire load of a set flag makes the matching end offset visible here. 
+ std::lock_guard lock(frontier_mutex_); + while (next_block_ < block_count_ && + block_decoded_[next_block_].load(std::memory_order_acquire)) { + new_ready = block_end_offsets_[next_block_].load(std::memory_order_acquire); + ++next_block_; + grew = true; + } + } + if (grew) { publish_ready(new_ready); } } void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes) @@ -318,10 +341,15 @@ void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes) ready_bytes_.store(ready_bytes, std::memory_order_release); std::size_t begin = ready_bytes > boundary_overlap ? ready_bytes - boundary_overlap : 0; scan_section_range(data_ + begin, data_ + ready_bytes); - publish_section_ranges(); + notify_ready_phases(); +} + +std::size_t mps_section_block_scanner_t::ready_bytes() const noexcept +{ + return ready_bytes_.load(std::memory_order_acquire); } -void mps_section_block_scanner_t::publish_section_ranges() +void mps_section_block_scanner_t::notify_ready_phases() { // Publication model: each present phase runs from its own section header to // the first later section header that has been discovered. Optional sections @@ -430,4 +458,4 @@ void mps_section_block_scanner_t::publish_section_ranges() } } -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp index 9fcffa6ea7..7fd249a7e8 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp @@ -11,7 +11,11 @@ #include -namespace mps_fast { +// The section scanner handles freshly read/decoded blocks and scans them for section titles while +// they're still warm in cache it then publishes read/decoded input ranges to the parser workers, +// which handle their respective sections in parallel. 
+ +namespace cuopt::linear_programming::io::detail { enum class mps_section_kind { rows, @@ -78,9 +82,17 @@ class mps_section_block_scanner_t { std::size_t block_count, mps_phase_registry_t& registry); + // Records a freshly decoded block, scans it for section titles, advances the + // contiguous decoded-byte frontier across out-of-order completions, and + // publishes any newly available section ranges. Producers only need to feed + // blocks in any order; the frontier and publication live entirely here. void observe_block(std::size_t block_index, const char* begin, const char* end); void publish_ready(std::size_t ready_bytes); + // Current contiguous decoded-byte frontier; producers use this as the final + // view size once all blocks have been observed. + std::size_t ready_bytes() const noexcept; + private: static constexpr std::size_t section_count = 9; // Section titles are short; 128 bytes is enough to rescan around a decoded @@ -92,7 +104,8 @@ class mps_section_block_scanner_t { void scan_section_range(const char* begin, const char* end); void scan_boundary(std::size_t left_index, std::size_t right_index); void record_section_hit(mps_section_kind kind, const char* ptr); - void publish_section_ranges(); + void notify_ready_phases(); + void advance_ready_frontier(); const char* data_ = nullptr; std::size_t block_count_ = 0; @@ -103,6 +116,8 @@ class mps_section_block_scanner_t { std::unique_ptr block_end_offsets_; std::atomic_size_t ready_bytes_{0}; std::atomic section_hits_[section_count]{}; + std::mutex frontier_mutex_; + std::size_t next_block_ = 0; }; -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp index fac9e64d78..0f47b45f56 100644 --- a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp +++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp @@ -14,7 +14,7 @@ #include #endif -namespace mps_fast::nvtx { +namespace 
cuopt::linear_programming::io::detail::nvtx { namespace colors { constexpr std::uint32_t generic = 0xff8b949e; @@ -92,7 +92,9 @@ class scoped_range_t { scoped_range_t& operator=(const scoped_range_t&) = delete; private: - void push(const char* name, std::uint32_t color, std::uint32_t category) + void push([[maybe_unused]] const char* name, + [[maybe_unused]] std::uint32_t color, + [[maybe_unused]] std::uint32_t category) { #ifdef MPS_FAST_NVTX nvtxEventAttributes_t event{}; @@ -105,10 +107,6 @@ class scoped_range_t { event.category = category; nvtxRangePushEx(&event); active_ = true; -#else - (void)name; - (void)color; - (void)category; #endif } @@ -118,18 +116,17 @@ class scoped_range_t { #endif }; -inline void name_current_thread(const char* name) +inline void name_current_thread([[maybe_unused]] const char* name) { #ifdef MPS_FAST_NVTX nvtxNameOsThreadA((std::uint32_t)::syscall(SYS_gettid), name); -#else - (void)name; #endif } -} // namespace mps_fast::nvtx +} // namespace cuopt::linear_programming::io::detail::nvtx #define MPS_FAST_NVTX_CONCAT_INNER(a, b) a##b #define MPS_FAST_NVTX_CONCAT(a, b) MPS_FAST_NVTX_CONCAT_INNER(a, b) -#define MPS_NVTX_RANGE(name, color) \ - ::mps_fast::nvtx::scoped_range_t MPS_FAST_NVTX_CONCAT(_mps_nvtx_range_, __LINE__)(name, color) +#define MPS_NVTX_RANGE(name, color) \ + ::cuopt::linear_programming::io::detail::nvtx::scoped_range_t MPS_FAST_NVTX_CONCAT( \ + _mps_nvtx_range_, __LINE__)(name, color) diff --git a/cpp/src/io/mps_parser.cpp b/cpp/src/io/mps_parser.cpp index 5f7cecda94..9d4dea2bbf 100644 --- a/cpp/src/io/mps_parser.cpp +++ b/cpp/src/io/mps_parser.cpp @@ -797,9 +797,9 @@ void mps_parser_t::parse_rows(std::string_view line) } if (type == Objective) { // Keep only the first name or OBJNAME since it was set before - if (objective_name.empty()) - objective_name = name; - else + if (objective_name.empty()) objective_name = name; + // aligns with CPLEX/SCIP behavior + else if (name != objective_name) 
ignored_objective_names.emplace(name); // If we wanted to strictly follow MPS definition: a new objective row ('N') should be treated // as an unbounded constraints, aka an extra contraints row with lower bound -infinity and upper diff --git a/cpp/src/io/parser.cpp b/cpp/src/io/parser.cpp index 6392833ce3..c9b3a351c6 100644 --- a/cpp/src/io/parser.cpp +++ b/cpp/src/io/parser.cpp @@ -44,7 +44,7 @@ template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path) { CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str()); - return mps_fast::parse_mps_fast_file(mps_file_path); + return detail::parse_mps_fast_file(mps_file_path); } template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); diff --git a/cpp/src/utilities/perf_counters.hpp b/cpp/src/utilities/perf_counters.hpp index 96a881c880..70658aa9b3 100644 --- a/cpp/src/utilities/perf_counters.hpp +++ b/cpp/src/utilities/perf_counters.hpp @@ -14,7 +14,7 @@ #include #include -namespace mps_fast { +namespace cuopt::linear_programming::io::detail { // Utils to return to total resident set size (used physical pages) static size_t parse_status_kb_line(const char* line, const char* key) @@ -191,4 +191,4 @@ static inline void print_perf_totals(const char* label, std::fprintf(stderr, " ipc=%.3f cache_miss_rate=%.6f\n", ipc, miss_rate); } -} // namespace mps_fast +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp index f07d84ebde..2ef8339da3 100644 --- a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp @@ -17,6 +17,8 @@ #include #include +namespace cuopt::linear_programming::io::detail { + namespace { uint64_t bits(double value) { return std::bit_cast(value); } @@ 
-27,36 +29,37 @@ double reference_strtod(std::string_view token) for (char& c : normalized) { if (c == 'd' || c == 'D') { c = 'e'; } } - char* end = nullptr; - errno = 0; - double value = std::strtod(normalized.c_str(), &end); - ASSERT_EQ(end, normalized.c_str() + normalized.size()); - return value; + char* end = nullptr; + errno = 0; + return std::strtod(normalized.c_str(), &end); } double parse_token(std::string_view token) { const char* p = token.data(); - return mps_fast::fp64::parse_fp64_advance(p, token.data() + token.size()); + return fp64::parse_fp64_advance(p, token.data() + token.size()); } -double parse_padded_token(std::string_view token) +void check_bitwise_strtod(std::string_view token) { + std::string normalized(token); + for (char& c : normalized) { + if (c == 'd' || c == 'D') { c = 'e'; } + } + char* end = nullptr; + errno = 0; + const double ref = std::strtod(normalized.c_str(), &end); + EXPECT_EQ(end, normalized.c_str() + normalized.size()); + std::string padded(token); padded.append(40, ' '); - const char* p = padded.data(); - double value = mps_fast::fp64::parse_fp64_advance(p, padded.data() + padded.size()); - ASSERT_EQ(p, padded.data() + token.size()); - return value; -} + const char* p = padded.data(); + const double padded_value = fp64::parse_fp64_advance(p, padded.data() + padded.size()); + EXPECT_EQ(p, padded.data() + token.size()); -void check_bitwise_strtod(std::string_view token) -{ - const double ref = reference_strtod(token); const uint64_t ref_bits = bits(ref); EXPECT_EQ(ref_bits, bits(parse_token(token))) << "token parse mismatch for '" << token << "'"; - EXPECT_EQ(ref_bits, bits(parse_padded_token(token))) - << "padded parse mismatch for '" << token << "'"; + EXPECT_EQ(ref_bits, bits(padded_value)) << "padded parse mismatch for '" << token << "'"; } std::string random_token(std::mt19937_64& rng) @@ -155,7 +158,7 @@ TEST(FastFp64ParserTest, CursorAdvancesToTokenEnd) std::setlocale(LC_NUMERIC, "C"); std::string text = "123.45 ABC"; 
const char* p = text.data(); - double value = mps_fast::fp64::parse_fp64_advance(p, text.data() + text.size()); + double value = fp64::parse_fp64_advance(p, text.data() + text.size()); EXPECT_EQ(bits(reference_strtod("123.45")), bits(value)); EXPECT_EQ(text.data() + 6, p); @@ -172,3 +175,5 @@ TEST(FastFp64ParserTest, FixedSeedRandomDifferential) check_bitwise_strtod(token); } } + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp index aa05736616..fe349b47e0 100644 --- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp @@ -25,6 +25,8 @@ #include +namespace cuopt::linear_programming::io::detail { + namespace { struct TempMpsFile { @@ -36,20 +38,24 @@ struct TempMpsFile { "/tmp/mps_fast_parser_edge_%ld_XXXXXX.mps", static_cast(getpid())); int fd = mkstemps(path_template, 4); - if (fd < 0) { FAIL() << "mkstemps failed: " << std::strerror(errno); } + if (fd < 0) { + throw std::runtime_error(std::string("mkstemps failed: ") + std::strerror(errno)); + } path = path_template; FILE* file = fdopen(fd, "wb"); if (file == nullptr) { close(fd); - FAIL() << "fdopen failed: " << std::strerror(errno); + throw std::runtime_error(std::string("fdopen failed: ") + std::strerror(errno)); } if (!contents.empty() && std::fwrite(contents.data(), 1, contents.size(), file) != contents.size()) { std::fclose(file); - FAIL() << "failed to write temporary MPS file: " << std::strerror(errno); + throw std::runtime_error(std::string("failed to write temporary MPS file: ") + + std::strerror(errno)); } if (std::fclose(file) != 0) { - FAIL() << "failed to close temporary MPS file: " << std::strerror(errno); + throw std::runtime_error(std::string("failed to close temporary MPS file: ") + + std::strerror(errno)); } } @@ 
-77,7 +83,7 @@ struct TempOwnedPath { std::string path; }; -std::string_view range_text(const mps_fast::mps_phase_range_t& range) +std::string_view range_text(const mps_phase_range_t& range) { if (!range.present) { return {}; } return std::string_view(range.begin, static_cast(range.end - range.begin)); @@ -85,15 +91,14 @@ std::string_view range_text(const mps_fast::mps_phase_range_t& range) uint64_t bits(double value) { return std::bit_cast(value); } -void check_models_match_reference_bitwise( - const mps_fast::parser_model_t& fast, - const cuopt::linear_programming::io::mps_data_model_t& reference, - std::string_view context) +void check_models_match_reference_bitwise(const parser_model_t& fast, + const mps_data_model_t& reference, + std::string_view context) { EXPECT_EQ(reference.n_vars_, fast.n_vars_) << std::string(context) + " n_vars"; EXPECT_EQ(reference.n_constraints_, fast.n_constraints_) << std::string(context) + " n_constraints"; - EXPECT_EQ(reference.nnz_, fast.nnz_) << std::string(context) + " nnz"; + EXPECT_EQ(reference.get_nnz(), fast.get_nnz()) << std::string(context) + " nnz"; EXPECT_EQ(reference.maximize_, fast.maximize_) << std::string(context) + " maximize"; EXPECT_EQ(reference.problem_name_, fast.problem_name_) << std::string(context) + " problem_name"; EXPECT_EQ(reference.objective_name_, fast.objective_name_) @@ -126,8 +131,8 @@ void check_models_match_reference_bitwise( void verify_fixture_bitwise(std::string_view fixture_name, std::string contents) { TempMpsFile file(std::move(contents)); - auto fast = mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - auto reference = cuopt::linear_programming::io::read_mps(file.path, false); + auto fast = parse_mps_fast_file(file.path, FileReadMethod::Read); + auto reference = read_mps(file.path, false); check_models_match_reference_bitwise(fast, reference, fixture_name); } @@ -138,7 +143,7 @@ std::string row_name(size_t i) return out.str(); } -int find_var_index(const 
mps_fast::parser_model_t& model, std::string_view name) +int find_var_index(const parser_model_t& model, std::string_view name) { for (size_t i = 0; i < model.var_names_.size(); ++i) { if (model.var_names_[i] == name) { return static_cast(i); } @@ -146,11 +151,8 @@ int find_var_index(const mps_fast::parser_model_t& model, std::stri return -1; } -void check_model_shapes(const mps_fast::parser_model_t& model, - int rows, - int vars, - int nnz, - std::string_view context) +void check_model_shapes( + const parser_model_t& model, int rows, int vars, int nnz, std::string_view context) { EXPECT_EQ(rows, model.n_constraints_) << std::string(context) + " rows"; EXPECT_EQ(vars, model.n_vars_) << std::string(context) + " vars"; @@ -210,24 +212,23 @@ TEST(FastMpsParserEdgeTest, ScannerFindsSectionSplitAcrossBlocks) EXPECT_TRUE(columns_pos != std::string::npos) << "failed to place COLUMNS split"; const size_t split = columns_pos + 3; - mps_fast::mps_phase_registry_t registry; - mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry); + mps_phase_registry_t registry; + mps_section_block_scanner_t scanner(mps.data(), 2, registry); scanner.observe_block(1, mps.data() + split, mps.data() + mps.size()); scanner.publish_ready(0); scanner.observe_block(0, mps.data(), mps.data() + split); scanner.publish_ready(mps.size()); - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::header)) << "header not ready"; - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rows)) << "rows not ready"; - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::columns)) << "columns not ready"; - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rhs)) << "rhs not ready"; - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::quadratic)) - << "quadratic sentinel not ready"; + EXPECT_TRUE(registry.ready(mps_phase_kind::header)) << "header not ready"; + EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready"; + EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns 
not ready"; + EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready"; + EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic)) << "quadratic sentinel not ready"; - EXPECT_TRUE(range_text(registry.range(mps_fast::mps_phase_kind::columns)).starts_with("COLUMNS")) + EXPECT_TRUE(range_text(registry.range(mps_phase_kind::columns)).starts_with("COLUMNS")) << "columns range begins at wrong boundary"; - EXPECT_TRUE(range_text(registry.range(mps_fast::mps_phase_kind::rhs)).starts_with("RHS")) + EXPECT_TRUE(range_text(registry.range(mps_phase_kind::rhs)).starts_with("RHS")) << "rhs range begins at wrong boundary"; } @@ -241,20 +242,18 @@ TEST(FastMpsParserEdgeTest, ScannerFindsHeadersSplitAtEveryByte) EXPECT_TRUE(pos != std::string::npos) << "missing header in split fixture"; for (size_t offset = 1; offset < header.size(); ++offset) { const size_t split = pos + offset; - mps_fast::mps_phase_registry_t registry; - mps_fast::mps_section_block_scanner_t scanner(mps.data(), 2, registry); + mps_phase_registry_t registry; + mps_section_block_scanner_t scanner(mps.data(), 2, registry); scanner.observe_block(1, mps.data() + split, mps.data() + mps.size()); scanner.observe_block(0, mps.data(), mps.data() + split); scanner.publish_ready(mps.size()); - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rows)) << "rows not ready after split"; - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::columns)) - << "columns not ready after split"; - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::rhs)) << "rhs not ready after split"; - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::bounds)) - << "bounds not ready after split"; - EXPECT_TRUE(registry.ready(mps_fast::mps_phase_kind::quadratic)) + EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready after split"; + EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns not ready after split"; + EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready after split"; + 
EXPECT_TRUE(registry.ready(mps_phase_kind::bounds)) << "bounds not ready after split"; + EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic)) << "quadratic sentinel not ready after split"; } } @@ -273,8 +272,8 @@ TEST(FastMpsParserEdgeTest, ScannerRejectsUnknownColumnOneRecordsAfterRows) EXPECT_THROW( { - mps_fast::mps_phase_registry_t registry; - mps_fast::mps_section_block_scanner_t scanner(mps.data(), 1, registry); + mps_phase_registry_t registry; + mps_section_block_scanner_t scanner(mps.data(), 1, registry); scanner.observe_block(0, mps.data(), mps.data() + mps.size()); scanner.publish_ready(mps.size()); }, @@ -328,8 +327,7 @@ TEST(FastMpsParserEdgeTest, DuplicateBoundsLastStatementWins) verify_fixture_bitwise("duplicate_bounds_last_statement_wins", contents); TempMpsFile file(contents); - auto model = - mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); EXPECT_EQ(1, model.n_vars_) << "n_vars"; EXPECT_EQ(2.0, model.variable_lower_bounds_.at(0)) << "duplicate lower bound"; EXPECT_EQ(3.0, model.variable_upper_bounds_.at(0)) << "duplicate upper bound"; @@ -371,8 +369,7 @@ TEST(FastMpsParserEdgeTest, MissingOptionalBoundsFastPath) " RHS1 rowA 0\n" "ENDATA\n"); - auto model = - mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); EXPECT_EQ(1, model.n_vars_) << "missing optional n_vars"; EXPECT_EQ(1, model.n_constraints_) << "missing optional n_constraints"; EXPECT_EQ(0.0, model.variable_lower_bounds_.at(0)) << "missing BOUNDS lower default"; @@ -397,8 +394,7 @@ TEST(FastMpsParserEdgeTest, BoundsOnlyVariablesAreAppendedDeterministically) " SC B AUX_S 5\n" "ENDATA\n"); - auto model = - mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); check_model_shapes(model, 1, 4, 1, "bounds-only"); 
EXPECT_EQ(std::string("XMAIN"), model.var_names_.at(0)) << "main var name"; EXPECT_EQ(std::string("AUX_A"), model.var_names_.at(1)) << "bounds-only sorted name 1"; @@ -439,8 +435,7 @@ TEST(FastMpsParserEdgeTest, IntegerMarkersAssignTypesAndDefaultBounds) " RHS1 R1 10\n" "ENDATA\n"); - auto model = - mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); check_model_shapes(model, 1, 3, 3, "integer markers"); const int xint = find_var_index(model, "XINT"); const int xcont = find_var_index(model, "XCONT"); @@ -530,36 +525,23 @@ TEST(FastMpsParserEdgeTest, CommentPlacementSupportedCasesMatchReferenceBitwise) TEST(FastMpsParserEdgeTest, ObjectiveMetadataSelectsNamedObjective) { - TempMpsFile file( - "NAME OBJMETA\n" - "OBJSENSE\n" - " MAX\n" - "OBJNAME\n" - " COST\n" - "ROWS\n" - " N ALT\n" - " N COST\n" - " L R1\n" - "COLUMNS\n" - " X1 ALT 100 COST 5\n" - " X1 R1 1\n" - " X2 COST -2 R1 3\n" - "RHS\n" - " RHS1 COST 7 R1 11\n" - "ENDATA\n"); - - auto model = - mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - EXPECT_TRUE(model.maximize_) << "OBJSENSE MAX not applied"; - EXPECT_EQ(std::string("OBJMETA"), model.problem_name_) << "problem name"; - EXPECT_EQ(std::string("COST"), model.objective_name_) << "objective name"; - EXPECT_EQ(-7.0, model.objective_offset_) << "objective RHS offset"; - const int x1 = find_var_index(model, "X1"); - const int x2 = find_var_index(model, "X2"); - ASSERT_GE(x1, 0); - ASSERT_GE(x2, 0); - EXPECT_EQ(5.0, model.c_.at(x1)) << "named objective coefficient X1"; - EXPECT_EQ(-2.0, model.c_.at(x2)) << "named objective coefficient X2"; + verify_fixture_bitwise("objective_metadata", + "NAME OBJMETA\n" + "OBJSENSE\n" + " MAX\n" + "OBJNAME\n" + " COST\n" + "ROWS\n" + " N ALT\n" + " N COST\n" + " L R1\n" + "COLUMNS\n" + " X1 ALT 100 COST 5\n" + " X1 R1 1\n" + " X2 COST -2 R1 3\n" + "RHS\n" + " RHS1 COST 7 R1 11\n" + "ENDATA\n"); } 
TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors) @@ -577,11 +559,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors) "RHS\n" " RHS1 R1 0\n" "ENDATA\n"); - EXPECT_THROW( - { - (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - }, - std::logic_error); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::logic_error); } { @@ -595,11 +574,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors) "RHS\n" " RHS1 R1 0\n" "ENDATA\n"); - EXPECT_THROW( - { - (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - }, - std::logic_error); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::logic_error); } { @@ -613,11 +589,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors) "RHS\n" " RHS1 MISSING 1\n" "ENDATA\n"); - EXPECT_THROW( - { - (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - }, - std::logic_error); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::logic_error); } { @@ -633,11 +606,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors) "BOUNDS\n" " XX B X1 1\n" "ENDATA\n"); - EXPECT_THROW( - { - (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - }, - std::logic_error); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::logic_error); } { @@ -653,11 +623,8 @@ TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors) "BOUNDS\n" " SC B X1\n" "ENDATA\n"); - EXPECT_THROW( - { - (void)mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); - }, - std::logic_error); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::logic_error); } } @@ -685,8 +652,7 @@ TEST(FastMpsParserEdgeTest, LargeColumnsRepeatedColumnChunkBoundary) mps += " 0\nENDATA\n"; TempMpsFile file(std::move(mps)); - auto model = - mps_fast::parse_mps_fast_file(file.path, 
mps_fast::FileReadMethod::Read); + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); check_model_shapes( model, static_cast(row_count), 2, static_cast(row_count + 1), "large columns"); EXPECT_EQ(std::string("XBIG"), model.var_names_.at(0)) << "large repeated column name"; @@ -708,8 +674,7 @@ TEST(FastMpsParserEdgeTest, LargeBoundsRepeatedVarStaysOrdered) mps += "ENDATA\n"; TempMpsFile file(std::move(mps)); - auto model = - mps_fast::parse_mps_fast_file(file.path, mps_fast::FileReadMethod::Read); + auto model = parse_mps_fast_file(file.path, FileReadMethod::Read); check_model_shapes(model, 1, 1, 1, "large bounds"); EXPECT_EQ(static_cast((repeat_count - 1) % 1000), model.variable_upper_bounds_.at(0)) << "large repeated bounds last value"; @@ -743,10 +708,8 @@ TEST(FastMpsParserEdgeTest, Lz4AndRawPathsMatchOnMultiblockInput) const std::string cmd = "lz4 -f -q " + raw_file.path + " " + lz4_file.path; if (std::system(cmd.c_str()) != 0) { GTEST_SKIP() << "lz4 CLI unavailable"; } - auto raw = - mps_fast::parse_mps_fast_file(raw_file.path, mps_fast::FileReadMethod::Read); - auto lz4 = - mps_fast::parse_mps_fast_file(lz4_file.path, mps_fast::FileReadMethod::Read); + auto raw = parse_mps_fast_file(raw_file.path, FileReadMethod::Read); + auto lz4 = parse_mps_fast_file(lz4_file.path, FileReadMethod::Read); check_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity"); EXPECT_EQ(raw.var_names_.size(), lz4.var_names_.size()) << "lz4 var name count"; @@ -777,12 +740,9 @@ TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch) if (std::system(gzip_cmd.c_str()) != 0) { GTEST_SKIP() << "gzip CLI unavailable"; } if (std::system(bzip2_cmd.c_str()) != 0) { GTEST_SKIP() << "bzip2 CLI unavailable"; } - auto raw = - mps_fast::parse_mps_fast_file(raw_file.path, mps_fast::FileReadMethod::Read); - auto gzip = - mps_fast::parse_mps_fast_file(gzip_file.path, mps_fast::FileReadMethod::Read); - auto bzip2 = - mps_fast::parse_mps_fast_file(bzip2_file.path, 
mps_fast::FileReadMethod::Read); + auto raw = parse_mps_fast_file(raw_file.path, FileReadMethod::Read); + auto gzip = parse_mps_fast_file(gzip_file.path, FileReadMethod::Read); + auto bzip2 = parse_mps_fast_file(bzip2_file.path, FileReadMethod::Read); check_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity"); check_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity"); @@ -803,3 +763,5 @@ TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch) EXPECT_EQ(raw.var_types_, gzip.var_types_) << "gzip var types"; EXPECT_EQ(raw.var_types_, bzip2.var_types_) << "bzip2 var types"; } + +} // namespace cuopt::linear_programming::io::detail diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp index 3b01f10227..6a47471c09 100644 --- a/cpp/tests/linear_programming/parser_test.cpp +++ b/cpp/tests/linear_programming/parser_test.cpp @@ -931,18 +931,24 @@ TEST_F(mip_partial_bounds_test, lp) ::testing::Values(default_mps_reader_param, fast_mps_reader_param), \ mps_reader_param_name) +#define INSTANTIATE_DEFAULT_MPS_READER_TEST(Fixture) \ + INSTANTIATE_TEST_SUITE_P( \ + mps_readers, Fixture, ::testing::Values(default_mps_reader_param), mps_reader_param_name) + INSTANTIATE_MPS_READER_TEST(good_mps_1_test); INSTANTIATE_MPS_READER_TEST(up_low_bounds_test); -INSTANTIATE_MPS_READER_TEST(some_var_bounds_test); -INSTANTIATE_MPS_READER_TEST(fixed_var_bound_test); -INSTANTIATE_MPS_READER_TEST(free_var_bound_test); -INSTANTIATE_MPS_READER_TEST(lower_inf_var_bound_test); -INSTANTIATE_MPS_READER_TEST(upper_inf_var_bound_test); INSTANTIATE_MPS_READER_TEST(mip_with_bounds_test); INSTANTIATE_MPS_READER_TEST(mip_no_bounds_test); INSTANTIATE_MPS_READER_TEST(mip_partial_bounds_test); +// fast mps parser doesn't support fixed format +INSTANTIATE_DEFAULT_MPS_READER_TEST(some_var_bounds_test); +INSTANTIATE_DEFAULT_MPS_READER_TEST(fixed_var_bound_test); 
+INSTANTIATE_DEFAULT_MPS_READER_TEST(free_var_bound_test); +INSTANTIATE_DEFAULT_MPS_READER_TEST(lower_inf_var_bound_test); +INSTANTIATE_DEFAULT_MPS_READER_TEST(upper_inf_var_bound_test); #undef INSTANTIATE_MPS_READER_TEST +#undef INSTANTIATE_DEFAULT_MPS_READER_TEST #ifdef MPS_PARSER_WITH_BZIP2 TEST(mps_parser, good_mps_file_bzip2_compressed) From 9185d7aa0586fe35496d2976086ea4b6e73c2494 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 12 Jun 2026 06:42:54 -0700 Subject: [PATCH 12/22] ai review comments --- cpp/cuopt_cli.cpp | 2 +- .../fast_parse_primitives.hpp | 18 ++++---- .../io/experimental_mps_fast/fast_parser.cpp | 20 +++++---- .../io/experimental_mps_fast/file_reader.cpp | 42 +++++++++++++++---- .../io/experimental_mps_fast/file_reader.hpp | 31 +++++++++++--- .../experimental_mps_fast/lz4_file_reader.cpp | 39 +++++++++-------- 6 files changed, 103 insertions(+), 49 deletions(-) diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index 55c506721a..f06e568208 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -309,7 +309,7 @@ int main(int argc, char* argv[]) .help( "MPS reader implementation: default uses the production parser; experimental-fast uses the " "experimental " - "SIMD parser for LP/MIP .mps and .mps.lz4 files") + "SIMD parser for LP/MIP .mps, .mps.lz4, .mps.gz, and .mps.bz2 files") .default_value(std::string("default")) .choices("default", "experimental-fast"); diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp index f77e14a410..8897bfef1c 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp @@ -246,18 +246,22 @@ struct cursor_t { if (UNLIKELY(ws_mask == 0)) { return slow(); } int field1_end_off = __builtin_ctz(ws_mask); - unsigned int printable_after_field1 = printable_mask >> field1_end_off; - if (UNLIKELY(printable_after_field1 == 0)) { return slow(); } - int 
field2_start_off = field1_end_off + __builtin_ctz(printable_after_field1); - - if (UNLIKELY(ptr[field2_start_off] == '\n')) { return slow(); } + simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); + unsigned int nl_mask = (unsigned int)simde_mm256_movemask_epi8(is_nl); + unsigned int barrier_after_field1 = (printable_mask | nl_mask) >> field1_end_off; + if (UNLIKELY(barrier_after_field1 == 0)) { return slow(); } + int field2_rel_off = __builtin_ctz(barrier_after_field1); + if (UNLIKELY(ptr[field1_end_off + field2_rel_off] == '\n' || + ptr[field1_end_off + field2_rel_off] == '\r')) { + return slow(); + } + int field2_start_off = field1_end_off + field2_rel_off; unsigned int ws_after_field2_start = ws_mask >> field2_start_off; if (UNLIKELY(ws_after_field2_start == 0)) { return slow(); } int field2_end_off = field2_start_off + __builtin_ctz(ws_after_field2_start); - simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl); - unsigned int stop_mask = printable_mask | (unsigned int)simde_mm256_movemask_epi8(is_nl); + unsigned int stop_mask = printable_mask | nl_mask; unsigned int stop_after_field2 = stop_mask >> field2_end_off; if (LIKELY(stop_after_field2 != 0)) { ptr = ptr + field2_end_off + __builtin_ctz(stop_after_field2); diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index 35e83a01aa..45eccce23d 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -2,7 +2,6 @@ // reserved. 
SPDX-License-Identifier: Apache-2.0 #include "fast_parser.hpp" -#include #include "fast_parse_primitives.hpp" #include "file_reader.hpp" #include "hash_table_smallstr.hpp" @@ -20,12 +19,11 @@ #include #include -#include -#include #include #include #include -#include +#include +#include #include #include #include @@ -36,15 +34,14 @@ #include #include #include -#include #include #include -#include -#include #include #include #include +#include + #define MPS_FAST_COMPACT_ROW_HASH #define MPS_FAST_THP_PREFAULT @@ -863,6 +860,9 @@ static bool parse_rows_section_parallel_impl(parse_state_t& state, } size_t total_rows = offsets[(size_t)num_threads]; + if (UNLIKELY(total_rows > (size_t)INT_MAX)) { + state.cursor.error("fast MPS parser requires <= INT_MAX rows, got %zu", total_rows); + } { scoped_timer_t timer("rows_resize_outputs"); state.row_names_sv.resize(total_rows); @@ -1003,6 +1003,10 @@ static void parse_rows_section_serial_impl(parse_state_t& state, const } expect_eol(state.cursor); } + if (UNLIKELY(state.row_names_sv.size() > (size_t)INT_MAX)) { + state.cursor.error("fast MPS parser requires <= INT_MAX rows, got %zu", + state.row_names_sv.size()); + } } template @@ -2969,7 +2973,7 @@ static padded_memory_input_t read_compressed_mps_file(const std::string& path) if (buffer.empty()) { buffer.push_back('\0'); } std::size_t input_size = buffer.size() - 1; - buffer.resize(input_size + input_buffer_padding_bytes, '\0'); + ensure_input_buffer_padding(buffer, input_size); return {std::move(buffer), input_size, get_file_size(path)}; } diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index e874011db8..76ee5b6b5b 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -5,6 +5,7 @@ #include "nvtx_ranges.hpp" #include +#include #include #include @@ -66,6 +67,18 @@ std::size_t add_input_padding(std::size_t size) } // namespace +void 
ensure_input_buffer_padding(std::vector& buffer, std::size_t input_size) +{ + if (input_size > buffer.size()) { + mps_parser_fail(error_type_t::ValidationError, + "input_size %zu exceeds buffer size %zu", + input_size, + buffer.size()); + } + std::size_t required = add_input_padding(input_size); + if (buffer.size() < required) { buffer.resize(required, '\0'); } +} + std::size_t get_file_size(int fd, const std::string& path) { struct stat st; @@ -128,22 +141,29 @@ bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset) raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) { MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io); - buffered_fd_ = ::open(path.c_str(), O_RDONLY); - if (buffered_fd_ < 0) { + int buffered_fd = ::open(path.c_str(), O_RDONLY); + cuopt::scope_guard close_buffered([&] { + if (buffered_fd >= 0) { ::close(buffered_fd); } + }); + if (buffered_fd < 0) { mps_parser_fail(error_type_t::RuntimeError, "Failed to open raw MPS file '%s': %s", path.c_str(), std::strerror(errno)); } - file_size_ = get_file_size(buffered_fd_, path); - fd_ = buffered_fd_; - bool use_direct_io = file_size_ > raw_input_direct_io_threshold_bytes; - if (use_direct_io) { + int direct_fd = -1; + cuopt::scope_guard close_direct([&] { + if (direct_fd >= 0) { ::close(direct_fd); } + }); + + file_size_ = get_file_size(buffered_fd, path); + int read_fd = buffered_fd; + if (file_size_ > raw_input_direct_io_threshold_bytes) { #ifdef O_DIRECT - int direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT); + direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT); if (direct_fd >= 0) { - fd_ = direct_fd; + read_fd = direct_fd; direct_io_ = true; } #endif @@ -160,6 +180,11 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) section_scanner_ = std::make_unique(output_data_, window_count_, registry_); + + buffered_fd_ = buffered_fd; + buffered_fd = -1; + fd_ = read_fd; + if (read_fd == direct_fd) { direct_fd = -1; } } 
raw_input_stream_t::~raw_input_stream_t() @@ -229,6 +254,7 @@ memory_input_stream_t::memory_input_stream_t(std::vector buffer, std::size_t compressed_size) : buffer_(std::move(buffer)), input_size_(input_size), compressed_size_(compressed_size) { + ensure_input_buffer_padding(buffer_, input_size_); section_scanner_ = std::make_unique(buffer_.data(), 1, registry_); } diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp index 802d6fe191..8c24a3d297 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.hpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp @@ -37,12 +37,15 @@ #include #include #include +#include #include namespace cuopt::linear_programming::io::detail { inline constexpr std::size_t input_buffer_padding_bytes = 64; +void ensure_input_buffer_padding(std::vector& buffer, std::size_t input_size); + struct lz4_pipeline_t; /** @@ -122,6 +125,27 @@ class parallel_error_latch_t { std::atomic_bool stopped_{false}; }; +class scoped_thread_group { + public: + void reserve(std::size_t count) { threads_.reserve(count); } + + template + void emplace(F&& f) + { + threads_.emplace_back(std::forward(f)); + } + + ~scoped_thread_group() + { + for (auto& thread : threads_) { + if (thread.joinable()) { thread.join(); } + } + } + + private: + std::vector threads_; +}; + // Work-stealing parallel loop over [0, count). Each of thread_count workers pulls // the next index from a shared counter and invokes body(index). 
An exception // escaping body is captured into the latch and stops the loop; the caller is @@ -136,10 +160,10 @@ void parallel_for_indexed(std::size_t count, Body body) { std::atomic_size_t next{0}; - std::vector workers; + scoped_thread_group workers; workers.reserve(thread_count); for (std::size_t t = 0; t < thread_count; ++t) { - workers.emplace_back([&, t] { + workers.emplace([&, t] { if (thread_name_prefix != nullptr) { std::string name = thread_name_prefix + std::to_string(t); nvtx::name_current_thread(name.c_str()); @@ -156,9 +180,6 @@ void parallel_for_indexed(std::size_t count, } }); } - for (auto& worker : workers) { - worker.join(); - } } struct input_stream_view_t { diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index 2c40d6745b..d26109b011 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -6,6 +6,7 @@ #include "nvtx_ranges.hpp" #include +#include #include @@ -344,12 +345,15 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path) ensure_lz4_runtime_available(); - fd_ = open_lz4_fd(path); - ::posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + int fd = open_lz4_fd(path); + cuopt::scope_guard close_fd([&] { + if (fd >= 0) { ::close(fd); } + }); + ::posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); - compressed_size_ = get_file_size(fd_, path); + compressed_size_ = get_file_size(fd, path); - lz4_frame_header_t header = parse_lz4_frame_header(fd_, path, compressed_size_); + lz4_frame_header_t header = parse_lz4_frame_header(fd, path, compressed_size_); block_max_size_ = header.block_max_size; content_size_ = header.content_size; header_size_ = header.header_size; @@ -378,6 +382,9 @@ lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path) section_scanner_ = std::make_unique(output_data_, block_slot_count_, registry_); + + fd_ = fd; + fd = -1; } 
lz4_input_stream_t::~lz4_input_stream_t() @@ -455,13 +462,14 @@ struct lz4_pipeline_t { void run() { - std::thread scanner(&lz4_pipeline_t::run_scanner_stage, this); - start_decoders(); - run_readers(); - - scanner.join(); - for (auto& worker : decoders) { - worker.join(); + { + scoped_thread_group background; + background.reserve(io_threads + 1); + background.emplace([this] { run_scanner_stage(); }); + for (std::size_t t = 0; t < io_threads; ++t) { + background.emplace([this, t] { run_decoder_stage(t); }); + } + run_readers(); } latch.rethrow_if_error(); } @@ -548,14 +556,6 @@ struct lz4_pipeline_t { window_cv.notify_all(); } - void start_decoders() - { - decoders.reserve(io_threads); - for (std::size_t t = 0; t < io_threads; ++t) { - decoders.emplace_back(&lz4_pipeline_t::run_decoder_stage, this, t); - } - } - void run_decoder_stage(std::size_t tid) { try { @@ -846,7 +846,6 @@ struct lz4_pipeline_t { std::atomic_size_t blocks_scanned{0}; std::vector> crossing_payloads; - std::vector decoders; }; void lz4_input_stream_t::run_decode_tasks() From a1e14d504425e54448a480c7e17d6b35553119bb Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 12 Jun 2026 07:11:03 -0700 Subject: [PATCH 13/22] ai review --- .../experimental_mps_fast/lz4_file_reader.cpp | 17 ++++++++++++----- .../mps_section_scanner.cpp | 3 ++- .../mps_section_scanner.hpp | 4 ++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index d26109b011..85309efaa2 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -462,15 +462,22 @@ struct lz4_pipeline_t { void run() { + std::exception_ptr startup_error; { scoped_thread_group background; - background.reserve(io_threads + 1); - background.emplace([this] { run_scanner_stage(); }); - for (std::size_t t = 0; t < io_threads; ++t) { - background.emplace([this, t] { 
run_decoder_stage(t); }); + try { + background.reserve(io_threads + 1); + background.emplace([this] { run_scanner_stage(); }); + for (std::size_t t = 0; t < io_threads; ++t) { + background.emplace([this, t] { run_decoder_stage(t); }); + } + run_readers(); + } catch (...) { + startup_error = std::current_exception(); + fail_and_notify(startup_error); } - run_readers(); } + if (startup_error) { std::rethrow_exception(startup_error); } latch.rethrow_if_error(); } diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp index a3c9fe87a3..8d39233e4d 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp @@ -117,7 +117,8 @@ bool mps_phase_registry_t::ready(mps_phase_kind phase) const mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const { std::size_t idx = phase_index(phase); - assert(ready_[idx].load(std::memory_order_acquire)); + bool is_ready = ready_[idx].load(std::memory_order_acquire); + assert(is_ready); return ranges_[idx]; } diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp index 7fd249a7e8..824e976c4f 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp @@ -51,8 +51,8 @@ class mps_phase_registry_t { void attach_event(mps_phase_kind phase, omp_event_handle_t event); bool ready(mps_phase_kind phase) const; - // range() is lock-free: callers must observe ready(phase)==true first. The - // acquire load in ready() pairs with publish()'s release store before ranges_. + // range() acquire-loads ready_[phase] (pairs with publish()'s release store) before + // reading ranges_[phase]. Callers must not invoke range() until the phase is published. 
mps_phase_range_t range(mps_phase_kind phase) const; void publish_endata(const char* begin, bool present); From fe0aa31301ac08467a714a12c41d4ec7c7a7f7d1 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 12 Jun 2026 07:55:12 -0700 Subject: [PATCH 14/22] Comments on the build flags --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4ecb1e9a46..e50dc52172 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -461,7 +461,7 @@ if (HOST_LINEINFO) set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") endif () -# Needed for the fast MPS parser +# Needed for the fast MPS parser, available on all x86-64-v3 compliant x86 CPUs (essentially since Haswell ~2013) if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$") set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} From cfaccc3eb52120c36cd8a7b42121d1c6e6c9d6d5 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Sat, 13 Jun 2026 04:36:45 -0700 Subject: [PATCH 15/22] gate O_DIRECT behind non-nfs, add missing license notices --- .../io/experimental_mps_fast/file_reader.cpp | 65 ++++++++++++++++++- .../io/experimental_mps_fast/file_reader.hpp | 3 + thirdparty/THIRD_PARTY_LICENSES | 60 +++++++++++++++++ 3 files changed, 125 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index 76ee5b6b5b..c00f84eb5e 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -10,11 +10,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -37,6 +39,7 @@ namespace { constexpr std::size_t raw_input_window_bytes = 64ull * 1024ull * 1024ull; constexpr std::size_t raw_input_max_read_threads = 8; 
constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull; +constexpr long nfs_super_magic = 0x6969; bool path_has_suffix(const std::string& path, const char* suffix) noexcept { @@ -65,6 +68,12 @@ std::size_t add_input_padding(std::size_t size) return size + input_buffer_padding_bytes; } +bool is_nfs_backed_path(const std::string& path) noexcept +{ + struct statfs fs; + return ::statfs(path.c_str(), &fs) == 0 && fs.f_type == nfs_super_magic; +} + } // namespace void ensure_input_buffer_padding(std::vector& buffer, std::size_t input_size) @@ -157,9 +166,13 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) if (direct_fd >= 0) { ::close(direct_fd); } }); - file_size_ = get_file_size(buffered_fd, path); - int read_fd = buffered_fd; - if (file_size_ > raw_input_direct_io_threshold_bytes) { + file_size_ = get_file_size(buffered_fd, path); + int read_fd = buffered_fd; + bool large_enough_for_direct = file_size_ > raw_input_direct_io_threshold_bytes; + bool nfs_backed = is_nfs_backed_path(path); + // Buffered reads are consistently faster than O_DIRECT on our NFS mounts; + // keep direct I/O for large local files where it wins. + if (large_enough_for_direct && !nfs_backed) { #ifdef O_DIRECT direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT); if (direct_fd >= 0) { @@ -231,6 +244,9 @@ void raw_input_stream_t::run_decode_tasks() // Each window is read independently and handed to the scanner, which owns the // contiguous decoded-byte frontier and the parallel section publication. 
parallel_error_latch_t latch; +#ifdef MPS_FAST_TIMERS + auto read_wall_start = std::chrono::steady_clock::now(); +#endif parallel_for_indexed( window_count_, thread_count, latch, "raw-input-read-", [&](std::size_t index) { MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io); @@ -238,13 +254,56 @@ void raw_input_stream_t::run_decode_tasks() std::size_t size = std::min(window_bytes_, file_size_ - offset); { MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io); +#ifdef MPS_FAST_TIMERS + auto start = std::chrono::steady_clock::now(); +#endif read_window_payload(offset, size); +#ifdef MPS_FAST_TIMERS + auto end = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(end - start); + read_window_ms_[index] = + (uint32_t)std::min(elapsed.count(), std::numeric_limits::max()); +#endif } MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io); section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size); }); +#ifdef MPS_FAST_TIMERS + auto read_wall_end = std::chrono::steady_clock::now(); +#endif latch.rethrow_if_error(); +#ifdef MPS_FAST_TIMERS + if (!read_window_ms_.empty()) { + std::vector sorted = read_window_ms_; + std::sort(sorted.begin(), sorted.end()); + auto percentile = [&](double pct) { + std::size_t idx = (std::size_t)std::min((double)(sorted.size() - 1), + pct * (double)(sorted.size() - 1)); + return sorted[idx]; + }; + uint64_t total_ms = 0; + for (uint32_t value : read_window_ms_) { + total_ms += value; + } + std::fprintf( + stderr, + "[RAW_READ_LATENCY] windows=%zu wall_ms=%lld total_window_ms=%llu avg_ms=%.3f min_ms=%u " + "p50_ms=%u p90_ms=%u p99_ms=%u max_ms=%u\n", + read_window_ms_.size(), + (long long)std::chrono::duration_cast(read_wall_end - + read_wall_start) + .count(), + (unsigned long long)total_ms, + (double)total_ms / (double)read_window_ms_.size(), + sorted.front(), + percentile(0.50), + percentile(0.90), + percentile(0.99), + sorted.back()); + } +#endif + output_view_size_ = 
section_scanner_->ready_bytes(); section_scanner_->publish_ready(output_view_size_); } diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp index 8c24a3d297..5472434b1a 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.hpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp @@ -283,6 +283,9 @@ class raw_input_stream_t : public input_stream_base_t { std::size_t file_size_ = 0; std::size_t window_bytes_ = 0; std::size_t window_count_ = 0; +#ifdef MPS_FAST_TIMERS + std::vector read_window_ms_; +#endif std::unique_ptr section_scanner_; }; diff --git a/thirdparty/THIRD_PARTY_LICENSES b/thirdparty/THIRD_PARTY_LICENSES index a70fa8ce1c..e09000b56d 100644 --- a/thirdparty/THIRD_PARTY_LICENSES +++ b/thirdparty/THIRD_PARTY_LICENSES @@ -512,3 +512,63 @@ Copyright notice: Jean-loup Gailly Mark Adler jloup@gzip.org madler@alumni.caltech.edu + + +----------------------------------------------------------------------------------------- +== LZ4 + +Usage: cuopt uses LZ4 through dynamically loaded library symbols + +Copyright (c) Yann Collet. All rights reserved. + +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +----------------------------------------------------------------------------------------- +== SIMDe + +Usage: cuopt uses SIMDe in experimental fast MPS parser SIMD compatibility code + +Copyright (c) 2017 Evan Nemerson + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
From 72208385fea719e1deef478ca1d04505f2789f97 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Mon, 15 Jun 2026 02:08:42 -0700 Subject: [PATCH 16/22] fix bitwise comps, more cleanup and comments --- .../io/experimental_mps_fast/fast_parser.cpp | 112 +++++++++++----- .../io/experimental_mps_fast/file_reader.cpp | 18 +-- .../experimental_mps_fast/lz4_file_reader.cpp | 125 +++++++++++++----- .../mps_section_scanner.cpp | 13 +- .../mps_section_scanner.hpp | 23 ++++ .../fast_parser_edge_test.cpp | 44 ++++-- 6 files changed, 244 insertions(+), 91 deletions(-) diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index 45eccce23d..8eae082e25 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -435,6 +435,20 @@ static inline void observe_dense_name(bool& candidate, observed_count++; } +// Maps MPS row/column names to indices via one of two strategies, chosen per problem: +// +// * dense_ordered - when every name in a section is a shared prefix followed by a +// contiguous run of integers (e.g. R0001, R0002, ... or x1, x2, ...). The index is +// then computed straight from the parsed integer (value - min_id), so no hash table +// is built or probed. This is the common, fast case for solver-generated models. +// * hash - the general fallback (smallstr_hash_table_t) for arbitrary names. +// +// Each section decides its own mode while scanning: it stays a dense_ordered "candidate" +// as long as names keep matching the prefix + consecutive-integer + zero-pad-width rule +// (see observe_dense_name), and the first violation drops it to the hash path. The chosen +// mode lives in row_index_mode / col_index_mode, and every lookup branches on it +// (row_lookup / read_row_lookup vs the dense_ordered variants below). Holding this in mind +// explains most of the paired/dual code paths throughout this file. 
template struct parse_state_t { mps_data_model_t& problem; @@ -510,6 +524,51 @@ struct parse_state_t { return true; } + // Insert all rows into the hash table. The perf-counter instrumentation is isolated in + // these two helpers so its #ifdefs do not fragment init_row_hash_table_impl's setup flow; + // both compile down to a bare insert loop when MPS_FAST_PERF_COUNTERS is off. + void insert_rows_partitioned( + int num_threads, + const std::array& partition_offsets, + const std::vector& row_order, + const std::vector& row_hashes) + { + scoped_timer_t timer("row_hash_insert_partitioned"); +#ifdef MPS_FAST_PERF_COUNTERS + std::vector perf_snapshots(MPS_ROW_HASH_PARTITIONS); +#endif +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) { + size_t p = (size_t)part_id; +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; +#endif + for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) { + size_t idx = row_order[pos]; + row_hash_.insert_partition(p, row_names_sv[idx], row_hashes[idx], idx); + } +#ifdef MPS_FAST_PERF_COUNTERS + perf_snapshots[p] = perf_counters.stop(); +#endif + } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("row_hash_insert_partitioned", perf_snapshots); +#endif + } + + void insert_rows_serial(size_t n_rows) + { +#ifdef MPS_FAST_PERF_COUNTERS + thread_perf_counters_t perf_counters; +#endif + for (size_t idx = 0; idx < n_rows; ++idx) { + row_hash_.insert_serial(row_names_sv[idx], idx); + } +#ifdef MPS_FAST_PERF_COUNTERS + print_perf_totals("row_hash_insert_all", {perf_counters.stop()}); +#endif + } + void init_row_hash_table_impl() { scoped_timer_t timer("row_hash_init_total"); @@ -580,37 +639,9 @@ struct parse_state_t { scoped_timer_t timer("row_hash_insert_all"); row_hash_.reset_build_probe_stats(); if (use_partitioned) { - scoped_timer_t timer("row_hash_insert_partitioned"); -#ifdef MPS_FAST_PERF_COUNTERS - 
std::vector perf_snapshots(MPS_ROW_HASH_PARTITIONS); -#endif -#pragma omp parallel for schedule(static) num_threads(num_threads) - for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) { - size_t p = (size_t)part_id; -#ifdef MPS_FAST_PERF_COUNTERS - thread_perf_counters_t perf_counters; -#endif - for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) { - size_t idx = row_order[pos]; - row_hash_.insert_partition(p, row_names_sv[idx], row_hashes[idx], idx); - } -#ifdef MPS_FAST_PERF_COUNTERS - perf_snapshots[p] = perf_counters.stop(); -#endif - } -#ifdef MPS_FAST_PERF_COUNTERS - print_perf_totals("row_hash_insert_partitioned", perf_snapshots); -#endif + insert_rows_partitioned(num_threads, partition_offsets, row_order, row_hashes); } else { -#ifdef MPS_FAST_PERF_COUNTERS - thread_perf_counters_t perf_counters; -#endif - for (size_t idx = 0; idx < n_rows; ++idx) { - row_hash_.insert_serial(row_names_sv[idx], idx); - } -#ifdef MPS_FAST_PERF_COUNTERS - print_perf_totals("row_hash_insert_all", {perf_counters.stop()}); -#endif + insert_rows_serial(n_rows); } row_hash_.print_build_probe_report(n_rows); } @@ -798,6 +829,11 @@ static std::vector compute_row_chunk_boundaries(const char } // reads the row section in chunks and inserts into the worker's hash table partition +// Parallel ROWS parser: count constraints per chunk, prefix-sum, then fill the output arrays +// in parallel (with per-chunk dense-name reconciliation at the end). Must keep the same line +// grammar as its serial twin parse_rows_section_serial_impl; parse_rows_section chooses between +// them by size. Returns false if a chunk hit a malformed line (nothing committed for the fill +// pass), so the caller can reset and retry serially for clean error reporting. 
template static bool parse_rows_section_parallel_impl(parse_state_t& state, const char* rows_start, @@ -1808,6 +1844,9 @@ static void materialize_problem_csr(parse_state_t& state) state.temp_A_indices_region.reset(); } +// COLUMNS is always parsed chunk-parallel: each chunk is counted/parsed by parse_columns_chunk +// and the per-chunk results are stitched together by merge_chunk_results_to_csr. There is no +// separate serial implementation -- a single thread just runs one chunk through the same path. template static void parse_columns_section_parallel(parse_state_t& state, int num_threads, @@ -1997,6 +2036,10 @@ static bool apply_bound_record(std::string_view bound_type, return true; } +// Parallel BOUNDS parser for the common dense/ordered-name case. Returns false when the section +// is too small or not safely parallelizable, so parse_bounds_section resets and falls back to its +// serial path. Bound-type semantics (LO/UP/FX/...) are shared with the serial path through +// apply_bound_record, so the two cannot drift. template static bool parse_bounds_section_parallel_dense(parse_state_t& state, cursor_t& cursor, @@ -2791,6 +2834,10 @@ static mps_data_model_t parse_mps_fast_stream(Stream& stream, input.registry->publish(mps_phase_kind::quadratic, empty); }; + // These ints carry no data; they exist only as OpenMP task-dependency tokens. A task's + // depend(out: X) "produces" X and depend(in: X) waits on it, so the phase ordering in the + // task graph below (e.g. bounds after columns_done, because bounds reference variable names) + // is expressed purely through which tokens each task depends on. 
int header_ready = 0, rows_ready = 0, columns_ready = 0; int rhs_ready = 0, bounds_ready = 0, ranges_ready = 0, quadratic_ready = 0; int header_done = 0, rows_done = 0, columns_done = 0; @@ -2807,6 +2854,11 @@ static mps_data_model_t parse_mps_fast_stream(Stream& stream, #pragma omp single { + // Bridge between the producer and the parse tasks: each detached task below blocks + // until run_decode_tasks() publishes that phase's byte range into the registry, then + // completes its event and fulfills depend(out: _ready) -- releasing the matching + // parse task. This is what lets ROWS parsing start the instant the ROWS bytes are + // decoded, overlapping with the decode of later sections. omp_event_handle_t ev_header; #pragma omp task detach(ev_header) depend(out : header_ready) { diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index c00f84eb5e..1ccedba52e 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -7,6 +7,8 @@ #include #include +#include + #include #include #include @@ -48,18 +50,6 @@ bool path_has_suffix(const std::string& path, const char* suffix) noexcept path.compare(path.size() - suffix_len, suffix_len, suffix) == 0; } -std::size_t round_up_to_multiple(std::size_t value, std::size_t alignment) -{ - if (alignment == 0) { return value; } - std::size_t remainder = value % alignment; - if (remainder == 0) { return value; } - std::size_t increment = alignment - remainder; - if (value > std::numeric_limits::max() - increment) { - mps_parser_fail(error_type_t::OutOfMemoryError, "allocation size overflow"); - } - return value + increment; -} - std::size_t add_input_padding(std::size_t size) { if (size > std::numeric_limits::max() - input_buffer_padding_bytes) { @@ -184,8 +174,8 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) window_bytes_ = raw_input_window_bytes; window_count_ = std::max(1, 
(file_size_ + window_bytes_ - 1) / window_bytes_); - output_mapped_size_ = round_up_to_multiple( - std::max(add_input_padding(file_size_), 1), system_page_size()); + output_mapped_size_ = + cuda::round_up(std::max(add_input_padding(file_size_), 1), system_page_size()); output_region_ = mmap_region_t::anonymous( output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer"); output_data_ = output_region_.char_data(); diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index 85309efaa2..4696b0ae81 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -240,20 +240,16 @@ class lz4_resident_windows_t { if (windows_.empty()) { mps_parser_fail(error_type_t::RuntimeError, "LZ4 resident window lookup with no windows"); } - std::size_t lo = 0; - std::size_t hi = windows_.size(); - while (lo < hi) { - std::size_t mid = lo + (hi - lo) / 2; - const auto& w = windows_[mid]; - if (offset < w.file_offset) { - hi = mid; - } else if (offset >= w.file_offset + w.size) { - lo = mid + 1; - } else { - return w; - } + std::size_t window_stride = windows_.size() > 1 ? 
windows_[1].file_offset : windows_[0].size; + std::size_t idx = offset / window_stride; + if (idx >= windows_.size()) { + mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows"); } - mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows"); + const auto& w = windows_[idx]; + if (offset >= w.file_offset + w.size) { + mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows"); + } + return w; } std::vector& windows_; @@ -431,22 +427,34 @@ struct resident_block_desc_t { bool uncompressed = false; }; +struct window_state_t { + std::atomic decode_refs{0}; + std::atomic released{0}; +}; + // Two distinct units flow through this pipeline: // * window - a fixed-size span of the compressed file read by the I/O stage. // * block - a single independent LZ4 data block (decompressed unit) that the // metadata scanner discovers inside the resident windows. // Windows feed blocks; the decoded blocks are handed to the section scanner, // which owns the contiguous decoded-byte frontier and section publication. +// +// Locking (the grouped members below repeat each guard in context): +// * window_mutex - guards window_done[] (reader -> scanner readiness) +// * desc_mutex - guards desc_queue + scanner_done (scanner -> decoders) +// * window_release_mutex - serializes freeing a window buffer + RSS accounting +// * window_state_[].decode_refs/.released, scanned_through_, blocks_scanned, +// compressed_resident_bytes - lock-free atomics +// Locks are never nested. The scanner thread is the sole writer of the frame walk, +// so offset / decompressed_offset are mutated without locking. 
struct lz4_pipeline_t { explicit lz4_pipeline_t(lz4_input_stream_t& input_) : input(input_), window_count(cuda::ceil_div(input.compressed_size_, window_bytes)), windows(window_count), + window_state_(std::make_unique(window_count)), io_threads(std::min(lz4_input_max_io_threads, window_count)), - window_done(window_count, 0), - window_refs(window_count), - window_scanned(window_count), - window_released(window_count) + window_done(window_count, 0) { for (std::size_t i = 0; i < window_count; ++i) { std::size_t offset = i * window_bytes; @@ -454,12 +462,23 @@ struct lz4_pipeline_t { windows[i].index = i; windows[i].file_offset = offset; windows[i].size = size; - window_refs[i].store(0, std::memory_order_relaxed); - window_scanned[i].store(0, std::memory_order_relaxed); - window_released[i].store(0, std::memory_order_relaxed); } } + // Runs the three-stage pipeline to completion: + // + // readers --window_done/window_cv--> scanner --desc_queue/desc_cv--> decoders + // + // * readers (io_threads): pread fixed compressed windows into RAM, mark ready. + // * scanner (1 thread) : walk the LZ4 frame in order, slice it into block + // descriptors, push them to decoders in batches. + // * decoders (io_threads): decompress blocks into the output buffer and hand + // each to the section scanner, which advances the + // decoded-byte frontier and publishes section ranges. + // + // Consumers are spawned first so they are parked waiting before the readers (which + // run on this thread) start producing. scoped_thread_group joins the background + // threads on scope exit; any stage's failure is captured in `latch` and rethrown here. void run() { std::exception_ptr startup_error; @@ -471,7 +490,7 @@ struct lz4_pipeline_t { for (std::size_t t = 0; t < io_threads; ++t) { background.emplace([this, t] { run_decoder_stage(t); }); } - run_readers(); + run_readers(); // produce on the calling thread, now that consumers are parked } catch (...) 
{ startup_error = std::current_exception(); fail_and_notify(startup_error); @@ -503,12 +522,11 @@ struct lz4_pipeline_t { void try_release_window(std::size_t index) { if (index >= window_count) { return; } - if (window_scanned[index].load(std::memory_order_acquire) == 0) { return; } - if (window_refs[index].load(std::memory_order_acquire) != 0) { return; } + if (index >= scanned_through_.load(std::memory_order_acquire)) { return; } + window_state_t& state = window_state_[index]; + if (state.decode_refs.load(std::memory_order_acquire) != 0) { return; } uint8_t expected = 0; - if (!window_released[index].compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) { - return; - } + if (!state.released.compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) { return; } std::lock_guard lock(window_release_mutex); if (windows[index].data) { windows[index].data.reset(); @@ -518,9 +536,13 @@ struct lz4_pipeline_t { void mark_windows_scanned_before(std::size_t offset) { - std::size_t last_excl = std::min(window_count, offset / window_bytes); - for (std::size_t wi = 0; wi < last_excl; ++wi) { - window_scanned[wi].store(1, std::memory_order_release); + assert(offset >= last_mark_offset_); + last_mark_offset_ = offset; + std::size_t new_scanned_through = std::min(window_count, offset / window_bytes); + std::size_t prev = scanned_through_.load(std::memory_order_relaxed); + if (new_scanned_through <= prev) { return; } + scanned_through_.store(new_scanned_through, std::memory_order_release); + for (std::size_t wi = prev; wi < new_scanned_through; ++wi) { try_release_window(wi); } } @@ -625,7 +647,8 @@ struct lz4_pipeline_t { void release_block_window_ref(const resident_block_desc_t& block) { if (block.window_index == std::numeric_limits::max()) { return; } - uint32_t old = window_refs[block.window_index].fetch_sub(1, std::memory_order_acq_rel); + uint32_t old = + window_state_[block.window_index].decode_refs.fetch_sub(1, std::memory_order_acq_rel); assert(old > 0); 
if (old == 1) { try_release_window(block.window_index); } } @@ -743,6 +766,7 @@ struct lz4_pipeline_t { std::size_t& offset, std::size_t& decompressed_offset) { + // --- Decode the block-size word and validate it --------------------------- bool uncompressed = (raw_block_size & lz4_uncompressed_block) != 0; std::size_t block_payload_size = raw_block_size & lz4_block_size_mask; if (block_payload_size == 0) { @@ -757,12 +781,16 @@ struct lz4_pipeline_t { "LZ4 frame contains more blocks than content size allows"); } + // --- Wait until the payload bytes are resident ---------------------------- wait_range_ready(offset, block_payload_size); if (offset + block_payload_size > input.compressed_size_) { mps_parser_fail(error_type_t::ValidationError, "truncated LZ4 frame while reading block payload"); } + // --- Determine the decompressed size -------------------------------------- + // Compressed blocks expand to block_max_size_ (or the content-size remainder + // for the final block); uncompressed blocks keep their payload size. std::size_t decompressed_size = block_payload_size; if (!uncompressed) { decompressed_size = @@ -775,6 +803,12 @@ struct lz4_pipeline_t { mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size"); } + // --- Stage the payload for the decoder ------------------------------------ + // Fast path: the whole payload lives in one window, so point the decoder + // straight at it (zero copy) and pin that window with a decode_refs bump until + // the decode completes. Otherwise it straddles a window boundary: copy it out + // into crossing_payloads, which stays alive for the whole run, so no window pin + // is needed (and the source window can be released as soon as it is scanned). 
const char* src = resident.ptr_if_contiguous(offset, block_payload_size); std::size_t window_index = std::numeric_limits::max(); if (src == nullptr) { @@ -783,9 +817,10 @@ struct lz4_pipeline_t { src = crossing_payloads.back().data(); } else { window_index = offset / window_bytes; - window_refs[window_index].fetch_add(1, std::memory_order_acq_rel); + window_state_[window_index].decode_refs.fetch_add(1, std::memory_order_acq_rel); } + // --- Record the descriptor and advance past the block (+ optional checksum) - resident_block_desc_t block{src, block_payload_size, decompressed_offset, @@ -829,28 +864,50 @@ struct lz4_pipeline_t { } } + // ---- Input + chunking (immutable after construction) ------------------------ + // The compressed file is split into fixed-size `windows`; `io_threads` reader + // threads pull them by index. lz4_input_stream_t& input; const std::size_t window_bytes = lz4_pipeline_batch_bytes; const std::size_t window_count; std::vector windows; const std::size_t io_threads; + // First-error-wins latch shared by all three stages: stops the pipeline and + // retains the first exception for run() to rethrow after the threads join. parallel_error_latch_t latch; + // ---- Reader -> scanner readiness (guarded by window_mutex) ----------------- + // A reader sets window_done[i]=1 once window i is resident; the scanner blocks + // on window_cv until every window covering the bytes it needs is ready. std::vector window_done; - std::vector> window_refs; - std::vector> window_scanned; - std::vector> window_released; std::mutex window_mutex; std::condition_variable window_cv; + + // ---- Window lifecycle / early release --------------------------------------- + // windows[i].data is freed exactly once, when the metadata scan has passed window i + // (scanned_through_ > i) AND no decoder still pins it (window_state_[i].decode_refs == 0). 
+ // scanned_through_ advances monotonically in mark_windows_scanned_before (last_mark_offset_ + // asserts that monotonicity); decode_refs bumps in scan_one_block and drops in + // release_block_window_ref; the per-window `released` CAS makes the free exactly-once. + // window_release_mutex serializes the data.reset() + compressed_resident_bytes accounting. + std::unique_ptr window_state_; + std::atomic_size_t scanned_through_{0}; + std::size_t last_mark_offset_{0}; std::mutex window_release_mutex; std::atomic_size_t compressed_resident_bytes{0}; + // ---- Scanner -> decoder queue (guarded by desc_mutex) ---------------------- + // The scanner pushes batches of block descriptors; decoders pop them via desc_cv. + // scanner_done signals the scanner has emitted its final batch. std::deque> desc_queue; bool scanner_done = false; std::mutex desc_mutex; std::condition_variable desc_cv; + // ---- Scanner scratch / progress --------------------------------------------- + // blocks_scanned doubles as the running block index; crossing_payloads holds staged + // copies of blocks that straddle a window boundary (see scan_one_block). std::atomic_size_t blocks_scanned{0}; std::vector> crossing_payloads; }; diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp index 8d39233e4d..b6b04afbff 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp @@ -303,11 +303,14 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index, "MPS section scanner observed invalid LZ4 block index"); } + // --- Scan this block, then record its extent and mark it decoded. The release store on + // block_decoded_ publishes the two relaxed offset stores above it. 
scan_section_range(begin, end); block_begin_offsets_[block_index].store((std::size_t)(begin - data_), std::memory_order_relaxed); block_end_offsets_[block_index].store((std::size_t)(end - data_), std::memory_order_relaxed); block_decoded_[block_index].store(1, std::memory_order_release); + // --- Rescan the seams with already-decoded neighbors, in case a title straddles the boundary. if (block_index > 0 && block_decoded_[block_index - 1].load(std::memory_order_acquire)) { scan_boundary(block_index - 1, block_index); } @@ -316,6 +319,7 @@ void mps_section_block_scanner_t::observe_block(std::size_t block_index, scan_boundary(block_index, block_index + 1); } + // --- Extend the contiguous decoded-byte frontier and publish any newly bounded phases. advance_ready_frontier(); } @@ -324,8 +328,6 @@ void mps_section_block_scanner_t::advance_ready_frontier() std::size_t new_ready = 0; bool grew = false; { - // block_decoded_ is stored with release after the begin/end offsets, so an - // acquire load of a set flag makes the matching end offset visible here. std::lock_guard lock(frontier_mutex_); while (next_block_ < block_count_ && block_decoded_[next_block_].load(std::memory_order_acquire)) { @@ -409,6 +411,13 @@ void mps_section_block_scanner_t::notify_ready_phases() } }; + // Three publication shapes follow: + // (1) mandatory header/rows/columns -- each spans from its start to the next mandatory + // section; published as soon as that bounding section is available. + // (2) optional rhs/ranges/bounds via publish_optional -- present=true once bounded, or + // present=false once a later section proves the optional one cannot still appear. + // (3) quadratic -- starts at the earliest of the three quad markers (quadobj/qmatrix/qcmatrix). + // final_boundary (ENDATA, or the final ready frontier for truncated files) closes the tail. 
if (available(rows) && !registry_.ready(mps_phase_kind::header)) { registry_.publish(mps_phase_kind::header, {data_, rows, true}); } diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp index 824e976c4f..5d05e8b2f8 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp @@ -61,6 +61,9 @@ class mps_phase_registry_t { bool endata_present() const; private: + // mutex_ guards ranges_/events_/has_event_/event_fulfilled_ and the endata_* fields for writers. + // Readers observe ready_[phase] / endata_ready_ (release-stored under the lock on publish, + // acquire-loaded here) and may then read the matching range lock-free -- see range()'s contract. static constexpr std::size_t phase_count = 7; static std::size_t phase_index(mps_phase_kind phase); @@ -76,6 +79,19 @@ class mps_phase_registry_t { mutable std::mutex mutex_; }; +// Turns out-of-order decoded blocks into ordered section-range publications for the parser: +// +// producer --observe_block(i,...)--> [SIMD-scan block i for section titles] --> section_hits_ +// [advance contiguous decoded-byte frontier (ready_bytes_)] +// --> notify_ready_phases --> registry --> parser tasks +// +// Producers (the LZ4 decoders / raw readers) call observe_block for each block in any order. +// Per block the scanner (1) SIMD-scans it for section titles starting in column 1 and records +// the first byte of each section via a first-writer-wins CAS; (2) advances a contiguous +// decoded-byte frontier across whatever leading blocks are now present; and (3) recomputes which +// phases are fully bounded and publishes their [begin,end) ranges to the registry, unblocking the +// matching parser task. A title can straddle two blocks, so adjacent decoded blocks are also +// rescanned over a small overlap (boundary_overlap). 
class mps_section_block_scanner_t { public: mps_section_block_scanner_t(const char* data, @@ -107,6 +123,13 @@ class mps_section_block_scanner_t { void notify_ready_phases(); void advance_ready_frontier(); + // Concurrency: observe_block runs concurrently on many producer threads. + // * frontier_mutex_ guards next_block_ and the ready_bytes_ frontier advance. + // * publish_mutex_ serializes notify_ready_phases so each phase publishes once, in order. + // * block_decoded_[i] is release-stored after block_begin/end_offsets_[i] (relaxed), so an + // acquire-load of a set flag makes those offsets visible to the reader. + // * section_hits_[k] is a first-writer-wins CAS holding the earliest byte of section k. + // * registry_ carries its own internal lock. const char* data_ = nullptr; std::size_t block_count_ = 0; mps_phase_registry_t& registry_; diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp index fe349b47e0..07cc0139fc 100644 --- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -91,6 +92,19 @@ std::string_view range_text(const mps_phase_range_t& range) uint64_t bits(double value) { return std::bit_cast(value); } +template +void expect_vectors_bitwise_equal(const std::vector& reference, + const std::vector& fast, + std::string_view field, + std::string_view context) +{ + static_assert(std::is_trivially_copyable_v); + SCOPED_TRACE(std::string(context) + " " + std::string(field)); + ASSERT_EQ(reference.size(), fast.size()) << "size"; + if (reference.empty()) { return; } + EXPECT_EQ(0, std::memcmp(reference.data(), fast.data(), reference.size() * sizeof(T))); +} + void check_models_match_reference_bitwise(const parser_model_t& fast, const mps_data_model_t& 
reference, std::string_view context) @@ -109,19 +123,27 @@ void check_models_match_reference_bitwise(const parser_model_t& fas EXPECT_EQ(bits(reference.objective_offset_), bits(fast.objective_offset_)) << std::string(context) + " objective_offset"; - EXPECT_EQ(reference.A_, fast.A_) << std::string(context) + " A"; + expect_vectors_bitwise_equal(reference.A_, fast.A_, "A", context); EXPECT_EQ(reference.A_indices_, fast.A_indices_) << std::string(context) + " A_indices"; EXPECT_EQ(reference.A_offsets_, fast.A_offsets_) << std::string(context) + " A_offsets"; - EXPECT_EQ(reference.b_, fast.b_) << std::string(context) + " b"; - EXPECT_EQ(reference.c_, fast.c_) << std::string(context) + " c"; - EXPECT_EQ(reference.variable_lower_bounds_, fast.variable_lower_bounds_) - << std::string(context) + " variable_lower_bounds"; - EXPECT_EQ(reference.variable_upper_bounds_, fast.variable_upper_bounds_) - << std::string(context) + " variable_upper_bounds"; - EXPECT_EQ(reference.constraint_lower_bounds_, fast.constraint_lower_bounds_) - << std::string(context) + " constraint_lower_bounds"; - EXPECT_EQ(reference.constraint_upper_bounds_, fast.constraint_upper_bounds_) - << std::string(context) + " constraint_upper_bounds"; + expect_vectors_bitwise_equal(reference.b_, fast.b_, "b", context); + expect_vectors_bitwise_equal(reference.c_, fast.c_, "c", context); + expect_vectors_bitwise_equal(reference.variable_lower_bounds_, + fast.variable_lower_bounds_, + "variable_lower_bounds", + context); + expect_vectors_bitwise_equal(reference.variable_upper_bounds_, + fast.variable_upper_bounds_, + "variable_upper_bounds", + context); + expect_vectors_bitwise_equal(reference.constraint_lower_bounds_, + fast.constraint_lower_bounds_, + "constraint_lower_bounds", + context); + expect_vectors_bitwise_equal(reference.constraint_upper_bounds_, + fast.constraint_upper_bounds_, + "constraint_upper_bounds", + context); EXPECT_EQ(reference.var_types_, fast.var_types_) << std::string(context) + " 
var_types"; EXPECT_EQ(reference.row_types_, fast.row_types_) << std::string(context) + " row_types"; EXPECT_EQ(reference.var_names_, fast.var_names_) << std::string(context) + " var_names"; From 1990c067d267798334bdc651cbf7483810898d0a Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Mon, 15 Jun 2026 09:00:00 -0700 Subject: [PATCH 17/22] AI review comments --- cpp/cuopt_cli.cpp | 4 +- .../cuopt/linear_programming/io/parser.hpp | 25 ++- .../fast_fp64_parser.hpp | 7 +- .../io/experimental_mps_fast/fast_parser.cpp | 162 +++++++++++++++++- .../io/experimental_mps_fast/file_reader.cpp | 13 +- .../io/experimental_mps_fast/file_reader.hpp | 2 + .../experimental_mps_fast/lz4_file_reader.cpp | 39 ++--- .../io/experimental_mps_fast/mmap_region.hpp | 51 ++++-- .../mps_section_scanner.cpp | 7 + .../fast_fp64_parser_test.cpp | 9 + .../fast_parser_edge_test.cpp | 149 +++++++++++++++- cpp/tests/linear_programming/parser_test.cpp | 43 +++-- 12 files changed, 435 insertions(+), 76 deletions(-) diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index f06e568208..13991ad1e3 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -308,8 +308,8 @@ int main(int argc, char* argv[]) program.add_argument("--mps-reader") .help( "MPS reader implementation: default uses the production parser; experimental-fast uses the " - "experimental " - "SIMD parser for LP/MIP .mps, .mps.lz4, .mps.gz, and .mps.bz2 files") + "experimental SIMD parser for free-format LP/MIP/QP/QCQP (SOCP) .mps/.qps files and their " + ".gz/.bz2/.lz4 compressed variants") .default_value(std::string("default")) .choices("default", "experimental-fast"); diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp index 4e46d43224..2c678f4f4e 100644 --- a/cpp/include/cuopt/linear_programming/io/parser.hpp +++ b/cpp/include/cuopt/linear_programming/io/parser.hpp @@ -20,7 +20,8 @@ namespace cuopt::linear_programming::io { /** * @brief Selects which MPS reader 
implementation should be used by dispatching entry points. * - * The experimental fast reader is intentionally opt-in. It currently supports LP/MIP/QP problems. + * The experimental fast reader is intentionally opt-in. It supports the same free-format + * MPS/QPS scope as read_mps(): LP, MIP, QP (QUADOBJ/QMATRIX), and QCQP/SOCP (QCMATRIX). */ enum class mps_reader_type_t { default_reader, fast_experimental }; @@ -51,11 +52,14 @@ mps_data_model_t read_mps(const std::string& mps_file_path, bool fixed_mps_format = false); /** - * @brief Reads a raw LP/MIP/QP MPS problem with the experimental SIMD-optimized reader. SOCP is - * unsupported for now. + * @brief Reads an MPS/QPS problem with the experimental SIMD-optimized reader. * - * @param[in] mps_file_path Path to a raw or compressed .mps file. - * @return mps_data_model_t A fully formed LP/MIP problem which represents the given file. + * Supports the same free-format LP/MIP/QP/QCQP (SOCP-relevant QCMATRIX) scope as read_mps(). + * Fixed MPS format forcing is not supported. Accepts .mps/.qps and their .gz/.bz2/.lz4 variants + * (compression is detected from the file path, same as read_mps()). + * + * @param[in] mps_file_path Path to a raw or compressed .mps or .qps file. + * @return mps_data_model_t A fully formed LP/MIP/QP problem which represents the given file. */ template mps_data_model_t read_mps_fast_experimental(const std::string& mps_file_path); @@ -128,9 +132,10 @@ mps_data_model_t read_lp_from_string(std::string_view lp_contents); * @brief Reads an optimization problem from a file, dispatching on the file * extension. Extension matching is case-insensitive. 
* - * Routing: + * Routing (case-insensitive extensions): * - .mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4 - * → read_mps() + * → read_mps() when mps_reader == default_reader, or read_mps_fast_experimental() + * when mps_reader == fast_experimental (fixed_mps_format must be false) * - .lp, .lp.gz, .lp.bz2, .lp.lz4 → read_lp() * - anything else → std::logic_error * @@ -160,12 +165,6 @@ inline mps_data_model_t read(const std::string& path, throw std::logic_error( "experimental fast MPS reader does not support fixed MPS format forcing"); } - if (lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2") || - lower.ends_with(".qps.lz4")) { - throw std::logic_error( - "experimental fast MPS reader supports .mps, .mps.lz4, .mps.gz, and .mps.bz2 " - "LP/MIP files only"); - } return read_mps_fast_experimental(path); } return read_mps(path, fixed_mps_format); diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp index f007c0f707..02aca44dc3 100644 --- a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp +++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp @@ -423,7 +423,12 @@ static inline double parse_fp64_advance(const char*& p, const char* end) } double v = assemble_fp64(dec); - if (v == v) return v; + if (v == v) { + if (p < end && (unsigned char)*p > 32) { + mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token"); + } + return v; + } return fallback_strtod(std::string_view(start, (size_t)(p - start))); } diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index 8eae082e25..165d16d066 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -494,6 +495,14 @@ struct parse_state_t { // some writers introduce 
zero-column variables only in BOUNDS. std::map bounds_only_vars; + struct qcmatrix_block_t { + size_t row_idx = SIZE_MAX; + std::string_view row_name; + std::vector> entries; + }; + + std::vector qcmatrix_blocks; + parse_state_t(mps_data_model_t& p, cursor_t& c) : problem(p), cursor(c) {} void init_row_hash_table() @@ -2558,11 +2567,13 @@ static void parse_quadratic_sections(parse_state_t& state, cursor_t& c auto add_entry = [&](std::string_view var1, std::string_view var2, f_t value) { size_t var1_idx = lookup_quadratic_var(state, var1); if (var1_idx == SIZE_MAX) { - cursor.error("unknown variable name in QUADOBJ/QMATRIX: %.*s", (int)var1.size(), var1.data()); + cursor.error( + "unknown variable name in quadratic section: %.*s", (int)var1.size(), var1.data()); } size_t var2_idx = lookup_quadratic_var(state, var2); if (var2_idx == SIZE_MAX) { - cursor.error("unknown variable name in QUADOBJ/QMATRIX: %.*s", (int)var2.size(), var2.data()); + cursor.error( + "unknown variable name in quadratic section: %.*s", (int)var2.size(), var2.data()); } active_entries->emplace_back((i_t)var1_idx, (i_t)var2_idx, value); }; @@ -2576,18 +2587,42 @@ static void parse_quadratic_sections(parse_state_t& state, cursor_t& c active_entries = &qmatrix_entries; continue; } - if (accept_section(cursor, "QCMATRIX")) { - cursor.error("QCMATRIX sections are not supported by the experimental fast MPS parser"); + if (accept(cursor, "QCMATRIX")) { + auto row_name = cursor.read_field(); + if (row_name.empty()) { cursor.error("QCMATRIX missing constraint row name"); } + size_t row_idx = state.row_lookup(row_name); + if (row_idx == SIZE_MAX) { + cursor.error( + "unknown constraint row name in QCMATRIX: %.*s", (int)row_name.size(), row_name.data()); + } + char row_type = state.problem.row_types_[row_idx]; + if (row_type != 'L' && row_type != 'G') { + cursor.error( + "QCMATRIX row must have ROWS type L or G: %.*s", (int)row_name.size(), row_name.data()); + } + expect_eol(cursor); + typename 
parse_state_t::qcmatrix_block_t block; + block.row_idx = row_idx; + block.row_name = row_name; + state.qcmatrix_blocks.push_back(std::move(block)); + active_entries = &state.qcmatrix_blocks.back().entries; + continue; } if (active_entries == nullptr) { break; } - auto var1 = cursor.read_field(); + const char* field_start = cursor.ptr; + auto var1 = cursor.read_field(); if (UNLIKELY(var1.empty())) { break; } - if (UNLIKELY(var1[0] == '$')) { + if (UNLIKELY(var1[0] == '$' || var1[0] == '*')) { cursor.skip_to_eol(); expect_eol(cursor); continue; } + const bool starts_column_one = + field_start == cursor.start || field_start[-1] == '\n' || field_start[-1] == '\r'; + if (UNLIKELY(starts_column_one)) { + cursor.error("unknown quadratic section record: %.*s", (int)var1.size(), var1.data()); + } auto var2 = cursor.read_field(); if (UNLIKELY(!var2.empty() && var2[0] == '$')) { cursor.skip_to_eol(); @@ -2679,6 +2714,120 @@ static void parse_quadratic_range(parse_state_t& state, mps_phase_rang parse_quadratic_sections(state, cursor); } +template +static void finalize_qcmatrix_constraints(parse_state_t& state) +{ + if (state.qcmatrix_blocks.empty()) { return; } + scoped_timer_t timer("finalize_qcmatrix_constraints"); + const size_t original_rows = (size_t)state.problem.n_constraints_; + std::vector quadratic_rows(original_rows, 0); + std::vector seen_rows(original_rows, 0); + size_t active_blocks = 0; + + for (const auto& block : state.qcmatrix_blocks) { + if (block.entries.empty()) { continue; } + if (block.row_idx >= original_rows) { + state.cursor.error("QCMATRIX row index is out of range"); + } + if (seen_rows[block.row_idx]) { + state.cursor.error("duplicate QCMATRIX block for constraint row: %.*s", + (int)block.row_name.size(), + block.row_name.data()); + } + seen_rows[block.row_idx] = 1; + quadratic_rows[block.row_idx] = 1; + ++active_blocks; + } + + if (active_blocks == 0) { return; } + + // rebuild the A_ matrix. 
fairly ugly and brute force, could do better if we parsed the QCMATRIX + // entries before building the CSR in COLUMNS but unclear if worth it + for (const auto& block : state.qcmatrix_blocks) { + if (block.entries.empty()) { continue; } + + size_t linear_begin = (size_t)state.problem.A_offsets_[block.row_idx]; + size_t linear_end = (size_t)state.problem.A_offsets_[block.row_idx + 1]; + typename mps_data_model_t::quadratic_constraint_t qc; + qc.constraint_row_index = (i_t)block.row_idx; + qc.constraint_row_name = state.problem.row_names_[block.row_idx]; + qc.constraint_row_type = state.problem.row_types_[block.row_idx]; + qc.rhs_value = state.problem.b_[block.row_idx]; + qc.linear_values.assign(state.problem.A_.begin() + linear_begin, + state.problem.A_.begin() + linear_end); + qc.linear_indices.assign(state.problem.A_indices_.begin() + linear_begin, + state.problem.A_indices_.begin() + linear_end); + + std::vector perm(block.entries.size()); + for (size_t i = 0; i < perm.size(); ++i) { + perm[i] = i; + } + std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) { + const auto& ea = block.entries[a]; + const auto& eb = block.entries[b]; + if (std::get<0>(ea) != std::get<0>(eb)) { return std::get<0>(ea) < std::get<0>(eb); } + return std::get<1>(ea) < std::get<1>(eb); + }); + + qc.rows.reserve(block.entries.size()); + qc.cols.reserve(block.entries.size()); + qc.vals.reserve(block.entries.size()); + for (size_t idx : perm) { + const auto& [row, col, val] = block.entries[idx]; + qc.rows.push_back(row); + qc.cols.push_back(col); + qc.vals.push_back(val); + } + state.problem.quadratic_constraints_.push_back(std::move(qc)); + } + + std::vector new_A; + std::vector new_A_indices; + std::vector new_A_offsets; + std::vector new_b; + std::vector new_clb; + std::vector new_cub; + std::vector new_row_names; + std::vector new_row_types; + + new_A.reserve(state.problem.A_.size()); + new_A_indices.reserve(state.problem.A_indices_.size()); + new_A_offsets.reserve(original_rows 
+ 1 - active_blocks); + new_b.reserve(original_rows - active_blocks); + new_clb.reserve(original_rows - active_blocks); + new_cub.reserve(original_rows - active_blocks); + new_row_names.reserve(original_rows - active_blocks); + new_row_types.reserve(original_rows - active_blocks); + new_A_offsets.push_back(0); + + for (size_t row = 0; row < original_rows; ++row) { + if (quadratic_rows[row]) { continue; } + size_t begin = (size_t)state.problem.A_offsets_[row]; + size_t end = (size_t)state.problem.A_offsets_[row + 1]; + new_A.insert(new_A.end(), state.problem.A_.begin() + begin, state.problem.A_.begin() + end); + new_A_indices.insert(new_A_indices.end(), + state.problem.A_indices_.begin() + begin, + state.problem.A_indices_.begin() + end); + new_A_offsets.push_back((i_t)new_A.size()); + new_b.push_back(state.problem.b_[row]); + new_clb.push_back(state.problem.constraint_lower_bounds_[row]); + new_cub.push_back(state.problem.constraint_upper_bounds_[row]); + new_row_names.push_back(std::move(state.problem.row_names_[row])); + new_row_types.push_back(state.problem.row_types_[row]); + } + + state.problem.A_ = std::move(new_A); + state.problem.A_indices_ = std::move(new_A_indices); + state.problem.A_offsets_ = std::move(new_A_offsets); + state.problem.b_ = std::move(new_b); + state.problem.constraint_lower_bounds_ = std::move(new_clb); + state.problem.constraint_upper_bounds_ = std::move(new_cub); + state.problem.row_names_ = std::move(new_row_names); + state.problem.row_types_ = std::move(new_row_types); + state.problem.n_constraints_ = (i_t)state.problem.b_.size(); + state.problem.nnz_ = (i_t)state.problem.A_.size(); +} + template static void materialize_problem_names(parse_state_t& state) { @@ -2995,6 +3144,7 @@ static mps_data_model_t parse_mps_fast_stream(Stream& stream, parser_tasks.rethrow_if_error(); + finalize_qcmatrix_constraints(state); append_bounds_only_variables(state); input.size = stream.size(); diff --git 
a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index 1ccedba52e..48397ae11e 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -46,8 +47,12 @@ constexpr long nfs_super_magic = 0x6969; bool path_has_suffix(const std::string& path, const char* suffix) noexcept { std::size_t suffix_len = std::strlen(suffix); - return path.size() >= suffix_len && - path.compare(path.size() - suffix_len, suffix_len, suffix) == 0; + if (path.size() < suffix_len) { return false; } + for (std::size_t i = 0; i < suffix_len; ++i) { + unsigned char path_char = path[path.size() - suffix_len + i]; + if (std::tolower(path_char) != suffix[i]) { return false; } + } + return true; } std::size_t add_input_padding(std::size_t size) @@ -97,6 +102,7 @@ std::size_t get_file_size(const std::string& path) { int fd = ::open(path.c_str(), O_RDONLY); if (fd < 0) { + ::close(fd); mps_parser_fail(error_type_t::RuntimeError, "Failed to open file '%s': %s", path.c_str(), @@ -173,6 +179,9 @@ raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path) } window_bytes_ = raw_input_window_bytes; window_count_ = std::max(1, (file_size_ + window_bytes_ - 1) / window_bytes_); +#ifdef MPS_FAST_TIMERS + read_window_ms_.assign(window_count_, 0); +#endif output_mapped_size_ = cuda::round_up(std::max(add_input_padding(file_size_), 1), system_page_size()); diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp index 5472434b1a..8ca3456401 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.hpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp @@ -159,6 +159,8 @@ void parallel_for_indexed(std::size_t count, const char* thread_name_prefix, Body body) { + assert(thread_count > 0); + std::atomic_size_t next{0}; scoped_thread_group workers; 
workers.reserve(thread_count); diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp index 4696b0ae81..5e535ce7f2 100644 --- a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp @@ -557,32 +557,29 @@ struct lz4_pipeline_t { void read_window(std::size_t index) { - auto& w = windows[index]; - w.data.reset(new char[w.size]); - add_compressed_resident(w.size); - bool ok = false; - { - MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io); - ok = pread_full(input.fd_, w.data.get(), w.size, w.file_offset); - } - if (!ok) { - // Capture-and-notify locally so scanner/decoder waiters wake; do not let - // the exception escape to parallel_for_indexed without the cv notify. - try { + try { + auto& w = windows[index]; + w.data.reset(new char[w.size]); + add_compressed_resident(w.size); + bool ok = false; + { + MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io); + ok = pread_full(input.fd_, w.data.get(), w.size, w.file_offset); + } + if (!ok) { mps_parser_fail(error_type_t::RuntimeError, "Failed to pread LZ4 resident window: %s", std::strerror(errno)); - } catch (...) { - fail_and_notify(std::current_exception()); } - return; - } - { - MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic); - std::lock_guard lock(window_mutex); - window_done[index] = 1; + { + MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic); + std::lock_guard lock(window_mutex); + window_done[index] = 1; + } + window_cv.notify_all(); + } catch (...) 
{ + fail_and_notify(std::current_exception()); } - window_cv.notify_all(); } void run_decoder_stage(std::size_t tid) diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp index 7727e0d2f7..9d5469e860 100644 --- a/cpp/src/io/experimental_mps_fast/mmap_region.hpp +++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp @@ -35,20 +35,30 @@ class mmap_region_t { mmap_region_t(const mmap_region_t&) = delete; mmap_region_t& operator=(const mmap_region_t&) = delete; - mmap_region_t(mmap_region_t&& other) noexcept : ptr_(other.ptr_), size_(other.size_) + mmap_region_t(mmap_region_t&& other) noexcept + : ptr_(other.ptr_), + size_(other.size_), + unmap_ptr_(other.unmap_ptr_), + unmap_size_(other.unmap_size_) { - other.ptr_ = nullptr; - other.size_ = 0; + other.ptr_ = nullptr; + other.size_ = 0; + other.unmap_ptr_ = nullptr; + other.unmap_size_ = 0; } mmap_region_t& operator=(mmap_region_t&& other) noexcept { if (this != &other) { reset(); - ptr_ = other.ptr_; - size_ = other.size_; - other.ptr_ = nullptr; - other.size_ = 0; + ptr_ = other.ptr_; + size_ = other.size_; + unmap_ptr_ = other.unmap_ptr_; + unmap_size_ = other.unmap_size_; + other.ptr_ = nullptr; + other.size_ = 0; + other.unmap_ptr_ = nullptr; + other.unmap_size_ = 0; } return *this; } @@ -93,11 +103,7 @@ class mmap_region_t { uintptr_t raw_addr = reinterpret_cast(raw); uintptr_t aligned_addr = (raw_addr + alignment - 1) & ~(uintptr_t)(alignment - 1); - std::size_t prefix = (std::size_t)(aligned_addr - raw_addr); - std::size_t suffix = raw_size - prefix - size; - if (prefix > 0) { ::munmap(raw, prefix); } - if (suffix > 0) { ::munmap(reinterpret_cast(aligned_addr + size), suffix); } - return mmap_region_t(reinterpret_cast(aligned_addr), size); + return mmap_region_t(reinterpret_cast(aligned_addr), size, raw, raw_size); } static void map_fixed_or_throw( @@ -112,9 +118,13 @@ class mmap_region_t { void reset() noexcept { - if (ptr_ != nullptr && size_ != 0) { 
::munmap(ptr_, size_); } - ptr_ = nullptr; - size_ = 0; + void* base = unmap_ptr_ != nullptr ? unmap_ptr_ : ptr_; + std::size_t len = unmap_ptr_ != nullptr ? unmap_size_ : size_; + if (base != nullptr && len != 0) { ::munmap(base, len); } + ptr_ = nullptr; + size_ = 0; + unmap_ptr_ = nullptr; + unmap_size_ = 0; } void advise(int advice) const noexcept @@ -127,8 +137,15 @@ class mmap_region_t { std::size_t size() const noexcept { return size_; } private: - void* ptr_ = nullptr; - std::size_t size_ = 0; + mmap_region_t(void* ptr, std::size_t size, void* unmap_ptr, std::size_t unmap_size) noexcept + : ptr_(ptr), size_(size), unmap_ptr_(unmap_ptr), unmap_size_(unmap_size) + { + } + + void* ptr_ = nullptr; + std::size_t size_ = 0; + void* unmap_ptr_ = nullptr; + std::size_t unmap_size_ = 0; }; } // namespace cuopt::linear_programming::io::detail diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp index b6b04afbff..3924e2dcd5 100644 --- a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp +++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp @@ -125,6 +125,7 @@ mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const void mps_phase_registry_t::publish_endata(const char* begin, bool present) { std::lock_guard lock(mutex_); + if (endata_ready_.load(std::memory_order_acquire)) { return; } endata_begin_ = begin; endata_present_ = present; endata_ready_.store(true, std::memory_order_release); @@ -168,6 +169,12 @@ static section_record_match_t is_section_record(const char* line_start, while (after < line_end && (*after == ' ' || *after == '\t' || *after == '\r')) { ++after; } + // QCMATRIX records are of the form "QCMATRIX " + if (record.kind == mps_section_kind::qcmatrix) { + if (after == line_end) { return section_record_match_t::invalid; } + *kind = record.kind; + return section_record_match_t::section; + } if (after != line_end) { return 
section_record_match_t::invalid; } *kind = record.kind; return section_record_match_t::section; diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp index 2ef8339da3..8bde21bb61 100644 --- a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp @@ -165,6 +165,15 @@ TEST(FastFp64ParserTest, CursorAdvancesToTokenEnd) EXPECT_EQ(std::string_view(" ABC"), std::string_view(p, 5)); } +TEST(FastFp64ParserTest, RejectsMalformedNumericSuffix) +{ + std::setlocale(LC_NUMERIC, "C"); + for (const char* token : {"1x", "1e", "1d+", "1e+"}) { + SCOPED_TRACE(token); + EXPECT_THROW(parse_token(token), std::exception); + } +} + TEST(FastFp64ParserTest, FixedSeedRandomDifferential) { std::setlocale(LC_NUMERIC, "C"); diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp index 07cc0139fc..771462a9ab 100644 --- a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp +++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp @@ -5,6 +5,7 @@ #include "mps_section_scanner.hpp" #include +#include #include @@ -148,13 +149,38 @@ void check_models_match_reference_bitwise(const parser_model_t& fas EXPECT_EQ(reference.row_types_, fast.row_types_) << std::string(context) + " row_types"; EXPECT_EQ(reference.var_names_, fast.var_names_) << std::string(context) + " var_names"; EXPECT_EQ(reference.row_names_, fast.row_names_) << std::string(context) + " row_names"; + + ASSERT_EQ(reference.quadratic_constraints_.size(), fast.quadratic_constraints_.size()) + << std::string(context) + " quadratic_constraints size"; + for (size_t q = 0; q < reference.quadratic_constraints_.size(); ++q) { + const auto& ref_qc = 
reference.quadratic_constraints_[q]; + const auto& fast_qc = fast.quadratic_constraints_[q]; + SCOPED_TRACE(std::string(context) + " quadratic_constraint " + std::to_string(q)); + EXPECT_EQ(ref_qc.constraint_row_index, fast_qc.constraint_row_index); + EXPECT_EQ(ref_qc.constraint_row_name, fast_qc.constraint_row_name); + EXPECT_EQ(ref_qc.constraint_row_type, fast_qc.constraint_row_type); + EXPECT_EQ(bits(ref_qc.rhs_value), bits(fast_qc.rhs_value)); + expect_vectors_bitwise_equal( + ref_qc.linear_values, fast_qc.linear_values, "linear_values", context); + EXPECT_EQ(ref_qc.linear_indices, fast_qc.linear_indices); + expect_vectors_bitwise_equal(ref_qc.vals, fast_qc.vals, "qc_vals", context); + EXPECT_EQ(ref_qc.rows, fast_qc.rows); + EXPECT_EQ(ref_qc.cols, fast_qc.cols); + } +} + +mps_data_model_t parse_reference_model(const std::string& path) +{ + mps_data_model_t reference; + mps_parser_t parser(reference, path, false); + return reference; } void verify_fixture_bitwise(std::string_view fixture_name, std::string contents) { TempMpsFile file(std::move(contents)); auto fast = parse_mps_fast_file(file.path, FileReadMethod::Read); - auto reference = read_mps(file.path, false); + auto reference = parse_reference_model(file.path); check_models_match_reference_bitwise(fast, reference, fixture_name); } @@ -302,6 +328,27 @@ TEST(FastMpsParserEdgeTest, ScannerRejectsUnknownColumnOneRecordsAfterRows) std::logic_error); } +TEST(FastMpsParserEdgeTest, ParserRejectsUnknownSectionRecords) +{ + TempMpsFile file( + "NAME BAD_UNKNOWN_SECTION\n" + "ROWS\n" + " N OBJ\n" + " L R1\n" + "COLUMNS\n" + " X1 OBJ 1 R1 2\n" + "RHS\n" + " RHS1 R1 3\n" + "BOUNDS\n" + " FR BND1 X1\n" + "QSECTION R1\n" + " X1 X1 1\n" + "ENDATA\n"); + + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::exception); +} + TEST(FastMpsParserEdgeTest, BoundsDefaultsAndTypesMatchReference) { verify_fixture_bitwise("bounds_defaults_and_types", @@ -786,4 +833,104 @@ TEST(FastMpsParserEdgeTest, 
GzipBzip2AndRawPathsMatch) EXPECT_EQ(raw.var_types_, bzip2.var_types_) << "bzip2 var types"; } +TEST(FastMpsParserEdgeTest, QcMatrixRowsMatchReferenceBitwise) +{ + verify_fixture_bitwise("qcmatrix rows", + "NAME QCMATRIX_TEST\n" + "ROWS\n" + " N OBJ\n" + " L LIN\n" + " L QC1\n" + " G QC2\n" + "COLUMNS\n" + " X1 OBJ 1 LIN 2\n" + " X1 QC1 3 QC2 4\n" + " X2 OBJ 2 LIN 5\n" + " X2 QC1 6 QC2 7\n" + "RHS\n" + " RHS1 LIN 10 QC1 11\n" + " RHS1 QC2 12\n" + "QCMATRIX QC1\n" + " X1 X1 1.25\n" + " X1 X2 -2.5\n" + "QCMATRIX QC2\n" + " X2 X2 3.75\n" + "ENDATA\n"); +} + +TEST(FastMpsParserEdgeTest, QcMatrixMalformedCasesMatchReference) +{ + const std::vector cases = { + "NAME DUP_QC\n" + "ROWS\n" + " N OBJ\n" + " L QC1\n" + "COLUMNS\n" + " X1 OBJ 1 QC1 2\n" + "RHS\n" + " RHS1 QC1 3\n" + "QCMATRIX QC1\n" + " X1 X1 1\n" + "QCMATRIX QC1\n" + " X1 X1 2\n" + "ENDATA\n", + "NAME BAD_QC_ROW\n" + "ROWS\n" + " N OBJ\n" + " L QC1\n" + "COLUMNS\n" + " X1 OBJ 1 QC1 2\n" + "RHS\n" + " RHS1 QC1 3\n" + "QCMATRIX UNKNOWN\n" + " X1 X1 1\n" + "ENDATA\n", + "NAME BAD_QC_VAR\n" + "ROWS\n" + " N OBJ\n" + " L QC1\n" + "COLUMNS\n" + " X1 OBJ 1 QC1 2\n" + "RHS\n" + " RHS1 QC1 3\n" + "QCMATRIX QC1\n" + " X1 XBAD 1\n" + "ENDATA\n"}; + + for (const auto& mps : cases) { + TempMpsFile file(mps); + EXPECT_THROW(((void)parse_reference_model(file.path)), std::exception); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::exception); + } +} + +TEST(FastMpsParserEdgeTest, QuadraticParserRejectsUnknownColumnOneRecords) +{ + const std::vector records = {"QSECTION QC1", + "CSECTION QC1 0 QUAD"}; + + for (const auto& record : records) { + TempMpsFile file( + "NAME BAD_QUAD_RECORD\n" + "ROWS\n" + " N OBJ\n" + " L QC1\n" + "COLUMNS\n" + " X1 OBJ 1 QC1 2\n" + " X2 OBJ 3 QC1 4\n" + "RHS\n" + " RHS1 QC1 5\n" + "QMATRIX\n" + " X1 X1 1\n" + + record + + "\n" + " X2 X2 2\n" + "ENDATA\n"); + EXPECT_THROW(((void)parse_mps_fast_file(file.path, FileReadMethod::Read)), + std::exception) + << 
record; + } +} + } // namespace cuopt::linear_programming::io::detail diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp index 6a47471c09..70f7beb2dc 100644 --- a/cpp/tests/linear_programming/parser_test.cpp +++ b/cpp/tests/linear_programming/parser_test.cpp @@ -947,8 +947,9 @@ INSTANTIATE_DEFAULT_MPS_READER_TEST(free_var_bound_test); INSTANTIATE_DEFAULT_MPS_READER_TEST(lower_inf_var_bound_test); INSTANTIATE_DEFAULT_MPS_READER_TEST(upper_inf_var_bound_test); -#undef INSTANTIATE_MPS_READER_TEST -#undef INSTANTIATE_DEFAULT_MPS_READER_TEST +// NOTE: INSTANTIATE_MPS_READER_TEST / INSTANTIATE_DEFAULT_MPS_READER_TEST are intentionally +// left defined here; the QP/QCQP file fixtures below reuse them. They are #undef-ed after the +// last instantiation. #ifdef MPS_PARSER_WITH_BZIP2 TEST(mps_parser, good_mps_file_bzip2_compressed) @@ -1051,13 +1052,14 @@ TEST(qps_parser, quadratic_objective_basic) EXPECT_EQ(1.0, model.get_quadratic_objective_values()[1]); } +class qps_file_reader_test : public parser_fixture_base {}; + // Test actual QPS files from the dataset -TEST(qps_parser, test_qps_files) +TEST_P(qps_file_reader_test, test_qps_files) { // Test QP_Test_1.qps if it exists if (file_exists("quadratic_programming/QP_Test_1.qps")) { - auto parsed_data = read_mps( - cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_1.qps", false); + auto parsed_data = read_mps_file("quadratic_programming/QP_Test_1.qps", false); EXPECT_EQ("QP_Test_1", parsed_data.get_problem_name()); EXPECT_EQ(2, parsed_data.get_n_variables()); // C------1 and C------2 @@ -1076,8 +1078,7 @@ TEST(qps_parser, test_qps_files) // Test QP_Test_2.qps if it exists if (file_exists("quadratic_programming/QP_Test_2.qps")) { - auto parsed_data = read_mps( - cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_2.qps", false); + auto parsed_data = read_mps_file("quadratic_programming/QP_Test_2.qps", false); 
EXPECT_EQ("QP_Test_2", parsed_data.get_problem_name()); EXPECT_EQ(3, parsed_data.get_n_variables()); // C------1, C------2, C------3 @@ -2635,6 +2636,19 @@ TEST(read, qps_extension_dispatches_to_mps_parser) EXPECT_EQ(m.get_variable_names()[0], "x"); } +TEST(read, qps_extension_dispatches_to_fast_experimental_reader) +{ + temp_file_t tmp(".qps"); + { + std::ofstream out(tmp.string()); + out << kTrivialMps; + } + auto m = read(tmp.string(), mps_reader_type_t::fast_experimental); + ASSERT_EQ(m.get_variable_names().size(), 1u); + EXPECT_EQ(m.get_variable_names()[0], "x"); + EXPECT_NEAR(m.get_variable_upper_bounds()[0], 10.0, tolerance); +} + TEST(read, mps_gz_extension_dispatches_to_mps_parser) { auto m = read(cuopt::test::get_rapids_dataset_root_dir() + @@ -2849,13 +2863,12 @@ TEST(qps_parser, qcmatrix_append_api) } // QCQP MPS: each quadratic constraint bundles row + linear + rhs + quadratic. -TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) +TEST_P(qps_file_reader_test, qcmatrix_mps_linear_rhs_and_bounds) { if (!file_exists("qcqp/QC_Test_1.mps")) { GTEST_SKIP() << "qcqp/QC_Test_1.mps not in dataset root"; } - const auto model = read_mps( - cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/QC_Test_1.mps", false); + const auto model = read_mps_file("qcqp/QC_Test_1.mps", false); ASSERT_TRUE(model.has_quadratic_constraints()); const auto& qcs = model.get_quadratic_constraints(); @@ -2901,13 +2914,12 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) EXPECT_DOUBLE_EQ(10.0, qcs[1].rhs_value); } -TEST(qps_parser, qcqp_p0033_mps_sections) +TEST_P(qps_file_reader_test, qcqp_p0033_mps_sections) { if (!file_exists("qcqp/p0033_qc1.mps")) { GTEST_SKIP() << "qcqp/p0033_qc1.mps not in dataset root"; } - const auto model = read_mps( - cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps", false); + const auto model = read_mps_file("qcqp/p0033_qc1.mps", false); EXPECT_EQ(12, model.get_n_constraints()); EXPECT_EQ(33, model.get_n_variables()); @@ -2950,4 
+2962,9 @@ TEST(mps_roundtrip, qcqp_p0033_qc1) auto reloaded_2 = read_mps(temp_file_2.string(), false); compare_data_models(reloaded, reloaded_2); } + +INSTANTIATE_MPS_READER_TEST(qps_file_reader_test); + +#undef INSTANTIATE_MPS_READER_TEST +#undef INSTANTIATE_DEFAULT_MPS_READER_TEST } // namespace cuopt::linear_programming::io From d7358f68ee4448a3bfb8db380be0518870637549 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Mon, 15 Jun 2026 09:07:37 -0700 Subject: [PATCH 18/22] fix sloppy fix --- cpp/src/io/experimental_mps_fast/file_reader.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp index 48397ae11e..78e4219e06 100644 --- a/cpp/src/io/experimental_mps_fast/file_reader.cpp +++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp @@ -102,12 +102,15 @@ std::size_t get_file_size(const std::string& path) { int fd = ::open(path.c_str(), O_RDONLY); if (fd < 0) { - ::close(fd); mps_parser_fail(error_type_t::RuntimeError, "Failed to open file '%s': %s", path.c_str(), std::strerror(errno)); } + cuopt::scope_guard close_fd([&] { + if (fd >= 0) { ::close(fd); } + }); + std::size_t size = get_file_size(fd, path); ::close(fd); return size; From 225ae33b5620ce957af99a177d0e09516b47628c Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Tue, 23 Jun 2026 01:58:26 -0700 Subject: [PATCH 19/22] review comments --- .../cuopt/linear_programming/io/parser.hpp | 16 +++++++++------- cpp/src/io/experimental_mps_fast/fast_parser.cpp | 2 -- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/io/parser.hpp b/cpp/include/cuopt/linear_programming/io/parser.hpp index 2c678f4f4e..7122282e70 100644 --- a/cpp/include/cuopt/linear_programming/io/parser.hpp +++ b/cpp/include/cuopt/linear_programming/io/parser.hpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -157,9 +158,13 @@ inline mps_data_model_t 
read(const std::string& path, std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); - if (lower.ends_with(".mps.lz4") || lower.ends_with(".mps.bz2") || lower.ends_with(".mps.gz") || - lower.ends_with(".mps") || lower.ends_with(".qps.lz4") || lower.ends_with(".qps.bz2") || - lower.ends_with(".qps.gz") || lower.ends_with(".qps")) { + for (const char* compression_suffix : {".bz2", ".gz", ".lz4"}) { + if (lower.ends_with(compression_suffix)) { + lower.resize(lower.size() - std::strlen(compression_suffix)); + break; + } + } + if (lower.ends_with(".mps") || lower.ends_with(".qps")) { if (mps_reader == mps_reader_type_t::fast_experimental) { if (fixed_mps_format) { throw std::logic_error( @@ -169,10 +174,7 @@ inline mps_data_model_t read(const std::string& path, } return read_mps(path, fixed_mps_format); } - if (lower.ends_with(".lp.lz4") || lower.ends_with(".lp.bz2") || lower.ends_with(".lp.gz") || - lower.ends_with(".lp")) { - return read_lp(path); - } + if (lower.ends_with(".lp")) { return read_lp(path); } throw std::logic_error( "read: unrecognized input file extension. 
Supported (case-insensitive): " ".mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4, " diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp index 165d16d066..02038c6fd9 100644 --- a/cpp/src/io/experimental_mps_fast/fast_parser.cpp +++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp @@ -3137,8 +3137,6 @@ static mps_data_model_t parse_mps_fast_stream(Stream& stream, phase_end("quadratic"); }); } - -#pragma omp taskwait } } From 3f8f9fbfb95a7b26fda3182776c009cffb865eaf Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Tue, 23 Jun 2026 02:33:37 -0700 Subject: [PATCH 20/22] hopefully fix wheel CI builds --- cpp/CMakeLists.txt | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e50dc52172..d830f501c8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -458,13 +458,13 @@ set(CUOPT_SRC_FILES) set(MPS_FAST_SRC_FILES) add_subdirectory(src) if (HOST_LINEINFO) - set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") + set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") endif () # Needed for the fast MPS parser, available on all x86-64-v3 compliant x86 CPUs (essentially since Haswell ~2013) if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$") - set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} + set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} APPEND PROPERTY COMPILE_OPTIONS "-mbmi2;-mavx2;-msse4.2") endif () @@ -475,7 +475,7 @@ endif () # Must happen before gRPC files are appended to CUOPT_SRC_FILES. # Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO). 
if (DEFINE_ASSERT) - set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} + set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} APPEND PROPERTY COMPILE_OPTIONS "-UNDEBUG") endif () @@ -500,7 +500,7 @@ if (NOT SKIP_GRPC_BUILD) # The conda-forge abseil shared library is built with NDEBUG and does not # export that symbol (abseil-cpp#1624). Without this, Debug builds fail # at runtime with "undefined symbol: absl::…::Mutex::Dtor". - set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} + set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} APPEND PROPERTY COMPILE_OPTIONS "-DNDEBUG") endif (NOT SKIP_GRPC_BUILD) @@ -627,10 +627,17 @@ target_link_libraries(cuopt PRIVATE ${CUOPT_PRIVATE_CUDA_LIBS} simde::simde + OpenMP::OpenMP_CXX $<$:protobuf::libprotobuf> $<$:gRPC::grpc++> ) +# Link with -fopenmp so the compiler driver pulls in its own (matching) libgomp +# and orders its lib dir ahead of the system one. Without it, OpenMP is supplied +# only as a bare -lgomp, which can resolve to an older system libgomp missing +# OpenMP 5.0 symbols such as omp_fulfill_event (used by the fast MPS parser). 
+target_link_options(cuopt PRIVATE $<$:-fopenmp>) + # ################################################################################################## # - generate tests -------------------------------------------------------------------------------- @@ -754,6 +761,7 @@ if (NOT BUILD_LP_ONLY) ) target_link_options(cuopt_cli PRIVATE -pie) + target_link_options(cuopt_cli PRIVATE $<$:-fopenmp>) target_include_directories(cuopt_cli PRIVATE @@ -813,6 +821,7 @@ if (BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY) OpenMP::OpenMP_CXX PRIVATE ) + target_link_options(solve_MIP PRIVATE $<$:-fopenmp>) if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") target_link_options(solve_MIP PRIVATE -Wl,--enable-new-dtags) endif () @@ -843,6 +852,7 @@ if (BUILD_LP_BENCHMARKS) OpenMP::OpenMP_CXX PRIVATE ) + target_link_options(solve_LP PRIVATE $<$:-fopenmp>) if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") target_link_options(solve_LP PRIVATE -Wl,--enable-new-dtags) endif () @@ -874,6 +884,7 @@ if (NOT SKIP_GRPC_BUILD) ) target_link_options(cuopt_grpc_server PRIVATE -pie) + target_link_options(cuopt_grpc_server PRIVATE $<$:-fopenmp>) target_include_directories(cuopt_grpc_server PRIVATE From 57e48a2bf8d298ca3bc0e0f054bedee5db4457d0 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Tue, 23 Jun 2026 04:57:19 -0700 Subject: [PATCH 21/22] wheel fix --- cpp/CMakeLists.txt | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d830f501c8..385b43b3e2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -633,10 +633,11 @@ target_link_libraries(cuopt ) # Link with -fopenmp so the compiler driver pulls in its own (matching) libgomp -# and orders its lib dir ahead of the system one. Without it, OpenMP is supplied -# only as a bare -lgomp, which can resolve to an older system libgomp missing -# OpenMP 5.0 symbols such as omp_fulfill_event (used by the fast MPS parser). 
-target_link_options(cuopt PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>) +# and orders its lib dir ahead of the system one. OpenMP::OpenMP_CXX alone supplies +# a bare -lgomp, which can resolve to an older system libgomp missing OpenMP 5.0 +# symbols such as omp_fulfill_event (used by the fast MPS parser). Plain -fopenmp +# (not gated on LINK_LANGUAGE:CXX) is required because cuopt is CUDA-linked. +target_link_options(cuopt PRIVATE -fopenmp) # ################################################################################################## @@ -760,8 +761,7 @@ if (NOT BUILD_LP_ONLY) "$<$:${CUOPT_CUDA_FLAGS}>" ) - target_link_options(cuopt_cli PRIVATE -pie) - target_link_options(cuopt_cli PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>) + target_link_options(cuopt_cli PRIVATE -pie -fopenmp) target_include_directories(cuopt_cli PRIVATE @@ -821,7 +821,7 @@ if (BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY) OpenMP::OpenMP_CXX PRIVATE ) - target_link_options(solve_MIP PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>) + target_link_options(solve_MIP PRIVATE -fopenmp) if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") target_link_options(solve_MIP PRIVATE -Wl,--enable-new-dtags) endif () @@ -852,7 +852,7 @@ if (BUILD_LP_BENCHMARKS) OpenMP::OpenMP_CXX PRIVATE ) - target_link_options(solve_LP PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>) + target_link_options(solve_LP PRIVATE -fopenmp) if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") target_link_options(solve_LP PRIVATE -Wl,--enable-new-dtags) endif () @@ -883,8 +883,7 @@ if (NOT SKIP_GRPC_BUILD) PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" ) - target_link_options(cuopt_grpc_server PRIVATE -pie) - target_link_options(cuopt_grpc_server PRIVATE $<$<LINK_LANGUAGE:CXX>:-fopenmp>) + target_link_options(cuopt_grpc_server PRIVATE -pie -fopenmp) target_include_directories(cuopt_grpc_server PRIVATE From 2c5fec2fa20343c50c2a7cda8f5c7cc299112849 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Tue, 23 Jun 2026 07:21:26 -0700 Subject: [PATCH 22/22] wheel fix, hopefully --- cpp/CMakeLists.txt | 41 
++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 385b43b3e2..98f7848fed 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -202,6 +202,21 @@ endif () find_package(OpenMP REQUIRED) message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") +# Resolve libgomp from the active C++ compiler, not FindOpenMP's generic -lgomp (which can +# resolve to an older system libgomp on Rocky/RHEL wheel builders). The fast MPS parser uses +# OpenMP 5.0 detached tasks (omp_fulfill_event); compile and link must use the same libgomp. +execute_process( + COMMAND ${CMAKE_CXX_COMPILER} -print-file-name=libgomp.so + OUTPUT_VARIABLE CUOPT_LIBGOMP_FILE + OUTPUT_STRIP_TRAILING_WHITESPACE +) +if (NOT IS_ABSOLUTE "${CUOPT_LIBGOMP_FILE}") + message(FATAL_ERROR "Could not resolve libgomp from ${CMAKE_CXX_COMPILER}: '${CUOPT_LIBGOMP_FILE}'") +endif () +get_filename_component(CUOPT_LIBGOMP_DIR "${CUOPT_LIBGOMP_FILE}" DIRECTORY) +message(STATUS "cuOpt: libgomp for OpenMP link = ${CUOPT_LIBGOMP_FILE}") +list(APPEND CUOPT_CXX_FLAGS -fopenmp) + # MPS/QPS parser supports compressed inputs via bzip2, zlib and lz4 option(CUOPT_PARSER_WITH_BZIP2 "Build MPS parser with bzip2 decompression" ON) option(CUOPT_PARSER_WITH_ZLIB "Build MPS parser with zlib decompression" ON) @@ -513,6 +528,7 @@ set_target_properties(cuopt INSTALL_RPATH "\$ORIGIN" INTERFACE_POSITION_INDEPENDENT_CODE ON CXX_SCAN_FOR_MODULES OFF + LINKER_LANGUAGE CXX ) target_compile_definitions(cuopt @@ -582,8 +598,7 @@ add_dependencies(cuopt PSLP) set(CUOPT_PRIVATE_CUDA_LIBS CUDA::curand CUDA::cusolver - TBB::tbb - OpenMP::OpenMP_CXX) + TBB::tbb) list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt) @@ -627,17 +642,15 @@ target_link_libraries(cuopt PRIVATE ${CUOPT_PRIVATE_CUDA_LIBS} simde::simde - OpenMP::OpenMP_CXX $<$:protobuf::libprotobuf> $<$:gRPC::grpc++> ) -# Link with -fopenmp so the compiler driver pulls in its own 
(matching) libgomp -# and orders its lib dir ahead of the system one. OpenMP::OpenMP_CXX alone supplies -# a bare -lgomp, which can resolve to an older system libgomp missing OpenMP 5.0 -# symbols such as omp_fulfill_event (used by the fast MPS parser). Plain -fopenmp -# (not gated on LINK_LANGUAGE:CXX) is required because cuopt is CUDA-linked. -target_link_options(cuopt PRIVATE -fopenmp) +# Force libgomp from the active C++ toolchain into libcuopt.so. OpenMP::OpenMP_CXX and/or +# -fopenmp alone can leave omp_fulfill_event undefined (CUDA-linked target + --as-needed) or +# resolve a trailing bare -lgomp to an older system libgomp at executable link time. +target_link_directories(cuopt PRIVATE ${CUOPT_LIBGOMP_DIR}) +target_link_libraries(cuopt PRIVATE "-Wl,--no-as-needed" gomp "-Wl,--as-needed") # ################################################################################################## @@ -761,7 +774,7 @@ if (NOT BUILD_LP_ONLY) "$<$:${CUOPT_CUDA_FLAGS}>" ) - target_link_options(cuopt_cli PRIVATE -pie -fopenmp) + target_link_options(cuopt_cli PRIVATE -pie) target_include_directories(cuopt_cli PRIVATE @@ -776,7 +789,6 @@ if (NOT BUILD_LP_ONLY) target_link_libraries(cuopt_cli PUBLIC cuopt - OpenMP::OpenMP_CXX ${CUDSS_LIBRARIES} TBB::tbb PRIVATE @@ -818,10 +830,8 @@ if (BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY) target_link_libraries(solve_MIP PUBLIC cuopt - OpenMP::OpenMP_CXX PRIVATE ) - target_link_options(solve_MIP PRIVATE -fopenmp) if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") target_link_options(solve_MIP PRIVATE -Wl,--enable-new-dtags) endif () @@ -849,10 +859,8 @@ if (BUILD_LP_BENCHMARKS) target_link_libraries(solve_LP PUBLIC cuopt - OpenMP::OpenMP_CXX PRIVATE ) - target_link_options(solve_LP PRIVATE -fopenmp) if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") target_link_options(solve_LP PRIVATE -Wl,--enable-new-dtags) endif () @@ -883,7 +891,7 @@ if (NOT SKIP_GRPC_BUILD) PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" ) - 
target_link_options(cuopt_grpc_server PRIVATE -pie -fopenmp) + target_link_options(cuopt_grpc_server PRIVATE -pie) target_include_directories(cuopt_grpc_server PRIVATE @@ -903,7 +911,6 @@ if (NOT SKIP_GRPC_BUILD) target_link_libraries(cuopt_grpc_server PUBLIC cuopt - OpenMP::OpenMP_CXX PRIVATE protobuf::libprotobuf gRPC::grpc++