diff --git a/.drone.jsonnet b/.drone.jsonnet index 60edc39cb..1a532b540 100644 --- a/.drone.jsonnet +++ b/.drone.jsonnet @@ -290,70 +290,70 @@ local windows_pipeline(name, image, environment, arch = "amd64") = linux_pipeline( "Linux 18.04 Clang 6.0", "cppalliance/droneubuntu1804:1", - { TOOLSET: 'clang', COMPILER: 'clang++-6.0', CXXSTD: '03,11,14,17' }, + { TOOLSET: 'clang', COMPILER: 'clang++-6.0', CXXSTD: '14,17' }, "clang-6.0", ), linux_pipeline( "Linux 20.04 Clang 7", "cppalliance/droneubuntu2004:1", - { TOOLSET: 'clang', COMPILER: 'clang++-7', CXXSTD: '03,11,14,17' }, + { TOOLSET: 'clang', COMPILER: 'clang++-7', CXXSTD: '14,17' }, "clang-7", ), linux_pipeline( "Linux 20.04 Clang 8", "cppalliance/droneubuntu2004:1", - { TOOLSET: 'clang', COMPILER: 'clang++-8', CXXSTD: '03,11,14,17' }, + { TOOLSET: 'clang', COMPILER: 'clang++-8', CXXSTD: '14,17' }, "clang-8", ), linux_pipeline( "Linux 20.04 Clang 9", "cppalliance/droneubuntu2004:1", - { TOOLSET: 'clang', COMPILER: 'clang++-9', CXXSTD: '03,11,14,17,2a' }, + { TOOLSET: 'clang', COMPILER: 'clang++-9', CXXSTD: '14,17,2a' }, "clang-9", ), linux_pipeline( "Linux 20.04 Clang 10", "cppalliance/droneubuntu2004:1", - { TOOLSET: 'clang', COMPILER: 'clang++-10', CXXSTD: '03,11,14,17,2a' }, + { TOOLSET: 'clang', COMPILER: 'clang++-10', CXXSTD: '14,17,2a' }, "clang-10", ), linux_pipeline( "Linux 20.04 Clang 11", "cppalliance/droneubuntu2004:1", - { TOOLSET: 'clang', COMPILER: 'clang++-11', CXXSTD: '03,11,14,17,2a' }, + { TOOLSET: 'clang', COMPILER: 'clang++-11', CXXSTD: '14,17,2a' }, "clang-11", ), linux_pipeline( "Linux 20.04 Clang 12", "cppalliance/droneubuntu2004:1", - { TOOLSET: 'clang', COMPILER: 'clang++-12', CXXSTD: '03,11,14,17,2a' }, + { TOOLSET: 'clang', COMPILER: 'clang++-12', CXXSTD: '14,17,2a' }, "clang-12", ), linux_pipeline( "Linux 22.04 Clang 13", "cppalliance/droneubuntu2204:1", - { TOOLSET: 'clang', COMPILER: 'clang++-13', CXXSTD: '03,11,14,17,20' }, + { TOOLSET: 'clang', COMPILER: 'clang++-13', CXXSTD: 
'14,17,20' }, "clang-13", ), linux_pipeline( "Linux 22.04 Clang 14", "cppalliance/droneubuntu2204:1", - { TOOLSET: 'clang', COMPILER: 'clang++-14', CXXSTD: '03,11,14,17,20,2b' }, + { TOOLSET: 'clang', COMPILER: 'clang++-14', CXXSTD: '14,17,20,2b' }, "clang-14", ), linux_pipeline( "Linux 22.04 Clang 15", "cppalliance/droneubuntu2204:1", - { TOOLSET: 'clang', COMPILER: 'clang++-15', CXXSTD: '03,11,14,17,20,2b' }, + { TOOLSET: 'clang', COMPILER: 'clang++-15', CXXSTD: '14,17,20,2b' }, "clang-15", ["deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-15 main"], ), @@ -361,7 +361,7 @@ local windows_pipeline(name, image, environment, arch = "amd64") = linux_pipeline( "Linux 22.04 Clang 16", "cppalliance/droneubuntu2204:1", - { TOOLSET: 'clang', COMPILER: 'clang++-16', CXXSTD: '03,11,14,17,20,2b' }, + { TOOLSET: 'clang', COMPILER: 'clang++-16', CXXSTD: '17,20,2b' }, "clang-16", ["deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-16 main"], ), @@ -369,7 +369,7 @@ local windows_pipeline(name, image, environment, arch = "amd64") = linux_pipeline( "Linux 24.04 Clang 17", "cppalliance/droneubuntu2404:1", - { TOOLSET: 'clang', COMPILER: 'clang++-17', CXXSTD: '03,11,14,17,20,2b' }, + { TOOLSET: 'clang', COMPILER: 'clang++-17', CXXSTD: '17,20,2b' }, "clang-17", ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-17 main"], ), @@ -377,7 +377,7 @@ local windows_pipeline(name, image, environment, arch = "amd64") = linux_pipeline( "Linux 24.04 Clang 18", "cppalliance/droneubuntu2404:1", - { TOOLSET: 'clang', COMPILER: 'clang++-18', CXXSTD: '03,11,14,17,20,2b' }, + { TOOLSET: 'clang', COMPILER: 'clang++-18', CXXSTD: '17,20,2b' }, "clang-18", ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-18 main"], ), @@ -385,7 +385,7 @@ local windows_pipeline(name, image, environment, arch = "amd64") = linux_pipeline( "Linux 24.04 Clang 19", "cppalliance/droneubuntu2404:1", - { TOOLSET: 'clang', COMPILER: 'clang++-19', CXXSTD: '03,11,14,17,20,2b' }, + { TOOLSET: 'clang', COMPILER: 
'clang++-19', CXXSTD: '17,20,2b' }, "clang-19", ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-19 main"], ), @@ -393,7 +393,7 @@ local windows_pipeline(name, image, environment, arch = "amd64") = linux_pipeline( "Linux 24.04 Clang 20", "cppalliance/droneubuntu2404:1", - { TOOLSET: 'clang', COMPILER: 'clang++-20', CXXSTD: '03,11,14,17,20,2b' }, + { TOOLSET: 'clang', COMPILER: 'clang++-20', CXXSTD: '17,20,2b' }, "clang-20", ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-20 main"], ), @@ -401,7 +401,7 @@ local windows_pipeline(name, image, environment, arch = "amd64") = linux_pipeline( "Linux 24.04 Clang 21", "cppalliance/droneubuntu2404:1", - { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '14,17,20,2b' }, + { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '17,20,2b' }, "clang-21", ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-21 main"], ), @@ -409,7 +409,7 @@ local windows_pipeline(name, image, environment, arch = "amd64") = linux_pipeline( "Linux 24.04 Clang 21 UBSAN", "cppalliance/droneubuntu2404:1", - { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '14,17,20,2b' } + ubsan, + { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '17,20,2b' } + ubsan, "clang-21", ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-21 main"], ), @@ -417,7 +417,7 @@ local windows_pipeline(name, image, environment, arch = "amd64") = linux_pipeline( "Linux 24.04 Clang 21 ASAN", "cppalliance/droneubuntu2404:1", - { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '14,17,20,2b' } + asan, + { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '17,20,2b' } + asan, "clang-21", ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-21 main"], ), diff --git a/include/boost/decimal/decimal_fast128_t.hpp b/include/boost/decimal/decimal_fast128_t.hpp index 467b55f23..06ce03b00 100644 --- a/include/boost/decimal/decimal_fast128_t.hpp +++ b/include/boost/decimal/decimal_fast128_t.hpp @@ -541,9 +541,19 @@ constexpr decimal_fast128_t::decimal_fast128_t(T1 coeff, T2 
exp, const detail::c return; } - // Normalize the significand in the constructor, so we don't have - // to calculate the number of digits for operations - detail::normalize(min_coeff, exp, is_negative); + // Fast path: if the coefficient already has exactly precision-many digits, + // normalize is a no-op apart from a num_digits call. Skip the call entirely. + constexpr minimum_coefficient_size min_normal_significand { + detail::pow10(static_cast(detail::precision_v - 1))}; + constexpr minimum_coefficient_size max_normal_significand { + static_cast(detail::max_significand_v)}; + + if (min_coeff < min_normal_significand || min_coeff > max_normal_significand) + { + // Normalize the significand in the constructor, so we don't have + // to calculate the number of digits for operations + detail::normalize(min_coeff, exp, is_negative); + } significand_ = static_cast(min_coeff); diff --git a/include/boost/decimal/decimal_fast32_t.hpp b/include/boost/decimal/decimal_fast32_t.hpp index c6c89c00f..3c14fd08e 100644 --- a/include/boost/decimal/decimal_fast32_t.hpp +++ b/include/boost/decimal/decimal_fast32_t.hpp @@ -535,8 +535,18 @@ constexpr decimal_fast32_t::decimal_fast32_t(T1 coeff, T2 exp, const detail::con return; } - // Normalize in the constructor, so we never have to worry about it again - detail::normalize(min_coeff, exp, is_negative); + // Fast path: if the coefficient already has exactly precision-many digits, + // normalize is a no-op apart from a num_digits call. Skip the call entirely. 
+ constexpr minimum_coefficient_size min_normal_significand { + detail::pow10(static_cast(detail::precision_v - 1))}; + constexpr minimum_coefficient_size max_normal_significand { + static_cast(detail::max_significand_v)}; + + if (min_coeff < min_normal_significand || min_coeff > max_normal_significand) + { + // Normalize in the constructor, so we never have to worry about it again + detail::normalize(min_coeff, exp, is_negative); + } significand_ = static_cast(min_coeff); diff --git a/include/boost/decimal/decimal_fast64_t.hpp b/include/boost/decimal/decimal_fast64_t.hpp index 37d09af3a..8db0b601a 100644 --- a/include/boost/decimal/decimal_fast64_t.hpp +++ b/include/boost/decimal/decimal_fast64_t.hpp @@ -544,8 +544,18 @@ constexpr decimal_fast64_t::decimal_fast64_t(T1 coeff, T2 exp, const detail::con return; } - // Normalize the value, so we don't have to worry about it with operations - detail::normalize(min_coeff, exp, is_negative); + // Fast path: if the coefficient already has exactly precision-many digits, + // normalize is a no-op apart from a num_digits call. Skip the call entirely. 
+ constexpr minimum_coefficient_size min_normal_significand { + detail::pow10(static_cast(detail::precision_v - 1))}; + constexpr minimum_coefficient_size max_normal_significand { + static_cast(detail::max_significand_v)}; + + if (min_coeff < min_normal_significand || min_coeff > max_normal_significand) + { + // Normalize the value, so we don't have to worry about it with operations + detail::normalize(min_coeff, exp, is_negative); + } significand_ = static_cast(min_coeff); diff --git a/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp b/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp index d492971cd..0dad4f249 100644 --- a/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp +++ b/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp @@ -80,8 +80,8 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T // ---------- Compute initial sig_z = sig_gx * r / 10^16 ---------- // sig_z ≈ sqrt(gx) * 10^33 // sig_z = sig_gx * r_scaled / 10^16 - // r_scaled is 64-bit; use mul128By64 (SoftFloat-style) instead of full umul256 - u256 sig_z = mul128By64(gx_sig, r_scaled) / scale16; + // r_scaled is 64-bit; use mul128_by_64 (SoftFloat-style) instead of full umul256 + u256 sig_z = mul128_by_64(gx_sig, r_scaled) / scale16; // Precompute target = sig_gx * 10^33 (avoids recomputing in each Newton iteration) const u256 target = umul256(gx_sig, scale33_128); diff --git a/include/boost/decimal/detail/fenv_rounding.hpp b/include/boost/decimal/detail/fenv_rounding.hpp index cbb4bb237..46b8ea04f 100644 --- a/include/boost/decimal/detail/fenv_rounding.hpp +++ b/include/boost/decimal/detail/fenv_rounding.hpp @@ -100,6 +100,463 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto divmod10(const int128::uint128_t lhs) noexcept return {q, r.low}; } +// Granlund-Montgomery reciprocal-multiplication division by powers of ten. 
+// For d = 10^shift, d is never a power of 2, so: +// l = ceil(log2(d)) (shared between widths) +// m_full = ceil(2^(N + l) / d), an (N+1)-bit value with the top bit always 1 +// m_low = m_full - 2^N, fits in N bits +// t1 = high_N(coeff * m_low) +// floor(coeff / d) = (((coeff - t1) >> 1) + t1) >> (l - 1) +// See Hacker's Delight, 2nd ed., Section 10-9. +// We instantiate at N = 128 (uint128 dividends, used by the decimal64 wide path and the decimal128 demoted path) and +// N = 256 (u256 dividends, used by the decimal128 wide path). +// All values were computed offline and validated against divmod across millions of inputs. + +#if !defined(__CUDACC__) || !defined(BOOST_DECIMAL_ENABLE_CUDA) + +namespace pow10_recip { + +BOOST_DECIMAL_INLINE_CONSTEXPR_VARIABLE int l_table[77] = { + 0, 4, 7, 10, 14, 17, 20, 24, 27, 30, + 34, 37, 40, 44, 47, 50, 54, 57, 60, 64, + 67, 70, 74, 77, 80, 84, 87, 90, 94, 97, + 100, 103, 107, 110, 113, 117, 120, 123, 127, 130, + 133, 137, 140, 143, 147, 150, 153, 157, 160, 163, + 167, 170, 173, 177, 180, 183, 187, 190, 193, 196, + 200, 203, 206, 210, 213, 216, 220, 223, 226, 230, + 233, 236, 240, 243, 246, 250, 253 +}; + +BOOST_DECIMAL_INLINE_CONSTEXPR_VARIABLE int128::uint128_t m_low_uint128_table[39] = { + {UINT64_C(0x0000000000000000), UINT64_C(0x0000000000000000)}, + {UINT64_C(0x9999999999999999), UINT64_C(0x999999999999999A)}, + {UINT64_C(0x47AE147AE147AE14), UINT64_C(0x7AE147AE147AE148)}, + {UINT64_C(0x0624DD2F1A9FBE76), UINT64_C(0xC8B4395810624DD3)}, + {UINT64_C(0xA36E2EB1C432CA57), UINT64_C(0xA786C226809D4952)}, + {UINT64_C(0x4F8B588E368F0846), UINT64_C(0x1F9F01B866E43AA8)}, + {UINT64_C(0x0C6F7A0B5ED8D36B), UINT64_C(0x4C7F349385836220)}, + {UINT64_C(0xAD7F29ABCAF48578), UINT64_C(0x7A6520EC08D2369A)}, + {UINT64_C(0x5798EE2308C39DF9), UINT64_C(0xFB841A566D74F87B)}, + {UINT64_C(0x12E0BE826D694B2E), UINT64_C(0x62D01511F12A6062)}, + {UINT64_C(0xB7CDFD9D7BDBAB7D), UINT64_C(0x6AE6881CB5109A37)}, + {UINT64_C(0x5FD7FE17964955FD), 
UINT64_C(0xEF1ED34A2A73AE92)}, + {UINT64_C(0x19799812DEA11197), UINT64_C(0xF27F0F6E885C8BA8)}, + {UINT64_C(0xC25C268497681C26), UINT64_C(0x50CB4BE40D60DF74)}, + {UINT64_C(0x6849B86A12B9B01E), UINT64_C(0xA70909833DE71929)}, + {UINT64_C(0x203AF9EE756159B2), UINT64_C(0x1F3A6E0297EC1421)}, + {UINT64_C(0xCD2B297D889BC2B6), UINT64_C(0x985D7CD0F3135368)}, + {UINT64_C(0x70EF54646D496892), UINT64_C(0x137DFD73F5A90F86)}, + {UINT64_C(0x2725DD1D243ABA0E), UINT64_C(0x75FE645CC4873F9F)}, + {UINT64_C(0xD83C94FB6D2AC34A), UINT64_C(0x5663D3C7A0D865CB)}, + {UINT64_C(0x79CA10C9242235D5), UINT64_C(0x11E976394D79EB09)}, + {UINT64_C(0x2E3B40A0E9B4F7DD), UINT64_C(0xA7EDF82DD794BC07)}, + {UINT64_C(0xE392010175EE5962), UINT64_C(0xA6498D1625BAC671)}, + {UINT64_C(0x82DB34012B25144E), UINT64_C(0xEB6E0A781E2F0528)}, + {UINT64_C(0x357C299A88EA76A5), UINT64_C(0x8924D52CE4F26A86)}, + {UINT64_C(0xEF2D0F5DA7DD8AA2), UINT64_C(0x7507BB7B07EA440A)}, + {UINT64_C(0x8C240C4AECB13BB5), UINT64_C(0x2A6C95FC0655033B)}, + {UINT64_C(0x3CE9A36F23C0FC90), UINT64_C(0xEEBD44C99EAA68FC)}, + {UINT64_C(0xFB0F6BE50601941B), UINT64_C(0x17953ADC3110A7F9)}, + {UINT64_C(0x95A5EFEA6B34767C), UINT64_C(0x12DDC8B027408661)}, + {UINT64_C(0x4484BFEEBC29F863), UINT64_C(0x424B06F3529A051B)}, + {UINT64_C(0x039D66589687F9E9), UINT64_C(0x01D59F290EE19DAF)}, + {UINT64_C(0x9F623D5A8A732974), UINT64_C(0xCFBC31DB4B0295E5)}, + {UINT64_C(0x4C4E977BA1F5BAC3), UINT64_C(0xD9635B15D59BAB1D)}, + {UINT64_C(0x09D8792FB4C49569), UINT64_C(0x7AB5E277DE16227E)}, + {UINT64_C(0xA95A5B7F87A0EF0F), UINT64_C(0x2ABC9D8C9689D0C9)}, + {UINT64_C(0x54484932D2E725A5), UINT64_C(0xBBCA17A3ABA173D4)}, + {UINT64_C(0x1039D428A8B8EAEA), UINT64_C(0xFCA1AC82EFB45CAA)}, + {UINT64_C(0xB38FB9DAA78E44AB), UINT64_C(0x2DCF7A6B19209443)} +}; + +BOOST_DECIMAL_INLINE_CONSTEXPR_VARIABLE u256 m_low_u256_table[77] = { + {UINT64_C(0), UINT64_C(0), UINT64_C(0), UINT64_C(0)}, + {UINT64_C(0x9999999999999999), UINT64_C(0x9999999999999999), UINT64_C(0x9999999999999999), 
UINT64_C(0x999999999999999A)}, + {UINT64_C(0x47AE147AE147AE14), UINT64_C(0x7AE147AE147AE147), UINT64_C(0xAE147AE147AE147A), UINT64_C(0xE147AE147AE147AF)}, + {UINT64_C(0x0624DD2F1A9FBE76), UINT64_C(0xC8B4395810624DD2), UINT64_C(0xF1A9FBE76C8B4395), UINT64_C(0x810624DD2F1A9FBF)}, + {UINT64_C(0xA36E2EB1C432CA57), UINT64_C(0xA786C226809D4951), UINT64_C(0x82A9930BE0DED288), UINT64_C(0xCE703AFB7E90FF98)}, + {UINT64_C(0x4F8B588E368F0846), UINT64_C(0x1F9F01B866E43AA7), UINT64_C(0x9BBADC0980B24207), UINT64_C(0x0B8CFBFC6540CC79)}, + {UINT64_C(0x0C6F7A0B5ED8D36B), UINT64_C(0x4C7F34938583621F), UINT64_C(0xAFC8B0079A2834D2), UINT64_C(0x6FA3FCC9EA9A3D2E)}, + {UINT64_C(0xAD7F29ABCAF48578), UINT64_C(0x7A6520EC08D23699), UINT64_C(0x194119A5C37387B7), UINT64_C(0x1906614310F6C849)}, + {UINT64_C(0x5798EE2308C39DF9), UINT64_C(0xFB841A566D74F87A), UINT64_C(0x7A9A7AEB02C2D2F8), UINT64_C(0xE0D1E768DA5F06A1)}, + {UINT64_C(0x12E0BE826D694B2E), UINT64_C(0x62D01511F12A6061), UINT64_C(0xFBAEC8BC02357593), UINT64_C(0xE70E52BA484C054E)}, + {UINT64_C(0xB7CDFD9D7BDBAB7D), UINT64_C(0x6AE6881CB5109A36), UINT64_C(0x5F7E0DF99D2255B9), UINT64_C(0x71B0845D4079A216)}, + {UINT64_C(0x5FD7FE17964955FD), UINT64_C(0xEF1ED34A2A73AE91), UINT64_C(0xE5FE71947DB51161), UINT64_C(0x27C069E4339481AB)}, + {UINT64_C(0x19799812DEA11197), UINT64_C(0xF27F0F6E885C8BA7), UINT64_C(0xEB31F476CAF7411A), UINT64_C(0x863387E9C2DD3489)}, + {UINT64_C(0xC25C268497681C26), UINT64_C(0x50CB4BE40D60DF73), UINT64_C(0x11E9872477F201C4), UINT64_C(0x09EC0CA937C8540E)}, + {UINT64_C(0x6849B86A12B9B01E), UINT64_C(0xA70909833DE71928), UINT64_C(0xDB2138E9F98E67D0), UINT64_C(0x07F00A20F96D100B)}, + {UINT64_C(0x203AF9EE756159B2), UINT64_C(0x1F3A6E0297EC1420), UINT64_C(0xAF4DC7219471ECA6), UINT64_C(0x6CC0081A61240CD6)}, + {UINT64_C(0xCD2B297D889BC2B6), UINT64_C(0x985D7CD0F3135367), UINT64_C(0x7EE2D835BA4FE10A), UINT64_C(0x4799A6909B6CE156)}, + {UINT64_C(0x70EF54646D496892), UINT64_C(0x137DFD73F5A90F85), UINT64_C(0xFF1BE02AFB731A6E), 
UINT64_C(0x9FAE1EDA15F0B445)}, + {UINT64_C(0x2725DD1D243ABA0E), UINT64_C(0x75FE645CC4873F9E), UINT64_C(0x65AFE688C928E1F2), UINT64_C(0x195818AE77F3C36B)}, + {UINT64_C(0xD83C94FB6D2AC34A), UINT64_C(0x5663D3C7A0D865CA), UINT64_C(0x3C4CA40E0EA7CFE9), UINT64_C(0xC2268DE3F31F9F11)}, + {UINT64_C(0x79CA10C9242235D5), UINT64_C(0x11E976394D79EB08), UINT64_C(0x303D500B3EECA654), UINT64_C(0x9B520B1CC27FB274)}, + {UINT64_C(0x2E3B40A0E9B4F7DD), UINT64_C(0xA7EDF82DD794BC06), UINT64_C(0x8CFDD9A298BD51DD), UINT64_C(0x490E6F4A3532F529)}, + {UINT64_C(0xE392010175EE5962), UINT64_C(0xA6498D1625BAC670), UINT64_C(0xE196290427954FC8), UINT64_C(0x74E3E54388518842)}, + {UINT64_C(0x82DB34012B25144E), UINT64_C(0xEB6E0A781E2F0527), UINT64_C(0x1ADE873686110CA0), UINT64_C(0x5D831DCFA04139CF)}, + {UINT64_C(0x357C299A88EA76A5), UINT64_C(0x8924D52CE4F26A85), UINT64_C(0xAF186C2B9E740A19), UINT64_C(0xE468E4A619CDC7D9)}, + {UINT64_C(0xEF2D0F5DA7DD8AA2), UINT64_C(0x7507BB7B07EA4409), UINT64_C(0x18271378FD86768F), UINT64_C(0xD3DB077029493FC1)}, + {UINT64_C(0x8C240C4AECB13BB5), UINT64_C(0x2A6C95FC0655033A), UINT64_C(0x79B8DC60CAD1F873), UINT64_C(0x0FE26C59BAA0FFCD)}, + {UINT64_C(0x3CE9A36F23C0FC90), UINT64_C(0xEEBD44C99EAA68FB), UINT64_C(0x9493E380A241938F), UINT64_C(0x3FE856AE2EE7330B)}, + {UINT64_C(0xFB0F6BE50601941B), UINT64_C(0x17953ADC3110A7F8), UINT64_C(0xEDB96C01039C1F4B), UINT64_C(0x9973BDE37E3EB811)}, + {UINT64_C(0x95A5EFEA6B34767C), UINT64_C(0x12DDC8B027408660), UINT64_C(0xBE2DF000CFB01909), UINT64_C(0x478FCB1C64FEF9A7)}, + {UINT64_C(0x4484BFEEBC29F863), UINT64_C(0x424B06F3529A051A), UINT64_C(0x31BE599A3FC01407), UINT64_C(0x6C7308E383FF2E20)}, + {UINT64_C(0x039D66589687F9E9), UINT64_C(0x01D59F290EE19DAE), UINT64_C(0x8E31E14833001005), UINT64_C(0xF05C071C6998F1B3)}, + {UINT64_C(0x9F623D5A8A732974), UINT64_C(0xCFBC31DB4B0295E4), UINT64_C(0x16B635405199B33C), UINT64_C(0xB3C671C70F5B1C51)}, + {UINT64_C(0x4C4E977BA1F5BAC3), UINT64_C(0xD9635B15D59BAB1C), UINT64_C(0xDEF82A99DAE15C30), 
UINT64_C(0x8FD1F49F3F7C16A7)}, + {UINT64_C(0x09D8792FB4C49569), UINT64_C(0x7AB5E277DE16227D), UINT64_C(0x7F2CEEE17BE77CF3), UINT64_C(0xA641907F65FCDEEC)}, + {UINT64_C(0xA95A5B7F87A0EF0F), UINT64_C(0x2ABC9D8C9689D0C8), UINT64_C(0xCB7B17CF2CA594B9), UINT64_C(0x0A0280CBD66164AD)}, + {UINT64_C(0x54484932D2E725A5), UINT64_C(0xBBCA17A3ABA173D3), UINT64_C(0xD5FC130C23B7AA2D), UINT64_C(0xA19B9A3CAB811D57)}, + {UINT64_C(0x1039D428A8B8EAEA), UINT64_C(0xFCA1AC82EFB45CA9), UINT64_C(0x77FCDC09B62C8824), UINT64_C(0x8149483089341779)}, + {UINT64_C(0xB38FB9DAA78E44AB), UINT64_C(0x2DCF7A6B19209442), UINT64_C(0x59949342BD140D07), UINT64_C(0x35420D1A7520258F)}, + {UINT64_C(0x5C72FB1552D836EF), UINT64_C(0x57D92EBC141A1035), UINT64_C(0x1476DC3564100A6C), UINT64_C(0x2A9B3DAEC419B7A5)}, + {UINT64_C(0x16C262777579C58C), UINT64_C(0x46475896767B402A), UINT64_C(0x76C57CF783400856), UINT64_C(0x887C31589CE15FB8)}, + {UINT64_C(0xBE03D0BF225C6F46), UINT64_C(0xD6D88DBD8A5ECD10), UINT64_C(0xBE08C7F26B99A6F0), UINT64_C(0xDA604EF42E3565F2)}, + {UINT64_C(0x64CFDA3281E38C38), UINT64_C(0xABE071646EB23DA6), UINT64_C(0xFE6D6CC1EFAE1F27), UINT64_C(0x1519D8C3582AB7F5)}, + {UINT64_C(0x1D7314F534B609C6), UINT64_C(0xEFE6C11D255B6485), UINT64_C(0x98578A34BFBE7F52), UINT64_C(0x7747E09C4688932B)}, + {UINT64_C(0xC8B821885456760B), UINT64_C(0x197134FB6EF8A0D5), UINT64_C(0xC08C105465FD9883), UINT64_C(0xF20C9A93A40DB844)}, + {UINT64_C(0x6D601AD376AB91A2), UINT64_C(0x7AC0F72F8BFA1A44), UINT64_C(0x9A09A6A9EB3146CF), UINT64_C(0xF4D6E20FB6716036)}, + {UINT64_C(0x244CE242C5560E1B), UINT64_C(0x95672C260994E1D0), UINT64_C(0x7B3AEBBB228DD23F), UINT64_C(0xF7124E72F85AB35F)}, + {UINT64_C(0xD3AE36D13BBCE35F), UINT64_C(0x5571E03CDC21694D), UINT64_C(0x91F7DF91D0E2E9FF), UINT64_C(0xF1B6E3EB26F78564)}, + {UINT64_C(0x7624F8A762FD82B2), UINT64_C(0xAAC18030B01ABAA4), UINT64_C(0x74C64C74A71BEE66), UINT64_C(0x5AF8B655B8C60450)}, + {UINT64_C(0x2B50C6EC4F31355B), UINT64_C(0xBBCE0026F3489550), UINT64_C(0x5D6B705D527CBEB8), 
UINT64_C(0x48C6F84493D19D0D)}, + {UINT64_C(0xDEE7A4AD4B81EEF9), UINT64_C(0x2C7CCD0B1EDA8880), UINT64_C(0x9578B3C883FACAC0), UINT64_C(0x74718D3A861C2E7B)}, + {UINT64_C(0x7F1FB6F10934BF2D), UINT64_C(0xBD30A408E57BA066), UINT64_C(0xDDFA296D36623BCD), UINT64_C(0x29F470FB9E7CF1FC)}, + {UINT64_C(0x327FC58DA0F6FF57), UINT64_C(0xCA8D50071DFC8052), UINT64_C(0x4B2E878A91E82FD7), UINT64_C(0x54C38D961863F4CA)}, + {UINT64_C(0xEA6608E29B24CBBF), UINT64_C(0xAA7BB33E9660CD50), UINT64_C(0x784A72774FD9E625), UINT64_C(0x546C15BCF3D32143)}, + {UINT64_C(0x8851A0B548EA3C99), UINT64_C(0x552FC298784D710D), UINT64_C(0x2D085B92A647EB51), UINT64_C(0x105677CA5CA8E769)}, + {UINT64_C(0x39DAE6F76D88307A), UINT64_C(0xAA8C9BAD2D0AC0D7), UINT64_C(0x5739E2DBB839890D), UINT64_C(0xA6AB93084A20B921)}, + {UINT64_C(0xF62B0B257C0D1A5D), UINT64_C(0xDDADC5E1E1AACE25), UINT64_C(0x585C9E2C59F5A815), UINT64_C(0xD778EB40769AC1CD)}, + {UINT64_C(0x91BC08EAC9A41517), UINT64_C(0xE48B04B4B488A4EA), UINT64_C(0xAD16E4F047F7B9AB), UINT64_C(0x12C72299F87BCE3E)}, + {UINT64_C(0x41633A556E1CDDAC), UINT64_C(0xB6D59D5D5D3A1D88), UINT64_C(0x8A78B7269FF96155), UINT64_C(0xA89F4EE193963E98)}, + {UINT64_C(0x011C2EAABE7D7E23), UINT64_C(0xC577B1177DC817A0), UINT64_C(0x6EC6F8EBB32DE777), UINT64_C(0xBA190BE7A944FEE0)}, + {UINT64_C(0x9B604AAACA62636C), UINT64_C(0x6F25E825960CF29A), UINT64_C(0x4AD7F4AC51E30BF2), UINT64_C(0xC35B463F753B3166)}, + {UINT64_C(0x4919D5556EB51C56), UINT64_C(0xBF518684780A5BAE), UINT64_C(0xA2465D56A7E8D65B), UINT64_C(0xCF7C3832C42F5AB8)}, + {UINT64_C(0x0747DDDDF22A7D12), UINT64_C(0x32A79ED060084958), UINT64_C(0x81D1E4455320AB7C), UINT64_C(0xA5FCF9C2368C4894)}, + {UINT64_C(0xA53FC9631D10C81D), UINT64_C(0x1DD8FE1A3340755A), UINT64_C(0x694FD3A21E9AABFA), UINT64_C(0xA32E5C69F0E0741F)}, + {UINT64_C(0x50FFD44F4A73D34A), UINT64_C(0x7E4731AE8F66C448), UINT64_C(0x543FDC81B2155662), UINT64_C(0x1C2516BB2719F67F)}, + {UINT64_C(0x0D9976A5D52975D5), UINT64_C(0x31D28E253F8569D3), UINT64_C(0x76997D348E77784E), 
UINT64_C(0x7CEA7895B8E19200)}, + {UINT64_C(0xAF5BF109550F22EE), UINT64_C(0xB61DB03B98D5761F), UINT64_C(0x2428C85417258D4A), UINT64_C(0x61772755F49C1CCC)}, + {UINT64_C(0x59165A6DDDA5B58B), UINT64_C(0xC4E48CFC7A445E7F), UINT64_C(0x5020A04345B7A43B), UINT64_C(0x812C1F77F6E34A3D)}, + {UINT64_C(0x1411E1F17E1E2AD6), UINT64_C(0x371D3D96C836B1FF), UINT64_C(0x734D4D029E2C8362), UINT64_C(0xCDBCE5F9924F6E97)}, + {UINT64_C(0xB9B6364F30304489), UINT64_C(0xF1C8628AD9F11CCB), UINT64_C(0xEBAEE19DC9E0D237), UINT64_C(0xAF94A328EA18B0F1)}, + {UINT64_C(0x615E91D8F359D06E), UINT64_C(0x5B06B53BE18DB0A3), UINT64_C(0x22F24E17D4B3DB5F), UINT64_C(0xBFAA1C20BB46F3F4)}, + {UINT64_C(0x1AB20E472914A6BE), UINT64_C(0xAF3890FCB4715A1C), UINT64_C(0x1BF50B4643C315E6), UINT64_C(0x32EE7CE6FC38C32A)}, + {UINT64_C(0xC45016D841BAA464), UINT64_C(0x4B8DB4C7871BC360), UINT64_C(0x2CBB453D39382309), UINT64_C(0xEB172E3E605AD1DD)}, + {UINT64_C(0x69D9ABE034955050), UINT64_C(0x3C715D6C6C1635E6), UINT64_C(0x8A2F6A9760F9B5A1), UINT64_C(0x88DF5831E6AF0E4A)}, + {UINT64_C(0x217AEFE690777373), UINT64_C(0x638DE456BCDE9185), UINT64_C(0x3B592212B3FAF7B4), UINT64_C(0x6D7F79C18558D83B)}, + {UINT64_C(0xCF2B1970E7258585), UINT64_C(0x6C163A2461641C08), UINT64_C(0x5EF503511FF7F2BA), UINT64_C(0x48CBF6026EF48D2C)}, + {UINT64_C(0x7288E1271F51379D), UINT64_C(0xF011C81D1AB67CD3), UINT64_C(0x7F2A690DB32CC22E), UINT64_C(0xA0A32B35259070F0)} +}; + +} // namespace pow10_recip + +#endif // !defined(__CUDACC__) || !defined(BOOST_DECIMAL_ENABLE_CUDA) + +// Compute (q, r) = (coeff / 10^shift, coeff mod 10^shift) for 0 <= shift <= 38. +// Uses Granlund-Montgomery reciprocal multiplication for shift in [1, 38] +// trivial for shift == 0. 
+BOOST_DECIMAL_CUDA_CONSTEXPR auto divmod_pow10_uint128(const int128::uint128_t coeff, const int shift) noexcept + -> divmod_result +{ + if (shift == 0) + { + return {coeff, int128::uint128_t{0}}; + } + + #if defined(__CUDACC__) && defined(BOOST_DECIMAL_ENABLE_CUDA) + constexpr int l_table[39] = { + 0, 4, 7, 10, 14, 17, 20, 24, 27, 30, + 34, 37, 40, 44, 47, 50, 54, 57, 60, 64, + 67, 70, 74, 77, 80, 84, 87, 90, 94, 97, + 100, 103, 107, 110, 113, 117, 120, 123, 127 + }; + constexpr int128::uint128_t m_low_uint128_table[39] = { + {UINT64_C(0x0000000000000000), UINT64_C(0x0000000000000000)}, + {UINT64_C(0x9999999999999999), UINT64_C(0x999999999999999A)}, + {UINT64_C(0x47AE147AE147AE14), UINT64_C(0x7AE147AE147AE148)}, + {UINT64_C(0x0624DD2F1A9FBE76), UINT64_C(0xC8B4395810624DD3)}, + {UINT64_C(0xA36E2EB1C432CA57), UINT64_C(0xA786C226809D4952)}, + {UINT64_C(0x4F8B588E368F0846), UINT64_C(0x1F9F01B866E43AA8)}, + {UINT64_C(0x0C6F7A0B5ED8D36B), UINT64_C(0x4C7F349385836220)}, + {UINT64_C(0xAD7F29ABCAF48578), UINT64_C(0x7A6520EC08D2369A)}, + {UINT64_C(0x5798EE2308C39DF9), UINT64_C(0xFB841A566D74F87B)}, + {UINT64_C(0x12E0BE826D694B2E), UINT64_C(0x62D01511F12A6062)}, + {UINT64_C(0xB7CDFD9D7BDBAB7D), UINT64_C(0x6AE6881CB5109A37)}, + {UINT64_C(0x5FD7FE17964955FD), UINT64_C(0xEF1ED34A2A73AE92)}, + {UINT64_C(0x19799812DEA11197), UINT64_C(0xF27F0F6E885C8BA8)}, + {UINT64_C(0xC25C268497681C26), UINT64_C(0x50CB4BE40D60DF74)}, + {UINT64_C(0x6849B86A12B9B01E), UINT64_C(0xA70909833DE71929)}, + {UINT64_C(0x203AF9EE756159B2), UINT64_C(0x1F3A6E0297EC1421)}, + {UINT64_C(0xCD2B297D889BC2B6), UINT64_C(0x985D7CD0F3135368)}, + {UINT64_C(0x70EF54646D496892), UINT64_C(0x137DFD73F5A90F86)}, + {UINT64_C(0x2725DD1D243ABA0E), UINT64_C(0x75FE645CC4873F9F)}, + {UINT64_C(0xD83C94FB6D2AC34A), UINT64_C(0x5663D3C7A0D865CB)}, + {UINT64_C(0x79CA10C9242235D5), UINT64_C(0x11E976394D79EB09)}, + {UINT64_C(0x2E3B40A0E9B4F7DD), UINT64_C(0xA7EDF82DD794BC07)}, + {UINT64_C(0xE392010175EE5962), 
UINT64_C(0xA6498D1625BAC671)}, + {UINT64_C(0x82DB34012B25144E), UINT64_C(0xEB6E0A781E2F0528)}, + {UINT64_C(0x357C299A88EA76A5), UINT64_C(0x8924D52CE4F26A86)}, + {UINT64_C(0xEF2D0F5DA7DD8AA2), UINT64_C(0x7507BB7B07EA440A)}, + {UINT64_C(0x8C240C4AECB13BB5), UINT64_C(0x2A6C95FC0655033B)}, + {UINT64_C(0x3CE9A36F23C0FC90), UINT64_C(0xEEBD44C99EAA68FC)}, + {UINT64_C(0xFB0F6BE50601941B), UINT64_C(0x17953ADC3110A7F9)}, + {UINT64_C(0x95A5EFEA6B34767C), UINT64_C(0x12DDC8B027408661)}, + {UINT64_C(0x4484BFEEBC29F863), UINT64_C(0x424B06F3529A051B)}, + {UINT64_C(0x039D66589687F9E9), UINT64_C(0x01D59F290EE19DAF)}, + {UINT64_C(0x9F623D5A8A732974), UINT64_C(0xCFBC31DB4B0295E5)}, + {UINT64_C(0x4C4E977BA1F5BAC3), UINT64_C(0xD9635B15D59BAB1D)}, + {UINT64_C(0x09D8792FB4C49569), UINT64_C(0x7AB5E277DE16227E)}, + {UINT64_C(0xA95A5B7F87A0EF0F), UINT64_C(0x2ABC9D8C9689D0C9)}, + {UINT64_C(0x54484932D2E725A5), UINT64_C(0xBBCA17A3ABA173D4)}, + {UINT64_C(0x1039D428A8B8EAEA), UINT64_C(0xFCA1AC82EFB45CAA)}, + {UINT64_C(0xB38FB9DAA78E44AB), UINT64_C(0x2DCF7A6B19209443)} + }; + const auto m_low {m_low_uint128_table[shift]}; + const int l {l_table[shift]}; + #else + const auto m_low {pow10_recip::m_low_uint128_table[shift]}; + const int l {pow10_recip::l_table[shift]}; + #endif + + const u256 product {umul256(coeff, m_low)}; + const int128::uint128_t t1 {product.bytes[3], product.bytes[2]}; + + const int128::uint128_t coeff_minus_t1 {coeff - t1}; + const int128::uint128_t t {(coeff_minus_t1 >> 1) + t1}; + const int128::uint128_t q {t >> (l - 1)}; + + const auto pow10_val {detail::pow10(int128::uint128_t{static_cast(shift)})}; + const int128::uint128_t r {coeff - q * pow10_val}; + + return {q, r}; +} + +// Compute (q, r) = (coeff / 10^shift, coeff mod 10^shift) for 0 <= shift <= 76, +// for u256 dividends. +// Uses Granlund-Montgomery reciprocal multiplication for shift in [1, 76] +// This replaces u256/u256 Knuth-D with a u256 mul-high plus a few u256 shifts and one u256 multiply for the remainder. 
+BOOST_DECIMAL_CUDA_CONSTEXPR auto divmod_pow10_u256(const u256& coeff, const int shift) noexcept + -> u256_divmod_result +{ + if (shift == 0) + { + return {coeff, u256{}}; + } + + // Fast path matching default_div's lhs(shift)})}; + if (coeff < pow10_val_early) + { + return {u256{}, coeff}; + } + + #if defined(__CUDACC__) && defined(BOOST_DECIMAL_ENABLE_CUDA) + // Reuse l_table from divmod_pow10_uint128 if it's been emitted; otherwise + // declare locally. CUDA doesn't share function-local statics, so just declare. + constexpr int l_table_u256[77] = { + 0, 4, 7, 10, 14, 17, 20, 24, 27, 30, + 34, 37, 40, 44, 47, 50, 54, 57, 60, 64, + 67, 70, 74, 77, 80, 84, 87, 90, 94, 97, + 100, 103, 107, 110, 113, 117, 120, 123, 127, 130, + 133, 137, 140, 143, 147, 150, 153, 157, 160, 163, + 167, 170, 173, 177, 180, 183, 187, 190, 193, 196, + 200, 203, 206, 210, 213, 216, 220, 223, 226, 230, + 233, 236, 240, 243, 246, 250, 253 + }; + // Local 77-entry u256 table (indices 0..76) for CUDA. See namespace-scope copy above for + // values; duplicated here so device code does not depend on host statics. 
+ constexpr u256 m_low_u256_local[77] = { + {UINT64_C(0), UINT64_C(0), UINT64_C(0), UINT64_C(0)}, + {UINT64_C(0x9999999999999999), UINT64_C(0x9999999999999999), UINT64_C(0x9999999999999999), UINT64_C(0x999999999999999A)}, + {UINT64_C(0x47AE147AE147AE14), UINT64_C(0x7AE147AE147AE147), UINT64_C(0xAE147AE147AE147A), UINT64_C(0xE147AE147AE147AF)}, + {UINT64_C(0x0624DD2F1A9FBE76), UINT64_C(0xC8B4395810624DD2), UINT64_C(0xF1A9FBE76C8B4395), UINT64_C(0x810624DD2F1A9FBF)}, + {UINT64_C(0xA36E2EB1C432CA57), UINT64_C(0xA786C226809D4951), UINT64_C(0x82A9930BE0DED288), UINT64_C(0xCE703AFB7E90FF98)}, + {UINT64_C(0x4F8B588E368F0846), UINT64_C(0x1F9F01B866E43AA7), UINT64_C(0x9BBADC0980B24207), UINT64_C(0x0B8CFBFC6540CC79)}, + {UINT64_C(0x0C6F7A0B5ED8D36B), UINT64_C(0x4C7F34938583621F), UINT64_C(0xAFC8B0079A2834D2), UINT64_C(0x6FA3FCC9EA9A3D2E)}, + {UINT64_C(0xAD7F29ABCAF48578), UINT64_C(0x7A6520EC08D23699), UINT64_C(0x194119A5C37387B7), UINT64_C(0x1906614310F6C849)}, + {UINT64_C(0x5798EE2308C39DF9), UINT64_C(0xFB841A566D74F87A), UINT64_C(0x7A9A7AEB02C2D2F8), UINT64_C(0xE0D1E768DA5F06A1)}, + {UINT64_C(0x12E0BE826D694B2E), UINT64_C(0x62D01511F12A6061), UINT64_C(0xFBAEC8BC02357593), UINT64_C(0xE70E52BA484C054E)}, + {UINT64_C(0xB7CDFD9D7BDBAB7D), UINT64_C(0x6AE6881CB5109A36), UINT64_C(0x5F7E0DF99D2255B9), UINT64_C(0x71B0845D4079A216)}, + {UINT64_C(0x5FD7FE17964955FD), UINT64_C(0xEF1ED34A2A73AE91), UINT64_C(0xE5FE71947DB51161), UINT64_C(0x27C069E4339481AB)}, + {UINT64_C(0x19799812DEA11197), UINT64_C(0xF27F0F6E885C8BA7), UINT64_C(0xEB31F476CAF7411A), UINT64_C(0x863387E9C2DD3489)}, + {UINT64_C(0xC25C268497681C26), UINT64_C(0x50CB4BE40D60DF73), UINT64_C(0x11E9872477F201C4), UINT64_C(0x09EC0CA937C8540E)}, + {UINT64_C(0x6849B86A12B9B01E), UINT64_C(0xA70909833DE71928), UINT64_C(0xDB2138E9F98E67D0), UINT64_C(0x07F00A20F96D100B)}, + {UINT64_C(0x203AF9EE756159B2), UINT64_C(0x1F3A6E0297EC1420), UINT64_C(0xAF4DC7219471ECA6), UINT64_C(0x6CC0081A61240CD6)}, + {UINT64_C(0xCD2B297D889BC2B6), 
UINT64_C(0x985D7CD0F3135367), UINT64_C(0x7EE2D835BA4FE10A), UINT64_C(0x4799A6909B6CE156)}, + {UINT64_C(0x70EF54646D496892), UINT64_C(0x137DFD73F5A90F85), UINT64_C(0xFF1BE02AFB731A6E), UINT64_C(0x9FAE1EDA15F0B445)}, + {UINT64_C(0x2725DD1D243ABA0E), UINT64_C(0x75FE645CC4873F9E), UINT64_C(0x65AFE688C928E1F2), UINT64_C(0x195818AE77F3C36B)}, + {UINT64_C(0xD83C94FB6D2AC34A), UINT64_C(0x5663D3C7A0D865CA), UINT64_C(0x3C4CA40E0EA7CFE9), UINT64_C(0xC2268DE3F31F9F11)}, + {UINT64_C(0x79CA10C9242235D5), UINT64_C(0x11E976394D79EB08), UINT64_C(0x303D500B3EECA654), UINT64_C(0x9B520B1CC27FB274)}, + {UINT64_C(0x2E3B40A0E9B4F7DD), UINT64_C(0xA7EDF82DD794BC06), UINT64_C(0x8CFDD9A298BD51DD), UINT64_C(0x490E6F4A3532F529)}, + {UINT64_C(0xE392010175EE5962), UINT64_C(0xA6498D1625BAC670), UINT64_C(0xE196290427954FC8), UINT64_C(0x74E3E54388518842)}, + {UINT64_C(0x82DB34012B25144E), UINT64_C(0xEB6E0A781E2F0527), UINT64_C(0x1ADE873686110CA0), UINT64_C(0x5D831DCFA04139CF)}, + {UINT64_C(0x357C299A88EA76A5), UINT64_C(0x8924D52CE4F26A85), UINT64_C(0xAF186C2B9E740A19), UINT64_C(0xE468E4A619CDC7D9)}, + {UINT64_C(0xEF2D0F5DA7DD8AA2), UINT64_C(0x7507BB7B07EA4409), UINT64_C(0x18271378FD86768F), UINT64_C(0xD3DB077029493FC1)}, + {UINT64_C(0x8C240C4AECB13BB5), UINT64_C(0x2A6C95FC0655033A), UINT64_C(0x79B8DC60CAD1F873), UINT64_C(0x0FE26C59BAA0FFCD)}, + {UINT64_C(0x3CE9A36F23C0FC90), UINT64_C(0xEEBD44C99EAA68FB), UINT64_C(0x9493E380A241938F), UINT64_C(0x3FE856AE2EE7330B)}, + {UINT64_C(0xFB0F6BE50601941B), UINT64_C(0x17953ADC3110A7F8), UINT64_C(0xEDB96C01039C1F4B), UINT64_C(0x9973BDE37E3EB811)}, + {UINT64_C(0x95A5EFEA6B34767C), UINT64_C(0x12DDC8B027408660), UINT64_C(0xBE2DF000CFB01909), UINT64_C(0x478FCB1C64FEF9A7)}, + {UINT64_C(0x4484BFEEBC29F863), UINT64_C(0x424B06F3529A051A), UINT64_C(0x31BE599A3FC01407), UINT64_C(0x6C7308E383FF2E20)}, + {UINT64_C(0x039D66589687F9E9), UINT64_C(0x01D59F290EE19DAE), UINT64_C(0x8E31E14833001005), UINT64_C(0xF05C071C6998F1B3)}, + {UINT64_C(0x9F623D5A8A732974), 
UINT64_C(0xCFBC31DB4B0295E4), UINT64_C(0x16B635405199B33C), UINT64_C(0xB3C671C70F5B1C51)}, + {UINT64_C(0x4C4E977BA1F5BAC3), UINT64_C(0xD9635B15D59BAB1C), UINT64_C(0xDEF82A99DAE15C30), UINT64_C(0x8FD1F49F3F7C16A7)}, + {UINT64_C(0x09D8792FB4C49569), UINT64_C(0x7AB5E277DE16227D), UINT64_C(0x7F2CEEE17BE77CF3), UINT64_C(0xA641907F65FCDEEC)}, + {UINT64_C(0xA95A5B7F87A0EF0F), UINT64_C(0x2ABC9D8C9689D0C8), UINT64_C(0xCB7B17CF2CA594B9), UINT64_C(0x0A0280CBD66164AD)}, + {UINT64_C(0x54484932D2E725A5), UINT64_C(0xBBCA17A3ABA173D3), UINT64_C(0xD5FC130C23B7AA2D), UINT64_C(0xA19B9A3CAB811D57)}, + {UINT64_C(0x1039D428A8B8EAEA), UINT64_C(0xFCA1AC82EFB45CA9), UINT64_C(0x77FCDC09B62C8824), UINT64_C(0x8149483089341779)}, + {UINT64_C(0xB38FB9DAA78E44AB), UINT64_C(0x2DCF7A6B19209442), UINT64_C(0x59949342BD140D07), UINT64_C(0x35420D1A7520258F)}, + {UINT64_C(0x5C72FB1552D836EF), UINT64_C(0x57D92EBC141A1035), UINT64_C(0x1476DC3564100A6C), UINT64_C(0x2A9B3DAEC419B7A5)}, + {UINT64_C(0x16C262777579C58C), UINT64_C(0x46475896767B402A), UINT64_C(0x76C57CF783400856), UINT64_C(0x887C31589CE15FB8)}, + {UINT64_C(0xBE03D0BF225C6F46), UINT64_C(0xD6D88DBD8A5ECD10), UINT64_C(0xBE08C7F26B99A6F0), UINT64_C(0xDA604EF42E3565F2)}, + {UINT64_C(0x64CFDA3281E38C38), UINT64_C(0xABE071646EB23DA6), UINT64_C(0xFE6D6CC1EFAE1F27), UINT64_C(0x1519D8C3582AB7F5)}, + {UINT64_C(0x1D7314F534B609C6), UINT64_C(0xEFE6C11D255B6485), UINT64_C(0x98578A34BFBE7F52), UINT64_C(0x7747E09C4688932B)}, + {UINT64_C(0xC8B821885456760B), UINT64_C(0x197134FB6EF8A0D5), UINT64_C(0xC08C105465FD9883), UINT64_C(0xF20C9A93A40DB844)}, + {UINT64_C(0x6D601AD376AB91A2), UINT64_C(0x7AC0F72F8BFA1A44), UINT64_C(0x9A09A6A9EB3146CF), UINT64_C(0xF4D6E20FB6716036)}, + {UINT64_C(0x244CE242C5560E1B), UINT64_C(0x95672C260994E1D0), UINT64_C(0x7B3AEBBB228DD23F), UINT64_C(0xF7124E72F85AB35F)}, + {UINT64_C(0xD3AE36D13BBCE35F), UINT64_C(0x5571E03CDC21694D), UINT64_C(0x91F7DF91D0E2E9FF), UINT64_C(0xF1B6E3EB26F78564)}, + {UINT64_C(0x7624F8A762FD82B2), 
UINT64_C(0xAAC18030B01ABAA4), UINT64_C(0x74C64C74A71BEE66), UINT64_C(0x5AF8B655B8C60450)}, + {UINT64_C(0x2B50C6EC4F31355B), UINT64_C(0xBBCE0026F3489550), UINT64_C(0x5D6B705D527CBEB8), UINT64_C(0x48C6F84493D19D0D)}, + {UINT64_C(0xDEE7A4AD4B81EEF9), UINT64_C(0x2C7CCD0B1EDA8880), UINT64_C(0x9578B3C883FACAC0), UINT64_C(0x74718D3A861C2E7B)}, + {UINT64_C(0x7F1FB6F10934BF2D), UINT64_C(0xBD30A408E57BA066), UINT64_C(0xDDFA296D36623BCD), UINT64_C(0x29F470FB9E7CF1FC)}, + {UINT64_C(0x327FC58DA0F6FF57), UINT64_C(0xCA8D50071DFC8052), UINT64_C(0x4B2E878A91E82FD7), UINT64_C(0x54C38D961863F4CA)}, + {UINT64_C(0xEA6608E29B24CBBF), UINT64_C(0xAA7BB33E9660CD50), UINT64_C(0x784A72774FD9E625), UINT64_C(0x546C15BCF3D32143)}, + {UINT64_C(0x8851A0B548EA3C99), UINT64_C(0x552FC298784D710D), UINT64_C(0x2D085B92A647EB51), UINT64_C(0x105677CA5CA8E769)}, + {UINT64_C(0x39DAE6F76D88307A), UINT64_C(0xAA8C9BAD2D0AC0D7), UINT64_C(0x5739E2DBB839890D), UINT64_C(0xA6AB93084A20B921)}, + {UINT64_C(0xF62B0B257C0D1A5D), UINT64_C(0xDDADC5E1E1AACE25), UINT64_C(0x585C9E2C59F5A815), UINT64_C(0xD778EB40769AC1CD)}, + {UINT64_C(0x91BC08EAC9A41517), UINT64_C(0xE48B04B4B488A4EA), UINT64_C(0xAD16E4F047F7B9AB), UINT64_C(0x12C72299F87BCE3E)}, + {UINT64_C(0x41633A556E1CDDAC), UINT64_C(0xB6D59D5D5D3A1D88), UINT64_C(0x8A78B7269FF96155), UINT64_C(0xA89F4EE193963E98)}, + {UINT64_C(0x011C2EAABE7D7E23), UINT64_C(0xC577B1177DC817A0), UINT64_C(0x6EC6F8EBB32DE777), UINT64_C(0xBA190BE7A944FEE0)}, + {UINT64_C(0x9B604AAACA62636C), UINT64_C(0x6F25E825960CF29A), UINT64_C(0x4AD7F4AC51E30BF2), UINT64_C(0xC35B463F753B3166)}, + {UINT64_C(0x4919D5556EB51C56), UINT64_C(0xBF518684780A5BAE), UINT64_C(0xA2465D56A7E8D65B), UINT64_C(0xCF7C3832C42F5AB8)}, + {UINT64_C(0x0747DDDDF22A7D12), UINT64_C(0x32A79ED060084958), UINT64_C(0x81D1E4455320AB7C), UINT64_C(0xA5FCF9C2368C4894)}, + {UINT64_C(0xA53FC9631D10C81D), UINT64_C(0x1DD8FE1A3340755A), UINT64_C(0x694FD3A21E9AABFA), UINT64_C(0xA32E5C69F0E0741F)}, + {UINT64_C(0x50FFD44F4A73D34A), 
UINT64_C(0x7E4731AE8F66C448), UINT64_C(0x543FDC81B2155662), UINT64_C(0x1C2516BB2719F67F)}, + {UINT64_C(0x0D9976A5D52975D5), UINT64_C(0x31D28E253F8569D3), UINT64_C(0x76997D348E77784E), UINT64_C(0x7CEA7895B8E19200)}, + {UINT64_C(0xAF5BF109550F22EE), UINT64_C(0xB61DB03B98D5761F), UINT64_C(0x2428C85417258D4A), UINT64_C(0x61772755F49C1CCC)}, + {UINT64_C(0x59165A6DDDA5B58B), UINT64_C(0xC4E48CFC7A445E7F), UINT64_C(0x5020A04345B7A43B), UINT64_C(0x812C1F77F6E34A3D)}, + {UINT64_C(0x1411E1F17E1E2AD6), UINT64_C(0x371D3D96C836B1FF), UINT64_C(0x734D4D029E2C8362), UINT64_C(0xCDBCE5F9924F6E97)}, + {UINT64_C(0xB9B6364F30304489), UINT64_C(0xF1C8628AD9F11CCB), UINT64_C(0xEBAEE19DC9E0D237), UINT64_C(0xAF94A328EA18B0F1)}, + {UINT64_C(0x615E91D8F359D06E), UINT64_C(0x5B06B53BE18DB0A3), UINT64_C(0x22F24E17D4B3DB5F), UINT64_C(0xBFAA1C20BB46F3F4)}, + {UINT64_C(0x1AB20E472914A6BE), UINT64_C(0xAF3890FCB4715A1C), UINT64_C(0x1BF50B4643C315E6), UINT64_C(0x32EE7CE6FC38C32A)}, + {UINT64_C(0xC45016D841BAA464), UINT64_C(0x4B8DB4C7871BC360), UINT64_C(0x2CBB453D39382309), UINT64_C(0xEB172E3E605AD1DD)}, + {UINT64_C(0x69D9ABE034955050), UINT64_C(0x3C715D6C6C1635E6), UINT64_C(0x8A2F6A9760F9B5A1), UINT64_C(0x88DF5831E6AF0E4A)}, + {UINT64_C(0x217AEFE690777373), UINT64_C(0x638DE456BCDE9185), UINT64_C(0x3B592212B3FAF7B4), UINT64_C(0x6D7F79C18558D83B)}, + {UINT64_C(0xCF2B1970E7258585), UINT64_C(0x6C163A2461641C08), UINT64_C(0x5EF503511FF7F2BA), UINT64_C(0x48CBF6026EF48D2C)}, + {UINT64_C(0x7288E1271F51379D), UINT64_C(0xF011C81D1AB67CD3), UINT64_C(0x7F2A690DB32CC22E), UINT64_C(0xA0A32B35259070F0)} + }; + const u256 m_low {m_low_u256_local[shift]}; + const int l {l_table_u256[shift]}; + #else + const u256 m_low {pow10_recip::m_low_u256_table[shift]}; + const int l {pow10_recip::l_table[shift]}; + #endif + + const u256 t1 {umul512_hi(coeff, m_low)}; + + // q = ((coeff - t1) >> 1 + t1) >> (l - 1). + u256 coeff_minus_t1 {coeff}; + { + // u256 lacks operator-=, so subtract via two's-complement add. 
+ u256 neg_t1 {t1}; + neg_t1.bytes[0] = ~t1.bytes[0]; neg_t1.bytes[1] = ~t1.bytes[1]; + neg_t1.bytes[2] = ~t1.bytes[2]; neg_t1.bytes[3] = ~t1.bytes[3]; + // neg_t1 == ~t1; coeff - t1 == coeff + ~t1 + 1. + // Implement with manual add+propagate since u256 has no +=. + std::uint64_t carry {1U}; + for (int i = 0; i < 4; ++i) + { + const std::uint64_t a {coeff.bytes[i]}; + const std::uint64_t b {neg_t1.bytes[i]}; + const std::uint64_t s1 {a + b}; + const std::uint64_t c1 {(s1 < a) ? UINT64_C(1) : UINT64_C(0)}; + const std::uint64_t s2 {s1 + carry}; + const std::uint64_t c2 {(s2 < s1) ? UINT64_C(1) : UINT64_C(0)}; + coeff_minus_t1.bytes[i] = s2; + carry = c1 + c2; + } + } + coeff_minus_t1 >>= 1; + + // t = coeff_minus_t1 + t1 + u256 t {coeff_minus_t1}; + { + std::uint64_t carry {0}; + for (int i = 0; i < 4; ++i) + { + const std::uint64_t a {t.bytes[i]}; + const std::uint64_t b {t1.bytes[i]}; + const std::uint64_t s1 {a + b}; + const std::uint64_t c1 {(s1 < a) ? UINT64_C(1) : UINT64_C(0)}; + const std::uint64_t s2 {s1 + carry}; + const std::uint64_t c2 {(s2 < s1) ? UINT64_C(1) : UINT64_C(0)}; + t.bytes[i] = s2; + carry = c1 + c2; + } + } + t >>= (l - 1); + + // r = coeff - t * pow10(shift). pow10(shift) fits in u256 for shift in [1, 76]. + const u256 q_times_d {t * pow10_val_early}; + u256 r {coeff}; + { + u256 neg {q_times_d}; + neg.bytes[0] = ~q_times_d.bytes[0]; neg.bytes[1] = ~q_times_d.bytes[1]; + neg.bytes[2] = ~q_times_d.bytes[2]; neg.bytes[3] = ~q_times_d.bytes[3]; + std::uint64_t carry {1U}; + for (int i = 0; i < 4; ++i) + { + const std::uint64_t a {r.bytes[i]}; + const std::uint64_t b {neg.bytes[i]}; + const std::uint64_t s1 {a + b}; + const std::uint64_t c1 {(s1 < a) ? UINT64_C(1) : UINT64_C(0)}; + const std::uint64_t s2 {s1 + carry}; + const std::uint64_t c2 {(s2 < s1) ? 
UINT64_C(1) : UINT64_C(0)}; + r.bytes[i] = s2; + carry = c1 + c2; + } + } + + return {t, r}; +} + +// Overload set used to dispatch from coefficient_rounding without an `if constexpr` +// dependency. uint128 and u256 dividends route to their reciprocal-multiply paths; +// other widths fall through to the ordinary divmod. The uint128 and u256 overloads also fall back to the ordinary divmod when shift lies outside the range their reciprocal path accepts ([1, 38] and [10, 76] respectively); the generic overload ignores shift entirely. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto divmod_pow10_dispatch(const T& coeff, int /*shift*/, const T& pow10_val) noexcept +{ + return divmod(coeff, pow10_val); +} + +inline BOOST_DECIMAL_CUDA_CONSTEXPR auto divmod_pow10_dispatch(const int128::uint128_t& coeff, int shift, const int128::uint128_t& pow10_val) noexcept + -> divmod_result +{ + if (shift >= 1 && shift <= 38) + { + return divmod_pow10_uint128(coeff, shift); + } + return divmod(coeff, pow10_val); +} + +inline BOOST_DECIMAL_CUDA_CONSTEXPR auto divmod_pow10_dispatch(const u256& coeff, int shift, const u256& pow10_val) noexcept + -> u256_divmod_result +{ + // For shift <= 9, pow10(shift) <= 10^9 < 2^32, so the existing div_mod takes + // its single-word fast path (8 iterations of 64/32 hardware divide); benchmarks + // show that beats our 16-multiply mul-high. For shift >= 10 the divisor needs + // Knuth-D in the existing path, which the reciprocal multiply outperforms by + // 1.2x to 2.1x on the realistic decimal128 wide-path input distribution. + if (shift >= 10 && shift <= 76) + { + return divmod_pow10_u256(coeff, shift); + } + return divmod(coeff, pow10_val); +} + template BOOST_DECIMAL_CUDA_CONSTEXPR auto fenv_round_impl(T& val, const bool is_neg, const bool sticky, const rounding_mode round) noexcept -> int { @@ -111,36 +568,28 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto fenv_round_impl(T& val, const bool is_neg, con val = div_res.quotient; const auto trailing_num {div_res.remainder}; - // Default rounding mode + // Compute the round-up decision once, then apply it.
+ // The only inputs that can overflow precision after the increment are val == max_significand combined + // with round_up == true, so predict that case rather than always doing a wide + // post-rounding compare + divide. + bool round_up {false}; switch (round) { case rounding_mode::fe_dec_to_nearest_from_zero: - if (trailing_num >= 5U) - { - ++val; - } + round_up = trailing_num >= 5U; break; case rounding_mode::fe_dec_downward: - if (is_neg && (trailing_num != 0U || sticky)) - { - ++val; - } + round_up = is_neg && (trailing_num != 0U || sticky); break; case rounding_mode::fe_dec_to_nearest: // Round to even or nearest - if (trailing_num > 5U || (trailing_num == 5U && (sticky || (static_cast(val) & 1U) == 1U))) - { - ++val; - } + round_up = trailing_num > 5U || (trailing_num == 5U && (sticky || (static_cast(val) & 1U) == 1U)); break; case rounding_mode::fe_dec_toward_zero: // Do nothing break; case rounding_mode::fe_dec_upward: - if (!is_neg && (trailing_num != 0U || sticky)) - { - ++val; - } + round_up = !is_neg && (trailing_num != 0U || sticky); break; // LCOV_EXCL_START default: @@ -148,12 +597,19 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto fenv_round_impl(T& val, const bool is_neg, con // LCOV_EXCL_STOP } - // If the significand was e.g. 99'999'999 rounding up - // would put it out of range again - if (BOOST_DECIMAL_UNLIKELY(static_cast(val) > max_significand_v())) + if (round_up) { - val /= 10U; - ++exp; + if (BOOST_DECIMAL_UNLIKELY(static_cast(val) == max_significand_v())) + { + // val + 1 == 10^precision + // the cohort-preserving encoding is 10^(precision-1) with the exponent bumped by one. 
+ val = static_cast(max_significand_v() / 10U + 1U); + ++exp; + } + else + { + ++val; + } } return exp; @@ -231,29 +687,36 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto coefficient_rounding(T1& coeff, T2& exp, T3& b const auto shift_pow_ten {detail::pow10(static_cast(shift))}; // In the synthetic integer cases it's inexpensive to see if we can demote the type - // relative to the cost of the division and modulo operation + // relative to the cost of the division and modulo operation. + // impl::divmod_pow10_dispatch routes uint128 and u256 cases to a Granlund-Montgomery reciprocal multiply, + // and falls through to the ordinary divmod elsewhere. demoted_integer_type shifted_coeff {}; bool sticky {}; BOOST_DECIMAL_IF_CONSTEXPR (sizeof(T1) < sizeof(int128::uint128_t)) { - const auto div_res {impl::divmod(coeff, shift_pow_ten)}; + const auto div_res {impl::divmod_pow10_dispatch(coeff, shift, shift_pow_ten)}; shifted_coeff = static_cast(div_res.quotient); const auto trailing_digits {div_res.remainder}; sticky = trailing_digits != 0U; } else { - if (coeff < std::numeric_limits::max()) + // We already have coeff_digits and the demoted type's digits10 is a constexpr int, + // so prefer the int compare over a 256-bit compare. + // This is slightly more conservative: the narrow band of (digits10+1)-digit values that fit in the + // demoted type only by virtue of its high-bit slack now lands in the wide-divmod branch.
+ if (coeff_digits <= std::numeric_limits::digits10) { const auto smaller_coeff {static_cast(coeff)}; - const auto div_res {impl::divmod(smaller_coeff, static_cast(shift_pow_ten))}; + const auto smaller_pow10 {static_cast(shift_pow_ten)}; + const auto div_res {impl::divmod_pow10_dispatch(smaller_coeff, shift, smaller_pow10)}; shifted_coeff = static_cast(div_res.quotient); const auto trailing_digits {div_res.remainder}; sticky = trailing_digits != 0U; } else { - const auto div_res {impl::divmod(coeff, shift_pow_ten)}; + const auto div_res {impl::divmod_pow10_dispatch(coeff, shift, shift_pow_ten)}; shifted_coeff = static_cast(div_res.quotient); const auto trailing_digits {div_res.remainder}; sticky = trailing_digits != 0U; diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp index ce4c0224d..6d675b82b 100644 --- a/include/boost/decimal/detail/u256.hpp +++ b/include/boost/decimal/detail/u256.hpp @@ -910,26 +910,73 @@ BOOST_DECIMAL_CUDA_CONSTEXPR u256 umul256(const int128::uint128_t& a, const int1 const auto p2 = a_high * b_low; const auto p3 = a_high * b_high; - // Combine results - const auto middle = p1 + p2 + p0.high; + const auto p1_plus_p2 = p1 + p2; + const std::uint64_t carry_p1p2 = (p1_plus_p2 < p1) ? UINT64_C(1) : UINT64_C(0); + + const auto middle = p1_plus_p2 + p0.high; + const std::uint64_t carry_mid = (middle < p1_plus_p2) ? UINT64_C(1) : UINT64_C(0); result.bytes[0] = p0.low; result.bytes[1] = middle.low; - const auto high_sum = middle.high + p3; + auto high_sum = p3 + int128::uint128_t{0, middle.high}; + high_sum += int128::uint128_t{carry_p1p2 + carry_mid, 0}; + result.bytes[2] = high_sum.low; result.bytes[3] = high_sum.high; return result; } -// 128×64→256 multiplication (SoftFloat-style lightweight primitive) +// Returns the high 256 bits of a u256 * u256 -> u512 product. The four umul256 partial products are summed in 128-bit columns with explicit carry tracking; the column w1 (bits 128..255) is formed only for the carry it feeds into w2, and the result is built from the two most significant columns, w3 and w2. +BOOST_DECIMAL_CUDA_CONSTEXPR u256 umul512_hi(const u256& a, const u256& b) noexcept +{ + // Decompose each operand into two uint128 halves.
+ const int128::uint128_t a_lo {a.bytes[1], a.bytes[0]}; + const int128::uint128_t a_hi {a.bytes[3], a.bytes[2]}; + const int128::uint128_t b_lo {b.bytes[1], b.bytes[0]}; + const int128::uint128_t b_hi {b.bytes[3], b.bytes[2]}; + + // Four uint128 * uint128 -> u256 partial products: p_ll has weight 2^0, p_lh and p_hl have weight 2^128, and p_hh has weight 2^256 in the full 512-bit product. + const u256 p_ll {umul256(a_lo, b_lo)}; + const u256 p_lh {umul256(a_lo, b_hi)}; + const u256 p_hl {umul256(a_hi, b_lo)}; + const u256 p_hh {umul256(a_hi, b_hi)}; + + const int128::uint128_t p_ll_hi {p_ll.bytes[3], p_ll.bytes[2]}; + const int128::uint128_t p_lh_lo {p_lh.bytes[1], p_lh.bytes[0]}; + const int128::uint128_t p_lh_hi {p_lh.bytes[3], p_lh.bytes[2]}; + const int128::uint128_t p_hl_lo {p_hl.bytes[1], p_hl.bytes[0]}; + const int128::uint128_t p_hl_hi {p_hl.bytes[3], p_hl.bytes[2]}; + const int128::uint128_t p_hh_lo {p_hh.bytes[1], p_hh.bytes[0]}; + const int128::uint128_t p_hh_hi {p_hh.bytes[3], p_hh.bytes[2]}; + + int128::uint128_t w1 {p_ll_hi}; + w1 += p_lh_lo; + std::uint64_t carry_w1 {(w1 < p_lh_lo) ? UINT64_C(1) : UINT64_C(0)}; + w1 += p_hl_lo; + carry_w1 += (w1 < p_hl_lo) ? UINT64_C(1) : UINT64_C(0); + + int128::uint128_t w2 {p_lh_hi}; + w2 += p_hl_hi; + std::uint64_t carry_w2 {(w2 < p_hl_hi) ? UINT64_C(1) : UINT64_C(0)}; + w2 += p_hh_lo; + carry_w2 += (w2 < p_hh_lo) ? UINT64_C(1) : UINT64_C(0); + const int128::uint128_t w2_before_carry {w2}; + w2 += int128::uint128_t{0, carry_w1}; + carry_w2 += (w2 < w2_before_carry) ? UINT64_C(1) : UINT64_C(0); + + const int128::uint128_t w3 {p_hh_hi + int128::uint128_t{0, carry_w2}}; + + return u256{w3, w2}; +} + +// 128x64 -> 256 multiplication (SoftFloat-style lightweight primitive) // Used when rhs is 64-bit (e.g.
r_scaled from approx_recip_sqrt64) -// Explicit uint128_t cast ensures 64×64→128 widening (a.low*b otherwise returns uint64_t on some platforms) -BOOST_DECIMAL_CUDA_CONSTEXPR u256 mul128By64(const int128::uint128_t& a, const std::uint64_t b) noexcept +BOOST_DECIMAL_CUDA_CONSTEXPR u256 mul128_by_64(const int128::uint128_t& a, const std::uint64_t b) noexcept { - const int128::uint128_t p0 = int128::uint128_t{a.low} * b; // 64×64→128 - const int128::uint128_t p1 = int128::uint128_t{a.high} * b; // 64×64→128 + const int128::uint128_t p0 = int128::uint128_t{a.low} * b; // 64x64 -> 128 + const int128::uint128_t p1 = int128::uint128_t{a.high} * b; // 64x64 -> 128 const auto mid = p1.low + p0.high; const std::uint64_t carry1 = (mid < p0.high) ? 1U : 0U; const auto hi = p1.high + carry1;