From 5e1670507e85f007afea9402f90e550f771ba67b Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 12 Sep 2025 10:07:25 -0700 Subject: [PATCH 01/20] Add MSVC build support on windows --- lib/meson.build | 8 +++---- lib/x86simdsort.cpp | 34 ++++++++++++++++-------------- lib/x86simdsort.h | 5 +++++ lib/x86simdsortcpuid.h | 48 ++++++++++++++++++++++++++++++++++++++++++ meson.build | 1 + 5 files changed, 76 insertions(+), 20 deletions(-) create mode 100644 lib/x86simdsortcpuid.h diff --git a/lib/meson.build b/lib/meson.build index 44ced535..2f9f06c3 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -6,7 +6,7 @@ if cpp.has_argument('-march=haswell') 'x86simdsort-avx2.cpp', ), include_directories : [src], - cpp_args : ['-march=haswell'], + cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX2'] : ['-march=haswell'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) @@ -18,7 +18,7 @@ if cpp.has_argument('-march=skylake-avx512') 'x86simdsort-skx.cpp', ), include_directories : [src], - cpp_args : ['-march=skylake-avx512'], + cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=skylake-avx512'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) @@ -30,7 +30,7 @@ if cpp.has_argument('-march=icelake-client') 'x86simdsort-icl.cpp', ), include_directories : [src], - cpp_args : ['-march=icelake-client'], + cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=icelake-client'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) @@ -42,7 +42,7 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : ['-march=sapphirerapids'], + cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=sapphirerapids'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 35d6ce43..a5e15299 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -1,6 +1,12 @@ +#if defined(_MSC_VER) +#define XSS_ATTRIBUTE_CONSTRUCTOR +#else +#define XSS_ATTRIBUTE_CONSTRUCTOR __attribute__((constructor)) +#endif #include "x86simdsort.h" #include "x86simdsort-internal.h" #include "x86simdsort-scalar.h" +#include "x86simdsortcpuid.h" #include #include #include @@ -12,23 +18,19 @@ static int check_cpu_feature_support(std::string_view cpufeature) if ((cpufeature == "avx512_spr") && (!disable_avx512)) #if defined(__FLT16_MAX__) && !defined(__INTEL_LLVM_COMPILER) \ && (!defined(__clang_major__) || __clang_major__ >= 18) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512fp16") - && __builtin_cpu_supports("avx512vbmi2"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512fp16") + && xss_cpu_supports("avx512vbmi2"); #else return 0; #endif else if ((cpufeature == "avx512_icl") && (!disable_avx512)) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512vbmi2") - && __builtin_cpu_supports("avx512bw") - && __builtin_cpu_supports("avx512vl"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512vbmi2") + && xss_cpu_supports("avx512bw") && xss_cpu_supports("avx512vl"); else if ((cpufeature == "avx512_skx") && (!disable_avx512)) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512dq") - && __builtin_cpu_supports("avx512vl"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512dq") + && xss_cpu_supports("avx512vl"); else if (cpufeature == "avx2") - return __builtin_cpu_supports("avx2"); + return xss_cpu_supports("avx2"); return 0; } @@ -121,11 +123,11 @@ constexpr bool IS_TYPE_FLOAT16() /* runtime dispatch mechanism */ #define DISPATCH(func, TYPE, ISA) \ - DECLARE_INTERNAL_##func(TYPE) static __attribute__((constructor)) void \ - CAT(CAT(resolve_, func), TYPE)(void) \ + DECLARE_INTERNAL_##func(TYPE) static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ + CAT(resolve_, func), TYPE)(void) \ { \ CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ - __builtin_cpu_init(); \ + xss_cpu_init(); \ std::string_view preferred_cpu = find_preferred_cpu(ISA); \ if constexpr (dispatch_requested("avx512", ISA)) { \ if (preferred_cpu.find("avx512") != std::string_view::npos) { \ @@ -248,12 +250,12 @@ DISPATCH_ALL(argselect, } #define DISPATCH_KV_FUNC(func, TYPE1, TYPE2, ISA) \ - static __attribute__((constructor)) void CAT( \ + static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ CAT(CAT(CAT(resolve_, func), _), TYPE1), TYPE2)(void) \ { \ CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ = &xss::scalar::func; \ - __builtin_cpu_init(); \ + xss_cpu_init(); \ std::string_view preferred_cpu = find_preferred_cpu(ISA); \ if constexpr (dispatch_requested("avx512", ISA)) { \ if (preferred_cpu.find("avx512") != std::string_view::npos) { \ diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h index 34ed101d..e30120ef 100644 --- a/lib/x86simdsort.h +++ b/lib/x86simdsort.h @@ -6,8 +6,13 @@ #include #include +#if defined(_MSC_VER) +#define XSS_EXPORT_SYMBOL __declspec(dllexport) +#define XSS_HIDE_SYMBOL +#else #define XSS_EXPORT_SYMBOL __attribute__((visibility("default"))) #define XSS_HIDE_SYMBOL __attribute__((visibility("hidden"))) +#endif #define UNUSED(x) (void)(x) namespace x86simdsort { diff --git a/lib/x86simdsortcpuid.h b/lib/x86simdsortcpuid.h new file mode 100644 index 00000000..a3dcd9e3 --- /dev/null +++ b/lib/x86simdsortcpuid.h @@ -0,0 +1,48 @@ +#ifndef X86SIMDSORT_CPUID_H +#define X86SIMDSORT_CPUID_H + +#ifdef _MSC_VER +#include +#include +#include + +static std::unordered_map xss_cpu_features; + +inline void xss_cpu_init() +{ + int cpuInfo[4] = {0}; + // Check AVX2 + __cpuid(cpuInfo, 0); + int nIds = cpuInfo[0]; + __cpuid(cpuInfo, 1); + bool osxsave = (cpuInfo[2] & (1 << 27)) != 0; + bool avx = (cpuInfo[2] & (1 << 28)) != 0; + __cpuid(cpuInfo, 7); + bool avx2 = (cpuInfo[1] & (1 << 5)) != 0; + bool avx512f = (cpuInfo[1] & (1 << 16)) != 0; + bool avx512dq = (cpuInfo[1] & (1 << 17)) != 0; + bool avx512bw = (cpuInfo[1] & (1 << 30)) != 0; + bool avx512vl = (cpuInfo[1] & (1 << 31)) != 0; + bool avx512vbmi2 = (cpuInfo[2] & (1 << 6)) != 0; + bool avx512fp16 = (cpuInfo[3] & (1 << 23)) != 0; + // Store results + xss_cpu_features["avx2"] = avx2; + xss_cpu_features["avx512f"] = avx512f; + xss_cpu_features["avx512dq"] = avx512dq; + xss_cpu_features["avx512bw"] = avx512bw; + xss_cpu_features["avx512vl"] = avx512vl; + xss_cpu_features["avx512vbmi2"] = avx512vbmi2; + xss_cpu_features["avx512fp16"] = avx512fp16; +} + +inline bool xss_cpu_supports(const char *feature) +{ + auto it = xss_cpu_features.find(feature); + return it != xss_cpu_features.end() && it->second; +} + +#else +#define xss_cpu_init() __builtin_cpu_init() +#define xss_cpu_supports(feature) __builtin_cpu_supports(feature) +#endif // _MSC_VER +#endif // X86SIMDSORT_CPUID_H diff --git a/meson.build b/meson.build index 0b826f06..70c9fef1 100644 --- a/meson.build +++ b/meson.build @@ -1,3 +1,4 @@ + project('x86-simd-sort', 'cpp', version : '7.0.x', license : 'BSD 3-clause', From 208940d590b4c0d1ed4bb44a8de6e659c8aaf62b Mon Sep 17 00:00:00 2001 From: Raghuveer Date: Fri, 12 Sep 2025 13:40:32 -0700 Subject: [PATCH 02/20] CI: use one ASAN build and run it without SDE to improve speed --- .github/workflows/c-cpp.yml | 44 ++++--------------------------------- 1 file changed, 4 insertions(+), 40 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 865c0df1..66345bdf 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,4 +1,4 @@ -name: Build and run tests + name: Build and run tests on: push: @@ -135,7 +135,7 @@ jobs: - name: Run test suite on SPR run: sde -spr -- ./builddir/testexe --gtest_filter="*simdsort*" - ADL-ASAN-clang18: + ASAN-clang18: runs-on: ubuntu-24.04 @@ -172,45 +172,9 @@ jobs: - name: Run test suite on ADL run: sde -adl -- ./builddir/testexe --gtest_filter="*simdsort*" - SPR-ASAN-clang18: - - runs-on: intel-ubuntu-24.04 - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install dependencies - run: | - sudo apt update - sudo apt -y install clang-18 libomp-18-dev libgtest-dev meson curl git - - - name: Install Intel SDE - run: | - curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz - mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ - sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde - - - name: Build examples - env: - CXX: clang++-18 - run: | - cd examples - make all - - - name: Build - env: - CXX: clang++-18 - run: | - make clean - meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir - cd builddir - ninja - - name: Run test suite on SPR - run: sde -spr -- ./builddir/testexe - - name: Run ICL fp16 tests - # Note: This filters for the _Float16 tests based on the number assigned to it, which could change in the future - run: sde -icx -- ./builddir/testexe --gtest_filter="*/simdsort/2*" + run: sde -spr -- ./builddir/testexe --gtest_filter="*simdsort*" + SKX-SKL-openmp: From 7bc1655242e87e0bc808e9ca9461b578d1081d01 Mon Sep 17 00:00:00 2001 From: Raghuveer Date: Fri, 12 Sep 2025 13:50:14 -0700 Subject: [PATCH 03/20] Simplify compiler checks --- lib/meson.build | 63 ++++++++++++++++++++++--------------------------- meson.build | 8 +++++++ 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/lib/meson.build b/lib/meson.build index 2f9f06c3..29ee139b 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -1,40 +1,33 @@ libtargets = [] +libtargets += static_library('libavx', + files( + 'x86simdsort-avx2.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX2'] : ['-march=haswell'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) -if cpp.has_argument('-march=haswell') - libtargets += static_library('libavx', - files( - 'x86simdsort-avx2.cpp', - ), - include_directories : [src], - cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX2'] : ['-march=haswell'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif - -if cpp.has_argument('-march=skylake-avx512') - libtargets += static_library('libskx', - files( - 'x86simdsort-skx.cpp', - ), - include_directories : [src], - cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=skylake-avx512'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif +libtargets += static_library('libskx', + files( + 'x86simdsort-skx.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=skylake-avx512'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) -if cpp.has_argument('-march=icelake-client') - libtargets += static_library('libicl', - files( - 'x86simdsort-icl.cpp', - ), - include_directories : [src], - cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=icelake-client'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif +libtargets += static_library('libicl', + files( + 'x86simdsort-icl.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=icelake-client'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) if cancompilefp16 libtargets += static_library('libspr', @@ -42,7 +35,7 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=sapphirerapids'], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=sapphirerapids'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) diff --git a/meson.build b/meson.build index 70c9fef1..38e84d51 100644 --- a/meson.build +++ b/meson.build @@ -11,6 +11,13 @@ bench = include_directories('benchmarks') utils = include_directories('utils') tests = include_directories('tests') +# check if compiler supports -march=haswell, -march=skylake-avx512 and -march=icelake-client and error out if not +if cpp.get_id() != 'msvc' + if not cpp.has_argument('-march=haswell') or not cpp.has_argument('-march=skylake-avx512') or not cpp.has_argument('-march=icelake-client') + error('Compiler does not support -march=haswell, -march=skylake-avx512 or -march=icelake-client. Please use a newer compiler version.') + endif +endif + # Add IPP sort to benchmarks: benchipp = false ipplink = [] @@ -38,6 +45,7 @@ if get_option('use_openmp') omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP']) endif + fp16code = '''#include int main() { __m512h temp = _mm512_set1_ph(1.0f); From 2d2a29a58ce99490219d291d4379ac1a085fdcef Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 15 Sep 2025 12:36:59 -0700 Subject: [PATCH 04/20] Add avx512 fp16 header file when build with MSVC --- lib/x86simdsort-icl.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp index 6bbad2c3..96456aba 100644 --- a/lib/x86simdsort-icl.cpp +++ b/lib/x86simdsort-icl.cpp @@ -1,6 +1,10 @@ // ICL specific routines: #include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" +#ifdef _MSC_VER +#include "avx512-16bit-qsort.hpp" +#endif + namespace xss { namespace avx512 { From c84aa9c886f6f95c3b17c355d042edfe7895cfc5 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 16 Sep 2025 09:38:14 -0700 Subject: [PATCH 05/20] CI: add windows msvc build --- .github/workflows/c-cpp.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 66345bdf..a5757e2b 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -9,6 +9,37 @@ on: permissions: read-all jobs: + build-windows-msvc: + runs-on: windows-latest + + steps: + - uses: actions/checkout@v4 + + # Set up MSVC environment + - name: Set up MSVC Developer Command Prompt + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + + # Install Python (Meson requires it) + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + # Install Meson and Ninja + - name: Install Meson + Ninja + run: | + python -m pip install --upgrade pip + pip install meson ninja + + # Configure and build with Meson (MSVC will be used automatically) + - name: Configure (Meson) + run: meson setup --warnlevel 2 --buildtype release builddir --backend=ninja + + - name: Build (Ninja) + run: ninja -C builddir + SKL-gcc9: runs-on: ubuntu-24.04 From 463551169fa444ac84125b91cb6f5f668216e596 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 24 Sep 2025 12:37:38 -0700 Subject: [PATCH 06/20] Use XSS_EXPORT_SYMBOL on all template specializations --- lib/x86simdsort.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index a5e15299..4abfa469 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -62,7 +62,7 @@ namespace x86simdsort { #define DECLARE_INTERNAL_qsort(TYPE) \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ template <> \ - void qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + void XSS_EXPORT_SYMBOL qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_qsort##TYPE)(arr, arrsize, hasnan, descending); \ } @@ -71,7 +71,7 @@ namespace x86simdsort { static void (*internal_qselect##TYPE)(TYPE *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void qselect( \ + void XSS_EXPORT_SYMBOL qselect( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_qselect##TYPE)(arr, k, arrsize, hasnan, descending); \ @@ -82,7 +82,7 @@ namespace x86simdsort { TYPE *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void partial_qsort( \ + void XSS_EXPORT_SYMBOL partial_qsort( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_partial_qsort##TYPE)(arr, k, arrsize, hasnan, descending); \ @@ -93,8 +93,8 @@ namespace x86simdsort { const TYPE *, size_t, bool, bool) \ = NULL; \ template <> \ - std::vector argsort( \ - const TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + std::vector XSS_EXPORT_SYMBOL argsort( \ + TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ return (*internal_argsort##TYPE)(arr, arrsize, hasnan, descending); \ } @@ -104,8 +104,8 @@ namespace x86simdsort { const TYPE *, size_t, size_t, bool) \ = NULL; \ template <> \ - std::vector argselect( \ - const TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ + std::vector XSS_EXPORT_SYMBOL argselect( \ + TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ { \ return (*internal_argselect##TYPE)(arr, k, arrsize, hasnan); \ } @@ -217,7 +217,7 @@ DISPATCH_ALL(argselect, TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void keyvalue_qsort(TYPE1 *key, \ + void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ TYPE2 *val, \ size_t arrsize, \ bool hasnan, \ @@ -227,7 +227,7 @@ DISPATCH_ALL(argselect, key, val, arrsize, hasnan, descending); \ } \ template <> \ - void keyvalue_select(TYPE1 *key, \ + void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ TYPE2 *val, \ size_t k, \ size_t arrsize, \ @@ -238,7 +238,7 @@ DISPATCH_ALL(argselect, key, val, k, arrsize, hasnan, descending); \ } \ template <> \ - void keyvalue_partial_sort(TYPE1 *key, \ + void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ TYPE2 *val, \ size_t k, \ size_t arrsize, \ From d91a674b970767d02b0ad47a25def2e37d5474e9 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 24 Sep 2025 13:51:21 -0700 Subject: [PATCH 07/20] Add checks for xsave and _xgetbv in cpuid checks --- lib/x86simdsortcpuid.h | 81 ++++++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/lib/x86simdsortcpuid.h b/lib/x86simdsortcpuid.h index a3dcd9e3..6da177d2 100644 --- a/lib/x86simdsortcpuid.h +++ b/lib/x86simdsortcpuid.h @@ -8,31 +8,66 @@ static std::unordered_map xss_cpu_features; -inline void xss_cpu_init() +static bool os_supports_avx() { - int cpuInfo[4] = {0}; - // Check AVX2 - __cpuid(cpuInfo, 0); - int nIds = cpuInfo[0]; + int cpuInfo[4]; __cpuid(cpuInfo, 1); - bool osxsave = (cpuInfo[2] & (1 << 27)) != 0; - bool avx = (cpuInfo[2] & (1 << 28)) != 0; - __cpuid(cpuInfo, 7); - bool avx2 = (cpuInfo[1] & (1 << 5)) != 0; - bool avx512f = (cpuInfo[1] & (1 << 16)) != 0; - bool avx512dq = (cpuInfo[1] & (1 << 17)) != 0; - bool avx512bw = (cpuInfo[1] & (1 << 30)) != 0; - bool avx512vl = (cpuInfo[1] & (1 << 31)) != 0; - bool avx512vbmi2 = (cpuInfo[2] & (1 << 6)) != 0; - bool avx512fp16 = (cpuInfo[3] & (1 << 23)) != 0; - // Store results - xss_cpu_features["avx2"] = avx2; - xss_cpu_features["avx512f"] = avx512f; - xss_cpu_features["avx512dq"] = avx512dq; - xss_cpu_features["avx512bw"] = avx512bw; - xss_cpu_features["avx512vl"] = avx512vl; - xss_cpu_features["avx512vbmi2"] = avx512vbmi2; - xss_cpu_features["avx512fp16"] = avx512fp16; + + bool osxsaveSupported = (cpuInfo[2] & (1 << 27)) != 0; // OSXSAVE bit + bool avxSupported = (cpuInfo[2] & (1 << 28)) != 0; // AVX bit + if (!(avxSupported && osxsaveSupported)) + return false; + + // Check XCR0[2:1] (XMM and YMM state) + unsigned long long xcr0 = _xgetbv(0); + return (xcr0 & 0x6) == 0x6; +} + +static bool os_supports_avx512() +{ + if (!os_supports_avx()) + return false; + + // Need XCR0[7:5] = opmask/ZMM/YMM state enabled + unsigned long long xcr0 = _xgetbv(0); + return (xcr0 & 0xE0) == 0xE0; +} + +void xss_cpu_init() +{ + int cpuInfo[4]; + __cpuid(cpuInfo, 0); + int maxLeaf = cpuInfo[0]; + + bool hasAVX2 = false; + bool hasAVX512F = false, hasAVX512DQ = false, hasAVX512BW = false, hasAVX512VL = false; + bool hasAVX512VBMI2 = false, hasAVX512FP16 = false; + + if (maxLeaf >= 7) + { + __cpuidex(cpuInfo, 7, 0); + + // EBX bits + hasAVX2 = os_supports_avx() && (cpuInfo[1] & (1 << 5)); + hasAVX512F = os_supports_avx512() && (cpuInfo[1] & (1 << 16)); + hasAVX512DQ = os_supports_avx512() && (cpuInfo[1] & (1 << 17)); + hasAVX512BW = os_supports_avx512() && (cpuInfo[1] & (1 << 30)); + hasAVX512VL = os_supports_avx512() && (cpuInfo[1] & (1 << 31)); + + // ECX bits + hasAVX512VBMI2 = os_supports_avx512() && (cpuInfo[2] & (1 << 6)); + + // EDX bits + hasAVX512FP16 = os_supports_avx512() && (cpuInfo[3] & (1 << 23)); + } + + xss_cpu_features["avx2"] = hasAVX2; + xss_cpu_features["avx512f"] = hasAVX512F; + xss_cpu_features["avx512dq"] = hasAVX512DQ; + xss_cpu_features["avx512bw"] = hasAVX512BW; + xss_cpu_features["avx512vl"] = hasAVX512VL; + xss_cpu_features["avx512vbmi2"] = hasAVX512VBMI2; + xss_cpu_features["avx512fp16"] = hasAVX512FP16; } inline bool xss_cpu_supports(const char *feature) From 459468ca2c462bdcabdf1f2b5c5a9450ccb5d09a Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 6 Nov 2025 21:33:26 -0800 Subject: [PATCH 08/20] Install gtest and build tests --- .github/workflows/c-cpp.yml | 19 ++++++++++++++++++- meson.build | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index a5757e2b..94c05bd0 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -33,12 +33,29 @@ jobs: python -m pip install --upgrade pip pip install meson ninja + - name: Setup environment + run: | + echo "PKG_CONFIG=${{ github.workspace }}/vcpkg/installed/x64-windows/tools/pkgconf/pkgconf.exe" >> $env:GITHUB_ENV + echo "PKG_CONFIG_PATH=${{ github.workspace }}/vcpkg/installed/x64-windows/lib/pkgconfig" >> $env:GITHUB_ENV + echo "LIB=${{ github.workspace }}/vcpkg/installed/x64-windows/lib;$env:LIB" >> $env:GITHUB_ENV + echo "INCLUDE=${{ github.workspace }}/vcpkg/installed/x64-windows/include;$env:INCLUDE" >> $env:GITHUB_ENV + shell: pwsh + + - name: Setup vcpkg and install pkg-config and gtest + run: | + git clone https://github.com/Microsoft/vcpkg.git + .\vcpkg\bootstrap-vcpkg.bat + .\vcpkg\vcpkg install gtest:x64-windows pkgconf:x64-windows + # Configure and build with Meson (MSVC will be used automatically) - name: Configure (Meson) - run: meson setup --warnlevel 2 --buildtype release builddir --backend=ninja + run: meson setup -Dbuild_tests=true --warnlevel 2 --buildtype release builddir --backend=ninja - name: Build (Ninja) run: ninja -C builddir + + - name: Run tests + run: .\builddir\testexe --gtest_filter="*qsort*" SKL-gcc9: diff --git a/meson.build b/meson.build index 38e84d51..703ec385 100644 --- a/meson.build +++ b/meson.build @@ -93,7 +93,7 @@ x86simdsortcpp_dep = declare_dependency( # Build test suite if option build_tests set to true if get_option('build_tests') - gtest_dep = dependency('gtest_main', required : true, static: false) + gtest_dep = dependency('gtest', main : true, required : true, static: false) subdir('tests') testexe = executable('testexe', include_directories : [lib, utils], From 6c3477d1f3e3d22082b55d27140cbe4a86757fc6 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Sat, 8 Nov 2025 22:26:52 -0800 Subject: [PATCH 09/20] Use meson test --- .github/workflows/c-cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 94c05bd0..793cd86b 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -55,7 +55,7 @@ jobs: run: ninja -C builddir - name: Run tests - run: .\builddir\testexe --gtest_filter="*qsort*" + run: meson test -C builddir --test-args "\-\-gtest_filter=*qsort*" -v SKL-gcc9: From 7d68dfbe26c644c9a0bd5b8952f1669b693d4615 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 4 Feb 2026 16:17:54 +0530 Subject: [PATCH 10/20] Resolve pointers when built with MSVC --- lib/x86simdsort.cpp | 71 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 4abfa469..b3e24685 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -59,6 +59,75 @@ namespace x86simdsort { #define CAT_(a, b) a##b #define CAT(a, b) CAT_(a, b) +#ifdef _MSC_VER +#define DECLARE_INTERNAL_qsort(TYPE) \ + static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ + template <> \ + void XSS_EXPORT_SYMBOL qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + { \ + if (internal_qsort##TYPE == NULL) { \ + CAT(resolve_qsort, TYPE)(); \ + } \ + (*internal_qsort##TYPE)(arr, arrsize, hasnan, descending); \ + } + +#define DECLARE_INTERNAL_qselect(TYPE) \ + static void (*internal_qselect##TYPE)(TYPE *, size_t, size_t, bool, bool) \ + = NULL; \ + template <> \ + void XSS_EXPORT_SYMBOL qselect( \ + TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ + { \ + if (internal_qselect##TYPE == NULL) { \ + CAT(resolve_qselect, TYPE)(); \ + } \ + (*internal_qselect##TYPE)(arr, k, arrsize, hasnan, descending); \ + } + +#define DECLARE_INTERNAL_partial_qsort(TYPE) \ + static void (*internal_partial_qsort##TYPE)( \ + TYPE *, size_t, size_t, bool, bool) \ + = NULL; \ + template <> \ + void XSS_EXPORT_SYMBOL partial_qsort( \ + TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ + { \ + if (internal_partial_qsort##TYPE == NULL) { \ + CAT(resolve_partial_qsort, TYPE)(); \ + } \ + (*internal_partial_qsort##TYPE)(arr, k, arrsize, hasnan, descending); \ + } + +#define DECLARE_INTERNAL_argsort(TYPE) \ + static std::vector (*internal_argsort##TYPE)( \ + TYPE *, size_t, bool, bool) \ + = NULL; \ + template <> \ + std::vector XSS_EXPORT_SYMBOL argsort( \ + TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + { \ + if (internal_argsort##TYPE == NULL) { \ + CAT(resolve_argsort, TYPE)(); \ + } \ + return (*internal_argsort##TYPE)(arr, arrsize, hasnan, descending); \ + } + +#define DECLARE_INTERNAL_argselect(TYPE) \ + static std::vector (*internal_argselect##TYPE)( \ + TYPE *, size_t, size_t, bool) \ + = NULL; \ + template <> \ + std::vector XSS_EXPORT_SYMBOL argselect( \ + TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ + { \ + if (internal_argselect##TYPE == NULL) { \ + CAT(resolve_argselect, TYPE)(); \ + } \ + return (*internal_argselect##TYPE)(arr, k, arrsize, hasnan); \ + } + +#else + #define DECLARE_INTERNAL_qsort(TYPE) \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ template <> \ @@ -110,6 +179,8 @@ namespace x86simdsort { return (*internal_argselect##TYPE)(arr, k, arrsize, hasnan); \ } +#endif // _MSC_VER + /* simple constexpr function as a way around having #ifdef __FLT16_MAX__ block * within the DISPATCH macro */ template From b78e3867d4f9bcb8cfb6c231d895600b4423c7b6 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 4 Feb 2026 16:27:59 +0530 Subject: [PATCH 11/20] Remove unnecessary tab --- .github/workflows/c-cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 793cd86b..2c97e0cd 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,4 +1,4 @@ - name: Build and run tests +name: Build and run tests on: push: From ccacfc57ba965123a8fcb93aa3cdb1a3d9a37e9d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 4 Feb 2026 21:26:25 +0530 Subject: [PATCH 12/20] Declare/define resolve function prior --- lib/x86simdsort.cpp | 77 +++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index b3e24685..32925364 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -59,6 +59,45 @@ namespace x86simdsort { #define CAT_(a, b) a##b #define CAT(a, b) CAT_(a, b) +/* runtime dispatch mechanism */ +#define DISPATCH(func, TYPE, ISA) \ + DECLARE_INTERNAL_##func(TYPE) static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ + CAT(resolve_, func), TYPE)(void) \ + { \ + CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ + xss_cpu_init(); \ + std::string_view preferred_cpu = find_preferred_cpu(ISA); \ + if constexpr (dispatch_requested("avx512", ISA)) { \ + if (preferred_cpu.find("avx512") != std::string_view::npos) { \ + if constexpr (IS_TYPE_FLOAT16()) { \ + if (preferred_cpu.find("avx512_spr") \ + != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) \ + = &xss::fp16_spr::func; \ + return; \ + } \ + if (preferred_cpu.find("avx512_icl") \ + != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) \ + = &xss::fp16_icl::func; \ + return; \ + } \ + } \ + else { \ + CAT(CAT(internal_, func), TYPE) \ + = &xss::avx512::func; \ + } \ + return; \ + } \ + } \ + if constexpr (dispatch_requested("avx2", ISA)) { \ + if (preferred_cpu.find("avx2") != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) = &xss::avx2::func; \ + return; \ + } \ + } \ + } + #ifdef _MSC_VER #define DECLARE_INTERNAL_qsort(TYPE) \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ @@ -192,44 +231,6 @@ constexpr bool IS_TYPE_FLOAT16() return false; } -/* runtime dispatch mechanism */ -#define DISPATCH(func, TYPE, ISA) \ - DECLARE_INTERNAL_##func(TYPE) static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ - CAT(resolve_, func), TYPE)(void) \ - { \ - CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ - xss_cpu_init(); \ - std::string_view preferred_cpu = find_preferred_cpu(ISA); \ - if constexpr (dispatch_requested("avx512", ISA)) { \ - if (preferred_cpu.find("avx512") != std::string_view::npos) { \ - if constexpr (IS_TYPE_FLOAT16()) { \ - if (preferred_cpu.find("avx512_spr") \ - != std::string_view::npos) { \ - CAT(CAT(internal_, func), TYPE) \ - = &xss::fp16_spr::func; \ - return; \ - } \ - if (preferred_cpu.find("avx512_icl") \ - != std::string_view::npos) { \ - CAT(CAT(internal_, func), TYPE) \ - = &xss::fp16_icl::func; \ - return; \ - } \ - } \ - else { \ - CAT(CAT(internal_, func), TYPE) \ - = &xss::avx512::func; \ - } \ - return; \ - } \ - } \ - if constexpr (dispatch_requested("avx2", ISA)) { \ - if (preferred_cpu.find("avx2") != std::string_view::npos) { \ - CAT(CAT(internal_, func), TYPE) = &xss::avx2::func; \ - return; \ - } \ - } \ - } #define ISA_LIST(...) \ std::initializer_list \ From 1ba4ff8e484ba3dcba7bd40d3001a7a6114c8d69 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 10 Feb 2026 11:05:32 +0530 Subject: [PATCH 13/20] temp --- lib/x86simdsort.cpp | 108 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 86 insertions(+), 22 deletions(-) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 32925364..db9ba1b4 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -100,6 +100,7 @@ namespace x86simdsort { #ifdef _MSC_VER #define DECLARE_INTERNAL_qsort(TYPE) \ + static void CAT(resolve_qsort, TYPE)(void); \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ template <> \ void XSS_EXPORT_SYMBOL qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ @@ -111,6 +112,7 @@ namespace x86simdsort { } #define DECLARE_INTERNAL_qselect(TYPE) \ + static void CAT(resolve_qselect, TYPE)(void); \ static void (*internal_qselect##TYPE)(TYPE *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ @@ -124,6 +126,7 @@ namespace x86simdsort { } #define DECLARE_INTERNAL_partial_qsort(TYPE) \ + static void CAT(resolve_partial_qsort, TYPE)(void); \ static void (*internal_partial_qsort##TYPE)( \ TYPE *, size_t, size_t, bool, bool) \ = NULL; \ @@ -138,6 +141,7 @@ namespace x86simdsort { } #define DECLARE_INTERNAL_argsort(TYPE) \ + static void CAT(resolve_argsort, TYPE)(void); \ static std::vector (*internal_argsort##TYPE)( \ TYPE *, size_t, bool, bool) \ = NULL; \ @@ -152,6 +156,7 @@ namespace x86simdsort { } #define DECLARE_INTERNAL_argselect(TYPE) \ + static void CAT(resolve_argselect, TYPE)(void); \ static std::vector (*internal_argselect##TYPE)( \ TYPE *, size_t, size_t, bool) \ = NULL; \ @@ -278,7 +283,36 @@ DISPATCH_ALL(argselect, (ISA_LIST("avx512_skx", "avx2"))) /* Key-Value methods */ + +#define DISPATCH_KV_FUNC(func, TYPE1, TYPE2, ISA) \ + static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ + CAT(CAT(CAT(resolve_, func), _), TYPE1), TYPE2)(void) \ + { \ + CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ + = &xss::scalar::func; \ + xss_cpu_init(); \ + std::string_view preferred_cpu = find_preferred_cpu(ISA); \ + if constexpr (dispatch_requested("avx512", ISA)) { \ + if (preferred_cpu.find("avx512") != std::string_view::npos) { \ + CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ + = &xss::avx512::func; \ + return; \ + } \ + } \ + if constexpr (dispatch_requested("avx2", ISA)) { \ + if (preferred_cpu.find("avx2") != std::string_view::npos) { \ + CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ + = &xss::avx2::func; \ + return; \ + } \ + } \ + } + +#ifdef _MSC_VER #define DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ + static void CAT(CAT(resolve_keyvalue_select_, TYPE1), TYPE2)(void); \ + static void CAT(CAT(resolve_keyvalue_partial_sort_, TYPE1), TYPE2)(void); \ + static void CAT(CAT(resolve_keyvalue_qsort_, TYPE1), TYPE2)(void); \ static void(CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ TYPE1 *, TYPE2 *, size_t, bool, bool) \ = NULL; \ @@ -295,6 +329,9 @@ DISPATCH_ALL(argselect, bool hasnan, \ bool descending) \ { \ + if ((CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2)) == NULL) { \ + CAT(CAT(resolve_keyvalue_qsort_, TYPE1), TYPE2)(); \ + } \ (CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ key, val, arrsize, hasnan, descending); \ } \ @@ -306,6 +343,9 @@ DISPATCH_ALL(argselect, bool hasnan, \ bool descending) \ { \ + if ((CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2)) == NULL) { \ + CAT(CAT(resolve_keyvalue_select_, TYPE1), TYPE2)(); \ + } \ (CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ key, val, k, arrsize, hasnan, descending); \ } \ @@ -317,33 +357,57 @@ DISPATCH_ALL(argselect, bool hasnan, \ bool descending) \ { \ + if ((CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2)) == NULL) { \ + CAT(CAT(resolve_keyvalue_partial_sort_, TYPE1), TYPE2)(); \ + } \ (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ key, val, k, arrsize, hasnan, descending); \ } - -#define DISPATCH_KV_FUNC(func, TYPE1, TYPE2, ISA) \ - static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ - CAT(CAT(CAT(resolve_, func), _), TYPE1), TYPE2)(void) \ +#else +#define DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ + static void(CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ + TYPE1 *, TYPE2 *, size_t, bool, bool) \ + = NULL; \ + static void(CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ + TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ + = NULL; \ + static void(CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ + TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ + = NULL; \ + template <> \ + void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ + TYPE2 *val, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ - CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ - = &xss::scalar::func; \ - xss_cpu_init(); \ - std::string_view preferred_cpu = find_preferred_cpu(ISA); \ - if constexpr (dispatch_requested("avx512", ISA)) { \ - if (preferred_cpu.find("avx512") != std::string_view::npos) { \ - CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ - = &xss::avx512::func; \ - return; \ - } \ - } \ - if constexpr (dispatch_requested("avx2", ISA)) { \ - if (preferred_cpu.find("avx2") != std::string_view::npos) { \ - CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ - = &xss::avx2::func; \ - return; \ - } \ - } \ + (CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ + key, val, arrsize, hasnan, descending); \ + } \ + template <> \ + void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ + { \ + (CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ + key, val, k, arrsize, hasnan, descending); \ + } \ + template <> \ + void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ + { \ + (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ + key, val, k, arrsize, hasnan, descending); \ } +#endif // _MSC_VER + #define DISPATCH_KEYVALUE_SORT(TYPE1, TYPE2, ISA) \ DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ From 8bc432cb71ecb03a11439ca76feeed2062c5d015 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Sun, 15 Feb 2026 11:59:43 +0530 Subject: [PATCH 14/20] use lazy initialization pointer everywhere --- lib/x86simdsort.cpp | 47 --------------------------------------------- 1 file changed, 47 deletions(-) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index db9ba1b4..4b34b229 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -98,7 +98,6 @@ namespace x86simdsort { } \ } -#ifdef _MSC_VER #define DECLARE_INTERNAL_qsort(TYPE) \ static void CAT(resolve_qsort, TYPE)(void); \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ @@ -308,7 +307,6 @@ DISPATCH_ALL(argselect, } \ } -#ifdef _MSC_VER #define DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ static void CAT(CAT(resolve_keyvalue_select_, TYPE1), TYPE2)(void); \ static void CAT(CAT(resolve_keyvalue_partial_sort_, TYPE1), TYPE2)(void); \ @@ -363,51 +361,6 @@ DISPATCH_ALL(argselect, (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ key, val, k, arrsize, hasnan, descending); \ } -#else -#define DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ - static void(CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ - TYPE1 *, TYPE2 *, size_t, bool, bool) \ - = NULL; \ - static void(CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ - TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ - = NULL; \ - static void(CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ - TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ - = NULL; \ - template <> \ - void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ - TYPE2 *val, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ - { \ - (CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ - key, val, arrsize, hasnan, descending); \ - } \ - template <> \ - void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ - TYPE2 *val, \ - size_t k, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ - { \ - (CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ - key, val, k, arrsize, hasnan, descending); \ - } \ - template <> \ - void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ - TYPE2 *val, \ - size_t k, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ - { \ - (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ - key, val, k, arrsize, hasnan, descending); \ - } -#endif // _MSC_VER - #define DISPATCH_KEYVALUE_SORT(TYPE1, TYPE2, ISA) \ DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ From c7a113c5c719ba18340c3225a406def36d7cf991 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Sun, 15 Feb 2026 12:02:23 +0530 Subject: [PATCH 15/20] Fix Formating --- lib/x86simdsort-icl.cpp | 1 - lib/x86simdsort.cpp | 47 ++++++++++++++++++----------------------- lib/x86simdsortcpuid.h | 38 ++++++++++++++++----------------- 3 files changed, 39 insertions(+), 47 deletions(-) diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp index 96456aba..3e5c4b5b 100644 --- a/lib/x86simdsort-icl.cpp +++ b/lib/x86simdsort-icl.cpp @@ -5,7 +5,6 @@ #include "avx512-16bit-qsort.hpp" #endif - namespace xss { namespace avx512 { template <> diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 4b34b229..6964009d 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -102,11 +102,10 @@ namespace x86simdsort { static void CAT(resolve_qsort, TYPE)(void); \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ template <> \ - void XSS_EXPORT_SYMBOL qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + void XSS_EXPORT_SYMBOL qsort( \ + TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - if (internal_qsort##TYPE == NULL) { \ - CAT(resolve_qsort, TYPE)(); \ - } \ + if (internal_qsort##TYPE == NULL) { CAT(resolve_qsort, TYPE)(); } \ (*internal_qsort##TYPE)(arr, arrsize, hasnan, descending); \ } @@ -118,9 +117,7 @@ namespace x86simdsort { void XSS_EXPORT_SYMBOL qselect( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ - if (internal_qselect##TYPE == NULL) { \ - CAT(resolve_qselect, TYPE)(); \ - } \ + if (internal_qselect##TYPE == NULL) { CAT(resolve_qselect, TYPE)(); } \ (*internal_qselect##TYPE)(arr, k, arrsize, hasnan, descending); \ } @@ -148,9 +145,7 @@ namespace x86simdsort { std::vector XSS_EXPORT_SYMBOL argsort( \ TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - if (internal_argsort##TYPE == NULL) { \ - CAT(resolve_argsort, TYPE)(); \ - } \ + if (internal_argsort##TYPE == NULL) { CAT(resolve_argsort, TYPE)(); } \ return (*internal_argsort##TYPE)(arr, arrsize, hasnan, descending); \ } @@ -235,7 +230,6 @@ constexpr bool IS_TYPE_FLOAT16() return false; } - #define ISA_LIST(...) \ std::initializer_list \ { \ @@ -322,10 +316,10 @@ DISPATCH_ALL(argselect, = NULL; \ template <> \ void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ - TYPE2 *val, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ + TYPE2 *val, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ if ((CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2)) == NULL) { \ CAT(CAT(resolve_keyvalue_qsort_, TYPE1), TYPE2)(); \ @@ -335,11 +329,11 @@ DISPATCH_ALL(argselect, } \ template <> \ void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ - TYPE2 *val, \ - size_t k, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ if ((CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2)) == NULL) { \ CAT(CAT(resolve_keyvalue_select_, TYPE1), TYPE2)(); \ @@ -349,13 +343,14 @@ DISPATCH_ALL(argselect, } \ template <> \ void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ - TYPE2 *val, \ - size_t k, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ - if ((CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2)) == NULL) { \ + if ((CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2)) \ + == NULL) { \ CAT(CAT(resolve_keyvalue_partial_sort_, TYPE1), TYPE2)(); \ } \ (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ diff --git a/lib/x86simdsortcpuid.h b/lib/x86simdsortcpuid.h index 6da177d2..f90079d8 100644 --- a/lib/x86simdsortcpuid.h +++ b/lib/x86simdsortcpuid.h @@ -14,9 +14,8 @@ static bool os_supports_avx() __cpuid(cpuInfo, 1); bool osxsaveSupported = (cpuInfo[2] & (1 << 27)) != 0; // OSXSAVE bit - bool avxSupported = (cpuInfo[2] & (1 << 28)) != 0; // AVX bit - if (!(avxSupported && osxsaveSupported)) - return false; + bool avxSupported = (cpuInfo[2] & (1 << 28)) != 0; // AVX bit + if (!(avxSupported && osxsaveSupported)) return false; // Check XCR0[2:1] (XMM and YMM state) unsigned long long xcr0 = _xgetbv(0); @@ -25,8 +24,7 @@ static bool os_supports_avx() static bool os_supports_avx512() { - if (!os_supports_avx()) - return false; + if (!os_supports_avx()) return false; // Need XCR0[7:5] = opmask/ZMM/YMM state enabled unsigned long long xcr0 = _xgetbv(0); @@ -40,34 +38,34 @@ void xss_cpu_init() int maxLeaf = cpuInfo[0]; bool hasAVX2 = false; - bool hasAVX512F = false, hasAVX512DQ = false, hasAVX512BW = false, hasAVX512VL = false; + bool hasAVX512F = false, hasAVX512DQ = false, hasAVX512BW = false, + hasAVX512VL = false; bool hasAVX512VBMI2 = false, hasAVX512FP16 = false; - if (maxLeaf >= 7) - { + if (maxLeaf >= 7) { __cpuidex(cpuInfo, 7, 0); // EBX bits - hasAVX2 = os_supports_avx() && (cpuInfo[1] & (1 << 5)); - hasAVX512F = os_supports_avx512() && (cpuInfo[1] & (1 << 16)); - hasAVX512DQ = os_supports_avx512() && (cpuInfo[1] & (1 << 17)); - hasAVX512BW = os_supports_avx512() && (cpuInfo[1] & (1 << 30)); - hasAVX512VL = os_supports_avx512() && (cpuInfo[1] & (1 << 31)); + hasAVX2 = os_supports_avx() && (cpuInfo[1] & (1 << 5)); + hasAVX512F = os_supports_avx512() && (cpuInfo[1] & (1 << 16)); + hasAVX512DQ = os_supports_avx512() && (cpuInfo[1] & (1 << 17)); + hasAVX512BW = os_supports_avx512() && (cpuInfo[1] & (1 << 30)); + hasAVX512VL = os_supports_avx512() && (cpuInfo[1] & (1 << 31)); // ECX bits hasAVX512VBMI2 = os_supports_avx512() && (cpuInfo[2] & (1 << 6)); // EDX bits - hasAVX512FP16 = os_supports_avx512() && (cpuInfo[3] & (1 << 23)); + hasAVX512FP16 = os_supports_avx512() && (cpuInfo[3] & (1 << 23)); } - xss_cpu_features["avx2"] = hasAVX2; - xss_cpu_features["avx512f"] = hasAVX512F; - xss_cpu_features["avx512dq"] = hasAVX512DQ; - xss_cpu_features["avx512bw"] = hasAVX512BW; - xss_cpu_features["avx512vl"] = hasAVX512VL; + xss_cpu_features["avx2"] = hasAVX2; + xss_cpu_features["avx512f"] = hasAVX512F; + xss_cpu_features["avx512dq"] = hasAVX512DQ; + xss_cpu_features["avx512bw"] = hasAVX512BW; + xss_cpu_features["avx512vl"] = hasAVX512VL; xss_cpu_features["avx512vbmi2"] = hasAVX512VBMI2; - xss_cpu_features["avx512fp16"] = hasAVX512FP16; + xss_cpu_features["avx512fp16"] = hasAVX512FP16; } inline bool xss_cpu_supports(const char *feature) From 158787491de3c923710b6015070b574dc5bf784c Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 16 Feb 2026 10:56:44 +0530 Subject: [PATCH 16/20] Set timeout to 0 for x86 simd sort tests --- meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 703ec385..093b6027 100644 --- a/meson.build +++ b/meson.build @@ -100,7 +100,7 @@ if get_option('build_tests') dependencies : [gtest_dep, x86simdsortcpp_dep], link_whole : [libtests], ) - test('x86 simd sort tests', testexe) + test('x86 simd sort tests', testexe, timeout : 0) endif # Build benchmarking suite if option build_benchmarks is set to true From 32dd776e6c04dacea2bdfda76e3762760a3957a6 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Sat, 7 Mar 2026 15:34:53 +0000 Subject: [PATCH 17/20] Revert "use lazy initialization pointer everywhere" This reverts commit 946ac037217eac59c545bda4ddc12ae169c53a6b. --- lib/x86simdsort.cpp | 47 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 6964009d..ef891f84 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -98,6 +98,7 @@ namespace x86simdsort { } \ } +#ifdef _MSC_VER #define DECLARE_INTERNAL_qsort(TYPE) \ static void CAT(resolve_qsort, TYPE)(void); \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ @@ -301,6 +302,7 @@ DISPATCH_ALL(argselect, } \ } +#ifdef _MSC_VER #define DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ static void CAT(CAT(resolve_keyvalue_select_, TYPE1), TYPE2)(void); \ static void CAT(CAT(resolve_keyvalue_partial_sort_, TYPE1), TYPE2)(void); \ @@ -356,6 +358,51 @@ DISPATCH_ALL(argselect, (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ key, val, k, arrsize, hasnan, descending); \ } +#else +#define DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ + static void(CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ + TYPE1 *, TYPE2 *, size_t, bool, bool) \ + = NULL; \ + static void(CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ + TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ + = NULL; \ + static void(CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ + TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ + = NULL; \ + template <> \ + void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ + TYPE2 *val, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ + { \ + (CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ + key, val, arrsize, hasnan, descending); \ + } \ + template <> \ + void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ + { \ + (CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ + key, val, k, arrsize, hasnan, descending); \ + } \ + template <> \ + void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ + { \ + (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ + key, val, k, arrsize, hasnan, descending); \ + } +#endif // _MSC_VER + #define DISPATCH_KEYVALUE_SORT(TYPE1, TYPE2, ISA) \ DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ From 5ff22064199b15ce34248e23844a5b2a984826ac Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Sat, 7 Mar 2026 15:37:22 +0000 Subject: [PATCH 18/20] formatting --- lib/x86simdsort.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index ef891f84..eda55742 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -170,7 +170,8 @@ namespace x86simdsort { #define DECLARE_INTERNAL_qsort(TYPE) \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ template <> \ - void XSS_EXPORT_SYMBOL qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + void XSS_EXPORT_SYMBOL qsort( \ + TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_qsort##TYPE)(arr, arrsize, hasnan, descending); \ } @@ -371,39 +372,38 @@ DISPATCH_ALL(argselect, = NULL; \ template <> \ void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ - TYPE2 *val, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ + TYPE2 *val, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ (CAT(CAT(*internal_keyvalue_qsort_, TYPE1), TYPE2))( \ key, val, arrsize, hasnan, descending); \ } \ template <> \ void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ - TYPE2 *val, \ - size_t k, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ (CAT(CAT(*internal_keyvalue_select_, TYPE1), TYPE2))( \ key, val, k, arrsize, hasnan, descending); \ } \ template <> \ void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ - TYPE2 *val, \ - size_t k, \ - size_t arrsize, \ - bool hasnan, \ - bool descending) \ + TYPE2 *val, \ + size_t k, \ + size_t arrsize, \ + bool hasnan, \ + bool descending) \ { \ (CAT(CAT(*internal_keyvalue_partial_sort_, TYPE1), TYPE2))( \ key, val, k, arrsize, hasnan, descending); \ } #endif // _MSC_VER - #define DISPATCH_KEYVALUE_SORT(TYPE1, TYPE2, ISA) \ DECLARE_ALL_KEYVALUE_METHODS(TYPE1, TYPE2) \ DISPATCH_KV_FUNC(keyvalue_qsort, TYPE1, TYPE2, ISA) \ From 99da5666e7e2c67009492ed0c2e7a644e20f429c Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Sat, 7 Mar 2026 15:44:07 +0000 Subject: [PATCH 19/20] Minor chnages --- .github/workflows/c-cpp.yml | 7 ++----- lib/x86simdsortcpuid.h | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 2c97e0cd..eb25cc74 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -217,11 +217,8 @@ jobs: cd builddir ninja - - name: Run test suite on ADL - run: sde -adl -- ./builddir/testexe --gtest_filter="*simdsort*" - - - name: Run test suite on SPR - run: sde -spr -- ./builddir/testexe --gtest_filter="*simdsort*" + - name: Run test suite + run: ./builddir/testexe --gtest_filter="*simdsort*" SKX-SKL-openmp: diff --git a/lib/x86simdsortcpuid.h b/lib/x86simdsortcpuid.h index f90079d8..c2911dbe 100644 --- a/lib/x86simdsortcpuid.h +++ b/lib/x86simdsortcpuid.h @@ -31,7 +31,7 @@ static bool os_supports_avx512() return (xcr0 & 0xE0) == 0xE0; } -void xss_cpu_init() +static void xss_cpu_init() { int cpuInfo[4]; __cpuid(cpuInfo, 0); From 43124bda91ab9f7f6bfa13a38ff4885c4332c732 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Sat, 7 Mar 2026 15:58:42 +0000 Subject: [PATCH 20/20] add missing const for arg functions --- lib/x86simdsort.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index eda55742..776ec56d 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -140,11 +140,11 @@ namespace x86simdsort { #define DECLARE_INTERNAL_argsort(TYPE) \ static void CAT(resolve_argsort, TYPE)(void); \ static std::vector (*internal_argsort##TYPE)( \ - TYPE *, size_t, bool, bool) \ + const TYPE *, size_t, bool, bool) \ = NULL; \ template <> \ std::vector XSS_EXPORT_SYMBOL argsort( \ - TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + const TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ if (internal_argsort##TYPE == NULL) { CAT(resolve_argsort, TYPE)(); } \ return (*internal_argsort##TYPE)(arr, arrsize, hasnan, descending); \ @@ -153,11 +153,11 @@ namespace x86simdsort { #define DECLARE_INTERNAL_argselect(TYPE) \ static void CAT(resolve_argselect, TYPE)(void); \ static std::vector (*internal_argselect##TYPE)( \ - TYPE *, size_t, size_t, bool) \ + const TYPE *, size_t, size_t, bool) \ = NULL; \ template <> \ std::vector XSS_EXPORT_SYMBOL argselect( \ - TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ + const TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ { \ if (internal_argselect##TYPE == NULL) { \ CAT(resolve_argselect, TYPE)(); \ @@ -203,7 +203,7 @@ namespace x86simdsort { = NULL; \ template <> \ std::vector XSS_EXPORT_SYMBOL argsort( \ - TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + const TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ return (*internal_argsort##TYPE)(arr, arrsize, hasnan, descending); \ } @@ -214,7 +214,7 @@ namespace x86simdsort { = NULL; \ template <> \ std::vector XSS_EXPORT_SYMBOL argselect( \ - TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ + const TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ { \ return (*internal_argselect##TYPE)(arr, k, arrsize, hasnan); \ }