diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 017a5a6efb26..51b9fc8b2e08 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -191,6 +191,9 @@ takes precedence over ccache if a storage backend is configured" ON) "SSE4_2" "AVX2" "AVX512" + "SVE128" # fixed size SVE + "SVE256" # " + "SVE512" # " "MAX") define_option(ARROW_ALTIVEC "Build with Altivec if compiler has support" ON) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index c35fc6a6fe73..13a5e72275b2 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -134,7 +134,29 @@ elseif(ARROW_CPU_FLAG STREQUAL "ppc") elseif(ARROW_CPU_FLAG STREQUAL "aarch64") # Arm64 compiler flags, gcc/clang only set(ARROW_ARMV8_MARCH "armv8-a") - check_cxx_compiler_flag("-march=${ARROW_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE) + set(ARROW_SVE_FLAGS "-march=${ARROW_ARMV8_MARCH}+sve") + set(ARROW_SVE128_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=128") + set(ARROW_SVE256_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=256") + set(ARROW_SVE512_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=512") + if(APPLE) + # Clang on MacOS may support SVE but it is not tested anywhere, especially + # in xsimd, therefore there currently are issues. 
+ set(CXX_SUPPORTS_SVE OFF) + else() + check_cxx_compiler_flag("${ARROW_SVE_FLAGS}" CXX_SUPPORTS_SVE) + endif() + if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE128|SVE256|SVE512|MAX)$") + set(ARROW_HAVE_RUNTIME_SVE128 ON) + add_definitions(-DARROW_HAVE_RUNTIME_SVE128) + endif() + if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE256|SVE512|MAX)$") + set(ARROW_HAVE_RUNTIME_SVE256 ON) + add_definitions(-DARROW_HAVE_RUNTIME_SVE256) + endif() + if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE512|MAX)$") + set(ARROW_HAVE_RUNTIME_SVE512 ON) + add_definitions(-DARROW_HAVE_RUNTIME_SVE512) + endif() if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT") set(ARROW_SIMD_LEVEL "NEON") endif() @@ -528,8 +550,6 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64") if(NOT CXX_SUPPORTS_SVE) message(FATAL_ERROR "SVE required but compiler doesn't support it.") endif() - # -march=armv8-a+sve - set(ARROW_ARMV8_MARCH "${ARROW_ARMV8_MARCH}+sve") string(REGEX MATCH "[0-9]+" SVE_VECTOR_BITS ${ARROW_SIMD_LEVEL}) if(SVE_VECTOR_BITS) set(ARROW_HAVE_SVE${SVE_VECTOR_BITS} ON) @@ -541,7 +561,7 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64") add_definitions(-DARROW_HAVE_SVE_SIZELESS) endif() endif() - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=${ARROW_ARMV8_MARCH}") + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_SVE_FLAGS}") elseif(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") message(WARNING "ARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL} not supported by Arm.") endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index eee63b11ca1c..0839cc1fa877 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -343,6 +343,27 @@ macro(append_runtime_avx512_src SRCS SRC) endif() endmacro() +macro(append_runtime_sve128_src SRCS SRC) + if(ARROW_HAVE_RUNTIME_SVE128) + list(APPEND ${SRCS} ${SRC}) + set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS "${ARROW_SVE128_FLAGS}") + endif() +endmacro() + +macro(append_runtime_sve256_src SRCS SRC) + 
if(ARROW_HAVE_RUNTIME_SVE256) + list(APPEND ${SRCS} ${SRC}) + set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS "${ARROW_SVE256_FLAGS}") + endif() +endmacro() + +macro(append_runtime_sve512_src SRCS SRC) + if(ARROW_HAVE_RUNTIME_SVE512) + list(APPEND ${SRCS} ${SRC}) + set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS "${ARROW_SVE512_FLAGS}") + endif() +endmacro() + # Write out compile-time configuration constants string(REPLACE "${CMAKE_SOURCE_DIR}" "" REDACTED_CXX_FLAGS ${CMAKE_CXX_FLAGS}) @@ -498,7 +519,7 @@ set(ARROW_UTIL_SRCS util/bitmap_ops.cc util/bpacking.cc util/bpacking_scalar.cc - util/bpacking_simd_default.cc + util/bpacking_simd_128.cc util/byte_size.cc util/byte_stream_split_internal.cc util/cancel.cc @@ -543,9 +564,12 @@ set(ARROW_UTIL_SRCS append_runtime_avx2_src(ARROW_UTIL_SRCS util/byte_stream_split_internal_avx2.cc) -append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_avx2.cc) +append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc) append_runtime_avx512_src(ARROW_UTIL_SRCS util/bpacking_simd_avx512.cc) +append_runtime_sve128_src(ARROW_UTIL_SRCS util/bpacking_simd_128_alt.cc) +append_runtime_sve256_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc) + if(ARROW_WITH_BROTLI) list(APPEND ARROW_UTIL_SRCS util/compression_brotli.cc) endif() diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index e959a9f9c411..56978a0a48d6 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -33,16 +33,26 @@ struct UnpackDynamicFunction { static constexpr auto implementations() { return std::array{ + // x86 implementations #if defined(ARROW_HAVE_SSE4_2) Implementation{DispatchLevel::NONE, &bpacking::unpack_sse4_2}, -#else - Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar}, -#endif -#if defined(ARROW_HAVE_RUNTIME_AVX2) +# if defined(ARROW_HAVE_RUNTIME_AVX2) Implementation{DispatchLevel::AVX2, &bpacking::unpack_avx2}, -#endif -#if defined(ARROW_HAVE_RUNTIME_AVX512) 
+# endif +# if defined(ARROW_HAVE_RUNTIME_AVX512) Implementation{DispatchLevel::AVX512, &bpacking::unpack_avx512}, +# endif + + // ARM implementations +#elif defined(ARROW_HAVE_NEON) + Implementation{DispatchLevel::NONE, &bpacking::unpack_neon}, +# if defined(ARROW_HAVE_RUNTIME_SVE256) + Implementation{DispatchLevel::SVE256, &bpacking::unpack_sve256}, +# endif + + // Other implementations +#else + Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar}, #endif }; } @@ -52,12 +62,14 @@ struct UnpackDynamicFunction { template void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) { -#if defined(ARROW_HAVE_NEON) - return bpacking::unpack_neon(in, out, opts); -#else - static DynamicDispatch > dispatch; - return dispatch.func(in, out, opts); -#endif + auto constexpr kImplementations = UnpackDynamicFunction::implementations(); + if constexpr (kImplementations.size() == 1) { + constexpr auto func = kImplementations.front().second; + func(in, out, opts); + } else { + static DynamicDispatch > dispatch; + return dispatch.func(in, out, opts); + } } template void unpack(const uint8_t*, bool*, const UnpackOptions&); diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 354bc76ace0c..1e9ad6035dd5 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -26,7 +26,7 @@ #include "arrow/util/bpacking_scalar_internal.h" #include "arrow/util/bpacking_simd_internal.h" -#if defined(ARROW_HAVE_RUNTIME_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE128) # include "arrow/util/cpu_info.h" #endif @@ -107,10 +107,10 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bo // will not emit runs larger than 512 (though other implementation might), so we biased // the benchmarks towards a rather small scale. 
static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2); -constexpr std::initializer_list kBitWidths8 = {1, 2, 8}; -constexpr std::initializer_list kBitWidths16 = {1, 2, 8, 13}; -constexpr std::initializer_list kBitWidths32 = {1, 2, 8, 20}; -constexpr std::initializer_list kBitWidths64 = {1, 2, 8, 20, 47}; +constexpr auto kBitWidths8 = std::initializer_list{1, 2, 8}; +constexpr auto kBitWidths16 = std::initializer_list{1, 2, 8, 13}; +constexpr auto kBitWidths32 = std::initializer_list{1, 2, 8, 20}; +constexpr auto kBitWidths64 = std::initializer_list{1, 2, 8, 20, 47}; static const std::vector> kBitWidthsNumValuesBool = { {0, 1}, @@ -254,6 +254,60 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &bpacking::unpack_neon< ->ArgsProduct(kBitWidthsNumValues64); #endif +#if defined(ARROW_HAVE_RUNTIME_SVE128) +BENCHMARK_CAPTURE(BM_UnpackBool, Sve128Unaligned, false, &bpacking::unpack_sve128, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128), + "Sve128 not available") + ->ArgsProduct(kBitWidthsNumValuesBool); +BENCHMARK_CAPTURE(BM_UnpackUint8, Sve128Unaligned, false, + &bpacking::unpack_sve128, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128), + "Sve128 not available") + ->ArgsProduct(kBitWidthsNumValues8); +BENCHMARK_CAPTURE(BM_UnpackUint16, Sve128Unaligned, false, + &bpacking::unpack_sve128, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128), + "Sve128 not available") + ->ArgsProduct(kBitWidthsNumValues16); +BENCHMARK_CAPTURE(BM_UnpackUint32, Sve128Unaligned, false, + &bpacking::unpack_sve128, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128), + "Sve128 not available") + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint64, Sve128Unaligned, false, + &bpacking::unpack_sve128, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128), + "Sve128 not available") + ->ArgsProduct(kBitWidthsNumValues64); +#endif + +#if defined(ARROW_HAVE_RUNTIME_SVE256) +BENCHMARK_CAPTURE(BM_UnpackBool, Sve256Unaligned, false, 
&bpacking::unpack_sve256, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256), + "Sve256 not available") + ->ArgsProduct(kBitWidthsNumValuesBool); +BENCHMARK_CAPTURE(BM_UnpackUint8, Sve256Unaligned, false, + &bpacking::unpack_sve256, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256), + "Sve256 not available") + ->ArgsProduct(kBitWidthsNumValues8); +BENCHMARK_CAPTURE(BM_UnpackUint16, Sve256Unaligned, false, + &bpacking::unpack_sve256, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256), + "Sve256 not available") + ->ArgsProduct(kBitWidthsNumValues16); +BENCHMARK_CAPTURE(BM_UnpackUint32, Sve256Unaligned, false, + &bpacking::unpack_sve256, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256), + "Sve256 not available") + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint64, Sve256Unaligned, false, + &bpacking::unpack_sve256, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256), + "Sve256 not available") + ->ArgsProduct(kBitWidthsNumValues64); +#endif + BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack) ->ArgsProduct(kBitWidthsNumValuesBool); BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack) diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc b/cpp/src/arrow/util/bpacking_simd_128.cc similarity index 89% rename from cpp/src/arrow/util/bpacking_simd_default.cc rename to cpp/src/arrow/util/bpacking_simd_128.cc index 61adee52a339..1bc756b2aa7d 100644 --- a/cpp/src/arrow/util/bpacking_simd_default.cc +++ b/cpp/src/arrow/util/bpacking_simd_128.cc @@ -17,8 +17,10 @@ #if defined(ARROW_HAVE_NEON) # define UNPACK_PLATFORM unpack_neon +# define KERNEL_PLATFORM KernelNeon #elif defined(ARROW_HAVE_SSE4_2) # define UNPACK_PLATFORM unpack_sse4_2 +# define KERNEL_PLATFORM KernelSse42 #endif #if defined(UNPACK_PLATFORM) @@ -30,11 +32,11 @@ namespace arrow::internal::bpacking { template -using Simd128Kernel = Kernel; +using KERNEL_PLATFORM = Kernel; template void UNPACK_PLATFORM(const uint8_t* in, Uint* out, 
const UnpackOptions& opts) { - return unpack_jump(in, out, opts); + return unpack_jump(in, out, opts); } template void UNPACK_PLATFORM(const uint8_t*, bool*, const UnpackOptions&); diff --git a/cpp/src/arrow/util/bpacking_simd_128_alt.cc b/cpp/src/arrow/util/bpacking_simd_128_alt.cc new file mode 100644 index 000000000000..092d4b478fa4 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_simd_128_alt.cc @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#if defined(ARROW_HAVE_RUNTIME_SVE128) +# define UNPACK_PLATFORM unpack_sve128 +# define KERNEL_PLATFORM KernelSve128 +#endif + +#if defined(UNPACK_PLATFORM) + +# include + +# include "arrow/util/bpacking_dispatch_internal.h" +# include "arrow/util/bpacking_simd_internal.h" +# include "arrow/util/bpacking_simd_kernel_internal.h" + +namespace arrow::internal::bpacking { + +template +using KERNEL_PLATFORM = Kernel; + +template +void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) { + static_assert(std::is_same_v>); + return unpack_jump(in, out, opts); +} + +template void UNPACK_PLATFORM(const uint8_t*, bool*, const UnpackOptions&); +template void UNPACK_PLATFORM(const uint8_t*, uint8_t*, const UnpackOptions&); +template void UNPACK_PLATFORM(const uint8_t*, uint16_t*, const UnpackOptions&); +template void UNPACK_PLATFORM(const uint8_t*, uint32_t*, const UnpackOptions&); +template void UNPACK_PLATFORM(const uint8_t*, uint64_t*, const UnpackOptions&); + +} // namespace arrow::internal::bpacking + +# undef UNPACK_PLATFORM +#endif // UNPACK_PLATFORM diff --git a/cpp/src/arrow/util/bpacking_simd_256.cc b/cpp/src/arrow/util/bpacking_simd_256.cc new file mode 100644 index 000000000000..dc9cc75c7556 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_simd_256.cc @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#if defined(ARROW_HAVE_SVE256) || defined(ARROW_HAVE_RUNTIME_SVE256) +# define UNPACK_PLATFORM unpack_sve256 +# define KERNEL_PLATFORM KernelSve256 +#elif defined(ARROW_HAVE_RUNTIME_AVX2) +# define UNPACK_PLATFORM unpack_avx2 +# define KERNEL_PLATFORM KernelAvx2 +#endif + +#if defined(UNPACK_PLATFORM) + +# include "arrow/util/bpacking_dispatch_internal.h" +# include "arrow/util/bpacking_internal.h" +# include "arrow/util/bpacking_simd_internal.h" +# include "arrow/util/bpacking_simd_kernel_internal.h" + +namespace arrow::internal::bpacking { + +template +using KERNEL_PLATFORM = Kernel; + +template +void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) { + return unpack_jump(in, out, opts); +} + +template void UNPACK_PLATFORM(const uint8_t*, bool*, const UnpackOptions&); +template void UNPACK_PLATFORM(const uint8_t*, uint8_t*, const UnpackOptions&); +template void UNPACK_PLATFORM(const uint8_t*, uint16_t*, const UnpackOptions&); +template void UNPACK_PLATFORM(const uint8_t*, uint32_t*, const UnpackOptions&); +template void UNPACK_PLATFORM(const uint8_t*, uint64_t*, const UnpackOptions&); + +} // namespace arrow::internal::bpacking + +# undef UNPACK_PLATFORM +#endif // UNPACK_PLATFORM diff --git a/cpp/src/arrow/util/bpacking_simd_avx2.cc b/cpp/src/arrow/util/bpacking_simd_avx2.cc deleted file mode 100644 index de1f228aec20..000000000000 --- a/cpp/src/arrow/util/bpacking_simd_avx2.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/util/bpacking_dispatch_internal.h" -#include "arrow/util/bpacking_internal.h" -#include "arrow/util/bpacking_simd_internal.h" -#include "arrow/util/bpacking_simd_kernel_internal.h" - -namespace arrow::internal::bpacking { - -template -using Simd256Kernel = Kernel; - -template -void unpack_avx2(const uint8_t* in, Uint* out, const UnpackOptions& opts) { - return unpack_jump(in, out, opts); -} - -template void unpack_avx2(const uint8_t*, bool*, const UnpackOptions&); -template void unpack_avx2(const uint8_t*, uint8_t*, const UnpackOptions&); -template void unpack_avx2(const uint8_t*, uint16_t*, const UnpackOptions&); -template void unpack_avx2(const uint8_t*, uint32_t*, const UnpackOptions&); -template void unpack_avx2(const uint8_t*, uint64_t*, const UnpackOptions&); - -} // namespace arrow::internal::bpacking diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h b/cpp/src/arrow/util/bpacking_simd_internal.h index 44ad4b0f8638..d5a81baaec09 100644 --- a/cpp/src/arrow/util/bpacking_simd_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_internal.h @@ -25,68 +25,90 @@ namespace arrow::internal::bpacking { #if defined(ARROW_HAVE_NEON) +# define UNPACK_ARCH128 unpack_neon +#elif defined(ARROW_HAVE_SSE4_2) +# define UNPACK_ARCH128 unpack_sse4_2 +#endif + +#if 
defined(UNPACK_ARCH128) template -ARROW_EXPORT void unpack_neon(const uint8_t* in, Uint* out, const UnpackOptions& opts); +ARROW_EXPORT void UNPACK_ARCH128(const uint8_t* in, Uint* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_neon( // +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128( // const uint8_t* in, bool* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_neon( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128( const uint8_t* in, uint8_t* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_neon( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128( const uint8_t* in, uint16_t* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_neon( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128( const uint8_t* in, uint32_t* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_neon( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128( const uint8_t* in, uint64_t* out, const UnpackOptions& opts); -#elif defined(ARROW_HAVE_SSE4_2) +#endif // UNPACK_ARCH128 +#undef UNPACK_ARCH128 + +#if defined(ARROW_HAVE_RUNTIME_SVE128) +# define UNPACK_ARCH128_ALT unpack_sve128 +#endif + +#if defined(UNPACK_ARCH128_ALT) template -ARROW_EXPORT void unpack_sse4_2(const uint8_t* in, Uint* out, const UnpackOptions& opts); +ARROW_EXPORT void UNPACK_ARCH128_ALT(const uint8_t* in, Uint* out, + const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2( // +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT( // const uint8_t* in, bool* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT( const uint8_t* in, uint8_t* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2( +extern template ARROW_TEMPLATE_EXPORT void 
UNPACK_ARCH128_ALT( const uint8_t* in, uint16_t* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT( const uint8_t* in, uint32_t* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT( const uint8_t* in, uint64_t* out, const UnpackOptions& opts); +#endif // UNPACK_ARCH128_ALT +#undef UNPACK_ARCH128_ALT + +#if defined(ARROW_HAVE_SVE256) || defined(ARROW_HAVE_RUNTIME_SVE256) +# define UNPACK_ARCH256 unpack_sve256 +#elif defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2) +# define UNPACK_ARCH256 unpack_avx2 #endif -#if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2) +#if defined(UNPACK_ARCH256) template -ARROW_EXPORT void unpack_avx2(const uint8_t* in, Uint* out, const UnpackOptions& opts); +ARROW_EXPORT void UNPACK_ARCH256(const uint8_t* in, Uint* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_avx2( // +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256( // const uint8_t* in, bool* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_avx2( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256( const uint8_t* in, uint8_t* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_avx2( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256( const uint8_t* in, uint16_t* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_avx2( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256( const uint8_t* in, uint32_t* out, const UnpackOptions& opts); -extern template ARROW_TEMPLATE_EXPORT void unpack_avx2( +extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256( const uint8_t* in, uint64_t* out, const UnpackOptions& opts); -#endif +#endif // UNPACK_ARCH256 +#undef UNPACK_ARCH256 #if defined(ARROW_HAVE_AVX512)
|| defined(ARROW_HAVE_RUNTIME_AVX512) diff --git a/cpp/src/arrow/util/bpacking_simd_kernel_internal.h b/cpp/src/arrow/util/bpacking_simd_kernel_internal.h index 318f348b4a7b..9506c30b58fd 100644 --- a/cpp/src/arrow/util/bpacking_simd_kernel_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_kernel_internal.h @@ -382,10 +382,18 @@ struct KernelShape { }; /// Packing all useful and derived information about a kernel in a single type. -template +template struct KernelTraits { + using unpacked_type = UnpackedUint; + /// The integer type to work with, `unpacked_type` or an appropriate type for bool. + using uint_type = std::conditional_t, + SizedUint, unpacked_type>; + using arch_type = Arch; + using simd_batch = xsimd::batch; + using simd_bytes = xsimd::batch; + static constexpr KernelShape kShape = { - .simd_bit_size_ = kSimdBitSize, + .simd_bit_size_ = 8 * simd_bytes ::size, .unpacked_bit_size_ = 8 * sizeof(UnpackedUint), .packed_bit_size_ = kPackedBitSize, }; @@ -393,20 +401,12 @@ struct KernelTraits { static_assert(kShape.simd_bit_size() % kShape.unpacked_bit_size() == 0); static_assert(0 < kShape.packed_bit_size()); static_assert(kShape.packed_bit_size() < kShape.simd_bit_size()); - - using unpacked_type = UnpackedUint; - /// The integer type to work with, `unpacked_type` or an appropriate type for bool. - using uint_type = std::conditional_t, - SizedUint, unpacked_type>; - using simd_batch = xsimd::make_sized_batch_t; - using simd_bytes = xsimd::make_sized_batch_t; - using arch_type = typename simd_batch::arch_type; }; /// Return similar kernel traits but with a different integer unpacking type. template using KernelTraitsWithUnpackUint = KernelTraits; + typename KerTraits::arch_type>; /****************** * MediumKernel * @@ -1131,8 +1131,7 @@ template using KernelDispatch = decltype(KernelDispatchImpl()); /// The public kernel exposed for any size. 
-template -struct Kernel : KernelDispatch> { -}; +template +struct Kernel : KernelDispatch> {}; } // namespace arrow::internal::bpacking diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 9072c0b8d1ae..ee134a9cbfdf 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -27,7 +27,7 @@ #include "arrow/util/bpacking_scalar_internal.h" #include "arrow/util/bpacking_simd_internal.h" -#if defined(ARROW_HAVE_RUNTIME_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE128) # include "arrow/util/cpu_info.h" #endif @@ -349,6 +349,72 @@ TEST_P(TestUnpack, Unpack32Neon) { this->TestAll(&bpacking::unpack_neonTestAll(&bpacking::unpack_neon); } #endif +#if defined(ARROW_HAVE_RUNTIME_SVE128) +TEST_P(TestUnpack, UnpackBoolSve128) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128)) { + GTEST_SKIP() << "Test requires SVE128"; + } + this->TestAll(&bpacking::unpack_sve128); +} +TEST_P(TestUnpack, Unpack8Sve128) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128)) { + GTEST_SKIP() << "Test requires SVE128"; + } + this->TestAll(&bpacking::unpack_sve128); +} +TEST_P(TestUnpack, Unpack16Sve128) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128)) { + GTEST_SKIP() << "Test requires SVE128"; + } + this->TestAll(&bpacking::unpack_sve128); +} +TEST_P(TestUnpack, Unpack32Sve128) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128)) { + GTEST_SKIP() << "Test requires SVE128"; + } + this->TestAll(&bpacking::unpack_sve128); +} +TEST_P(TestUnpack, Unpack64Sve128) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128)) { + GTEST_SKIP() << "Test requires SVE128"; + } + this->TestAll(&bpacking::unpack_sve128); +} +#endif + +#if defined(ARROW_HAVE_RUNTIME_SVE256) +TEST_P(TestUnpack, UnpackBoolSve256) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) { + GTEST_SKIP() << "Test requires SVE256"; + } + this->TestAll(&bpacking::unpack_sve256); 
+} +TEST_P(TestUnpack, Unpack8Sve256) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) { + GTEST_SKIP() << "Test requires SVE256"; + } + this->TestAll(&bpacking::unpack_sve256); +} +TEST_P(TestUnpack, Unpack16Sve256) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) { + GTEST_SKIP() << "Test requires SVE256"; + } + this->TestAll(&bpacking::unpack_sve256); +} +TEST_P(TestUnpack, Unpack32Sve256) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) { + GTEST_SKIP() << "Test requires SVE256"; + } + this->TestAll(&bpacking::unpack_sve256); +} +TEST_P(TestUnpack, Unpack64Sve256) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) { + GTEST_SKIP() << "Test requires SVE256"; + } + this->TestAll(&bpacking::unpack_sve256); +} +#endif + TEST_P(TestUnpack, UnpackBool) { this->TestAll(&unpack); } TEST_P(TestUnpack, Unpack8) { this->TestAll(&unpack); } TEST_P(TestUnpack, Unpack16) { this->TestAll(&unpack); } diff --git a/cpp/src/arrow/util/cpu_info.cc b/cpp/src/arrow/util/cpu_info.cc index e24a3bbfe299..2cc1ac802bd8 100644 --- a/cpp/src/arrow/util/cpu_info.cc +++ b/cpp/src/arrow/util/cpu_info.cc @@ -297,6 +297,12 @@ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, #else //------------------------------ LINUX ------------------------------// +# if defined(CPUINFO_ARCH_ARM) +# include +# include +# include +# endif + // Get cache size, return 0 on error int64_t LinuxGetCacheSize(int level) { // get cache size by sysconf() @@ -413,8 +419,30 @@ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, } } } + +# if defined(CPUINFO_ARCH_ARM) + // Detect SVE and vector length via getauxval/prctl (more reliable than /proc/cpuinfo) +# ifdef HWCAP_SVE + const auto hwcap = getauxval(AT_HWCAP); + if (hwcap & HWCAP_SVE) { + *hardware_flags |= CpuInfo::SVE; +# ifdef PR_SVE_GET_VL + const int vl = prctl(PR_SVE_GET_VL); + assert(vl >= 0); + // prctl returns vector length in bytes; mask off status flags + 
const int vl_bytes = vl & PR_SVE_VL_LEN_MASK; + // Running SVE128 on a SVE256 machine is more tricky than the x86 equivalent of + // running SSE code on an AVX machine and requires to explicitly change the + // vector length using `prctl` (per thread setting). + if (vl_bytes == 16) *hardware_flags |= CpuInfo::SVE128; // 128 bits + if (vl_bytes == 32) *hardware_flags |= CpuInfo::SVE256; // 256 bits + if (vl_bytes == 64) *hardware_flags |= CpuInfo::SVE512; // 512 bits +# endif // PR_SVE_GET_VL + } +# endif // HWCAP_SVE +# endif // CPUINFO_ARCH_ARM } -#endif // WINDOWS, MACOS, LINUX +#endif // WINDOWS, MACOS, LINUX //============================== Arch Dependent ==============================// @@ -473,11 +501,37 @@ void ArchVerifyCpuRequirements(const CpuInfo* ci) { #elif defined(CPUINFO_ARCH_ARM) //------------------------------ AARCH64 ------------------------------// bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { - if (simd_level == "NONE") { - *hardware_flags &= ~CpuInfo::ASIMD; - return true; + enum { + USER_SIMD_NONE, + USER_SIMD_SVE, + USER_SIMD_SVE128, + USER_SIMD_SVE256, + USER_SIMD_SVE512, + USER_SIMD_MAX, + }; + + int level = USER_SIMD_MAX; + if (simd_level == "SVE") { + level = USER_SIMD_SVE; + } else if (simd_level == "SVE128") { + level = USER_SIMD_SVE128; + } else if (simd_level == "SVE256") { + level = USER_SIMD_SVE256; + } else if (simd_level == "SVE512") { + level = USER_SIMD_SVE512; + } else if (simd_level == "NONE") { + level = USER_SIMD_NONE; + } else { + return false; } - return false; + + if (level < USER_SIMD_SVE512) *hardware_flags &= ~CpuInfo::SVE512; + if (level < USER_SIMD_SVE256) *hardware_flags &= ~CpuInfo::SVE256; + if (level < USER_SIMD_SVE128) *hardware_flags &= ~CpuInfo::SVE128; + if (level < USER_SIMD_SVE) *hardware_flags &= ~CpuInfo::SVE; + // "NONE" must keep disabling ASIMD (NEON) as well, as the code replaced here did. + if (level == USER_SIMD_NONE) *hardware_flags &= ~CpuInfo::ASIMD; + return true; } void ArchVerifyCpuRequirements(const CpuInfo* ci) { diff --git a/cpp/src/arrow/util/cpu_info.h b/cpp/src/arrow/util/cpu_info.h index
949719b97ed8..de0ef13cc598 100644 --- a/cpp/src/arrow/util/cpu_info.h +++ b/cpp/src/arrow/util/cpu_info.h @@ -56,6 +56,10 @@ class ARROW_EXPORT CpuInfo { /// Arm features static constexpr int64_t ASIMD = (1LL << 32); + static constexpr int64_t SVE = (1LL << 33); + static constexpr int64_t SVE128 = (1LL << 36); + static constexpr int64_t SVE256 = (1LL << 34); + static constexpr int64_t SVE512 = (1LL << 35); /// Cache enums for L1 (data), L2 and L3 enum class CacheLevel { L1 = 0, L2, L3, Last = L3 }; diff --git a/cpp/src/arrow/util/dispatch_internal.h b/cpp/src/arrow/util/dispatch_internal.h index 7ac19b0b2443..75bde89e3a7d 100644 --- a/cpp/src/arrow/util/dispatch_internal.h +++ b/cpp/src/arrow/util/dispatch_internal.h @@ -23,8 +23,7 @@ #include "arrow/status.h" #include "arrow/util/cpu_info.h" -namespace arrow { -namespace internal { +namespace arrow::internal { enum class DispatchLevel : int { // These dispatch levels, corresponding to instruction set features, @@ -34,6 +33,9 @@ enum class DispatchLevel : int { AVX2, AVX512, NEON, + SVE128, + SVE256, + SVE512, MAX }; @@ -106,11 +108,18 @@ class DynamicDispatch { return cpu_info->IsSupported(CpuInfo::AVX2); case DispatchLevel::AVX512: return cpu_info->IsSupported(CpuInfo::AVX512); + case DispatchLevel::NEON: + return cpu_info->IsSupported(CpuInfo::ASIMD); + case DispatchLevel::SVE128: + return cpu_info->IsSupported(CpuInfo::SVE128); + case DispatchLevel::SVE256: + return cpu_info->IsSupported(CpuInfo::SVE256); + case DispatchLevel::SVE512: + return cpu_info->IsSupported(CpuInfo::SVE512); default: return false; } } }; -} // namespace internal -} // namespace arrow +} // namespace arrow::internal