Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cpp/cmake_modules/DefineOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ takes precedence over ccache if a storage backend is configured" ON)
"SSE4_2"
"AVX2"
"AVX512"
"SVE128" # fixed size SVE
"SVE256" # "
"SVE512" # "
"MAX")

define_option(ARROW_ALTIVEC "Build with Altivec if compiler has support" ON)
Expand Down
28 changes: 24 additions & 4 deletions cpp/cmake_modules/SetupCxxFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,29 @@ elseif(ARROW_CPU_FLAG STREQUAL "ppc")
elseif(ARROW_CPU_FLAG STREQUAL "aarch64")
# Arm64 compiler flags, gcc/clang only
set(ARROW_ARMV8_MARCH "armv8-a")
check_cxx_compiler_flag("-march=${ARROW_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE)
set(ARROW_SVE_FLAGS "-march=${ARROW_ARMV8_MARCH}+sve")
set(ARROW_SVE128_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=128")
set(ARROW_SVE256_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=256")
set(ARROW_SVE512_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=512")
if(APPLE)
# Clang on MacOS may support SVE but it is not tested anywhere, especially
# in xsimd, therefore there currently are issues.
set(CXX_SUPPORTS_SVE OFF)
else()
check_cxx_compiler_flag("${ARROW_SVE_FLAGS}" CXX_SUPPORTS_SVE)
endif()
if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE128|SVE256|SVE512|MAX)$")
set(ARROW_HAVE_RUNTIME_SVE128 ON)
add_definitions(-DARROW_HAVE_RUNTIME_SVE128)
endif()
if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE256|SVE512|MAX)$")
set(ARROW_HAVE_RUNTIME_SVE256 ON)
add_definitions(-DARROW_HAVE_RUNTIME_SVE256)
endif()
if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE512|MAX)$")
set(ARROW_HAVE_RUNTIME_SVE512 ON)
add_definitions(-DARROW_HAVE_RUNTIME_SVE512)
endif()
if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT")
set(ARROW_SIMD_LEVEL "NEON")
endif()
Expand Down Expand Up @@ -528,8 +550,6 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64")
if(NOT CXX_SUPPORTS_SVE)
message(FATAL_ERROR "SVE required but compiler doesn't support it.")
endif()
# -march=armv8-a+sve
set(ARROW_ARMV8_MARCH "${ARROW_ARMV8_MARCH}+sve")
string(REGEX MATCH "[0-9]+" SVE_VECTOR_BITS ${ARROW_SIMD_LEVEL})
if(SVE_VECTOR_BITS)
set(ARROW_HAVE_SVE${SVE_VECTOR_BITS} ON)
Expand All @@ -541,7 +561,7 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64")
add_definitions(-DARROW_HAVE_SVE_SIZELESS)
endif()
endif()
# Choose the baseline -march applied to every translation unit.
# Only an SVE baseline (ARROW_SIMD_LEVEL=SVE, SVE128, SVE256, ...) may turn on
# "+sve" globally. A NEON baseline has to stay on plain ${ARROW_ARMV8_MARCH}:
# otherwise the compiler may emit SVE instructions anywhere, including on
# platforms where CXX_SUPPORTS_SVE was forced OFF, and the per-file
# ARROW_SVE*_FLAGS used for runtime dispatch would no longer be the only
# place SVE code generation is enabled.
if(ARROW_SIMD_LEVEL MATCHES "^SVE")
  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_SVE_FLAGS}")
else()
  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=${ARROW_ARMV8_MARCH}")
endif()
elseif(NOT ARROW_SIMD_LEVEL STREQUAL "NONE")
message(WARNING "ARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL} not supported by Arm.")
endif()
Expand Down
28 changes: 26 additions & 2 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,27 @@ macro(append_runtime_avx512_src SRCS SRC)
endif()
endmacro()

# append_runtime_sve128_src(<list-var> <source>)
#
# When ARROW_HAVE_RUNTIME_SVE128 is on, compile <source> with
# ARROW_SVE128_FLAGS and add it to the list named by <list-var>.
# When it is off, the call does nothing.
# This stays a macro so list(APPEND) modifies the caller's variable directly.
macro(append_runtime_sve128_src SRCS SRC)
  if(ARROW_HAVE_RUNTIME_SVE128)
    # Per-file options: only this source is built for 128-bit SVE.
    set_property(SOURCE ${SRC} PROPERTY COMPILE_OPTIONS "${ARROW_SVE128_FLAGS}")
    list(APPEND ${SRCS} ${SRC})
  endif()
endmacro()

# append_runtime_sve256_src(<list-var> <source>)
#
# When ARROW_HAVE_RUNTIME_SVE256 is on, compile <source> with
# ARROW_SVE256_FLAGS and add it to the list named by <list-var>.
# When it is off, the call does nothing.
# This stays a macro so list(APPEND) modifies the caller's variable directly.
macro(append_runtime_sve256_src SRCS SRC)
  if(ARROW_HAVE_RUNTIME_SVE256)
    # Per-file options: only this source is built for 256-bit SVE.
    set_property(SOURCE ${SRC} PROPERTY COMPILE_OPTIONS "${ARROW_SVE256_FLAGS}")
    list(APPEND ${SRCS} ${SRC})
  endif()
endmacro()

# append_runtime_sve512_src(<list-var> <source>)
#
# When ARROW_HAVE_RUNTIME_SVE512 is on, compile <source> with
# ARROW_SVE512_FLAGS and add it to the list named by <list-var>.
# When it is off, the call does nothing.
# This stays a macro so list(APPEND) modifies the caller's variable directly.
macro(append_runtime_sve512_src SRCS SRC)
  if(ARROW_HAVE_RUNTIME_SVE512)
    # Per-file options: only this source is built for 512-bit SVE.
    set_property(SOURCE ${SRC} PROPERTY COMPILE_OPTIONS "${ARROW_SVE512_FLAGS}")
    list(APPEND ${SRCS} ${SRC})
  endif()
endmacro()

# Write out compile-time configuration constants
string(REPLACE "${CMAKE_SOURCE_DIR}" "<CMAKE_SOURCE_DIR>" REDACTED_CXX_FLAGS
${CMAKE_CXX_FLAGS})
Expand Down Expand Up @@ -498,7 +519,7 @@ set(ARROW_UTIL_SRCS
util/bitmap_ops.cc
util/bpacking.cc
util/bpacking_scalar.cc
util/bpacking_simd_default.cc
util/bpacking_simd_128.cc
util/byte_size.cc
util/byte_stream_split_internal.cc
util/cancel.cc
Expand Down Expand Up @@ -543,9 +564,12 @@ set(ARROW_UTIL_SRCS

append_runtime_avx2_src(ARROW_UTIL_SRCS util/byte_stream_split_internal_avx2.cc)

append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_avx2.cc)
append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)
append_runtime_avx512_src(ARROW_UTIL_SRCS util/bpacking_simd_avx512.cc)

append_runtime_sve128_src(ARROW_UTIL_SRCS util/bpacking_simd_128_alt.cc)
append_runtime_sve256_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)

if(ARROW_WITH_BROTLI)
list(APPEND ARROW_UTIL_SRCS util/compression_brotli.cc)
endif()
Expand Down
36 changes: 24 additions & 12 deletions cpp/src/arrow/util/bpacking.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,26 @@ struct UnpackDynamicFunction {

/// List the unpack kernels compiled into this build, as (dispatch level, function) entries.
///
/// Exactly one entry is registered at DispatchLevel::NONE: the best kernel the
/// compile-time baseline allows (SSE4.2, NEON, or the scalar fallback). Each
/// runtime-dispatched kernel is then added under its own ARROW_HAVE_RUNTIME_*
/// guard, independently of which baseline was picked. A build with a scalar
/// baseline but runtime AVX2/AVX512/SVE256 enabled therefore still exposes those
/// kernels.
// NOTE(review): assumes unpack_avx2 / unpack_avx512 / unpack_sve256 are declared
// whenever the matching ARROW_HAVE_RUNTIME_* macro is defined -- confirm in
// bpacking_simd_internal.h.
static constexpr auto implementations() {
  return std::array{
  // Baseline implementation: always present and listed first.
#if defined(ARROW_HAVE_SSE4_2)
      Implementation{DispatchLevel::NONE, &bpacking::unpack_sse4_2<Uint>},
#elif defined(ARROW_HAVE_NEON)
      Implementation{DispatchLevel::NONE, &bpacking::unpack_neon<Uint>},
#else
      Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
#endif

  // x86 runtime-dispatched implementations
#if defined(ARROW_HAVE_RUNTIME_AVX2)
      Implementation{DispatchLevel::AVX2, &bpacking::unpack_avx2<Uint>},
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX512)
      Implementation{DispatchLevel::AVX512, &bpacking::unpack_avx512<Uint>},
#endif

  // Arm runtime-dispatched implementations
#if defined(ARROW_HAVE_RUNTIME_SVE256)
      Implementation{DispatchLevel::SVE256, &bpacking::unpack_sve256<Uint>},
#endif
  };
}
Expand All @@ -52,12 +62,14 @@ struct UnpackDynamicFunction {

/// Forward an unpack request to the best kernel available for `Uint`.
///
/// The candidate kernels come from UnpackDynamicFunction<Uint>::implementations().
/// When that list holds a single entry there is nothing to choose at runtime, so
/// the kernel is called directly. Otherwise a function-local static
/// DynamicDispatch object picks the kernel.
///
/// \param in   packed input bytes, passed through unchanged to the kernel
/// \param out  destination for unpacked values, passed through unchanged
/// \param opts unpack options, passed through unchanged
template <typename Uint>
void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
  constexpr auto kImplementations = UnpackDynamicFunction<Uint>::implementations();
  if constexpr (kImplementations.size() == 1) {
    // Single candidate: skip the dispatch object and call the kernel directly.
    constexpr auto func = kImplementations.front().second;
    return func(in, out, opts);
  } else {
    static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
    return dispatch.func(in, out, opts);
  }
}

template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&);
Expand Down
64 changes: 59 additions & 5 deletions cpp/src/arrow/util/bpacking_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "arrow/util/bpacking_scalar_internal.h"
#include "arrow/util/bpacking_simd_internal.h"

#if defined(ARROW_HAVE_RUNTIME_AVX2)
#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE128)
# include "arrow/util/cpu_info.h"
#endif

Expand Down Expand Up @@ -107,10 +107,10 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
// will not emit runs larger than 512 (though other implementation might), so we biased
// the benchmarks towards a rather small scale.
static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2);
constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8};
constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};
constexpr auto kBitWidths8 = std::initializer_list<int64_t>{1, 2, 8};
constexpr auto kBitWidths16 = std::initializer_list<int64_t>{1, 2, 8, 13};
constexpr auto kBitWidths32 = std::initializer_list<int64_t>{1, 2, 8, 20};
constexpr auto kBitWidths64 = std::initializer_list<int64_t>{1, 2, 8, 20, 47};

static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
{0, 1},
Expand Down Expand Up @@ -254,6 +254,60 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &bpacking::unpack_neon<
->ArgsProduct(kBitWidthsNumValues64);
#endif

// Benchmarks for the SVE128 unpack kernel, one registration per unpacked type
// (bool, uint8_t, uint16_t, uint32_t, uint64_t). Compiled only when the build
// defines ARROW_HAVE_RUNTIME_SVE128. Every registration passes `false` for the
// `aligned` argument, followed by the negated CpuInfo SVE128 support check and a
// message string.
// NOTE(review): the last two arguments look like a "skip" flag and skip reason
// consumed by the BM_Unpack* helpers -- confirm against their definitions.
#if defined(ARROW_HAVE_RUNTIME_SVE128)
BENCHMARK_CAPTURE(BM_UnpackBool, Sve128Unaligned, false, &bpacking::unpack_sve128<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128),
"Sve128 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Sve128Unaligned, false,
&bpacking::unpack_sve128<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128),
"Sve128 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Sve128Unaligned, false,
&bpacking::unpack_sve128<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128),
"Sve128 not available")
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Sve128Unaligned, false,
&bpacking::unpack_sve128<uint32_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128),
"Sve128 not available")
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Sve128Unaligned, false,
&bpacking::unpack_sve128<uint64_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128),
"Sve128 not available")
->ArgsProduct(kBitWidthsNumValues64);
#endif

// Benchmarks for the SVE256 unpack kernel, one registration per unpacked type
// (bool, uint8_t, uint16_t, uint32_t, uint64_t). Compiled only when the build
// defines ARROW_HAVE_RUNTIME_SVE256. Every registration passes `false` for the
// `aligned` argument, followed by the negated CpuInfo SVE256 support check and a
// message string.
// NOTE(review): the last two arguments look like a "skip" flag and skip reason
// consumed by the BM_Unpack* helpers -- confirm against their definitions.
#if defined(ARROW_HAVE_RUNTIME_SVE256)
BENCHMARK_CAPTURE(BM_UnpackBool, Sve256Unaligned, false, &bpacking::unpack_sve256<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
"Sve256 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Sve256Unaligned, false,
&bpacking::unpack_sve256<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
"Sve256 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Sve256Unaligned, false,
&bpacking::unpack_sve256<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
"Sve256 not available")
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Sve256Unaligned, false,
&bpacking::unpack_sve256<uint32_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
"Sve256 not available")
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Sve256Unaligned, false,
&bpacking::unpack_sve256<uint64_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
"Sve256 not available")
->ArgsProduct(kBitWidthsNumValues64);
#endif

BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@

#if defined(ARROW_HAVE_NEON)
# define UNPACK_PLATFORM unpack_neon
# define KERNEL_PLATFORM KernelNeon
#elif defined(ARROW_HAVE_SSE4_2)
# define UNPACK_PLATFORM unpack_sse4_2
# define KERNEL_PLATFORM KernelSse42
#endif

#if defined(UNPACK_PLATFORM)
Expand All @@ -30,11 +32,11 @@
namespace arrow::internal::bpacking {

template <typename UnpackedUint, int kPackedBitSize>
using Simd128Kernel = Kernel<UnpackedUint, kPackedBitSize, 128>;
using KERNEL_PLATFORM = Kernel<UnpackedUint, kPackedBitSize, xsimd::default_arch>;

template <typename Uint>
void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
return unpack_jump<Simd128Kernel>(in, out, opts);
return unpack_jump<KERNEL_PLATFORM>(in, out, opts);
}

template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const UnpackOptions&);
Expand Down
51 changes: 51 additions & 0 deletions cpp/src/arrow/util/bpacking_simd_128_alt.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Runtime-dispatched 128-bit SVE unpack kernel.
//
// The build compiles this translation unit with the fixed-size 128-bit SVE
// flags (see append_runtime_sve128_src), so xsimd::default_arch resolves to the
// 128-bit SVE architecture here. When ARROW_HAVE_RUNTIME_SVE128 is not defined
// the file is empty.
#if defined(ARROW_HAVE_RUNTIME_SVE128)
# define UNPACK_PLATFORM unpack_sve128
# define KERNEL_PLATFORM KernelSve128
#endif

#if defined(UNPACK_PLATFORM)

# include <cstdint>
# include <type_traits>

# include <xsimd/xsimd.hpp>

# include "arrow/util/bpacking_dispatch_internal.h"
# include "arrow/util/bpacking_simd_internal.h"
# include "arrow/util/bpacking_simd_kernel_internal.h"

namespace arrow::internal::bpacking {

// Kernel selected for this file: whatever architecture xsimd picks as default
// under the compile flags applied to this translation unit.
template <typename UnpackedUint, int kPackedBitSize>
using KERNEL_PLATFORM = Kernel<UnpackedUint, kPackedBitSize, xsimd::default_arch>;

// Unpack entry point for the SVE128 dispatch level; forwards to unpack_jump
// instantiated with the kernel alias above.
template <typename Uint>
void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
  // Fail the build if this file is compiled without the 128-bit SVE flags:
  // otherwise a different kernel would be built under the name unpack_sve128.
  static_assert(std::is_same_v<xsimd::default_arch, xsimd::detail::sve<128>>,
                "bpacking_simd_128_alt.cc must be compiled for 128-bit SVE");
  return unpack_jump<KERNEL_PLATFORM>(in, out, opts);
}

template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const UnpackOptions&);
template void UNPACK_PLATFORM<uint8_t>(const uint8_t*, uint8_t*, const UnpackOptions&);
template void UNPACK_PLATFORM<uint16_t>(const uint8_t*, uint16_t*, const UnpackOptions&);
template void UNPACK_PLATFORM<uint32_t>(const uint8_t*, uint32_t*, const UnpackOptions&);
template void UNPACK_PLATFORM<uint64_t>(const uint8_t*, uint64_t*, const UnpackOptions&);

}  // namespace arrow::internal::bpacking

// Undefine both helper macros so neither leaks past this file (e.g. when
// sources are concatenated in a unity build).
# undef KERNEL_PLATFORM
# undef UNPACK_PLATFORM
#endif  // UNPACK_PLATFORM
52 changes: 52 additions & 0 deletions cpp/src/arrow/util/bpacking_simd_256.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// 256-bit SIMD unpack kernel, shared between Arm SVE256 and x86 AVX2.
//
// The function and kernel names are chosen from the ARROW_HAVE_* macros below
// (SVE256 takes precedence over AVX2). The SIMD code itself is whatever
// xsimd::default_arch resolves to under the compile flags applied to this file.
// When none of the macros is defined the file is empty.
// NOTE(review): nothing here verifies that default_arch is actually 256 bits
// wide; the name/arch match relies on the per-file flags set by the build.
#if defined(ARROW_HAVE_SVE256) || defined(ARROW_HAVE_RUNTIME_SVE256)
# define UNPACK_PLATFORM unpack_sve256
# define KERNEL_PLATFORM KernelSve256
#elif defined(ARROW_HAVE_RUNTIME_AVX2)
# define UNPACK_PLATFORM unpack_avx2
# define KERNEL_PLATFORM KernelAvx2
#endif

#if defined(UNPACK_PLATFORM)

# include <cstdint>

# include <xsimd/xsimd.hpp>

# include "arrow/util/bpacking_dispatch_internal.h"
# include "arrow/util/bpacking_internal.h"
# include "arrow/util/bpacking_simd_internal.h"
# include "arrow/util/bpacking_simd_kernel_internal.h"

namespace arrow::internal::bpacking {

// Kernel selected for this file: built on xsimd's default architecture for the
// flags this translation unit is compiled with.
template <typename UnpackedUint, int kPackedBitSize>
using KERNEL_PLATFORM = Kernel<UnpackedUint, kPackedBitSize, xsimd::default_arch>;

// Unpack entry point (unpack_sve256 or unpack_avx2); forwards to unpack_jump
// instantiated with the kernel alias above.
template <typename Uint>
void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
  return unpack_jump<KERNEL_PLATFORM>(in, out, opts);
}

template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const UnpackOptions&);
template void UNPACK_PLATFORM<uint8_t>(const uint8_t*, uint8_t*, const UnpackOptions&);
template void UNPACK_PLATFORM<uint16_t>(const uint8_t*, uint16_t*, const UnpackOptions&);
template void UNPACK_PLATFORM<uint32_t>(const uint8_t*, uint32_t*, const UnpackOptions&);
template void UNPACK_PLATFORM<uint64_t>(const uint8_t*, uint64_t*, const UnpackOptions&);

}  // namespace arrow::internal::bpacking

// Undefine both helper macros so neither leaks past this file (e.g. when
// sources are concatenated in a unity build).
# undef KERNEL_PLATFORM
# undef UNPACK_PLATFORM
#endif  // UNPACK_PLATFORM
Loading
Loading