You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
12204 lines
504 KiB
12204 lines
504 KiB
diff -up chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time
|
|
diff -up chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10.12 chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10
|
|
diff -up chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6.12 chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt
|
|
--- chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 2021-06-02 10:56:05.305904746 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -19,7 +19,7 @@ if(POLICY CMP0083)
|
|
cmake_policy(SET CMP0083 NEW)
|
|
endif()
|
|
|
|
-project(hwy VERSION 0.1)
|
|
+project(hwy VERSION 0.12.2) # Keep in sync with highway.h version
|
|
|
|
set(CMAKE_CXX_STANDARD 11)
|
|
set(CMAKE_CXX_EXTENSIONS OFF)
|
|
@@ -40,6 +40,8 @@ if (NOT CMAKE_BUILD_TYPE)
|
|
set(CMAKE_BUILD_TYPE RelWithDebInfo)
|
|
endif()
|
|
|
|
+set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?")
|
|
+
|
|
include(CheckCXXSourceCompiles)
|
|
check_cxx_source_compiles(
|
|
"int main() {
|
|
@@ -51,10 +53,13 @@ check_cxx_source_compiles(
|
|
HWY_EMSCRIPTEN
|
|
)
|
|
|
|
+set(HWY_CONTRIB_SOURCES
|
|
+ hwy/contrib/image/image.cc
|
|
+ hwy/contrib/image/image.h
|
|
+ hwy/contrib/math/math-inl.h
|
|
+)
|
|
+
|
|
set(HWY_SOURCES
|
|
- contrib/image/image.cc
|
|
- contrib/image/image.h
|
|
- contrib/math/math-inl.h
|
|
hwy/aligned_allocator.cc
|
|
hwy/aligned_allocator.h
|
|
hwy/base.h
|
|
@@ -64,6 +69,7 @@ set(HWY_SOURCES
|
|
hwy/nanobenchmark.cc
|
|
hwy/nanobenchmark.h
|
|
hwy/ops/arm_neon-inl.h
|
|
+ hwy/ops/arm_sve-inl.h
|
|
hwy/ops/scalar-inl.h
|
|
hwy/ops/set_macros-inl.h
|
|
hwy/ops/shared-inl.h
|
|
@@ -146,13 +152,28 @@ else()
|
|
-fno-exceptions
|
|
)
|
|
endif()
|
|
-endif()
|
|
+
|
|
+ if (HWY_CMAKE_ARM7)
|
|
+ list(APPEND HWY_FLAGS
|
|
+ -march=armv7-a
|
|
+ -mfpu=neon-vfpv4
|
|
+ -mfloat-abi=hard # must match the toolchain specified as CXX=
|
|
+ -mfp16-format=ieee # required for vcvt_f32_f16
|
|
+ )
|
|
+ endif() # HWY_CMAKE_ARM7
|
|
+
|
|
+endif() # !MSVC
|
|
|
|
add_library(hwy STATIC ${HWY_SOURCES})
|
|
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
|
|
set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
|
|
target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
|
|
|
|
+add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES})
|
|
+target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
|
|
+set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
|
|
+target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
|
|
+
|
|
# -------------------------------------------------------- install library
|
|
install(TARGETS hwy
|
|
DESTINATION "${CMAKE_INSTALL_LIBDIR}")
|
|
@@ -166,9 +187,21 @@ foreach (source ${HWY_SOURCES})
|
|
endif()
|
|
endforeach()
|
|
|
|
-# Add a pkg-config file for libhwy and the test library.
|
|
+install(TARGETS hwy_contrib
|
|
+ DESTINATION "${CMAKE_INSTALL_LIBDIR}")
|
|
+# Install all the headers keeping the relative path to the current directory
|
|
+# when installing them.
|
|
+foreach (source ${HWY_CONTRIB_SOURCES})
|
|
+ if ("${source}" MATCHES "\.h$")
|
|
+ get_filename_component(dirname "${source}" DIRECTORY)
|
|
+ install(FILES "${source}"
|
|
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
|
|
+ endif()
|
|
+endforeach()
|
|
+
|
|
+# Add a pkg-config file for libhwy and the contrib/test libraries.
|
|
set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
|
|
-foreach (pc libhwy.pc libhwy-test.pc)
|
|
+foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
|
|
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
|
|
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
|
|
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
|
|
@@ -193,34 +226,13 @@ add_custom_command(TARGET hwy POST_BUILD
|
|
# Avoids mismatch between GTest's static CRT and our dynamic.
|
|
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
|
|
|
|
-add_executable(skeleton hwy/examples/skeleton_main.cc)
|
|
-target_sources(skeleton PRIVATE
|
|
- hwy/examples/skeleton-inl.h
|
|
- hwy/examples/skeleton.cc
|
|
- hwy/examples/skeleton.h
|
|
- hwy/examples/skeleton_shared.h)
|
|
-# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
|
|
-# observe the difference in targets printed.
|
|
-target_compile_options(skeleton PRIVATE ${HWY_FLAGS})
|
|
-target_link_libraries(skeleton hwy)
|
|
-set_target_properties(skeleton
|
|
- PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
|
|
-
|
|
-# Similar: shared headers but without the runtime dispatch in skeleton.cc/h
|
|
-add_executable(skeleton_static hwy/examples/skeleton_static_main.cc)
|
|
-target_sources(skeleton_static PRIVATE
|
|
- hwy/examples/skeleton-inl.h
|
|
- hwy/examples/skeleton_shared.h)
|
|
-target_compile_options(skeleton_static PRIVATE ${HWY_FLAGS})
|
|
-target_link_libraries(skeleton_static hwy)
|
|
-set_target_properties(skeleton_static
|
|
- PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
|
|
-
|
|
# Programming exercise with integrated benchmark
|
|
add_executable(hwy_benchmark hwy/examples/benchmark.cc)
|
|
target_sources(hwy_benchmark PRIVATE
|
|
hwy/nanobenchmark.cc
|
|
hwy/nanobenchmark.h)
|
|
+# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
|
|
+# observe the difference in targets printed.
|
|
target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
|
|
target_link_libraries(hwy_benchmark hwy)
|
|
set_target_properties(hwy_benchmark
|
|
@@ -272,19 +284,21 @@ endif()
|
|
endif() # HWY_SYSTEM_GTEST
|
|
|
|
set(HWY_TEST_FILES
|
|
- contrib/image/image_test.cc
|
|
- # contrib/math/math_test.cc
|
|
+ hwy/contrib/image/image_test.cc
|
|
+ # hwy/contrib/math/math_test.cc
|
|
+ hwy/aligned_allocator_test.cc
|
|
+ hwy/base_test.cc
|
|
+ hwy/highway_test.cc
|
|
+ hwy/targets_test.cc
|
|
hwy/examples/skeleton_test.cc
|
|
hwy/tests/arithmetic_test.cc
|
|
hwy/tests/combine_test.cc
|
|
hwy/tests/compare_test.cc
|
|
hwy/tests/convert_test.cc
|
|
- hwy/tests/hwy_test.cc
|
|
hwy/tests/logical_test.cc
|
|
hwy/tests/memory_test.cc
|
|
hwy/tests/swizzle_test.cc
|
|
- hwy/aligned_allocator_test.cc
|
|
- hwy/targets_test.cc
|
|
+ hwy/tests/test_util_test.cc
|
|
)
|
|
|
|
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
|
|
@@ -293,11 +307,16 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILE
|
|
get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
|
|
add_executable(${TESTNAME} ${TESTFILE})
|
|
target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})
|
|
+ # Test all targets, not just the best/baseline. This changes the default
|
|
+ # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can
|
|
+ # cause compile errors because only one may be set, and other CMakeLists.txt
|
|
+ # that include us may set them.
|
|
+ target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
|
|
|
|
if(HWY_SYSTEM_GTEST)
|
|
- target_link_libraries(${TESTNAME} hwy GTest::GTest GTest::Main)
|
|
+ target_link_libraries(${TESTNAME} hwy hwy_contrib GTest::GTest GTest::Main)
|
|
else()
|
|
- target_link_libraries(${TESTNAME} hwy gtest gtest_main)
|
|
+ target_link_libraries(${TESTNAME} hwy hwy_contrib gtest gtest_main)
|
|
endif()
|
|
# Output test targets in the test directory.
|
|
set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelog
|
|
--- chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 2021-06-02 10:56:05.151903967 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/debian/changelog 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -1,3 +1,26 @@
|
|
+highway (0.12.2-1) UNRELEASED; urgency=medium
|
|
+
|
|
+ * fix scalar-only test and Windows macro conflict with Load/StoreFence
|
|
+ * replace deprecated wasm intrinsics
|
|
+
|
|
+ -- Jan Wassenberg <janwas@google.com> Mon, 31 May 2021 16:00:00 +0200
|
|
+
|
|
+highway (0.12.1-1) UNRELEASED; urgency=medium
|
|
+
|
|
+ * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors
|
|
+ * fix warnings, faster ARM div/sqrt, separate hwy_contrib library
|
|
+ * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC
|
|
+
|
|
+ -- Jan Wassenberg <janwas@google.com> Wed, 19 May 2021 15:00:00 +0200
|
|
+
|
|
+highway (0.12.0-1) UNRELEASED; urgency=medium
|
|
+
|
|
+ * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
|
|
+ * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
|
|
+ * Proper IEEE rounding, reduce libstdc++ usage, inlined math
|
|
+
|
|
+ -- Jan Wassenberg <janwas@google.com> Thu, 15 Apr 2021 20:00:00 +0200
|
|
+
|
|
highway (0.11.1-1) UNRELEASED; urgency=medium
|
|
|
|
* Fix clang7 asan error, finish f16 conversions and add test
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelogE.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelogE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compat.12 chromium-91.0.4472.77/third_party/highway/src/debian/compat
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/compatE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/control.12 chromium-91.0.4472.77/third_party/highway/src/debian/control
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/controlE.12 chromium-91.0.4472.77/third_party/highway/src/debian/controlE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyright.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyright
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rules.12 chromium-91.0.4472.77/third_party/highway/src/debian/rules
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rulesE.12 chromium-91.0.4472.77/third_party/highway/src/debian/rulesE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/format.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/format
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf
|
|
Binary files chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 and chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf differ
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md
|
|
--- chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12 2021-06-02 10:56:05.117903795 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -33,6 +33,12 @@ The public headers are:
|
|
* hwy/cache_control.h: defines stand-alone functions to control caching (e.g.
|
|
prefetching) and memory barriers, independent of actual SIMD.
|
|
|
|
+* hwy/nanobenchmark.h: library for precisely measuring elapsed time (under
|
|
+ varying inputs) for benchmarking small/medium regions of code.
|
|
+
|
|
+* hwy/tests/test_util-inl.h: defines macros for invoking tests on all
|
|
+ available targets, plus per-target functions useful in tests (e.g. Print).
|
|
+
|
|
SIMD implementations must be preceded and followed by the following:
|
|
|
|
```
|
|
@@ -61,76 +67,76 @@ HWY_AFTER_NAMESPACE();
|
|
|
|
## Vector and descriptor types
|
|
|
|
-Highway vectors consist of one or more 'lanes' of the same built-in type `T =
|
|
-uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `T = float##_t` for `## = 16,
|
|
-32, 64`. `float16_t` is an IEEE binary16 half-float and only supports load,
|
|
-store, and conversion to/from `float32_t`; infinity or NaN have
|
|
-implementation-defined results.
|
|
-
|
|
-Each vector has `N` lanes (a power of two, possibly unknown at compile time).
|
|
-
|
|
-Platforms such as x86 support multiple vector types, and other platforms require
|
|
-that vectors are built-in types. On RVV, vectors are sizeless and thus cannot be
|
|
-wrapped inside a class. The Highway API satisfies these constraints because it
|
|
-is designed around overloaded functions selected via a zero-sized tag parameter
|
|
-`d` of type `D = Simd<T, N>`. These are typically constructed using aliases:
|
|
-
|
|
-* `const HWY_FULL(T[, LMUL=1]) d;` chooses an `N` that results in a native
|
|
- vector for the current target. For targets (e.g. RVV) that support register
|
|
- groups, the optional `LMUL` (1, 2, 4, 8) specifies the number of registers
|
|
- in the group. This effectively multiplies the lane count in each operation
|
|
- by `LMUL`. For mixed-precision code, `LMUL` must be at least the ratio of
|
|
- the sizes of the largest and smallest type. `LMUL > 1` is more efficient on
|
|
- single-issue machines, but larger values reduce the effective number of
|
|
- registers, which may cause the compiler to spill them to memory.
|
|
+Highway vectors consist of one or more 'lanes' of the same built-in type
|
|
+`uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `float##_t` for `## = 16, 32,
|
|
+64`.
|
|
+
|
|
+In Highway, `float16_t` (an IEEE binary16 half-float) only supports load, store,
|
|
+and conversion to/from `float32_t`; the behavior of `float16_t` infinity and NaN
|
|
+is implementation-defined due to ARMv7.
|
|
+
|
|
+On RVV, vectors are sizeless and cannot be wrapped inside a class. The Highway
|
|
+API allows using built-in types as vectors because operations are expressed as
|
|
+overloaded functions. Instead of constructors, overloaded initialization
|
|
+functions such as `Set` take a zero-sized tag argument called `d` of type `D =
|
|
+Simd<T, N>` and return an actual vector of unspecified type.
|
|
+
|
|
+`T` is one of the lane types above, and may be retrieved via `TFromD<D>`.
|
|
+
|
|
+`N` is target-dependent and not directly user-specified. The actual lane count
|
|
+may not be known at compile time, but can be obtained via `Lanes(d)`. Use this
|
|
+value, which is potentially different from `N`, to increment loop counters etc.
|
|
+It is typically a power of two, but that is not guaranteed e.g. on SVE.
|
|
+
|
|
+`d` lvalues (a tag, NOT actual vector) are typically obtained using two aliases:
|
|
+
|
|
+* Most common: pass `HWY_FULL(T[, LMUL=1]) d;` as an argument to return a
|
|
+ native vector. This is preferred because it fully utilizes vector lanes.
|
|
+
|
|
+ For targets (e.g. RVV) that support register groups, the optional `LMUL` (1,
|
|
+ 2, 4, 8) specifies the number of registers in the group. This effectively
|
|
+ multiplies the lane count in each operation by `LMUL`. For mixed-precision
|
|
+ code, `LMUL` must be at least the ratio of the sizes of the largest and
|
|
+ smallest type. `LMUL > 1` is more efficient on single-issue machines, but
|
|
+ larger values reduce the effective number of registers, which may cause the
|
|
+ compiler to spill them to memory.
|
|
+
|
|
+* Less common: pass `HWY_CAPPED(T, N) d;` as an argument to return a vector
|
|
+ which may be native width, but no more than `N` lanes have observable
|
|
+ effects such as loading/storing to memory. This is less performance-portable
|
|
+ because it may not use all available lanes. Note that the resulting lane
|
|
+ count may also be less than `N`.
|
|
+
|
|
+ For targets (e.g. RVV) that have compile-time-unknown lane counts, such
|
|
+ vectors incur additional runtime cost in `Load` etc.
|
|
+
|
|
+User-specified lane counts or tuples of vectors could cause spills on targets
|
|
+with fewer or smaller vectors. By contrast, Highway encourages vector-length
|
|
+agnostic code, which is more performance-portable.
|
|
+
|
|
+Given that lane counts are potentially compile-time-unknown, storage for vectors
|
|
+should be dynamically allocated, e.g. via `AllocateAligned(Lanes(d))`. For
|
|
+applications that require a compile-time estimate, `MaxLanes(d)` returns the `N`
|
|
+from `Simd<T, N>`, which is NOT necessarily the actual lane count. This is
|
|
+DISCOURAGED because it is not guaranteed to be an upper bound (RVV vectors may
|
|
+be very large) and some compilers are not able to interpret it as constexpr.
|
|
|
|
-* `const HWY_CAPPED(T, N) d;` for up to `N` lanes.
|
|
-
|
|
-For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), descriptors
|
|
-for the smaller types must be obtained from those of the larger type (e.g. via
|
|
+For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), tags for
|
|
+the smaller types must be obtained from those of the larger type (e.g. via
|
|
`Rebind<uint8_t, HWY_FULL(float)>`).
|
|
|
|
-The type `T` may be accessed as `TFromD<D>`. There are three possibilities for
|
|
-the template parameter `N`:
|
|
-
|
|
-1. Equal to the hardware vector width, e.g. when using `HWY_FULL(T)` on a
|
|
- target with compile-time constant vectors.
|
|
+## Using unspecified vector types
|
|
|
|
-1. Less than the hardware vector width. This is the result of a compile-time
|
|
- decision by the user, i.e. using `HWY_CAPPED(T, N)` to limit the number of
|
|
- lanes, even when the hardware vector width could be greater.
|
|
-
|
|
-1. Unrelated to the hardware vector width, e.g. when the hardware vector width
|
|
- is not known at compile-time and may be very large.
|
|
-
|
|
-In all cases, `Lanes(d)` returns the actual number of lanes, i.e. the amount by
|
|
-which to advance loop counters. `MaxLanes(d)` returns the `N` from `Simd<T, N>`,
|
|
-which is NOT necessarily the actual vector size (see above) and some compilers
|
|
-are not able to interpret it as constexpr. Instead of `MaxLanes`, prefer to use
|
|
-alternatives, e.g. `Rebind` or `aligned_allocator.h` for dynamic allocation of
|
|
-`Lanes(d)` elements.
|
|
-
|
|
-Highway is designed to map a vector variable to a (possibly partial) hardware
|
|
-register or register group. By discouraging user-specified `N` and tuples of
|
|
-vector variables, we improve performance portability (e.g. by reducing spills to
|
|
-memory for platforms that have smaller vectors than the developer expected).
|
|
-
|
|
-To construct vectors, call factory functions (see "Initialization" below) with
|
|
-a tag parameter `d`.
|
|
-
|
|
-Local variables typically use auto for type deduction. For some generic
|
|
-functions, a template argument `V` is sufficient: `template<class V> V Squared(V
|
|
-v) { return v * v; }`. In general, functions have a `D` template argument and
|
|
-can return vectors of type `Vec<D>`.
|
|
-
|
|
-Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined
|
|
-functions reside in `project::[nested]::HWY_NAMESPACE`. Because all Highway
|
|
-functions generally take either a `Simd` or vector argument, which are also
|
|
-defined in namespace `hwy`, they will typically be found via Argument-Dependent
|
|
-Lookup and namespace qualifiers are not necessary. As an exception, Highway
|
|
-functions that are templates (e.g. because they require a compile-time argument
|
|
-such as a lane index or shift count) require a using-declaration such as
|
|
-`using hwy::HWY_NAMESPACE::ShiftLeft`.
|
|
+Because vector types are unspecified, local vector variables are typically
|
|
+defined using `auto` for type deduction. A template argument `V` suffices for
|
|
+simple generic functions: `template<class V> V Squared(V v) { return v * v; }`.
|
|
+
|
|
+Many functions will need a `D` template argument in order to initialize any
|
|
+constants. They can use a separate `V` template argument for vectors, or use
|
|
+`Vec<D>`, or where an lvalue `d` is available, `decltype(Zero(d))`. Using such
|
|
+aliases instead of auto may improve readability of mixed-type code. They can
|
|
+also be used for member variables, which are discouraged because compilers often
|
|
+have difficulty mapping them to registers.
|
|
|
|
## Operations
|
|
|
|
@@ -141,6 +147,14 @@ unsigned, signed, and floating-point typ
|
|
bits per lane: 8, 16, 32, or 64. Any combination of the specified prefixes and
|
|
bits are allowed. Abbreviations of the form `u32 = {u}{32}` may also be used.
|
|
|
|
+Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined
|
|
+functions reside in `project::[nested]::HWY_NAMESPACE`. Highway functions
|
|
+generally take either a `Simd` or vector/mask argument. For targets where
|
|
+vectors and masks are defined in namespace `hwy`, the functions will be found
|
|
+via Argument-Dependent Lookup. However, this does not work for function
|
|
+templates, and RVV and SVE both use builtin vectors. Thus we recommend a `using
|
|
+hwy::HWY_NAMESPACE;` directive inside `project::[nested]::HWY_NAMESPACE`.
|
|
+
|
|
### Initialization
|
|
|
|
* <code>V **Zero**(D)</code>: returns N-lane vector with all bits set to 0.
|
|
@@ -162,7 +176,7 @@ bits are allowed. Abbreviations of the f
|
|
* `V`: `{i,f}` \
|
|
<code>V **Neg**(V a)</code>: returns `-a[i]`.
|
|
|
|
-* `V`: `{i}{8,16,32}, {f}` \
|
|
+* `V`: `{i,f}` \
|
|
<code>V **Abs**(V a)</code> returns the absolute value of `a[i]`; for
|
|
integers, `LimitsMin()` maps to `LimitsMax() + 1`.
|
|
|
|
@@ -252,23 +266,24 @@ Left-shifting signed `T` and right-shift
|
|
shifting `MakeUnsigned<T>` and casting to `T`. Right-shifting negative signed
|
|
`T` is the same as an unsigned shift, except that 1-bits are shifted in.
|
|
|
|
-Compile-time constant shifts, generally the most efficient variant:
|
|
+Compile-time constant shifts, generally the most efficient variant (though 8-bit
|
|
+shifts are potentially slower than other lane sizes):
|
|
|
|
-* `V`: `{u,i}{16,32,64}` \
|
|
+* `V`: `{u,i}` \
|
|
<code>V **ShiftLeft**<int>(V a)</code> returns `a[i] << int`.
|
|
|
|
-* `V`: `{u,i}{16,32,64}` \
|
|
+* `V`: `{u,i}` \
|
|
<code>V **ShiftRight**<int>(V a)</code> returns `a[i] >> int`.
|
|
|
|
Shift all lanes by the same (not necessarily compile-time constant) amount:
|
|
|
|
-* `V`: `{u,i}{16,32,64}` \
|
|
+* `V`: `{u,i}` \
|
|
<code>V **ShiftLeftSame**(V a, int bits)</code> returns `a[i] << bits`.
|
|
|
|
-* `V`: `{u,i}{16,32,64}` \
|
|
+* `V`: `{u,i}` \
|
|
<code>V **ShiftRightSame**(V a, int bits)</code> returns `a[i] >> bits`.
|
|
|
|
-Per-lane variable shifts (slow if SSE4, or Shr i64 on AVX2):
|
|
+Per-lane variable shifts (slow if SSE4, or 16-bit, or Shr i64 on AVX2):
|
|
|
|
* `V`: `{u,i}{16,32,64}` \
|
|
<code>V **operator<<**(V a, V b)</code> returns `a[i] << b[i]`.
|
|
@@ -332,12 +347,17 @@ Special functions for signed types:
|
|
slightly more efficient; requires the first argument to be non-negative.
|
|
|
|
* `V`: `i32/64` \
|
|
- <code>V **BroadcastSignBit(V a)</code> returns `a[i] < 0 ? -1 : 0`.
|
|
+ <code>V **BroadcastSignBit**(V a)</code> returns `a[i] < 0 ? -1 : 0`.
|
|
|
|
### Masks
|
|
|
|
Let `M` denote a mask capable of storing true/false for each lane.
|
|
|
|
+* <code>M **FirstN**(D, size_t N)</code>: returns mask with the first `N`
|
|
+ lanes (those with index `< N`) true. `N` larger than `Lanes(D())` results in
|
|
+ an all-true mask. Useful for implementing "masked" stores by loading `prev`
|
|
+ followed by `IfThenElse(FirstN(d, N), what_to_store, prev)`.
|
|
+
|
|
* <code>M1 **RebindMask**(D, M2 m)</code>: returns same mask bits as `m`, but
|
|
reinterpreted as a mask for lanes of type `TFromD<D>`. `M1` and `M2` must
|
|
have the same number of lanes.
|
|
@@ -389,17 +409,18 @@ Let `M` denote a mask capable of storing
|
|
* <code>size_t **CountTrue**(M m)</code>: returns how many of `m[i]` are true
|
|
[0, N]. This is typically more expensive than AllTrue/False.
|
|
|
|
-* `V`: `{u,i,f}{32,64}` \
|
|
+* `V`: `{u,i,f}{16,32,64}` \
|
|
<code>V **Compress**(V v, M m)</code>: returns `r` such that `r[n]` is
|
|
`v[i]`, with `i` the n-th lane index (starting from 0) where `m[i]` is true.
|
|
Compacts lanes whose mask is set into the lower lanes; upper lanes are
|
|
- implementation-defined.
|
|
+ implementation-defined. Slow with 16-bit lanes.
|
|
|
|
-* `V`: `{u,i,f}{32,64}` \
|
|
+* `V`: `{u,i,f}{16,32,64}` \
|
|
<code>size_t **CompressStore**(V v, M m, D, T* aligned)</code>: writes lanes
|
|
whose mask is set into `aligned`, starting from lane 0. Returns
|
|
`CountTrue(m)`, the number of valid lanes. All subsequent lanes may be
|
|
- overwritten! Alignment ensures inactive lanes will not cause faults.
|
|
+ overwritten! Alignment ensures inactive lanes will not cause faults. Slow
|
|
+ with 16-bit lanes.
|
|
|
|
### Comparisons
|
|
|
|
@@ -429,10 +450,16 @@ Memory operands are little-endian, other
|
|
lane configuration. Pointers are the addresses of `N` consecutive `T` values,
|
|
either naturally-aligned (`aligned`) or possibly unaligned (`p`).
|
|
|
|
+**Note**: computations with low arithmetic intensity (FLOP/s per memory traffic
|
|
+bytes), e.g. dot product, can be *1.5 times as fast* when the memory operands
|
|
+are naturally aligned. An unaligned access may require two load ports.
|
|
+
|
|
#### Load
|
|
|
|
* <code>Vec<D> **Load**(D, const T* aligned)</code>: returns
|
|
- `aligned[i]`.
|
|
+ `aligned[i]`. May fault if the pointer is not aligned to the vector size.
|
|
+ Using this whenever possible improves codegen on SSE4: unlike `LoadU`,
|
|
+ `Load` can be fused into a memory operand, which reduces register pressure.
|
|
* <code>Vec<D> **LoadU**(D, const T* p)</code>: returns `p[i]`.
|
|
|
|
* <code>Vec<D> **LoadDup128**(D, const T* p)</code>: returns one 128-bit
|
|
@@ -440,19 +467,31 @@ either naturally-aligned (`aligned`) or
|
|
be faster than broadcasting single values, and is more convenient than
|
|
preparing constants for the actual vector length.
|
|
|
|
-#### Gather
|
|
+#### Scatter/Gather
|
|
|
|
-**Note**: Vectors must be `HWY_CAPPED(T, HWY_GATHER_LANES(T))`:
|
|
+**Note**: Offsets/indices are of type `VI = Vec<RebindToSigned<D>>` and need not
|
|
+be unique. The results are implementation-defined if any are negative.
|
|
|
|
-* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \
|
|
- <code>Vec<D> **GatherOffset**(D, const T* base, VI offsets)</code>.
|
|
- Returns elements of base selected by possibly repeated *byte* `offsets[i]`.
|
|
- Results are implementation-defined if `offsets[i]` is negative.
|
|
-
|
|
-* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \
|
|
- <code>Vec<D> **GatherIndex**(D, const T* base, VI indices)</code>.
|
|
- Returns vector of `base[indices[i]]`. Indices need not be unique, but
|
|
- results are implementation-defined if they are negative.
|
|
+**Note**: Where possible, applications should `Load/Store/TableLookup*` entire
|
|
+vectors, which is much faster than `Scatter/Gather`. Otherwise, code of the form
|
|
+`dst[tbl[i]] = F(src[i])` should when possible be transformed to `dst[i] =
|
|
+F(src[tbl[i]])` because `Scatter` is more expensive than `Gather`.
|
|
+
|
|
+* `D`: `{u,i,f}{32,64}` \
|
|
+ <code>void **ScatterOffset**(Vec<D> v, D, const T* base, VI
|
|
+ offsets)</code>: stores `v[i]` to the base address plus *byte* `offsets[i]`.
|
|
+
|
|
+* `D`: `{u,i,f}{32,64}` \
|
|
+ <code>void **ScatterIndex**(Vec<D> v, D, const T* base, VI
|
|
+ indices)</code>: stores `v[i]` to `base[indices[i]]`.
|
|
+
|
|
+* `D`: `{u,i,f}{32,64}` \
|
|
+ <code>Vec<D> **GatherOffset**(D, const T* base, VI offsets)</code>:
|
|
+ returns elements of base selected by *byte* `offsets[i]`.
|
|
+
|
|
+* `D`: `{u,i,f}{32,64}` \
|
|
+ <code>Vec<D> **GatherIndex**(D, const T* base, VI indices)</code>:
|
|
+ returns vector of `base[indices[i]]`.
|
|
|
|
#### Store
|
|
|
|
@@ -462,6 +501,17 @@ either naturally-aligned (`aligned`) or
|
|
* <code>void **StoreU**(Vec<D> a, D, T* p)</code>: as Store, but without
|
|
the alignment requirement.
|
|
|
|
+* `D`: `u8` \
|
|
+ <code>void **StoreInterleaved3**(Vec<D> v0, Vec<D> v1,
|
|
+ Vec<D> v2, D, T* p)</code>: equivalent to shuffling `v0, v1, v2`
|
|
+ followed by three `StoreU()`, such that `p[0] == v0[0], p[1] == v1[0],
|
|
+ p[2] == v2[0]`. Useful for RGB samples.
|
|
+
|
|
+* `D`: `u8` \
|
|
+ <code>void **StoreInterleaved4**(Vec<D> v0, Vec<D> v1,
|
|
+ Vec<D> v2, Vec<D> v3, D, T* p)</code>: as above, but for four
|
|
+ vectors (e.g. RGBA samples).
|
|
+
|
|
### Cache control
|
|
|
|
All functions except Stream are defined in cache_control.h.
|
|
@@ -483,6 +533,9 @@ All functions except Stream are defined
|
|
* <code>void **Prefetch**(const T* p)</code>: begins loading the cache line
|
|
containing "p".
|
|
|
|
+* <code>void **Pause**()</code>: when called inside a spin-loop, may reduce
|
|
+ power consumption.
|
|
+
|
|
### Type conversion
|
|
|
|
* <code>Vec<D> **BitCast**(D, V)</code>: returns the bits of `V`
|
|
@@ -525,7 +578,8 @@ if the input exceeds the destination ran
|
|
zero and converts the value to same-sized integer.
|
|
|
|
* `V`: `f32`; `Ret`: `i32` \
|
|
- <code>Ret **NearestInt**(V a)</code>: returns the integer nearest to `a[i]`.
|
|
+ <code>Ret **NearestInt**(V a)</code>: returns the integer nearest to `a[i]`;
|
|
+ results are undefined for NaN.
|
|
|
|
### Swizzle
|
|
|
|
@@ -652,9 +706,9 @@ more expensive on AVX2/AVX-512 than with
|
|
|
|
### Reductions
|
|
|
|
-**Note**: the following are only available for full vectors (including scalar).
|
|
-These 'reduce' all lanes to a single result. This result is broadcasted to all
|
|
-lanes at no extra cost; you can use `GetLane` to obtain the value.
|
|
+**Note**: these 'reduce' all lanes to a single result (e.g. sum), which is
|
|
+broadcasted to all lanes at no extra cost. To obtain a scalar, you can call
|
|
+`GetLane`.
|
|
|
|
Being a horizontal operation (across lanes of the same vector), these are slower
|
|
than normal SIMD operations and are typically used outside critical loops.
|
|
@@ -697,9 +751,6 @@ generate such instructions (implying the
|
|
finally reverts to `HWY_STATIC_TARGET`. Can be used in `#if` expressions to
|
|
provide an alternative to functions which are not supported by HWY_SCALAR.
|
|
|
|
-* `HWY_LANES(T)`: how many lanes of type `T` in a full vector (>= 1). Used by
|
|
- HWY_FULL/CAPPED. Note: cannot be used in #if because it uses sizeof.
|
|
-
|
|
* `HWY_IDE` is 0 except when parsed by IDEs; adding it to conditions such as
|
|
`#if HWY_TARGET != HWY_SCALAR || HWY_IDE` avoids code appearing greyed out.
|
|
|
|
@@ -707,26 +758,15 @@ The following signal capabilities and ex
|
|
|
|
* `HWY_CAP_INTEGER64`: support for 64-bit signed/unsigned integer lanes.
|
|
* `HWY_CAP_FLOAT64`: support for double-precision floating-point lanes.
|
|
+
|
|
+The following were used to signal the maximum number of lanes for certain
|
|
+operations, but this is no longer necessary (nor possible on SVE/RVV), so they
|
|
+are DEPRECATED:
|
|
+
|
|
+* `HWY_GATHER_LANES(T)`.
|
|
* `HWY_CAP_GE256`: the current target supports vectors of >= 256 bits.
|
|
* `HWY_CAP_GE512`: the current target supports vectors of >= 512 bits.
|
|
|
|
-The following indicate the maximum number of lanes for certain operations. For
|
|
-targets that support the feature/operation, the macro evaluates to
|
|
-`HWY_LANES(T)`, otherwise 1. Using `HWY_CAPPED(T, HWY_GATHER_LANES(T))`
|
|
-generates the best possible code (or scalar fallback) from the same source code.
|
|
-
|
|
-* `HWY_GATHER_LANES(T)`: supports GatherIndex/Offset.
|
|
-* `HWY_VARIABLE_SHIFT_LANES(T)`: supports per-lane shift amounts (v1 << v2).
|
|
- DEPRECATED, this always matches HWY_LANES(T) and will be removed.
|
|
-
|
|
-As above, but the feature implies the type so there is no T parameter, thus
|
|
-these can be used in `#if` expressions.
|
|
-
|
|
-* `HWY_COMPARE64_LANES`: 64-bit signed integer comparisons. DEPRECATED, this
|
|
- always matches HWY_LANES(int64_t) and will be removed.
|
|
-* `HWY_MINMAX64_LANES`: 64-bit signed/unsigned integer min/max. DEPRECATED,
|
|
- this always matches HWY_LANES(int64_t) and will be removed.
|
|
-
|
|
## Detecting supported targets
|
|
|
|
`SupportedTargets()` returns a cached (initialized on-demand) bitfield of the
|
|
@@ -778,8 +818,10 @@ policy for selecting `HWY_TARGETS`:
|
|
and permitted by the compiler, independently of autovectorization), which
|
|
maximizes coverage in tests.
|
|
|
|
-If none are defined, the default is to select all attainable targets except any
|
|
-non-best baseline (typically `HWY_SCALAR`), which reduces code size.
|
|
+If none are defined, but `HWY_IS_TEST` is defined, the default is
|
|
+`HWY_COMPILE_ALL_ATTAINABLE`. Otherwise, the default is to select all attainable
|
|
+targets except any non-best baseline (typically `HWY_SCALAR`), which reduces
|
|
+code size.
|
|
|
|
## Compiler support
|
|
|
|
@@ -787,7 +829,8 @@ Clang and GCC require e.g. -mavx2 flags
|
|
However, this enables AVX2 instructions in the entire translation unit, which
|
|
may violate the one-definition rule and cause crashes. Instead, we use
|
|
target-specific attributes introduced via #pragma. Function using SIMD must
|
|
-reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`.
|
|
+reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. Alternatively,
|
|
+individual functions or lambdas may be prefixed with `HWY_ATTR`.
|
|
|
|
Immediates (compile-time constants) are specified as template arguments to avoid
|
|
constant-propagation issues with Clang on ARM.
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 2021-06-02 10:56:05.278904609 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -111,6 +111,32 @@ AlignedUniquePtr<T> MakeUniqueAligned(Ar
|
|
new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
|
|
}
|
|
|
|
+// Helpers for array allocators (avoids overflow)
|
|
+namespace detail {
|
|
+
|
|
+// Returns x such that 1u << x == n (if n is a power of two).
|
|
+static inline constexpr size_t ShiftCount(size_t n) {
|
|
+ return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
|
|
+}
|
|
+
|
|
+template <typename T>
|
|
+T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
|
|
+ constexpr size_t size = sizeof(T);
|
|
+
|
|
+ constexpr bool is_pow2 = (size & (size - 1)) == 0;
|
|
+ constexpr size_t bits = ShiftCount(size);
|
|
+ static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
|
|
+
|
|
+ const size_t bytes = is_pow2 ? items << bits : items * size;
|
|
+ const size_t check = is_pow2 ? bytes >> bits : bytes / size;
|
|
+ if (check != items) {
|
|
+ return nullptr; // overflowed
|
|
+ }
|
|
+ return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
|
|
+}
|
|
+
|
|
+} // namespace detail
|
|
+
|
|
// Aligned memory equivalent of make_unique<T[]> for array types using the
|
|
// custom allocators alloc/free. This function calls the constructor with the
|
|
// passed Args... on every created item. The destructor of each element will be
|
|
@@ -118,10 +144,11 @@ AlignedUniquePtr<T> MakeUniqueAligned(Ar
|
|
template <typename T, typename... Args>
|
|
AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
|
|
size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
|
|
- T* ptr =
|
|
- static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque));
|
|
- for (size_t i = 0; i < items; i++) {
|
|
- new (ptr + i) T(std::forward<Args>(args)...);
|
|
+ T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
|
|
+ if (ptr != nullptr) {
|
|
+ for (size_t i = 0; i < items; i++) {
|
|
+ new (ptr + i) T(std::forward<Args>(args)...);
|
|
+ }
|
|
}
|
|
return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
|
|
}
|
|
@@ -165,7 +192,7 @@ template <typename T>
|
|
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
|
|
FreePtr free, void* opaque) {
|
|
return AlignedFreeUniquePtr<T[]>(
|
|
- static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)),
|
|
+ detail::AllocateAlignedItems<T>(items, alloc, opaque),
|
|
AlignedFreer(free, opaque));
|
|
}
|
|
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 2021-06-02 10:56:05.273904584 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -16,6 +16,7 @@
|
|
|
|
#include <stddef.h>
|
|
|
|
+#include <array>
|
|
#include <new>
|
|
#include <random>
|
|
#include <vector>
|
|
@@ -87,13 +88,39 @@ TEST(AlignedAllocatorTest, FreeNullptr)
|
|
/*opaque_ptr=*/nullptr);
|
|
}
|
|
|
|
+TEST(AlignedAllocatorTest, Log2) {
|
|
+ EXPECT_EQ(0u, detail::ShiftCount(1));
|
|
+ EXPECT_EQ(1u, detail::ShiftCount(2));
|
|
+ EXPECT_EQ(3u, detail::ShiftCount(8));
|
|
+}
|
|
+
|
|
+// Allocator returns null when it detects overflow of items * sizeof(T).
|
|
+TEST(AlignedAllocatorTest, Overflow) {
|
|
+ constexpr size_t max = ~size_t(0);
|
|
+ constexpr size_t msb = (max >> 1) + 1;
|
|
+ using Size5 = std::array<uint8_t, 5>;
|
|
+ using Size10 = std::array<uint8_t, 10>;
|
|
+ EXPECT_EQ(nullptr,
|
|
+ detail::AllocateAlignedItems<uint32_t>(max / 2, nullptr, nullptr));
|
|
+ EXPECT_EQ(nullptr,
|
|
+ detail::AllocateAlignedItems<uint32_t>(max / 3, nullptr, nullptr));
|
|
+ EXPECT_EQ(nullptr,
|
|
+ detail::AllocateAlignedItems<Size5>(max / 4, nullptr, nullptr));
|
|
+ EXPECT_EQ(nullptr,
|
|
+ detail::AllocateAlignedItems<uint16_t>(msb, nullptr, nullptr));
|
|
+ EXPECT_EQ(nullptr,
|
|
+ detail::AllocateAlignedItems<double>(msb + 1, nullptr, nullptr));
|
|
+ EXPECT_EQ(nullptr,
|
|
+ detail::AllocateAlignedItems<Size10>(msb / 4, nullptr, nullptr));
|
|
+}
|
|
+
|
|
TEST(AlignedAllocatorTest, AllocDefaultPointers) {
|
|
const size_t kSize = 7777;
|
|
void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
|
|
/*opaque_ptr=*/nullptr);
|
|
ASSERT_NE(nullptr, ptr);
|
|
// Make sure the pointer is actually aligned.
|
|
- EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
|
|
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
|
|
char* p = static_cast<char*>(ptr);
|
|
size_t ret = 0;
|
|
for (size_t i = 0; i < kSize; i++) {
|
|
@@ -101,7 +128,7 @@ TEST(AlignedAllocatorTest, AllocDefaultP
|
|
p[i] = static_cast<char>(i & 0x7F);
|
|
if (i) ret += p[i] * p[i - 1];
|
|
}
|
|
- EXPECT_NE(0, ret);
|
|
+ EXPECT_NE(0U, ret);
|
|
FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
|
|
}
|
|
|
|
@@ -123,11 +150,11 @@ TEST(AlignedAllocatorTest, CustomAlloc)
|
|
AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
|
|
ASSERT_NE(nullptr, ptr);
|
|
// We should have only requested one alloc from the allocator.
|
|
- EXPECT_EQ(1u, fake_alloc.PendingAllocs());
|
|
+ EXPECT_EQ(1U, fake_alloc.PendingAllocs());
|
|
// Make sure the pointer is actually aligned.
|
|
- EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
|
|
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
|
|
FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
|
|
- EXPECT_EQ(0u, fake_alloc.PendingAllocs());
|
|
+ EXPECT_EQ(0U, fake_alloc.PendingAllocs());
|
|
}
|
|
|
|
TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
|
|
@@ -170,7 +197,7 @@ TEST(AlignedAllocatorTest, MakeUniqueAli
|
|
TEST(AlignedAllocatorTest, AllocSingleInt) {
|
|
auto ptr = AllocateAligned<uint32_t>(1);
|
|
ASSERT_NE(nullptr, ptr.get());
|
|
- EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
|
|
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
|
|
// Force delete of the unique_ptr now to check that it doesn't crash.
|
|
ptr.reset(nullptr);
|
|
EXPECT_EQ(nullptr, ptr.get());
|
|
@@ -180,7 +207,7 @@ TEST(AlignedAllocatorTest, AllocMultiple
|
|
const size_t kSize = 7777;
|
|
auto ptr = AllocateAligned<uint32_t>(kSize);
|
|
ASSERT_NE(nullptr, ptr.get());
|
|
- EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
|
|
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
|
|
// ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
|
|
// underlying type chosen by AllocateAligned() for the std::unique_ptr.
|
|
EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));
|
|
@@ -191,7 +218,7 @@ TEST(AlignedAllocatorTest, AllocMultiple
|
|
ptr[i] = static_cast<uint32_t>(i);
|
|
if (i) ret += ptr[i] * ptr[i - 1];
|
|
}
|
|
- EXPECT_NE(0, ret);
|
|
+ EXPECT_NE(0U, ret);
|
|
}
|
|
|
|
TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
|
|
@@ -215,7 +242,8 @@ TEST(AlignedAllocatorTest, MakeUniqueAli
|
|
auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
|
|
7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
|
|
&counter);
|
|
- // An array shold still only call a single allocation.
|
|
+ ASSERT_NE(nullptr, arr.get());
|
|
+ // An array should still only call a single allocation.
|
|
EXPECT_EQ(1u, fake_alloc.PendingAllocs());
|
|
EXPECT_EQ(7, counter);
|
|
for (size_t i = 0; i < 7; i++) {
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 2021-06-02 10:56:05.266904549 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/base.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -34,7 +34,10 @@
|
|
//------------------------------------------------------------------------------
|
|
// Detect compiler using predefined macros
|
|
|
|
-#ifdef _MSC_VER
|
|
+// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
|
|
+// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
|
|
+// purpose.
|
|
+#if defined(_MSC_VER) && !defined(__clang__)
|
|
#define HWY_COMPILER_MSVC _MSC_VER
|
|
#else
|
|
#define HWY_COMPILER_MSVC 0
|
|
@@ -200,6 +203,10 @@
|
|
#define HWY_ARCH_X86_64 0
|
|
#endif
|
|
|
|
+#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
|
|
+#error "Cannot have both x86-32 and x86-64"
|
|
+#endif
|
|
+
|
|
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
|
|
#define HWY_ARCH_X86 1
|
|
#else
|
|
@@ -212,14 +219,29 @@
|
|
#define HWY_ARCH_PPC 0
|
|
#endif
|
|
|
|
-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__)
|
|
+#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
|
|
+#define HWY_ARCH_ARM_A64 1
|
|
+#else
|
|
+#define HWY_ARCH_ARM_A64 0
|
|
+#endif
|
|
+
|
|
+#if defined(__arm__) || defined(_M_ARM)
|
|
+#define HWY_ARCH_ARM_V7 1
|
|
+#else
|
|
+#define HWY_ARCH_ARM_V7 0
|
|
+#endif
|
|
+
|
|
+#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
|
|
+#error "Cannot have both A64 and V7"
|
|
+#endif
|
|
+
|
|
+#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
|
|
#define HWY_ARCH_ARM 1
|
|
#else
|
|
#define HWY_ARCH_ARM 0
|
|
#endif
|
|
|
|
-// There isn't yet a standard __wasm or __wasm__.
|
|
-#ifdef __EMSCRIPTEN__
|
|
+#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
|
|
#define HWY_ARCH_WASM 1
|
|
#else
|
|
#define HWY_ARCH_WASM 0
|
|
@@ -231,9 +253,11 @@
|
|
#define HWY_ARCH_RVV 0
|
|
#endif
|
|
|
|
+// It is an error to detect multiple architectures at the same time, but OK to
|
|
+// detect none of the above.
|
|
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
|
|
- HWY_ARCH_RVV) != 1
|
|
-#error "Must detect exactly one platform"
|
|
+ HWY_ARCH_RVV) > 1
|
|
+#error "Must not detect more than one architecture"
|
|
#endif
|
|
|
|
//------------------------------------------------------------------------------
|
|
@@ -308,13 +332,26 @@ static constexpr HWY_MAYBE_UNUSED size_t
|
|
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
|
|
// by concatenating base type and bits.
|
|
|
|
-// RVV already has a builtin type.
|
|
-#if !HWY_ARCH_RVV
|
|
+// RVV already has a builtin type and the GCC intrinsics require it.
|
|
+#if HWY_ARCH_RVV && HWY_COMPILER_GCC
|
|
+#define HWY_NATIVE_FLOAT16 1
|
|
+#else
|
|
+#define HWY_NATIVE_FLOAT16 0
|
|
+#endif
|
|
+
|
|
+#if HWY_NATIVE_FLOAT16
|
|
+using float16_t = __fp16;
|
|
+// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
|
|
+// arguments, so use a wrapper.
|
|
+// TODO(janwas): replace with _Float16 when that is supported?
|
|
+#else
|
|
+#pragma pack(push, 1)
|
|
struct float16_t {
|
|
- // __fp16 cannot be used as a function parameter in clang, so use a wrapper.
|
|
uint16_t bits;
|
|
};
|
|
+#pragma pack(pop)
|
|
#endif
|
|
+
|
|
using float32_t = float;
|
|
using float64_t = double;
|
|
|
|
@@ -506,6 +543,13 @@ struct Relations<int64_t> {
|
|
using Narrow = int32_t;
|
|
};
|
|
template <>
|
|
+struct Relations<float16_t> {
|
|
+ using Unsigned = uint16_t;
|
|
+ using Signed = int16_t;
|
|
+ using Float = float16_t;
|
|
+ using Wide = float;
|
|
+};
|
|
+template <>
|
|
struct Relations<float> {
|
|
using Unsigned = uint32_t;
|
|
using Signed = int32_t;
|
|
@@ -551,13 +595,13 @@ constexpr inline size_t RoundUpTo(size_t
|
|
|
|
// Undefined results for x == 0.
|
|
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
|
|
-#ifdef _MSC_VER
|
|
+#if HWY_COMPILER_MSVC
|
|
unsigned long index; // NOLINT
|
|
_BitScanForward(&index, x);
|
|
return index;
|
|
-#else
|
|
+#else // HWY_COMPILER_MSVC
|
|
return static_cast<size_t>(__builtin_ctz(x));
|
|
-#endif
|
|
+#endif // HWY_COMPILER_MSVC
|
|
}
|
|
|
|
HWY_API size_t PopCount(uint64_t x) {
|
|
@@ -565,7 +609,7 @@ HWY_API size_t PopCount(uint64_t x) {
|
|
return static_cast<size_t>(__builtin_popcountll(x));
|
|
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
|
|
return _mm_popcnt_u64(x);
|
|
-#elif HWY_COMPILER_MSVC
|
|
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32
|
|
return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
|
|
#else
|
|
x -= ((x >> 1) & 0x55555555U);
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 2021-06-02 10:56:05.280904620 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -20,7 +20,9 @@
|
|
|
|
#include "hwy/base.h"
|
|
|
|
-#ifndef __SSE2__
|
|
+// Requires SSE2; fails to compile on 32-bit Clang 7 (see
|
|
+// https://github.com/gperftools/gperftools/issues/946).
|
|
+#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
|
|
#undef HWY_DISABLE_CACHE_CONTROL
|
|
#define HWY_DISABLE_CACHE_CONTROL
|
|
#endif
|
|
@@ -30,6 +32,14 @@
|
|
#include <emmintrin.h> // SSE2
|
|
#endif
|
|
|
|
+// Windows.h #defines these, which causes infinite recursion. Temporarily
|
|
+// undefine them in this header; these functions are anyway deprecated.
|
|
+// TODO(janwas): remove when these functions are removed.
|
|
+#pragma push_macro("LoadFence")
|
|
+#pragma push_macro("StoreFence")
|
|
+#undef LoadFence
|
|
+#undef StoreFence
|
|
+
|
|
namespace hwy {
|
|
|
|
// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
|
|
@@ -81,6 +91,17 @@ HWY_INLINE HWY_ATTR_CACHE void FlushCach
|
|
#endif
|
|
}
|
|
|
|
+// Reduces power consumption in spin-loops. No effect on non-x86.
|
|
+HWY_INLINE HWY_ATTR_CACHE void Pause() {
|
|
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
|
|
+ _mm_pause();
|
|
+#endif
|
|
+}
|
|
+
|
|
} // namespace hwy
|
|
|
|
+// TODO(janwas): remove when these functions are removed. (See above.)
|
|
+#pragma pop_macro("StoreFence")
|
|
+#pragma pop_macro("LoadFence")
|
|
+
|
|
#endif // HIGHWAY_HWY_CACHE_CONTROL_H_
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 2021-06-02 10:56:05.195904190 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -19,7 +19,6 @@
|
|
#include <stddef.h>
|
|
#include <stdio.h>
|
|
|
|
-#include <cmath>
|
|
#include <memory>
|
|
#include <numeric> // iota
|
|
|
|
@@ -37,15 +36,15 @@ using hwy::HWY_NAMESPACE::CombineShiftRi
|
|
|
|
class TwoArray {
|
|
public:
|
|
- // Passed to ctor as a value NOT known to the compiler. Must be a multiple of
|
|
- // the vector lane count * 8.
|
|
+ // Must be a multiple of the vector lane count * 8.
|
|
static size_t NumItems() { return 3456; }
|
|
|
|
- explicit TwoArray(const size_t num_items)
|
|
- : a_(AllocateAligned<float>(num_items * 2)), b_(a_.get() + num_items) {
|
|
- const float init = num_items / NumItems(); // 1, but compiler doesn't know
|
|
- std::iota(a_.get(), a_.get() + num_items, init);
|
|
- std::iota(b_, b_ + num_items, init);
|
|
+ TwoArray()
|
|
+ : a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) {
|
|
+ // = 1, but compiler doesn't know
|
|
+ const float init = static_cast<float>(Unpredictable1());
|
|
+ std::iota(a_.get(), a_.get() + NumItems(), init);
|
|
+ std::iota(b_, b_ + NumItems(), init);
|
|
}
|
|
|
|
protected:
|
|
@@ -62,7 +61,7 @@ void RunBenchmark(const char* caption) {
|
|
const FuncInput inputs[kNumInputs] = {num_items};
|
|
Result results[kNumInputs];
|
|
|
|
- Benchmark benchmark(num_items);
|
|
+ Benchmark benchmark;
|
|
|
|
Params p;
|
|
p.verbose = false;
|
|
@@ -101,7 +100,7 @@ void Intro() {
|
|
// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
|
|
class BenchmarkDot : public TwoArray {
|
|
public:
|
|
- explicit BenchmarkDot(size_t num_items) : TwoArray(num_items), dot_{-1.0f} {}
|
|
+ BenchmarkDot() : dot_{-1.0f} {}
|
|
|
|
FuncOutput operator()(const size_t num_items) {
|
|
HWY_FULL(float) d;
|
|
@@ -132,7 +131,8 @@ class BenchmarkDot : public TwoArray {
|
|
sum[i] += sum[i + power];
|
|
}
|
|
}
|
|
- return dot_ = GetLane(SumOfLanes(sum[0]));
|
|
+ dot_ = GetLane(SumOfLanes(sum[0]));
|
|
+ return static_cast<FuncOutput>(dot_);
|
|
}
|
|
void Verify(size_t num_items) {
|
|
if (dot_ == -1.0f) {
|
|
@@ -157,8 +157,6 @@ class BenchmarkDot : public TwoArray {
|
|
// INTERMEDIATE: delta coding
|
|
// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
|
|
struct BenchmarkDelta : public TwoArray {
|
|
- explicit BenchmarkDelta(size_t num_items) : TwoArray(num_items) {}
|
|
-
|
|
FuncOutput operator()(const size_t num_items) const {
|
|
#if HWY_TARGET == HWY_SCALAR
|
|
b_[0] = a_[0];
|
|
@@ -197,7 +195,7 @@ struct BenchmarkDelta : public TwoArray
|
|
Store(a - shifted, df, &b_[i]);
|
|
}
|
|
#endif
|
|
- return b_[num_items - 1];
|
|
+ return static_cast<FuncOutput>(b_[num_items - 1]);
|
|
}
|
|
|
|
void Verify(size_t num_items) {
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 2021-06-02 10:56:05.189904159 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -22,27 +22,62 @@
|
|
// For runtime dispatch, specify the name of the current file (unfortunately
|
|
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
|
|
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
|
|
-// Re-include this file once per enabled target to generate code for it.
|
|
+// Generates code for each enabled target by re-including this source file.
|
|
#include "hwy/foreach_target.h"
|
|
|
|
-#include "hwy/examples/skeleton_shared.h"
|
|
#include "hwy/highway.h"
|
|
|
|
-// Optional: factor out parts of the implementation into *-inl.h
|
|
-#include "hwy/examples/skeleton-inl.h"
|
|
-
|
|
// Optional, can instead add HWY_ATTR to all functions.
|
|
HWY_BEFORE_NAMESPACE();
|
|
namespace skeleton {
|
|
namespace HWY_NAMESPACE {
|
|
|
|
-// Compiled once per target via multiple inclusion.
|
|
-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2,
|
|
- float* HWY_RESTRICT out) {
|
|
- printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET),
|
|
- ExampleGatherStrategy());
|
|
+// Highway ops reside here; ADL does not find templates nor builtins.
|
|
+using namespace hwy::HWY_NAMESPACE;
|
|
+
|
|
+// Computes log2 by converting to a vector of floats. Compiled once per target.
|
|
+template <class DF>
|
|
+HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
|
|
+ uint8_t* HWY_RESTRICT log2) {
|
|
+ // Type tags for converting to other element types (Rebind = same count).
|
|
+ const Rebind<int32_t, DF> d32;
|
|
+ const Rebind<uint8_t, DF> d8;
|
|
+
|
|
+ const auto u8 = Load(d8, values);
|
|
+ const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
|
|
+ const auto exponent = ShiftRight<23>(bits) - Set(d32, 127);
|
|
+ Store(DemoteTo(d8, exponent), d8, log2);
|
|
+}
|
|
+
|
|
+HWY_NOINLINE void CodepathDemo() {
|
|
+ // Highway defaults to portability, but per-target codepaths may be selected
|
|
+ // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
|
|
+#if HWY_CAP_INTEGER64
|
|
+ const char* gather = "Has int64";
|
|
+#else
|
|
+ const char* gather = "No int64";
|
|
+#endif
|
|
+ printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather);
|
|
+}
|
|
|
|
- ExampleMulAdd(in1, in2, out);
|
|
+HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
|
|
+ uint8_t* HWY_RESTRICT log2) {
|
|
+ CodepathDemo();
|
|
+
|
|
+ // Second argument is necessary on RVV until it supports fractional lengths.
|
|
+ HWY_FULL(float, 4) df;
|
|
+
|
|
+ const size_t N = Lanes(df);
|
|
+ size_t i = 0;
|
|
+ for (; i + N <= count; i += N) {
|
|
+ OneFloorLog2(df, values + i, log2 + i);
|
|
+ }
|
|
+ // TODO(janwas): implement
|
|
+#if HWY_TARGET != HWY_RVV
|
|
+ for (; i < count; ++i) {
|
|
+ OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i);
|
|
+ }
|
|
+#endif
|
|
}
|
|
|
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
@@ -54,22 +89,20 @@ HWY_AFTER_NAMESPACE();
|
|
|
|
namespace skeleton {
|
|
|
|
-// This macro declares a static array SkeletonHighwayDispatchTable used for
|
|
-// dynamic dispatch. This macro should be placed in the same namespace that
|
|
-// defines the Skeleton function above.
|
|
-HWY_EXPORT(Skeleton);
|
|
+// This macro declares a static array used for dynamic dispatch; it resides in
|
|
+// the same outer namespace that contains FloorLog2.
|
|
+HWY_EXPORT(FloorLog2);
|
|
|
|
// This function is optional and only needed in the case of exposing it in the
|
|
-// header file. Otherwise using HWY_DYNAMIC_DISPATCH(Skeleton) multiple times in
|
|
-// this module is equivalent to inlining this optional function..
|
|
-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2,
|
|
- float* HWY_RESTRICT out) {
|
|
- return HWY_DYNAMIC_DISPATCH(Skeleton)(in1, in2, out);
|
|
+// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
|
|
+// is equivalent to inlining this function.
|
|
+void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
|
|
+ uint8_t* HWY_RESTRICT out) {
|
|
+ return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
|
|
}
|
|
|
|
// Optional: anything to compile only once, e.g. non-SIMD implementations of
|
|
-// public functions provided by this module, can go inside #if HWY_ONCE
|
|
-// (after end_target-inl.h).
|
|
+// public functions provided by this module, can go inside #if HWY_ONCE.
|
|
|
|
} // namespace skeleton
|
|
#endif // HWY_ONCE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 2021-06-02 10:56:05.213904281 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -18,15 +18,17 @@
|
|
#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
|
|
#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
|
|
|
|
-// Tiny subset of Highway API: essentials for declaring an interface, without
|
|
-// any implementation details.
|
|
+#include <stddef.h>
|
|
+
|
|
+// Platform-specific definitions used for declaring an interface, independent of
|
|
+// the SIMD instruction set.
|
|
#include "hwy/base.h" // HWY_RESTRICT
|
|
|
|
namespace skeleton {
|
|
|
|
-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256.
|
|
-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2,
|
|
- float* HWY_RESTRICT out);
|
|
+// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
|
|
+void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
|
|
+ uint8_t* HWY_RESTRICT out);
|
|
|
|
} // namespace skeleton
|
|
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 2021-06-02 10:56:05.164904033 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -29,41 +29,31 @@
|
|
// It is fine to #include normal or *-inl headers.
|
|
#include <stddef.h>
|
|
|
|
-#include "hwy/examples/skeleton_shared.h"
|
|
#include "hwy/highway.h"
|
|
|
|
HWY_BEFORE_NAMESPACE();
|
|
namespace skeleton {
|
|
namespace HWY_NAMESPACE {
|
|
|
|
-using hwy::HWY_NAMESPACE::MulAdd;
|
|
+using namespace hwy::HWY_NAMESPACE;
|
|
|
|
-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256.
|
|
-HWY_MAYBE_UNUSED void ExampleMulAdd(const float* HWY_RESTRICT in1,
|
|
- const float* HWY_RESTRICT in2,
|
|
- float* HWY_RESTRICT out) {
|
|
- // Descriptor(s) for all vector types used in this function.
|
|
- HWY_FULL(float) df;
|
|
-
|
|
- const auto mul = Set(df, kMultiplier);
|
|
- for (size_t i = 0; i < 256; i += Lanes(df)) {
|
|
- const auto result = MulAdd(mul, Load(df, in1 + i), Load(df, in2 + i));
|
|
- Store(result, df, out + i);
|
|
+// Example of a type-agnostic (caller-specified lane type) and width-agnostic
|
|
+// (uses best available instruction set) function in a header.
|
|
+//
|
|
+// Computes x_array[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
|
|
+template <class D, typename T>
|
|
+HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
|
|
+ const T* HWY_RESTRICT add_array,
|
|
+ const size_t size, T* HWY_RESTRICT x_array) {
|
|
+ for (size_t i = 0; i < size; i += Lanes(d)) {
|
|
+ const auto mul = Load(d, mul_array + i);
|
|
+ const auto add = Load(d, add_array + i);
|
|
+ auto x = Load(d, x_array + i);
|
|
+ x = MulAdd(mul, x, add);
|
|
+ Store(x, d, x_array + i);
|
|
}
|
|
}
|
|
|
|
-// (This doesn't generate SIMD instructions, so is not required here)
|
|
-HWY_MAYBE_UNUSED const char* ExampleGatherStrategy() {
|
|
- // Highway functions generate per-target implementations from the same source
|
|
- // code via HWY_CAPPED(type, HWY_MIN(any_LANES_constants, ..)). If needed,
|
|
- // entirely different codepaths can also be selected like so:
|
|
-#if HWY_GATHER_LANES > 1
|
|
- return "Has gather";
|
|
-#else
|
|
- return "Gather is limited to one lane";
|
|
-#endif
|
|
-}
|
|
-
|
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
} // namespace HWY_NAMESPACE
|
|
} // namespace skeleton
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 2021-06-02 10:56:05.170904063 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -12,30 +12,96 @@
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
-// Example of unit test for the "skeleton" module.
|
|
+// Example of unit test for the "skeleton" library.
|
|
|
|
-#include "hwy/examples/skeleton.h" // Skeleton
|
|
+#include "hwy/examples/skeleton.h"
|
|
|
|
#include <stdio.h>
|
|
|
|
-#include "hwy/tests/test_util-inl.h" // RunTest
|
|
+#undef HWY_TARGET_INCLUDE
|
|
+#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
|
|
+#include "hwy/foreach_target.h"
|
|
+#include "hwy/highway.h"
|
|
+#include "hwy/tests/test_util-inl.h"
|
|
|
|
+// Optional: factor out parts of the implementation into *-inl.h
|
|
+#include "hwy/examples/skeleton-inl.h"
|
|
+
|
|
+HWY_BEFORE_NAMESPACE();
|
|
namespace skeleton {
|
|
+namespace HWY_NAMESPACE {
|
|
+
|
|
+using namespace hwy::HWY_NAMESPACE;
|
|
+
|
|
+// Calls function defined in skeleton.cc.
|
|
+struct TestFloorLog2 {
|
|
+ template <class T, class DF>
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, DF df) {
|
|
+ const size_t count = 5 * Lanes(df);
|
|
+ auto in = hwy::AllocateAligned<uint8_t>(count);
|
|
+ auto expected = hwy::AllocateAligned<uint8_t>(count);
|
|
+
|
|
+ hwy::RandomState rng;
|
|
+ for (size_t i = 0; i < count; ++i) {
|
|
+ expected[i] = Random32(&rng) & 7;
|
|
+ in[i] = static_cast<uint8_t>(1u << expected[i]);
|
|
+ }
|
|
+ auto out = hwy::AllocateAligned<uint8_t>(count);
|
|
+ CallFloorLog2(in.get(), count, out.get());
|
|
+ int sum = 0;
|
|
+ for (size_t i = 0; i < count; ++i) {
|
|
+ // TODO(janwas): implement
|
|
+#if HWY_TARGET != HWY_RVV
|
|
+ HWY_ASSERT_EQ(expected[i], out[i]);
|
|
+#endif
|
|
+ sum += out[i];
|
|
+ }
|
|
+ hwy::PreventElision(sum);
|
|
+ }
|
|
+};
|
|
+
|
|
+HWY_NOINLINE void TestAllFloorLog2() {
|
|
+ ForPartialVectors<TestFloorLog2>()(float());
|
|
+}
|
|
+
|
|
+// Calls function defined in skeleton-inl.h.
|
|
+struct TestSumMulAdd {
|
|
+ template <class T, class D>
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
+ hwy::RandomState rng;
|
|
+ const size_t count = 4096;
|
|
+ EXPECT_TRUE(count % Lanes(d) == 0);
|
|
+ auto mul = hwy::AllocateAligned<T>(count);
|
|
+ auto x = hwy::AllocateAligned<T>(count);
|
|
+ auto add = hwy::AllocateAligned<T>(count);
|
|
+ for (size_t i = 0; i < count; ++i) {
|
|
+ mul[i] = static_cast<T>(Random32(&rng) & 0xF);
|
|
+ x[i] = static_cast<T>(Random32(&rng) & 0xFF);
|
|
+ add[i] = static_cast<T>(Random32(&rng) & 0xFF);
|
|
+ }
|
|
+ double expected_sum = 0.0;
|
|
+ for (size_t i = 0; i < count; ++i) {
|
|
+ expected_sum += mul[i] * x[i] + add[i];
|
|
+ }
|
|
|
|
-TEST(SkeletonTest, MainTest) {
|
|
- HWY_ALIGN_MAX float in1[256];
|
|
- HWY_ALIGN_MAX float in2[256];
|
|
- HWY_ALIGN_MAX float out[256];
|
|
- for (size_t i = 0; i < 256; ++i) {
|
|
- in1[i] = static_cast<float>(i);
|
|
- in2[i] = in1[i] + 300;
|
|
+ MulAddLoop(d, mul.get(), add.get(), count, x.get());
|
|
+ HWY_ASSERT_EQ(4344240.0, expected_sum);
|
|
}
|
|
+};
|
|
|
|
- // Tests will run for all compiled targets to ensure all are OK.
|
|
- hwy::RunTest([&in1, &in2, &out]() {
|
|
- Skeleton(in1, in2, out);
|
|
- // Add EXPECT_... calls here.
|
|
- });
|
|
+HWY_NOINLINE void TestAllSumMulAdd() {
|
|
+ ForFloatTypes(ForPartialVectors<TestSumMulAdd>());
|
|
}
|
|
|
|
+// NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
+} // namespace HWY_NAMESPACE
|
|
+} // namespace skeleton
|
|
+HWY_AFTER_NAMESPACE();
|
|
+
|
|
+#if HWY_ONCE
|
|
+namespace skeleton {
|
|
+HWY_BEFORE_TEST(SkeletonTest);
|
|
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
|
|
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
|
|
} // namespace skeleton
|
|
+#endif
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 2021-06-02 10:56:05.269904564 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -25,10 +25,10 @@
|
|
|
|
namespace hwy {
|
|
|
|
-// API version (https://semver.org/)
|
|
+// API version (https://semver.org/); keep in sync with CMakeLists.txt.
|
|
#define HWY_MAJOR 0
|
|
-#define HWY_MINOR 11
|
|
-#define HWY_PATCH 1
|
|
+#define HWY_MINOR 12
|
|
+#define HWY_PATCH 2
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Shorthand for descriptors (defined in shared-inl.h) used to select overloads.
|
|
@@ -49,7 +49,7 @@ namespace hwy {
|
|
HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
|
|
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
|
|
|
|
-// Vector of up to MAX_N lanes.
|
|
+// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead.
|
|
#define HWY_CAPPED(T, MAX_N) \
|
|
hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>
|
|
|
|
@@ -75,6 +75,10 @@ namespace hwy {
|
|
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
|
|
#elif HWY_STATIC_TARGET == HWY_NEON
|
|
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
|
|
+#elif HWY_STATIC_TARGET == HWY_SVE
|
|
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
|
|
+#elif HWY_STATIC_TARGET == HWY_SVE2
|
|
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
|
|
#elif HWY_STATIC_TARGET == HWY_PPC8
|
|
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
|
|
#elif HWY_STATIC_TARGET == HWY_SSE4
|
|
@@ -143,6 +147,18 @@ FunctionCache<RetType, Args...> Function
|
|
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
|
|
#endif
|
|
|
|
+#if HWY_TARGETS & HWY_SVE
|
|
+#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
|
|
+#else
|
|
+#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
|
|
+#endif
|
|
+
|
|
+#if HWY_TARGETS & HWY_SVE2
|
|
+#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
|
|
+#else
|
|
+#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
|
|
+#endif
|
|
+
|
|
#if HWY_TARGETS & HWY_PPC8
|
|
#define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME
|
|
#else
|
|
@@ -261,8 +277,11 @@ FunctionCache<RetType, Args...> Function
|
|
#elif HWY_TARGET == HWY_AVX3
|
|
#include "hwy/ops/x86_512-inl.h"
|
|
#elif HWY_TARGET == HWY_PPC8
|
|
+#error "PPC is not yet supported"
|
|
#elif HWY_TARGET == HWY_NEON
|
|
#include "hwy/ops/arm_neon-inl.h"
|
|
+#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
|
|
+#include "hwy/ops/arm_sve-inl.h"
|
|
#elif HWY_TARGET == HWY_WASM
|
|
#include "hwy/ops/wasm_128-inl.h"
|
|
#elif HWY_TARGET == HWY_RVV
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 2021-06-02 10:56:05.276904599 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -29,128 +29,43 @@
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
+#if defined(_WIN32) || defined(_WIN64)
|
|
+#ifndef NOMINMAX
|
|
+#define NOMINMAX
|
|
+#endif // NOMINMAX
|
|
+#include <windows.h>
|
|
+#endif
|
|
+
|
|
+#if defined(__MACH__)
|
|
+#include <mach/mach.h>
|
|
+#include <mach/mach_time.h>
|
|
+#endif
|
|
+
|
|
+#if defined(__HAIKU__)
|
|
+#include <OS.h>
|
|
+#endif
|
|
+
|
|
#include "hwy/base.h"
|
|
#if HWY_ARCH_PPC
|
|
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
|
|
#elif HWY_ARCH_X86
|
|
|
|
-#ifdef _MSC_VER
|
|
+#if HWY_COMPILER_MSVC
|
|
#include <intrin.h>
|
|
#else
|
|
#include <cpuid.h> // NOLINT
|
|
-#endif // _MSC_VER
|
|
+#endif // HWY_COMPILER_MSVC
|
|
|
|
#endif // HWY_ARCH_X86
|
|
|
|
namespace hwy {
|
|
-namespace platform {
|
|
-namespace {
|
|
-
|
|
-#if HWY_ARCH_X86
|
|
-
|
|
-void Cpuid(const uint32_t level, const uint32_t count,
|
|
- uint32_t* HWY_RESTRICT abcd) {
|
|
-#if HWY_COMPILER_MSVC
|
|
- int regs[4];
|
|
- __cpuidex(regs, level, count);
|
|
- for (int i = 0; i < 4; ++i) {
|
|
- abcd[i] = regs[i];
|
|
- }
|
|
-#else
|
|
- uint32_t a;
|
|
- uint32_t b;
|
|
- uint32_t c;
|
|
- uint32_t d;
|
|
- __cpuid_count(level, count, a, b, c, d);
|
|
- abcd[0] = a;
|
|
- abcd[1] = b;
|
|
- abcd[2] = c;
|
|
- abcd[3] = d;
|
|
-#endif
|
|
-}
|
|
-
|
|
-std::string BrandString() {
|
|
- char brand_string[49];
|
|
- std::array<uint32_t, 4> abcd;
|
|
-
|
|
- // Check if brand string is supported (it is on all reasonable Intel/AMD)
|
|
- Cpuid(0x80000000U, 0, abcd.data());
|
|
- if (abcd[0] < 0x80000004U) {
|
|
- return std::string();
|
|
- }
|
|
-
|
|
- for (size_t i = 0; i < 3; ++i) {
|
|
- Cpuid(0x80000002U + i, 0, abcd.data());
|
|
- memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
|
|
- }
|
|
- brand_string[48] = 0;
|
|
- return brand_string;
|
|
-}
|
|
-
|
|
-// Returns the frequency quoted inside the brand string. This does not
|
|
-// account for throttling nor Turbo Boost.
|
|
-double NominalClockRate() {
|
|
- const std::string& brand_string = BrandString();
|
|
- // Brand strings include the maximum configured frequency. These prefixes are
|
|
- // defined by Intel CPUID documentation.
|
|
- const char* prefixes[3] = {"MHz", "GHz", "THz"};
|
|
- const double multipliers[3] = {1E6, 1E9, 1E12};
|
|
- for (size_t i = 0; i < 3; ++i) {
|
|
- const size_t pos_prefix = brand_string.find(prefixes[i]);
|
|
- if (pos_prefix != std::string::npos) {
|
|
- const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
|
|
- if (pos_space != std::string::npos) {
|
|
- const std::string digits =
|
|
- brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
|
|
- return std::stod(digits) * multipliers[i];
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- return 0.0;
|
|
-}
|
|
-
|
|
-#endif // HWY_ARCH_X86
|
|
-
|
|
-} // namespace
|
|
-
|
|
-// Returns tick rate. Invariant means the tick counter frequency is independent
|
|
-// of CPU throttling or sleep. May be expensive, caller should cache the result.
|
|
-double InvariantTicksPerSecond() {
|
|
-#if HWY_ARCH_PPC
|
|
- return __ppc_get_timebase_freq();
|
|
-#elif HWY_ARCH_X86
|
|
- // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
|
|
- return NominalClockRate();
|
|
-#else
|
|
- // Fall back to clock_gettime nanoseconds.
|
|
- return 1E9;
|
|
-#endif
|
|
-}
|
|
-
|
|
-} // namespace platform
|
|
namespace {
|
|
-
|
|
-// Prevents the compiler from eliding the computations that led to "output".
|
|
-template <class T>
|
|
-inline void PreventElision(T&& output) {
|
|
-#if HWY_COMPILER_MSVC == 0
|
|
- // Works by indicating to the compiler that "output" is being read and
|
|
- // modified. The +r constraint avoids unnecessary writes to memory, but only
|
|
- // works for built-in types (typically FuncOutput).
|
|
- asm volatile("" : "+r"(output) : : "memory");
|
|
-#else
|
|
- // MSVC does not support inline assembly anymore (and never supported GCC's
|
|
- // RTL constraints). Self-assignment with #pragma optimize("off") might be
|
|
- // expected to prevent elision, but it does not with MSVC 2015. Type-punning
|
|
- // with volatile pointers generates inefficient code on MSVC 2017.
|
|
- static std::atomic<T> dummy(T{});
|
|
- dummy.store(output, std::memory_order_relaxed);
|
|
-#endif
|
|
-}
|
|
-
|
|
namespace timer {
|
|
|
|
+// Ticks := platform-specific timer values (CPU cycles on x86). Must be
|
|
+// unsigned to guarantee wraparound on overflow.
|
|
+using Ticks = uint64_t;
|
|
+
|
|
// Start/Stop return absolute timestamps and must be placed immediately before
|
|
// and after the region to measure. We provide separate Start/Stop functions
|
|
// because they use different fences.
|
|
@@ -202,8 +117,8 @@ namespace timer {
|
|
|
|
// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
|
|
// divide by InvariantTicksPerSecond.
|
|
-inline uint64_t Start64() {
|
|
- uint64_t t;
|
|
+inline Ticks Start() {
|
|
+ Ticks t;
|
|
#if HWY_ARCH_PPC
|
|
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
|
|
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
|
@@ -228,8 +143,15 @@ inline uint64_t Start64() {
|
|
: "rdx", "memory", "cc");
|
|
#elif HWY_ARCH_RVV
|
|
asm volatile("rdcycle %0" : "=r"(t));
|
|
-#else
|
|
- // Fall back to OS - unsure how to reliably query cntvct_el0 frequency.
|
|
+#elif defined(_WIN32) || defined(_WIN64)
|
|
+ LARGE_INTEGER counter;
|
|
+ (void)QueryPerformanceCounter(&counter);
|
|
+ t = counter.QuadPart;
|
|
+#elif defined(__MACH__)
|
|
+ t = mach_absolute_time();
|
|
+#elif defined(__HAIKU__)
|
|
+ t = system_time_nsecs(); // since boot
|
|
+#else // POSIX
|
|
timespec ts;
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
|
|
@@ -237,7 +159,7 @@ inline uint64_t Start64() {
|
|
return t;
|
|
}
|
|
|
|
-inline uint64_t Stop64() {
|
|
+inline Ticks Stop() {
|
|
uint64_t t;
|
|
#if HWY_ARCH_PPC
|
|
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
|
|
@@ -261,61 +183,7 @@ inline uint64_t Stop64() {
|
|
// "cc" = flags modified by SHL.
|
|
: "rcx", "rdx", "memory", "cc");
|
|
#else
|
|
- t = Start64();
|
|
-#endif
|
|
- return t;
|
|
-}
|
|
-
|
|
-// Returns a 32-bit timestamp with about 4 cycles less overhead than
|
|
-// Start64. Only suitable for measuring very short regions because the
|
|
-// timestamp overflows about once a second.
|
|
-inline uint32_t Start32() {
|
|
- uint32_t t;
|
|
-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
|
- _ReadWriteBarrier();
|
|
- _mm_lfence();
|
|
- _ReadWriteBarrier();
|
|
- t = static_cast<uint32_t>(__rdtsc());
|
|
- _ReadWriteBarrier();
|
|
- _mm_lfence();
|
|
- _ReadWriteBarrier();
|
|
-#elif HWY_ARCH_X86_64
|
|
- asm volatile(
|
|
- "lfence\n\t"
|
|
- "rdtsc\n\t"
|
|
- "lfence"
|
|
- : "=a"(t)
|
|
- :
|
|
- // "memory" avoids reordering. rdx = TSC >> 32.
|
|
- : "rdx", "memory");
|
|
-#elif HWY_ARCH_RVV
|
|
- asm volatile("rdcycle %0" : "=r"(t));
|
|
-#else
|
|
- t = static_cast<uint32_t>(Start64());
|
|
-#endif
|
|
- return t;
|
|
-}
|
|
-
|
|
-inline uint32_t Stop32() {
|
|
- uint32_t t;
|
|
-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
|
- _ReadWriteBarrier();
|
|
- unsigned aux;
|
|
- t = static_cast<uint32_t>(__rdtscp(&aux));
|
|
- _ReadWriteBarrier();
|
|
- _mm_lfence();
|
|
- _ReadWriteBarrier();
|
|
-#elif HWY_ARCH_X86_64
|
|
- // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
|
|
- asm volatile(
|
|
- "rdtscp\n\t"
|
|
- "lfence"
|
|
- : "=a"(t)
|
|
- :
|
|
- // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
|
|
- : "rcx", "rdx", "memory");
|
|
-#else
|
|
- t = static_cast<uint32_t>(Stop64());
|
|
+ t = Start();
|
|
#endif
|
|
return t;
|
|
}
|
|
@@ -440,21 +308,130 @@ T MedianAbsoluteDeviation(const T* value
|
|
}
|
|
|
|
} // namespace robust_statistics
|
|
+} // namespace
|
|
+namespace platform {
|
|
+namespace {
|
|
|
|
-// Ticks := platform-specific timer values (CPU cycles on x86). Must be
|
|
-// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to
|
|
-// read than 64 bit.
|
|
-using Ticks = uint32_t;
|
|
+// Prevents the compiler from eliding the computations that led to "output".
|
|
+template <class T>
|
|
+inline void PreventElision(T&& output) {
|
|
+#if HWY_COMPILER_MSVC == 0
|
|
+ // Works by indicating to the compiler that "output" is being read and
|
|
+ // modified. The +r constraint avoids unnecessary writes to memory, but only
|
|
+ // works for built-in types (typically FuncOutput).
|
|
+ asm volatile("" : "+r"(output) : : "memory");
|
|
+#else
|
|
+ // MSVC does not support inline assembly anymore (and never supported GCC's
|
|
+ // RTL constraints). Self-assignment with #pragma optimize("off") might be
|
|
+ // expected to prevent elision, but it does not with MSVC 2015. Type-punning
|
|
+ // with volatile pointers generates inefficient code on MSVC 2017.
|
|
+ static std::atomic<T> dummy(T{});
|
|
+ dummy.store(output, std::memory_order_relaxed);
|
|
+#endif
|
|
+}
|
|
+
|
|
+#if HWY_ARCH_X86
|
|
+
|
|
+void Cpuid(const uint32_t level, const uint32_t count,
|
|
+ uint32_t* HWY_RESTRICT abcd) {
|
|
+#if HWY_COMPILER_MSVC
|
|
+ int regs[4];
|
|
+ __cpuidex(regs, level, count);
|
|
+ for (int i = 0; i < 4; ++i) {
|
|
+ abcd[i] = regs[i];
|
|
+ }
|
|
+#else
|
|
+ uint32_t a;
|
|
+ uint32_t b;
|
|
+ uint32_t c;
|
|
+ uint32_t d;
|
|
+ __cpuid_count(level, count, a, b, c, d);
|
|
+ abcd[0] = a;
|
|
+ abcd[1] = b;
|
|
+ abcd[2] = c;
|
|
+ abcd[3] = d;
|
|
+#endif
|
|
+}
|
|
+
|
|
+std::string BrandString() {
|
|
+ char brand_string[49];
|
|
+ std::array<uint32_t, 4> abcd;
|
|
+
|
|
+ // Check if brand string is supported (it is on all reasonable Intel/AMD)
|
|
+ Cpuid(0x80000000U, 0, abcd.data());
|
|
+ if (abcd[0] < 0x80000004U) {
|
|
+ return std::string();
|
|
+ }
|
|
+
|
|
+ for (size_t i = 0; i < 3; ++i) {
|
|
+ Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd.data());
|
|
+ memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
|
|
+ }
|
|
+ brand_string[48] = 0;
|
|
+ return brand_string;
|
|
+}
|
|
+
|
|
+// Returns the frequency quoted inside the brand string. This does not
|
|
+// account for throttling nor Turbo Boost.
|
|
+double NominalClockRate() {
|
|
+ const std::string& brand_string = BrandString();
|
|
+ // Brand strings include the maximum configured frequency. These prefixes are
|
|
+ // defined by Intel CPUID documentation.
|
|
+ const char* prefixes[3] = {"MHz", "GHz", "THz"};
|
|
+ const double multipliers[3] = {1E6, 1E9, 1E12};
|
|
+ for (size_t i = 0; i < 3; ++i) {
|
|
+ const size_t pos_prefix = brand_string.find(prefixes[i]);
|
|
+ if (pos_prefix != std::string::npos) {
|
|
+ const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
|
|
+ if (pos_space != std::string::npos) {
|
|
+ const std::string digits =
|
|
+ brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
|
|
+ return std::stod(digits) * multipliers[i];
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return 0.0;
|
|
+}
|
|
+
|
|
+#endif // HWY_ARCH_X86
|
|
+
|
|
+} // namespace
|
|
+
|
|
+double InvariantTicksPerSecond() {
|
|
+#if HWY_ARCH_PPC
|
|
+ return __ppc_get_timebase_freq();
|
|
+#elif HWY_ARCH_X86
|
|
+ // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
|
|
+ return NominalClockRate();
|
|
+#elif defined(_WIN32) || defined(_WIN64)
|
|
+ LARGE_INTEGER freq;
|
|
+ (void)QueryPerformanceFrequency(&freq);
|
|
+ return double(freq.QuadPart);
|
|
+#elif defined(__MACH__)
|
|
+ // https://developer.apple.com/library/mac/qa/qa1398/_index.html
|
|
+ mach_timebase_info_data_t timebase;
|
|
+ (void)mach_timebase_info(&timebase);
|
|
+ return double(timebase.denom) / timebase.numer * 1E9;
|
|
+#else
|
|
+ // TODO(janwas): ARM? Unclear how to reliably query cntvct_el0 frequency.
|
|
+ return 1E9; // Haiku and clock_gettime return nanoseconds.
|
|
+#endif
|
|
+}
|
|
|
|
-// Returns timer overhead / minimum measurable difference.
|
|
-Ticks TimerResolution() {
|
|
+double Now() {
|
|
+ static const double mul = 1.0 / InvariantTicksPerSecond();
|
|
+ return static_cast<double>(timer::Start()) * mul;
|
|
+}
|
|
+
|
|
+uint64_t TimerResolution() {
|
|
// Nested loop avoids exceeding stack/L1 capacity.
|
|
- Ticks repetitions[Params::kTimerSamples];
|
|
+ timer::Ticks repetitions[Params::kTimerSamples];
|
|
for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
|
|
- Ticks samples[Params::kTimerSamples];
|
|
+ timer::Ticks samples[Params::kTimerSamples];
|
|
for (size_t i = 0; i < Params::kTimerSamples; ++i) {
|
|
- const Ticks t0 = timer::Start32();
|
|
- const Ticks t1 = timer::Stop32();
|
|
+ const timer::Ticks t0 = timer::Start();
|
|
+ const timer::Ticks t1 = timer::Stop();
|
|
samples[i] = t1 - t0;
|
|
}
|
|
repetitions[rep] = robust_statistics::Mode(samples);
|
|
@@ -462,18 +439,21 @@ Ticks TimerResolution() {
|
|
return robust_statistics::Mode(repetitions);
|
|
}
|
|
|
|
-static const Ticks timer_resolution = TimerResolution();
|
|
+} // namespace platform
|
|
+namespace {
|
|
+
|
|
+static const timer::Ticks timer_resolution = platform::TimerResolution();
|
|
|
|
// Estimates the expected value of "lambda" values with a variable number of
|
|
// samples until the variability "rel_mad" is less than "max_rel_mad".
|
|
template <class Lambda>
|
|
-Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
|
|
- const Params& p, const Lambda& lambda) {
|
|
+timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
|
|
+ const Params& p, const Lambda& lambda) {
|
|
// Choose initial samples_per_eval based on a single estimated duration.
|
|
- Ticks t0 = timer::Start32();
|
|
+ timer::Ticks t0 = timer::Start();
|
|
lambda();
|
|
- Ticks t1 = timer::Stop32();
|
|
- Ticks est = t1 - t0;
|
|
+ timer::Ticks t1 = timer::Stop();
|
|
+ timer::Ticks est = t1 - t0;
|
|
static const double ticks_per_second = platform::InvariantTicksPerSecond();
|
|
const size_t ticks_per_eval =
|
|
static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
|
|
@@ -481,21 +461,21 @@ Ticks SampleUntilStable(const double max
|
|
est == 0 ? p.min_samples_per_eval : ticks_per_eval / est;
|
|
samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval);
|
|
|
|
- std::vector<Ticks> samples;
|
|
+ std::vector<timer::Ticks> samples;
|
|
samples.reserve(1 + samples_per_eval);
|
|
samples.push_back(est);
|
|
|
|
// Percentage is too strict for tiny differences, so also allow a small
|
|
// absolute "median absolute deviation".
|
|
- const Ticks max_abs_mad = (timer_resolution + 99) / 100;
|
|
+ const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100;
|
|
*rel_mad = 0.0; // ensure initialized
|
|
|
|
for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
|
|
samples.reserve(samples.size() + samples_per_eval);
|
|
for (size_t i = 0; i < samples_per_eval; ++i) {
|
|
- t0 = timer::Start32();
|
|
+ t0 = timer::Start();
|
|
lambda();
|
|
- t1 = timer::Stop32();
|
|
+ t1 = timer::Stop();
|
|
samples.push_back(t1 - t0);
|
|
}
|
|
|
|
@@ -508,14 +488,14 @@ Ticks SampleUntilStable(const double max
|
|
NANOBENCHMARK_CHECK(est != 0);
|
|
|
|
// Median absolute deviation (mad) is a robust measure of 'variability'.
|
|
- const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
|
|
+ const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
|
|
samples.data(), samples.size(), est);
|
|
- *rel_mad = static_cast<double>(int(abs_mad)) / est;
|
|
+ *rel_mad = static_cast<double>(abs_mad) / static_cast<double>(est);
|
|
|
|
if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
|
|
if (p.verbose) {
|
|
- printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
|
|
- samples.size(), est, abs_mad, *rel_mad * 100.0);
|
|
+ printf("%6zu samples => %5zu (abs_mad=%4zu, rel_mad=%4.2f%%)\n",
|
|
+ samples.size(), size_t(est), size_t(abs_mad), *rel_mad * 100.0);
|
|
}
|
|
return est;
|
|
}
|
|
@@ -539,29 +519,17 @@ InputVec UniqueInputs(const FuncInput* i
|
|
return unique;
|
|
}
|
|
|
|
-// Returns how often we need to call func for sufficient precision, or zero
|
|
-// on failure (e.g. the elapsed time is too long for a 32-bit tick count).
|
|
+// Returns how often we need to call func for sufficient precision.
|
|
size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
|
|
const Params& p) {
|
|
// Min elapsed ticks for any input.
|
|
- Ticks min_duration = ~0u;
|
|
+ timer::Ticks min_duration = ~timer::Ticks(0);
|
|
|
|
for (const FuncInput input : unique) {
|
|
- // Make sure a 32-bit timer is sufficient.
|
|
- const uint64_t t0 = timer::Start64();
|
|
- PreventElision(func(arg, input));
|
|
- const uint64_t t1 = timer::Stop64();
|
|
- const uint64_t elapsed = t1 - t0;
|
|
- if (elapsed >= (1ULL << 30)) {
|
|
- fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n",
|
|
- input);
|
|
- return 0;
|
|
- }
|
|
-
|
|
double rel_mad;
|
|
- const Ticks total = SampleUntilStable(
|
|
+ const timer::Ticks total = SampleUntilStable(
|
|
p.target_rel_mad, &rel_mad, p,
|
|
- [func, arg, input]() { PreventElision(func(arg, input)); });
|
|
+ [func, arg, input]() { platform::PreventElision(func(arg, input)); });
|
|
min_duration = std::min(min_duration, total - timer_resolution);
|
|
}
|
|
|
|
@@ -571,8 +539,8 @@ size_t NumSkip(const Func func, const ui
|
|
const size_t num_skip =
|
|
min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
|
|
if (p.verbose) {
|
|
- printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution,
|
|
- max_skip, min_duration, num_skip);
|
|
+ printf("res=%zu max_skip=%zu min_dur=%zu num_skip=%zu\n",
|
|
+ size_t(timer_resolution), max_skip, size_t(min_duration), num_skip);
|
|
}
|
|
return num_skip;
|
|
}
|
|
@@ -637,13 +605,14 @@ void FillSubset(const InputVec& full, co
|
|
}
|
|
|
|
// Returns total ticks elapsed for all inputs.
|
|
-Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs,
|
|
- const Params& p, double* max_rel_mad) {
|
|
+timer::Ticks TotalDuration(const Func func, const uint8_t* arg,
|
|
+ const InputVec* inputs, const Params& p,
|
|
+ double* max_rel_mad) {
|
|
double rel_mad;
|
|
- const Ticks duration =
|
|
+ const timer::Ticks duration =
|
|
SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
|
|
for (const FuncInput input : *inputs) {
|
|
- PreventElision(func(arg, input));
|
|
+ platform::PreventElision(func(arg, input));
|
|
}
|
|
});
|
|
*max_rel_mad = std::max(*max_rel_mad, rel_mad);
|
|
@@ -657,19 +626,20 @@ HWY_NOINLINE FuncOutput EmptyFunc(const
|
|
|
|
// Returns overhead of accessing inputs[] and calling a function; this will
|
|
// be deducted from future TotalDuration return values.
|
|
-Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) {
|
|
+timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,
|
|
+ const Params& p) {
|
|
double rel_mad;
|
|
// Zero tolerance because repeatability is crucial and EmptyFunc is fast.
|
|
return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
|
|
for (const FuncInput input : *inputs) {
|
|
- PreventElision(EmptyFunc(arg, input));
|
|
+ platform::PreventElision(EmptyFunc(arg, input));
|
|
}
|
|
});
|
|
}
|
|
|
|
} // namespace
|
|
|
|
-int Unpredictable1() { return timer::Start64() != ~0ULL; }
|
|
+int Unpredictable1() { return timer::Start() != ~0ULL; }
|
|
|
|
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
|
|
const size_t num_inputs, Result* results, const Params& p) {
|
|
@@ -685,32 +655,35 @@ size_t Measure(const Func func, const ui
|
|
ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
|
|
InputVec subset(full.size() - num_skip);
|
|
|
|
- const Ticks overhead = Overhead(arg, &full, p);
|
|
- const Ticks overhead_skip = Overhead(arg, &subset, p);
|
|
+ const timer::Ticks overhead = Overhead(arg, &full, p);
|
|
+ const timer::Ticks overhead_skip = Overhead(arg, &subset, p);
|
|
if (overhead < overhead_skip) {
|
|
- fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead,
|
|
- overhead_skip);
|
|
+ fprintf(stderr, "Measurement failed: overhead %zu < %zu\n",
|
|
+ size_t(overhead), size_t(overhead_skip));
|
|
return 0;
|
|
}
|
|
|
|
if (p.verbose) {
|
|
- printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(),
|
|
- overhead, overhead_skip);
|
|
+ printf("#inputs=%5zu,%5zu overhead=%5zu,%5zu\n", full.size(), subset.size(),
|
|
+ size_t(overhead), size_t(overhead_skip));
|
|
}
|
|
|
|
double max_rel_mad = 0.0;
|
|
- const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
|
|
+ const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
|
|
|
|
for (size_t i = 0; i < unique.size(); ++i) {
|
|
FillSubset(full, unique[i], num_skip, &subset);
|
|
- const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad);
|
|
+ const timer::Ticks total_skip =
|
|
+ TotalDuration(func, arg, &subset, p, &max_rel_mad);
|
|
|
|
if (total < total_skip) {
|
|
- fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip);
|
|
+ fprintf(stderr, "Measurement failed: total %zu < %zu\n", size_t(total),
|
|
+ size_t(total_skip));
|
|
return 0;
|
|
}
|
|
|
|
- const Ticks duration = (total - overhead) - (total_skip - overhead_skip);
|
|
+ const timer::Ticks duration =
|
|
+ (total - overhead) - (total_skip - overhead_skip);
|
|
results[i].input = unique[i];
|
|
results[i].ticks = static_cast<float>(duration) * mul;
|
|
results[i].variability = static_cast<float>(max_rel_mad);
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 2021-06-02 10:56:05.272904579 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -44,11 +44,6 @@
|
|
// central tendency of the measurement samples with the "half sample mode",
|
|
// which is more robust to outliers and skewed data than the mean or median.
|
|
|
|
-// WARNING if included from multiple translation units compiled with distinct
|
|
-// flags: this header requires textual inclusion and a predefined NB_NAMESPACE
|
|
-// macro that is unique to the current compile flags. We must also avoid
|
|
-// standard library headers such as vector and functional that define functions.
|
|
-
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
|
|
@@ -79,6 +74,16 @@ namespace platform {
|
|
// This call may be expensive, callers should cache the result.
|
|
double InvariantTicksPerSecond();
|
|
|
|
+// Returns current timestamp [in seconds] relative to an unspecified origin.
|
|
+// Features: monotonic (no negative elapsed time), steady (unaffected by system
|
|
+// time changes), high-resolution (on the order of microseconds).
|
|
+double Now();
|
|
+
|
|
+// Returns ticks elapsed in back to back timer calls, i.e. a function of the
|
|
+// timer resolution (minimum measurable difference) and overhead.
|
|
+// This call is expensive, callers should cache the result.
|
|
+uint64_t TimerResolution();
|
|
+
|
|
} // namespace platform
|
|
|
|
// Returns 1, but without the compiler knowing what the value is. This prevents
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 2021-06-02 10:56:05.275904594 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -15,11 +15,11 @@
|
|
#include "hwy/nanobenchmark.h"
|
|
|
|
#include <stdio.h>
|
|
-#include <stdlib.h> // strtol
|
|
-#include <unistd.h> // sleep
|
|
|
|
#include <random>
|
|
|
|
+#include "hwy/tests/test_util-inl.h"
|
|
+
|
|
namespace hwy {
|
|
namespace {
|
|
|
|
@@ -31,6 +31,7 @@ FuncOutput Div(const void*, FuncInput in
|
|
|
|
template <size_t N>
|
|
void MeasureDiv(const FuncInput (&inputs)[N]) {
|
|
+ printf("Measuring integer division (output on final two lines)\n");
|
|
Result results[N];
|
|
Params params;
|
|
params.max_evals = 4; // avoid test timeout
|
|
@@ -66,39 +67,14 @@ void MeasureRandom(const FuncInput (&inp
|
|
}
|
|
}
|
|
|
|
-template <size_t N>
|
|
-void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) {
|
|
- printf("Expect a 'measurement failed' below:\n");
|
|
- Result results[N];
|
|
-
|
|
- const size_t num_results = Measure(
|
|
- [](const void*, const FuncInput input) -> FuncOutput {
|
|
- // Loop until the sleep succeeds (not interrupted by signal). We assume
|
|
- // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit.
|
|
- while (sleep(2) != 0) {
|
|
- }
|
|
- return input;
|
|
- },
|
|
- nullptr, inputs, N, results);
|
|
- NANOBENCHMARK_CHECK(num_results == 0);
|
|
- (void)num_results;
|
|
-}
|
|
-
|
|
-void RunAll(const int argc, char** /*argv*/) {
|
|
- // unpredictable == 1 but the compiler doesn't know that.
|
|
- const int unpredictable = argc != 999;
|
|
+TEST(NanobenchmarkTest, RunAll) {
|
|
+ const int unpredictable = Unpredictable1(); // == 1, unknown to compiler.
|
|
static const FuncInput inputs[] = {static_cast<FuncInput>(unpredictable) + 2,
|
|
static_cast<FuncInput>(unpredictable + 9)};
|
|
|
|
MeasureDiv(inputs);
|
|
MeasureRandom(inputs);
|
|
- EnsureLongMeasurementFails(inputs);
|
|
}
|
|
|
|
} // namespace
|
|
} // namespace hwy
|
|
-
|
|
-int main(int argc, char* argv[]) {
|
|
- hwy::RunAll(argc, argv);
|
|
- return 0;
|
|
-}
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 2021-06-02 10:56:05.239904412 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -26,6 +26,8 @@ HWY_BEFORE_NAMESPACE();
|
|
namespace hwy {
|
|
namespace HWY_NAMESPACE {
|
|
|
|
+namespace detail { // for code folding and Raw128
|
|
+
|
|
// Macros used to define single and double function calls for multiple types
|
|
// for full and half vectors. These macros are undefined at the end of the file.
|
|
|
|
@@ -133,7 +135,7 @@ namespace HWY_NAMESPACE {
|
|
HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args)
|
|
|
|
// float and double
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
|
|
HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \
|
|
HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \
|
|
@@ -181,7 +183,7 @@ namespace HWY_NAMESPACE {
|
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
|
|
|
|
// Emulation of some intrinsics on armv7.
|
|
-#if !defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_V7
|
|
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
|
|
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
|
|
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
|
|
@@ -294,7 +296,7 @@ struct Raw128<float, 4> {
|
|
using type = float32x4_t;
|
|
};
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
template <>
|
|
struct Raw128<double, 2> {
|
|
using type = float64x2_t;
|
|
@@ -352,7 +354,7 @@ struct Raw128<float, 2> {
|
|
using type = float32x2_t;
|
|
};
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
template <>
|
|
struct Raw128<double, 1> {
|
|
using type = float64x1_t;
|
|
@@ -437,12 +439,14 @@ struct Raw128<int8_t, 1> {
|
|
using type = int8x8_t;
|
|
};
|
|
|
|
+} // namespace detail
|
|
+
|
|
template <typename T>
|
|
using Full128 = Simd<T, 16 / sizeof(T)>;
|
|
|
|
template <typename T, size_t N = 16 / sizeof(T)>
|
|
class Vec128 {
|
|
- using Raw = typename Raw128<T, N>::type;
|
|
+ using Raw = typename detail::Raw128<T, N>::type;
|
|
|
|
public:
|
|
HWY_INLINE Vec128() {}
|
|
@@ -480,7 +484,8 @@ class Vec128 {
|
|
// FF..FF or 0, also for floating-point - see README.
|
|
template <typename T, size_t N = 16 / sizeof(T)>
|
|
class Mask128 {
|
|
- using Raw = typename Raw128<T, N>::type;
|
|
+ // ARM C Language Extensions return and expect unsigned type.
|
|
+ using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;
|
|
|
|
public:
|
|
HWY_INLINE Mask128() {}
|
|
@@ -573,7 +578,7 @@ HWY_INLINE Vec128<int64_t, 1> BitCastFro
|
|
Vec128<uint8_t, 1 * 8> v) {
|
|
return Vec128<int64_t, 1>(vreinterpret_s64_u8(v.raw));
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double, 1> BitCastFromByte(Simd<double, 1> /* tag */,
|
|
Vec128<uint8_t, 1 * 8> v) {
|
|
return Vec128<double, 1>(vreinterpret_f64_u8(v.raw));
|
|
@@ -615,7 +620,7 @@ HWY_INLINE Vec128<int64_t> BitCastFromBy
|
|
return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
|
|
}
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double> BitCastFromByte(Full128<double> /* tag */,
|
|
Vec128<uint8_t> v) {
|
|
return Vec128<double>(vreinterpretq_f64_u8(v.raw));
|
|
@@ -664,15 +669,25 @@ template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> Undefined(Simd<T, N> /*d*/) {
|
|
HWY_DIAGNOSTICS(push)
|
|
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
|
|
- typename Raw128<T, N>::type a;
|
|
+ typename detail::Raw128<T, N>::type a;
|
|
return Vec128<T, N>(a);
|
|
HWY_DIAGNOSTICS(pop)
|
|
}
|
|
|
|
-// ------------------------------ Extract lane
|
|
+// Returns a vector with lane i=[0, N) set to "first" + i.
|
|
+template <typename T, size_t N, typename T2>
|
|
+Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
|
|
+ HWY_ALIGN T lanes[16 / sizeof(T)];
|
|
+ for (size_t i = 0; i < 16 / sizeof(T); ++i) {
|
|
+ lanes[i] = static_cast<T>(first + static_cast<T2>(i));
|
|
+ }
|
|
+ return Load(d, lanes);
|
|
+}
|
|
+
|
|
+// ------------------------------ GetLane
|
|
|
|
HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, 16> v) {
|
|
- return vget_lane_u8(vget_low_u8(v.raw), 0);
|
|
+ return vgetq_lane_u8(v.raw, 0);
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, N> v) {
|
|
@@ -680,7 +695,7 @@ HWY_INLINE uint8_t GetLane(const Vec128<
|
|
}
|
|
|
|
HWY_INLINE int8_t GetLane(const Vec128<int8_t, 16> v) {
|
|
- return vget_lane_s8(vget_low_s8(v.raw), 0);
|
|
+ return vgetq_lane_s8(v.raw, 0);
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE int8_t GetLane(const Vec128<int8_t, N> v) {
|
|
@@ -688,7 +703,7 @@ HWY_INLINE int8_t GetLane(const Vec128<i
|
|
}
|
|
|
|
HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, 8> v) {
|
|
- return vget_lane_u16(vget_low_u16(v.raw), 0);
|
|
+ return vgetq_lane_u16(v.raw, 0);
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, N> v) {
|
|
@@ -696,7 +711,7 @@ HWY_INLINE uint16_t GetLane(const Vec128
|
|
}
|
|
|
|
HWY_INLINE int16_t GetLane(const Vec128<int16_t, 8> v) {
|
|
- return vget_lane_s16(vget_low_s16(v.raw), 0);
|
|
+ return vgetq_lane_s16(v.raw, 0);
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE int16_t GetLane(const Vec128<int16_t, N> v) {
|
|
@@ -704,7 +719,7 @@ HWY_INLINE int16_t GetLane(const Vec128<
|
|
}
|
|
|
|
HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, 4> v) {
|
|
- return vget_lane_u32(vget_low_u32(v.raw), 0);
|
|
+ return vgetq_lane_u32(v.raw, 0);
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, N> v) {
|
|
@@ -712,7 +727,7 @@ HWY_INLINE uint32_t GetLane(const Vec128
|
|
}
|
|
|
|
HWY_INLINE int32_t GetLane(const Vec128<int32_t, 4> v) {
|
|
- return vget_lane_s32(vget_low_s32(v.raw), 0);
|
|
+ return vgetq_lane_s32(v.raw, 0);
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE int32_t GetLane(const Vec128<int32_t, N> v) {
|
|
@@ -720,20 +735,20 @@ HWY_INLINE int32_t GetLane(const Vec128<
|
|
}
|
|
|
|
HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 2> v) {
|
|
- return vget_lane_u64(vget_low_u64(v.raw), 0);
|
|
+ return vgetq_lane_u64(v.raw, 0);
|
|
}
|
|
HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 1> v) {
|
|
return vget_lane_u64(v.raw, 0);
|
|
}
|
|
HWY_INLINE int64_t GetLane(const Vec128<int64_t, 2> v) {
|
|
- return vget_lane_s64(vget_low_s64(v.raw), 0);
|
|
+ return vgetq_lane_s64(v.raw, 0);
|
|
}
|
|
HWY_INLINE int64_t GetLane(const Vec128<int64_t, 1> v) {
|
|
return vget_lane_s64(v.raw, 0);
|
|
}
|
|
|
|
HWY_INLINE float GetLane(const Vec128<float, 4> v) {
|
|
- return vget_lane_f32(vget_low_f32(v.raw), 0);
|
|
+ return vgetq_lane_f32(v.raw, 0);
|
|
}
|
|
HWY_INLINE float GetLane(const Vec128<float, 2> v) {
|
|
return vget_lane_f32(v.raw, 0);
|
|
@@ -741,9 +756,9 @@ HWY_INLINE float GetLane(const Vec128<fl
|
|
HWY_INLINE float GetLane(const Vec128<float, 1> v) {
|
|
return vget_lane_f32(v.raw, 0);
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE double GetLane(const Vec128<double, 2> v) {
|
|
- return vget_lane_f64(vget_low_f64(v.raw), 0);
|
|
+ return vgetq_lane_f64(v.raw, 0);
|
|
}
|
|
HWY_INLINE double GetLane(const Vec128<double, 1> v) {
|
|
return vget_lane_f64(v.raw, 0);
|
|
@@ -785,8 +800,6 @@ HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSu
|
|
// ------------------------------ Average
|
|
|
|
// Returns (a + b + 1) / 2
|
|
-
|
|
-// Unsigned
|
|
HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
|
|
HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)
|
|
|
|
@@ -802,6 +815,7 @@ HWY_INLINE Vec128<int16_t> Abs(const Vec
|
|
HWY_INLINE Vec128<int32_t> Abs(const Vec128<int32_t> v) {
|
|
return Vec128<int32_t>(vabsq_s32(v.raw));
|
|
}
|
|
+// i64 is implemented after BroadcastSignBit.
|
|
HWY_INLINE Vec128<float> Abs(const Vec128<float> v) {
|
|
return Vec128<float>(vabsq_f32(v.raw));
|
|
}
|
|
@@ -823,7 +837,7 @@ HWY_INLINE Vec128<float, N> Abs(const Ve
|
|
return Vec128<float, N>(vabs_f32(v.raw));
|
|
}
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double> Abs(const Vec128<double> v) {
|
|
return Vec128<double>(vabsq_f64(v.raw));
|
|
}
|
|
@@ -839,7 +853,7 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vn
|
|
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below
|
|
|
|
HWY_INLINE Vec128<int64_t, 1> Neg(const Vec128<int64_t, 1> v) {
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return Vec128<int64_t, 1>(vneg_s64(v.raw));
|
|
#else
|
|
return Zero(Simd<int64_t, 1>()) - v;
|
|
@@ -847,7 +861,7 @@ HWY_INLINE Vec128<int64_t, 1> Neg(const
|
|
}
|
|
|
|
HWY_INLINE Vec128<int64_t> Neg(const Vec128<int64_t> v) {
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return Vec128<int64_t>(vnegq_s64(v.raw));
|
|
#else
|
|
return Zero(Full128<int64_t>()) - v;
|
|
@@ -876,6 +890,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, v
|
|
|
|
// ------------------------------ Shl
|
|
|
|
+HWY_INLINE Vec128<uint8_t> operator<<(const Vec128<uint8_t> v,
|
|
+ const Vec128<uint8_t> bits) {
|
|
+ return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
|
|
+}
|
|
+template <size_t N, HWY_IF_LE64(uint8_t, N)>
|
|
+HWY_INLINE Vec128<uint8_t, N> operator<<(const Vec128<uint8_t, N> v,
|
|
+ const Vec128<uint8_t, N> bits) {
|
|
+ return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
|
|
+}
|
|
+
|
|
HWY_INLINE Vec128<uint16_t> operator<<(const Vec128<uint16_t> v,
|
|
const Vec128<uint16_t> bits) {
|
|
return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
|
|
@@ -905,6 +929,16 @@ HWY_INLINE Vec128<uint64_t, 1> operator<
|
|
return Vec128<uint64_t, 1>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
|
|
}
|
|
|
|
+HWY_INLINE Vec128<int8_t> operator<<(const Vec128<int8_t> v,
|
|
+ const Vec128<int8_t> bits) {
|
|
+ return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
|
|
+}
|
|
+template <size_t N, HWY_IF_LE64(int8_t, N)>
|
|
+HWY_INLINE Vec128<int8_t, N> operator<<(const Vec128<int8_t, N> v,
|
|
+ const Vec128<int8_t, N> bits) {
|
|
+ return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
|
|
+}
|
|
+
|
|
HWY_INLINE Vec128<int16_t> operator<<(const Vec128<int16_t> v,
|
|
const Vec128<int16_t> bits) {
|
|
return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
|
|
@@ -936,6 +970,18 @@ HWY_INLINE Vec128<int64_t, 1> operator<<
|
|
|
|
// ------------------------------ Shr (Neg)
|
|
|
|
+HWY_INLINE Vec128<uint8_t> operator>>(const Vec128<uint8_t> v,
|
|
+ const Vec128<uint8_t> bits) {
|
|
+ const int8x16_t neg_bits = Neg(BitCast(Full128<int8_t>(), bits)).raw;
|
|
+ return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
|
|
+}
|
|
+template <size_t N, HWY_IF_LE64(uint8_t, N)>
|
|
+HWY_INLINE Vec128<uint8_t, N> operator>>(const Vec128<uint8_t, N> v,
|
|
+ const Vec128<uint8_t, N> bits) {
|
|
+ const int8x8_t neg_bits = Neg(BitCast(Simd<int8_t, N>(), bits)).raw;
|
|
+ return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
|
|
+}
|
|
+
|
|
HWY_INLINE Vec128<uint16_t> operator>>(const Vec128<uint16_t> v,
|
|
const Vec128<uint16_t> bits) {
|
|
const int16x8_t neg_bits = Neg(BitCast(Full128<int16_t>(), bits)).raw;
|
|
@@ -971,6 +1017,16 @@ HWY_INLINE Vec128<uint64_t, 1> operator>
|
|
return Vec128<uint64_t, 1>(vshl_u64(v.raw, neg_bits));
|
|
}
|
|
|
|
+HWY_INLINE Vec128<int8_t> operator>>(const Vec128<int8_t> v,
|
|
+ const Vec128<int8_t> bits) {
|
|
+ return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
|
|
+}
|
|
+template <size_t N, HWY_IF_LE64(int8_t, N)>
|
|
+HWY_INLINE Vec128<int8_t, N> operator>>(const Vec128<int8_t, N> v,
|
|
+ const Vec128<int8_t, N> bits) {
|
|
+ return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
|
|
+}
|
|
+
|
|
HWY_INLINE Vec128<int16_t> operator>>(const Vec128<int16_t> v,
|
|
const Vec128<int16_t> bits) {
|
|
return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
|
|
@@ -1059,7 +1115,7 @@ HWY_INLINE Vec128<int32_t, N> operator*(
|
|
HWY_INLINE Vec128<int16_t> MulHigh(const Vec128<int16_t> a,
|
|
const Vec128<int16_t> b) {
|
|
int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
|
|
#else
|
|
int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
|
|
@@ -1070,7 +1126,7 @@ HWY_INLINE Vec128<int16_t> MulHigh(const
|
|
HWY_INLINE Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a,
|
|
const Vec128<uint16_t> b) {
|
|
uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
|
|
#else
|
|
uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
|
|
@@ -1139,24 +1195,37 @@ HWY_INLINE Vec128<float, N> ApproximateR
|
|
return Vec128<float, N>(vrecpe_f32(v.raw));
|
|
}
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
|
|
#else
|
|
-// Emulated with approx reciprocal + Newton-Raphson + mul
|
|
+// Not defined on armv7: approximate
|
|
+namespace detail {
|
|
+
|
|
+HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep(
|
|
+ const Vec128<float> recip, const Vec128<float> divisor) {
|
|
+ return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
|
|
+}
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep(
|
|
+ const Vec128<float, N> recip, Vec128<float, N> divisor) {
|
|
+ return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
|
|
+}
|
|
+
|
|
+} // namespace detail
|
|
+
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<float, N> operator/(const Vec128<float, N> a,
|
|
const Vec128<float, N> b) {
|
|
auto x = ApproximateReciprocal(b);
|
|
- // Newton-Raphson on 1/x - b
|
|
- const auto two = Set(Simd<float, N>(), 2);
|
|
- x = x * (two - b * x);
|
|
- x = x * (two - b * x);
|
|
- x = x * (two - b * x);
|
|
+ x *= detail::ReciprocalNewtonRaphsonStep(x, b);
|
|
+ x *= detail::ReciprocalNewtonRaphsonStep(x, b);
|
|
+ x *= detail::ReciprocalNewtonRaphsonStep(x, b);
|
|
return a * x;
|
|
}
|
|
#endif
|
|
|
|
-// Absolute value of difference.
|
|
+// ------------------------------ Absolute value of difference.
|
|
+
|
|
HWY_INLINE Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) {
|
|
return Vec128<float>(vabdq_f32(a.raw, b.raw));
|
|
}
|
|
@@ -1169,7 +1238,7 @@ HWY_INLINE Vec128<float, N> AbsDiff(cons
|
|
// ------------------------------ Floating-point multiply-add variants
|
|
|
|
// Returns add + mul * x
|
|
-#if defined(__aarch64__)
|
|
+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
|
|
template <size_t N, HWY_IF_LE64(float, N)>
|
|
HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
|
|
const Vec128<float, N> x,
|
|
@@ -1180,6 +1249,17 @@ HWY_INLINE Vec128<float> MulAdd(const Ve
|
|
const Vec128<float> add) {
|
|
return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
|
|
}
|
|
+#else
|
|
+// Emulate FMA for floats.
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
|
|
+ const Vec128<float, N> x,
|
|
+ const Vec128<float, N> add) {
|
|
+ return mul * x + add;
|
|
+}
|
|
+#endif
|
|
+
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double, 1> MulAdd(const Vec128<double, 1> mul,
|
|
const Vec128<double, 1> x,
|
|
const Vec128<double, 1> add) {
|
|
@@ -1190,18 +1270,10 @@ HWY_INLINE Vec128<double> MulAdd(const V
|
|
const Vec128<double> add) {
|
|
return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
|
|
}
|
|
-#else
|
|
-// Emulate FMA for floats.
|
|
-template <size_t N>
|
|
-HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
|
|
- const Vec128<float, N> x,
|
|
- const Vec128<float, N> add) {
|
|
- return mul * x + add;
|
|
-}
|
|
#endif
|
|
|
|
// Returns add - mul * x
|
|
-#if defined(__aarch64__)
|
|
+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
|
|
template <size_t N, HWY_IF_LE64(float, N)>
|
|
HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
|
|
const Vec128<float, N> x,
|
|
@@ -1213,7 +1285,17 @@ HWY_INLINE Vec128<float> NegMulAdd(const
|
|
const Vec128<float> add) {
|
|
return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
|
|
}
|
|
+#else
|
|
+// Emulate FMA for floats.
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
|
|
+ const Vec128<float, N> x,
|
|
+ const Vec128<float, N> add) {
|
|
+ return add - mul * x;
|
|
+}
|
|
+#endif
|
|
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double, 1> NegMulAdd(const Vec128<double, 1> mul,
|
|
const Vec128<double, 1> x,
|
|
const Vec128<double, 1> add) {
|
|
@@ -1224,14 +1306,6 @@ HWY_INLINE Vec128<double> NegMulAdd(cons
|
|
const Vec128<double> add) {
|
|
return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
|
|
}
|
|
-#else
|
|
-// Emulate FMA for floats.
|
|
-template <size_t N>
|
|
-HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
|
|
- const Vec128<float, N> x,
|
|
- const Vec128<float, N> add) {
|
|
- return add - mul * x;
|
|
-}
|
|
#endif
|
|
|
|
// Returns mul * x - sub
|
|
@@ -1241,12 +1315,6 @@ HWY_INLINE Vec128<float, N> MulSub(const
|
|
const Vec128<float, N> sub) {
|
|
return MulAdd(mul, x, Neg(sub));
|
|
}
|
|
-template <size_t N>
|
|
-HWY_INLINE Vec128<double, N> MulSub(const Vec128<double, N> mul,
|
|
- const Vec128<double, N> x,
|
|
- const Vec128<double, N> sub) {
|
|
- return MulAdd(mul, x, Neg(sub));
|
|
-}
|
|
|
|
// Returns -mul * x - sub
|
|
template <size_t N>
|
|
@@ -1255,14 +1323,23 @@ HWY_INLINE Vec128<float, N> NegMulSub(co
|
|
const Vec128<float, N> sub) {
|
|
return Neg(MulAdd(mul, x, sub));
|
|
}
|
|
+
|
|
+#if HWY_ARCH_ARM_A64
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<double, N> MulSub(const Vec128<double, N> mul,
|
|
+ const Vec128<double, N> x,
|
|
+ const Vec128<double, N> sub) {
|
|
+ return MulAdd(mul, x, Neg(sub));
|
|
+}
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
|
|
const Vec128<double, N> x,
|
|
const Vec128<double, N> sub) {
|
|
return Neg(MulAdd(mul, x, sub));
|
|
}
|
|
+#endif
|
|
|
|
-// ------------------------------ Floating-point square root
|
|
+// ------------------------------ Floating-point square root (IfThenZeroElse)
|
|
|
|
// Approximate reciprocal square root
|
|
HWY_INLINE Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) {
|
|
@@ -1275,80 +1352,36 @@ HWY_INLINE Vec128<float, N> ApproximateR
|
|
}
|
|
|
|
// Full precision square root
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
|
|
#else
|
|
-// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt.
|
|
-template <size_t N>
|
|
-HWY_INLINE Vec128<float, N> Sqrt(const Vec128<float, N> v) {
|
|
- auto b = v;
|
|
- auto Y = ApproximateReciprocalSqrt(v);
|
|
- auto x = v * Y;
|
|
- const auto half = Set(Simd<float, N>(), 0.5);
|
|
- const auto oneandhalf = Set(Simd<float, N>(), 1.5);
|
|
- for (size_t i = 0; i < 3; i++) {
|
|
- b = b * Y * Y;
|
|
- Y = oneandhalf - half * b;
|
|
- x = x * Y;
|
|
- }
|
|
- return IfThenZeroElse(v == Zero(Simd<float, N>()), x);
|
|
-}
|
|
-#endif
|
|
-
|
|
-// ================================================== COMPARE
|
|
-
|
|
-// Comparisons fill a lane with 1-bits if the condition is true, else 0.
|
|
+namespace detail {
|
|
|
|
-template <typename TFrom, typename TTo, size_t N>
|
|
-HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
|
|
- static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
|
|
- return Mask128<TTo, N>{m.raw};
|
|
+HWY_INLINE Vec128<float> ReciprocalSqrtStep(const Vec128<float> root,
|
|
+ const Vec128<float> recip) {
|
|
+ return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
|
|
+}
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(const Vec128<float, N> root,
|
|
+ Vec128<float, N> recip) {
|
|
+ return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
|
|
}
|
|
|
|
-#define HWY_NEON_BUILD_TPL_HWY_COMPARE
|
|
-#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size>
|
|
-#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
|
|
- const Vec128<type, size> a, const Vec128<type, size> b
|
|
-#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
|
|
-
|
|
-// ------------------------------ Equality
|
|
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
|
|
-#if defined(__aarch64__)
|
|
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
|
|
-#else
|
|
-// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
|
|
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
|
|
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
|
|
-#endif
|
|
+} // namespace detail
|
|
|
|
-// ------------------------------ Strict inequality
|
|
+// Not defined on armv7: approximate
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<float, N> Sqrt(const Vec128<float, N> v) {
|
|
+ auto recip = ApproximateReciprocalSqrt(v);
|
|
|
|
-// Signed/float < (no unsigned)
|
|
-#if defined(__aarch64__)
|
|
-HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE)
|
|
-#else
|
|
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
|
|
-#endif
|
|
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
|
|
+ recip *= detail::ReciprocalSqrtStep(v * recip, recip);
|
|
+ recip *= detail::ReciprocalSqrtStep(v * recip, recip);
|
|
+ recip *= detail::ReciprocalSqrtStep(v * recip, recip);
|
|
|
|
-// Signed/float > (no unsigned)
|
|
-#if defined(__aarch64__)
|
|
-HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE)
|
|
-#else
|
|
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE)
|
|
+ const auto root = v * recip;
|
|
+ return IfThenZeroElse(v == Zero(Simd<float, N>()), root);
|
|
+}
|
|
#endif
|
|
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE)
|
|
-
|
|
-// ------------------------------ Weak inequality
|
|
-
|
|
-// Float <= >=
|
|
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
|
|
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE)
|
|
-
|
|
-#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
|
|
-#undef HWY_NEON_BUILD_RET_HWY_COMPARE
|
|
-#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
|
|
-#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
|
|
|
|
// ================================================== LOGICAL
|
|
|
|
@@ -1357,13 +1390,16 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operato
|
|
// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
|
|
template <typename T>
|
|
HWY_INLINE Vec128<T> Not(const Vec128<T> v) {
|
|
- const Full128<uint8_t> d8;
|
|
- return Vec128<T>(vmvnq_u8(BitCast(d8, v).raw));
|
|
+ const Full128<T> d;
|
|
+ const Repartition<uint8_t, decltype(d)> d8;
|
|
+ return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
|
|
}
|
|
template <typename T, size_t N, HWY_IF_LE64(T, N)>
|
|
HWY_INLINE Vec128<T, N> Not(const Vec128<T, N> v) {
|
|
- const Repartition<uint8_t, Simd<T, N>> d8;
|
|
- return Vec128<T, N>(vmvn_u8(BitCast(d8, v).raw));
|
|
+ const Simd<T, N> d;
|
|
+ const Repartition<uint8_t, decltype(d)> d8;
|
|
+ using V8 = decltype(Zero(d8));
|
|
+ return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
|
|
}
|
|
|
|
// ------------------------------ And
|
|
@@ -1463,33 +1499,38 @@ HWY_API Vec128<T, N> BroadcastSignBit(co
|
|
return ShiftRight<sizeof(T) * 8 - 1>(v);
|
|
}
|
|
|
|
-// ------------------------------ Make mask
|
|
+// ================================================== MASK
|
|
|
|
-template <typename T, size_t N>
|
|
-HWY_INLINE Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
|
|
- static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
|
|
- return (v & bit) == bit;
|
|
-}
|
|
+// ------------------------------ To/from vector
|
|
|
|
-// Mask and Vec are the same (true = FF..FF).
|
|
+// Mask and Vec have the same representation (true = FF..FF).
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
|
|
- return Mask128<T, N>(v.raw);
|
|
+ const Simd<MakeUnsigned<T>, N> du;
|
|
+ return Mask128<T, N>(BitCast(du, v).raw);
|
|
}
|
|
|
|
+// DEPRECATED
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
|
|
- return Vec128<T, N>(v.raw);
|
|
+ return BitCast(Simd<T, N>(), Vec128<MakeUnsigned<T>, N>(v.raw));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
-HWY_INLINE Vec128<T, N> VecFromMask(Simd<T, N> /* tag */,
|
|
- const Mask128<T, N> v) {
|
|
- return Vec128<T, N>(v.raw);
|
|
+HWY_INLINE Vec128<T, N> VecFromMask(Simd<T, N> d, const Mask128<T, N> v) {
|
|
+ return BitCast(d, Vec128<MakeUnsigned<T>, N>(v.raw));
|
|
}
|
|
|
|
-// IfThenElse(mask, yes, no)
|
|
-// Returns mask ? b : a.
|
|
+// ------------------------------ RebindMask
|
|
+
|
|
+template <typename TFrom, typename TTo, size_t N>
|
|
+HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> dto, Mask128<TFrom, N> m) {
|
|
+ static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
|
|
+ return MaskFromVec(BitCast(dto, VecFromMask(Simd<TFrom, N>(), m)));
|
|
+}
|
|
+
|
|
+// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a.
|
|
+
|
|
#define HWY_NEON_BUILD_TPL_HWY_IF
|
|
#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type, size>
|
|
#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
|
|
@@ -1524,7 +1565,6 @@ HWY_INLINE Vec128<T, N> ZeroIfNegative(V
|
|
return Max(zero, v);
|
|
}
|
|
|
|
-
|
|
// ------------------------------ Mask logical
|
|
|
|
template <typename T, size_t N>
|
|
@@ -1557,30 +1597,183 @@ HWY_API Mask128<T, N> Xor(const Mask128<
|
|
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
|
|
}
|
|
|
|
-// ------------------------------ Min (IfThenElse, BroadcastSignBit)
|
|
+// ================================================== COMPARE
|
|
|
|
-namespace detail {
|
|
+// Comparisons fill a lane with 1-bits if the condition is true, else 0.
|
|
|
|
-#if defined(__aarch64__)
|
|
+// ------------------------------ Shuffle2301 (for i64 compares)
|
|
|
|
-HWY_INLINE Vec128<uint64_t> Gt(Vec128<uint64_t> a, Vec128<uint64_t> b) {
|
|
- return Vec128<uint64_t>(vcgtq_u64(a.raw, b.raw));
|
|
+// Swap 32-bit halves in 64-bits
|
|
+HWY_INLINE Vec128<uint32_t, 2> Shuffle2301(const Vec128<uint32_t, 2> v) {
|
|
+ return Vec128<uint32_t, 2>(vrev64_u32(v.raw));
|
|
+}
|
|
+HWY_INLINE Vec128<int32_t, 2> Shuffle2301(const Vec128<int32_t, 2> v) {
|
|
+ return Vec128<int32_t, 2>(vrev64_s32(v.raw));
|
|
+}
|
|
+HWY_INLINE Vec128<float, 2> Shuffle2301(const Vec128<float, 2> v) {
|
|
+ return Vec128<float, 2>(vrev64_f32(v.raw));
|
|
+}
|
|
+HWY_INLINE Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
|
|
+ return Vec128<uint32_t>(vrev64q_u32(v.raw));
|
|
}
|
|
-HWY_INLINE Vec128<uint64_t, 1> Gt(Vec128<uint64_t, 1> a,
|
|
- Vec128<uint64_t, 1> b) {
|
|
- return Vec128<uint64_t, 1>(vcgt_u64(a.raw, b.raw));
|
|
+HWY_INLINE Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
|
|
+ return Vec128<int32_t>(vrev64q_s32(v.raw));
|
|
+}
|
|
+HWY_INLINE Vec128<float> Shuffle2301(const Vec128<float> v) {
|
|
+ return Vec128<float>(vrev64q_f32(v.raw));
|
|
}
|
|
|
|
-HWY_INLINE Vec128<int64_t> Gt(Vec128<int64_t> a, Vec128<int64_t> b) {
|
|
- return Vec128<int64_t>(vcgtq_s64(a.raw, b.raw));
|
|
+#define HWY_NEON_BUILD_TPL_HWY_COMPARE
|
|
+#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size>
|
|
+#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
|
|
+ const Vec128<type, size> a, const Vec128<type, size> b
|
|
+#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
|
|
+
|
|
+// ------------------------------ Equality
|
|
+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
|
|
+#if HWY_ARCH_ARM_A64
|
|
+HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
|
|
+#else
|
|
+// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
|
|
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
|
|
+HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
|
|
+#endif
|
|
+
|
|
+// ------------------------------ Strict inequality (signed, float)
|
|
+#if HWY_ARCH_ARM_A64
|
|
+HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE)
|
|
+#else
|
|
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
|
|
+#endif
|
|
+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
|
|
+
|
|
+// ------------------------------ Weak inequality (float)
|
|
+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
|
|
+
|
|
+#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
|
|
+#undef HWY_NEON_BUILD_RET_HWY_COMPARE
|
|
+#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
|
|
+#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
|
|
+
|
|
+// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq)
|
|
+
|
|
+#if HWY_ARCH_ARM_V7
|
|
+
|
|
+template <size_t N>
|
|
+HWY_INLINE Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
|
|
+ const Vec128<int64_t, N> b) {
|
|
+ const Simd<int32_t, N * 2> d32;
|
|
+ const Simd<int64_t, N> d64;
|
|
+ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
|
|
+ const auto cmp64 = cmp32 & Shuffle2301(cmp32);
|
|
+ return MaskFromVec(BitCast(d64, cmp64));
|
|
+}
|
|
+
|
|
+template <size_t N>
|
|
+HWY_INLINE Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
|
|
+ const Vec128<uint64_t, N> b) {
|
|
+ const Simd<uint32_t, N * 2> d32;
|
|
+ const Simd<uint64_t, N> d64;
|
|
+ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
|
|
+ const auto cmp64 = cmp32 & Shuffle2301(cmp32);
|
|
+ return MaskFromVec(BitCast(d64, cmp64));
|
|
}
|
|
-HWY_INLINE Vec128<int64_t, 1> Gt(Vec128<int64_t, 1> a, Vec128<int64_t, 1> b) {
|
|
- return Vec128<int64_t, 1>(vcgt_s64(a.raw, b.raw));
|
|
+
|
|
+HWY_INLINE Mask128<int64_t> operator<(const Vec128<int64_t> a,
|
|
+ const Vec128<int64_t> b) {
|
|
+ const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
|
|
+ return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
|
|
+}
|
|
+HWY_INLINE Mask128<int64_t, 1> operator<(const Vec128<int64_t, 1> a,
|
|
+ const Vec128<int64_t, 1> b) {
|
|
+ const int64x1_t sub = vqsub_s64(a.raw, b.raw);
|
|
+ return MaskFromVec(BroadcastSignBit(Vec128<int64_t, 1>(sub)));
|
|
}
|
|
|
|
#endif
|
|
|
|
-} // namespace detail
|
|
+// ------------------------------ Reversed comparisons
|
|
+
|
|
+template <typename T, size_t N>
|
|
+HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
|
|
+ return operator<(b, a);
|
|
+}
|
|
+template <typename T, size_t N>
|
|
+HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
|
|
+ return operator<=(b, a);
|
|
+}
|
|
+
|
|
+// ------------------------------ FirstN (Iota, Lt)
|
|
+
|
|
+template <typename T, size_t N>
|
|
+HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
|
|
+ const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
|
|
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
|
|
+}
|
|
+
|
|
+// ------------------------------ TestBit (Eq)
|
|
+
|
|
+#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
|
|
+#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type, size>
|
|
+#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
|
|
+ Vec128<type, size> v, Vec128<type, size> bit
|
|
+#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
|
|
+
|
|
+#if HWY_ARCH_ARM_A64
|
|
+HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
|
|
+#else
|
|
+// No 64-bit versions on armv7
|
|
+HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
|
|
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
|
|
+
|
|
+template <size_t N>
|
|
+HWY_INLINE Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
|
|
+ Vec128<uint64_t, N> bit) {
|
|
+ return (v & bit) == bit;
|
|
+}
|
|
+template <size_t N>
|
|
+HWY_INLINE Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
|
|
+ Vec128<int64_t, N> bit) {
|
|
+ return (v & bit) == bit;
|
|
+}
|
|
+
|
|
+#endif
|
|
+#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
|
|
+#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
|
|
+#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
|
|
+#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
|
|
+
|
|
+// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
|
|
+HWY_INLINE Vec128<int64_t> Abs(const Vec128<int64_t> v) {
|
|
+#if HWY_ARCH_ARM_A64
|
|
+ return Vec128<int64_t>(vabsq_s64(v.raw));
|
|
+#else
|
|
+ const auto zero = Zero(Full128<int64_t>());
|
|
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
|
|
+#endif
|
|
+}
|
|
+HWY_INLINE Vec128<int64_t, 1> Abs(const Vec128<int64_t, 1> v) {
|
|
+#if HWY_ARCH_ARM_A64
|
|
+ return Vec128<int64_t, 1>(vabs_s64(v.raw));
|
|
+#else
|
|
+ const auto zero = Zero(Simd<int64_t, 1>());
|
|
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
|
|
+#endif
|
|
+}
|
|
+
|
|
+// ------------------------------ Min (IfThenElse, BroadcastSignBit)
|
|
+
|
|
+#if HWY_ARCH_ARM_A64
|
|
+
|
|
+HWY_INLINE Mask128<uint64_t> operator<(Vec128<uint64_t> a, Vec128<uint64_t> b) {
|
|
+ return Mask128<uint64_t>(vcltq_u64(a.raw, b.raw));
|
|
+}
|
|
+HWY_INLINE Mask128<uint64_t, 1> operator<(Vec128<uint64_t, 1> a,
|
|
+ Vec128<uint64_t, 1> b) {
|
|
+ return Mask128<uint64_t, 1>(vclt_u64(a.raw, b.raw));
|
|
+}
|
|
+
|
|
+#endif
|
|
|
|
// Unsigned
|
|
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
|
|
@@ -1588,8 +1781,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min,
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
|
|
const Vec128<uint64_t, N> b) {
|
|
-#if defined(__aarch64__)
|
|
- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a);
|
|
+#if HWY_ARCH_ARM_A64
|
|
+ return IfThenElse(b < a, b, a);
|
|
#else
|
|
const Simd<uint64_t, N> du;
|
|
const Simd<int64_t, N> di;
|
|
@@ -1603,8 +1796,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, v
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
|
|
const Vec128<int64_t, N> b) {
|
|
-#if defined(__aarch64__)
|
|
- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a);
|
|
+#if HWY_ARCH_ARM_A64
|
|
+ return IfThenElse(b < a, b, a);
|
|
#else
|
|
const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
|
|
return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
|
|
@@ -1612,7 +1805,7 @@ HWY_INLINE Vec128<int64_t, N> Min(const
|
|
}
|
|
|
|
// Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN.
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2)
|
|
#else
|
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
|
|
@@ -1626,8 +1819,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max,
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
|
|
const Vec128<uint64_t, N> b) {
|
|
-#if defined(__aarch64__)
|
|
- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b);
|
|
+#if HWY_ARCH_ARM_A64
|
|
+ return IfThenElse(b < a, a, b);
|
|
#else
|
|
const Simd<uint64_t, N> du;
|
|
const Simd<int64_t, N> di;
|
|
@@ -1641,8 +1834,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, v
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
|
|
const Vec128<int64_t, N> b) {
|
|
-#if defined(__aarch64__)
|
|
- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b);
|
|
+#if HWY_ARCH_ARM_A64
|
|
+ return IfThenElse(b < a, a, b);
|
|
#else
|
|
const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
|
|
return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
|
|
@@ -1650,7 +1843,7 @@ HWY_INLINE Vec128<int64_t, N> Max(const
|
|
}
|
|
|
|
// Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN.
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2)
|
|
#else
|
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
|
|
@@ -1696,7 +1889,7 @@ HWY_INLINE Vec128<float> LoadU(Full128<f
|
|
const float* HWY_RESTRICT aligned) {
|
|
return Vec128<float>(vld1q_f32(aligned));
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double> LoadU(Full128<double> /* tag */,
|
|
const double* HWY_RESTRICT aligned) {
|
|
return Vec128<double>(vld1q_f64(aligned));
|
|
@@ -1741,7 +1934,7 @@ HWY_INLINE Vec128<float, 2> LoadU(Simd<f
|
|
const float* HWY_RESTRICT p) {
|
|
return Vec128<float, 2>(vld1_f32(p));
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double, 1> LoadU(Simd<double, 1> /* tag */,
|
|
const double* HWY_RESTRICT p) {
|
|
return Vec128<double, 1>(vld1_f64(p));
|
|
@@ -1755,73 +1948,72 @@ HWY_INLINE Vec128<double, 1> LoadU(Simd<
|
|
// we don't actually care what is in it, and we don't want
|
|
// to introduce extra overhead by initializing it to something.
|
|
|
|
-HWY_INLINE Vec128<uint8_t, 4> LoadU(Simd<uint8_t, 4> d,
|
|
+HWY_INLINE Vec128<uint8_t, 4> LoadU(Simd<uint8_t, 4> /*tag*/,
|
|
const uint8_t* HWY_RESTRICT p) {
|
|
- uint32x2_t a = Undefined(d).raw;
|
|
+ uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
|
|
uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
|
|
return Vec128<uint8_t, 4>(vreinterpret_u8_u32(b));
|
|
}
|
|
-HWY_INLINE Vec128<uint16_t, 2> LoadU(Simd<uint16_t, 2> d,
|
|
+HWY_INLINE Vec128<uint16_t, 2> LoadU(Simd<uint16_t, 2> /*tag*/,
|
|
const uint16_t* HWY_RESTRICT p) {
|
|
- uint32x2_t a = Undefined(d).raw;
|
|
+ uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
|
|
uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
|
|
return Vec128<uint16_t, 2>(vreinterpret_u16_u32(b));
|
|
}
|
|
-HWY_INLINE Vec128<uint32_t, 1> LoadU(Simd<uint32_t, 1> d,
|
|
+HWY_INLINE Vec128<uint32_t, 1> LoadU(Simd<uint32_t, 1> /*tag*/,
|
|
const uint32_t* HWY_RESTRICT p) {
|
|
- uint32x2_t a = Undefined(d).raw;
|
|
+ uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
|
|
uint32x2_t b = vld1_lane_u32(p, a, 0);
|
|
return Vec128<uint32_t, 1>(b);
|
|
}
|
|
-HWY_INLINE Vec128<int8_t, 4> LoadU(Simd<int8_t, 4> d,
|
|
+HWY_INLINE Vec128<int8_t, 4> LoadU(Simd<int8_t, 4> /*tag*/,
|
|
const int8_t* HWY_RESTRICT p) {
|
|
- int32x2_t a = Undefined(d).raw;
|
|
+ int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
|
|
int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
|
|
return Vec128<int8_t, 4>(vreinterpret_s8_s32(b));
|
|
}
|
|
-HWY_INLINE Vec128<int16_t, 2> LoadU(Simd<int16_t, 2> d,
|
|
+HWY_INLINE Vec128<int16_t, 2> LoadU(Simd<int16_t, 2> /*tag*/,
|
|
const int16_t* HWY_RESTRICT p) {
|
|
- int32x2_t a = Undefined(d).raw;
|
|
+ int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
|
|
int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
|
|
return Vec128<int16_t, 2>(vreinterpret_s16_s32(b));
|
|
}
|
|
-HWY_INLINE Vec128<int32_t, 1> LoadU(Simd<int32_t, 1> d,
|
|
+HWY_INLINE Vec128<int32_t, 1> LoadU(Simd<int32_t, 1> /*tag*/,
|
|
const int32_t* HWY_RESTRICT p) {
|
|
- int32x2_t a = Undefined(d).raw;
|
|
+ int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
|
|
int32x2_t b = vld1_lane_s32(p, a, 0);
|
|
return Vec128<int32_t, 1>(b);
|
|
}
|
|
-HWY_INLINE Vec128<float, 1> LoadU(Simd<float, 1> d,
|
|
+HWY_INLINE Vec128<float, 1> LoadU(Simd<float, 1> /*tag*/,
|
|
const float* HWY_RESTRICT p) {
|
|
- float32x2_t a = Undefined(d).raw;
|
|
+ float32x2_t a = Undefined(Simd<float, 2>()).raw;
|
|
float32x2_t b = vld1_lane_f32(p, a, 0);
|
|
return Vec128<float, 1>(b);
|
|
}
|
|
|
|
// ------------------------------ Load 16
|
|
|
|
-HWY_INLINE Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2> d,
|
|
+HWY_INLINE Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2> /*tag*/,
|
|
const uint8_t* HWY_RESTRICT p) {
|
|
- uint16x4_t a = Undefined(d).raw;
|
|
+ uint16x4_t a = Undefined(Simd<uint16_t, 4>()).raw;
|
|
uint16x4_t b = vld1_lane_u16(reinterpret_cast<const uint16_t*>(p), a, 0);
|
|
return Vec128<uint8_t, 2>(vreinterpret_u8_u16(b));
|
|
}
|
|
-HWY_INLINE Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1> d,
|
|
+HWY_INLINE Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1> /*tag*/,
|
|
const uint16_t* HWY_RESTRICT p) {
|
|
- uint16x4_t a = Undefined(d).raw;
|
|
+ uint16x4_t a = Undefined(Simd<uint16_t, 4>()).raw;
|
|
uint16x4_t b = vld1_lane_u16(p, a, 0);
|
|
return Vec128<uint16_t, 1>(b);
|
|
}
|
|
-
|
|
-HWY_INLINE Vec128<int8_t, 2> LoadU(Simd<int8_t, 2> d,
|
|
+HWY_INLINE Vec128<int8_t, 2> LoadU(Simd<int8_t, 2> /*tag*/,
|
|
const int8_t* HWY_RESTRICT p) {
|
|
- int16x4_t a = Undefined(d).raw;
|
|
+ int16x4_t a = Undefined(Simd<int16_t, 4>()).raw;
|
|
int16x4_t b = vld1_lane_s16(reinterpret_cast<const int16_t*>(p), a, 0);
|
|
return Vec128<int8_t, 2>(vreinterpret_s8_s16(b));
|
|
}
|
|
-HWY_INLINE Vec128<int16_t, 1> LoadU(Simd<int16_t, 1> d,
|
|
+HWY_INLINE Vec128<int16_t, 1> LoadU(Simd<int16_t, 1> /*tag*/,
|
|
const int16_t* HWY_RESTRICT p) {
|
|
- int16x4_t a = Undefined(d).raw;
|
|
+ int16x4_t a = Undefined(Simd<int16_t, 4>()).raw;
|
|
int16x4_t b = vld1_lane_s16(p, a, 0);
|
|
return Vec128<int16_t, 1>(b);
|
|
}
|
|
@@ -1902,7 +2094,7 @@ HWY_INLINE void StoreU(const Vec128<floa
|
|
float* HWY_RESTRICT aligned) {
|
|
vst1q_f32(aligned, v.raw);
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE void StoreU(const Vec128<double> v, Full128<double> /* tag */,
|
|
double* HWY_RESTRICT aligned) {
|
|
vst1q_f64(aligned, v.raw);
|
|
@@ -1947,7 +2139,7 @@ HWY_INLINE void StoreU(const Vec128<floa
|
|
float* HWY_RESTRICT p) {
|
|
vst1_f32(p, v.raw);
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE void StoreU(const Vec128<double, 1> v, Simd<double, 1> /* tag */,
|
|
double* HWY_RESTRICT p) {
|
|
vst1_f64(p, v.raw);
|
|
@@ -1959,12 +2151,12 @@ HWY_INLINE void StoreU(const Vec128<doub
|
|
HWY_INLINE void StoreU(const Vec128<uint8_t, 4> v, Simd<uint8_t, 4>,
|
|
uint8_t* HWY_RESTRICT p) {
|
|
uint32x2_t a = vreinterpret_u32_u8(v.raw);
|
|
- vst1_lane_u32(p, a, 0);
|
|
+ vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
|
|
}
|
|
HWY_INLINE void StoreU(const Vec128<uint16_t, 2> v, Simd<uint16_t, 2>,
|
|
uint16_t* HWY_RESTRICT p) {
|
|
uint32x2_t a = vreinterpret_u32_u16(v.raw);
|
|
- vst1_lane_u32(p, a, 0);
|
|
+ vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
|
|
}
|
|
HWY_INLINE void StoreU(const Vec128<uint32_t, 1> v, Simd<uint32_t, 1>,
|
|
uint32_t* HWY_RESTRICT p) {
|
|
@@ -1973,12 +2165,12 @@ HWY_INLINE void StoreU(const Vec128<uint
|
|
HWY_INLINE void StoreU(const Vec128<int8_t, 4> v, Simd<int8_t, 4>,
|
|
int8_t* HWY_RESTRICT p) {
|
|
int32x2_t a = vreinterpret_s32_s8(v.raw);
|
|
- vst1_lane_s32(p, a, 0);
|
|
+ vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
|
|
}
|
|
HWY_INLINE void StoreU(const Vec128<int16_t, 2> v, Simd<int16_t, 2>,
|
|
int16_t* HWY_RESTRICT p) {
|
|
int32x2_t a = vreinterpret_s32_s16(v.raw);
|
|
- vst1_lane_s32(p, a, 0);
|
|
+ vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
|
|
}
|
|
HWY_INLINE void StoreU(const Vec128<int32_t, 1> v, Simd<int32_t, 1>,
|
|
int32_t* HWY_RESTRICT p) {
|
|
@@ -1994,7 +2186,7 @@ HWY_INLINE void StoreU(const Vec128<floa
|
|
HWY_INLINE void StoreU(const Vec128<uint8_t, 2> v, Simd<uint8_t, 2>,
|
|
uint8_t* HWY_RESTRICT p) {
|
|
uint16x4_t a = vreinterpret_u16_u8(v.raw);
|
|
- vst1_lane_u16(p, a, 0);
|
|
+ vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);
|
|
}
|
|
HWY_INLINE void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1>,
|
|
uint16_t* HWY_RESTRICT p) {
|
|
@@ -2003,7 +2195,7 @@ HWY_INLINE void StoreU(const Vec128<uint
|
|
HWY_INLINE void StoreU(const Vec128<int8_t, 2> v, Simd<int8_t, 2>,
|
|
int8_t* HWY_RESTRICT p) {
|
|
int16x4_t a = vreinterpret_s16_s8(v.raw);
|
|
- vst1_lane_s16(p, a, 0);
|
|
+ vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);
|
|
}
|
|
HWY_INLINE void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1>,
|
|
int16_t* HWY_RESTRICT p) {
|
|
@@ -2068,18 +2260,18 @@ HWY_INLINE Vec128<uint64_t> PromoteTo(Fu
|
|
const Vec128<uint32_t, 2> v) {
|
|
return Vec128<uint64_t>(vmovl_u32(v.raw));
|
|
}
|
|
-HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */,
|
|
+HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> d,
|
|
const Vec128<uint8_t, 8> v) {
|
|
- return Vec128<int16_t>(vmovl_u8(v.raw));
|
|
+ return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
|
|
}
|
|
-HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
|
|
+HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> d,
|
|
const Vec128<uint8_t, 4> v) {
|
|
uint16x8_t a = vmovl_u8(v.raw);
|
|
- return Vec128<int32_t>(vreinterpretq_s32_u16(vmovl_u16(vget_low_u16(a))));
|
|
+ return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
|
|
}
|
|
-HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
|
|
+HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> d,
|
|
const Vec128<uint16_t, 4> v) {
|
|
- return Vec128<int32_t>(vmovl_u16(v.raw));
|
|
+ return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
|
|
}
|
|
|
|
// Unsigned: zero-extend to half vector.
|
|
@@ -2105,9 +2297,9 @@ HWY_INLINE Vec128<uint64_t, N> PromoteTo
|
|
return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(int16_t, N)>
|
|
-HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
|
|
+HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> d,
|
|
const Vec128<uint8_t, N> v) {
|
|
- return Vec128<int16_t, N>(vget_low_s16(vmovl_u8(v.raw)));
|
|
+ return BitCast(d, Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(int32_t, N)>
|
|
HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
|
|
@@ -2170,12 +2362,14 @@ HWY_INLINE Vec128<int64_t, N> PromoteTo(
|
|
|
|
HWY_INLINE Vec128<float> PromoteTo(Full128<float> /* tag */,
|
|
const Vec128<float16_t, 4> v) {
|
|
- return Vec128<float>(vcvt_f32_f16(v.raw));
|
|
+ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
|
|
+ return Vec128<float>(f32);
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<float, N> PromoteTo(Simd<float, N> /* tag */,
|
|
const Vec128<float16_t, N> v) {
|
|
- return Vec128<float, N>(vget_low_f32(vcvt_f32_f16(v.raw)));
|
|
+ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
|
|
+ return Vec128<float, N>(vget_low_f32(f32));
|
|
}
|
|
|
|
#else
|
|
@@ -2204,7 +2398,7 @@ HWY_INLINE Vec128<float, N> PromoteTo(Si
|
|
|
|
#endif
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
|
|
HWY_INLINE Vec128<double> PromoteTo(Full128<double> /* tag */,
|
|
const Vec128<float, 2> v) {
|
|
@@ -2298,12 +2492,13 @@ HWY_INLINE Vec128<int8_t, N> DemoteTo(Si
|
|
|
|
HWY_INLINE Vec128<float16_t, 4> DemoteTo(Simd<float16_t, 4> /* tag */,
|
|
const Vec128<float> v) {
|
|
- return Vec128<float16_t, 4>{vcvt_f16_f32(v.raw)};
|
|
+ return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> /* tag */,
|
|
const Vec128<float, N> v) {
|
|
- return Vec128<float16_t, N>{vcvt_f16_f32(vcombine_f32(v.raw, v.raw))};
|
|
+ const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
|
|
+ return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
|
|
}
|
|
|
|
#else
|
|
@@ -2339,7 +2534,7 @@ HWY_INLINE Vec128<float16_t, N> DemoteTo
|
|
}
|
|
|
|
#endif
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
|
|
HWY_INLINE Vec128<float, 2> DemoteTo(Simd<float, 2> /* tag */,
|
|
const Vec128<double> v) {
|
|
@@ -2397,7 +2592,7 @@ HWY_INLINE Vec128<int8_t, N> DemoteTo(Si
|
|
const Vec128<int32_t> v) {
|
|
Vec128<int16_t, N> a = DemoteTo(Simd<int16_t, N>(), v);
|
|
Vec128<int16_t, N> b;
|
|
- uint16x8_t c = vcombine_s16(a.raw, b.raw);
|
|
+ int16x8_t c = vcombine_s16(a.raw, b.raw);
|
|
return Vec128<int8_t, N>(vqmovn_s16(c));
|
|
}
|
|
|
|
@@ -2426,7 +2621,7 @@ HWY_INLINE Vec128<int32_t, N> ConvertTo(
|
|
return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
|
|
}
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
|
|
HWY_INLINE Vec128<double> ConvertTo(Full128<double> /* tag */,
|
|
const Vec128<int64_t> v) {
|
|
@@ -2451,7 +2646,7 @@ HWY_INLINE Vec128<int64_t, 1> ConvertTo(
|
|
|
|
// ------------------------------ Round (IfThenElse, mask, logical)
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
// Toward nearest integer
|
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)
|
|
|
|
@@ -2472,18 +2667,26 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor,
|
|
// representation, clearing the lowest 23-exp mantissa bits. This requires 9
|
|
// integer operations and 3 constants, which is likely more expensive.
|
|
|
|
+namespace detail {
|
|
+
|
|
+// The original value is already the desired result if NaN or the magnitude is
|
|
+// large (i.e. the value is already an integer).
|
|
+template <size_t N>
|
|
+HWY_API Mask128<float, N> UseInt(const Vec128<float, N> v) {
|
|
+ return Abs(v) < Set(Simd<float, N>(), MantissaEnd<float>());
|
|
+}
|
|
+
|
|
+} // namespace detail
|
|
+
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<float, N> Trunc(const Vec128<float, N> v) {
|
|
const Simd<float, N> df;
|
|
- const Simd<int32_t, N> di;
|
|
+ const RebindToSigned<decltype(df)> di;
|
|
|
|
const auto integer = ConvertTo(di, v); // round toward 0
|
|
const auto int_f = ConvertTo(df, integer);
|
|
|
|
- // The original value is already the desired result if NaN or the magnitude is
|
|
- // large (i.e. the value is already an integer).
|
|
- const auto max = Set(df, MantissaEnd<float>());
|
|
- return IfThenElse(Abs(v) < max, int_f, v);
|
|
+ return IfThenElse(detail::UseInt(v), int_f, v);
|
|
}
|
|
|
|
template <size_t N>
|
|
@@ -2506,7 +2709,7 @@ HWY_INLINE Vec128<float, N> Round(const
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<float, N> Ceil(const Vec128<float, N> v) {
|
|
const Simd<float, N> df;
|
|
- const Simd<int32_t, N> di;
|
|
+ const RebindToSigned<decltype(df)> di;
|
|
|
|
const auto integer = ConvertTo(di, v); // round toward 0
|
|
const auto int_f = ConvertTo(df, integer);
|
|
@@ -2514,9 +2717,7 @@ HWY_INLINE Vec128<float, N> Ceil(const V
|
|
// Truncating a positive non-integer ends up smaller; if so, add 1.
|
|
const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
|
|
|
|
- // Keep original if NaN or the magnitude is large (already an int).
|
|
- const auto max = Set(df, MantissaEnd<float>());
|
|
- return IfThenElse(Abs(v) < max, int_f - neg1, v);
|
|
+ return IfThenElse(detail::UseInt(v), int_f - neg1, v);
|
|
}
|
|
|
|
template <size_t N>
|
|
@@ -2530,16 +2731,14 @@ HWY_INLINE Vec128<float, N> Floor(const
|
|
// Truncating a negative non-integer ends up larger; if so, subtract 1.
|
|
const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
|
|
|
|
- // Keep original if NaN or the magnitude is large (already an int).
|
|
- const auto max = Set(df, MantissaEnd<float>());
|
|
- return IfThenElse(Abs(v) < max, int_f + neg1, v);
|
|
+ return IfThenElse(detail::UseInt(v), int_f + neg1, v);
|
|
}
|
|
|
|
#endif
|
|
|
|
// ------------------------------ NearestInt (Round)
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
|
|
HWY_INLINE Vec128<int32_t> NearestInt(const Vec128<float> v) {
|
|
return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
|
|
@@ -2596,7 +2795,7 @@ HWY_INLINE Vec128<int64_t, 1> LowerHalf(
|
|
HWY_INLINE Vec128<float, 2> LowerHalf(const Vec128<float> v) {
|
|
return Vec128<float, 2>(vget_low_f32(v.raw));
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double, 1> LowerHalf(const Vec128<double> v) {
|
|
return Vec128<double, 1>(vget_low_f64(v.raw));
|
|
}
|
|
@@ -2629,7 +2828,7 @@ HWY_INLINE Vec128<int64_t, 1> UpperHalf(
|
|
HWY_INLINE Vec128<float, 2> UpperHalf(const Vec128<float> v) {
|
|
return Vec128<float, 2>(vget_high_f32(v.raw));
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double, 1> UpperHalf(const Vec128<double> v) {
|
|
return Vec128<double, 1>(vget_high_f64(v.raw));
|
|
}
|
|
@@ -2714,7 +2913,7 @@ HWY_INLINE Vec128<T, N> ShiftRightLanes(
|
|
|
|
// ------------------------------ Broadcast/splat any lane
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
// Unsigned
|
|
template <int kLane>
|
|
HWY_INLINE Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
|
|
@@ -2886,7 +3085,7 @@ HWY_API Vec128<T> TableLookupBytes(const
|
|
const Vec128<T> from) {
|
|
const Full128<T> d;
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
|
|
BitCast(d8, from).raw)));
|
|
#else
|
|
@@ -2911,33 +3110,58 @@ HWY_INLINE Vec128<T, N> TableLookupBytes
|
|
BitCast(d8, from).raw)));
|
|
}
|
|
|
|
-// ------------------------------ Hard-coded shuffles
|
|
+// ------------------------------ TableLookupLanes
|
|
|
|
-// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
|
|
-// Shuffle0321 rotates one lane to the right (the previous least-significant
|
|
-// lane is now most-significant). These could also be implemented via
|
|
-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
|
|
+// Returned by SetTableIndices for use by TableLookupLanes.
|
|
+template <typename T, size_t N>
|
|
+struct Indices128 {
|
|
+ typename detail::Raw128<T, N>::type raw;
|
|
+};
|
|
|
|
-// Swap 32-bit halves in 64-bits
|
|
-HWY_INLINE Vec128<uint32_t, 2> Shuffle2301(const Vec128<uint32_t, 2> v) {
|
|
- return Vec128<uint32_t, 2>(vrev64_u32(v.raw));
|
|
-}
|
|
-HWY_INLINE Vec128<int32_t, 2> Shuffle2301(const Vec128<int32_t, 2> v) {
|
|
- return Vec128<int32_t, 2>(vrev64_s32(v.raw));
|
|
-}
|
|
-HWY_INLINE Vec128<float, 2> Shuffle2301(const Vec128<float, 2> v) {
|
|
- return Vec128<float, 2>(vrev64_f32(v.raw));
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_INLINE Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
|
|
+#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ const Repartition<uint8_t, decltype(d)> d8;
|
|
+ alignas(16) uint8_t control[16] = {0};
|
|
+ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
|
|
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
|
|
+ control[idx_lane * sizeof(T) + idx_byte] =
|
|
+ static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
|
|
+ }
|
|
+ }
|
|
+ return Indices128<T, N>{BitCast(d, Load(d8, control)).raw};
|
|
}
|
|
-HWY_INLINE Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
|
|
- return Vec128<uint32_t>(vrev64q_u32(v.raw));
|
|
+
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<uint32_t, N> TableLookupLanes(
|
|
+ const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
|
|
+ return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
|
|
}
|
|
-HWY_INLINE Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
|
|
- return Vec128<int32_t>(vrev64q_s32(v.raw));
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<int32_t, N> TableLookupLanes(
|
|
+ const Vec128<int32_t, N> v, const Indices128<int32_t, N> idx) {
|
|
+ return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
|
|
}
|
|
-HWY_INLINE Vec128<float> Shuffle2301(const Vec128<float> v) {
|
|
- return Vec128<float>(vrev64q_f32(v.raw));
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
|
|
+ const Indices128<float, N> idx) {
|
|
+ const Simd<int32_t, N> di;
|
|
+ const auto idx_i = BitCast(di, Vec128<float, N>{idx.raw});
|
|
+ return BitCast(Simd<float, N>(), TableLookupBytes(BitCast(di, v), idx_i));
|
|
}
|
|
|
|
+// ------------------------------ Other shuffles (TableLookupBytes)
|
|
+
|
|
+// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
|
|
+// Shuffle0321 rotates one lane to the right (the previous least-significant
|
|
+// lane is now most-significant). These could also be implemented via
|
|
+// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
|
|
+
|
|
// Swap 64-bit halves
|
|
template <typename T>
|
|
HWY_INLINE Vec128<T> Shuffle1032(const Vec128<T> v) {
|
|
@@ -2975,49 +3199,6 @@ HWY_INLINE Vec128<T> Shuffle0123(const V
|
|
return TableLookupBytes(v, BitCast(d, Load(d8, bytes)));
|
|
}
|
|
|
|
-// ------------------------------ TableLookupLanes
|
|
-
|
|
-// Returned by SetTableIndices for use by TableLookupLanes.
|
|
-template <typename T>
|
|
-struct Indices128 {
|
|
- uint8x16_t raw;
|
|
-};
|
|
-
|
|
-template <typename T>
|
|
-HWY_INLINE Indices128<T> SetTableIndices(const Full128<T>, const int32_t* idx) {
|
|
-#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
|
|
- const size_t N = 16 / sizeof(T);
|
|
- for (size_t i = 0; i < N; ++i) {
|
|
- HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
|
|
- }
|
|
-#endif
|
|
-
|
|
- const Full128<uint8_t> d8;
|
|
- alignas(16) uint8_t control[16];
|
|
- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
|
|
- const size_t idx_lane = idx_byte / sizeof(T);
|
|
- const size_t mod = idx_byte % sizeof(T);
|
|
- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod;
|
|
- }
|
|
- return Indices128<T>{Load(d8, control).raw};
|
|
-}
|
|
-
|
|
-HWY_INLINE Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
|
|
- const Indices128<uint32_t> idx) {
|
|
- return TableLookupBytes(v, Vec128<uint32_t>(idx.raw));
|
|
-}
|
|
-HWY_INLINE Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
|
|
- const Indices128<int32_t> idx) {
|
|
- return TableLookupBytes(v, Vec128<int32_t>(idx.raw));
|
|
-}
|
|
-HWY_INLINE Vec128<float> TableLookupLanes(const Vec128<float> v,
|
|
- const Indices128<float> idx) {
|
|
- const Full128<int32_t> di;
|
|
- const Full128<float> df;
|
|
- return BitCast(df,
|
|
- TableLookupBytes(BitCast(di, v), Vec128<int32_t>(idx.raw)));
|
|
-}
|
|
-
|
|
// ------------------------------ Interleave lanes
|
|
|
|
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
|
|
@@ -3029,7 +3210,7 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Inter
|
|
HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
|
|
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
// For 64 bit types, we only have the "q" version of the function defined as
|
|
// interleaving 64-wide registers with 64-wide types in them makes no sense.
|
|
HWY_INLINE Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
|
|
@@ -3079,7 +3260,7 @@ HWY_INLINE Vec128<float> InterleaveLower
|
|
const Vec128<float> b) {
|
|
return Vec128<float>(vzip1q_f32(a.raw, b.raw));
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double> InterleaveLower(const Vec128<double> a,
|
|
const Vec128<double> b) {
|
|
return Vec128<double>(vzip1q_f64(a.raw, b.raw));
|
|
@@ -3090,10 +3271,10 @@ HWY_INLINE Vec128<float> InterleaveUpper
|
|
const Vec128<float> b) {
|
|
return Vec128<float>(vzip2q_f32(a.raw, b.raw));
|
|
}
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<double> InterleaveUpper(const Vec128<double> a,
|
|
const Vec128<double> b) {
|
|
- return Vec128<double>(vzip2q_s64(a.raw, b.raw));
|
|
+ return Vec128<double>(vzip2q_f64(a.raw, b.raw));
|
|
}
|
|
#endif
|
|
|
|
@@ -3105,119 +3286,125 @@ HWY_INLINE Vec128<double> InterleaveUppe
|
|
// Full vectors
|
|
HWY_INLINE Vec128<uint16_t> ZipLower(const Vec128<uint8_t> a,
|
|
const Vec128<uint8_t> b) {
|
|
- return Vec128<uint16_t>(vzip1q_u8(a.raw, b.raw));
|
|
+ return Vec128<uint16_t>(vreinterpretq_u16_u8(vzip1q_u8(a.raw, b.raw)));
|
|
}
|
|
HWY_INLINE Vec128<uint32_t> ZipLower(const Vec128<uint16_t> a,
|
|
const Vec128<uint16_t> b) {
|
|
- return Vec128<uint32_t>(vzip1q_u16(a.raw, b.raw));
|
|
+ return Vec128<uint32_t>(vreinterpretq_u32_u16(vzip1q_u16(a.raw, b.raw)));
|
|
}
|
|
HWY_INLINE Vec128<uint64_t> ZipLower(const Vec128<uint32_t> a,
|
|
const Vec128<uint32_t> b) {
|
|
- return Vec128<uint64_t>(vzip1q_u32(a.raw, b.raw));
|
|
+ return Vec128<uint64_t>(vreinterpretq_u64_u32(vzip1q_u32(a.raw, b.raw)));
|
|
}
|
|
|
|
HWY_INLINE Vec128<int16_t> ZipLower(const Vec128<int8_t> a,
|
|
const Vec128<int8_t> b) {
|
|
- return Vec128<int16_t>(vzip1q_s8(a.raw, b.raw));
|
|
+ return Vec128<int16_t>(vreinterpretq_s16_s8(vzip1q_s8(a.raw, b.raw)));
|
|
}
|
|
HWY_INLINE Vec128<int32_t> ZipLower(const Vec128<int16_t> a,
|
|
const Vec128<int16_t> b) {
|
|
- return Vec128<int32_t>(vzip1q_s16(a.raw, b.raw));
|
|
+ return Vec128<int32_t>(vreinterpretq_s32_s16(vzip1q_s16(a.raw, b.raw)));
|
|
}
|
|
HWY_INLINE Vec128<int64_t> ZipLower(const Vec128<int32_t> a,
|
|
const Vec128<int32_t> b) {
|
|
- return Vec128<int64_t>(vzip1q_s32(a.raw, b.raw));
|
|
+ return Vec128<int64_t>(vreinterpretq_s64_s32(vzip1q_s32(a.raw, b.raw)));
|
|
}
|
|
|
|
HWY_INLINE Vec128<uint16_t> ZipUpper(const Vec128<uint8_t> a,
|
|
const Vec128<uint8_t> b) {
|
|
- return Vec128<uint16_t>(vzip2q_u8(a.raw, b.raw));
|
|
+ return Vec128<uint16_t>(vreinterpretq_u16_u8(vzip2q_u8(a.raw, b.raw)));
|
|
}
|
|
HWY_INLINE Vec128<uint32_t> ZipUpper(const Vec128<uint16_t> a,
|
|
const Vec128<uint16_t> b) {
|
|
- return Vec128<uint32_t>(vzip2q_u16(a.raw, b.raw));
|
|
+ return Vec128<uint32_t>(vreinterpretq_u32_u16(vzip2q_u16(a.raw, b.raw)));
|
|
}
|
|
HWY_INLINE Vec128<uint64_t> ZipUpper(const Vec128<uint32_t> a,
|
|
const Vec128<uint32_t> b) {
|
|
- return Vec128<uint64_t>(vzip2q_u32(a.raw, b.raw));
|
|
+ return Vec128<uint64_t>(vreinterpretq_u64_u32(vzip2q_u32(a.raw, b.raw)));
|
|
}
|
|
|
|
HWY_INLINE Vec128<int16_t> ZipUpper(const Vec128<int8_t> a,
|
|
const Vec128<int8_t> b) {
|
|
- return Vec128<int16_t>(vzip2q_s8(a.raw, b.raw));
|
|
+ return Vec128<int16_t>(vreinterpretq_s16_s8(vzip2q_s8(a.raw, b.raw)));
|
|
}
|
|
HWY_INLINE Vec128<int32_t> ZipUpper(const Vec128<int16_t> a,
|
|
const Vec128<int16_t> b) {
|
|
- return Vec128<int32_t>(vzip2q_s16(a.raw, b.raw));
|
|
+ return Vec128<int32_t>(vreinterpretq_s32_s16(vzip2q_s16(a.raw, b.raw)));
|
|
}
|
|
HWY_INLINE Vec128<int64_t> ZipUpper(const Vec128<int32_t> a,
|
|
const Vec128<int32_t> b) {
|
|
- return Vec128<int64_t>(vzip2q_s32(a.raw, b.raw));
|
|
+ return Vec128<int64_t>(vreinterpretq_s64_s32(vzip2q_s32(a.raw, b.raw)));
|
|
}
|
|
|
|
// Half vectors or less
|
|
template <size_t N, HWY_IF_LE64(uint8_t, N)>
|
|
HWY_INLINE Vec128<uint16_t, (N + 1) / 2> ZipLower(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
- return Vec128<uint16_t, (N + 1) / 2>(vzip1_u8(a.raw, b.raw));
|
|
+ return Vec128<uint16_t, (N + 1) / 2>(
|
|
+ vreinterpret_u16_u8(vzip1_u8(a.raw, b.raw)));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(uint16_t, N)>
|
|
HWY_INLINE Vec128<uint32_t, (N + 1) / 2> ZipLower(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
- return Vec128<uint32_t, (N + 1) / 2>(vzip1_u16(a.raw, b.raw));
|
|
+ return Vec128<uint32_t, (N + 1) / 2>(
|
|
+ vreinterpret_u32_u16(vzip1_u16(a.raw, b.raw)));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(uint32_t, N)>
|
|
HWY_INLINE Vec128<uint64_t, (N + 1) / 2> ZipLower(const Vec128<uint32_t, N> a,
|
|
const Vec128<uint32_t, N> b) {
|
|
- return Vec128<uint64_t, (N + 1) / 2>(vzip1_u32(a.raw, b.raw));
|
|
+ return Vec128<uint64_t, (N + 1) / 2>(
|
|
+ vreinterpret_u64_u32(vzip1_u32(a.raw, b.raw)));
|
|
}
|
|
|
|
template <size_t N, HWY_IF_LE64(int8_t, N)>
|
|
HWY_INLINE Vec128<int16_t, (N + 1) / 2> ZipLower(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
- return Vec128<int16_t, (N + 1) / 2>(vzip1_s8(a.raw, b.raw));
|
|
+ return Vec128<int16_t, (N + 1) / 2>(
|
|
+ vreinterpret_s16_s8(vzip1_s8(a.raw, b.raw)));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(int16_t, N)>
|
|
HWY_INLINE Vec128<int32_t, (N + 1) / 2> ZipLower(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
- return Vec128<int32_t, (N + 1) / 2>(vzip1_s16(a.raw, b.raw));
|
|
+ return Vec128<int32_t, (N + 1) / 2>(
|
|
+ vreinterpret_s32_s16(vzip1_s16(a.raw, b.raw)));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(int32_t, N)>
|
|
HWY_INLINE Vec128<int64_t, (N + 1) / 2> ZipLower(const Vec128<int32_t, N> a,
|
|
const Vec128<int32_t, N> b) {
|
|
- return Vec128<int64_t, (N + 1) / 2>(vzip1_s32(a.raw, b.raw));
|
|
+ return Vec128<int64_t, (N + 1) / 2>(
|
|
+ vreinterpret_s64_s32(vzip1_s32(a.raw, b.raw)));
|
|
}
|
|
|
|
template <size_t N, HWY_IF_LE64(uint8_t, N)>
|
|
HWY_INLINE Vec128<uint16_t, N / 2> ZipUpper(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
- return Vec128<uint16_t, N / 2>(vzip2_u8(a.raw, b.raw));
|
|
+ return Vec128<uint16_t, N / 2>(vreinterpret_u16_u8(vzip2_u8(a.raw, b.raw)));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(uint16_t, N)>
|
|
HWY_INLINE Vec128<uint32_t, N / 2> ZipUpper(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
- return Vec128<uint32_t, N / 2>(vzip2_u16(a.raw, b.raw));
|
|
+ return Vec128<uint32_t, N / 2>(vreinterpret_u32_u16(vzip2_u16(a.raw, b.raw)));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(uint32_t, N)>
|
|
HWY_INLINE Vec128<uint64_t, N / 2> ZipUpper(const Vec128<uint32_t, N> a,
|
|
const Vec128<uint32_t, N> b) {
|
|
- return Vec128<uint64_t, N / 2>(vzip2_u32(a.raw, b.raw));
|
|
+ return Vec128<uint64_t, N / 2>(vreinterpret_u64_u32(vzip2_u32(a.raw, b.raw)));
|
|
}
|
|
|
|
template <size_t N, HWY_IF_LE64(int8_t, N)>
|
|
HWY_INLINE Vec128<int16_t, N / 2> ZipUpper(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
- return Vec128<int16_t, N / 2>(vzip2_s8(a.raw, b.raw));
|
|
+ return Vec128<int16_t, N / 2>(vreinterpret_s16_s8(vzip2_s8(a.raw, b.raw)));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(int16_t, N)>
|
|
HWY_INLINE Vec128<int32_t, N / 2> ZipUpper(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
- return Vec128<int32_t, N / 2>(vzip2_s16(a.raw, b.raw));
|
|
+ return Vec128<int32_t, N / 2>(vreinterpret_s32_s16(vzip2_s16(a.raw, b.raw)));
|
|
}
|
|
template <size_t N, HWY_IF_LE64(int32_t, N)>
|
|
HWY_INLINE Vec128<int64_t, N / 2> ZipUpper(const Vec128<int32_t, N> a,
|
|
const Vec128<int32_t, N> b) {
|
|
- return Vec128<int64_t, N / 2>(vzip2_s32(a.raw, b.raw));
|
|
+ return Vec128<int64_t, N / 2>(vreinterpret_s64_s32(vzip2_s32(a.raw, b.raw)));
|
|
}
|
|
|
|
// ------------------------------ Blocks
|
|
@@ -3274,84 +3461,113 @@ HWY_INLINE Vec128<T> OddEven(const Vec12
|
|
|
|
// ================================================== MISC
|
|
|
|
-// Returns a vector with lane i=[0, N) set to "first" + i.
|
|
-template <typename T, size_t N, typename T2>
|
|
-Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
|
|
- HWY_ALIGN T lanes[16 / sizeof(T)];
|
|
- for (size_t i = 0; i < 16 / sizeof(T); ++i) {
|
|
- lanes[i] = static_cast<T>(first + static_cast<T2>(i));
|
|
+// ------------------------------ Scatter (Store)
|
|
+
|
|
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
|
|
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
|
|
+ const Vec128<Offset, N> offset) {
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ Store(v, d, lanes);
|
|
+
|
|
+ alignas(16) Offset offset_lanes[N];
|
|
+ Store(offset, Simd<Offset, N>(), offset_lanes);
|
|
+
|
|
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
|
|
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
|
|
+ const Vec128<Index, N> index) {
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ Store(v, d, lanes);
|
|
+
|
|
+ alignas(16) Index index_lanes[N];
|
|
+ Store(index, Simd<Index, N>(), index_lanes);
|
|
+
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ base[index_lanes[i]] = lanes[i];
|
|
}
|
|
- return Load(d, lanes);
|
|
}
|
|
|
|
-// ------------------------------ Gather (requires GetLane)
|
|
+// ------------------------------ Gather (Load/Store)
|
|
|
|
template <typename T, size_t N, typename Offset>
|
|
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
|
|
const T* HWY_RESTRICT base,
|
|
const Vec128<Offset, N> offset) {
|
|
- static_assert(N == 1, "NEON does not support full gather");
|
|
- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
|
|
- const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
|
|
- T val;
|
|
- CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
|
|
- return Set(d, val);
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+
|
|
+ alignas(16) Offset offset_lanes[N];
|
|
+ Store(offset, Simd<Offset, N>(), offset_lanes);
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
|
|
+ }
|
|
+ return Load(d, lanes);
|
|
}
|
|
|
|
template <typename T, size_t N, typename Index>
|
|
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
|
|
const Vec128<Index, N> index) {
|
|
- static_assert(N == 1, "NEON does not support full gather");
|
|
- static_assert(sizeof(T) == sizeof(Index), "T must match Index");
|
|
- return Set(d, base[GetLane(index)]);
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+
|
|
+ alignas(16) Index index_lanes[N];
|
|
+ Store(index, Simd<Index, N>(), index_lanes);
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ lanes[i] = base[index_lanes[i]];
|
|
+ }
|
|
+ return Load(d, lanes);
|
|
}
|
|
|
|
-// ------------------------------ ARMv7 int64 comparisons (requires Shuffle2301)
|
|
+// ------------------------------ Reductions
|
|
|
|
-#if !defined(__aarch64__)
|
|
+namespace detail {
|
|
|
|
-template <size_t N>
|
|
-HWY_INLINE Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
|
|
- const Vec128<int64_t, N> b) {
|
|
- const Simd<int32_t, N * 2> d32;
|
|
- const Simd<int64_t, N> d64;
|
|
- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b));
|
|
- const auto cmp64 = cmp32 & Shuffle2301(cmp32);
|
|
- return MaskFromVec(BitCast(d64, cmp64));
|
|
+// N=1 for any T: no-op
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
|
|
+ return v;
|
|
}
|
|
-
|
|
-template <size_t N>
|
|
-HWY_INLINE Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
|
|
- const Vec128<uint64_t, N> b) {
|
|
- const Simd<uint32_t, N * 2> d32;
|
|
- const Simd<uint64_t, N> d64;
|
|
- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b));
|
|
- const auto cmp64 = cmp32 & Shuffle2301(cmp32);
|
|
- return MaskFromVec(BitCast(d64, cmp64));
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
+ const Vec128<T, 1> v) {
|
|
+ return v;
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
+ const Vec128<T, 1> v) {
|
|
+ return v;
|
|
}
|
|
|
|
-HWY_INLINE Mask128<int64_t> operator<(const Vec128<int64_t> a,
|
|
- const Vec128<int64_t> b) {
|
|
- const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
|
|
- return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
|
|
+// u32/i32/f32: N=2
|
|
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
|
|
+HWY_API Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
|
|
+ return v10 + Shuffle2301(v10);
|
|
}
|
|
-HWY_INLINE Mask128<int64_t, 1> operator<(const Vec128<int64_t, 1> a,
|
|
- const Vec128<int64_t, 1> b) {
|
|
- const int64x1_t sub = vqsub_s64(a.raw, b.raw);
|
|
- return MaskFromVec(BroadcastSignBit(Vec128<int64_t, 1>(sub)));
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
|
|
+ const Vec128<T, 2> v10) {
|
|
+ return Min(v10, Shuffle2301(v10));
|
|
}
|
|
-
|
|
-template <size_t N>
|
|
-HWY_INLINE Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
|
|
- const Vec128<int64_t, N> b) {
|
|
- return b < a;
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
|
|
+ const Vec128<T, 2> v10) {
|
|
+ return Max(v10, Shuffle2301(v10));
|
|
}
|
|
-#endif
|
|
-
|
|
-// ------------------------------ Reductions
|
|
|
|
-#if defined(__aarch64__)
|
|
-// Supported for 32b and 64b vector types. Returns the sum in each lane.
|
|
+// full vectors
|
|
+#if HWY_ARCH_ARM_A64
|
|
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
|
|
return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
|
|
}
|
|
@@ -3398,20 +3614,15 @@ HWY_INLINE Vec128<int64_t> SumOfLanes(co
|
|
}
|
|
#endif
|
|
|
|
-namespace detail {
|
|
-
|
|
-// For u32/i32/f32.
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
|
|
- const Vec128<T, N> v3210) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return Min(v20_31_20_31, v31_20_31_20);
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
|
|
- const Vec128<T, N> v3210) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
@@ -3419,15 +3630,13 @@ HWY_API Vec128<T, N> MaxOfLanes(hwy::Siz
|
|
}
|
|
|
|
// For u64/i64[/f64].
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
|
|
- const Vec128<T, N> v10) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return Min(v10, v01);
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
|
|
- const Vec128<T, N> v10) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return Max(v10, v01);
|
|
}
|
|
@@ -3435,6 +3644,10 @@ HWY_API Vec128<T, N> MaxOfLanes(hwy::Siz
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
|
|
+ return detail::SumOfLanes(v);
|
|
+}
|
|
+template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
|
|
return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
|
|
}
|
|
@@ -3457,18 +3670,18 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
|
|
const Vec128<uint8_t> values =
|
|
BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
// Can't vaddv - we need two separate bytes (16 bits).
|
|
const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
|
|
const uint8x8_t x4 = vpadd_u8(x2, x2);
|
|
const uint8x8_t x8 = vpadd_u8(x4, x4);
|
|
- return vreinterpret_u16_u8(x8)[0];
|
|
+ return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
|
|
#else
|
|
// Don't have vpaddq, so keep doubling lane size.
|
|
const uint16x8_t x2 = vpaddlq_u8(values.raw);
|
|
const uint32x4_t x4 = vpaddlq_u16(x2);
|
|
const uint64x2_t x8 = vpaddlq_u32(x4);
|
|
- return (uint64_t(x8[1]) << 8) | x8[0];
|
|
+ return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
|
|
#endif
|
|
}
|
|
|
|
@@ -3484,7 +3697,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
|
|
const Vec128<uint8_t, N> slice(Load(Simd<uint8_t, 8>(), kSliceLanes).raw);
|
|
const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return vaddv_u8(values.raw);
|
|
#else
|
|
const uint16x4_t x2 = vpaddl_u8(values.raw);
|
|
@@ -3503,7 +3716,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
|
|
const Full128<uint16_t> du;
|
|
const Vec128<uint16_t> values =
|
|
BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return vaddvq_u16(values.raw);
|
|
#else
|
|
const uint32x4_t x2 = vpaddlq_u16(values.raw);
|
|
@@ -3522,7 +3735,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
|
|
const Simd<uint16_t, N> du;
|
|
const Vec128<uint16_t, N> slice(Load(Simd<uint16_t, 4>(), kSliceLanes).raw);
|
|
const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return vaddv_u16(values.raw);
|
|
#else
|
|
const uint32x2_t x2 = vpaddl_u16(values.raw);
|
|
@@ -3539,7 +3752,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
|
|
const Full128<uint32_t> du;
|
|
const Vec128<uint32_t> values =
|
|
BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return vaddvq_u32(values.raw);
|
|
#else
|
|
const uint64x2_t x2 = vpaddlq_u32(values.raw);
|
|
@@ -3557,7 +3770,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
|
|
const Simd<uint32_t, N> du;
|
|
const Vec128<uint32_t, N> slice(Load(Simd<uint32_t, 2>(), kSliceLanes).raw);
|
|
const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return vaddv_u32(values.raw);
|
|
#else
|
|
const uint64x1_t x2 = vpaddl_u32(values.raw);
|
|
@@ -3572,7 +3785,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
|
|
const Full128<uint64_t> du;
|
|
const Vec128<uint64_t> values =
|
|
BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return vaddvq_u64(values.raw);
|
|
#else
|
|
return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
|
|
@@ -3612,13 +3825,13 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag
|
|
const int8x16_t ones =
|
|
vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return vaddvq_s8(ones);
|
|
#else
|
|
const int16x8_t x2 = vpaddlq_s8(ones);
|
|
const int32x4_t x4 = vpaddlq_s16(x2);
|
|
const int64x2_t x8 = vpaddlq_s32(x4);
|
|
- return x8[0] + x8[1];
|
|
+ return vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1);
|
|
#endif
|
|
}
|
|
template <typename T>
|
|
@@ -3627,12 +3840,12 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag
|
|
const int16x8_t ones =
|
|
vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return vaddvq_s16(ones);
|
|
#else
|
|
const int32x4_t x2 = vpaddlq_s16(ones);
|
|
const int64x2_t x4 = vpaddlq_s32(x2);
|
|
- return x4[0] + x4[1];
|
|
+ return vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1);
|
|
#endif
|
|
}
|
|
|
|
@@ -3642,26 +3855,26 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag
|
|
const int32x4_t ones =
|
|
vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
|
|
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
return vaddvq_s32(ones);
|
|
#else
|
|
const int64x2_t x2 = vpaddlq_s32(ones);
|
|
- return x2[0] + x2[1];
|
|
+ return vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1);
|
|
#endif
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
|
|
-#if defined(__aarch64__)
|
|
+#if HWY_ARCH_ARM_A64
|
|
const Full128<int64_t> di;
|
|
const int64x2_t ones =
|
|
vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
|
|
return vaddvq_s64(ones);
|
|
#else
|
|
- const Full128<int64_t> di;
|
|
- const int64x2_t ones =
|
|
- vshrq_n_u64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw, 63);
|
|
- return ones[0] + ones[1];
|
|
+ const Full128<uint64_t> du;
|
|
+ const auto mask_u = VecFromMask(du, RebindMask(du, mask));
|
|
+ const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
|
|
+ return vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1);
|
|
#endif
|
|
}
|
|
|
|
@@ -3690,9 +3903,15 @@ HWY_INLINE size_t StoreMaskBits(const Ma
|
|
// Full
|
|
template <typename T>
|
|
HWY_INLINE bool AllFalse(const Mask128<T> m) {
|
|
+#if HWY_ARCH_ARM_A64
|
|
+ const Full128<uint32_t> d32;
|
|
+ const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(Full128<T>(), m)));
|
|
+ return (vmaxvq_u32(m32.raw) == 0);
|
|
+#else
|
|
const auto v64 = BitCast(Full128<uint64_t>(), VecFromMask(Full128<T>(), m));
|
|
uint32x2_t a = vqmovn_u64(v64.raw);
|
|
- return vreinterpret_u64_u32(a)[0] == 0;
|
|
+ return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0;
|
|
+#endif
|
|
}
|
|
|
|
// Partial
|
|
@@ -3711,8 +3930,160 @@ HWY_INLINE bool AllTrue(const Mask128<T,
|
|
|
|
namespace detail {
|
|
|
|
+// Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
|
|
+HWY_INLINE Vec128<uint8_t> Load8Bytes(Full128<uint8_t> /*d*/,
|
|
+ const uint8_t* bytes) {
|
|
+ return Vec128<uint8_t>(vreinterpretq_u8_u64(
|
|
+ vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
|
|
+}
|
|
+
|
|
+// Load 8 bytes and return half-reg with N <= 8 bytes.
|
|
+template <size_t N, HWY_IF_LE64(uint8_t, N)>
|
|
+HWY_INLINE Vec128<uint8_t, N> Load8Bytes(Simd<uint8_t, N> d,
|
|
+ const uint8_t* bytes) {
|
|
+ return Load(d, bytes);
|
|
+}
|
|
+
|
|
template <typename T, size_t N>
|
|
-HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
|
|
+HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
|
|
+ const uint64_t mask_bits) {
|
|
+ HWY_DASSERT(mask_bits < 256);
|
|
+ const Simd<T, N> d;
|
|
+ const Repartition<uint8_t, decltype(d)> d8;
|
|
+ const Simd<uint16_t, N> du;
|
|
+
|
|
+ // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
|
|
+ // indices for VTBL (one vector's worth for each of 256 combinations of
|
|
+ // 8 mask bits). Loading them directly would require 4 KiB. We can instead
|
|
+ // store lane indices and convert to byte indices (2*lane + 0..1), with the
|
|
+ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
|
|
+ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
|
|
+ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
|
|
+ // is likely more costly than the higher cache footprint from storing bytes.
|
|
+ alignas(16) constexpr uint8_t table[256 * 8] = {
|
|
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
|
|
+ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
|
|
+ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
|
|
+ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
|
|
+ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
|
|
+ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
|
|
+ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
|
|
+ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
|
|
+ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
|
|
+ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
|
|
+ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
|
|
+ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
|
|
+ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
|
|
+ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
|
|
+ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
|
|
+ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
|
|
+ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
|
|
+ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
|
|
+ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
|
|
+ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
|
|
+ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
|
|
+ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
|
|
+ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
|
|
+ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
|
|
+ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
|
|
+ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
|
|
+ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
|
|
+ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
|
|
+ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
|
|
+ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
|
|
+ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
|
|
+ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
|
|
+ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
|
|
+ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
|
|
+ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
|
|
+ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
|
|
+ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
|
|
+ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
|
|
+ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
|
|
+ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
|
|
+ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
|
|
+ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
|
|
+ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
|
|
+ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
|
|
+ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
|
|
+ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
|
|
+ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
|
|
+ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
|
|
+ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
|
|
+ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
|
|
+ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
|
|
+ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
|
|
+ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
|
|
+ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
|
|
+ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
|
|
+ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
|
|
+ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
|
|
+ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
|
|
+ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
|
|
+ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
|
|
+ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
|
|
+ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
|
|
+ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
|
|
+ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
|
|
+ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
|
|
+ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
|
|
+ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
|
|
+ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
|
|
+ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
|
|
+ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
|
|
+ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
|
|
+ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
|
|
+ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
|
|
+ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
|
|
+ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
|
|
+ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
|
|
+ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
|
|
+ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
|
|
+ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
|
|
+ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
|
|
+ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
|
|
+ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
|
|
+ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
|
|
+ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
|
|
+ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
|
|
+ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
|
|
+ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
|
|
+ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
|
|
+ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
|
|
+ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
|
|
+ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
|
|
+ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
|
|
+ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
|
|
+ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
|
|
+ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
|
|
+ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
|
|
+ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
|
|
+ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
|
|
+ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
|
|
+ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
|
|
+ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
|
|
+ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
|
|
+ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
|
|
+ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
|
|
+
|
|
+ const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
|
|
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
|
|
+ return BitCast(d, pairs + Set(du, 0x0100));
|
|
+}
|
|
+
|
|
+template <typename T, size_t N>
|
|
+HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
|
|
+ const uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 16);
|
|
|
|
// There are only 4 lanes, so we can afford to load the index vector directly.
|
|
@@ -3742,7 +4113,8 @@ HWY_INLINE Vec128<T, N> Idx32x4FromBits(
|
|
#if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64
|
|
|
|
template <typename T, size_t N>
|
|
-HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
|
|
+HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
|
|
+ const uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 4);
|
|
|
|
// There are only 2 lanes, so we can afford to load the index vector directly.
|
|
@@ -3761,59 +4133,15 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(
|
|
|
|
// Helper function called by both Compress and CompressStore - avoids a
|
|
// redundant BitsFromMask in the latter.
|
|
-
|
|
-template <size_t N>
|
|
-HWY_API Vec128<uint32_t, N> Compress(Vec128<uint32_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx32x4FromBits<uint32_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-}
|
|
-template <size_t N>
|
|
-HWY_API Vec128<int32_t, N> Compress(Vec128<int32_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-}
|
|
-
|
|
-#if HWY_CAP_INTEGER64
|
|
-
|
|
-template <size_t N>
|
|
-HWY_API Vec128<uint64_t, N> Compress(Vec128<uint64_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-}
|
|
-template <size_t N>
|
|
-HWY_API Vec128<int64_t, N> Compress(Vec128<int64_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-}
|
|
-
|
|
-#endif
|
|
-
|
|
-template <size_t N>
|
|
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
|
|
- const Simd<float, N> df;
|
|
- const Simd<int32_t, N> di;
|
|
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
|
|
-}
|
|
-
|
|
-#if HWY_CAP_FLOAT64
|
|
-
|
|
-template <size_t N>
|
|
-HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
|
|
- const Simd<double, N> df;
|
|
- const Simd<int64_t, N> di;
|
|
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
|
|
+ const auto idx =
|
|
+ detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
|
|
+ using D = Simd<T, N>;
|
|
+ const RebindToSigned<D> di;
|
|
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
|
|
}
|
|
|
|
-#endif
|
|
-
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
@@ -3831,6 +4159,79 @@ HWY_API size_t CompressStore(Vec128<T, N
|
|
return PopCount(mask_bits);
|
|
}
|
|
|
|
+// ------------------------------ StoreInterleaved3
|
|
+
|
|
+// 128 bits
|
|
+HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
|
|
+ const Vec128<uint8_t> v1,
|
|
+ const Vec128<uint8_t> v2,
|
|
+ Full128<uint8_t> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw};
|
|
+ vst3q_u8(unaligned, triple);
|
|
+}
|
|
+
|
|
+// 64 bits
|
|
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
|
|
+ const Vec128<uint8_t, 8> v1,
|
|
+ const Vec128<uint8_t, 8> v2,
|
|
+ Simd<uint8_t, 8> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
|
|
+ vst3_u8(unaligned, triple);
|
|
+}
|
|
+
|
|
+// <= 32 bits: avoid writing more than N bytes by copying to buffer
|
|
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
|
|
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
|
|
+ const Vec128<uint8_t, N> v1,
|
|
+ const Vec128<uint8_t, N> v2,
|
|
+ Simd<uint8_t, N> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ alignas(16) uint8_t buf[24];
|
|
+ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
|
|
+ vst3_u8(buf, triple);
|
|
+ CopyBytes<N * 3>(buf, unaligned);
|
|
+}
|
|
+
|
|
+// ------------------------------ StoreInterleaved4
|
|
+
|
|
+// 128 bits
|
|
+HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
|
|
+ const Vec128<uint8_t> v1,
|
|
+ const Vec128<uint8_t> v2,
|
|
+ const Vec128<uint8_t> v3,
|
|
+ Full128<uint8_t> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
|
|
+ vst4q_u8(unaligned, quad);
|
|
+}
|
|
+
|
|
+// 64 bits
|
|
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> v0,
|
|
+ const Vec128<uint8_t, 8> v1,
|
|
+ const Vec128<uint8_t, 8> v2,
|
|
+ const Vec128<uint8_t, 8> v3,
|
|
+ Simd<uint8_t, 8> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
|
|
+ vst4_u8(unaligned, quad);
|
|
+}
|
|
+
|
|
+// <= 32 bits: avoid writing more than N bytes by copying to buffer
|
|
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
|
|
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> v0,
|
|
+ const Vec128<uint8_t, N> v1,
|
|
+ const Vec128<uint8_t, N> v2,
|
|
+ const Vec128<uint8_t, N> v3,
|
|
+ Simd<uint8_t, N> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ alignas(16) uint8_t buf[32];
|
|
+ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
|
|
+ vst4_u8(buf, quad);
|
|
+ CopyBytes<N * 4>(buf, unaligned);
|
|
+}
|
|
+
|
|
// ================================================== Operator wrapper
|
|
|
|
// These apply to all x86_*-inl.h because there are no restrictions on V.
|
|
@@ -3885,7 +4286,8 @@ HWY_API auto Le(V a, V b) -> decltype(a
|
|
return a <= b;
|
|
}
|
|
|
|
-#if !defined(__aarch64__)
|
|
+namespace detail { // for code folding
|
|
+#if HWY_ARCH_ARM_V7
|
|
#undef vuzp1_s8
|
|
#undef vuzp1_u8
|
|
#undef vuzp1_s16
|
|
@@ -3972,6 +4374,7 @@ HWY_API auto Le(V a, V b) -> decltype(a
|
|
#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
|
|
#undef HWY_NEON_DEF_FUNCTION_UINTS
|
|
#undef HWY_NEON_EVAL
|
|
+} // namespace detail
|
|
|
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
} // namespace HWY_NAMESPACE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 2021-06-02 10:56:05.230904367 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -39,6 +39,11 @@ using TFromV = TFromD<DFromV<V>>;
|
|
hwy::EnableIf<IsSigned<TFromV<V>>() && !IsFloat<TFromV<V>>()>* = nullptr
|
|
#define HWY_IF_FLOAT_V(V) hwy::EnableIf<IsFloat<TFromV<V>>()>* = nullptr
|
|
|
|
+// kShift = log2 of multiplier: 0 for m1, 1 for m2, -2 for mf4
|
|
+template <typename T, int kShift = 0>
|
|
+using Full = Simd<T, (kShift < 0) ? (HWY_LANES(T) >> (-kShift))
|
|
+ : (HWY_LANES(T) << kShift)>;
|
|
+
|
|
// ================================================== MACROS
|
|
|
|
// Generate specializations and function definitions using X macros. Although
|
|
@@ -58,29 +63,30 @@ namespace detail { // for code folding
|
|
|
|
// For given SEW, iterate over all LMUL. Precompute SEW/LMUL => MLEN because the
|
|
// preprocessor cannot easily do it.
|
|
-#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 8, 1, 8, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 8, 2, 4, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 8, 4, 2, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 8, 8, 1, NAME, OP)
|
|
-
|
|
-#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 16, 1, 16, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 16, 2, 8, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 16, 4, 4, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 16, 8, 2, NAME, OP)
|
|
-
|
|
-#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 32, 1, 32, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 32, 2, 16, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 32, 4, 8, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 32, 8, 4, NAME, OP)
|
|
-
|
|
-#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 64, 1, 64, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 64, 2, 32, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 64, 4, 16, NAME, OP) \
|
|
- X_MACRO(BASE, CHAR, 64, 8, 8, NAME, OP)
|
|
+// TODO(janwas): GCC does not yet support fractional LMUL
|
|
+#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 8, m1, /*kShift=*/0, /*MLEN=*/8, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 8, m2, /*kShift=*/1, /*MLEN=*/4, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 8, m4, /*kShift=*/2, /*MLEN=*/2, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 8, m8, /*kShift=*/3, /*MLEN=*/1, NAME, OP)
|
|
+
|
|
+#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 16, m1, /*kShift=*/0, /*MLEN=*/16, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 16, m2, /*kShift=*/1, /*MLEN=*/8, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 16, m4, /*kShift=*/2, /*MLEN=*/4, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 16, m8, /*kShift=*/3, /*MLEN=*/2, NAME, OP)
|
|
+
|
|
+#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 32, m1, /*kShift=*/0, /*MLEN=*/32, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 32, m2, /*kShift=*/1, /*MLEN=*/16, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 32, m4, /*kShift=*/2, /*MLEN=*/8, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 32, m8, /*kShift=*/3, /*MLEN=*/4, NAME, OP)
|
|
+
|
|
+#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 64, m1, /*kShift=*/0, /*MLEN=*/64, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 64, m2, /*kShift=*/1, /*MLEN=*/32, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 64, m4, /*kShift=*/2, /*MLEN=*/16, NAME, OP) \
|
|
+ X_MACRO(BASE, CHAR, 64, m8, /*kShift=*/3, /*MLEN=*/8, NAME, OP)
|
|
|
|
// SEW for unsigned:
|
|
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP) \
|
|
@@ -153,63 +159,61 @@ namespace detail { // for code folding
|
|
|
|
// Assemble types for use in x-macros
|
|
#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
|
|
-#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##m##LMUL
|
|
-#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##m##LMUL##_t
|
|
+#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##LMUL
|
|
+#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
|
|
#define HWY_RVV_M(MLEN) vbool##MLEN##_t
|
|
|
|
} // namespace detail
|
|
|
|
// TODO(janwas): remove typedefs and only use HWY_RVV_V etc. directly
|
|
|
|
-// TODO(janwas): do we want fractional LMUL? (can encode as negative)
|
|
-// Mixed-precision code can use LMUL 1..8 and that should be enough unless they
|
|
-// need many registers.
|
|
-#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- using HWY_RVV_D(CHAR, SEW, LMUL) = \
|
|
- Simd<HWY_RVV_T(BASE, SEW), HWY_LANES(HWY_RVV_T(BASE, SEW)) * LMUL>; \
|
|
- using V##CHAR##SEW##m##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \
|
|
- template <> \
|
|
- struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
|
|
- using Lane = HWY_RVV_T(BASE, SEW); \
|
|
- using type = Simd<Lane, HWY_LANES(Lane) * LMUL>; \
|
|
+// Until we have full intrinsic support for fractional LMUL, mixed-precision
|
|
+// code can use LMUL 1..8 (adequate unless they need many registers).
|
|
+#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ using HWY_RVV_D(CHAR, SEW, LMUL) = Full<HWY_RVV_T(BASE, SEW), SHIFT>; \
|
|
+ using V##CHAR##SEW##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \
|
|
+ template <> \
|
|
+ struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
|
|
+ using Lane = HWY_RVV_T(BASE, SEW); \
|
|
+ using type = Full<Lane, SHIFT>; \
|
|
};
|
|
using Vf16m1 = vfloat16m1_t;
|
|
using Vf16m2 = vfloat16m2_t;
|
|
using Vf16m4 = vfloat16m4_t;
|
|
using Vf16m8 = vfloat16m8_t;
|
|
-using Df16m1 = Simd<float16_t, HWY_LANES(uint16_t) * 1>;
|
|
-using Df16m2 = Simd<float16_t, HWY_LANES(uint16_t) * 2>;
|
|
-using Df16m4 = Simd<float16_t, HWY_LANES(uint16_t) * 4>;
|
|
-using Df16m8 = Simd<float16_t, HWY_LANES(uint16_t) * 8>;
|
|
+using Df16m1 = Full<float16_t, 0>;
|
|
+using Df16m2 = Full<float16_t, 1>;
|
|
+using Df16m4 = Full<float16_t, 2>;
|
|
+using Df16m8 = Full<float16_t, 3>;
|
|
|
|
HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _)
|
|
#undef HWY_SPECIALIZE
|
|
|
|
// vector = f(d), e.g. Zero
|
|
-#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(CHAR, SEW, LMUL) d) { \
|
|
(void)Lanes(d); \
|
|
- return v##OP##_##CHAR##SEW##m##LMUL(); \
|
|
+ return v##OP##_##CHAR##SEW##LMUL(); \
|
|
}
|
|
|
|
// vector = f(vector), e.g. Not
|
|
-#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
- return v##OP##_v_##CHAR##SEW##m##LMUL(v); \
|
|
+ return v##OP##_v_##CHAR##SEW##LMUL(v); \
|
|
}
|
|
|
|
// vector = f(vector, scalar), e.g. detail::Add
|
|
-#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
- NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
|
|
- return v##OP##_##CHAR##SEW##m##LMUL(a, b); \
|
|
+#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
|
|
+ return v##OP##_##CHAR##SEW##LMUL(a, b); \
|
|
}
|
|
|
|
// vector = f(vector, vector), e.g. Add
|
|
-#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
|
|
- return v##OP##_vv_##CHAR##SEW##m##LMUL(a, b); \
|
|
+ return v##OP##_vv_##CHAR##SEW##LMUL(a, b); \
|
|
}
|
|
|
|
// ================================================== INIT
|
|
@@ -218,9 +222,9 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _)
|
|
|
|
// WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL!
|
|
// vlenb is not exposed through intrinsics and vreadvl is not VLMAX.
|
|
-#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \
|
|
- return v##OP##SEW##m##LMUL(); \
|
|
+#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \
|
|
+ return v##OP##SEW##LMUL(); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e)
|
|
@@ -233,19 +237,31 @@ HWY_RVV_FOREACH(HWY_RVV_RETV_ARGD, Zero,
|
|
template <class D>
|
|
using VFromD = decltype(Zero(D()));
|
|
|
|
+// Partial
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API VFromD<Full<T>> Zero(Simd<T, N> /*tag*/) {
|
|
+ return Zero(Full<T>());
|
|
+}
|
|
+
|
|
// ------------------------------ Set
|
|
// vector = f(d, scalar), e.g. Set
|
|
-#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, HWY_RVV_T(BASE, SEW) arg) { \
|
|
(void)Lanes(d); \
|
|
- return v##OP##_##CHAR##SEW##m##LMUL(arg); \
|
|
+ return v##OP##_##CHAR##SEW##LMUL(arg); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x)
|
|
HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f)
|
|
#undef HWY_RVV_SET
|
|
|
|
+// Partial vectors
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API VFromD<Simd<T, N>> Set(Simd<T, N> /*tag*/, T arg) {
|
|
+ return Set(Full<T>(), arg);
|
|
+}
|
|
+
|
|
// ------------------------------ Undefined
|
|
|
|
// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
|
|
@@ -265,7 +281,7 @@ HWY_API VFromD<D> Undefined(D d) {
|
|
namespace detail {
|
|
|
|
// u8: no change
|
|
-#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
return v; \
|
|
@@ -276,25 +292,25 @@ namespace detail {
|
|
}
|
|
|
|
// Other integers
|
|
-#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
- return v##OP##_v_##CHAR##SEW##m##LMUL##_u8m##LMUL(v); \
|
|
- } \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
|
|
- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \
|
|
- return v##OP##_v_u8m##LMUL##_##CHAR##SEW##m##LMUL(v); \
|
|
+#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
+ return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
|
|
+ } \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
|
|
+ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \
|
|
+ return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
|
|
}
|
|
|
|
// Float: first cast to/from unsigned
|
|
-#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
- return v##OP##_v_u##SEW##m##LMUL##_u8m##LMUL( \
|
|
- v##OP##_v_f##SEW##m##LMUL##_u##SEW##m##LMUL(v)); \
|
|
- } \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
|
|
- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \
|
|
- return v##OP##_v_u##SEW##m##LMUL##_f##SEW##m##LMUL( \
|
|
- v##OP##_v_u8m##LMUL##_u##SEW##m##LMUL(v)); \
|
|
+#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
+ return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
|
|
+ v##OP##_v_f##SEW##LMUL##_u##SEW##LMUL(v)); \
|
|
+ } \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
|
|
+ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \
|
|
+ return v##OP##_v_u##SEW##LMUL##_f##SEW##LMUL( \
|
|
+ v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH_U08(HWY_RVV_CAST_NOP, _, _)
|
|
@@ -315,6 +331,12 @@ HWY_API VFromD<D> BitCast(D d, FromV v)
|
|
return detail::BitCastFromByte(d, detail::BitCastToByte(v));
|
|
}
|
|
|
|
+// Partial
|
|
+template <typename T, size_t N, class FromV, HWY_IF_LE128(T, N)>
|
|
+HWY_API VFromD<Simd<T, N>> BitCast(Simd<T, N> /*tag*/, FromV v) {
|
|
+ return BitCast(Full<T>(), v);
|
|
+}
|
|
+
|
|
namespace detail {
|
|
|
|
template <class V, class DU = RebindToUnsigned<DFromV<V>>>
|
|
@@ -336,6 +358,12 @@ HWY_API VFromD<DU> Iota0(const D /*d*/)
|
|
return BitCastToUnsigned(Iota0(DU()));
|
|
}
|
|
|
|
+// Partial
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API VFromD<Simd<T, N>> Iota0(Simd<T, N> /*tag*/) {
|
|
+ return Iota0(Full<T>());
|
|
+}
|
|
+
|
|
} // namespace detail
|
|
|
|
// ================================================== LOGICAL
|
|
@@ -370,11 +398,11 @@ HWY_API V And(const V a, const V b) {
|
|
// ------------------------------ Or
|
|
|
|
// Scalar argument plus mask. Used by VecFromMask.
|
|
-#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_T(BASE, SEW) imm, \
|
|
HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff) { \
|
|
- return v##OP##_##CHAR##SEW##m##LMUL##_m(mask, maskedoff, v, imm); \
|
|
+ return v##OP##_##CHAR##SEW##LMUL##_m(mask, maskedoff, v, imm); \
|
|
}
|
|
|
|
namespace detail {
|
|
@@ -466,14 +494,14 @@ HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV,
|
|
// ------------------------------ ShiftLeft[Same]
|
|
|
|
// Intrinsics do not define .vi forms, so use .vx instead.
|
|
-#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- template <int kBits> \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, kBits); \
|
|
- } \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
- NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
|
|
- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, static_cast<uint8_t>(bits)); \
|
|
+#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ template <int kBits> \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
+ return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits); \
|
|
+ } \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
+ NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
|
|
+ return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits)); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll)
|
|
@@ -486,19 +514,18 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRi
|
|
#undef HWY_RVV_SHIFT
|
|
|
|
// ------------------------------ Shl
|
|
-#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
|
|
- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, bits); \
|
|
+ return v##OP##_vv_##CHAR##SEW##LMUL(v, bits); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll)
|
|
|
|
-#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
|
|
- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, \
|
|
- detail::BitCastToUnsigned(bits)); \
|
|
+ return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits)); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll)
|
|
@@ -569,11 +596,11 @@ HWY_API V ApproximateReciprocalSqrt(cons
|
|
|
|
// ------------------------------ MulAdd
|
|
// Note: op is still named vv, not vvv.
|
|
-#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
|
|
HWY_RVV_V(BASE, SEW, LMUL) add) { \
|
|
- return v##OP##_vv_##CHAR##SEW##m##LMUL(add, mul, x); \
|
|
+ return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc)
|
|
@@ -596,11 +623,11 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub
|
|
// of all bits; SLEN 8 / LMUL 4 = half of all bits.
|
|
|
|
// mask = f(vector, vector)
|
|
-#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_M(MLEN) \
|
|
NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
|
|
(void)Lanes(DFromV<decltype(a)>()); \
|
|
- return v##OP##_vv_##CHAR##SEW##m##LMUL##_b##MLEN(a, b); \
|
|
+ return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b); \
|
|
}
|
|
|
|
// ------------------------------ Eq
|
|
@@ -675,11 +702,11 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xo
|
|
#undef HWY_RVV_RETM_ARGMM
|
|
|
|
// ------------------------------ IfThenElse
|
|
-#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
- NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
|
|
- HWY_RVV_V(BASE, SEW, LMUL) no) { \
|
|
- return v##OP##_vvm_##CHAR##SEW##m##LMUL(m, no, yes); \
|
|
+#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
+ NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
|
|
+ HWY_RVV_V(BASE, SEW, LMUL) no) { \
|
|
+ return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge)
|
|
@@ -710,7 +737,7 @@ template <class D>
|
|
using MFromD = decltype(MaskFromVec(Zero(D())));
|
|
|
|
template <class D, typename MFrom>
|
|
-HWY_API MFromD<D> RebindMask(const D d, const MFrom mask) {
|
|
+HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
|
|
// No need to check lane size/LMUL are the same: if not, casting MFrom to
|
|
// MFromD<D> would fail.
|
|
return mask;
|
|
@@ -774,17 +801,17 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _,
|
|
|
|
// ------------------------------ Load
|
|
|
|
-#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
- NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \
|
|
- const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
|
|
- (void)Lanes(d); \
|
|
- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p); \
|
|
+#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
+ NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \
|
|
+ const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
|
|
+ (void)Lanes(d); \
|
|
+ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p); \
|
|
}
|
|
HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le)
|
|
#undef HWY_RVV_LOAD
|
|
|
|
-// Partial load
|
|
+// Partial
|
|
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
HWY_API VFromD<Simd<T, N>> Load(Simd<T, N> d, const T* HWY_RESTRICT p) {
|
|
return Load(d, p);
|
|
@@ -800,16 +827,22 @@ HWY_API VFromD<D> LoadU(D d, const TFrom
|
|
|
|
// ------------------------------ Store
|
|
|
|
-#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
|
|
- HWY_RVV_D(CHAR, SEW, LMUL) d, \
|
|
- HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
|
|
- (void)Lanes(d); \
|
|
- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p, v); \
|
|
+#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
|
|
+ HWY_RVV_D(CHAR, SEW, LMUL) d, \
|
|
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
|
|
+ (void)Lanes(d); \
|
|
+ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v); \
|
|
}
|
|
HWY_RVV_FOREACH(HWY_RVV_RET_ARGVDP, Store, se)
|
|
#undef HWY_RVV_RET_ARGVDP
|
|
|
|
+// Partial
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API void Store(VFromD<Simd<T, N>> v, Simd<T, N> d, T* HWY_RESTRICT p) {
|
|
+ return Store(v, Full<T>(), p);
|
|
+}
|
|
+
|
|
// ------------------------------ StoreU
|
|
|
|
// RVV only requires lane alignment, not natural alignment of the entire vector.
|
|
@@ -825,19 +858,62 @@ HWY_API void Stream(const V v, D d, T* H
|
|
Store(v, d, aligned);
|
|
}
|
|
|
|
+// ------------------------------ ScatterOffset
|
|
+
|
|
+#define HWY_RVV_SCATTER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
|
|
+ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
|
|
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
|
|
+ HWY_RVV_V(int, SEW, LMUL) offset) { \
|
|
+ return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
|
|
+ base, detail::BitCastToUnsigned(offset), v); \
|
|
+ }
|
|
+HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sx)
|
|
+#undef HWY_RVV_SCATTER
|
|
+
|
|
+// Partial
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API void ScatterOffset(VFromD<Simd<T, N>> v, Simd<T, N> d,
|
|
+ T* HWY_RESTRICT base,
|
|
+ VFromD<Simd<MakeSigned<T>, N>> offset) {
|
|
+ return ScatterOffset(v, Full<T>(), base, offset);
|
|
+}
|
|
+
|
|
+// ------------------------------ ScatterIndex
|
|
+
|
|
+template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
|
|
+HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
|
|
+ const VFromD<RebindToSigned<D>> index) {
|
|
+ return ScatterOffset(v, d, base, ShiftLeft<2>(index));
|
|
+}
|
|
+
|
|
+template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
|
|
+HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
|
|
+ const VFromD<RebindToSigned<D>> index) {
|
|
+ return ScatterOffset(v, d, base, ShiftLeft<3>(index));
|
|
+}
|
|
+
|
|
// ------------------------------ GatherOffset
|
|
|
|
-#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
- NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
|
|
- const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
|
|
- HWY_RVV_V(int, SEW, LMUL) offset) { \
|
|
- return v##OP##ei##SEW##_v_##CHAR##SEW##m##LMUL( \
|
|
- base, detail::BitCastToUnsigned(offset)); \
|
|
+#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
+ NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
|
|
+ const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
|
|
+ HWY_RVV_V(int, SEW, LMUL) offset) { \
|
|
+ return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
|
|
+ base, detail::BitCastToUnsigned(offset)); \
|
|
}
|
|
HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lx)
|
|
#undef HWY_RVV_GATHER
|
|
|
|
+// Partial
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API VFromD<Simd<T, N>> GatherOffset(Simd<T, N> d,
|
|
+ const T* HWY_RESTRICT base,
|
|
+ VFromD<Simd<MakeSigned<T>, N>> offset) {
|
|
+ return GatherOffset(Full<T>(), base, offset);
|
|
+}
|
|
+
|
|
// ------------------------------ GatherIndex
|
|
|
|
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
|
|
@@ -852,37 +928,101 @@ HWY_API VFromD<D> GatherIndex(D d, const
|
|
return GatherOffset(d, base, ShiftLeft<3>(index));
|
|
}
|
|
|
|
-// ================================================== CONVERT
|
|
+// ------------------------------ StoreInterleaved3
|
|
|
|
-// ------------------------------ PromoteTo U
|
|
+#define HWY_RVV_STORE3(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API void NAME( \
|
|
+ HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b, \
|
|
+ HWY_RVV_V(BASE, SEW, LMUL) c, HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
|
|
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
|
|
+ const v##BASE##SEW##LMUL##x3_t triple = \
|
|
+ vcreate_##CHAR##SEW##LMUL##x3(a, b, c); \
|
|
+ return v##OP##e8_v_##CHAR##SEW##LMUL##x3(unaligned, triple); \
|
|
+ }
|
|
+// Segments are limited to 8 registers, so we can only go up to LMUL=2.
|
|
+HWY_RVV_STORE3(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved3, sseg3)
|
|
+HWY_RVV_STORE3(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved3, sseg3)
|
|
|
|
-HWY_API Vu16m2 PromoteTo(Du16m2 /* d */, Vu8m1 v) { return vzext_vf2_u16m2(v); }
|
|
-HWY_API Vu16m4 PromoteTo(Du16m4 /* d */, Vu8m2 v) { return vzext_vf2_u16m4(v); }
|
|
-HWY_API Vu16m8 PromoteTo(Du16m8 /* d */, Vu8m4 v) { return vzext_vf2_u16m8(v); }
|
|
+#undef HWY_RVV_STORE3
|
|
|
|
-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, Vu8m1 v) { return vzext_vf4_u32m4(v); }
|
|
-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, Vu8m2 v) { return vzext_vf4_u32m8(v); }
|
|
+// Partial
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API void StoreInterleaved3(VFromD<Simd<T, N>> v0, VFromD<Simd<T, N>> v1,
|
|
+ VFromD<Simd<T, N>> v2, Simd<T, N> /*tag*/,
|
|
+ T* unaligned) {
|
|
+ return StoreInterleaved3(v0, v1, v2, Full<T>(), unaligned);
|
|
+}
|
|
+
|
|
+// ------------------------------ StoreInterleaved4
|
|
+
|
|
+#define HWY_RVV_STORE4(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API void NAME( \
|
|
+ HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
|
|
+ HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \
|
|
+ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
|
|
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \
|
|
+ const v##BASE##SEW##LMUL##x4_t quad = \
|
|
+ vcreate_##CHAR##SEW##LMUL##x4(v0, v1, v2, v3); \
|
|
+ return v##OP##e8_v_##CHAR##SEW##LMUL##x4(aligned, quad); \
|
|
+ }
|
|
+// Segments are limited to 8 registers, so we can only go up to LMUL=2.
|
|
+HWY_RVV_STORE4(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved4, sseg4)
|
|
+HWY_RVV_STORE4(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved4, sseg4)
|
|
|
|
-HWY_API Vu32m2 PromoteTo(Du32m2 /* d */, const Vu16m1 v) {
|
|
- return vzext_vf2_u32m2(v);
|
|
-}
|
|
-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, const Vu16m2 v) {
|
|
- return vzext_vf2_u32m4(v);
|
|
-}
|
|
-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, const Vu16m4 v) {
|
|
- return vzext_vf2_u32m8(v);
|
|
-}
|
|
+#undef HWY_RVV_STORE4
|
|
|
|
-HWY_API Vu64m2 PromoteTo(Du64m2 /* d */, const Vu32m1 v) {
|
|
- return vzext_vf2_u64m2(v);
|
|
-}
|
|
-HWY_API Vu64m4 PromoteTo(Du64m4 /* d */, const Vu32m2 v) {
|
|
- return vzext_vf2_u64m4(v);
|
|
-}
|
|
-HWY_API Vu64m8 PromoteTo(Du64m8 /* d */, const Vu32m4 v) {
|
|
- return vzext_vf2_u64m8(v);
|
|
+// Partial
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API void StoreInterleaved4(VFromD<Simd<T, N>> v0, VFromD<Simd<T, N>> v1,
|
|
+ VFromD<Simd<T, N>> v2, VFromD<Simd<T, N>> v3,
|
|
+ Simd<T, N> /*tag*/, T* unaligned) {
|
|
+ return StoreInterleaved4(v0, v1, v2, v3, Full<T>(), unaligned);
|
|
}
|
|
|
|
+// ================================================== CONVERT
|
|
+
|
|
+#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN) \
|
|
+ HWY_API HWY_RVV_V(BASE, BITS, LMUL) \
|
|
+ PromoteTo(HWY_RVV_D(CHAR, BITS, LMUL) /*d*/, \
|
|
+ HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
|
|
+ return OP##CHAR##BITS##LMUL(v); \
|
|
+ }
|
|
+
|
|
+// TODO(janwas): GCC does not yet support fractional LMUL
|
|
+#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
|
|
+ /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2)*/ \
|
|
+ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1) \
|
|
+ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2) \
|
|
+ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4)
|
|
+
|
|
+#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
|
|
+ /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4)*/ \
|
|
+ /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2)*/ \
|
|
+ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1) \
|
|
+ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2)
|
|
+
|
|
+// ------------------------------ PromoteTo
|
|
+
|
|
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 16, uint, 8)
|
|
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 32, uint, 16)
|
|
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 64, uint, 32)
|
|
+HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8)
|
|
+
|
|
+HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 16, int, 8)
|
|
+HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 32, int, 16)
|
|
+HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 64, int, 32)
|
|
+HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8)
|
|
+
|
|
+HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 32, float, 16)
|
|
+HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 64, float, 32)
|
|
+
|
|
+// i32 to f64
|
|
+HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32)
|
|
+
|
|
+#undef HWY_RVV_PROMOTE_X4
|
|
+#undef HWY_RVV_PROMOTE_X2
|
|
+#undef HWY_RVV_PROMOTE
|
|
+
|
|
template <size_t N>
|
|
HWY_API VFromD<Simd<int16_t, N>> PromoteTo(Simd<int16_t, N> d,
|
|
VFromD<Simd<uint8_t, N>> v) {
|
|
@@ -901,67 +1041,6 @@ HWY_API VFromD<Simd<int32_t, N>> Promote
|
|
return BitCast(d, PromoteTo(Simd<uint32_t, N>(), v));
|
|
}
|
|
|
|
-// ------------------------------ PromoteTo I
|
|
-
|
|
-HWY_API Vi16m2 PromoteTo(Di16m2 /* d */, Vi8m1 v) { return vsext_vf2_i16m2(v); }
|
|
-HWY_API Vi16m4 PromoteTo(Di16m4 /* d */, Vi8m2 v) { return vsext_vf2_i16m4(v); }
|
|
-HWY_API Vi16m8 PromoteTo(Di16m8 /* d */, Vi8m4 v) { return vsext_vf2_i16m8(v); }
|
|
-
|
|
-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, Vi8m1 v) { return vsext_vf4_i32m4(v); }
|
|
-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, Vi8m2 v) { return vsext_vf4_i32m8(v); }
|
|
-
|
|
-HWY_API Vi32m2 PromoteTo(Di32m2 /* d */, const Vi16m1 v) {
|
|
- return vsext_vf2_i32m2(v);
|
|
-}
|
|
-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, const Vi16m2 v) {
|
|
- return vsext_vf2_i32m4(v);
|
|
-}
|
|
-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, const Vi16m4 v) {
|
|
- return vsext_vf2_i32m8(v);
|
|
-}
|
|
-
|
|
-HWY_API Vi64m2 PromoteTo(Di64m2 /* d */, const Vi32m1 v) {
|
|
- return vsext_vf2_i64m2(v);
|
|
-}
|
|
-HWY_API Vi64m4 PromoteTo(Di64m4 /* d */, const Vi32m2 v) {
|
|
- return vsext_vf2_i64m4(v);
|
|
-}
|
|
-HWY_API Vi64m8 PromoteTo(Di64m8 /* d */, const Vi32m4 v) {
|
|
- return vsext_vf2_i64m8(v);
|
|
-}
|
|
-
|
|
-// ------------------------------ PromoteTo F
|
|
-
|
|
-HWY_API Vf32m2 PromoteTo(Df32m2 /* d */, const Vf16m1 v) {
|
|
- return vfwcvt_f_f_v_f32m2(v);
|
|
-}
|
|
-HWY_API Vf32m4 PromoteTo(Df32m4 /* d */, const Vf16m2 v) {
|
|
- return vfwcvt_f_f_v_f32m4(v);
|
|
-}
|
|
-HWY_API Vf32m8 PromoteTo(Df32m8 /* d */, const Vf16m4 v) {
|
|
- return vfwcvt_f_f_v_f32m8(v);
|
|
-}
|
|
-
|
|
-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vf32m1 v) {
|
|
- return vfwcvt_f_f_v_f64m2(v);
|
|
-}
|
|
-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vf32m2 v) {
|
|
- return vfwcvt_f_f_v_f64m4(v);
|
|
-}
|
|
-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vf32m4 v) {
|
|
- return vfwcvt_f_f_v_f64m8(v);
|
|
-}
|
|
-
|
|
-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vi32m1 v) {
|
|
- return vfwcvt_f_x_v_f64m2(v);
|
|
-}
|
|
-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vi32m2 v) {
|
|
- return vfwcvt_f_x_v_f64m4(v);
|
|
-}
|
|
-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vi32m4 v) {
|
|
- return vfwcvt_f_x_v_f64m8(v);
|
|
-}
|
|
-
|
|
// ------------------------------ DemoteTo U
|
|
|
|
// First clamp negative numbers to zero to match x86 packus.
|
|
@@ -1062,19 +1141,19 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */,
|
|
|
|
// ------------------------------ ConvertTo F
|
|
|
|
-#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
|
|
HWY_RVV_D(CHAR, SEW, LMUL) /* d */, HWY_RVV_V(int, SEW, LMUL) v) { \
|
|
- return vfcvt_f_x_v_f##SEW##m##LMUL(v); \
|
|
+ return vfcvt_f_x_v_f##SEW##LMUL(v); \
|
|
} \
|
|
/* Truncates (rounds toward zero). */ \
|
|
HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(i, SEW, LMUL) /* d */, \
|
|
HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
- return vfcvt_rtz_x_f_v_i##SEW##m##LMUL(v); \
|
|
+ return vfcvt_rtz_x_f_v_i##SEW##LMUL(v); \
|
|
} \
|
|
/* Uses default rounding mode. */ \
|
|
HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
- return vfcvt_x_f_v_i##SEW##m##LMUL(v); \
|
|
+ return vfcvt_x_f_v_i##SEW##LMUL(v); \
|
|
}
|
|
|
|
// API only requires f32 but we provide f64 for internal use (otherwise, it
|
|
@@ -1082,16 +1161,23 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */,
|
|
HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _)
|
|
#undef HWY_RVV_CONVERT
|
|
|
|
+// Partial
|
|
+template <typename T, size_t N, class FromV, HWY_IF_LE128(T, N)>
|
|
+HWY_API VFromD<Simd<T, N>> ConvertTo(Simd<T, N> /*tag*/, FromV v) {
|
|
+ return ConvertTo(Full<T>(), v);
|
|
+}
|
|
+
|
|
// ================================================== SWIZZLE
|
|
|
|
// ------------------------------ Compress
|
|
|
|
-#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \
|
|
- return v##OP##_vm_##CHAR##SEW##m##LMUL(mask, v, v); \
|
|
+#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \
|
|
+ return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v); \
|
|
}
|
|
|
|
+HWY_RVV_FOREACH_UI16(HWY_RVV_COMPRESS, Compress, compress)
|
|
HWY_RVV_FOREACH_UI32(HWY_RVV_COMPRESS, Compress, compress)
|
|
HWY_RVV_FOREACH_UI64(HWY_RVV_COMPRESS, Compress, compress)
|
|
HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress)
|
|
@@ -1121,10 +1207,10 @@ HWY_API VFromD<DU> SetTableIndices(D d,
|
|
|
|
// <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX
|
|
// to 2048! We could instead use vrgatherei16.
|
|
-#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
|
|
- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, idx); \
|
|
+ return v##OP##_vv_##CHAR##SEW##LMUL(v, idx); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather)
|
|
@@ -1216,7 +1302,6 @@ HWY_API V OffsetsOf128BitBlocks(const D
|
|
using T = MakeUnsigned<TFromD<D>>;
|
|
return detail::And(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
|
|
}
|
|
-
|
|
} // namespace detail
|
|
|
|
template <class V>
|
|
@@ -1244,9 +1329,9 @@ HWY_API V Broadcast(const V v) {
|
|
|
|
// ------------------------------ GetLane
|
|
|
|
-#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
- return v##OP##_s_##CHAR##SEW##m##LMUL##_##CHAR##SEW(v); \
|
|
+#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
+ return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x)
|
|
@@ -1255,11 +1340,12 @@ HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetL
|
|
|
|
// ------------------------------ ShiftLeftLanes
|
|
|
|
-// vector = f(vector, size_t)
|
|
-#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t lanes) { \
|
|
- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, v, lanes); \
|
|
+// vector = f(vector, vector, size_t)
|
|
+#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
|
|
+ size_t lanes) { \
|
|
+ return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes); \
|
|
}
|
|
|
|
namespace detail {
|
|
@@ -1270,7 +1356,7 @@ template <size_t kLanes, class V>
|
|
HWY_API V ShiftLeftLanes(const V v) {
|
|
using D = DFromV<V>;
|
|
const RebindToSigned<D> di;
|
|
- const auto shifted = detail::SlideUp(v, kLanes);
|
|
+ const auto shifted = detail::SlideUp(v, v, kLanes);
|
|
// Match x86 semantics by zeroing lower lanes in 128-bit blocks
|
|
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
|
|
const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1);
|
|
@@ -1300,7 +1386,7 @@ template <size_t kLanes, class V>
|
|
HWY_API V ShiftRightLanes(const V v) {
|
|
using D = DFromV<V>;
|
|
const RebindToSigned<D> di;
|
|
- const auto shifted = detail::SlideDown(v, kLanes);
|
|
+ const auto shifted = detail::SlideDown(v, v, kLanes);
|
|
// Match x86 semantics by zeroing upper lanes in 128-bit blocks
|
|
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
|
|
const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1);
|
|
@@ -1342,7 +1428,7 @@ HWY_API V ConcatUpperLower(const V hi, c
|
|
template <class V>
|
|
HWY_API V ConcatLowerLower(const V hi, const V lo) {
|
|
// Move lower half into upper
|
|
- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV<V>()) / 2);
|
|
+ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV<V>()) / 2);
|
|
return ConcatUpperLower(hi_up, lo);
|
|
}
|
|
|
|
@@ -1351,7 +1437,7 @@ HWY_API V ConcatLowerLower(const V hi, c
|
|
template <class V>
|
|
HWY_API V ConcatUpperUpper(const V hi, const V lo) {
|
|
// Move upper half into lower
|
|
- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV<V>()) / 2);
|
|
+ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV<V>()) / 2);
|
|
return ConcatUpperLower(hi, lo_down);
|
|
}
|
|
|
|
@@ -1360,8 +1446,8 @@ HWY_API V ConcatUpperUpper(const V hi, c
|
|
template <class V>
|
|
HWY_API V ConcatLowerUpper(const V hi, const V lo) {
|
|
// Move half of both inputs to the other half
|
|
- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV<V>()) / 2);
|
|
- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV<V>()) / 2);
|
|
+ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV<V>()) / 2);
|
|
+ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV<V>()) / 2);
|
|
return ConcatUpperLower(hi_up, lo_down);
|
|
}
|
|
|
|
@@ -1428,61 +1514,55 @@ HWY_API V Combine(const V a, const V b)
|
|
// ================================================== REDUCE
|
|
|
|
// vector = f(vector, zero_m1)
|
|
-#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, 1) v0) { \
|
|
- vsetvlmax_e##SEW##m##LMUL(); \
|
|
- return Set(HWY_RVV_D(CHAR, SEW, LMUL)(), \
|
|
- GetLane(v##OP##_vs_##CHAR##SEW##m##LMUL##_##CHAR##SEW##m1( \
|
|
- v0, v, v0))); \
|
|
+#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \
|
|
+ vsetvlmax_e##SEW##LMUL(); \
|
|
+ return Set( \
|
|
+ HWY_RVV_D(CHAR, SEW, LMUL)(), \
|
|
+ GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(v0, v, v0))); \
|
|
}
|
|
|
|
// ------------------------------ SumOfLanes
|
|
|
|
namespace detail {
|
|
-
|
|
HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum)
|
|
HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredsum)
|
|
-
|
|
} // namespace detail
|
|
|
|
template <class V>
|
|
HWY_API V SumOfLanes(const V v) {
|
|
using T = TFromV<V>;
|
|
- const auto v0 = Zero(Simd<T, HWY_LANES(T)>()); // always m1
|
|
+ const auto v0 = Zero(Full<T>()); // always m1
|
|
return detail::RedSum(v, v0);
|
|
}
|
|
|
|
// ------------------------------ MinOfLanes
|
|
namespace detail {
|
|
-
|
|
HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu)
|
|
HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin)
|
|
HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin)
|
|
-
|
|
} // namespace detail
|
|
|
|
template <class V>
|
|
HWY_API V MinOfLanes(const V v) {
|
|
using T = TFromV<V>;
|
|
- const Simd<T, HWY_LANES(T)> d1; // always m1
|
|
+ const Full<T> d1; // always m1
|
|
const auto neutral = Set(d1, HighestValue<T>());
|
|
return detail::RedMin(v, neutral);
|
|
}
|
|
|
|
// ------------------------------ MaxOfLanes
|
|
namespace detail {
|
|
-
|
|
HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu)
|
|
HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax)
|
|
HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax)
|
|
-
|
|
} // namespace detail
|
|
|
|
template <class V>
|
|
HWY_API V MaxOfLanes(const V v) {
|
|
using T = TFromV<V>;
|
|
- const Simd<T, HWY_LANES(T)> d1; // always m1
|
|
+ const Full<T> d1; // always m1
|
|
const auto neutral = Set(d1, LowestValue<T>());
|
|
return detail::RedMax(v, neutral);
|
|
}
|
|
@@ -1507,7 +1587,7 @@ HWY_API VFromD<D> LoadDup128(D d, const
|
|
#define HWY_RVV_STORE_MASK_BITS(MLEN, NAME, OP) \
|
|
HWY_API size_t StoreMaskBits(HWY_RVV_M(MLEN) m, uint8_t* p) { \
|
|
/* LMUL=1 is always enough */ \
|
|
- Simd<uint8_t, HWY_LANES(uint8_t)> d8; \
|
|
+ Full<uint8_t> d8; \
|
|
const size_t num_bytes = (Lanes(d8) + MLEN - 1) / MLEN; \
|
|
/* TODO(janwas): how to convert vbool* to vuint?*/ \
|
|
/*Store(m, d8, p);*/ \
|
|
@@ -1518,6 +1598,22 @@ HWY_API VFromD<D> LoadDup128(D d, const
|
|
HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, _, _)
|
|
#undef HWY_RVV_STORE_MASK_BITS
|
|
|
|
+// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
|
|
+
|
|
+// Disallow for 8-bit because Iota is likely to overflow.
|
|
+template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
|
|
+HWY_API MFromD<D> FirstN(const D d, const size_t n) {
|
|
+ const RebindToSigned<D> di;
|
|
+ return RebindMask(d, Lt(BitCast(di, detail::Iota0(d)), Set(di, n)));
|
|
+}
|
|
+
|
|
+template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
|
|
+HWY_API MFromD<D> FirstN(const D d, const size_t n) {
|
|
+ const auto zero = Zero(d);
|
|
+ const auto one = Set(d, 1);
|
|
+ return Eq(detail::SlideUp(one, zero, n), one);
|
|
+}
|
|
+
|
|
// ------------------------------ Neg
|
|
|
|
template <class V, HWY_IF_SIGNED_V(V)>
|
|
@@ -1526,9 +1622,9 @@ HWY_API V Neg(const V v) {
|
|
}
|
|
|
|
// vector = f(vector), but argument is repeated
|
|
-#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
|
|
+#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
|
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, v); \
|
|
+ return v##OP##_vv_##CHAR##SEW##LMUL(v, v); \
|
|
}
|
|
|
|
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn)
|
|
@@ -1565,7 +1661,6 @@ template <class V>
|
|
HWY_API auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
|
|
return Lt(Abs(v), Set(DFromV<V>(), MantissaEnd<TFromV<V>>()));
|
|
}
|
|
-
|
|
} // namespace detail
|
|
|
|
template <class V>
|
|
@@ -1636,10 +1731,8 @@ HWY_API VFromD<D> Iota(const D d, TFromD
|
|
// Using vwmul does not work for m8, so use mulh instead. Highway only provides
|
|
// MulHigh for 16-bit, so use a private wrapper.
|
|
namespace detail {
|
|
-
|
|
HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu)
|
|
HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh)
|
|
-
|
|
} // namespace detail
|
|
|
|
template <class V>
|
|
@@ -1649,7 +1742,7 @@ HWY_API VFromD<RepartitionToWide<DFromV<
|
|
const auto lo = Mul(a, b);
|
|
const auto hi = detail::MulHigh(a, b);
|
|
const RepartitionToWide<DFromV<V>> dw;
|
|
- return BitCast(dw, OddEven(detail::SlideUp(hi, 1), lo));
|
|
+ return BitCast(dw, OddEven(detail::SlideUp(hi, hi, 1), lo));
|
|
}
|
|
|
|
// ================================================== END MACROS
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 2021-06-02 10:56:05.237904402 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -19,7 +19,6 @@
|
|
#include <stdint.h>
|
|
|
|
#include <algorithm> // std::min
|
|
-#include <cmath>
|
|
|
|
#include "hwy/base.h"
|
|
#include "hwy/ops/shared-inl.h"
|
|
@@ -199,7 +198,7 @@ HWY_API Vec1<T> BroadcastSignBit(const V
|
|
template <typename TFrom, typename TTo>
|
|
HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
|
|
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
|
|
- return Mask1<TTo>(m.raw);
|
|
+ return Mask1<TTo>{m.bits};
|
|
}
|
|
|
|
// v must be 0 or FF..FF.
|
|
@@ -224,6 +223,11 @@ Vec1<T> VecFromMask(Sisd<T> /* tag */, c
|
|
return v;
|
|
}
|
|
|
|
+template <typename T>
|
|
+HWY_INLINE Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
|
|
+ return Mask1<T>::FromBool(n != 0);
|
|
+}
|
|
+
|
|
// Returns mask ? yes : no.
|
|
template <typename T>
|
|
HWY_INLINE Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
|
|
@@ -357,9 +361,9 @@ HWY_INLINE Vec1<T> operator>>(const Vec1
|
|
|
|
template <typename T>
|
|
HWY_INLINE Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
|
|
- const uint64_t a64 = static_cast<int64_t>(a.raw);
|
|
- const uint64_t b64 = static_cast<int64_t>(b.raw);
|
|
- return Vec1<T>((a64 + b64) & ~T(0));
|
|
+ const uint64_t a64 = static_cast<uint64_t>(a.raw);
|
|
+ const uint64_t b64 = static_cast<uint64_t>(b.raw);
|
|
+ return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
|
|
}
|
|
HWY_INLINE Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
|
|
return Vec1<float>(a.raw + b.raw);
|
|
@@ -370,9 +374,9 @@ HWY_INLINE Vec1<double> operator+(const
|
|
|
|
template <typename T>
|
|
HWY_INLINE Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
|
|
- const uint64_t a64 = static_cast<int64_t>(a.raw);
|
|
- const uint64_t b64 = static_cast<int64_t>(b.raw);
|
|
- return Vec1<T>((a64 - b64) & ~T(0));
|
|
+ const uint64_t a64 = static_cast<uint64_t>(a.raw);
|
|
+ const uint64_t b64 = static_cast<uint64_t>(b.raw);
|
|
+ return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
|
|
}
|
|
HWY_INLINE Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
|
|
return Vec1<float>(a.raw - b.raw);
|
|
@@ -388,21 +392,25 @@ HWY_INLINE Vec1<double> operator-(const
|
|
// Unsigned
|
|
HWY_INLINE Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
|
|
const Vec1<uint8_t> b) {
|
|
- return Vec1<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255));
|
|
+ return Vec1<uint8_t>(
|
|
+ static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
|
|
}
|
|
HWY_INLINE Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
|
|
const Vec1<uint16_t> b) {
|
|
- return Vec1<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535));
|
|
+ return Vec1<uint16_t>(
|
|
+ static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
|
|
}
|
|
|
|
// Signed
|
|
HWY_INLINE Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a,
|
|
const Vec1<int8_t> b) {
|
|
- return Vec1<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127));
|
|
+ return Vec1<int8_t>(
|
|
+ static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
|
|
}
|
|
HWY_INLINE Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
|
|
const Vec1<int16_t> b) {
|
|
- return Vec1<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767));
|
|
+ return Vec1<int16_t>(
|
|
+ static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
|
|
}
|
|
|
|
// ------------------------------ Saturating subtraction
|
|
@@ -412,21 +420,25 @@ HWY_INLINE Vec1<int16_t> SaturatedAdd(co
|
|
// Unsigned
|
|
HWY_INLINE Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
|
|
const Vec1<uint8_t> b) {
|
|
- return Vec1<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255));
|
|
+ return Vec1<uint8_t>(
|
|
+ static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
|
|
}
|
|
HWY_INLINE Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
|
|
const Vec1<uint16_t> b) {
|
|
- return Vec1<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535));
|
|
+ return Vec1<uint16_t>(
|
|
+ static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
|
|
}
|
|
|
|
// Signed
|
|
HWY_INLINE Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a,
|
|
const Vec1<int8_t> b) {
|
|
- return Vec1<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127));
|
|
+ return Vec1<int8_t>(
|
|
+ static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
|
|
}
|
|
HWY_INLINE Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
|
|
const Vec1<int16_t> b) {
|
|
- return Vec1<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767));
|
|
+ return Vec1<int16_t>(
|
|
+ static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
|
|
}
|
|
|
|
// ------------------------------ Average
|
|
@@ -435,11 +447,11 @@ HWY_INLINE Vec1<int16_t> SaturatedSub(co
|
|
|
|
HWY_INLINE Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
|
|
const Vec1<uint8_t> b) {
|
|
- return Vec1<uint8_t>((a.raw + b.raw + 1) / 2);
|
|
+ return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
|
|
}
|
|
HWY_INLINE Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
|
|
const Vec1<uint16_t> b) {
|
|
- return Vec1<uint16_t>((a.raw + b.raw + 1) / 2);
|
|
+ return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
|
|
}
|
|
|
|
// ------------------------------ Absolute value
|
|
@@ -514,15 +526,15 @@ HWY_INLINE Vec1<T> operator/(const Vec1<
|
|
|
|
// Returns the upper 16 bits of a * b in each lane.
|
|
HWY_INLINE Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
|
|
- return Vec1<int16_t>((a.raw * b.raw) >> 16);
|
|
+ return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
|
|
}
|
|
HWY_INLINE Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a,
|
|
const Vec1<uint16_t> b) {
|
|
// Cast to uint32_t first to prevent overflow. Otherwise the result of
|
|
// uint16_t * uint16_t is in "int" which may overflow. In practice the result
|
|
// is the same but this way it is also defined.
|
|
- return Vec1<uint16_t>(
|
|
- (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16);
|
|
+ return Vec1<uint16_t>(static_cast<uint16_t>(
|
|
+ (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
|
|
}
|
|
|
|
// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
|
|
@@ -617,6 +629,31 @@ HWY_INLINE Vec1<T> Round(const Vec1<T> v
|
|
return Vec1<T>(static_cast<T>(rounded));
|
|
}
|
|
|
|
+// Round-to-nearest even.
|
|
+HWY_INLINE Vec1<int32_t> NearestInt(const Vec1<float> v) {
|
|
+ using T = float;
|
|
+ using TI = int32_t;
|
|
+
|
|
+ const T abs = Abs(v).raw;
|
|
+ const bool signbit = std::signbit(v.raw);
|
|
+
|
|
+ if (!(abs < MantissaEnd<T>())) { // Huge or NaN
|
|
+ // Check if too large to cast or NaN
|
|
+ if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
|
|
+ return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
|
|
+ }
|
|
+ return Vec1<int32_t>(static_cast<TI>(v.raw));
|
|
+ }
|
|
+ const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
|
|
+ const TI rounded = static_cast<TI>(v.raw + bias);
|
|
+ if (rounded == 0) return Vec1<int32_t>(0);
|
|
+ // Round to even
|
|
+ if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
|
|
+ return Vec1<TI>(rounded - (signbit ? -1 : 1));
|
|
+ }
|
|
+ return Vec1<TI>(rounded);
|
|
+}
|
|
+
|
|
template <typename T>
|
|
HWY_INLINE Vec1<T> Trunc(const Vec1<T> v) {
|
|
using TI = MakeSigned<T>;
|
|
@@ -641,7 +678,8 @@ V Ceiling(const V v) {
|
|
Bits bits;
|
|
CopyBytes<sizeof(Bits)>(&v, &bits);
|
|
|
|
- const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias;
|
|
+ const int exponent =
|
|
+ static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
|
|
// Already an integer.
|
|
if (exponent >= kMantissaBits) return v;
|
|
// |v| <= 1 => 0 or 1.
|
|
@@ -672,7 +710,8 @@ V Floor(const V v) {
|
|
Bits bits;
|
|
CopyBytes<sizeof(Bits)>(&v, &bits);
|
|
|
|
- const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias;
|
|
+ const int exponent =
|
|
+ static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
|
|
// Already an integer.
|
|
if (exponent >= kMantissaBits) return v;
|
|
// |v| <= 1 => -1 or 0.
|
|
@@ -772,6 +811,26 @@ HWY_INLINE void StoreU(const Vec1<T> v,
|
|
return Store(v, d, p);
|
|
}
|
|
|
|
+// ------------------------------ StoreInterleaved3
|
|
+
|
|
+HWY_API void StoreInterleaved3(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
|
|
+ const Vec1<uint8_t> v2, Sisd<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ StoreU(v0, d, unaligned + 0);
|
|
+ StoreU(v1, d, unaligned + 1);
|
|
+ StoreU(v2, d, unaligned + 2);
|
|
+}
|
|
+
|
|
+HWY_API void StoreInterleaved4(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
|
|
+ const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
|
|
+ Sisd<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ StoreU(v0, d, unaligned + 0);
|
|
+ StoreU(v1, d, unaligned + 1);
|
|
+ StoreU(v2, d, unaligned + 2);
|
|
+ StoreU(v3, d, unaligned + 3);
|
|
+}
|
|
+
|
|
// ------------------------------ Stream
|
|
|
|
template <typename T>
|
|
@@ -779,12 +838,29 @@ HWY_INLINE void Stream(const Vec1<T> v,
|
|
return Store(v, d, aligned);
|
|
}
|
|
|
|
+// ------------------------------ Scatter
|
|
+
|
|
+template <typename T, typename Offset>
|
|
+HWY_INLINE void ScatterOffset(Vec1<T> v, Sisd<T> d, T* base,
|
|
+ const Vec1<Offset> offset) {
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+ uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
|
|
+ return Store(v, d, reinterpret_cast<T*>(base8));
|
|
+}
|
|
+
|
|
+template <typename T, typename Index>
|
|
+HWY_INLINE void ScatterIndex(Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT base,
|
|
+ const Vec1<Index> index) {
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+ return Store(v, d, base + index.raw);
|
|
+}
|
|
+
|
|
// ------------------------------ Gather
|
|
|
|
template <typename T, typename Offset>
|
|
HWY_INLINE Vec1<T> GatherOffset(Sisd<T> d, const T* base,
|
|
const Vec1<Offset> offset) {
|
|
- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
|
|
return Load(d, reinterpret_cast<const T*>(addr));
|
|
}
|
|
@@ -792,7 +868,7 @@ HWY_INLINE Vec1<T> GatherOffset(Sisd<T>
|
|
template <typename T, typename Index>
|
|
HWY_INLINE Vec1<T> GatherIndex(Sisd<T> d, const T* HWY_RESTRICT base,
|
|
const Vec1<Index> index) {
|
|
- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
return Load(d, base + index.raw);
|
|
}
|
|
|
|
@@ -833,15 +909,20 @@ HWY_INLINE Vec1<ToT> DemoteTo(Sisd<ToT>
|
|
|
|
static HWY_INLINE Vec1<float> PromoteTo(Sisd<float> /* tag */,
|
|
const Vec1<float16_t> v) {
|
|
+#if HWY_NATIVE_FLOAT16
|
|
uint16_t bits16;
|
|
CopyBytes<2>(&v.raw, &bits16);
|
|
+#else
|
|
+ const uint16_t bits16 = v.raw.bits;
|
|
+#endif
|
|
const uint32_t sign = bits16 >> 15;
|
|
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
|
|
const uint32_t mantissa = bits16 & 0x3FF;
|
|
|
|
// Subnormal or zero
|
|
if (biased_exp == 0) {
|
|
- const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
|
|
+ const float subnormal =
|
|
+ (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
|
|
return Vec1<float>(sign ? -subnormal : subnormal);
|
|
}
|
|
|
|
@@ -867,8 +948,12 @@ static HWY_INLINE Vec1<float16_t> Demote
|
|
// Tiny or zero => zero.
|
|
Vec1<float16_t> out;
|
|
if (exp < -24) {
|
|
- bits32 = 0;
|
|
- CopyBytes<2>(&bits32, &out);
|
|
+#if HWY_NATIVE_FLOAT16
|
|
+ const uint16_t zero = 0;
|
|
+ CopyBytes<2>(&zero, &out.raw);
|
|
+#else
|
|
+ out.raw.bits = 0;
|
|
+#endif
|
|
return out;
|
|
}
|
|
|
|
@@ -890,7 +975,12 @@ static HWY_INLINE Vec1<float16_t> Demote
|
|
HWY_DASSERT(mantissa16 < 1024);
|
|
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
|
|
HWY_DASSERT(bits16 < 0x10000);
|
|
- CopyBytes<2>(&bits16, &out);
|
|
+#if HWY_NATIVE_FLOAT16
|
|
+ const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
|
|
+ CopyBytes<2>(&narrowed, &out.raw);
|
|
+#else
|
|
+ out.raw.bits = static_cast<uint16_t>(bits16);
|
|
+#endif
|
|
return out;
|
|
}
|
|
|
|
@@ -919,18 +1009,6 @@ HWY_INLINE Vec1<uint8_t> U8FromU32(const
|
|
return DemoteTo(Sisd<uint8_t>(), v);
|
|
}
|
|
|
|
-// Approximation of round-to-nearest for numbers representable as int32_t.
|
|
-HWY_INLINE Vec1<int32_t> NearestInt(const Vec1<float> v) {
|
|
- const float f = v.raw;
|
|
- if (std::isinf(f) ||
|
|
- std::fabs(f) > static_cast<float>(LimitsMax<int32_t>())) {
|
|
- return Vec1<int32_t>(std::signbit(f) ? LimitsMin<int32_t>()
|
|
- : LimitsMax<int32_t>());
|
|
- }
|
|
- const float bias = f < 0.0f ? -0.5f : 0.5f;
|
|
- return Vec1<int32_t>(static_cast<int>(f + bias));
|
|
-}
|
|
-
|
|
// ================================================== SWIZZLE
|
|
|
|
// Unsupported: Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle*,
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 2021-06-02 10:56:05.224904336 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -31,11 +31,6 @@
|
|
#undef HWY_ALIGN
|
|
#undef HWY_LANES
|
|
|
|
-#undef HWY_GATHER_LANES
|
|
-#undef HWY_VARIABLE_SHIFT_LANES
|
|
-#undef HWY_COMPARE64_LANES
|
|
-#undef HWY_MINMAX64_LANES
|
|
-
|
|
#undef HWY_CAP_INTEGER64
|
|
#undef HWY_CAP_FLOAT64
|
|
#undef HWY_CAP_GE256
|
|
@@ -53,11 +48,6 @@
|
|
#define HWY_ALIGN alignas(16)
|
|
#define HWY_LANES(T) (16 / sizeof(T))
|
|
|
|
-#define HWY_GATHER_LANES(T) 1
|
|
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
|
|
-#define HWY_COMPARE64_LANES 2
|
|
-#define HWY_MINMAX64_LANES 1
|
|
-
|
|
#define HWY_CAP_INTEGER64 1
|
|
#define HWY_CAP_FLOAT64 1
|
|
#define HWY_CAP_GE256 0
|
|
@@ -73,11 +63,6 @@
|
|
#define HWY_ALIGN alignas(32)
|
|
#define HWY_LANES(T) (32 / sizeof(T))
|
|
|
|
-#define HWY_GATHER_LANES(T) HWY_LANES(T)
|
|
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
|
|
-#define HWY_COMPARE64_LANES 4
|
|
-#define HWY_MINMAX64_LANES 1
|
|
-
|
|
#define HWY_CAP_INTEGER64 1
|
|
#define HWY_CAP_FLOAT64 1
|
|
#define HWY_CAP_GE256 1
|
|
@@ -96,11 +81,6 @@
|
|
#define HWY_ALIGN alignas(64)
|
|
#define HWY_LANES(T) (64 / sizeof(T))
|
|
|
|
-#define HWY_GATHER_LANES(T) HWY_LANES(T)
|
|
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
|
|
-#define HWY_COMPARE64_LANES 8
|
|
-#define HWY_MINMAX64_LANES 8
|
|
-
|
|
#define HWY_CAP_INTEGER64 1
|
|
#define HWY_CAP_FLOAT64 1
|
|
#define HWY_CAP_GE256 1
|
|
@@ -121,11 +101,6 @@
|
|
#define HWY_ALIGN alignas(16)
|
|
#define HWY_LANES(T) (16 / sizeof(T))
|
|
|
|
-#define HWY_GATHER_LANES(T) 1
|
|
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
|
|
-#define HWY_COMPARE64_LANES 2
|
|
-#define HWY_MINMAX64_LANES 2
|
|
-
|
|
#define HWY_CAP_INTEGER64 1
|
|
#define HWY_CAP_FLOAT64 1
|
|
#define HWY_CAP_GE256 0
|
|
@@ -142,19 +117,14 @@
|
|
#define HWY_ALIGN alignas(16)
|
|
#define HWY_LANES(T) (16 / sizeof(T))
|
|
|
|
-#define HWY_GATHER_LANES(T) 1
|
|
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
|
|
-#define HWY_MINMAX64_LANES 2
|
|
-#define HWY_COMPARE64_LANES 2
|
|
-
|
|
#define HWY_CAP_INTEGER64 1
|
|
#define HWY_CAP_GE256 0
|
|
#define HWY_CAP_GE512 0
|
|
|
|
-#ifdef __arm__
|
|
-#define HWY_CAP_FLOAT64 0
|
|
-#else
|
|
+#if HWY_ARCH_ARM_A64
|
|
#define HWY_CAP_FLOAT64 1
|
|
+#else
|
|
+#define HWY_CAP_FLOAT64 0
|
|
#endif
|
|
|
|
#define HWY_NAMESPACE N_NEON
|
|
@@ -162,17 +132,34 @@
|
|
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
|
|
|
|
//-----------------------------------------------------------------------------
|
|
+// SVE[2]
|
|
+#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE
|
|
+
|
|
+// SVE only requires lane alignment, not natural alignment of the entire vector.
|
|
+#define HWY_ALIGN alignas(8)
|
|
+// Upper bound, not the actual lane count!
|
|
+#define HWY_LANES(T) (256 / sizeof(T))
|
|
+
|
|
+#define HWY_CAP_INTEGER64 1
|
|
+#define HWY_CAP_FLOAT64 1
|
|
+#define HWY_CAP_GE256 0
|
|
+#define HWY_CAP_GE512 0
|
|
+
|
|
+#if HWY_TARGET == HWY_SVE2
|
|
+#define HWY_NAMESPACE N_SVE2
|
|
+#else
|
|
+#define HWY_NAMESPACE N_SVE
|
|
+#endif
|
|
+
|
|
+// HWY_TARGET_STR remains undefined - TODO(janwas): attribute for SVE?
|
|
+
|
|
+//-----------------------------------------------------------------------------
|
|
// WASM
|
|
#elif HWY_TARGET == HWY_WASM
|
|
|
|
#define HWY_ALIGN alignas(16)
|
|
#define HWY_LANES(T) (16 / sizeof(T))
|
|
|
|
-#define HWY_GATHER_LANES(T) 1
|
|
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
|
|
-#define HWY_COMPARE64_LANES 2
|
|
-#define HWY_MINMAX64_LANES 2
|
|
-
|
|
#define HWY_CAP_INTEGER64 0
|
|
#define HWY_CAP_FLOAT64 0
|
|
#define HWY_CAP_GE256 0
|
|
@@ -194,11 +181,6 @@
|
|
// mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h.
|
|
#define HWY_LANES(T) (4096 / sizeof(T))
|
|
|
|
-#define HWY_GATHER_LANES(T) HWY_LANES(T)
|
|
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
|
|
-// Cannot use HWY_LANES/sizeof here because these are used in an #if.
|
|
-#define HWY_COMPARE64_LANES 256
|
|
-#define HWY_MINMAX64_LANES 256
|
|
|
|
#define HWY_CAP_INTEGER64 1
|
|
#define HWY_CAP_FLOAT64 1
|
|
@@ -215,13 +197,9 @@
|
|
#elif HWY_TARGET == HWY_SCALAR
|
|
|
|
#define HWY_ALIGN
|
|
+// For internal use only; use Lanes(d) instead.
|
|
#define HWY_LANES(T) 1
|
|
|
|
-#define HWY_GATHER_LANES(T) 1
|
|
-#define HWY_VARIABLE_SHIFT_LANES(T) 1
|
|
-#define HWY_COMPARE64_LANES 1
|
|
-#define HWY_MINMAX64_LANES 1
|
|
-
|
|
#define HWY_CAP_INTEGER64 1
|
|
#define HWY_CAP_FLOAT64 1
|
|
#define HWY_CAP_GE256 0
|
|
@@ -265,3 +243,7 @@
|
|
#else
|
|
#define HWY_ATTR
|
|
#endif
|
|
+
|
|
+// DEPRECATED
|
|
+#undef HWY_GATHER_LANES
|
|
+#define HWY_GATHER_LANES(T) HWY_LANES(T)
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 2021-06-02 10:56:05.235904392 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -14,6 +14,8 @@
|
|
|
|
// Per-target definitions shared by ops/*.h and user code.
|
|
|
|
+#include <cmath>
|
|
+
|
|
// Separate header because foreach_target.h re-enables its include guard.
|
|
#include "hwy/ops/set_macros-inl.h"
|
|
|
|
@@ -106,7 +108,7 @@ HWY_INLINE HWY_MAYBE_UNUSED constexpr si
|
|
}
|
|
|
|
// Targets with non-constexpr Lanes define this themselves.
|
|
-#if HWY_TARGET != HWY_RVV
|
|
+#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE
|
|
|
|
// (Potentially) non-constant actual size of the vector at runtime, subject to
|
|
// the limit imposed by the Simd. Useful for advancing loop counters.
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 2021-06-02 10:56:05.242904427 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -19,8 +19,6 @@
|
|
#include <stdint.h>
|
|
#include <wasm_simd128.h>
|
|
|
|
-#include <cmath>
|
|
-
|
|
#include "hwy/base.h"
|
|
#include "hwy/ops/shared-inl.h"
|
|
|
|
@@ -177,6 +175,16 @@ HWY_API Vec128<T, N> Undefined(Simd<T, N
|
|
|
|
HWY_DIAGNOSTICS(pop)
|
|
|
|
+// Returns a vector with lane i=[0, N) set to "first" + i.
|
|
+template <typename T, size_t N, typename T2>
|
|
+Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
|
|
+ HWY_ALIGN T lanes[16 / sizeof(T)];
|
|
+ for (size_t i = 0; i < 16 / sizeof(T); ++i) {
|
|
+ lanes[i] = static_cast<T>(first + static_cast<T2>(i));
|
|
+ }
|
|
+ return Load(d, lanes);
|
|
+}
|
|
+
|
|
// ================================================== ARITHMETIC
|
|
|
|
// ------------------------------ Addition
|
|
@@ -273,24 +281,24 @@ HWY_API Vec128<float, N> operator-(const
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
- return Vec128<uint8_t, N>{wasm_u8x16_add_saturate(a.raw, b.raw)};
|
|
+ return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
- return Vec128<uint16_t, N>{wasm_u16x8_add_saturate(a.raw, b.raw)};
|
|
+ return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
|
|
}
|
|
|
|
// Signed
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
- return Vec128<int8_t, N>{wasm_i8x16_add_saturate(a.raw, b.raw)};
|
|
+ return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
- return Vec128<int16_t, N>{wasm_i16x8_add_saturate(a.raw, b.raw)};
|
|
+ return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ Saturating subtraction
|
|
@@ -301,24 +309,24 @@ HWY_API Vec128<int16_t, N> SaturatedAdd(
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
- return Vec128<uint8_t, N>{wasm_u8x16_sub_saturate(a.raw, b.raw)};
|
|
+ return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
- return Vec128<uint16_t, N>{wasm_u16x8_sub_saturate(a.raw, b.raw)};
|
|
+ return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
|
|
}
|
|
|
|
// Signed
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
- return Vec128<int8_t, N>{wasm_i8x16_sub_saturate(a.raw, b.raw)};
|
|
+ return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
- return Vec128<int16_t, N>{wasm_i16x8_sub_saturate(a.raw, b.raw)};
|
|
+ return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ Average
|
|
@@ -352,6 +360,12 @@ template <size_t N>
|
|
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
|
|
return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
|
|
}
|
|
+template <size_t N>
|
|
+HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
|
|
+ // TODO(janwas): use wasm_i64x2_abs when available
|
|
+ const Vec128<int64_t, N> mask = wasm_i64x2_shr(v.raw, 63);
|
|
+ return ((v ^ mask) - mask);
|
|
+}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
|
|
@@ -396,9 +410,38 @@ HWY_API Vec128<int32_t, N> ShiftRight(co
|
|
return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
|
|
}
|
|
|
|
+// 8-bit
|
|
+template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
|
|
+HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
|
|
+ const Simd<T, N> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
|
|
+ return kBits == 1
|
|
+ ? (v + v)
|
|
+ : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
|
|
+}
|
|
+
|
|
+template <int kBits, size_t N>
|
|
+HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
|
|
+ const Simd<uint8_t, N> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec128<uint8_t, N> shifted{
|
|
+ ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
|
|
+ return shifted & Set(d8, 0xFF >> kBits);
|
|
+}
|
|
+
|
|
+template <int kBits, size_t N>
|
|
+HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
|
|
+ const Simd<int8_t, N> di;
|
|
+ const Simd<uint8_t, N> du;
|
|
+ const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
|
|
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
|
|
+ return (shifted ^ shifted_sign) - shifted_sign;
|
|
+}
|
|
+
|
|
// ------------------------------ Shift lanes by same variable #bits
|
|
|
|
-// Unsigned (no u8)
|
|
+// Unsigned
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
|
|
const int bits) {
|
|
@@ -420,7 +463,7 @@ HWY_API Vec128<uint32_t, N> ShiftRightSa
|
|
return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
|
|
}
|
|
|
|
-// Signed (no i8)
|
|
+// Signed
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
|
|
const int bits) {
|
|
@@ -442,6 +485,35 @@ HWY_API Vec128<int32_t, N> ShiftRightSam
|
|
return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
|
|
}
|
|
|
|
+// 8-bit
|
|
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
|
|
+HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
|
|
+ const Simd<T, N> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec128<T, N> shifted{
|
|
+ ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
|
|
+ return shifted & Set(d8, (0xFF << bits) & 0xFF);
|
|
+}
|
|
+
|
|
+template <size_t N>
|
|
+HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
|
|
+ const int bits) {
|
|
+ const Simd<uint8_t, N> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec128<uint8_t, N> shifted{
|
|
+ ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
|
|
+ return shifted & Set(d8, 0xFF >> bits);
|
|
+}
|
|
+
|
|
+template <size_t N>
|
|
+HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
|
|
+ const Simd<int8_t, N> di;
|
|
+ const Simd<uint8_t, N> du;
|
|
+ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
|
|
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
|
|
+ return (shifted ^ shifted_sign) - shifted_sign;
|
|
+}
|
|
+
|
|
// ------------------------------ Minimum
|
|
|
|
// Unsigned
|
|
@@ -607,29 +679,29 @@ template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
// TODO(eustas): replace, when implemented in WASM.
|
|
- const auto al = wasm_i32x4_widen_low_u16x8(a.raw);
|
|
- const auto ah = wasm_i32x4_widen_high_u16x8(a.raw);
|
|
- const auto bl = wasm_i32x4_widen_low_u16x8(b.raw);
|
|
- const auto bh = wasm_i32x4_widen_high_u16x8(b.raw);
|
|
+ const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
|
|
+ const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
|
|
+ const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
|
|
+ const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
|
|
const auto l = wasm_i32x4_mul(al, bl);
|
|
const auto h = wasm_i32x4_mul(ah, bh);
|
|
// TODO(eustas): shift-right + narrow?
|
|
return Vec128<uint16_t, N>{
|
|
- wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
|
|
+ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
// TODO(eustas): replace, when implemented in WASM.
|
|
- const auto al = wasm_i32x4_widen_low_i16x8(a.raw);
|
|
- const auto ah = wasm_i32x4_widen_high_i16x8(a.raw);
|
|
- const auto bl = wasm_i32x4_widen_low_i16x8(b.raw);
|
|
- const auto bh = wasm_i32x4_widen_high_i16x8(b.raw);
|
|
+ const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
|
|
+ const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
|
|
+ const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
|
|
+ const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
|
|
const auto l = wasm_i32x4_mul(al, bl);
|
|
const auto h = wasm_i32x4_mul(ah, bh);
|
|
// TODO(eustas): shift-right + narrow?
|
|
return Vec128<int16_t, N>{
|
|
- wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
|
|
+ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
|
|
}
|
|
|
|
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
|
|
@@ -765,53 +837,76 @@ HWY_API Vec128<float, N> ApproximateReci
|
|
// Toward nearest integer, ties to even
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
|
|
- // TODO(eustas): is it f32x4.nearest? (not implemented yet)
|
|
- alignas(16) float input[4];
|
|
- alignas(16) float output[4];
|
|
- wasm_v128_store(input, v.raw);
|
|
- for (size_t i = 0; i < 4; ++i) {
|
|
- output[i] = std::nearbyint(input[i]);
|
|
- }
|
|
- return Vec128<float, N>{wasm_v128_load(output)};
|
|
+ // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not
|
|
+ // yet have an instruction for that (f32x4.nearest is not implemented). We
|
|
+ // rely on rounding after addition with a large value such that no mantissa
|
|
+ // bits remain (assuming the current mode is nearest-even). We may need a
|
|
+ // compiler flag for precise floating-point to prevent "optimizing" this out.
|
|
+ const Simd<float, N> df;
|
|
+ const auto max = Set(df, MantissaEnd<float>());
|
|
+ const auto large = CopySignToAbs(max, v);
|
|
+ const auto added = large + v;
|
|
+ const auto rounded = added - large;
|
|
+
|
|
+ // Keep original if NaN or the magnitude is large (already an int).
|
|
+ return IfThenElse(Abs(v) < max, rounded, v);
|
|
}
|
|
|
|
+namespace detail {
|
|
+
|
|
+// Truncating to integer and converting back to float is correct except when the
|
|
+// input magnitude is large, in which case the input was already an integer
|
|
+// (because mantissa >> exponent is zero).
|
|
+template <size_t N>
|
|
+HWY_API Mask128<float, N> UseInt(const Vec128<float, N> v) {
|
|
+ return Abs(v) < Set(Simd<float, N>(), MantissaEnd<float>());
|
|
+}
|
|
+
|
|
+} // namespace detail
|
|
+
|
|
// Toward zero, aka truncate
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
|
|
// TODO(eustas): is it f32x4.trunc? (not implemented yet)
|
|
- alignas(16) float input[4];
|
|
- alignas(16) float output[4];
|
|
- wasm_v128_store(input, v.raw);
|
|
- for (size_t i = 0; i < 4; ++i) {
|
|
- output[i] = std::trunc(input[i]);
|
|
- }
|
|
- return Vec128<float, N>{wasm_v128_load(output)};
|
|
+ const Simd<float, N> df;
|
|
+ const RebindToSigned<decltype(df)> di;
|
|
+
|
|
+ const auto integer = ConvertTo(di, v); // round toward 0
|
|
+ const auto int_f = ConvertTo(df, integer);
|
|
+
|
|
+ return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
|
|
}
|
|
|
|
// Toward +infinity, aka ceiling
|
|
template <size_t N>
|
|
-HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
|
|
+HWY_INLINE Vec128<float, N> Ceil(const Vec128<float, N> v) {
|
|
// TODO(eustas): is it f32x4.ceil? (not implemented yet)
|
|
- alignas(16) float input[4];
|
|
- alignas(16) float output[4];
|
|
- wasm_v128_store(input, v.raw);
|
|
- for (size_t i = 0; i < 4; ++i) {
|
|
- output[i] = std::ceil(input[i]);
|
|
- }
|
|
- return Vec128<float, N>{wasm_v128_load(output)};
|
|
+ const Simd<float, N> df;
|
|
+ const RebindToSigned<decltype(df)> di;
|
|
+
|
|
+ const auto integer = ConvertTo(di, v); // round toward 0
|
|
+ const auto int_f = ConvertTo(df, integer);
|
|
+
|
|
+ // Truncating a positive non-integer ends up smaller; if so, add 1.
|
|
+ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
|
|
+
|
|
+ return IfThenElse(detail::UseInt(v), int_f - neg1, v);
|
|
}
|
|
|
|
// Toward -infinity, aka floor
|
|
template <size_t N>
|
|
-HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
|
|
+HWY_INLINE Vec128<float, N> Floor(const Vec128<float, N> v) {
|
|
// TODO(eustas): is it f32x4.floor? (not implemented yet)
|
|
- alignas(16) float input[4];
|
|
- alignas(16) float output[4];
|
|
- wasm_v128_store(input, v.raw);
|
|
- for (size_t i = 0; i < 4; ++i) {
|
|
- output[i] = std::floor(input[i]);
|
|
- }
|
|
- return Vec128<float, N>{wasm_v128_load(output)};
|
|
+ const Simd<float, N> df;
|
|
+ const RebindToSigned<decltype(df)> di;
|
|
+
|
|
+ const auto integer = ConvertTo(di, v); // round toward 0
|
|
+ const auto int_f = ConvertTo(df, integer);
|
|
+
|
|
+ // Truncating a negative non-integer ends up larger; if so, subtract 1.
|
|
+ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
|
|
+
|
|
+ return IfThenElse(detail::UseInt(v), int_f + neg1, v);
|
|
}
|
|
|
|
// ================================================== COMPARE
|
|
@@ -902,12 +997,12 @@ HWY_API Mask128<int64_t, N> operator>(co
|
|
|
|
// Otherwise, the lower half decides.
|
|
const auto m_eq = a32 == b32;
|
|
- const auto lo_in_hi = wasm_v32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
|
|
+ const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
|
|
const auto lo_gt = And(m_eq, lo_in_hi);
|
|
|
|
const auto gt = Or(lo_gt, m_gt);
|
|
// Copy result in upper 32 bits to lower 32 bits.
|
|
- return Mask128<int64_t, N>{wasm_v32x4_shuffle(gt, gt, 3, 3, 1, 1)};
|
|
+ return Mask128<int64_t, N>{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)};
|
|
}
|
|
|
|
template <size_t N>
|
|
@@ -935,6 +1030,14 @@ HWY_API Mask128<float, N> operator>=(con
|
|
return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
|
|
}
|
|
|
|
+// ------------------------------ FirstN (Iota, Lt)
|
|
+
|
|
+template <typename T, size_t N>
|
|
+HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
|
|
+ const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
|
|
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
|
|
+}
|
|
+
|
|
// ================================================== LOGICAL
|
|
|
|
// ------------------------------ Not
|
|
@@ -1015,7 +1118,7 @@ HWY_API Vec128<T, N> BroadcastSignBit(co
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
|
|
- return VecFromMask(v < Zero(Simd<int8_t, N>()));
|
|
+ return VecFromMask(Simd<int8_t, N>(), v < Zero(Simd<int8_t, N>()));
|
|
}
|
|
|
|
// ------------------------------ Mask
|
|
@@ -1278,26 +1381,73 @@ HWY_API void Stream(Vec128<T, N> v, Simd
|
|
wasm_v128_store(aligned, v.raw);
|
|
}
|
|
|
|
-// ------------------------------ Gather
|
|
+// ------------------------------ Scatter (Store)
|
|
+
|
|
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
|
|
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
|
|
+ const Vec128<Offset, N> offset) {
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ Store(v, d, lanes);
|
|
+
|
|
+ alignas(16) Offset offset_lanes[N];
|
|
+ Store(offset, Simd<Offset, N>(), offset_lanes);
|
|
+
|
|
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
|
|
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
|
|
+ const Vec128<Index, N> index) {
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ Store(v, d, lanes);
|
|
+
|
|
+ alignas(16) Index index_lanes[N];
|
|
+ Store(index, Simd<Index, N>(), index_lanes);
|
|
+
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ base[index_lanes[i]] = lanes[i];
|
|
+ }
|
|
+}
|
|
+
|
|
+// ------------------------------ Gather (Load/Store)
|
|
|
|
template <typename T, size_t N, typename Offset>
|
|
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
|
|
const T* HWY_RESTRICT base,
|
|
const Vec128<Offset, N> offset) {
|
|
- static_assert(N == 1, "Wasm does not support full gather");
|
|
- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
|
|
- const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
|
|
- T val;
|
|
- CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
|
|
- return Set(d, val);
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+
|
|
+ alignas(16) Offset offset_lanes[N];
|
|
+ Store(offset, Simd<Offset, N>(), offset_lanes);
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
|
|
+ }
|
|
+ return Load(d, lanes);
|
|
}
|
|
|
|
template <typename T, size_t N, typename Index>
|
|
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
|
|
const Vec128<Index, N> index) {
|
|
- static_assert(N == 1, "Wasm does not support full gather");
|
|
- static_assert(sizeof(T) == sizeof(Index), "T must match Index");
|
|
- return Set(d, base[GetLane(index)]);
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+
|
|
+ alignas(16) Index index_lanes[N];
|
|
+ Store(index, Simd<Index, N>(), index_lanes);
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ lanes[i] = base[index_lanes[i]];
|
|
+ }
|
|
+ return Load(d, lanes);
|
|
}
|
|
|
|
// ================================================== SWIZZLE
|
|
@@ -1346,12 +1496,12 @@ HWY_API Vec128<T, N / 2> LowerHalf(Vec12
|
|
template <typename T>
|
|
HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Vec128<T> v) {
|
|
// TODO(eustas): use swizzle?
|
|
- return Vec128<T, 8 / sizeof(T)>{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
|
|
+ return Vec128<T, 8 / sizeof(T)>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
|
|
}
|
|
template <>
|
|
HWY_INLINE Vec128<float, 2> UpperHalf(Vec128<float> v) {
|
|
// TODO(eustas): use swizzle?
|
|
- return Vec128<float, 2>{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
|
|
+ return Vec128<float, 2>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
|
|
}
|
|
|
|
// ------------------------------ Shift vector by constant #bytes
|
|
@@ -1366,64 +1516,64 @@ HWY_API Vec128<T> ShiftLeftBytes(const V
|
|
return v;
|
|
|
|
case 1:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
|
|
7, 8, 9, 10, 11, 12, 13, 14)};
|
|
|
|
case 2:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
|
|
6, 7, 8, 9, 10, 11, 12, 13)};
|
|
|
|
case 3:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
|
|
4, 5, 6, 7, 8, 9, 10, 11, 12)};
|
|
|
|
case 4:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
|
|
3, 4, 5, 6, 7, 8, 9, 10, 11)};
|
|
|
|
case 5:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
|
|
2, 3, 4, 5, 6, 7, 8, 9, 10)};
|
|
|
|
case 6:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
|
|
|
|
case 7:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
|
|
|
|
case 8:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
|
|
|
|
case 9:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};
|
|
|
|
case 10:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};
|
|
|
|
case 11:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};
|
|
|
|
case 12:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};
|
|
|
|
case 13:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};
|
|
|
|
case 14:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
16, 16, 16, 16, 16, 16, 16, 16, 0,
|
|
1)};
|
|
|
|
case 15:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
|
|
16, 16, 16, 16, 16, 16, 16, 16, 16,
|
|
0)};
|
|
}
|
|
@@ -1447,69 +1597,69 @@ HWY_API Vec128<T> ShiftRightBytes(const
|
|
return v;
|
|
|
|
case 1:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8,
|
|
9, 10, 11, 12, 13, 14, 15, 16)};
|
|
|
|
case 2:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9,
|
|
10, 11, 12, 13, 14, 15, 16, 16)};
|
|
|
|
case 3:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10,
|
|
11, 12, 13, 14, 15, 16, 16, 16)};
|
|
|
|
case 4:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11,
|
|
12, 13, 14, 15, 16, 16, 16, 16)};
|
|
|
|
case 5:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11,
|
|
12, 13, 14, 15, 16, 16, 16, 16, 16)};
|
|
|
|
case 6:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12,
|
|
13, 14, 15, 16, 16, 16, 16, 16, 16)};
|
|
|
|
case 7:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13,
|
|
14, 15, 16, 16, 16, 16, 16, 16, 16)};
|
|
|
|
case 8:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14,
|
|
15, 16, 16, 16, 16, 16, 16, 16, 16)};
|
|
|
|
case 9:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14,
|
|
15, 16, 16, 16, 16, 16, 16, 16, 16,
|
|
16)};
|
|
|
|
case 10:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15,
|
|
16, 16, 16, 16, 16, 16, 16, 16, 16,
|
|
16)};
|
|
|
|
case 11:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16,
|
|
16, 16, 16, 16, 16, 16, 16, 16, 16,
|
|
16)};
|
|
|
|
case 12:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16,
|
|
16, 16, 16, 16, 16, 16, 16, 16, 16,
|
|
16)};
|
|
|
|
case 13:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16,
|
|
16, 16, 16, 16, 16, 16, 16, 16, 16,
|
|
16)};
|
|
|
|
case 14:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16,
|
|
16, 16, 16, 16, 16, 16, 16, 16, 16,
|
|
16)};
|
|
|
|
case 15:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16,
|
|
16, 16, 16, 16, 16, 16, 16, 16, 16,
|
|
16)};
|
|
}
|
|
@@ -1535,72 +1685,72 @@ HWY_API Vec128<T> CombineShiftRightBytes
|
|
return lo;
|
|
|
|
case 1:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 16)};
|
|
|
|
case 2:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
|
|
9, 10, 11, 12, 13, 14, 15, 16, 17)};
|
|
|
|
case 3:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
|
|
10, 11, 12, 13, 14, 15, 16, 17, 18)};
|
|
|
|
case 4:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
|
|
11, 12, 13, 14, 15, 16, 17, 18, 19)};
|
|
|
|
case 5:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
|
|
12, 13, 14, 15, 16, 17, 18, 19, 20)};
|
|
|
|
case 6:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
|
|
12, 13, 14, 15, 16, 17, 18, 19, 20,
|
|
21)};
|
|
|
|
case 7:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
|
|
13, 14, 15, 16, 17, 18, 19, 20, 21,
|
|
22)};
|
|
|
|
case 8:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
|
|
14, 15, 16, 17, 18, 19, 20, 21, 22,
|
|
23)};
|
|
|
|
case 9:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
|
|
15, 16, 17, 18, 19, 20, 21, 22, 23,
|
|
24)};
|
|
|
|
case 10:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
|
|
15, 16, 17, 18, 19, 20, 21, 22, 23,
|
|
24, 25)};
|
|
|
|
case 11:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
|
|
16, 17, 18, 19, 20, 21, 22, 23, 24,
|
|
25, 26)};
|
|
|
|
case 12:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
|
|
17, 18, 19, 20, 21, 22, 23, 24, 25,
|
|
26, 27)};
|
|
|
|
case 13:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
|
|
18, 19, 20, 21, 22, 23, 24, 25, 26,
|
|
27, 28)};
|
|
|
|
case 14:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
|
|
19, 20, 21, 22, 23, 24, 25, 26, 27,
|
|
28, 29)};
|
|
|
|
case 15:
|
|
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
|
|
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
|
|
20, 21, 22, 23, 24, 25, 26, 27, 28,
|
|
29, 30)};
|
|
}
|
|
@@ -1613,28 +1763,28 @@ HWY_API Vec128<T> CombineShiftRightBytes
|
|
template <int kLane, size_t N>
|
|
HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
- return Vec128<uint16_t, N>{wasm_v16x8_shuffle(
|
|
+ return Vec128<uint16_t, N>{wasm_i16x8_shuffle(
|
|
v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
|
|
}
|
|
template <int kLane, size_t N>
|
|
HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
return Vec128<uint32_t, N>{
|
|
- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
|
|
+ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
|
|
}
|
|
|
|
// Signed
|
|
template <int kLane, size_t N>
|
|
HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
- return Vec128<int16_t, N>{wasm_v16x8_shuffle(
|
|
+ return Vec128<int16_t, N>{wasm_i16x8_shuffle(
|
|
v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
|
|
}
|
|
template <int kLane, size_t N>
|
|
HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
return Vec128<int32_t, N>{
|
|
- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
|
|
+ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
|
|
}
|
|
|
|
// Float
|
|
@@ -1642,7 +1792,7 @@ template <int kLane, size_t N>
|
|
HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
return Vec128<float, N>{
|
|
- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
|
|
+ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
|
|
}
|
|
|
|
// ------------------------------ Shuffle bytes with variable indices
|
|
@@ -1652,16 +1802,23 @@ HWY_API Vec128<float, N> Broadcast(const
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> TableLookupBytes(const Vec128<T, N> bytes,
|
|
const Vec128<T, N> from) {
|
|
- // TODO(eustas): use swizzle? (shuffle does not work for variable indices)
|
|
+// Not yet available in all engines, see
|
|
+// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
|
|
+// V8 implementation of this had a bug, fixed on 2021-04-03:
|
|
+// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
|
|
+#if 0
|
|
+ return Vec128<T, N>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
|
|
+#else
|
|
alignas(16) uint8_t control[16];
|
|
alignas(16) uint8_t input[16];
|
|
alignas(16) uint8_t output[16];
|
|
wasm_v128_store(control, from.raw);
|
|
wasm_v128_store(input, bytes.raw);
|
|
for (size_t i = 0; i < 16; ++i) {
|
|
- output[i] = input[control[i]];
|
|
+ output[i] = control[i] < 16 ? input[control[i]] : 0;
|
|
}
|
|
return Vec128<T, N>{wasm_v128_load(output)};
|
|
+#endif
|
|
}
|
|
|
|
// ------------------------------ Hard-coded shuffles
|
|
@@ -1673,101 +1830,102 @@ HWY_API Vec128<T, N> TableLookupBytes(co
|
|
|
|
// Swap 32-bit halves in 64-bit halves.
|
|
HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
|
|
- return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
|
|
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
|
|
}
|
|
HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
|
|
- return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
|
|
+ return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
|
|
}
|
|
HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
|
|
- return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
|
|
+ return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
|
|
}
|
|
|
|
// Swap 64-bit halves
|
|
HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
|
|
- return Vec128<uint32_t>{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)};
|
|
+ return Vec128<uint32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
|
|
}
|
|
HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
|
|
- return Vec128<int32_t>{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)};
|
|
+ return Vec128<int32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
|
|
}
|
|
HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
|
|
- return Vec128<float>{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)};
|
|
+ return Vec128<float>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
|
|
}
|
|
|
|
// Rotate right 32 bits
|
|
HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
|
|
- return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
|
|
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
|
|
}
|
|
HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
|
|
- return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
|
|
+ return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
|
|
}
|
|
HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
|
|
- return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
|
|
+ return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
|
|
}
|
|
// Rotate left 32 bits
|
|
HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
|
|
- return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
|
|
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
|
|
}
|
|
HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
|
|
- return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
|
|
+ return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
|
|
}
|
|
HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
|
|
- return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
|
|
+ return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
|
|
}
|
|
|
|
// Reverse
|
|
HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
|
|
- return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
|
|
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
|
|
}
|
|
HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
|
|
- return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
|
|
+ return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
|
|
}
|
|
HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
|
|
- return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
|
|
+ return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
|
|
}
|
|
|
|
// ------------------------------ TableLookupLanes
|
|
|
|
// Returned by SetTableIndices for use by TableLookupLanes.
|
|
-template <typename T>
|
|
+template <typename T, size_t N>
|
|
struct Indices128 {
|
|
__v128_u raw;
|
|
};
|
|
|
|
-template <typename T>
|
|
-HWY_API Indices128<T> SetTableIndices(Full128<T>, const int32_t* idx) {
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
|
|
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
|
|
- const size_t N = 16 / sizeof(T);
|
|
for (size_t i = 0; i < N; ++i) {
|
|
HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
|
|
}
|
|
#endif
|
|
|
|
- const Full128<uint8_t> d8;
|
|
- alignas(16) uint8_t control[16]; // = Lanes()
|
|
- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
|
|
- const size_t idx_lane = idx_byte / sizeof(T);
|
|
- const size_t mod = idx_byte % sizeof(T);
|
|
- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod;
|
|
+ const Repartition<uint8_t, decltype(d)> d8;
|
|
+ alignas(16) uint8_t control[16] = {0};
|
|
+ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
|
|
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
|
|
+ control[idx_lane * sizeof(T) + idx_byte] =
|
|
+ static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
|
|
+ }
|
|
}
|
|
- return Indices128<T>{Load(d8, control).raw};
|
|
+ return Indices128<T, N>{Load(d8, control).raw};
|
|
}
|
|
|
|
-HWY_API Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
|
|
- const Indices128<uint32_t> idx) {
|
|
- return TableLookupBytes(v, Vec128<uint32_t>{idx.raw});
|
|
+template <size_t N>
|
|
+HWY_API Vec128<uint32_t, N> TableLookupLanes(
|
|
+ const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
|
|
+ return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
|
|
}
|
|
-
|
|
-HWY_API Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
|
|
- const Indices128<int32_t> idx) {
|
|
- return TableLookupBytes(v, Vec128<int32_t>{idx.raw});
|
|
+template <size_t N>
|
|
+HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
|
|
+ const Indices128<int32_t, N> idx) {
|
|
+ return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
|
|
}
|
|
-
|
|
-HWY_API Vec128<float> TableLookupLanes(const Vec128<float> v,
|
|
- const Indices128<float> idx) {
|
|
- const Full128<int32_t> di;
|
|
- const Full128<float> df;
|
|
+template <size_t N>
|
|
+HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
|
|
+ const Indices128<float, N> idx) {
|
|
+ const Simd<int32_t, N> di;
|
|
+ const Simd<float, N> df;
|
|
return BitCast(df,
|
|
- TableLookupBytes(BitCast(di, v), Vec128<int32_t>{idx.raw}));
|
|
+ TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
|
|
}
|
|
|
|
// ------------------------------ Zip lanes
|
|
@@ -1778,33 +1936,33 @@ HWY_API Vec128<float> TableLookupLanes(c
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, (N + 1) / 2> ZipLower(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
- return Vec128<uint16_t, (N + 1) / 2>{wasm_v8x16_shuffle(
|
|
+ return Vec128<uint16_t, (N + 1) / 2>{wasm_i8x16_shuffle(
|
|
a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, (N + 1) / 2> ZipLower(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
return Vec128<uint32_t, (N + 1) / 2>{
|
|
- wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
|
|
+ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, (N + 1) / 2> ZipLower(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
- return Vec128<int16_t, (N + 1) / 2>{wasm_v8x16_shuffle(
|
|
+ return Vec128<int16_t, (N + 1) / 2>{wasm_i8x16_shuffle(
|
|
a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, (N + 1) / 2> ZipLower(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
return Vec128<int32_t, (N + 1) / 2>{
|
|
- wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
|
|
+ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N / 2> ZipUpper(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
- return Vec128<uint16_t, N / 2>{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
|
|
+ return Vec128<uint16_t, N / 2>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
|
|
10, 26, 11, 27, 12, 28, 13,
|
|
29, 14, 30, 15, 31)};
|
|
}
|
|
@@ -1812,13 +1970,13 @@ template <size_t N>
|
|
HWY_API Vec128<uint32_t, N / 2> ZipUpper(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
return Vec128<uint32_t, N / 2>{
|
|
- wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
|
|
+ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N / 2> ZipUpper(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
- return Vec128<int16_t, N / 2>{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
|
|
+ return Vec128<int16_t, N / 2>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
|
|
10, 26, 11, 27, 12, 28, 13,
|
|
29, 14, 30, 15, 31)};
|
|
}
|
|
@@ -1826,7 +1984,7 @@ template <size_t N>
|
|
HWY_API Vec128<int32_t, N / 2> ZipUpper(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
return Vec128<int32_t, N / 2>{
|
|
- wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
|
|
+ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
|
|
}
|
|
|
|
// ------------------------------ Interleave lanes
|
|
@@ -1842,17 +2000,17 @@ HWY_API Vec128<T> InterleaveLower(const
|
|
template <>
|
|
HWY_INLINE Vec128<uint32_t> InterleaveLower<uint32_t>(
|
|
const Vec128<uint32_t> a, const Vec128<uint32_t> b) {
|
|
- return Vec128<uint32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
|
|
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
|
|
}
|
|
template <>
|
|
HWY_INLINE Vec128<int32_t> InterleaveLower<int32_t>(const Vec128<int32_t> a,
|
|
const Vec128<int32_t> b) {
|
|
- return Vec128<int32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
|
|
+ return Vec128<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
|
|
}
|
|
template <>
|
|
HWY_INLINE Vec128<float> InterleaveLower<float>(const Vec128<float> a,
|
|
const Vec128<float> b) {
|
|
- return Vec128<float>{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
|
|
+ return Vec128<float>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
|
|
}
|
|
|
|
template <typename T>
|
|
@@ -1862,17 +2020,17 @@ HWY_API Vec128<T> InterleaveUpper(const
|
|
template <>
|
|
HWY_INLINE Vec128<uint32_t> InterleaveUpper<uint32_t>(
|
|
const Vec128<uint32_t> a, const Vec128<uint32_t> b) {
|
|
- return Vec128<uint32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
|
|
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
|
|
}
|
|
template <>
|
|
HWY_INLINE Vec128<int32_t> InterleaveUpper<int32_t>(const Vec128<int32_t> a,
|
|
const Vec128<int32_t> b) {
|
|
- return Vec128<int32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
|
|
+ return Vec128<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
|
|
}
|
|
template <>
|
|
HWY_INLINE Vec128<float> InterleaveUpper<float>(const Vec128<float> a,
|
|
const Vec128<float> b) {
|
|
- return Vec128<float>{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
|
|
+ return Vec128<float>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
|
|
}
|
|
|
|
// ------------------------------ Blocks
|
|
@@ -1880,13 +2038,13 @@ HWY_INLINE Vec128<float> InterleaveUpper
|
|
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
|
|
template <typename T>
|
|
HWY_API Vec128<T> ConcatLowerLower(const Vec128<T> hi, const Vec128<T> lo) {
|
|
- return Vec128<T>{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 2)};
|
|
+ return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
|
|
}
|
|
|
|
// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
|
|
template <typename T>
|
|
HWY_API Vec128<T> ConcatUpperUpper(const Vec128<T> hi, const Vec128<T> lo) {
|
|
- return Vec128<T>{wasm_v64x2_shuffle(lo.raw, hi.raw, 1, 3)};
|
|
+ return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
|
|
}
|
|
|
|
// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
|
|
@@ -1898,7 +2056,7 @@ HWY_API Vec128<T> ConcatLowerUpper(const
|
|
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
|
|
template <typename T>
|
|
HWY_API Vec128<T> ConcatUpperLower(const Vec128<T> hi, const Vec128<T> lo) {
|
|
- return Vec128<T>{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 3)};
|
|
+ return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 3)};
|
|
}
|
|
|
|
// ------------------------------ Odd/even lanes
|
|
@@ -1917,12 +2075,12 @@ HWY_API Vec128<T> odd_even_impl(hwy::Siz
|
|
template <typename T>
|
|
HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<2> /* tag */, const Vec128<T> a,
|
|
const Vec128<T> b) {
|
|
- return Vec128<T>{wasm_v16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
|
|
+ return Vec128<T>{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
|
|
}
|
|
template <typename T>
|
|
HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<4> /* tag */, const Vec128<T> a,
|
|
const Vec128<T> b) {
|
|
- return Vec128<T>{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
|
|
+ return Vec128<T>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
|
|
}
|
|
// TODO(eustas): implement
|
|
// template <typename T>
|
|
@@ -1939,7 +2097,7 @@ HWY_API Vec128<T> OddEven(const Vec128<T
|
|
template <>
|
|
HWY_INLINE Vec128<float> OddEven<float>(const Vec128<float> a,
|
|
const Vec128<float> b) {
|
|
- return Vec128<float>{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
|
|
+ return Vec128<float>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
|
|
}
|
|
|
|
// ================================================== CONVERT
|
|
@@ -1950,52 +2108,52 @@ HWY_INLINE Vec128<float> OddEven<float>(
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
|
|
const Vec128<uint8_t, N> v) {
|
|
- return Vec128<uint16_t, N>{wasm_i16x8_widen_low_u8x16(v.raw)};
|
|
+ return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
|
|
const Vec128<uint8_t, N> v) {
|
|
return Vec128<uint32_t, N>{
|
|
- wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))};
|
|
+ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
|
|
const Vec128<uint8_t, N> v) {
|
|
- return Vec128<int16_t, N>{wasm_i16x8_widen_low_u8x16(v.raw)};
|
|
+ return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
|
|
const Vec128<uint8_t, N> v) {
|
|
return Vec128<int32_t, N>{
|
|
- wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))};
|
|
+ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
|
|
const Vec128<uint16_t, N> v) {
|
|
- return Vec128<uint32_t, N>{wasm_i32x4_widen_low_u16x8(v.raw)};
|
|
+ return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
|
|
const Vec128<uint16_t, N> v) {
|
|
- return Vec128<int32_t, N>{wasm_i32x4_widen_low_u16x8(v.raw)};
|
|
+ return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
|
|
}
|
|
|
|
// Signed: replicate sign bit.
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
|
|
const Vec128<int8_t, N> v) {
|
|
- return Vec128<int16_t, N>{wasm_i16x8_widen_low_i8x16(v.raw)};
|
|
+ return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
|
|
const Vec128<int8_t, N> v) {
|
|
return Vec128<int32_t, N>{
|
|
- wasm_i32x4_widen_low_i16x8(wasm_i16x8_widen_low_i8x16(v.raw))};
|
|
+ wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
|
|
const Vec128<int16_t, N> v) {
|
|
- return Vec128<int32_t, N>{wasm_i32x4_widen_low_i16x8(v.raw)};
|
|
+ return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
@@ -2122,7 +2280,7 @@ HWY_API Vec128<uint8_t, N> U8FromU32(con
|
|
wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
|
|
}
|
|
|
|
-// ------------------------------ Convert i32 <=> f32
|
|
+// ------------------------------ Convert i32 <=> f32 (Round)
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
|
|
@@ -2133,33 +2291,16 @@ HWY_API Vec128<float, N> ConvertTo(Simd<
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N> /* tag */,
|
|
const Vec128<float, N> v) {
|
|
- return Vec128<int32_t, N>{wasm_i32x4_trunc_saturate_f32x4(v.raw)};
|
|
+ return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
|
|
- const __f32x4 c00 = wasm_f32x4_splat(0.0f);
|
|
- const __f32x4 corr = wasm_f32x4_convert_i32x4(wasm_f32x4_le(v.raw, c00));
|
|
- const __f32x4 c05 = wasm_f32x4_splat(0.5f);
|
|
- // +0.5 for non-negative lane, -0.5 for other.
|
|
- const __f32x4 delta = wasm_f32x4_add(c05, corr);
|
|
- // Shift input by 0.5 away from 0.
|
|
- const __f32x4 fixed = wasm_f32x4_add(v.raw, delta);
|
|
- return Vec128<int32_t, N>{wasm_i32x4_trunc_saturate_f32x4(fixed)};
|
|
+ return ConvertTo(Simd<int32_t, N>(), Round(v));
|
|
}
|
|
|
|
// ================================================== MISC
|
|
|
|
-// Returns a vector with lane i=[0, N) set to "first" + i.
|
|
-template <typename T, size_t N, typename T2>
|
|
-Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
|
|
- HWY_ALIGN T lanes[16 / sizeof(T)];
|
|
- for (size_t i = 0; i < 16 / sizeof(T); ++i) {
|
|
- lanes[i] = static_cast<T>(first + static_cast<T2>(i));
|
|
- }
|
|
- return Load(d, lanes);
|
|
-}
|
|
-
|
|
// ------------------------------ Mask
|
|
|
|
namespace detail {
|
|
@@ -2167,20 +2308,13 @@ namespace detail {
|
|
template <typename T, size_t N>
|
|
HWY_API uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
|
|
const Mask128<T, N> mask) {
|
|
- const __i8x16 slice =
|
|
- wasm_i8x16_make(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8);
|
|
- // Each u32 lane has byte[i] = (1 << i) or 0.
|
|
- const __i8x16 v8_4_2_1 = wasm_v128_and(mask.raw, slice);
|
|
- // OR together 4 bytes of each u32 to get the 4 bits.
|
|
- const __i16x8 v2_1_z_z = wasm_i32x4_shl(v8_4_2_1, 16);
|
|
- const __i16x8 v82_41_2_1 = wasm_v128_or(v8_4_2_1, v2_1_z_z);
|
|
- const __i16x8 v41_2_1_0 = wasm_i32x4_shl(v82_41_2_1, 8);
|
|
- const __i16x8 v8421_421_21_10 = wasm_v128_or(v82_41_2_1, v41_2_1_0);
|
|
- const __i16x8 nibble_per_u32 = wasm_i32x4_shr(v8421_421_21_10, 24);
|
|
- // Assemble four nibbles into 16 bits.
|
|
- alignas(16) uint32_t lanes[4];
|
|
- wasm_v128_store(lanes, nibble_per_u32);
|
|
- return lanes[0] | (lanes[1] << 4) | (lanes[2] << 8) | (lanes[3] << 12);
|
|
+ alignas(16) uint64_t lanes[2];
|
|
+ wasm_v128_store(lanes, mask.raw);
|
|
+
|
|
+ constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
|
|
+ const uint64_t lo = ((lanes[0] * kMagic) >> 56);
|
|
+ const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
|
|
+ return (hi + lo);
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
@@ -2241,8 +2375,7 @@ constexpr __i8x16 BytesAbove() {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API uint64_t BitsFromMask(const Mask128<T, N> mask) {
|
|
- return OnlyActive<T, N>(
|
|
- BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
|
|
+ return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
|
|
}
|
|
|
|
template <typename T>
|
|
@@ -2290,7 +2423,15 @@ HWY_API size_t CountTrue(const Mask128<T
|
|
// Full vector, type-independent
|
|
template <typename T>
|
|
HWY_API bool AllFalse(const Mask128<T> m) {
|
|
- return !wasm_i8x16_any_true(m.raw);
|
|
+#if 0
|
|
+ // Casting followed by wasm_i8x16_any_true results in wasm error:
|
|
+ // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
|
|
+ const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(Full128<T>(), m));
|
|
+ return !wasm_i8x16_any_true(v8.raw);
|
|
+#else
|
|
+ return (wasm_i64x2_extract_lane(m.raw, 0) |
|
|
+ wasm_i64x2_extract_lane(m.raw, 1)) == 0;
|
|
+#endif
|
|
}
|
|
|
|
// Full vector, type-dependent
|
|
@@ -2336,6 +2477,139 @@ HWY_API bool AllTrue(const Mask128<T, N>
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
+HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
|
|
+ HWY_DASSERT(mask_bits < 256);
|
|
+ const Simd<T, N> d;
|
|
+ const Rebind<uint8_t, decltype(d)> d8;
|
|
+ const Simd<uint16_t, N> du;
|
|
+
|
|
+ // We need byte indices for TableLookupBytes (one vector's worth for each of
|
|
+ // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
|
|
+ // can instead store lane indices and convert to byte indices (2*lane + 0..1),
|
|
+ // with the doubling baked into the table. Unpacking nibbles is likely more
|
|
+ // costly than the higher cache footprint from storing bytes.
|
|
+ alignas(16) constexpr uint8_t table[256 * 8] = {
|
|
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
|
|
+ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
|
|
+ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
|
|
+ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
|
|
+ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
|
|
+ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
|
|
+ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
|
|
+ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
|
|
+ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
|
|
+ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
|
|
+ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
|
|
+ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
|
|
+ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
|
|
+ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
|
|
+ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
|
|
+ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
|
|
+ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
|
|
+ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
|
|
+ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
|
|
+ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
|
|
+ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
|
|
+ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
|
|
+ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
|
|
+ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
|
|
+ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
|
|
+ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
|
|
+ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
|
|
+ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
|
|
+ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
|
|
+ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
|
|
+ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
|
|
+ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
|
|
+ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
|
|
+ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
|
|
+ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
|
|
+ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
|
|
+ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
|
|
+ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
|
|
+ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
|
|
+ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
|
|
+ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
|
|
+ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
|
|
+ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
|
|
+ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
|
|
+ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
|
|
+ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
|
|
+ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
|
|
+ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
|
|
+ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
|
|
+ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
|
|
+ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
|
|
+ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
|
|
+ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
|
|
+ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
|
|
+ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
|
|
+ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
|
|
+ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
|
|
+ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
|
|
+ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
|
|
+ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
|
|
+ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
|
|
+ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
|
|
+ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
|
|
+ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
|
|
+ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
|
|
+ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
|
|
+ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
|
|
+ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
|
|
+ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
|
|
+ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
|
|
+ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
|
|
+ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
|
|
+ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
|
|
+ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
|
|
+ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
|
|
+ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
|
|
+ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
|
|
+ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
|
|
+ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
|
|
+ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
|
|
+ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
|
|
+ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
|
|
+ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
|
|
+ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
|
|
+ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
|
|
+ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
|
|
+ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
|
|
+ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
|
|
+ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
|
|
+ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
|
|
+ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
|
|
+ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
|
|
+ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
|
|
+ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
|
|
+ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
|
|
+ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
|
|
+ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
|
|
+ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
|
|
+ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
|
|
+ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
|
|
+ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
|
|
+ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
|
|
+ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
|
|
+ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
|
|
+
|
|
+ const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
|
|
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
|
|
+ return BitCast(d, pairs + Set(du, 0x0100));
|
|
+}
|
|
+
|
|
+template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 16);
|
|
|
|
@@ -2383,57 +2657,37 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(
|
|
|
|
#endif
|
|
|
|
-// Helper function called by both Compress and CompressStore - avoids a
|
|
+// Helper functions called by both Compress and CompressStore - avoids a
|
|
// redundant BitsFromMask in the latter.
|
|
|
|
-template <size_t N>
|
|
-HWY_API Vec128<uint32_t, N> Compress(Vec128<uint32_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx32x4FromBits<uint32_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
|
|
+ const uint64_t mask_bits) {
|
|
+ const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
|
|
+ using D = Simd<T, N>;
|
|
+ const RebindToSigned<D> di;
|
|
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
|
|
}
|
|
-template <size_t N>
|
|
-HWY_API Vec128<int32_t, N> Compress(Vec128<int32_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
+
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
|
|
+ const uint64_t mask_bits) {
|
|
+ const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
|
|
+ using D = Simd<T, N>;
|
|
+ const RebindToSigned<D> di;
|
|
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
|
|
}
|
|
|
|
-#if HWY_CAP_INTEGER64
|
|
+#if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64
|
|
|
|
-template <size_t N>
|
|
-HWY_API Vec128<uint64_t, N> Compress(Vec128<uint64_t, N> v,
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<uint64_t, N> Compress(hwy::SizeTag<8> /*tag*/,
|
|
+ Vec128<uint64_t, N> v,
|
|
const uint64_t mask_bits) {
|
|
const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-}
|
|
-template <size_t N>
|
|
-HWY_API Vec128<int64_t, N> Compress(Vec128<int64_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-}
|
|
-
|
|
-#endif
|
|
-
|
|
-template <size_t N>
|
|
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
|
|
- const Simd<float, N> df;
|
|
- const Simd<int32_t, N> di;
|
|
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
|
|
-}
|
|
-
|
|
-#if HWY_CAP_FLOAT64
|
|
-
|
|
-template <size_t N>
|
|
-HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
|
|
- const uint64_t mask_bits) {
|
|
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
|
|
- const Simd<double, N> df;
|
|
- const Simd<int64_t, N> di;
|
|
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
|
|
+ using D = Simd<T, N>;
|
|
+ const RebindToSigned<D> di;
|
|
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
|
|
}
|
|
|
|
#endif
|
|
@@ -2442,7 +2696,8 @@ HWY_API Vec128<double, N> Compress(Vec12
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
|
|
- return detail::Compress(v, detail::BitsFromMask(mask));
|
|
+ return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
|
|
+ detail::BitsFromMask(mask));
|
|
}
|
|
|
|
// ------------------------------ CompressStore
|
|
@@ -2451,63 +2706,284 @@ template <typename T, size_t N>
|
|
HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
|
|
Simd<T, N> d, T* HWY_RESTRICT aligned) {
|
|
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
- Store(detail::Compress(v, mask_bits), d, aligned);
|
|
+ Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
|
|
return PopCount(mask_bits);
|
|
}
|
|
|
|
+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
|
|
+// TableLookupBytes)
|
|
+
|
|
+// 128 bits
|
|
+HWY_API void StoreInterleaved3(const Vec128<uint8_t> a, const Vec128<uint8_t> b,
|
|
+ const Vec128<uint8_t> c, Full128<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ const auto k5 = Set(d, 5);
|
|
+ const auto k6 = Set(d, 6);
|
|
+
|
|
+ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
|
|
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
|
|
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
|
|
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
|
|
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
|
|
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
|
|
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
|
|
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
|
|
+ const auto shuf_r0 = Load(d, tbl_r0);
|
|
+ const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
|
|
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
|
|
+ const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
|
|
+ const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
|
|
+ const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
|
|
+ const auto int0 = r0 | g0 | b0;
|
|
+ StoreU(int0, d, unaligned + 0 * 16);
|
|
+
|
|
+ // Second vector: g10,r10, bgr[9:6], b5,g5
|
|
+ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
|
|
+ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
|
|
+ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
|
|
+ const auto r1 = TableLookupBytes(a, shuf_r1);
|
|
+ const auto g1 = TableLookupBytes(b, shuf_g1);
|
|
+ const auto b1 = TableLookupBytes(c, shuf_b1);
|
|
+ const auto int1 = r1 | g1 | b1;
|
|
+ StoreU(int1, d, unaligned + 1 * 16);
|
|
+
|
|
+ // Third vector: bgr[15:11], b10
|
|
+ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
|
|
+ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
|
|
+ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
|
|
+ const auto r2 = TableLookupBytes(a, shuf_r2);
|
|
+ const auto g2 = TableLookupBytes(b, shuf_g2);
|
|
+ const auto b2 = TableLookupBytes(c, shuf_b2);
|
|
+ const auto int2 = r2 | g2 | b2;
|
|
+ StoreU(int2, d, unaligned + 2 * 16);
|
|
+}
|
|
+
|
|
+// 64 bits
|
|
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> a,
|
|
+ const Vec128<uint8_t, 8> b,
|
|
+ const Vec128<uint8_t, 8> c, Simd<uint8_t, 8> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // Use full vectors for the shuffles and first result.
|
|
+ const Full128<uint8_t> d_full;
|
|
+ const auto k5 = Set(d_full, 5);
|
|
+ const auto k6 = Set(d_full, 6);
|
|
+
|
|
+ const Vec128<uint8_t> full_a{a.raw};
|
|
+ const Vec128<uint8_t> full_b{b.raw};
|
|
+ const Vec128<uint8_t> full_c{c.raw};
|
|
+
|
|
+ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
|
|
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
|
|
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
|
|
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
|
|
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
|
|
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
|
|
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
|
|
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
|
|
+ const auto shuf_r0 = Load(d_full, tbl_r0);
|
|
+ const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
|
|
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
|
|
+ const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
|
|
+ const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
|
|
+ const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
|
|
+ const auto int0 = r0 | g0 | b0;
|
|
+ StoreU(int0, d_full, unaligned + 0 * 16);
|
|
+
|
|
+ // Second (HALF) vector: bgr[7:6], b5,g5
|
|
+ const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
|
|
+ const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
|
|
+ const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
|
|
+ const auto r1 = TableLookupBytes(full_a, shuf_r1);
|
|
+ const auto g1 = TableLookupBytes(full_b, shuf_g1);
|
|
+ const auto b1 = TableLookupBytes(full_c, shuf_b1);
|
|
+ const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
|
|
+ StoreU(int1, d, unaligned + 1 * 16);
|
|
+}
|
|
+
|
|
+// <= 32 bits
|
|
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
|
|
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> a,
|
|
+ const Vec128<uint8_t, N> b,
|
|
+ const Vec128<uint8_t, N> c,
|
|
+ Simd<uint8_t, N> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // Use full vectors for the shuffles and result.
|
|
+ const Full128<uint8_t> d_full;
|
|
+
|
|
+ const Vec128<uint8_t> full_a{a.raw};
|
|
+ const Vec128<uint8_t> full_b{b.raw};
|
|
+ const Vec128<uint8_t> full_c{c.raw};
|
|
+
|
|
+ // Shuffle (a,b,c) vector bytes to bgr[3:0].
|
|
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
|
|
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
|
|
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
|
|
+ 0x80, 0x80, 0x80, 0x80};
|
|
+ const auto shuf_r0 = Load(d_full, tbl_r0);
|
|
+ const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0);
|
|
+ const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0);
|
|
+ const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
|
|
+ const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
|
|
+ const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
|
|
+ const auto int0 = r0 | g0 | b0;
|
|
+ alignas(16) uint8_t buf[16];
|
|
+ StoreU(int0, d_full, buf);
|
|
+ CopyBytes<N * 3>(buf, unaligned);
|
|
+}
|
|
+
|
|
+// ------------------------------ StoreInterleaved4
|
|
+
|
|
+// 128 bits
|
|
+HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
|
|
+ const Vec128<uint8_t> v1,
|
|
+ const Vec128<uint8_t> v2,
|
|
+ const Vec128<uint8_t> v3, Full128<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // let a,b,c,d denote v0..3.
|
|
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
|
|
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
|
|
+ const auto ba8 = ZipUpper(v0, v1);
|
|
+ const auto dc8 = ZipUpper(v2, v3);
|
|
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
|
|
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4
|
|
+ const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8
|
|
+ const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC
|
|
+ StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16);
|
|
+ StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16);
|
|
+ StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16);
|
|
+ StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16);
|
|
+}
|
|
+
|
|
+// 64 bits
|
|
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
|
|
+ const Vec128<uint8_t, 8> in1,
|
|
+ const Vec128<uint8_t, 8> in2,
|
|
+ const Vec128<uint8_t, 8> in3,
|
|
+ Simd<uint8_t, 8> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // Use full vectors to reduce the number of stores.
|
|
+ const Vec128<uint8_t> v0{in0.raw};
|
|
+ const Vec128<uint8_t> v1{in1.raw};
|
|
+ const Vec128<uint8_t> v2{in2.raw};
|
|
+ const Vec128<uint8_t> v3{in3.raw};
|
|
+ // let a,b,c,d denote v0..3.
|
|
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
|
|
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
|
|
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
|
|
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4
|
|
+ const Full128<uint8_t> d_full;
|
|
+ StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16);
|
|
+ StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16);
|
|
+}
|
|
+
|
|
+// <= 32 bits
|
|
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
|
|
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
|
|
+ const Vec128<uint8_t, N> in1,
|
|
+ const Vec128<uint8_t, N> in2,
|
|
+ const Vec128<uint8_t, N> in3,
|
|
+ Simd<uint8_t, N> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // Use full vectors to reduce the number of stores.
|
|
+ const Vec128<uint8_t> v0{in0.raw};
|
|
+ const Vec128<uint8_t> v1{in1.raw};
|
|
+ const Vec128<uint8_t> v2{in2.raw};
|
|
+ const Vec128<uint8_t> v3{in3.raw};
|
|
+ // let a,b,c,d denote v0..3.
|
|
+ const auto ba0 = ZipLower(v0, v1); // b3 a3 .. b0 a0
|
|
+ const auto dc0 = ZipLower(v2, v3); // d3 c3 .. d0 c0
|
|
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
|
|
+ alignas(16) uint8_t buf[16];
|
|
+ const Full128<uint8_t> d_full;
|
|
+ StoreU(BitCast(d_full, dcba_0), d_full, buf);
|
|
+ CopyBytes<4 * N>(buf, unaligned);
|
|
+}
|
|
+
|
|
// ------------------------------ Reductions
|
|
|
|
namespace detail {
|
|
|
|
-// For u32/i32/f32.
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
|
- const Vec128<T, N> v3210) {
|
|
+// N=1 for any T: no-op
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
+ const Vec128<T, 1> v) {
|
|
+ return v;
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
+ const Vec128<T, 1> v) {
|
|
+ return v;
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
+ const Vec128<T, 1> v) {
|
|
+ return v;
|
|
+}
|
|
+
|
|
+// u32/i32/f32:
|
|
+
|
|
+// N=2
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
|
+ const Vec128<T, 2> v10) {
|
|
+ return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
|
|
+ const Vec128<T, 2> v10) {
|
|
+ return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
|
|
+ const Vec128<T, 2> v10) {
|
|
+ return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
|
|
+}
|
|
+
|
|
+// N=4 (full)
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = v3210 + v1032;
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return v20_31_20_31 + v31_20_31_20;
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
|
|
- const Vec128<T, N> v3210) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return Min(v20_31_20_31, v31_20_31_20);
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
|
|
- const Vec128<T, N> v3210) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return Max(v20_31_20_31, v31_20_31_20);
|
|
}
|
|
|
|
-// For u64/i64/f64.
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<8> /* tag */,
|
|
- const Vec128<T, N> v10) {
|
|
+// u64/i64/f64:
|
|
+
|
|
+// N=2 (full)
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return v10 + v01;
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
|
|
- const Vec128<T, N> v10) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return Min(v10, v01);
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
|
|
- const Vec128<T, N> v10) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return Max(v10, v01);
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
-// Supported for u/i/f 32/64. Returns the sum in each lane.
|
|
+// Supported for u/i/f 32/64. Returns the same value in each lane.
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
|
|
return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 2021-06-02 10:56:05.240904417 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -154,27 +154,28 @@ HWY_API Vec128<double, N> Zero(Simd<doub
|
|
// Returns a vector/part with all lanes set to "t".
|
|
template <size_t N, HWY_IF_LE128(uint8_t, N)>
|
|
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N> /* tag */, const uint8_t t) {
|
|
- return Vec128<uint8_t, N>{_mm_set1_epi8(t)};
|
|
+ return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
|
|
}
|
|
template <size_t N, HWY_IF_LE128(uint16_t, N)>
|
|
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N> /* tag */, const uint16_t t) {
|
|
- return Vec128<uint16_t, N>{_mm_set1_epi16(t)};
|
|
+ return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
|
|
}
|
|
template <size_t N, HWY_IF_LE128(uint32_t, N)>
|
|
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N> /* tag */, const uint32_t t) {
|
|
- return Vec128<uint32_t, N>{_mm_set1_epi32(t)};
|
|
+ return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
|
|
}
|
|
template <size_t N, HWY_IF_LE128(uint64_t, N)>
|
|
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N> /* tag */, const uint64_t t) {
|
|
- return Vec128<uint64_t, N>{_mm_set1_epi64x(t)};
|
|
+ return Vec128<uint64_t, N>{
|
|
+ _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
|
|
}
|
|
template <size_t N, HWY_IF_LE128(int8_t, N)>
|
|
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N> /* tag */, const int8_t t) {
|
|
- return Vec128<int8_t, N>{_mm_set1_epi8(t)};
|
|
+ return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
|
|
}
|
|
template <size_t N, HWY_IF_LE128(int16_t, N)>
|
|
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N> /* tag */, const int16_t t) {
|
|
- return Vec128<int16_t, N>{_mm_set1_epi16(t)};
|
|
+ return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
|
|
}
|
|
template <size_t N, HWY_IF_LE128(int32_t, N)>
|
|
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N> /* tag */, const int32_t t) {
|
|
@@ -182,7 +183,8 @@ HWY_API Vec128<int32_t, N> Set(Simd<int3
|
|
}
|
|
template <size_t N, HWY_IF_LE128(int64_t, N)>
|
|
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N> /* tag */, const int64_t t) {
|
|
- return Vec128<int64_t, N>{_mm_set1_epi64x(t)};
|
|
+ return Vec128<int64_t, N>{
|
|
+ _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
|
|
}
|
|
template <size_t N, HWY_IF_LE128(float, N)>
|
|
HWY_API Vec128<float, N> Set(Simd<float, N> /* tag */, const float t) {
|
|
@@ -510,7 +512,8 @@ HWY_API Mask128<T, N> Xor(const Mask128<
|
|
template <typename TFrom, typename TTo, size_t N>
|
|
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
|
|
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
|
|
- return Mask128<TTo, N>{m.raw};
|
|
+ const Simd<TFrom, N> d;
|
|
+ return MaskFromVec(BitCast(Simd<TTo, N>(), VecFromMask(d, m)));
|
|
}
|
|
|
|
// ------------------------------ Equality
|
|
@@ -683,6 +686,14 @@ HWY_API Mask128<double, N> operator>=(co
|
|
return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
+// ------------------------------ FirstN (Iota, Lt)
|
|
+
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
|
|
+ const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
|
|
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
|
|
+}
|
|
+
|
|
// ================================================== ARITHMETIC
|
|
|
|
// ------------------------------ Addition
|
|
@@ -894,7 +905,7 @@ template <size_t N>
|
|
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
|
|
return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
|
|
}
|
|
-
|
|
+// i64 is implemented after BroadcastSignBit.
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
|
|
const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
|
|
@@ -959,7 +970,6 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> Mu
|
|
|
|
// ------------------------------ ShiftLeft
|
|
|
|
-// Unsigned
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
|
|
return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
|
|
@@ -988,6 +998,16 @@ HWY_API Vec128<int64_t, N> ShiftLeft(con
|
|
return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
|
|
}
|
|
|
|
+template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
|
|
+HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
|
|
+ const Simd<T, N> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
|
|
+ return kBits == 1
|
|
+ ? (v + v)
|
|
+ : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
|
|
+}
|
|
+
|
|
// ------------------------------ ShiftRight
|
|
|
|
template <int kBits, size_t N>
|
|
@@ -1004,6 +1024,15 @@ HWY_API Vec128<uint64_t, N> ShiftRight(c
|
|
}
|
|
|
|
template <int kBits, size_t N>
|
|
+HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
|
|
+ const Simd<uint8_t, N> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec128<uint8_t, N> shifted{
|
|
+ ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
|
|
+ return shifted & Set(d8, 0xFF >> kBits);
|
|
+}
|
|
+
|
|
+template <int kBits, size_t N>
|
|
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
|
|
return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
|
|
}
|
|
@@ -1012,6 +1041,15 @@ HWY_API Vec128<int32_t, N> ShiftRight(co
|
|
return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
|
|
}
|
|
|
|
+template <int kBits, size_t N>
|
|
+HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
|
|
+ const Simd<int8_t, N> di;
|
|
+ const Simd<uint8_t, N> du;
|
|
+ const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
|
|
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
|
|
+ return (shifted ^ shifted_sign) - shifted_sign;
|
|
+}
|
|
+
|
|
// i64 is implemented after BroadcastSignBit.
|
|
|
|
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
|
|
@@ -1039,15 +1077,24 @@ HWY_API Vec128<int64_t, N> BroadcastSign
|
|
return VecFromMask(v < Zero(Simd<int64_t, N>()));
|
|
#else
|
|
// Efficient Gt() requires SSE4.2 but we only have SSE4.1. BLENDVPD requires
|
|
- // two constants and domain crossing. 32-bit compare only requires Zero()
|
|
- // plus a shuffle to replicate the upper 32 bits.
|
|
+ // two constants and domain crossing. 32-bit shift avoids generating a zero.
|
|
const Simd<int32_t, N * 2> d32;
|
|
- const auto sign = BitCast(d32, v) < Zero(d32);
|
|
+ const auto sign = ShiftRight<31>(BitCast(d32, v));
|
|
return Vec128<int64_t, N>{
|
|
_mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
|
|
#endif
|
|
}
|
|
|
|
+template <size_t N>
|
|
+HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
|
|
+#if HWY_TARGET == HWY_AVX3
|
|
+ return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
|
|
+#else
|
|
+ const auto zero = Zero(Simd<int64_t,N>());
|
|
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
|
|
+#endif
|
|
+}
|
|
+
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
|
|
#if HWY_TARGET == HWY_AVX3
|
|
@@ -1097,6 +1144,15 @@ HWY_API Vec128<int64_t, N> ShiftLeftSame
|
|
return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
|
|
+HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
|
|
+ const Simd<T, N> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec128<T, N> shifted{
|
|
+ ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
|
|
+ return shifted & Set(d8, (0xFF << bits) & 0xFF);
|
|
+}
|
|
+
|
|
// ------------------------------ ShiftRightSame (BroadcastSignBit)
|
|
|
|
template <size_t N>
|
|
@@ -1116,6 +1172,16 @@ HWY_API Vec128<uint64_t, N> ShiftRightSa
|
|
}
|
|
|
|
template <size_t N>
|
|
+HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
|
|
+ const int bits) {
|
|
+ const Simd<uint8_t, N> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec128<uint8_t, N> shifted{
|
|
+ ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
|
|
+ return shifted & Set(d8, 0xFF >> bits);
|
|
+}
|
|
+
|
|
+template <size_t N>
|
|
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
|
|
const int bits) {
|
|
return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
|
|
@@ -1140,6 +1206,15 @@ HWY_API Vec128<int64_t, N> ShiftRightSam
|
|
#endif
|
|
}
|
|
|
|
+template <size_t N>
|
|
+HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
|
|
+ const Simd<int8_t, N> di;
|
|
+ const Simd<uint8_t, N> du;
|
|
+ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
|
|
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
|
|
+ return (shifted ^ shifted_sign) - shifted_sign;
|
|
+}
|
|
+
|
|
// ------------------------------ Negate
|
|
|
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
@@ -1729,32 +1804,196 @@ HWY_API void Stream(const Vec128<double,
|
|
_mm_stream_pd(aligned, v.raw);
|
|
}
|
|
|
|
-// ------------------------------ Gather
|
|
+// ------------------------------ Scatter
|
|
+
|
|
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
+HWY_DIAGNOSTICS(push)
|
|
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
|
|
// Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
|
|
using GatherIndex64 = long long int; // NOLINT(google-runtime-int)
|
|
static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
|
|
|
|
+#if HWY_TARGET == HWY_AVX3
|
|
+namespace detail {
|
|
+
|
|
+template <typename T, size_t N>
|
|
+HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
|
|
+ Simd<T, N> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec128<int32_t, N> offset) {
|
|
+ if (N == 4) {
|
|
+ _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
|
|
+ } else {
|
|
+ const __mmask8 mask = (1u << N) - 1;
|
|
+ _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
|
|
+ }
|
|
+}
|
|
+template <typename T, size_t N>
|
|
+HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
|
|
+ Simd<T, N> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec128<int32_t, N> index) {
|
|
+ if (N == 4) {
|
|
+ _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
|
|
+ } else {
|
|
+ const __mmask8 mask = (1u << N) - 1;
|
|
+ _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename T, size_t N>
|
|
+HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
|
|
+ Simd<T, N> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec128<int64_t, N> offset) {
|
|
+ if (N == 2) {
|
|
+ _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
|
|
+ } else {
|
|
+ const __mmask8 mask = (1u << N) - 1;
|
|
+ _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
|
|
+ }
|
|
+}
|
|
+template <typename T, size_t N>
|
|
+HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
|
|
+ Simd<T, N> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec128<int64_t, N> index) {
|
|
+ if (N == 2) {
|
|
+ _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
|
|
+ } else {
|
|
+ const __mmask8 mask = (1u << N) - 1;
|
|
+ _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
|
|
+ }
|
|
+}
|
|
+
|
|
+} // namespace detail
|
|
+
|
|
+template <typename T, size_t N, typename Offset>
|
|
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
|
|
+ const Vec128<Offset, N> offset) {
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+ return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
|
|
+}
|
|
+template <typename T, size_t N, typename Index>
|
|
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
|
|
+ const Vec128<Index, N> index) {
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+ return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
|
|
+}
|
|
+
|
|
+template <size_t N>
|
|
+HWY_INLINE void ScatterOffset(Vec128<float, N> v, Simd<float, N> /* tag */,
|
|
+ float* HWY_RESTRICT base,
|
|
+ const Vec128<int32_t, N> offset) {
|
|
+ if (N == 4) {
|
|
+ _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
|
|
+ } else {
|
|
+ const __mmask8 mask = (1u << N) - 1;
|
|
+ _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
|
|
+ }
|
|
+}
|
|
+template <size_t N>
|
|
+HWY_INLINE void ScatterIndex(Vec128<float, N> v, Simd<float, N> /* tag */,
|
|
+ float* HWY_RESTRICT base,
|
|
+ const Vec128<int32_t, N> index) {
|
|
+ if (N == 4) {
|
|
+ _mm_i32scatter_ps(base, index.raw, v.raw, 4);
|
|
+ } else {
|
|
+ const __mmask8 mask = (1u << N) - 1;
|
|
+ _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
|
|
+ }
|
|
+}
|
|
+
|
|
+template <size_t N>
|
|
+HWY_INLINE void ScatterOffset(Vec128<double, N> v, Simd<double, N> /* tag */,
|
|
+ double* HWY_RESTRICT base,
|
|
+ const Vec128<int64_t, N> offset) {
|
|
+ if (N == 2) {
|
|
+ _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
|
|
+ } else {
|
|
+ const __mmask8 mask = (1u << N) - 1;
|
|
+ _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
|
|
+ }
|
|
+}
|
|
+template <size_t N>
|
|
+HWY_INLINE void ScatterIndex(Vec128<double, N> v, Simd<double, N> /* tag */,
|
|
+ double* HWY_RESTRICT base,
|
|
+ const Vec128<int64_t, N> index) {
|
|
+ if (N == 2) {
|
|
+ _mm_i64scatter_pd(base, index.raw, v.raw, 8);
|
|
+ } else {
|
|
+ const __mmask8 mask = (1u << N) - 1;
|
|
+ _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
|
|
+ }
|
|
+}
|
|
+#else // HWY_TARGET == HWY_AVX3
|
|
+
|
|
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
|
|
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
|
|
+ const Vec128<Offset, N> offset) {
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ Store(v, d, lanes);
|
|
+
|
|
+ alignas(16) Offset offset_lanes[N];
|
|
+ Store(offset, Simd<Offset, N>(), offset_lanes);
|
|
+
|
|
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
|
|
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
|
|
+ const Vec128<Index, N> index) {
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ Store(v, d, lanes);
|
|
+
|
|
+ alignas(16) Index index_lanes[N];
|
|
+ Store(index, Simd<Index, N>(), index_lanes);
|
|
+
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ base[index_lanes[i]] = lanes[i];
|
|
+ }
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
+// ------------------------------ Gather (Load/Store)
|
|
+
|
|
#if HWY_TARGET == HWY_SSE4
|
|
|
|
template <typename T, size_t N, typename Offset>
|
|
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
|
|
const T* HWY_RESTRICT base,
|
|
const Vec128<Offset, N> offset) {
|
|
- static_assert(N == 1, "SSE4 does not support full gather");
|
|
- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
|
|
- const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
|
|
- T val;
|
|
- CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
|
|
- return Set(d, val);
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+
|
|
+ alignas(16) Offset offset_lanes[N];
|
|
+ Store(offset, Simd<Offset, N>(), offset_lanes);
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
|
|
+ }
|
|
+ return Load(d, lanes);
|
|
}
|
|
|
|
template <typename T, size_t N, typename Index>
|
|
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
|
|
const Vec128<Index, N> index) {
|
|
- static_assert(N == 1, "SSE4 does not support full gather");
|
|
- static_assert(sizeof(T) == sizeof(Index), "T must match Index");
|
|
- return Set(d, base[GetLane(index)]);
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+
|
|
+ alignas(16) Index index_lanes[N];
|
|
+ Store(index, Simd<Index, N>(), index_lanes);
|
|
+
|
|
+ alignas(16) T lanes[N];
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ lanes[i] = base[index_lanes[i]];
|
|
+ }
|
|
+ return Load(d, lanes);
|
|
}
|
|
|
|
#else
|
|
@@ -1832,6 +2071,8 @@ HWY_API Vec128<double, N> GatherIndex(Si
|
|
|
|
#endif // HWY_TARGET != HWY_SSE4
|
|
|
|
+HWY_DIAGNOSTICS(pop)
|
|
+
|
|
// ================================================== SWIZZLE
|
|
|
|
// ------------------------------ Extract half
|
|
@@ -1859,10 +2100,10 @@ HWY_INLINE Vec128<double, 1> UpperHalf(V
|
|
// ------------------------------ Shift vector by constant #bytes
|
|
|
|
// 0x01..0F, kBytes = 1 => 0x02..0F00
|
|
-template <int kBytes, typename T>
|
|
-HWY_API Vec128<T> ShiftLeftBytes(const Vec128<T> v) {
|
|
+template <int kBytes, typename T, size_t N>
|
|
+HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
|
|
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
- return Vec128<T>{_mm_slli_si128(v.raw, kBytes)};
|
|
+ return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
|
|
}
|
|
|
|
template <int kLanes, typename T, size_t N>
|
|
@@ -1873,10 +2114,10 @@ HWY_API Vec128<T, N> ShiftLeftLanes(cons
|
|
}
|
|
|
|
// 0x01..0F, kBytes = 1 => 0x0001..0E
|
|
-template <int kBytes, typename T>
|
|
-HWY_API Vec128<T> ShiftRightBytes(const Vec128<T> v) {
|
|
+template <int kBytes, typename T, size_t N>
|
|
+HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
|
|
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
- return Vec128<T>{_mm_srli_si128(v.raw, kBytes)};
|
|
+ return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
|
|
}
|
|
|
|
template <int kLanes, typename T, size_t N>
|
|
@@ -2041,44 +2282,47 @@ HWY_API Vec128<float> Shuffle0123(const
|
|
// ------------------------------ TableLookupLanes
|
|
|
|
// Returned by SetTableIndices for use by TableLookupLanes.
|
|
-template <typename T>
|
|
+template <typename T, size_t N>
|
|
struct Indices128 {
|
|
__m128i raw;
|
|
};
|
|
|
|
-template <typename T>
|
|
-HWY_API Indices128<T> SetTableIndices(Full128<T>, const int32_t* idx) {
|
|
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
|
+HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
|
|
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
|
|
- const size_t N = 16 / sizeof(T);
|
|
for (size_t i = 0; i < N; ++i) {
|
|
HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
|
|
}
|
|
#endif
|
|
|
|
- const Full128<uint8_t> d8;
|
|
- alignas(16) uint8_t control[16];
|
|
- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
|
|
- const size_t idx_lane = idx_byte / sizeof(T);
|
|
- const size_t mod = idx_byte % sizeof(T);
|
|
- control[idx_byte] = static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + mod);
|
|
+ const Repartition<uint8_t, decltype(d)> d8;
|
|
+ alignas(16) uint8_t control[16] = {0};
|
|
+ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
|
|
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
|
|
+ control[idx_lane * sizeof(T) + idx_byte] =
|
|
+ static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
|
|
+ }
|
|
}
|
|
- return Indices128<T>{Load(d8, control).raw};
|
|
+ return Indices128<T, N>{Load(d8, control).raw};
|
|
}
|
|
|
|
-HWY_API Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
|
|
- const Indices128<uint32_t> idx) {
|
|
- return TableLookupBytes(v, Vec128<uint32_t>{idx.raw});
|
|
+template <size_t N>
|
|
+HWY_API Vec128<uint32_t, N> TableLookupLanes(
|
|
+ const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
|
|
+ return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
|
|
}
|
|
-HWY_API Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
|
|
- const Indices128<int32_t> idx) {
|
|
- return TableLookupBytes(v, Vec128<int32_t>{idx.raw});
|
|
+template <size_t N>
|
|
+HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
|
|
+ const Indices128<int32_t, N> idx) {
|
|
+ return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
|
|
}
|
|
-HWY_API Vec128<float> TableLookupLanes(const Vec128<float> v,
|
|
- const Indices128<float> idx) {
|
|
- const Full128<int32_t> di;
|
|
- const Full128<float> df;
|
|
+template <size_t N>
|
|
+HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
|
|
+ const Indices128<float, N> idx) {
|
|
+ const Simd<int32_t, N> di;
|
|
+ const Simd<float, N> df;
|
|
return BitCast(df,
|
|
- TableLookupBytes(BitCast(di, v), Vec128<int32_t>{idx.raw}));
|
|
+ TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
|
|
}
|
|
|
|
// ------------------------------ Interleave lanes
|
|
@@ -2286,47 +2530,47 @@ HWY_INLINE Vec128<double> ConcatUpperLow
|
|
|
|
namespace detail {
|
|
|
|
-template <typename T>
|
|
-HWY_API Vec128<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T> a,
|
|
- const Vec128<T> b) {
|
|
- const Full128<T> d;
|
|
- const Full128<uint8_t> d8;
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
|
|
+ const Vec128<T, N> b) {
|
|
+ const Simd<T, N> d;
|
|
+ const Repartition<uint8_t, decltype(d)> d8;
|
|
alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
|
|
0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
|
|
return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
|
|
}
|
|
-template <typename T>
|
|
-HWY_API Vec128<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T> a,
|
|
- const Vec128<T> b) {
|
|
- return Vec128<T>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
|
|
+ const Vec128<T, N> b) {
|
|
+ return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
|
|
}
|
|
-template <typename T>
|
|
-HWY_API Vec128<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T> a,
|
|
- const Vec128<T> b) {
|
|
- return Vec128<T>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
|
|
+ const Vec128<T, N> b) {
|
|
+ return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
|
|
}
|
|
-template <typename T>
|
|
-HWY_API Vec128<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T> a,
|
|
- const Vec128<T> b) {
|
|
- return Vec128<T>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
|
|
+ const Vec128<T, N> b) {
|
|
+ return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
-template <typename T>
|
|
-HWY_API Vec128<T> OddEven(const Vec128<T> a, const Vec128<T> b) {
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
|
|
}
|
|
-template <>
|
|
-HWY_INLINE Vec128<float> OddEven<float>(const Vec128<float> a,
|
|
- const Vec128<float> b) {
|
|
- return Vec128<float>{_mm_blend_ps(a.raw, b.raw, 5)};
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<float, N> OddEven(const Vec128<float, N> a,
|
|
+ const Vec128<float, N> b) {
|
|
+ return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
|
|
}
|
|
|
|
-template <>
|
|
-HWY_INLINE Vec128<double> OddEven<double>(const Vec128<double> a,
|
|
- const Vec128<double> b) {
|
|
- return Vec128<double>{_mm_blend_pd(a.raw, b.raw, 1)};
|
|
+template <size_t N>
|
|
+HWY_INLINE Vec128<double, N> OddEven(const Vec128<double, N> a,
|
|
+ const Vec128<double, N> b) {
|
|
+ return Vec128<double, N>{_mm_blend_pd(a.raw, b.raw, 1)};
|
|
}
|
|
|
|
// ------------------------------ Shl (ZipLower, Mul)
|
|
@@ -2764,7 +3008,7 @@ HWY_API Vec128<uint8_t, N> U8FromU32(con
|
|
return LowerHalf(LowerHalf(BitCast(d8, quad)));
|
|
}
|
|
|
|
-// ------------------------------ Convert integer <=> floating point
|
|
+// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
|
|
@@ -2779,13 +3023,20 @@ HWY_API Vec128<double, N> ConvertTo(Simd
|
|
(void)dd;
|
|
return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
|
|
#else
|
|
- alignas(16) int64_t lanes_i[2];
|
|
- Store(v, Simd<int64_t, N>(), lanes_i);
|
|
- alignas(16) double lanes_d[2];
|
|
- for (size_t i = 0; i < N; ++i) {
|
|
- lanes_d[i] = static_cast<double>(lanes_i[i]);
|
|
- }
|
|
- return Load(dd, lanes_d);
|
|
+ // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
|
|
+ const Repartition<uint32_t, decltype(dd)> d32;
|
|
+ const Repartition<uint64_t, decltype(dd)> d64;
|
|
+
|
|
+ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
|
|
+ const auto k84_63 = Set(d64, 0x4530000080000000ULL);
|
|
+ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
|
|
+
|
|
+ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
|
|
+ const auto k52 = Set(d32, 0x43300000);
|
|
+ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
|
|
+
|
|
+ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
|
|
+ return (v_upper - k84_63_52) + v_lower; // order matters!
|
|
#endif
|
|
}
|
|
|
|
@@ -2922,6 +3173,142 @@ HWY_API size_t CountTrue(const Mask128<T
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
+HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
|
|
+ HWY_DASSERT(mask_bits < 256);
|
|
+ const Simd<T, N> d;
|
|
+ const Rebind<uint8_t, decltype(d)> d8;
|
|
+ const Simd<uint16_t, N> du;
|
|
+
|
|
+ // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
|
|
+ // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
|
|
+ // 8 mask bits). Loading them directly would require 4 KiB. We can instead
|
|
+ // store lane indices and convert to byte indices (2*lane + 0..1), with the
|
|
+ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
|
|
+ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
|
|
+ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
|
|
+ // is likely more costly than the higher cache footprint from storing bytes.
|
|
+ alignas(16) constexpr uint8_t table[256 * 8] = {
|
|
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
|
|
+ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
|
|
+ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
|
|
+ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
|
|
+ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
|
|
+ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
|
|
+ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
|
|
+ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
|
|
+ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
|
|
+ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
|
|
+ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
|
|
+ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
|
|
+ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
|
|
+ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
|
|
+ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
|
|
+ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
|
|
+ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
|
|
+ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
|
|
+ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
|
|
+ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
|
|
+ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
|
|
+ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
|
|
+ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
|
|
+ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
|
|
+ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
|
|
+ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
|
|
+ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
|
|
+ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
|
|
+ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
|
|
+ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
|
|
+ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
|
|
+ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
|
|
+ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
|
|
+ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
|
|
+ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
|
|
+ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
|
|
+ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
|
|
+ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
|
|
+ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
|
|
+ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
|
|
+ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
|
|
+ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
|
|
+ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
|
|
+ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
|
|
+ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
|
|
+ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
|
|
+ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
|
|
+ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
|
|
+ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
|
|
+ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
|
|
+ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
|
|
+ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
|
|
+ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
|
|
+ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
|
|
+ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
|
|
+ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
|
|
+ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
|
|
+ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
|
|
+ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
|
|
+ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
|
|
+ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
|
|
+ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
|
|
+ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
|
|
+ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
|
|
+ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
|
|
+ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
|
|
+ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
|
|
+ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
|
|
+ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
|
|
+ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
|
|
+ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
|
|
+ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
|
|
+ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
|
|
+ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
|
|
+ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
|
|
+ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
|
|
+ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
|
|
+ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
|
|
+ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
|
|
+ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
|
|
+ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
|
|
+ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
|
|
+ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
|
|
+ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
|
|
+ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
|
|
+ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
|
|
+ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
|
|
+ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
|
|
+ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
|
|
+ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
|
|
+ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
|
|
+ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
|
|
+ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
|
|
+ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
|
|
+ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
|
|
+ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
|
|
+ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
|
|
+ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
|
|
+ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
|
|
+ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
|
|
+ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
|
|
+ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
|
|
+ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
|
|
+ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
|
|
+
|
|
+ const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
|
|
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
|
|
+ return BitCast(d, pairs + Set(du, 0x0100));
|
|
+}
|
|
+
|
|
+template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 16);
|
|
|
|
@@ -2968,71 +3355,42 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(
|
|
// Helper function called by both Compress and CompressStore - avoids a
|
|
// redundant BitsFromMask in the latter.
|
|
|
|
-template <size_t N>
|
|
-HWY_API Vec128<uint32_t, N> Compress(Vec128<uint32_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
-#if HWY_TARGET == HWY_AVX3
|
|
- return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask_bits, v.raw)};
|
|
-#else
|
|
- const auto idx = detail::Idx32x4FromBits<uint32_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-#endif
|
|
-}
|
|
-template <size_t N>
|
|
-HWY_API Vec128<int32_t, N> Compress(Vec128<int32_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
-#if HWY_TARGET == HWY_AVX3
|
|
- return Vec128<int32_t, N>{_mm_maskz_compress_epi32(mask_bits, v.raw)};
|
|
-#else
|
|
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-#endif
|
|
-}
|
|
-
|
|
-template <size_t N>
|
|
-HWY_API Vec128<uint64_t, N> Compress(Vec128<uint64_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
-#if HWY_TARGET == HWY_AVX3
|
|
- return Vec128<uint64_t, N>{_mm_maskz_compress_epi64(mask_bits, v.raw)};
|
|
-#else
|
|
- const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-#endif
|
|
-}
|
|
-template <size_t N>
|
|
-HWY_API Vec128<int64_t, N> Compress(Vec128<int64_t, N> v,
|
|
- const uint64_t mask_bits) {
|
|
-#if HWY_TARGET == HWY_AVX3
|
|
- return Vec128<int64_t, N>{_mm_maskz_compress_epi64(mask_bits, v.raw)};
|
|
-#else
|
|
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
|
|
- return TableLookupBytes(v, idx);
|
|
-#endif
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
|
|
+ const uint64_t mask_bits) {
|
|
+ const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
|
|
+ using D = Simd<T, N>;
|
|
+ const RebindToSigned<D> di;
|
|
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
|
|
}
|
|
|
|
-template <size_t N>
|
|
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v,
|
|
- const uint64_t mask_bits) {
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
|
|
+ const uint64_t mask_bits) {
|
|
+ using D = Simd<T, N>;
|
|
+ using TI = MakeSigned<T>;
|
|
+ const Rebind<TI, D> di;
|
|
#if HWY_TARGET == HWY_AVX3
|
|
- return Vec128<float, N>{_mm_maskz_compress_ps(mask_bits, v.raw)};
|
|
+ return BitCast(D(), Vec128<TI, N>{_mm_maskz_compress_epi32(
|
|
+ mask_bits, BitCast(di, v).raw)});
|
|
#else
|
|
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
|
|
- const Simd<float, N> df;
|
|
- const Simd<int32_t, N> di;
|
|
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
|
|
+ const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
|
|
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
|
|
#endif
|
|
}
|
|
|
|
-template <size_t N>
|
|
-HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
|
|
- const uint64_t mask_bits) {
|
|
+template <typename T, size_t N>
|
|
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<8> /*tag*/, Vec128<T, N> v,
|
|
+ const uint64_t mask_bits) {
|
|
+ using D = Simd<T, N>;
|
|
+ using TI = MakeSigned<T>;
|
|
+ const Rebind<TI, D> di;
|
|
#if HWY_TARGET == HWY_AVX3
|
|
- return Vec128<double, N>{_mm_maskz_compress_pd(mask_bits, v.raw)};
|
|
+ return BitCast(D(), Vec128<TI, N>{_mm_maskz_compress_epi64(
|
|
+ mask_bits, BitCast(di, v).raw)});
|
|
#else
|
|
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
|
|
- const Simd<double, N> df;
|
|
- const Simd<int64_t, N> di;
|
|
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
|
|
+ const auto idx = detail::Idx64x2FromBits<T, N>(mask_bits);
|
|
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
|
|
#endif
|
|
}
|
|
|
|
@@ -3040,7 +3398,8 @@ HWY_API Vec128<double, N> Compress(Vec12
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
|
|
- return detail::Compress(v, detail::BitsFromMask(mask));
|
|
+ return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
|
|
+ detail::BitsFromMask(mask));
|
|
}
|
|
|
|
// ------------------------------ CompressStore
|
|
@@ -3050,63 +3409,285 @@ HWY_API size_t CompressStore(Vec128<T, N
|
|
Simd<T, N> d, T* HWY_RESTRICT aligned) {
|
|
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
// Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
|
|
- Store(detail::Compress(v, mask_bits), d, aligned);
|
|
+ Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
|
|
return PopCount(mask_bits);
|
|
}
|
|
|
|
+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
|
|
+// TableLookupBytes)
|
|
+
|
|
+// 128 bits
|
|
+HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
|
|
+ const Vec128<uint8_t> v1,
|
|
+ const Vec128<uint8_t> v2, Full128<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ const auto k5 = Set(d, 5);
|
|
+ const auto k6 = Set(d, 6);
|
|
+
|
|
+ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
|
|
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
|
|
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
|
|
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
|
|
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
|
|
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
|
|
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
|
|
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
|
|
+ const auto shuf_r0 = Load(d, tbl_r0);
|
|
+ const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
|
|
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
|
|
+ const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0
|
|
+ const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0.
|
|
+ const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0..
|
|
+ const auto int0 = r0 | g0 | b0;
|
|
+ StoreU(int0, d, unaligned + 0 * 16);
|
|
+
|
|
+ // Second vector: g10,r10, bgr[9:6], b5,g5
|
|
+ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
|
|
+ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
|
|
+ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
|
|
+ const auto r1 = TableLookupBytes(v0, shuf_r1);
|
|
+ const auto g1 = TableLookupBytes(v1, shuf_g1);
|
|
+ const auto b1 = TableLookupBytes(v2, shuf_b1);
|
|
+ const auto int1 = r1 | g1 | b1;
|
|
+ StoreU(int1, d, unaligned + 1 * 16);
|
|
+
|
|
+ // Third vector: bgr[15:11], b10
|
|
+ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
|
|
+ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
|
|
+ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
|
|
+ const auto r2 = TableLookupBytes(v0, shuf_r2);
|
|
+ const auto g2 = TableLookupBytes(v1, shuf_g2);
|
|
+ const auto b2 = TableLookupBytes(v2, shuf_b2);
|
|
+ const auto int2 = r2 | g2 | b2;
|
|
+ StoreU(int2, d, unaligned + 2 * 16);
|
|
+}
|
|
+
|
|
+// 64 bits
|
|
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
|
|
+ const Vec128<uint8_t, 8> v1,
|
|
+ const Vec128<uint8_t, 8> v2, Simd<uint8_t, 8> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // Use full vectors for the shuffles and first result.
|
|
+ const Full128<uint8_t> d_full;
|
|
+ const auto k5 = Set(d_full, 5);
|
|
+ const auto k6 = Set(d_full, 6);
|
|
+
|
|
+ const Vec128<uint8_t> full_a{v0.raw};
|
|
+ const Vec128<uint8_t> full_b{v1.raw};
|
|
+ const Vec128<uint8_t> full_c{v2.raw};
|
|
+
|
|
+ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
|
|
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
|
|
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
|
|
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
|
|
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
|
|
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
|
|
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
|
|
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
|
|
+ const auto shuf_r0 = Load(d_full, tbl_r0);
|
|
+ const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
|
|
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
|
|
+ const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
|
|
+ const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
|
|
+ const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
|
|
+ const auto int0 = r0 | g0 | b0;
|
|
+ StoreU(int0, d_full, unaligned + 0 * 16);
|
|
+
|
|
+ // Second (HALF) vector: bgr[7:6], b5,g5
|
|
+ const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
|
|
+ const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
|
|
+ const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
|
|
+ const auto r1 = TableLookupBytes(full_a, shuf_r1);
|
|
+ const auto g1 = TableLookupBytes(full_b, shuf_g1);
|
|
+ const auto b1 = TableLookupBytes(full_c, shuf_b1);
|
|
+ const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
|
|
+ StoreU(int1, d, unaligned + 1 * 16);
|
|
+}
|
|
+
|
|
+// <= 32 bits
|
|
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
|
|
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
|
|
+ const Vec128<uint8_t, N> v1,
|
|
+ const Vec128<uint8_t, N> v2,
|
|
+ Simd<uint8_t, N> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // Use full vectors for the shuffles and result.
|
|
+ const Full128<uint8_t> d_full;
|
|
+
|
|
+ const Vec128<uint8_t> full_a{v0.raw};
|
|
+ const Vec128<uint8_t> full_b{v1.raw};
|
|
+ const Vec128<uint8_t> full_c{v2.raw};
|
|
+
|
|
+ // Shuffle (v0,v1,v2) vector bytes to bgr[3:0].
|
|
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
|
|
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
|
|
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
|
|
+ 0x80, 0x80, 0x80, 0x80};
|
|
+ const auto shuf_r0 = Load(d_full, tbl_r0);
|
|
+ const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0);
|
|
+ const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0);
|
|
+ const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
|
|
+ const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
|
|
+ const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
|
|
+ const auto int0 = r0 | g0 | b0;
|
|
+ alignas(16) uint8_t buf[16];
|
|
+ StoreU(int0, d_full, buf);
|
|
+ CopyBytes<N * 3>(buf, unaligned);
|
|
+}
|
|
+
|
|
+// ------------------------------ StoreInterleaved4
|
|
+
|
|
+// 128 bits
|
|
+HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
|
|
+ const Vec128<uint8_t> v1,
|
|
+ const Vec128<uint8_t> v2,
|
|
+ const Vec128<uint8_t> v3, Full128<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // let a,b,c,d denote v0..3.
|
|
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
|
|
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
|
|
+ const auto ba8 = ZipUpper(v0, v1);
|
|
+ const auto dc8 = ZipUpper(v2, v3);
|
|
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
|
|
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4
|
|
+ const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8
|
|
+ const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC
|
|
+ StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16);
|
|
+ StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16);
|
|
+ StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16);
|
|
+ StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16);
|
|
+}
|
|
+
|
|
+// 64 bits
|
|
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
|
|
+ const Vec128<uint8_t, 8> in1,
|
|
+ const Vec128<uint8_t, 8> in2,
|
|
+ const Vec128<uint8_t, 8> in3,
|
|
+ Simd<uint8_t, 8> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // Use full vectors to reduce the number of stores.
|
|
+ const Vec128<uint8_t> v0{in0.raw};
|
|
+ const Vec128<uint8_t> v1{in1.raw};
|
|
+ const Vec128<uint8_t> v2{in2.raw};
|
|
+ const Vec128<uint8_t> v3{in3.raw};
|
|
+ // let a,b,c,d denote v0..3.
|
|
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
|
|
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
|
|
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
|
|
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4
|
|
+ const Full128<uint8_t> d_full;
|
|
+ StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16);
|
|
+ StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16);
|
|
+}
|
|
+
|
|
+// <= 32 bits
|
|
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
|
|
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
|
|
+ const Vec128<uint8_t, N> in1,
|
|
+ const Vec128<uint8_t, N> in2,
|
|
+ const Vec128<uint8_t, N> in3,
|
|
+ Simd<uint8_t, N> /*tag*/,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // Use full vectors to reduce the number of stores.
|
|
+ const Vec128<uint8_t> v0{in0.raw};
|
|
+ const Vec128<uint8_t> v1{in1.raw};
|
|
+ const Vec128<uint8_t> v2{in2.raw};
|
|
+ const Vec128<uint8_t> v3{in3.raw};
|
|
+ // let a,b,c,d denote v0..3.
|
|
+ const auto ba0 = ZipLower(v0, v1); // b3 a3 .. b0 a0
|
|
+ const auto dc0 = ZipLower(v2, v3); // d3 c3 .. d0 c0
|
|
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
|
|
+ alignas(16) uint8_t buf[16];
|
|
+ const Full128<uint8_t> d_full;
|
|
+ StoreU(BitCast(d_full, dcba_0), d_full, buf);
|
|
+ CopyBytes<4 * N>(buf, unaligned);
|
|
+}
|
|
+
|
|
// ------------------------------ Reductions
|
|
|
|
namespace detail {
|
|
|
|
-// For u32/i32/f32.
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
|
- const Vec128<T, N> v3210) {
|
|
+// N=1 for any T: no-op
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
+ const Vec128<T, 1> v) {
|
|
+ return v;
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
+ const Vec128<T, 1> v) {
|
|
+ return v;
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
+ const Vec128<T, 1> v) {
|
|
+ return v;
|
|
+}
|
|
+
|
|
+// u32/i32/f32:
|
|
+
|
|
+// N=2
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
|
+ const Vec128<T, 2> v10) {
|
|
+ return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
|
|
+ const Vec128<T, 2> v10) {
|
|
+ return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
|
|
+ const Vec128<T, 2> v10) {
|
|
+ return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
|
|
+}
|
|
+
|
|
+// N=4 (full)
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = v3210 + v1032;
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return v20_31_20_31 + v31_20_31_20;
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
|
|
- const Vec128<T, N> v3210) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return Min(v20_31_20_31, v31_20_31_20);
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
|
|
- const Vec128<T, N> v3210) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return Max(v20_31_20_31, v31_20_31_20);
|
|
}
|
|
|
|
-// For u64/i64/f64.
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<8> /* tag */,
|
|
- const Vec128<T, N> v10) {
|
|
+// u64/i64/f64:
|
|
+
|
|
+// N=2 (full)
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return v10 + v01;
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
|
|
- const Vec128<T, N> v10) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return Min(v10, v01);
|
|
}
|
|
-template <typename T, size_t N>
|
|
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
|
|
- const Vec128<T, N> v10) {
|
|
+template <typename T>
|
|
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return Max(v10, v01);
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
-// Supported for u/i/f 32/64. Returns the sum in each lane.
|
|
+// Supported for u/i/f 32/64. Returns the same value in each lane.
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
|
|
return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 2021-06-02 10:56:05.234904387 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -20,6 +20,20 @@
|
|
// particular, "Broadcast", pack and zip behavior may be surprising.
|
|
|
|
#include <immintrin.h> // AVX2+
|
|
+
|
|
+#if defined(_MSC_VER) && defined(__clang__)
|
|
+// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
|
|
+// including these headers when _MSC_VER is defined, like when using clang-cl.
|
|
+// Include these directly here.
|
|
+#include <avxintrin.h>
|
|
+// avxintrin defines __m256i and must come before avx2intrin.
|
|
+#include <avx2intrin.h>
|
|
+#include <bmi2intrin.h> // _pext_u64
|
|
+#include <f16cintrin.h>
|
|
+#include <fmaintrin.h>
|
|
+#include <smmintrin.h>
|
|
+#endif
|
|
+
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
|
|
@@ -148,23 +162,24 @@ HWY_API Vec256<uint16_t> Set(Full256<uin
|
|
return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
|
|
- return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))}; // NOLINT
|
|
+ return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
|
|
}
|
|
HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
|
|
return Vec256<uint64_t>{
|
|
_mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
|
|
- return Vec256<int8_t>{_mm256_set1_epi8(t)};
|
|
+ return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
|
|
- return Vec256<int16_t>{_mm256_set1_epi16(t)};
|
|
+ return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
|
|
return Vec256<int32_t>{_mm256_set1_epi32(t)};
|
|
}
|
|
HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
|
|
- return Vec256<int64_t>{_mm256_set1_epi64x(t)};
|
|
+ return Vec256<int64_t>{
|
|
+ _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
|
|
return Vec256<float>{_mm256_set1_ps(t)};
|
|
@@ -340,6 +355,8 @@ HWY_API Vec256<T> VecFromMask(Full256<T>
|
|
return Vec256<T>{v.raw};
|
|
}
|
|
|
|
+// ------------------------------ IfThenElse
|
|
+
|
|
// mask ? yes : no
|
|
template <typename T>
|
|
HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
|
|
@@ -412,9 +429,9 @@ HWY_API Mask256<T> Xor(const Mask256<T>
|
|
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
|
|
|
|
template <typename TFrom, typename TTo>
|
|
-HWY_API Mask256<TTo> RebindMask(Full256<TTo> /*tag*/, Mask256<TFrom> m) {
|
|
+HWY_API Mask256<TTo> RebindMask(Full256<TTo> d_to, Mask256<TFrom> m) {
|
|
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
|
|
- return Mask256<TTo>{m.raw};
|
|
+ return MaskFromVec(BitCast(d_to, VecFromMask(Full256<TFrom>(), m)));
|
|
}
|
|
|
|
// ------------------------------ Equality
|
|
@@ -670,6 +687,14 @@ HWY_API Vec256<double> Max(const Vec256<
|
|
return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
+// ------------------------------ FirstN (Iota, Lt)
|
|
+
|
|
+template <typename T>
|
|
+HWY_API Mask256<T> FirstN(const Full256<T> d, size_t n) {
|
|
+ const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
|
|
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(n)));
|
|
+}
|
|
+
|
|
// ================================================== ARITHMETIC
|
|
|
|
// ------------------------------ Addition
|
|
@@ -832,7 +857,13 @@ HWY_API Vec256<uint16_t> AverageRound(co
|
|
|
|
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
|
|
HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
|
|
+#if HWY_COMPILER_MSVC
|
|
+ // Workaround for incorrect codegen? (wrong result)
|
|
+ const auto zero = Zero(Full256<int8_t>());
|
|
+ return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
|
|
+#else
|
|
return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
|
|
+#endif
|
|
}
|
|
HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
|
|
return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
|
|
@@ -840,6 +871,7 @@ HWY_API Vec256<int16_t> Abs(const Vec256
|
|
HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
|
|
return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
|
|
}
|
|
+// i64 is implemented after BroadcastSignBit.
|
|
|
|
HWY_API Vec256<float> Abs(const Vec256<float> v) {
|
|
const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
|
|
@@ -925,6 +957,16 @@ HWY_API Vec256<int64_t> ShiftLeft(const
|
|
return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
|
|
}
|
|
|
|
+template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
|
|
+HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
|
|
+ const Full256<T> d8;
|
|
+ const RepartitionToWide<decltype(d8)> d16;
|
|
+ const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
|
|
+ return kBits == 1
|
|
+ ? (v + v)
|
|
+ : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
|
|
+}
|
|
+
|
|
// ------------------------------ ShiftRight
|
|
|
|
template <int kBits>
|
|
@@ -943,6 +985,14 @@ HWY_API Vec256<uint64_t> ShiftRight(cons
|
|
}
|
|
|
|
template <int kBits>
|
|
+HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
|
|
+ const Full256<uint8_t> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
|
|
+ return shifted & Set(d8, 0xFF >> kBits);
|
|
+}
|
|
+
|
|
+template <int kBits>
|
|
HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
|
|
return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
|
|
}
|
|
@@ -952,6 +1002,15 @@ HWY_API Vec256<int32_t> ShiftRight(const
|
|
return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
|
|
}
|
|
|
|
+template <int kBits>
|
|
+HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
|
|
+ const Full256<int8_t> di;
|
|
+ const Full256<uint8_t> du;
|
|
+ const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
|
|
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
|
|
+ return (shifted ^ shifted_sign) - shifted_sign;
|
|
+}
|
|
+
|
|
// i64 is implemented after BroadcastSignBit.
|
|
|
|
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
|
|
@@ -989,6 +1048,15 @@ HWY_API Vec256<int64_t> ShiftRight(const
|
|
#endif
|
|
}
|
|
|
|
+HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
|
|
+#if HWY_TARGET == HWY_AVX3
|
|
+ return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
|
|
+#else
|
|
+ const auto zero = Zero(Full256<int64_t>());
|
|
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
|
|
+#endif
|
|
+}
|
|
+
|
|
// ------------------------------ ShiftLeftSame
|
|
|
|
HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
|
|
@@ -1016,6 +1084,14 @@ HWY_API Vec256<int64_t> ShiftLeftSame(co
|
|
return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
|
|
+HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
|
|
+ const Full256<T> d8;
|
|
+ const RepartitionToWide<decltype(d8)> d16;
|
|
+ const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
|
|
+ return shifted & Set(d8, (0xFF << bits) & 0xFF);
|
|
+}
|
|
+
|
|
// ------------------------------ ShiftRightSame (BroadcastSignBit)
|
|
|
|
HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
|
|
@@ -1031,6 +1107,13 @@ HWY_API Vec256<uint64_t> ShiftRightSame(
|
|
return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
+HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
|
|
+ const Full256<uint8_t> d8;
|
|
+ const RepartitionToWide<decltype(d8)> d16;
|
|
+ const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
|
|
+ return shifted & Set(d8, 0xFF >> bits);
|
|
+}
|
|
+
|
|
HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
|
|
const int bits) {
|
|
return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
|
|
@@ -1053,6 +1136,14 @@ HWY_API Vec256<int64_t> ShiftRightSame(c
|
|
#endif
|
|
}
|
|
|
|
+HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
|
|
+ const Full256<int8_t> di;
|
|
+ const Full256<uint8_t> du;
|
|
+ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
|
|
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
|
|
+ return (shifted ^ shifted_sign) - shifted_sign;
|
|
+}
|
|
+
|
|
// ------------------------------ Negate
|
|
|
|
template <typename T, HWY_IF_FLOAT(T)>
|
|
@@ -1335,6 +1426,123 @@ HWY_API void Stream(const Vec256<double>
|
|
_mm256_stream_pd(aligned, v.raw);
|
|
}
|
|
|
|
+// ------------------------------ Scatter
|
|
+
|
|
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
+HWY_DIAGNOSTICS(push)
|
|
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
+
|
|
+#if HWY_TARGET == HWY_AVX3
|
|
+namespace detail {
|
|
+
|
|
+template <typename T>
|
|
+HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256<T> v,
|
|
+ Full256<T> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec256<int32_t> offset) {
|
|
+ _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256<T> v,
|
|
+ Full256<T> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec256<int32_t> index) {
|
|
+ _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
|
|
+}
|
|
+
|
|
+template <typename T>
|
|
+HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256<T> v,
|
|
+ Full256<T> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec256<int64_t> offset) {
|
|
+ _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256<T> v,
|
|
+ Full256<T> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec256<int64_t> index) {
|
|
+ _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
|
|
+}
|
|
+
|
|
+} // namespace detail
|
|
+
|
|
+template <typename T, typename Offset>
|
|
+HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
|
|
+ const Vec256<Offset> offset) {
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+ return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
|
|
+}
|
|
+template <typename T, typename Index>
|
|
+HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
|
|
+ const Vec256<Index> index) {
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+ return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
|
|
+}
|
|
+
|
|
+template <>
|
|
+HWY_INLINE void ScatterOffset<float>(Vec256<float> v, Full256<float> /* tag */,
|
|
+ float* HWY_RESTRICT base,
|
|
+ const Vec256<int32_t> offset) {
|
|
+ _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
|
|
+}
|
|
+template <>
|
|
+HWY_INLINE void ScatterIndex<float>(Vec256<float> v, Full256<float> /* tag */,
|
|
+ float* HWY_RESTRICT base,
|
|
+ const Vec256<int32_t> index) {
|
|
+ _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
|
|
+}
|
|
+
|
|
+template <>
|
|
+HWY_INLINE void ScatterOffset<double>(Vec256<double> v,
|
|
+ Full256<double> /* tag */,
|
|
+ double* HWY_RESTRICT base,
|
|
+ const Vec256<int64_t> offset) {
|
|
+ _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
|
|
+}
|
|
+template <>
|
|
+HWY_INLINE void ScatterIndex<double>(Vec256<double> v,
|
|
+ Full256<double> /* tag */,
|
|
+ double* HWY_RESTRICT base,
|
|
+ const Vec256<int64_t> index) {
|
|
+ _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
|
|
+}
|
|
+
|
|
+#else
|
|
+
|
|
+template <typename T, typename Offset>
|
|
+HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
|
|
+ const Vec256<Offset> offset) {
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+
|
|
+ constexpr size_t N = 32 / sizeof(T);
|
|
+ alignas(32) T lanes[N];
|
|
+ Store(v, d, lanes);
|
|
+
|
|
+ alignas(32) Offset offset_lanes[N];
|
|
+ Store(offset, Simd<Offset, N>(), offset_lanes);
|
|
+
|
|
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
|
|
+ }
|
|
+}
|
|
+
|
|
+template <typename T, typename Index>
|
|
+HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
|
|
+ const Vec256<Index> index) {
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+
|
|
+ constexpr size_t N = 32 / sizeof(T);
|
|
+ alignas(32) T lanes[N];
|
|
+ Store(v, d, lanes);
|
|
+
|
|
+ alignas(32) Index index_lanes[N];
|
|
+ Store(index, Simd<Index, N>(), index_lanes);
|
|
+
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ base[index_lanes[i]] = lanes[i];
|
|
+ }
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
// ------------------------------ Gather
|
|
|
|
namespace detail {
|
|
@@ -1374,13 +1582,13 @@ HWY_API Vec256<T> GatherIndex(hwy::SizeT
|
|
template <typename T, typename Offset>
|
|
HWY_API Vec256<T> GatherOffset(Full256<T> d, const T* HWY_RESTRICT base,
|
|
const Vec256<Offset> offset) {
|
|
- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
|
|
}
|
|
template <typename T, typename Index>
|
|
HWY_API Vec256<T> GatherIndex(Full256<T> d, const T* HWY_RESTRICT base,
|
|
const Vec256<Index> index) {
|
|
- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
|
|
}
|
|
|
|
@@ -1410,6 +1618,8 @@ HWY_INLINE Vec256<double> GatherIndex<do
|
|
return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
|
|
}
|
|
|
|
+HWY_DIAGNOSTICS(pop)
|
|
+
|
|
// ================================================== SWIZZLE
|
|
|
|
template <typename T>
|
|
@@ -1861,38 +2071,26 @@ HWY_API Vec256<int64_t> ZipUpper(const V
|
|
return Vec256<int64_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
|
|
}
|
|
|
|
-// ------------------------------ Blocks
|
|
+// ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
|
|
+
|
|
+// _mm256_broadcastsi128_si256 has 7 cycle latency. _mm256_permute2x128_si256 is
|
|
+// slow on Zen1 (8 uops); we can avoid it for LowerLower and UpperLower, and on
|
|
+// UpperUpper at the cost of one extra cycle/instruction.
|
|
|
|
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
|
|
template <typename T>
|
|
HWY_API Vec256<T> ConcatLowerLower(const Vec256<T> hi, const Vec256<T> lo) {
|
|
- return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x20)};
|
|
+ return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(hi).raw, 1)};
|
|
}
|
|
template <>
|
|
HWY_INLINE Vec256<float> ConcatLowerLower(const Vec256<float> hi,
|
|
const Vec256<float> lo) {
|
|
- return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x20)};
|
|
+ return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(hi).raw, 1)};
|
|
}
|
|
template <>
|
|
HWY_INLINE Vec256<double> ConcatLowerLower(const Vec256<double> hi,
|
|
const Vec256<double> lo) {
|
|
- return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x20)};
|
|
-}
|
|
-
|
|
-// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
|
|
-template <typename T>
|
|
-HWY_API Vec256<T> ConcatUpperUpper(const Vec256<T> hi, const Vec256<T> lo) {
|
|
- return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
|
|
-}
|
|
-template <>
|
|
-HWY_INLINE Vec256<float> ConcatUpperUpper(const Vec256<float> hi,
|
|
- const Vec256<float> lo) {
|
|
- return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
|
|
-}
|
|
-template <>
|
|
-HWY_INLINE Vec256<double> ConcatUpperUpper(const Vec256<double> hi,
|
|
- const Vec256<double> lo) {
|
|
- return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
|
|
+ return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(hi).raw, 1)};
|
|
}
|
|
|
|
// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
|
|
@@ -1927,6 +2125,12 @@ HWY_INLINE Vec256<double> ConcatUpperLow
|
|
return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
|
|
}
|
|
|
|
+// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
|
|
+template <typename T>
|
|
+HWY_API Vec256<T> ConcatUpperUpper(const Vec256<T> hi, const Vec256<T> lo) {
|
|
+ return ConcatUpperLower(hi, ZeroExtendVector(UpperHalf(lo)));
|
|
+}
|
|
+
|
|
// ------------------------------ Odd/even lanes
|
|
|
|
namespace detail {
|
|
@@ -2211,11 +2415,18 @@ HWY_API Vec128<int8_t> DemoteTo(Full128<
|
|
_mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
|
|
}
|
|
|
|
+ // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
|
|
+ // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
|
|
+HWY_DIAGNOSTICS(push)
|
|
+HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
|
|
+
|
|
HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> /* tag */,
|
|
const Vec256<float> v) {
|
|
return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
|
|
}
|
|
|
|
+HWY_DIAGNOSTICS(pop)
|
|
+
|
|
HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
|
|
const Vec256<double> v) {
|
|
return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
|
|
@@ -2241,7 +2452,7 @@ HWY_API Vec128<uint8_t, 8> U8FromU32(con
|
|
return BitCast(Simd<uint8_t, 8>(), pair);
|
|
}
|
|
|
|
-// ------------------------------ Convert integer <=> floating point
|
|
+// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
|
|
|
|
HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
|
|
const Vec256<int32_t> v) {
|
|
@@ -2253,13 +2464,20 @@ HWY_API Vec256<double> ConvertTo(Full256
|
|
(void)dd;
|
|
return Vec256<double>{_mm256_cvtepi64_pd(v.raw)};
|
|
#else
|
|
- alignas(32) int64_t lanes_i[4];
|
|
- Store(v, Full256<int64_t>(), lanes_i);
|
|
- alignas(32) double lanes_d[4];
|
|
- for (size_t i = 0; i < 4; ++i) {
|
|
- lanes_d[i] = static_cast<double>(lanes_i[i]);
|
|
- }
|
|
- return Load(dd, lanes_d);
|
|
+ // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
|
|
+ const Repartition<uint32_t, decltype(dd)> d32;
|
|
+ const Repartition<uint64_t, decltype(dd)> d64;
|
|
+
|
|
+ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
|
|
+ const auto k84_63 = Set(d64, 0x4530000080000000ULL);
|
|
+ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
|
|
+
|
|
+ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
|
|
+ const auto k52 = Set(d32, 0x43300000);
|
|
+ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
|
|
+
|
|
+ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
|
|
+ return (v_upper - k84_63_52) + v_lower; // order matters!
|
|
#endif
|
|
}
|
|
|
|
@@ -2334,8 +2552,7 @@ HWY_API uint64_t BitsFromMask(hwy::SizeT
|
|
const auto compressed =
|
|
_mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
|
|
return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
|
|
-
|
|
-#endif
|
|
+#endif // HWY_ARCH_X86_64
|
|
}
|
|
|
|
template <typename T>
|
|
@@ -2473,75 +2690,100 @@ HWY_INLINE Vec256<uint32_t> Idx64x4FromB
|
|
return Load(d32, packed_array + 8 * mask_bits);
|
|
}
|
|
|
|
-// Helper function called by both Compress and CompressStore - avoids a
|
|
+// Helper functions called by both Compress and CompressStore - avoids a
|
|
// redundant BitsFromMask in the latter.
|
|
|
|
-HWY_API Vec256<uint32_t> Compress(Vec256<uint32_t> v,
|
|
- const uint64_t mask_bits) {
|
|
-#if HWY_TARGET == HWY_AVX3
|
|
- return Vec256<uint32_t>{
|
|
- _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)};
|
|
-#else
|
|
- const Vec256<uint32_t> idx = detail::Idx32x8FromBits(mask_bits);
|
|
- return Vec256<uint32_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
|
|
-#endif
|
|
-}
|
|
-HWY_API Vec256<int32_t> Compress(Vec256<int32_t> v, const uint64_t mask_bits) {
|
|
+template <typename T>
|
|
+HWY_API Vec256<T> Compress(hwy::SizeTag<4> /*tag*/, Vec256<T> v,
|
|
+ const uint64_t mask_bits) {
|
|
+ const auto vu = BitCast(Full256<uint32_t>(), v);
|
|
#if HWY_TARGET == HWY_AVX3
|
|
- return Vec256<int32_t>{
|
|
- _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)};
|
|
+ const __m256i ret =
|
|
+ _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), vu.raw);
|
|
#else
|
|
const Vec256<uint32_t> idx = detail::Idx32x8FromBits(mask_bits);
|
|
- return Vec256<int32_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
|
|
+ const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw);
|
|
#endif
|
|
+ return BitCast(Full256<T>(), Vec256<uint32_t>{ret});
|
|
}
|
|
|
|
-HWY_API Vec256<uint64_t> Compress(Vec256<uint64_t> v,
|
|
- const uint64_t mask_bits) {
|
|
-#if HWY_TARGET == HWY_AVX3
|
|
- return Vec256<uint64_t>{
|
|
- _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)};
|
|
-#else
|
|
- const Vec256<uint32_t> idx = detail::Idx64x4FromBits(mask_bits);
|
|
- return Vec256<uint64_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
|
|
-#endif
|
|
-}
|
|
-HWY_API Vec256<int64_t> Compress(Vec256<int64_t> v, const uint64_t mask_bits) {
|
|
+template <typename T>
|
|
+HWY_API Vec256<T> Compress(hwy::SizeTag<8> /*tag*/, Vec256<T> v,
|
|
+ const uint64_t mask_bits) {
|
|
+ const auto vu = BitCast(Full256<uint64_t>(), v);
|
|
#if HWY_TARGET == HWY_AVX3
|
|
- return Vec256<int64_t>{
|
|
- _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)};
|
|
+ const __m256i ret =
|
|
+ _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), vu.raw);
|
|
#else
|
|
const Vec256<uint32_t> idx = detail::Idx64x4FromBits(mask_bits);
|
|
- return Vec256<int64_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
|
|
+ const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw);
|
|
#endif
|
|
+ return BitCast(Full256<T>(), Vec256<uint64_t>{ret});
|
|
}
|
|
|
|
-HWY_API Vec256<float> Compress(Vec256<float> v, const uint64_t mask_bits) {
|
|
-#if HWY_TARGET == HWY_AVX3
|
|
- return Vec256<float>{
|
|
- _mm256_maskz_compress_ps(static_cast<__mmask8>(mask_bits), v.raw)};
|
|
-#else
|
|
- const Vec256<uint32_t> idx = detail::Idx32x8FromBits(mask_bits);
|
|
- return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
|
|
-#endif
|
|
-}
|
|
+// Otherwise, defined in x86_512-inl.h so it can use wider vectors.
|
|
+#if HWY_TARGET != HWY_AVX3
|
|
|
|
-HWY_API Vec256<double> Compress(Vec256<double> v, const uint64_t mask_bits) {
|
|
-#if HWY_TARGET == HWY_AVX3
|
|
- return Vec256<double>{
|
|
- _mm256_maskz_compress_pd(static_cast<__mmask8>(mask_bits), v.raw)};
|
|
-#else
|
|
- const Vec256<uint32_t> idx = detail::Idx64x4FromBits(mask_bits);
|
|
- return Vec256<double>{_mm256_castsi256_pd(
|
|
- _mm256_permutevar8x32_epi32(_mm256_castpd_si256(v.raw), idx.raw))};
|
|
-#endif
|
|
+// LUTs are infeasible for 2^16 possible masks. Promoting to 32-bit and using
|
|
+// the native Compress is probably more efficient than 2 LUTs.
|
|
+template <typename T>
|
|
+HWY_API Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
|
|
+ const uint64_t mask_bits) {
|
|
+ using D = Full256<T>;
|
|
+ const Rebind<uint16_t, D> du;
|
|
+ const Repartition<int32_t, D> dw;
|
|
+ const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
|
|
+ const auto promoted0 = PromoteTo(dw, LowerHalf(vu16));
|
|
+ const auto promoted1 = PromoteTo(dw, UpperHalf(vu16));
|
|
+
|
|
+ const uint64_t mask_bits0 = mask_bits & 0xFF;
|
|
+ const uint64_t mask_bits1 = mask_bits >> 8;
|
|
+ const auto compressed0 = Compress(hwy::SizeTag<4>(), promoted0, mask_bits0);
|
|
+ const auto compressed1 = Compress(hwy::SizeTag<4>(), promoted1, mask_bits1);
|
|
+
|
|
+ const Half<decltype(du)> dh;
|
|
+ const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
|
|
+ const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));
|
|
+
|
|
+ const size_t count0 = PopCount(mask_bits0);
|
|
+ // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with
|
|
+ // VPERMD for shifting at 4 byte granularity.
|
|
+ alignas(32) constexpr int32_t iota4[16] = {0, 0, 0, 0, 0, 0, 0, 0,
|
|
+ 0, 1, 2, 3, 4, 5, 6, 7};
|
|
+ const auto indices = SetTableIndices(dw, iota4 + 8 - count0 / 2);
|
|
+ const auto shift1_multiple4 =
|
|
+ BitCast(du, TableLookupLanes(BitCast(dw, demoted1), indices));
|
|
+
|
|
+ // Whole-register unconditional shift by 2 bytes.
|
|
+ // TODO(janwas): slow on AMD, use 2 shifts + permq + OR instead?
|
|
+ const __m256i lo_zz = _mm256_permute2x128_si256(shift1_multiple4.raw,
|
|
+ shift1_multiple4.raw, 0x08);
|
|
+ const auto shift1_multiple2 =
|
|
+ Vec256<uint16_t>{_mm256_alignr_epi8(shift1_multiple4.raw, lo_zz, 14)};
|
|
+
|
|
+ // Make the shift conditional on the lower bit of count0.
|
|
+ const auto m_odd = TestBit(Set(du, count0), Set(du, 1));
|
|
+ const auto shifted1 = IfThenElse(m_odd, shift1_multiple2, shift1_multiple4);
|
|
+
|
|
+ // Blend the lower and shifted upper parts.
|
|
+ constexpr uint16_t on = 0xFFFF;
|
|
+ alignas(32) constexpr uint16_t lower_lanes[32] = {HWY_REP4(on), HWY_REP4(on),
|
|
+ HWY_REP4(on), HWY_REP4(on)};
|
|
+ const auto m_lower = MaskFromVec(LoadU(du, lower_lanes + 16 - count0));
|
|
+ return BitCast(D(), IfThenElse(m_lower, demoted0, shifted1));
|
|
}
|
|
|
|
+#endif // HWY_TARGET != HWY_AVX3
|
|
+
|
|
} // namespace detail
|
|
|
|
+// Otherwise, defined in x86_512-inl.h after detail::Compress.
|
|
+#if HWY_TARGET != HWY_AVX3
|
|
+
|
|
template <typename T>
|
|
HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
|
|
- return detail::Compress(v, detail::BitsFromMask(mask));
|
|
+ return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
|
|
+ detail::BitsFromMask(mask));
|
|
}
|
|
|
|
// ------------------------------ CompressStore
|
|
@@ -2550,10 +2792,101 @@ template <typename T>
|
|
HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
|
|
T* HWY_RESTRICT aligned) {
|
|
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
- Store(detail::Compress(v, mask_bits), d, aligned);
|
|
+ // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but
|
|
+ // using StoreU to concatenate the results would cause page faults if
|
|
+ // `aligned` is the last valid vector. Instead rely on in-register splicing.
|
|
+ Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
|
|
return PopCount(mask_bits);
|
|
}
|
|
|
|
+#endif // HWY_TARGET != HWY_AVX3
|
|
+
|
|
+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
|
|
+// TableLookupBytes, ConcatUpperLower)
|
|
+
|
|
+HWY_API void StoreInterleaved3(const Vec256<uint8_t> v0,
|
|
+ const Vec256<uint8_t> v1,
|
|
+ const Vec256<uint8_t> v2, Full256<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ const auto k5 = Set(d, 5);
|
|
+ const auto k6 = Set(d, 6);
|
|
+
|
|
+ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
|
|
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
|
|
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
|
|
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
|
|
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
|
|
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
|
|
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
|
|
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
|
|
+ const auto shuf_r0 = LoadDup128(d, tbl_r0);
|
|
+ const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5
|
|
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
|
|
+ const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0
|
|
+ const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0.
|
|
+ const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0..
|
|
+ const auto interleaved_10_00 = r0 | g0 | b0;
|
|
+
|
|
+ // Second vector: g10,r10, bgr[9:6], b5,g5
|
|
+ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
|
|
+ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
|
|
+ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
|
|
+ const auto r1 = TableLookupBytes(v0, shuf_r1);
|
|
+ const auto g1 = TableLookupBytes(v1, shuf_g1);
|
|
+ const auto b1 = TableLookupBytes(v2, shuf_b1);
|
|
+ const auto interleaved_15_05 = r1 | g1 | b1;
|
|
+
|
|
+ // We want to write the lower halves of the interleaved vectors, then the
|
|
+ // upper halves. We could obtain 10_05 and 15_0A via ConcatUpperLower, but
|
|
+ // that would require two ununaligned stores. For the lower halves, we can
|
|
+ // merge two 128-bit stores for the same swizzling cost:
|
|
+ const auto out0 = ConcatLowerLower(interleaved_15_05, interleaved_10_00);
|
|
+ StoreU(out0, d, unaligned + 0 * 32);
|
|
+
|
|
+ // Third vector: bgr[15:11], b10
|
|
+ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
|
|
+ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
|
|
+ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
|
|
+ const auto r2 = TableLookupBytes(v0, shuf_r2);
|
|
+ const auto g2 = TableLookupBytes(v1, shuf_g2);
|
|
+ const auto b2 = TableLookupBytes(v2, shuf_b2);
|
|
+ const auto interleaved_1A_0A = r2 | g2 | b2;
|
|
+
|
|
+ const auto out1 = ConcatUpperLower(interleaved_10_00, interleaved_1A_0A);
|
|
+ StoreU(out1, d, unaligned + 1 * 32);
|
|
+
|
|
+ const auto out2 = ConcatUpperUpper(interleaved_1A_0A, interleaved_15_05);
|
|
+ StoreU(out2, d, unaligned + 2 * 32);
|
|
+}
|
|
+
|
|
+// ------------------------------ StoreInterleaved4
|
|
+
|
|
+HWY_API void StoreInterleaved4(const Vec256<uint8_t> v0,
|
|
+ const Vec256<uint8_t> v1,
|
|
+ const Vec256<uint8_t> v2,
|
|
+ const Vec256<uint8_t> v3, Full256<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // let a,b,c,d denote v0..3.
|
|
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
|
|
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
|
|
+ const auto ba8 = ZipUpper(v0, v1);
|
|
+ const auto dc8 = ZipUpper(v2, v3);
|
|
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a13 d..a10 | d..a03 d..a00
|
|
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a17 d..a14 | d..a07 d..a04
|
|
+ const auto dcba_8 = ZipLower(ba8, dc8); // d..a1B d..a18 | d..a0B d..a08
|
|
+ const auto dcba_C = ZipUpper(ba8, dc8); // d..a1F d..a1C | d..a0F d..a0C
|
|
+ // Write lower halves, then upper. vperm2i128 is slow on Zen1 but we can
|
|
+ // efficiently combine two lower halves into 256 bits:
|
|
+ const auto out0 = BitCast(d, ConcatLowerLower(dcba_4, dcba_0));
|
|
+ const auto out1 = BitCast(d, ConcatLowerLower(dcba_C, dcba_8));
|
|
+ StoreU(out0, d, unaligned + 0 * 32);
|
|
+ StoreU(out1, d, unaligned + 1 * 32);
|
|
+ const auto out2 = BitCast(d, ConcatUpperUpper(dcba_4, dcba_0));
|
|
+ const auto out3 = BitCast(d, ConcatUpperUpper(dcba_C, dcba_8));
|
|
+ StoreU(out2, d, unaligned + 2 * 32);
|
|
+ StoreU(out3, d, unaligned + 3 * 32);
|
|
+}
|
|
+
|
|
// ------------------------------ Reductions
|
|
|
|
namespace detail {
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 2021-06-02 10:56:05.218904306 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -19,6 +19,23 @@
|
|
// particular, "Broadcast", pack and zip behavior may be surprising.
|
|
|
|
#include <immintrin.h> // AVX2+
|
|
+#if defined(_MSC_VER) && defined(__clang__)
|
|
+// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
|
|
+// including these headers when _MSC_VER is defined, like when using clang-cl.
|
|
+// Include these directly here.
|
|
+#include <smmintrin.h>
|
|
+#include <avxintrin.h>
|
|
+#include <avx2intrin.h>
|
|
+#include <f16cintrin.h>
|
|
+#include <fmaintrin.h>
|
|
+#include <avx512fintrin.h>
|
|
+#include <avx512vlintrin.h>
|
|
+#include <avx512bwintrin.h>
|
|
+#include <avx512dqintrin.h>
|
|
+#include <avx512vlbwintrin.h>
|
|
+#include <avx512vldqintrin.h>
|
|
+#endif
|
|
+
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
|
|
@@ -100,9 +117,8 @@ struct RawMask512<8> {
|
|
// Mask register: one bit per lane.
|
|
template <typename T>
|
|
class Mask512 {
|
|
- using Raw = typename RawMask512<sizeof(T)>::type;
|
|
-
|
|
public:
|
|
+ using Raw = typename RawMask512<sizeof(T)>::type;
|
|
Raw raw;
|
|
};
|
|
|
|
@@ -167,23 +183,24 @@ HWY_API Vec512<uint16_t> Set(Full512<uin
|
|
return Vec512<uint16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec512<uint32_t> Set(Full512<uint32_t> /* tag */, const uint32_t t) {
|
|
- return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))}; // NOLINT
|
|
+ return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};
|
|
}
|
|
HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
|
|
return Vec512<uint64_t>{
|
|
_mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec512<int8_t> Set(Full512<int8_t> /* tag */, const int8_t t) {
|
|
- return Vec512<int8_t>{_mm512_set1_epi8(t)};
|
|
+ return Vec512<int8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec512<int16_t> Set(Full512<int16_t> /* tag */, const int16_t t) {
|
|
- return Vec512<int16_t>{_mm512_set1_epi16(t)};
|
|
+ return Vec512<int16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec512<int32_t> Set(Full512<int32_t> /* tag */, const int32_t t) {
|
|
return Vec512<int32_t>{_mm512_set1_epi32(t)};
|
|
}
|
|
HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
|
|
- return Vec512<int64_t>{_mm512_set1_epi64(t)};
|
|
+ return Vec512<int64_t>{
|
|
+ _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
|
|
}
|
|
HWY_API Vec512<float> Set(Full512<float> /* tag */, const float t) {
|
|
return Vec512<float>{_mm512_set1_ps(t)};
|
|
@@ -329,7 +346,45 @@ HWY_API Vec512<T> CopySignToAbs(const Ve
|
|
return CopySign(abs, sign);
|
|
}
|
|
|
|
-// ------------------------------ Select/blend
|
|
+// ------------------------------ FirstN
|
|
+
|
|
+// Possibilities for constructing a bitmask of N ones:
|
|
+// - kshift* only consider the lowest byte of the shift count, so they would
|
|
+// not correctly handle large n.
|
|
+// - Scalar shifts >= 64 are UB.
|
|
+// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
|
|
+// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
|
|
+
|
|
+#if HWY_ARCH_X86_32
|
|
+namespace detail {
|
|
+
|
|
+// 32 bit mask is sufficient for lane size >= 2.
|
|
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
|
|
+HWY_API Mask512<T> FirstN(size_t n) {
|
|
+ using Bits = typename Mask512<T>::Raw;
|
|
+ return Mask512<T>{static_cast<Bits>(_bzhi_u32(~uint32_t(0), n))};
|
|
+}
|
|
+
|
|
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
|
|
+HWY_API Mask512<T> FirstN(size_t n) {
|
|
+ const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
|
|
+ return Mask512<T>{static_cast<__mmask64>(bits)};
|
|
+}
|
|
+
|
|
+} // namespace detail
|
|
+#endif // HWY_ARCH_X86_32
|
|
+
|
|
+template <typename T>
|
|
+HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
|
|
+#if HWY_ARCH_X86_64
|
|
+ using Bits = typename Mask512<T>::Raw;
|
|
+ return Mask512<T>{static_cast<Bits>(_bzhi_u64(~uint64_t(0), n))};
|
|
+#else
|
|
+ return detail::FirstN<T>(n);
|
|
+#endif // HWY_ARCH_X86_64
|
|
+}
|
|
+
|
|
+// ------------------------------ IfThenElse
|
|
|
|
// Returns mask ? b : a.
|
|
|
|
@@ -626,7 +681,13 @@ HWY_API Vec512<uint16_t> AverageRound(co
|
|
|
|
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
|
|
HWY_API Vec512<int8_t> Abs(const Vec512<int8_t> v) {
|
|
+#if HWY_COMPILER_MSVC
|
|
+ // Workaround for incorrect codegen? (untested due to internal compiler error)
|
|
+ const auto zero = Zero(Full512<int8_t>());
|
|
+ return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
|
|
+#else
|
|
return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
|
|
+#endif
|
|
}
|
|
HWY_API Vec512<int16_t> Abs(const Vec512<int16_t> v) {
|
|
return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
|
|
@@ -634,6 +695,9 @@ HWY_API Vec512<int16_t> Abs(const Vec512
|
|
HWY_API Vec512<int32_t> Abs(const Vec512<int32_t> v) {
|
|
return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
|
|
}
|
|
+HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) {
|
|
+ return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
|
|
+}
|
|
|
|
// These aren't native instructions, they also involve AND with constant.
|
|
HWY_API Vec512<float> Abs(const Vec512<float> v) {
|
|
@@ -675,6 +739,16 @@ HWY_API Vec512<int64_t> ShiftLeft(const
|
|
return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
|
|
}
|
|
|
|
+template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
|
|
+HWY_API Vec512<T> ShiftLeft(const Vec512<T> v) {
|
|
+ const Full512<T> d8;
|
|
+ const RepartitionToWide<decltype(d8)> d16;
|
|
+ const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
|
|
+ return kBits == 1
|
|
+ ? (v + v)
|
|
+ : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
|
|
+}
|
|
+
|
|
// ------------------------------ ShiftRight
|
|
|
|
template <int kBits>
|
|
@@ -693,6 +767,14 @@ HWY_API Vec512<uint64_t> ShiftRight(cons
|
|
}
|
|
|
|
template <int kBits>
|
|
+HWY_API Vec512<uint8_t> ShiftRight(const Vec512<uint8_t> v) {
|
|
+ const Full512<uint8_t> d8;
|
|
+ // Use raw instead of BitCast to support N=1.
|
|
+ const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
|
|
+ return shifted & Set(d8, 0xFF >> kBits);
|
|
+}
|
|
+
|
|
+template <int kBits>
|
|
HWY_API Vec512<int16_t> ShiftRight(const Vec512<int16_t> v) {
|
|
return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
|
|
}
|
|
@@ -707,6 +789,15 @@ HWY_API Vec512<int64_t> ShiftRight(const
|
|
return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
|
|
}
|
|
|
|
+template <int kBits>
|
|
+HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
|
|
+ const Full512<int8_t> di;
|
|
+ const Full512<uint8_t> du;
|
|
+ const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
|
|
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
|
|
+ return (shifted ^ shifted_sign) - shifted_sign;
|
|
+}
|
|
+
|
|
// ------------------------------ ShiftLeftSame
|
|
|
|
HWY_API Vec512<uint16_t> ShiftLeftSame(const Vec512<uint16_t> v,
|
|
@@ -734,6 +825,14 @@ HWY_API Vec512<int64_t> ShiftLeftSame(co
|
|
return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
|
|
+HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
|
|
+ const Full512<T> d8;
|
|
+ const RepartitionToWide<decltype(d8)> d16;
|
|
+ const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
|
|
+ return shifted & Set(d8, (0xFF << bits) & 0xFF);
|
|
+}
|
|
+
|
|
// ------------------------------ ShiftRightSame
|
|
|
|
HWY_API Vec512<uint16_t> ShiftRightSame(const Vec512<uint16_t> v,
|
|
@@ -749,6 +848,13 @@ HWY_API Vec512<uint64_t> ShiftRightSame(
|
|
return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
+HWY_API Vec512<uint8_t> ShiftRightSame(Vec512<uint8_t> v, const int bits) {
|
|
+ const Full512<uint8_t> d8;
|
|
+ const RepartitionToWide<decltype(d8)> d16;
|
|
+ const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
|
|
+ return shifted & Set(d8, 0xFF >> bits);
|
|
+}
|
|
+
|
|
HWY_API Vec512<int16_t> ShiftRightSame(const Vec512<int16_t> v,
|
|
const int bits) {
|
|
return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
|
|
@@ -763,6 +869,14 @@ HWY_API Vec512<int64_t> ShiftRightSame(c
|
|
return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
+HWY_API Vec512<int8_t> ShiftRightSame(Vec512<int8_t> v, const int bits) {
|
|
+ const Full512<int8_t> di;
|
|
+ const Full512<uint8_t> du;
|
|
+ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
|
|
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
|
|
+ return (shifted ^ shifted_sign) - shifted_sign;
|
|
+}
|
|
+
|
|
// ------------------------------ Shl
|
|
|
|
HWY_API Vec512<uint16_t> operator<<(const Vec512<uint16_t> v,
|
|
@@ -1046,6 +1160,10 @@ HWY_API Vec512<float> ApproximateRecipro
|
|
|
|
// ------------------------------ Floating-point rounding
|
|
|
|
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
+HWY_DIAGNOSTICS(push)
|
|
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
+
|
|
// Toward nearest integer, tie to even
|
|
HWY_API Vec512<float> Round(const Vec512<float> v) {
|
|
return Vec512<float>{_mm512_roundscale_ps(
|
|
@@ -1086,6 +1204,8 @@ HWY_API Vec512<double> Floor(const Vec51
|
|
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
|
|
}
|
|
|
|
+HWY_DIAGNOSTICS(pop)
|
|
+
|
|
// ================================================== COMPARE
|
|
|
|
// Comparisons set a mask bit to 1 if the condition is true, else 0.
|
|
@@ -1678,6 +1798,83 @@ HWY_API void Stream(const Vec512<double>
|
|
_mm512_stream_pd(aligned, v.raw);
|
|
}
|
|
|
|
+// ------------------------------ Scatter
|
|
+
|
|
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
+HWY_DIAGNOSTICS(push)
|
|
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
+
|
|
+namespace detail {
|
|
+
|
|
+template <typename T>
|
|
+HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512<T> v,
|
|
+ Full512<T> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec512<int32_t> offset) {
|
|
+ _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512<T> v,
|
|
+ Full512<T> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec512<int32_t> index) {
|
|
+ _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
|
|
+}
|
|
+
|
|
+template <typename T>
|
|
+HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512<T> v,
|
|
+ Full512<T> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec512<int64_t> offset) {
|
|
+ _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
|
|
+}
|
|
+template <typename T>
|
|
+HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512<T> v,
|
|
+ Full512<T> /* tag */, T* HWY_RESTRICT base,
|
|
+ const Vec512<int64_t> index) {
|
|
+ _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
|
|
+}
|
|
+
|
|
+} // namespace detail
|
|
+
|
|
+template <typename T, typename Offset>
|
|
+HWY_API void ScatterOffset(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
|
|
+ const Vec512<Offset> offset) {
|
|
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
+ return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
|
|
+}
|
|
+template <typename T, typename Index>
|
|
+HWY_API void ScatterIndex(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
|
|
+ const Vec512<Index> index) {
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
+ return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
|
|
+}
|
|
+
|
|
+template <>
|
|
+HWY_INLINE void ScatterOffset<float>(Vec512<float> v, Full512<float> /* tag */,
|
|
+ float* HWY_RESTRICT base,
|
|
+ const Vec512<int32_t> offset) {
|
|
+ _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
|
|
+}
|
|
+template <>
|
|
+HWY_INLINE void ScatterIndex<float>(Vec512<float> v, Full512<float> /* tag */,
|
|
+ float* HWY_RESTRICT base,
|
|
+ const Vec512<int32_t> index) {
|
|
+ _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
|
|
+}
|
|
+
|
|
+template <>
|
|
+HWY_INLINE void ScatterOffset<double>(Vec512<double> v,
|
|
+ Full512<double> /* tag */,
|
|
+ double* HWY_RESTRICT base,
|
|
+ const Vec512<int64_t> offset) {
|
|
+ _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
|
|
+}
|
|
+template <>
|
|
+HWY_INLINE void ScatterIndex<double>(Vec512<double> v,
|
|
+ Full512<double> /* tag */,
|
|
+ double* HWY_RESTRICT base,
|
|
+ const Vec512<int64_t> index) {
|
|
+ _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
|
|
+}
|
|
+
|
|
// ------------------------------ Gather
|
|
|
|
namespace detail {
|
|
@@ -1713,13 +1910,13 @@ HWY_API Vec512<T> GatherIndex(hwy::SizeT
|
|
template <typename T, typename Offset>
|
|
HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base,
|
|
const Vec512<Offset> offset) {
|
|
- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
|
|
+static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
|
|
return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
|
|
}
|
|
template <typename T, typename Index>
|
|
HWY_API Vec512<T> GatherIndex(Full512<T> d, const T* HWY_RESTRICT base,
|
|
const Vec512<Index> index) {
|
|
- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
|
|
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
|
|
return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
|
|
}
|
|
|
|
@@ -1749,6 +1946,8 @@ HWY_INLINE Vec512<double> GatherIndex<do
|
|
return Vec512<double>{_mm512_i64gather_pd(index.raw, base, 8)};
|
|
}
|
|
|
|
+HWY_DIAGNOSTICS(pop)
|
|
+
|
|
// ================================================== SWIZZLE
|
|
|
|
template <typename T>
|
|
@@ -2439,7 +2638,11 @@ HWY_API Vec256<int8_t> DemoteTo(Full256<
|
|
|
|
HWY_API Vec256<float16_t> DemoteTo(Full256<float16_t> /* tag */,
|
|
const Vec512<float> v) {
|
|
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
+ HWY_DIAGNOSTICS(push)
|
|
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
|
|
+ HWY_DIAGNOSTICS(pop)
|
|
}
|
|
|
|
HWY_API Vec256<float> DemoteTo(Full256<float> /* tag */,
|
|
@@ -2633,8 +2836,81 @@ HWY_API Vec512<double> Compress(Vec512<d
|
|
return Vec512<double>{_mm512_maskz_compress_pd(mask.raw, v.raw)};
|
|
}
|
|
|
|
+namespace detail {
|
|
+
|
|
+// Ignore IDE redefinition error for these two functions: if this header is
|
|
+// included, then the functions weren't actually defined in x86_256-inl.h.
|
|
+template <typename T>
|
|
+HWY_API Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
|
|
+ const uint64_t mask_bits) {
|
|
+ using D = Full256<T>;
|
|
+ const Rebind<uint16_t, D> du;
|
|
+ const Rebind<int32_t, D> dw; // 512-bit, not 256!
|
|
+ const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
|
|
+ const Mask512<int32_t> mask{static_cast<__mmask16>(mask_bits)};
|
|
+ return BitCast(D(), DemoteTo(du, Compress(PromoteTo(dw, vu16), mask)));
|
|
+}
|
|
+
|
|
+} // namespace detail
|
|
+
|
|
+template <typename T>
|
|
+HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
|
|
+ return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
|
|
+ detail::BitsFromMask(mask));
|
|
+}
|
|
+
|
|
+// Expands to 32-bit, compresses, concatenates demoted halves.
|
|
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
|
|
+HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
|
|
+ using D = Full512<T>;
|
|
+ const Rebind<uint16_t, D> du;
|
|
+ const Repartition<int32_t, D> dw;
|
|
+ const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
|
|
+ const auto promoted0 = PromoteTo(dw, LowerHalf(vu16));
|
|
+ const auto promoted1 = PromoteTo(dw, UpperHalf(vu16));
|
|
+
|
|
+ const Mask512<int32_t> mask0{static_cast<__mmask16>(mask.raw & 0xFFFF)};
|
|
+ const Mask512<int32_t> mask1{static_cast<__mmask16>(mask.raw >> 16)};
|
|
+ const auto compressed0 = Compress(promoted0, mask0);
|
|
+ const auto compressed1 = Compress(promoted1, mask1);
|
|
+
|
|
+ const Half<decltype(du)> dh;
|
|
+ const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
|
|
+ const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));
|
|
+
|
|
+ // Concatenate into single vector by shifting upper with writemask.
|
|
+ const size_t num0 = CountTrue(mask0);
|
|
+ const __mmask32 m_upper = ~((1u << num0) - 1);
|
|
+ alignas(64) uint16_t iota[64] = {
|
|
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
|
|
+ const auto idx = LoadU(du, iota + 32 - num0);
|
|
+ return Vec512<T>{_mm512_mask_permutexvar_epi16(demoted0.raw, m_upper, idx.raw,
|
|
+ demoted1.raw)};
|
|
+}
|
|
+
|
|
// ------------------------------ CompressStore
|
|
|
|
+template <typename T>
|
|
+HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
|
|
+ T* HWY_RESTRICT aligned) {
|
|
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
+ Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
|
|
+ return PopCount(mask_bits);
|
|
+}
|
|
+
|
|
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
|
|
+HWY_API size_t CompressStore(Vec512<T> v, const Mask512<T> mask, Full512<T> d,
|
|
+ T* HWY_RESTRICT aligned) {
|
|
+ // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but
|
|
+ // using StoreU to concatenate the results would cause page faults if
|
|
+ // `aligned` is the last valid vector. Instead rely on in-register splicing.
|
|
+ Store(Compress(v, mask), d, aligned);
|
|
+ return CountTrue(mask);
|
|
+}
|
|
+
|
|
HWY_API size_t CompressStore(Vec512<uint32_t> v, const Mask512<uint32_t> mask,
|
|
Full512<uint32_t> /* tag */,
|
|
uint32_t* HWY_RESTRICT aligned) {
|
|
@@ -2675,6 +2951,98 @@ HWY_API size_t CompressStore(Vec512<doub
|
|
return CountTrue(mask);
|
|
}
|
|
|
|
+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
|
|
+// TableLookupBytes)
|
|
+
|
|
+HWY_API void StoreInterleaved3(const Vec512<uint8_t> a, const Vec512<uint8_t> b,
|
|
+ const Vec512<uint8_t> c, Full512<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ const auto k5 = Set(d, 5);
|
|
+ const auto k6 = Set(d, 6);
|
|
+
|
|
+ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
|
|
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
|
|
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
|
|
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
|
|
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
|
|
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
|
|
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
|
|
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
|
|
+ const auto shuf_r0 = LoadDup128(d, tbl_r0);
|
|
+ const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5
|
|
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
|
|
+ const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
|
|
+ const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
|
|
+ const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
|
|
+ const auto i = (r0 | g0 | b0).raw; // low byte in each 128bit: 30 20 10 00
|
|
+
|
|
+ // Second vector: g10,r10, bgr[9:6], b5,g5
|
|
+ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
|
|
+ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
|
|
+ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
|
|
+ const auto r1 = TableLookupBytes(a, shuf_r1);
|
|
+ const auto g1 = TableLookupBytes(b, shuf_g1);
|
|
+ const auto b1 = TableLookupBytes(c, shuf_b1);
|
|
+ const auto j = (r1 | g1 | b1).raw; // low byte in each 128bit: 35 25 15 05
|
|
+
|
|
+ // Third vector: bgr[15:11], b10
|
|
+ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
|
|
+ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
|
|
+ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
|
|
+ const auto r2 = TableLookupBytes(a, shuf_r2);
|
|
+ const auto g2 = TableLookupBytes(b, shuf_g2);
|
|
+ const auto b2 = TableLookupBytes(c, shuf_b2);
|
|
+ const auto k = (r2 | g2 | b2).raw; // low byte in each 128bit: 3A 2A 1A 0A
|
|
+
|
|
+ // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns".
|
|
+ const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0));
|
|
+ const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1));
|
|
+ const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2));
|
|
+
|
|
+ // Alternating order, most-significant 128 bits from the second arg.
|
|
+ const __mmask8 m = 0xCC;
|
|
+ const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1);
|
|
+ const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2);
|
|
+ const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0);
|
|
+
|
|
+ StoreU(Vec512<uint8_t>{i1_k0_j0_i0}, d, unaligned + 0 * 64); // 10 0A 05 00
|
|
+ StoreU(Vec512<uint8_t>{j2_i2_k1_j1}, d, unaligned + 1 * 64); // 25 20 1A 15
|
|
+ StoreU(Vec512<uint8_t>{k3_j3_i3_k2}, d, unaligned + 2 * 64); // 3A 35 30 2A
|
|
+}
|
|
+
|
|
+// ------------------------------ StoreInterleaved4
|
|
+
|
|
+HWY_API void StoreInterleaved4(const Vec512<uint8_t> v0,
|
|
+ const Vec512<uint8_t> v1,
|
|
+ const Vec512<uint8_t> v2,
|
|
+ const Vec512<uint8_t> v3, Full512<uint8_t> d,
|
|
+ uint8_t* HWY_RESTRICT unaligned) {
|
|
+ // let a,b,c,d denote v0..3.
|
|
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
|
|
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
|
|
+ const auto ba8 = ZipUpper(v0, v1);
|
|
+ const auto dc8 = ZipUpper(v2, v3);
|
|
+ const auto i = ZipLower(ba0, dc0).raw; // 4x128bit: d..a3 d..a0
|
|
+ const auto j = ZipUpper(ba0, dc0).raw; // 4x128bit: d..a7 d..a4
|
|
+ const auto k = ZipLower(ba8, dc8).raw; // 4x128bit: d..aB d..a8
|
|
+ const auto l = ZipUpper(ba8, dc8).raw; // 4x128bit: d..aF d..aC
|
|
+ // 128-bit blocks were independent until now; transpose 4x4.
|
|
+ const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0));
|
|
+ const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0));
|
|
+ const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2));
|
|
+ const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2));
|
|
+ constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0);
|
|
+ constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1);
|
|
+ const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
|
|
+ const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
|
|
+ const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);
|
|
+ const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31);
|
|
+ StoreU(Vec512<uint8_t>{l0_k0_j0_i0}, d, unaligned + 0 * 64);
|
|
+ StoreU(Vec512<uint8_t>{l1_k1_j1_i1}, d, unaligned + 1 * 64);
|
|
+ StoreU(Vec512<uint8_t>{l2_k2_j2_i2}, d, unaligned + 2 * 64);
|
|
+ StoreU(Vec512<uint8_t>{l3_k3_j3_i3}, d, unaligned + 3 * 64);
|
|
+}
|
|
+
|
|
// ------------------------------ Reductions
|
|
|
|
// Returns the sum in each lane.
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12 2021-06-02 10:56:05.281904625 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -28,12 +28,12 @@
|
|
|
|
#if HWY_ARCH_X86
|
|
#include <xmmintrin.h>
|
|
-#ifdef _MSC_VER
|
|
+#if HWY_COMPILER_MSVC
|
|
#include <intrin.h>
|
|
-#else
|
|
+#else // HWY_COMPILER_MSVC
|
|
#include <cpuid.h>
|
|
-#endif
|
|
-#endif
|
|
+#endif // HWY_COMPILER_MSVC
|
|
+#endif // HWY_ARCH_X86
|
|
|
|
namespace hwy {
|
|
namespace {
|
|
@@ -48,13 +48,13 @@ bool IsBitSet(const uint32_t reg, const
|
|
// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
|
|
void Cpuid(const uint32_t level, const uint32_t count,
|
|
uint32_t* HWY_RESTRICT abcd) {
|
|
-#ifdef _MSC_VER
|
|
+#if HWY_COMPILER_MSVC
|
|
int regs[4];
|
|
__cpuidex(regs, level, count);
|
|
for (int i = 0; i < 4; ++i) {
|
|
abcd[i] = regs[i];
|
|
}
|
|
-#else
|
|
+#else // HWY_COMPILER_MSVC
|
|
uint32_t a;
|
|
uint32_t b;
|
|
uint32_t c;
|
|
@@ -64,22 +64,22 @@ void Cpuid(const uint32_t level, const u
|
|
abcd[1] = b;
|
|
abcd[2] = c;
|
|
abcd[3] = d;
|
|
-#endif
|
|
+#endif // HWY_COMPILER_MSVC
|
|
}
|
|
|
|
// Returns the lower 32 bits of extended control register 0.
|
|
// Requires CPU support for "OSXSAVE" (see below).
|
|
uint32_t ReadXCR0() {
|
|
-#ifdef _MSC_VER
|
|
+#if HWY_COMPILER_MSVC
|
|
return static_cast<uint32_t>(_xgetbv(0));
|
|
-#else
|
|
+#else // HWY_COMPILER_MSVC
|
|
uint32_t xcr0, xcr0_high;
|
|
const uint32_t index = 0;
|
|
asm volatile(".byte 0x0F, 0x01, 0xD0"
|
|
: "=a"(xcr0), "=d"(xcr0_high)
|
|
: "c"(index));
|
|
return xcr0;
|
|
-#endif
|
|
+#endif // HWY_COMPILER_MSVC
|
|
}
|
|
|
|
#endif // HWY_ARCH_X86
|
|
@@ -126,7 +126,7 @@ constexpr uint32_t kAVX512VL = 1u << 13;
|
|
constexpr uint32_t kAVX512DQ = 1u << 14;
|
|
constexpr uint32_t kAVX512BW = 1u << 15;
|
|
constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW;
|
|
-#endif
|
|
+#endif // HWY_ARCH_X86
|
|
|
|
} // namespace
|
|
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 2021-06-02 10:56:05.267904554 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -65,7 +65,9 @@
|
|
// HWY_MAX_DYNAMIC_TARGETS in total.
|
|
#define HWY_HIGHEST_TARGET_BIT_X86 9
|
|
|
|
-// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium
|
|
+#define HWY_SVE2 0x400
|
|
+#define HWY_SVE 0x800
|
|
+// 0x1000 reserved for Helium
|
|
#define HWY_NEON 0x2000
|
|
|
|
#define HWY_HIGHEST_TARGET_BIT_ARM 13
|
|
@@ -90,6 +92,9 @@
|
|
// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
|
|
|
|
#define HWY_SCALAR 0x20000000
|
|
+
|
|
+#define HWY_HIGHEST_TARGET_BIT_SCALAR 29
|
|
+
|
|
// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
|
|
|
|
//------------------------------------------------------------------------------
|
|
@@ -106,25 +111,26 @@
|
|
#ifndef HWY_BROKEN_TARGETS
|
|
|
|
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
|
|
-// SSE4 codegen (msan failure), so disable all those targets.
|
|
+// SSE4 codegen (possibly only for msan), so disable all those targets.
|
|
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
|
|
-// TODO: Disable all non-scalar targets for every build target once we have
|
|
-// clang-7 enabled in our builders.
|
|
-#ifdef MEMORY_SANITIZER
|
|
#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
|
|
-#else
|
|
-#define HWY_BROKEN_TARGETS 0
|
|
-#endif
|
|
// This entails a major speed reduction, so warn unless the user explicitly
|
|
// opts in to scalar-only.
|
|
#if !defined(HWY_COMPILE_ONLY_SCALAR)
|
|
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
|
|
#endif
|
|
|
|
-// MSVC, or 32-bit may fail to compile AVX2/3.
|
|
-#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32
|
|
+// 32-bit may fail to compile AVX2/3.
|
|
+#elif HWY_ARCH_X86_32
|
|
#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3)
|
|
-#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds")
|
|
+
|
|
+// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
|
|
+#elif HWY_COMPILER_MSVC != 0
|
|
+#define HWY_BROKEN_TARGETS (HWY_AVX3)
|
|
+
|
|
+// armv7be has not been tested and is not yet supported.
|
|
+#elif HWY_ARCH_ARM_V7 && (defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN))
|
|
+#define HWY_BROKEN_TARGETS (HWY_NEON)
|
|
|
|
#else
|
|
#define HWY_BROKEN_TARGETS 0
|
|
@@ -145,53 +151,74 @@
|
|
// user to override this without any guarantee of success.
|
|
#ifndef HWY_BASELINE_TARGETS
|
|
|
|
-#ifdef __wasm_simd128__
|
|
+// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
|
|
+// HWY_TARGET == HWY_SCALAR.
|
|
+
|
|
+#if HWY_ARCH_WASM && defined(__wasm_simd128__)
|
|
#define HWY_BASELINE_WASM HWY_WASM
|
|
#else
|
|
#define HWY_BASELINE_WASM 0
|
|
#endif
|
|
|
|
-#ifdef __VSX__
|
|
+// Avoid choosing the PPC target until we have an implementation.
|
|
+#if HWY_ARCH_PPC && defined(__VSX__) && 0
|
|
#define HWY_BASELINE_PPC8 HWY_PPC8
|
|
#else
|
|
#define HWY_BASELINE_PPC8 0
|
|
#endif
|
|
|
|
-// GCC 4.5.4 only defines the former; 5.4 defines both.
|
|
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
|
+// Avoid choosing the SVE[2] targets until the implementation is ready.
|
|
+#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) && 0
|
|
+#define HWY_BASELINE_SVE2 HWY_SVE2
|
|
+#else
|
|
+#define HWY_BASELINE_SVE2 0
|
|
+#endif
|
|
+
|
|
+#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE) && 0
|
|
+#define HWY_BASELINE_SVE HWY_SVE
|
|
+#else
|
|
+#define HWY_BASELINE_SVE 0
|
|
+#endif
|
|
+
|
|
+// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
|
|
+#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
|
|
#define HWY_BASELINE_NEON HWY_NEON
|
|
#else
|
|
#define HWY_BASELINE_NEON 0
|
|
#endif
|
|
|
|
-#ifdef __SSE4_1__
|
|
+// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means
|
|
+// we at least get SSE4 on machines supporting AVX but not AVX2.
|
|
+// https://stackoverflow.com/questions/18563978/
|
|
+#if HWY_ARCH_X86 && \
|
|
+ (defined(__SSE4_1__) || (HWY_COMPILER_MSVC != 0 && defined(__AVX__)))
|
|
#define HWY_BASELINE_SSE4 HWY_SSE4
|
|
#else
|
|
#define HWY_BASELINE_SSE4 0
|
|
#endif
|
|
|
|
-#ifdef __AVX2__
|
|
+#if HWY_ARCH_X86 && defined(__AVX2__)
|
|
#define HWY_BASELINE_AVX2 HWY_AVX2
|
|
#else
|
|
#define HWY_BASELINE_AVX2 0
|
|
#endif
|
|
|
|
-#ifdef __AVX512F__
|
|
+#if HWY_ARCH_X86 && defined(__AVX512F__)
|
|
#define HWY_BASELINE_AVX3 HWY_AVX3
|
|
#else
|
|
#define HWY_BASELINE_AVX3 0
|
|
#endif
|
|
|
|
-#ifdef __riscv_vector
|
|
+#if HWY_ARCH_RVV && defined(__riscv_vector)
|
|
#define HWY_BASELINE_RVV HWY_RVV
|
|
#else
|
|
#define HWY_BASELINE_RVV 0
|
|
#endif
|
|
|
|
#define HWY_BASELINE_TARGETS \
|
|
- (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \
|
|
- HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
|
|
- HWY_BASELINE_RVV)
|
|
+ (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
|
|
+ HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE4 | \
|
|
+ HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | HWY_BASELINE_RVV)
|
|
|
|
#endif // HWY_BASELINE_TARGETS
|
|
|
|
@@ -242,13 +269,12 @@
|
|
#define HWY_TARGETS HWY_STATIC_TARGET
|
|
|
|
// 3) For tests: include all attainable targets (in particular: scalar)
|
|
-#elif defined(HWY_COMPILE_ALL_ATTAINABLE)
|
|
+#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
|
|
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
|
|
|
|
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
|
|
// excluding superseded targets, in particular scalar.
|
|
#else
|
|
-
|
|
#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
|
|
|
|
#endif // target policy
|
|
@@ -323,6 +349,10 @@ static inline HWY_MAYBE_UNUSED const cha
|
|
#endif
|
|
|
|
#if HWY_ARCH_ARM
|
|
+ case HWY_SVE2:
|
|
+ return "SVE2";
|
|
+ case HWY_SVE:
|
|
+ return "SVE";
|
|
case HWY_NEON:
|
|
return "Neon";
|
|
#endif
|
|
@@ -346,7 +376,7 @@ static inline HWY_MAYBE_UNUSED const cha
|
|
return "Scalar";
|
|
|
|
default:
|
|
- return "?";
|
|
+ return "Unknown"; // must satisfy gtest IsValidParamName()
|
|
}
|
|
}
|
|
|
|
@@ -405,21 +435,17 @@ static inline HWY_MAYBE_UNUSED const cha
|
|
nullptr, /* SSE3 */ \
|
|
nullptr /* SSE2 */
|
|
|
|
-#endif // HWY_ARCH_X86
|
|
-
|
|
-#if HWY_ARCH_ARM
|
|
+#elif HWY_ARCH_ARM
|
|
// See HWY_ARCH_X86 above for details.
|
|
#define HWY_MAX_DYNAMIC_TARGETS 4
|
|
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
|
|
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
|
- nullptr, /* reserved */ \
|
|
- nullptr, /* reserved */ \
|
|
+ HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
|
|
+ HWY_CHOOSE_SVE(func_name), /* SVE */ \
|
|
nullptr, /* reserved */ \
|
|
HWY_CHOOSE_NEON(func_name) /* NEON */
|
|
|
|
-#endif // HWY_ARCH_ARM
|
|
-
|
|
-#if HWY_ARCH_PPC
|
|
+#elif HWY_ARCH_PPC
|
|
// See HWY_ARCH_X86 above for details.
|
|
#define HWY_MAX_DYNAMIC_TARGETS 5
|
|
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
|
|
@@ -430,9 +456,7 @@ static inline HWY_MAYBE_UNUSED const cha
|
|
nullptr, /* VSX */ \
|
|
nullptr /* AltiVec */
|
|
|
|
-#endif // HWY_ARCH_PPC
|
|
-
|
|
-#if HWY_ARCH_WASM
|
|
+#elif HWY_ARCH_WASM
|
|
// See HWY_ARCH_X86 above for details.
|
|
#define HWY_MAX_DYNAMIC_TARGETS 4
|
|
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
|
|
@@ -442,9 +466,7 @@ static inline HWY_MAYBE_UNUSED const cha
|
|
nullptr, /* reserved */ \
|
|
HWY_CHOOSE_WASM(func_name) /* WASM */
|
|
|
|
-#endif // HWY_ARCH_WASM
|
|
-
|
|
-#if HWY_ARCH_RVV
|
|
+#elif HWY_ARCH_RVV
|
|
// See HWY_ARCH_X86 above for details.
|
|
#define HWY_MAX_DYNAMIC_TARGETS 4
|
|
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
|
|
@@ -454,7 +476,12 @@ static inline HWY_MAYBE_UNUSED const cha
|
|
nullptr, /* reserved */ \
|
|
HWY_CHOOSE_RVV(func_name) /* RVV */
|
|
|
|
-#endif // HWY_ARCH_RVV
|
|
+#else
|
|
+// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
|
|
+// still creating single-entry tables in HWY_EXPORT to ensure portability.
|
|
+#define HWY_MAX_DYNAMIC_TARGETS 1
|
|
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
|
|
+#endif
|
|
|
|
struct ChosenTarget {
|
|
public:
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12 2021-06-02 10:56:05.264904539 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -35,19 +35,19 @@ DECLARE_FUNCTION(SCALAR)
|
|
HWY_EXPORT(FakeFunction);
|
|
|
|
void CheckFakeFunction() {
|
|
-#define CHECK_ARRAY_ENTRY(TGT) \
|
|
- if ((HWY_TARGETS & HWY_##TGT) != 0) { \
|
|
- hwy::SetSupportedTargetsForTest(HWY_##TGT); \
|
|
- /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \
|
|
- /* the pointer to the already cached function. */ \
|
|
- hwy::chosen_target.Update(); \
|
|
- EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
|
|
- /* Calling DeInit() will test that the initializer function */ \
|
|
- /* also calls the right function. */ \
|
|
- hwy::chosen_target.DeInit(); \
|
|
- EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
|
|
- /* Second call uses the cached value from the previous call. */ \
|
|
- EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
|
|
+#define CHECK_ARRAY_ENTRY(TGT) \
|
|
+ if ((HWY_TARGETS & HWY_##TGT) != 0) { \
|
|
+ hwy::SetSupportedTargetsForTest(HWY_##TGT); \
|
|
+ /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \
|
|
+ /* the pointer to the already cached function. */ \
|
|
+ hwy::chosen_target.Update(); \
|
|
+ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
|
|
+ /* Calling DeInit() will test that the initializer function */ \
|
|
+ /* also calls the right function. */ \
|
|
+ hwy::chosen_target.DeInit(); \
|
|
+ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
|
|
+ /* Second call uses the cached value from the previous call. */ \
|
|
+ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
|
|
}
|
|
CHECK_ARRAY_ENTRY(AVX3)
|
|
CHECK_ARRAY_ENTRY(AVX2)
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12 2021-06-02 10:56:05.251904473 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -16,7 +16,6 @@
|
|
#include <stdint.h>
|
|
|
|
#include <algorithm>
|
|
-#include <cmath>
|
|
#include <limits>
|
|
|
|
#undef HWY_TARGET_INCLUDE
|
|
@@ -173,16 +172,8 @@ struct TestFloatAbs {
|
|
};
|
|
|
|
HWY_NOINLINE void TestAllAbs() {
|
|
- const ForPartialVectors<TestAbs> test;
|
|
- test(int8_t());
|
|
- test(int16_t());
|
|
- test(int32_t());
|
|
-
|
|
- const ForPartialVectors<TestFloatAbs> test_float;
|
|
- test_float(float());
|
|
-#if HWY_CAP_FLOAT64
|
|
- test_float(double());
|
|
-#endif
|
|
+ ForSignedTypes(ForPartialVectors<TestAbs>());
|
|
+ ForFloatTypes(ForPartialVectors<TestFloatAbs>());
|
|
}
|
|
|
|
template <bool kSigned>
|
|
@@ -199,6 +190,45 @@ struct TestLeftShifts {
|
|
const size_t N = Lanes(d);
|
|
auto expected = AllocateAligned<T>(N);
|
|
|
|
+ const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
|
|
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
|
|
+
|
|
+ // 0
|
|
+ HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
|
|
+ HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
|
|
+
|
|
+ // 1
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ const T value = kSigned ? T(i) - T(N) : T(i);
|
|
+ expected[i] = T(TU(value) << 1);
|
|
+ }
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
|
|
+
|
|
+ // max
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ const T value = kSigned ? T(i) - T(N) : T(i);
|
|
+ expected[i] = T(TU(value) << kMaxShift);
|
|
+ }
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
|
|
+ }
|
|
+};
|
|
+
|
|
+template <bool kSigned>
|
|
+struct TestVariableLeftShifts {
|
|
+ template <typename T, class D>
|
|
+ HWY_NOINLINE void operator()(T t, D d) {
|
|
+ if (kSigned) {
|
|
+ // Also test positive values
|
|
+ TestVariableLeftShifts</*kSigned=*/false>()(t, d);
|
|
+ }
|
|
+
|
|
+ using TI = MakeSigned<T>;
|
|
+ using TU = MakeUnsigned<T>;
|
|
+ const size_t N = Lanes(d);
|
|
+ auto expected = AllocateAligned<T>(N);
|
|
+
|
|
const auto v0 = Zero(d);
|
|
const auto v1 = Set(d, 1);
|
|
const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
|
|
@@ -209,8 +239,6 @@ struct TestLeftShifts {
|
|
const auto large_shifts = max_shift - small_shifts;
|
|
|
|
// Same: 0
|
|
- HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
|
|
- HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
|
|
HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));
|
|
|
|
// Same: 1
|
|
@@ -218,8 +246,6 @@ struct TestLeftShifts {
|
|
const T value = kSigned ? T(i) - T(N) : T(i);
|
|
expected[i] = T(TU(value) << 1);
|
|
}
|
|
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
|
|
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
|
|
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));
|
|
|
|
// Same: max
|
|
@@ -227,8 +253,6 @@ struct TestLeftShifts {
|
|
const T value = kSigned ? T(i) - T(N) : T(i);
|
|
expected[i] = T(TU(value) << kMaxShift);
|
|
}
|
|
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
|
|
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
|
|
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));
|
|
|
|
// Variable: small
|
|
@@ -252,6 +276,37 @@ struct TestUnsignedRightShifts {
|
|
const size_t N = Lanes(d);
|
|
auto expected = AllocateAligned<T>(N);
|
|
|
|
+ const auto values = Iota(d, 0);
|
|
+
|
|
+ const T kMax = LimitsMax<T>();
|
|
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
|
|
+
|
|
+ // Shift by 0
|
|
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
|
|
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
|
|
+
|
|
+ // Shift by 1
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ expected[i] = T(T(i & kMax) >> 1);
|
|
+ }
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
|
|
+
|
|
+ // max
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ expected[i] = T(T(i & kMax) >> kMaxShift);
|
|
+ }
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
|
|
+ }
|
|
+};
|
|
+
|
|
+struct TestVariableUnsignedRightShifts {
|
|
+ template <typename T, class D>
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
+ const size_t N = Lanes(d);
|
|
+ auto expected = AllocateAligned<T>(N);
|
|
+
|
|
const auto v0 = Zero(d);
|
|
const auto v1 = Set(d, 1);
|
|
const auto values = Iota(d, 0);
|
|
@@ -265,21 +320,15 @@ struct TestUnsignedRightShifts {
|
|
const auto large_shifts = max_shift - small_shifts;
|
|
|
|
// Same: 0
|
|
- HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
|
|
- HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
|
|
HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));
|
|
|
|
// Same: 1
|
|
for (size_t i = 0; i < N; ++i) {
|
|
- expected[i] = T(i >> 1);
|
|
+ expected[i] = T(T(i & kMax) >> 1);
|
|
}
|
|
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
|
|
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
|
|
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));
|
|
|
|
// Same: max
|
|
- HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
|
|
- HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
|
|
HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));
|
|
|
|
// Variable: small
|
|
@@ -296,33 +345,120 @@ struct TestUnsignedRightShifts {
|
|
}
|
|
};
|
|
|
|
-struct TestSignedRightShifts {
|
|
+template <int kAmount, typename T>
|
|
+T RightShiftNegative(T val) {
|
|
+ // C++ shifts are implementation-defined for negative numbers, and we have
|
|
+ // seen divisions replaced with shifts, so resort to bit operations.
|
|
+ using TU = hwy::MakeUnsigned<T>;
|
|
+ TU bits;
|
|
+ CopyBytes<sizeof(T)>(&val, &bits);
|
|
+
|
|
+ const TU shifted = bits >> kAmount;
|
|
+
|
|
+ const TU all = ~TU(0);
|
|
+ const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
|
|
+ const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
|
|
+
|
|
+ bits = shifted | sign_extended;
|
|
+ CopyBytes<sizeof(T)>(&bits, &val);
|
|
+ return val;
|
|
+}
|
|
+
|
|
+class TestSignedRightShifts {
|
|
+ public:
|
|
template <typename T, class D>
|
|
- HWY_NOINLINE void operator()(T t, D d) {
|
|
- // Also test positive values
|
|
- TestUnsignedRightShifts()(t, d);
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
+ const size_t N = Lanes(d);
|
|
+ auto expected = AllocateAligned<T>(N);
|
|
+ constexpr T kMin = LimitsMin<T>();
|
|
+ constexpr T kMax = LimitsMax<T>();
|
|
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
|
|
+
|
|
+ // First test positive values, negative are checked below.
|
|
+ const auto v0 = Zero(d);
|
|
+ const auto values = Iota(d, 0) & Set(d, kMax);
|
|
+
|
|
+ // Shift by 0
|
|
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
|
|
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
|
|
+
|
|
+ // Shift by 1
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ expected[i] = T(T(i & kMax) >> 1);
|
|
+ }
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
|
|
+
|
|
+ // max
|
|
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
|
|
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
|
|
+
|
|
+ // Even negative value
|
|
+ Test<0>(kMin, d, __LINE__);
|
|
+ Test<1>(kMin, d, __LINE__);
|
|
+ Test<2>(kMin, d, __LINE__);
|
|
+ Test<kMaxShift>(kMin, d, __LINE__);
|
|
+
|
|
+ const T odd = static_cast<T>(kMin + 1);
|
|
+ Test<0>(odd, d, __LINE__);
|
|
+ Test<1>(odd, d, __LINE__);
|
|
+ Test<2>(odd, d, __LINE__);
|
|
+ Test<kMaxShift>(odd, d, __LINE__);
|
|
+ }
|
|
+
|
|
+ private:
|
|
+ template <int kAmount, typename T, class D>
|
|
+ void Test(T val, D d, int line) {
|
|
+ const auto expected = Set(d, RightShiftNegative<kAmount>(val));
|
|
+ const auto in = Set(d, val);
|
|
+ const char* file = __FILE__;
|
|
+ AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
|
|
+ AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
|
|
+ }
|
|
+};
|
|
|
|
+struct TestVariableSignedRightShifts {
|
|
+ template <typename T, class D>
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
using TU = MakeUnsigned<T>;
|
|
const size_t N = Lanes(d);
|
|
auto expected = AllocateAligned<T>(N);
|
|
|
|
constexpr T kMin = LimitsMin<T>();
|
|
- const auto values = Iota(d, kMin);
|
|
+ constexpr T kMax = LimitsMax<T>();
|
|
|
|
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
|
|
+
|
|
+ // First test positive values, negative are checked below.
|
|
+ const auto v0 = Zero(d);
|
|
+ const auto positive = Iota(d, 0) & Set(d, kMax);
|
|
+
|
|
+ // Shift by 0
|
|
+ HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
|
|
+ HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));
|
|
+
|
|
+ // Shift by 1
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ expected[i] = T(T(i & kMax) >> 1);
|
|
+ }
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));
|
|
+
|
|
+ // max
|
|
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
|
|
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));
|
|
+
|
|
const auto max_shift = Set(d, kMaxShift);
|
|
const auto small_shifts = And(Iota(d, 0), max_shift);
|
|
const auto large_shifts = max_shift - small_shifts;
|
|
|
|
- // Test varying values to shift
|
|
+ const auto negative = Iota(d, kMin);
|
|
+
|
|
+    // Test varying negative values to shift
|
|
for (size_t i = 0; i < N; ++i) {
|
|
- // We want a right-shift here, which is undefined behavior for negative
|
|
- // numbers. Since we want (-1)>>1 to be -1, we need to adjust rounding if
|
|
- // minT is odd and negative.
|
|
- T minT = static_cast<T>(kMin + i);
|
|
- expected[i] = T(minT / 2 + (minT < 0 ? minT % 2 : 0));
|
|
+ expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
|
|
}
|
|
- HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, Set(d, 1)));
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));
|
|
|
|
// Shift MSB right by small amounts
|
|
for (size_t i = 0; i < N; ++i) {
|
|
@@ -343,6 +479,13 @@ struct TestSignedRightShifts {
|
|
};
|
|
|
|
HWY_NOINLINE void TestAllShifts() {
|
|
+ ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
|
|
+ ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
|
|
+ ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
|
|
+ ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
|
|
+}
|
|
+
|
|
+HWY_NOINLINE void TestAllVariableShifts() {
|
|
const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
|
|
const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
|
|
const ForPartialVectors<TestUnsignedRightShifts> shr_u;
|
|
@@ -821,6 +964,40 @@ HWY_NOINLINE void TestAllRound() {
|
|
ForFloatTypes(ForPartialVectors<TestRound>());
|
|
}
|
|
|
|
+struct TestNearestInt {
|
|
+ template <typename TF, class DF>
|
|
+ HWY_NOINLINE void operator()(TF tf, const DF df) {
|
|
+ using TI = MakeSigned<TF>;
|
|
+ const RebindToSigned<DF> di;
|
|
+
|
|
+ size_t padded;
|
|
+ auto in = RoundTestCases(tf, df, padded);
|
|
+ auto expected = AllocateAligned<TI>(padded);
|
|
+
|
|
+ constexpr double max = static_cast<double>(LimitsMax<TI>());
|
|
+ for (size_t i = 0; i < padded; ++i) {
|
|
+ if (std::isnan(in[i])) {
|
|
+ // We replace NaN with 0 below (no_nan)
|
|
+ expected[i] = 0;
|
|
+ } else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) {
|
|
+ // Avoid undefined result for lrintf
|
|
+ expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
|
|
+ } else {
|
|
+ expected[i] = lrintf(in[i]);
|
|
+ }
|
|
+ }
|
|
+ for (size_t i = 0; i < padded; i += Lanes(df)) {
|
|
+ const auto v = Load(df, &in[i]);
|
|
+ const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df));
|
|
+ HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan));
|
|
+ }
|
|
+ }
|
|
+};
|
|
+
|
|
+HWY_NOINLINE void TestAllNearestInt() {
|
|
+ ForPartialVectors<TestNearestInt>()(float());
|
|
+}
|
|
+
|
|
struct TestTrunc {
|
|
template <typename T, class D>
|
|
HWY_NOINLINE void operator()(T t, D d) {
|
|
@@ -909,8 +1086,7 @@ struct TestSumOfLanes {
|
|
};
|
|
|
|
HWY_NOINLINE void TestAllSumOfLanes() {
|
|
- // Only full vectors because lanes in partial vectors are undefined.
|
|
- const ForFullVectors<TestSumOfLanes> sum;
|
|
+ const ForPartialVectors<TestSumOfLanes> sum;
|
|
|
|
// No u8/u16/i8/i16.
|
|
sum(uint32_t());
|
|
@@ -976,9 +1152,8 @@ struct TestMaxOfLanes {
|
|
};
|
|
|
|
HWY_NOINLINE void TestAllMinMaxOfLanes() {
|
|
- // Only full vectors because lanes in partial vectors are undefined.
|
|
- const ForFullVectors<TestMinOfLanes> min;
|
|
- const ForFullVectors<TestMaxOfLanes> max;
|
|
+ const ForPartialVectors<TestMinOfLanes> min;
|
|
+ const ForPartialVectors<TestMaxOfLanes> max;
|
|
|
|
// No u8/u16/i8/i16.
|
|
min(uint32_t());
|
|
@@ -1044,10 +1219,12 @@ HWY_NOINLINE void TestAllNeg() {
|
|
HWY_AFTER_NAMESPACE();
|
|
|
|
#if HWY_ONCE
|
|
+namespace hwy {
|
|
HWY_BEFORE_TEST(HwyArithmeticTest);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts);
|
|
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
|
|
@@ -1062,10 +1239,11 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest,
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRound);
|
|
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNearestInt);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
|
|
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
|
|
-HWY_AFTER_TEST();
|
|
+} // namespace hwy
|
|
#endif
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12 2021-06-02 10:56:05.252904478 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -272,13 +272,14 @@ HWY_NOINLINE void TestAllCombineShiftRig
|
|
HWY_AFTER_NAMESPACE();
|
|
|
|
#if HWY_ONCE
|
|
+namespace hwy {
|
|
HWY_BEFORE_TEST(HwyCombineTest);
|
|
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
|
|
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
|
|
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
|
|
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
|
|
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight);
|
|
-HWY_AFTER_TEST();
|
|
+} // namespace hwy
|
|
#endif
|
|
|
|
#else
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12 2021-06-02 10:56:05.249904463 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -206,11 +206,12 @@ HWY_NOINLINE void TestAllWeakFloat() {
|
|
HWY_AFTER_NAMESPACE();
|
|
|
|
#if HWY_ONCE
|
|
+namespace hwy {
|
|
HWY_BEFORE_TEST(HwyCompareTest);
|
|
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask);
|
|
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
|
|
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
|
|
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
|
|
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
|
|
-HWY_AFTER_TEST();
|
|
+} // namespace hwy
|
|
#endif
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12 2021-06-02 10:56:05.261904523 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -16,8 +16,6 @@
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
|
|
-#include <cmath>
|
|
-
|
|
#undef HWY_TARGET_INCLUDE
|
|
#define HWY_TARGET_INCLUDE "tests/convert_test.cc"
|
|
#include "hwy/foreach_target.h"
|
|
@@ -547,37 +545,6 @@ HWY_NOINLINE void TestAllI32F64() {
|
|
#endif
|
|
}
|
|
|
|
-struct TestNearestInt {
|
|
- template <typename TI, class DI>
|
|
- HWY_NOINLINE void operator()(TI /*unused*/, const DI di) {
|
|
- using TF = MakeFloat<TI>;
|
|
- const Rebind<TF, DI> df;
|
|
- const size_t N = Lanes(df);
|
|
-
|
|
- // Integer positive
|
|
- HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 4.0f)));
|
|
-
|
|
- // Integer negative
|
|
- HWY_ASSERT_VEC_EQ(di, Iota(di, -32), NearestInt(Iota(df, -32.0f)));
|
|
-
|
|
- // Above positive
|
|
- HWY_ASSERT_VEC_EQ(di, Iota(di, 2), NearestInt(Iota(df, 2.001f)));
|
|
-
|
|
- // Below positive
|
|
- HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 3.9999f)));
|
|
-
|
|
- const TF eps = static_cast<TF>(0.0001);
|
|
- // Above negative
|
|
- HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) + eps)));
|
|
-
|
|
- // Below negative
|
|
- HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) - eps)));
|
|
- }
|
|
-};
|
|
-
|
|
-HWY_NOINLINE void TestAllNearestInt() {
|
|
- ForPartialVectors<TestNearestInt>()(int32_t());
|
|
-}
|
|
|
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
} // namespace HWY_NAMESPACE
|
|
@@ -585,6 +552,7 @@ HWY_NOINLINE void TestAllNearestInt() {
|
|
HWY_AFTER_NAMESPACE();
|
|
|
|
#if HWY_ONCE
|
|
+namespace hwy {
|
|
HWY_BEFORE_TEST(HwyConvertTest);
|
|
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
|
|
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
|
|
@@ -596,6 +564,5 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, Te
|
|
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
|
|
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
|
|
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
|
|
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllNearestInt);
|
|
-HWY_AFTER_TEST();
|
|
+} // namespace hwy
|
|
#endif
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12 2021-06-02 10:56:05.245904442 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -14,6 +14,7 @@
|
|
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
+#include <string.h> // memcmp
|
|
|
|
#include "hwy/base.h"
|
|
|
|
@@ -159,6 +160,30 @@ HWY_NOINLINE void TestAllCopySign() {
|
|
ForFloatTypes(ForPartialVectors<TestCopySign>());
|
|
}
|
|
|
|
+struct TestFirstN {
|
|
+ template <class T, class D>
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
+ const size_t N = Lanes(d);
|
|
+ auto mask_lanes = AllocateAligned<T>(N);
|
|
+
|
|
+ // NOTE: reverse polarity (mask is true iff mask_lanes[i] == 0) because we
|
|
+ // cannot reliably compare against all bits set (NaN for float types).
|
|
+ const T off = 1;
|
|
+
|
|
+ for (size_t len = 0; len <= N; ++len) {
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ mask_lanes[i] = i < len ? T(0) : off;
|
|
+ }
|
|
+ const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
|
|
+ HWY_ASSERT_MASK_EQ(d, mask, FirstN(d, len));
|
|
+ }
|
|
+ }
|
|
+};
|
|
+
|
|
+HWY_NOINLINE void TestAllFirstN() {
|
|
+ ForAllTypes(ForPartialVectors<TestFirstN>());
|
|
+}
|
|
+
|
|
struct TestIfThenElse {
|
|
template <class T, class D>
|
|
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
@@ -208,15 +233,56 @@ HWY_NOINLINE void TestAllIfThenElse() {
|
|
ForAllTypes(ForPartialVectors<TestIfThenElse>());
|
|
}
|
|
|
|
-// Also tests MaskFromVec/VecFromMask
|
|
+struct TestMaskVec {
|
|
+ template <class T, class D>
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
+ RandomState rng;
|
|
+
|
|
+ const size_t N = Lanes(d);
|
|
+ auto mask_lanes = AllocateAligned<T>(N);
|
|
+
|
|
+ // Each lane should have a chance of having mask=true.
|
|
+ for (size_t rep = 0; rep < 100; ++rep) {
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
|
|
+ }
|
|
+
|
|
+ const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d)));
|
|
+ HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
|
|
+ }
|
|
+ }
|
|
+};
|
|
+
|
|
+HWY_NOINLINE void TestAllMaskVec() {
|
|
+ const ForPartialVectors<TestMaskVec> test;
|
|
+
|
|
+ test(uint16_t());
|
|
+ test(int16_t());
|
|
+ // TODO(janwas): float16_t - cannot compare yet
|
|
+
|
|
+ test(uint32_t());
|
|
+ test(int32_t());
|
|
+ test(float());
|
|
+
|
|
+#if HWY_CAP_INTEGER64
|
|
+ test(uint64_t());
|
|
+ test(int64_t());
|
|
+#endif
|
|
+#if HWY_CAP_FLOAT64
|
|
+ test(double());
|
|
+#endif
|
|
+}
|
|
+
|
|
struct TestCompress {
|
|
template <class T, class D>
|
|
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
RandomState rng;
|
|
|
|
+ using TU = MakeUnsigned<T>;
|
|
+ const Rebind<TU, D> du;
|
|
const size_t N = Lanes(d);
|
|
auto in_lanes = AllocateAligned<T>(N);
|
|
- auto mask_lanes = AllocateAligned<T>(N);
|
|
+ auto mask_lanes = AllocateAligned<TU>(N);
|
|
auto expected = AllocateAligned<T>(N);
|
|
auto actual = AllocateAligned<T>(N);
|
|
|
|
@@ -224,35 +290,56 @@ struct TestCompress {
|
|
for (size_t rep = 0; rep < 100; ++rep) {
|
|
size_t expected_pos = 0;
|
|
for (size_t i = 0; i < N; ++i) {
|
|
- in_lanes[i] = static_cast<T>(Random32(&rng));
|
|
- mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
|
|
+ const uint64_t bits = Random32(&rng);
|
|
+ in_lanes[i] = T(); // cannot initialize float16_t directly.
|
|
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
|
|
+ mask_lanes[i] = static_cast<TU>(Random32(&rng) & 1);
|
|
if (mask_lanes[i] == 0) { // Zero means true (easier to compare)
|
|
expected[expected_pos++] = in_lanes[i];
|
|
}
|
|
}
|
|
|
|
const auto in = Load(d, in_lanes.get());
|
|
- const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
|
|
+ const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du)));
|
|
|
|
- HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
|
|
Store(Compress(in, mask), d, actual.get());
|
|
// Upper lanes are undefined.
|
|
for (size_t i = 0; i < expected_pos; ++i) {
|
|
- HWY_ASSERT(actual[i] == expected[i]);
|
|
+ HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
|
|
}
|
|
|
|
// Also check CompressStore in the same way.
|
|
- std::fill(actual.get(), actual.get() + N, T(0));
|
|
+ memset(actual.get(), 0, N * sizeof(T));
|
|
const size_t num_written = CompressStore(in, mask, d, actual.get());
|
|
HWY_ASSERT_EQ(expected_pos, num_written);
|
|
for (size_t i = 0; i < expected_pos; ++i) {
|
|
- HWY_ASSERT_EQ(expected[i], actual[i]);
|
|
+ HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
#if 0
|
|
+namespace detail { // for code folding
|
|
+void PrintCompress16x8Tables() {
|
|
+ constexpr size_t N = 8; // 128-bit SIMD
|
|
+ for (uint64_t code = 0; code < 1ull << N; ++code) {
|
|
+ std::array<uint8_t, N> indices{0};
|
|
+ size_t pos = 0;
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ if (code & (1ull << i)) {
|
|
+ indices[pos++] = i;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ // Doubled (for converting lane to byte indices)
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ printf("%d,", 2 * indices[i]);
|
|
+ }
|
|
+ }
|
|
+ printf("\n");
|
|
+}
|
|
+
|
|
// Compressed to nibbles
|
|
void PrintCompress32x8Tables() {
|
|
constexpr size_t N = 8; // AVX2
|
|
@@ -340,16 +427,22 @@ void PrintCompress64x2Tables() {
|
|
}
|
|
printf("\n");
|
|
}
|
|
-
|
|
+} // namespace detail
|
|
#endif
|
|
|
|
HWY_NOINLINE void TestAllCompress() {
|
|
- // PrintCompress32x8Tables();
|
|
- // PrintCompress64x4Tables();
|
|
- // PrintCompress32x4Tables();
|
|
- // PrintCompress64x2Tables();
|
|
+ // detail::PrintCompress32x8Tables();
|
|
+ // detail::PrintCompress64x4Tables();
|
|
+ // detail::PrintCompress32x4Tables();
|
|
+ // detail::PrintCompress64x2Tables();
|
|
+ // detail::PrintCompress16x8Tables();
|
|
|
|
const ForPartialVectors<TestCompress> test;
|
|
+
|
|
+ test(uint16_t());
|
|
+ test(int16_t());
|
|
+ test(float16_t());
|
|
+
|
|
test(uint32_t());
|
|
test(int32_t());
|
|
test(float());
|
|
@@ -358,7 +451,6 @@ HWY_NOINLINE void TestAllCompress() {
|
|
test(uint64_t());
|
|
test(int64_t());
|
|
#endif
|
|
-
|
|
#if HWY_CAP_FLOAT64
|
|
test(double());
|
|
#endif
|
|
@@ -432,7 +524,7 @@ struct TestTestBit {
|
|
};
|
|
|
|
HWY_NOINLINE void TestAllTestBit() {
|
|
- ForIntegerTypes(ForFullVectors<TestTestBit>());
|
|
+ ForIntegerTypes(ForPartialVectors<TestTestBit>());
|
|
}
|
|
|
|
struct TestAllTrueFalse {
|
|
@@ -445,6 +537,8 @@ struct TestAllTrueFalse {
|
|
auto lanes = AllocateAligned<T>(N);
|
|
std::fill(lanes.get(), lanes.get() + N, T(0));
|
|
|
|
+ auto mask_lanes = AllocateAligned<T>(N);
|
|
+
|
|
HWY_ASSERT(AllTrue(Eq(v, zero)));
|
|
HWY_ASSERT(!AllFalse(Eq(v, zero)));
|
|
|
|
@@ -456,7 +550,13 @@ struct TestAllTrueFalse {
|
|
for (size_t i = 0; i < N; ++i) {
|
|
lanes[i] = T(1);
|
|
v = Load(d, lanes.get());
|
|
- HWY_ASSERT(!AllTrue(Eq(v, zero)));
|
|
+
|
|
+ // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
|
|
+ // Assigning to an lvalue is insufficient but storing to memory prevents
|
|
+ // the bug; so does Print of VecFromMask(d, Eq(v, zero)).
|
|
+ Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
|
|
+ HWY_ASSERT(!AllTrue(MaskFromVec(Load(d, mask_lanes.get()))));
|
|
+
|
|
HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
|
|
|
|
lanes[i] = T(-1);
|
|
@@ -596,7 +696,7 @@ struct TestLogicalMask {
|
|
};
|
|
|
|
HWY_NOINLINE void TestAllLogicalMask() {
|
|
- ForAllTypes(ForFullVectors<TestLogicalMask>());
|
|
+ ForAllTypes(ForPartialVectors<TestLogicalMask>());
|
|
}
|
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
} // namespace HWY_NAMESPACE
|
|
@@ -604,11 +704,14 @@ HWY_NOINLINE void TestAllLogicalMask() {
|
|
HWY_AFTER_NAMESPACE();
|
|
|
|
#if HWY_ONCE
|
|
+namespace hwy {
|
|
HWY_BEFORE_TEST(HwyLogicalTest);
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
|
|
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllFirstN);
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse);
|
|
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec);
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress);
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
|
|
@@ -617,5 +720,5 @@ HWY_EXPORT_AND_TEST_P(HwyLogicalTest, Te
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits);
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue);
|
|
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask);
|
|
-HWY_AFTER_TEST();
|
|
+} // namespace hwy
|
|
#endif
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12 2021-06-02 10:56:05.247904453 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -12,6 +12,12 @@
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
+// Ensure incompatibilities with Windows macros (e.g. #define StoreFence) are
|
|
+// detected. Must come before Highway headers.
|
|
+#if defined(_WIN32) || defined(_WIN64)
|
|
+#include <Windows.h>
|
|
+#endif
|
|
+
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
|
|
@@ -76,6 +82,119 @@ HWY_NOINLINE void TestAllLoadStore() {
|
|
ForAllTypes(ForPartialVectors<TestLoadStore>());
|
|
}
|
|
|
|
+struct TestStoreInterleaved3 {
|
|
+ template <class T, class D>
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
+ const size_t N = Lanes(d);
|
|
+
|
|
+ RandomState rng;
|
|
+
|
|
+ // Data to be interleaved
|
|
+ auto bytes = AllocateAligned<uint8_t>(3 * N);
|
|
+ for (size_t i = 0; i < 3 * N; ++i) {
|
|
+ bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
|
|
+ }
|
|
+ const auto in0 = Load(d, &bytes[0 * N]);
|
|
+ const auto in1 = Load(d, &bytes[1 * N]);
|
|
+ const auto in2 = Load(d, &bytes[2 * N]);
|
|
+
|
|
+ // Interleave here, ensure vector results match scalar
|
|
+ auto expected = AllocateAligned<T>(4 * N);
|
|
+ auto actual_aligned = AllocateAligned<T>(4 * N + 1);
|
|
+ T* actual = actual_aligned.get() + 1;
|
|
+
|
|
+ for (size_t rep = 0; rep < 100; ++rep) {
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ expected[3 * i + 0] = bytes[0 * N + i];
|
|
+ expected[3 * i + 1] = bytes[1 * N + i];
|
|
+ expected[3 * i + 2] = bytes[2 * N + i];
|
|
+ // Ensure we do not write more than 3*N bytes
|
|
+ expected[3 * N + i] = actual[3 * N + i] = 0;
|
|
+ }
|
|
+ StoreInterleaved3(in0, in1, in2, d, actual);
|
|
+ size_t pos = 0;
|
|
+ if (!BytesEqual(expected.get(), actual, 4 * N, &pos)) {
|
|
+ Print(d, "in0", in0, pos / 3);
|
|
+ Print(d, "in1", in1, pos / 3);
|
|
+ Print(d, "in2", in2, pos / 3);
|
|
+ const size_t i = pos - pos % 3;
|
|
+ fprintf(stderr, "interleaved %d %d %d %d %d %d\n", actual[i],
|
|
+ actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
|
|
+ actual[i + 5]);
|
|
+ HWY_ASSERT(false);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+};
|
|
+
|
|
+HWY_NOINLINE void TestAllStoreInterleaved3() {
|
|
+#if HWY_TARGET == HWY_RVV
|
|
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
|
|
+ const ForExtendableVectors<TestStoreInterleaved3, 4> test;
|
|
+#else
|
|
+ const ForPartialVectors<TestStoreInterleaved3> test;
|
|
+#endif
|
|
+ test(uint8_t());
|
|
+}
|
|
+
|
|
+struct TestStoreInterleaved4 {
|
|
+ template <class T, class D>
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
+ const size_t N = Lanes(d);
|
|
+
|
|
+ RandomState rng;
|
|
+
|
|
+ // Data to be interleaved
|
|
+ auto bytes = AllocateAligned<uint8_t>(4 * N);
|
|
+ for (size_t i = 0; i < 4 * N; ++i) {
|
|
+ bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
|
|
+ }
|
|
+ const auto in0 = Load(d, &bytes[0 * N]);
|
|
+ const auto in1 = Load(d, &bytes[1 * N]);
|
|
+ const auto in2 = Load(d, &bytes[2 * N]);
|
|
+ const auto in3 = Load(d, &bytes[3 * N]);
|
|
+
|
|
+ // Interleave here, ensure vector results match scalar
|
|
+ auto expected = AllocateAligned<T>(5 * N);
|
|
+ auto actual_aligned = AllocateAligned<T>(5 * N + 1);
|
|
+ T* actual = actual_aligned.get() + 1;
|
|
+
|
|
+ for (size_t rep = 0; rep < 100; ++rep) {
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ expected[4 * i + 0] = bytes[0 * N + i];
|
|
+ expected[4 * i + 1] = bytes[1 * N + i];
|
|
+ expected[4 * i + 2] = bytes[2 * N + i];
|
|
+ expected[4 * i + 3] = bytes[3 * N + i];
|
|
+ // Ensure we do not write more than 4*N bytes
|
|
+ expected[4 * N + i] = actual[4 * N + i] = 0;
|
|
+ }
|
|
+ StoreInterleaved4(in0, in1, in2, in3, d, actual);
|
|
+ size_t pos = 0;
|
|
+ if (!BytesEqual(expected.get(), actual, 5 * N, &pos)) {
|
|
+ Print(d, "in0", in0, pos / 4);
|
|
+ Print(d, "in1", in1, pos / 4);
|
|
+ Print(d, "in2", in2, pos / 4);
|
|
+ Print(d, "in3", in3, pos / 4);
|
|
+ const size_t i = pos;
|
|
+ fprintf(stderr, "interleaved %d %d %d %d %d %d %d %d\n", actual[i],
|
|
+ actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
|
|
+ actual[i + 5], actual[i + 6], actual[i + 7]);
|
|
+ HWY_ASSERT(false);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+};
|
|
+
|
|
+HWY_NOINLINE void TestAllStoreInterleaved4() {
|
|
+#if HWY_TARGET == HWY_RVV
|
|
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
|
|
+ const ForExtendableVectors<TestStoreInterleaved4, 4> test;
|
|
+#else
|
|
+ const ForPartialVectors<TestStoreInterleaved4> test;
|
|
+#endif
|
|
+ test(uint8_t());
|
|
+}
|
|
+
|
|
struct TestLoadDup128 {
|
|
template <class T, class D>
|
|
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
@@ -86,13 +205,14 @@ struct TestLoadDup128 {
|
|
for (size_t i = 0; i < N128; ++i) {
|
|
lanes[i] = static_cast<T>(1 + i);
|
|
}
|
|
- const auto v = LoadDup128(d, lanes);
|
|
+
|
|
const size_t N = Lanes(d);
|
|
- auto out = AllocateAligned<T>(N);
|
|
- Store(v, d, out.get());
|
|
+ auto expected = AllocateAligned<T>(N);
|
|
for (size_t i = 0; i < N; ++i) {
|
|
- HWY_ASSERT_EQ(T(i % N128 + 1), out[i]);
|
|
+ expected[i] = static_cast<T>(i % N128 + 1);
|
|
}
|
|
+
|
|
+ HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes));
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
@@ -136,6 +256,84 @@ HWY_NOINLINE void TestAllStream() {
|
|
ForFloatTypes(test);
|
|
}
|
|
|
|
+// Assumes little-endian byte order!
|
|
+struct TestScatter {
|
|
+ template <class T, class D>
|
|
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
+ using Offset = MakeSigned<T>;
|
|
+
|
|
+ const size_t N = Lanes(d);
|
|
+ const size_t range = 4 * N; // number of items to scatter
|
|
+ const size_t max_bytes = range * sizeof(T); // upper bound on offset
|
|
+
|
|
+ RandomState rng;
|
|
+
|
|
+ // Data to be scattered
|
|
+ auto bytes = AllocateAligned<uint8_t>(max_bytes);
|
|
+ for (size_t i = 0; i < max_bytes; ++i) {
|
|
+ bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
|
|
+ }
|
|
+ const auto data = Load(d, reinterpret_cast<const T*>(bytes.get()));
|
|
+
|
|
+ // Scatter into these regions, ensure vector results match scalar
|
|
+ auto expected = AllocateAligned<T>(range);
|
|
+ auto actual = AllocateAligned<T>(range);
|
|
+
|
|
+ const Rebind<Offset, D> d_offsets;
|
|
+ auto offsets = AllocateAligned<Offset>(N); // or indices
|
|
+
|
|
+ for (size_t rep = 0; rep < 100; ++rep) {
|
|
+ // Byte offsets
|
|
+ std::fill(expected.get(), expected.get() + range, T(0));
|
|
+ std::fill(actual.get(), actual.get() + range, T(0));
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ offsets[i] =
|
|
+ static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
|
|
+ CopyBytes<sizeof(T)>(
|
|
+ bytes.get() + i * sizeof(T),
|
|
+ reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
|
|
+ }
|
|
+ const auto voffsets = Load(d_offsets, offsets.get());
|
|
+ ScatterOffset(data, d, actual.get(), voffsets);
|
|
+ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
|
|
+ Print(d, "Data", data);
|
|
+ Print(d_offsets, "Offsets", voffsets);
|
|
+ HWY_ASSERT(false);
|
|
+ }
|
|
+
|
|
+ // Indices
|
|
+ std::fill(expected.get(), expected.get() + range, T(0));
|
|
+ std::fill(actual.get(), actual.get() + range, T(0));
|
|
+ for (size_t i = 0; i < N; ++i) {
|
|
+ offsets[i] = static_cast<Offset>(Random32(&rng) % range);
|
|
+ CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
|
|
+ &expected[offsets[i]]);
|
|
+ }
|
|
+ const auto vindices = Load(d_offsets, offsets.get());
|
|
+ ScatterIndex(data, d, actual.get(), vindices);
|
|
+ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
|
|
+ Print(d, "Data", data);
|
|
+ Print(d_offsets, "Indices", vindices);
|
|
+ HWY_ASSERT(false);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+};
|
|
+
|
|
+HWY_NOINLINE void TestAllScatter() {
|
|
+ // No u8,u16,i8,i16.
|
|
+ const ForPartialVectors<TestScatter> test;
|
|
+ test(uint32_t());
|
|
+ test(int32_t());
|
|
+
|
|
+#if HWY_CAP_INTEGER64
|
|
+ test(uint64_t());
|
|
+ test(int64_t());
|
|
+#endif
|
|
+
|
|
+ ForFloatTypes(test);
|
|
+}
|
|
+
|
|
struct TestGather {
|
|
template <class T, class D>
|
|
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
|
@@ -183,21 +381,15 @@ struct TestGather {
|
|
|
|
HWY_NOINLINE void TestAllGather() {
|
|
// No u8,u16,i8,i16.
|
|
- const ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(uint32_t)> test32;
|
|
- test32(uint32_t());
|
|
- test32(int32_t());
|
|
+ const ForPartialVectors<TestGather> test;
|
|
+ test(uint32_t());
|
|
+ test(int32_t());
|
|
|
|
#if HWY_CAP_INTEGER64
|
|
- const ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(uint64_t)> test64;
|
|
- test64(uint64_t());
|
|
- test64(int64_t());
|
|
-#endif
|
|
-
|
|
- ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(float)>()(float());
|
|
-
|
|
-#if HWY_CAP_FLOAT64
|
|
- ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(double)>()(double());
|
|
+ test(uint64_t());
|
|
+ test(int64_t());
|
|
#endif
|
|
+ ForFloatTypes(test);
|
|
}
|
|
|
|
HWY_NOINLINE void TestAllCache() {
|
|
@@ -206,6 +398,7 @@ HWY_NOINLINE void TestAllCache() {
|
|
int test = 0;
|
|
Prefetch(&test);
|
|
FlushCacheline(&test);
|
|
+ Pause();
|
|
}
|
|
|
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
@@ -214,11 +407,15 @@ HWY_NOINLINE void TestAllCache() {
|
|
HWY_AFTER_NAMESPACE();
|
|
|
|
#if HWY_ONCE
|
|
+namespace hwy {
|
|
HWY_BEFORE_TEST(HwyMemoryTest);
|
|
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
|
|
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3);
|
|
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4);
|
|
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
|
|
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
|
|
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
|
|
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
|
|
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
|
|
-HWY_AFTER_TEST();
|
|
+} // namespace hwy
|
|
#endif
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12 2021-06-02 10:56:05.259904513 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -223,6 +223,7 @@ struct TestTableLookupBytes {
|
|
HWY_NOINLINE void TestAllTableLookupBytes() {
|
|
ForIntegerTypes(ForPartialVectors<TestTableLookupBytes>());
|
|
}
|
|
+
|
|
struct TestTableLookupLanes {
|
|
#if HWY_TARGET == HWY_RVV
|
|
using Index = uint32_t;
|
|
@@ -242,12 +243,13 @@ struct TestTableLookupLanes {
|
|
if (N <= 8) { // Test all permutations
|
|
for (size_t i0 = 0; i0 < N; ++i0) {
|
|
idx[0] = static_cast<Index>(i0);
|
|
+
|
|
for (size_t i1 = 0; i1 < N; ++i1) {
|
|
- idx[1] = static_cast<Index>(i1);
|
|
+ if (N >= 2) idx[1] = static_cast<Index>(i1);
|
|
for (size_t i2 = 0; i2 < N; ++i2) {
|
|
- idx[2] = static_cast<Index>(i2);
|
|
+ if (N >= 4) idx[2] = static_cast<Index>(i2);
|
|
for (size_t i3 = 0; i3 < N; ++i3) {
|
|
- idx[3] = static_cast<Index>(i3);
|
|
+ if (N >= 4) idx[3] = static_cast<Index>(i3);
|
|
|
|
for (size_t i = 0; i < N; ++i) {
|
|
expected[i] = static_cast<T>(idx[i] + 1); // == v[idx[i]]
|
|
@@ -286,7 +288,7 @@ struct TestTableLookupLanes {
|
|
};
|
|
|
|
HWY_NOINLINE void TestAllTableLookupLanes() {
|
|
- const ForFullVectors<TestTableLookupLanes> test;
|
|
+ const ForPartialVectors<TestTableLookupLanes> test;
|
|
test(uint32_t());
|
|
test(int32_t());
|
|
test(float());
|
|
@@ -624,6 +626,7 @@ HWY_NOINLINE void TestAllOddEven() {
|
|
HWY_AFTER_NAMESPACE();
|
|
|
|
#if HWY_ONCE
|
|
+namespace hwy {
|
|
HWY_BEFORE_TEST(HwySwizzleTest);
|
|
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftBytes);
|
|
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftLanes);
|
|
@@ -637,5 +640,5 @@ HWY_EXPORT_AND_TEST_P(HwySwizzleTest, Te
|
|
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatLowerUpper);
|
|
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatUpperLower);
|
|
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
|
|
-HWY_AFTER_TEST();
|
|
+} // namespace hwy
|
|
#endif
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h
|
|
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 2021-06-02 10:56:05.254904488 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -23,7 +23,6 @@
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
-#include <cmath> // isfinite
|
|
#include <cstddef>
|
|
#include <string>
|
|
#include <utility> // std::forward
|
|
@@ -73,7 +72,8 @@ class TestWithParamTarget : public testi
|
|
|
|
// Function to convert the test parameter of a TestWithParamTarget for
|
|
// displaying it in the gtest test name.
|
|
-std::string TestParamTargetName(const testing::TestParamInfo<uint32_t>& info) {
|
|
+static inline std::string TestParamTargetName(
|
|
+ const testing::TestParamInfo<uint32_t>& info) {
|
|
return TargetName(info.param);
|
|
}
|
|
|
|
@@ -157,31 +157,10 @@ std::string TestParamTargetNameAndT(
|
|
static_assert(true, "For requiring trailing semicolon")
|
|
|
|
#define HWY_BEFORE_TEST(suite) \
|
|
- namespace hwy { \
|
|
class suite : public hwy::TestWithParamTarget {}; \
|
|
HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \
|
|
static_assert(true, "For requiring trailing semicolon")
|
|
|
|
-#define HWY_AFTER_TEST() \
|
|
- } /* namespace hwy */ \
|
|
- static_assert(true, "For requiring trailing semicolon")
|
|
-
|
|
-// Calls test for each enabled and available target.
|
|
-template <class Func, typename... Args>
|
|
-HWY_NOINLINE void RunTest(const Func& func, Args&&... args) {
|
|
- SetSupportedTargetsForTest(0);
|
|
- auto targets = SupportedAndGeneratedTargets();
|
|
-
|
|
- for (uint32_t target : targets) {
|
|
- SetSupportedTargetsForTest(target);
|
|
- fprintf(stderr, "Testing for target %s.\n",
|
|
- TargetName(static_cast<int>(target)));
|
|
- func(std::forward<Args>(args)...);
|
|
- }
|
|
- // Disable the mask after the test.
|
|
- SetSupportedTargetsForTest(0);
|
|
-}
|
|
-
|
|
// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
|
|
// which triggers a compiler bug.
|
|
class RandomState {
|
|
@@ -223,9 +202,11 @@ static HWY_INLINE uint32_t Random32(Rand
|
|
// built-in types.
|
|
template <class T>
|
|
inline void PreventElision(T&& output) {
|
|
-#ifndef _MSC_VER
|
|
+#if HWY_COMPILER_MSVC
|
|
+ (void)output;
|
|
+#else // HWY_COMPILER_MSVC
|
|
asm volatile("" : "+r"(output) : : "memory");
|
|
-#endif
|
|
+#endif // HWY_COMPILER_MSVC
|
|
}
|
|
|
|
// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
|
|
@@ -234,23 +215,34 @@ inline void PreventElision(T&& output) {
|
|
// understanding which instantiation of a generic test failed.
|
|
template <typename T>
|
|
static inline std::string TypeName(T /*unused*/, size_t N) {
|
|
- std::string prefix(IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u"));
|
|
- prefix += std::to_string(sizeof(T) * 8);
|
|
-
|
|
- // Scalars: omit the xN suffix.
|
|
- if (N == 1) return prefix;
|
|
-
|
|
- return prefix + 'x' + std::to_string(N);
|
|
+ const char prefix = IsFloat<T>() ? 'f' : (IsSigned<T>() ? 'i' : 'u');
|
|
+ char name[64];
|
|
+ // Omit the xN suffix for scalars.
|
|
+ if (N == 1) {
|
|
+ snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8);
|
|
+ } else {
|
|
+ snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N);
|
|
+ }
|
|
+ return name;
|
|
}
|
|
|
|
// String comparison
|
|
|
|
template <typename T1, typename T2>
|
|
-inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size) {
|
|
+inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size,
|
|
+ size_t* pos = nullptr) {
|
|
const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
|
|
const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
|
|
for (size_t i = 0; i < size; ++i) {
|
|
- if (bytes1[i] != bytes2[i]) return false;
|
|
+ if (bytes1[i] != bytes2[i]) {
|
|
+ fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i,
|
|
+ size, bytes1[i], bytes2[i], TypeName(T1(), 1).c_str(),
|
|
+ TypeName(T2(), 1).c_str());
|
|
+ if (pos != nullptr) {
|
|
+ *pos = i;
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
}
|
|
return true;
|
|
}
|
|
@@ -287,11 +279,11 @@ HWY_NOINLINE void Print(const D d, const
|
|
auto lanes = AllocateAligned<T>(N);
|
|
Store(v, d, lanes.get());
|
|
const size_t begin = static_cast<size_t>(std::max<intptr_t>(0, lane - 2));
|
|
- const size_t end = std::min(begin + 5, N);
|
|
+ const size_t end = std::min(begin + 7, N);
|
|
fprintf(stderr, "%s %s [%zu+ ->]:\n ", TypeName(T(), N).c_str(), caption,
|
|
begin);
|
|
for (size_t i = begin; i < end; ++i) {
|
|
- fprintf(stderr, "%s,", std::to_string(lanes[i]).c_str());
|
|
+ fprintf(stderr, "%g,", double(lanes[i]));
|
|
}
|
|
if (begin >= end) fprintf(stderr, "(out of bounds)");
|
|
fprintf(stderr, "\n");
|
|
@@ -352,10 +344,12 @@ HWY_NOINLINE void AssertEqual(const T ex
|
|
const char* filename = "", const int line = -1,
|
|
const size_t lane = 0) {
|
|
if (!IsEqual(expected, actual)) {
|
|
- const std::string expected_str = std::to_string(expected);
|
|
- const std::string actual_str = std::to_string(actual);
|
|
- NotifyFailure(filename, line, type_name.c_str(), lane, expected_str.c_str(),
|
|
- actual_str.c_str());
|
|
+ char expected_str[100];
|
|
+ snprintf(expected_str, sizeof(expected_str), "%g", double(expected));
|
|
+ char actual_str[100];
|
|
+ snprintf(actual_str, sizeof(actual_str), "%g", double(actual));
|
|
+ NotifyFailure(filename, line, type_name.c_str(), lane, expected_str,
|
|
+ actual_str);
|
|
}
|
|
}
|
|
|
|
@@ -382,9 +376,15 @@ HWY_NOINLINE void AssertVecEqual(D d, co
|
|
fprintf(stderr, "\n\n");
|
|
Print(d, "expect", expected, i);
|
|
Print(d, "actual", actual, i);
|
|
+
|
|
+ char expected_str[100];
|
|
+ snprintf(expected_str, sizeof(expected_str), "%g",
|
|
+ double(expected_lanes[i]));
|
|
+ char actual_str[100];
|
|
+ snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i]));
|
|
+
|
|
NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i,
|
|
- std::to_string(expected_lanes[i]).c_str(),
|
|
- std::to_string(actual_lanes[i]).c_str());
|
|
+ expected_str, actual_str);
|
|
}
|
|
}
|
|
}
|
|
@@ -458,11 +458,8 @@ struct ForeachSizeR<T, 0, kMinLanes, Tes
|
|
|
|
// These adapters may be called directly, or via For*Types:
|
|
|
|
-// Calls Test for all powers of two in [kMinLanes, kMaxLanes / kDivLanes].
|
|
-// kMaxLanes is used for HWY_GATHER_LANES etc; use a large default because we
|
|
-// don't have access to T in the template argument list.
|
|
-template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1,
|
|
- size_t kMaxLanes = 1ul << 30>
|
|
+// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes].
|
|
+template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1>
|
|
struct ForPartialVectors {
|
|
template <typename T>
|
|
void operator()(T /*unused*/) const {
|
|
@@ -470,8 +467,8 @@ struct ForPartialVectors {
|
|
// Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full.
|
|
ForeachSizeR<T, 8 / kDivLanes, HWY_LANES(T), Test>::Do();
|
|
#else
|
|
- ForeachSizeR<T, HWY_MIN(kMaxLanes, HWY_LANES(T)) / kDivLanes / kMinLanes,
|
|
- kMinLanes, Test>::Do();
|
|
+ ForeachSizeR<T, HWY_LANES(T) / kDivLanes / kMinLanes, kMinLanes,
|
|
+ Test>::Do();
|
|
#endif
|
|
}
|
|
};
|
|
@@ -505,33 +502,19 @@ struct ForGE128Vectors {
|
|
}
|
|
};
|
|
|
|
-// Calls Test for all powers of two in [128 bits, max bits/2].
|
|
-template <class Test>
|
|
+// Calls Test for all vectors that can be expanded by kFactor.
|
|
+template <class Test, size_t kFactor = 2>
|
|
struct ForExtendableVectors {
|
|
template <typename T>
|
|
void operator()(T /*unused*/) const {
|
|
#if HWY_TARGET == HWY_RVV
|
|
- ForeachSizeR<T, 4, HWY_LANES(T), Test>::Do();
|
|
+ ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test>::Do();
|
|
#else
|
|
- ForeachSizeR<T, HWY_LANES(T) / 2 / (16 / sizeof(T)), (16 / sizeof(T)),
|
|
+ ForeachSizeR<T, HWY_LANES(T) / kFactor / (16 / sizeof(T)), (16 / sizeof(T)),
|
|
Test>::Do();
|
|
#endif
|
|
}
|
|
};
|
|
-
|
|
-// Calls Test for full vectors only.
|
|
-template <class Test>
|
|
-struct ForFullVectors {
|
|
- template <typename T>
|
|
- void operator()(T t) const {
|
|
-#if HWY_TARGET == HWY_RVV
|
|
- ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
|
|
- (void)t;
|
|
-#else
|
|
- Test()(t, HWY_FULL(T)());
|
|
-#endif
|
|
- }
|
|
-};
|
|
|
|
// Type lists to shorten call sites:
|
|
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSEE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSEE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/Makefile.12 chromium-91.0.4472.77/third_party/highway/src/Makefile
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/MakefileE.12 chromium-91.0.4472.77/third_party/highway/src/MakefileE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/README.md.12 chromium-91.0.4472.77/third_party/highway/src/README.md
|
|
--- chromium-91.0.4472.77/third_party/highway/src/README.md.12 2021-06-02 10:56:05.295904696 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/README.md 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -15,15 +15,19 @@ applying the same operation to 'lanes'.
|
|
## Current status
|
|
|
|
Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD.
|
|
-A port to RVV is in progress.
|
|
+Ports to RVV and SVE/SVE2 are in progress.
|
|
|
|
Version 0.11 is considered stable enough to use in other projects, and is
|
|
expected to remain backwards compatible unless serious issues are discovered
|
|
while implementing SVE/RVV targets. After these targets are added, Highway will
|
|
reach version 1.0.
|
|
|
|
-Continuous integration tests use a recent version of Clang and older version of
|
|
-MSVC (VS2015). Also periodically tested on Clang 7-11 and GCC 8, 9 and 10.2.1.
|
|
+Continuous integration tests build with a recent version of Clang (running on
|
|
+x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).
|
|
+
|
|
+Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
|
|
+GCC cross-compile and QEMU. See the
|
|
+[testing process](g3doc/release_testing_process.md) for details.
|
|
|
|
The `contrib` directory contains SIMD-related utilities: an image class with
|
|
aligned rows, and a math library (16 functions already implemented, mostly
|
|
@@ -62,9 +66,11 @@ To test on all the attainable targets fo
|
|
default configuration skips baseline targets (e.g. scalar) that are superseded
|
|
by another baseline target.
|
|
|
|
+Bazel is also supported for building, but it is not as widely used/tested.
|
|
+
|
|
## Quick start
|
|
|
|
-You can use the `skeleton` examples inside examples/ as a starting point.
|
|
+You can use the `benchmark` inside examples/ as a starting point.
|
|
|
|
A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
|
|
and their parameters, and the [instruction_matrix][instmtx] indicates the
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/README.mdE.12 chromium-91.0.4472.77/third_party/highway/src/README.mdE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.bat
|
|
--- chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 2021-06-02 10:56:05.293904685 -0400
|
|
+++ chromium-91.0.4472.77/third_party/highway/src/run_tests.bat 2021-05-31 10:37:11.000000000 -0400
|
|
@@ -2,9 +2,9 @@
|
|
REM Switch directory of this batch file
|
|
cd %~dp0
|
|
|
|
-if not exist build mkdir build
|
|
+if not exist build_win mkdir build_win
|
|
|
|
-cd build
|
|
+cd build_win
|
|
cmake .. -G Ninja || goto error
|
|
ninja || goto error
|
|
ctest -j || goto error
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.batE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.batE
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.sh.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.sh
|
|
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.shE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.shE
|
|
diff -up chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time
|
|
diff -up chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1.12 chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1
|