chromium/chromium-91.0.4472.77-updat...

diff -up chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time
diff -up chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10.12 chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10
diff -up chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6.12 chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt
--- chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12	2021-06-02 10:56:05.305904746 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt	2021-05-31 10:37:11.000000000 -0400
@@ -19,7 +19,7 @@ if(POLICY CMP0083)
   cmake_policy(SET CMP0083 NEW)
 endif()

-project(hwy VERSION 0.1)
+project(hwy VERSION 0.12.2)  # Keep in sync with highway.h version

 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_EXTENSIONS OFF)
@@ -40,6 +40,8 @@ if (NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE RelWithDebInfo)
 endif()

+set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?")
+
 include(CheckCXXSourceCompiles)
 check_cxx_source_compiles(
    "int main() {
@@ -51,10 +53,13 @@ check_cxx_source_compiles(
   HWY_EMSCRIPTEN
 )

+set(HWY_CONTRIB_SOURCES
+    hwy/contrib/image/image.cc
+    hwy/contrib/image/image.h
+    hwy/contrib/math/math-inl.h
+)
+
 set(HWY_SOURCES
-    contrib/image/image.cc
-    contrib/image/image.h
-    contrib/math/math-inl.h
     hwy/aligned_allocator.cc
     hwy/aligned_allocator.h
     hwy/base.h
@@ -64,6 +69,7 @@ set(HWY_SOURCES
     hwy/nanobenchmark.cc
     hwy/nanobenchmark.h
     hwy/ops/arm_neon-inl.h
+    hwy/ops/arm_sve-inl.h
     hwy/ops/scalar-inl.h
     hwy/ops/set_macros-inl.h
     hwy/ops/shared-inl.h
@@ -146,13 +152,28 @@ else()
       -fno-exceptions
     )
   endif()
-endif()
+
+  if (HWY_CMAKE_ARM7)
+    list(APPEND HWY_FLAGS
+      -march=armv7-a
+      -mfpu=neon-vfpv4
+      -mfloat-abi=hard  # must match the toolchain specified as CXX=
+      -mfp16-format=ieee  # required for vcvt_f32_f16
+    )
+  endif()  # HWY_CMAKE_ARM7
+
+endif()  # !MSVC

 add_library(hwy STATIC ${HWY_SOURCES})
 target_compile_options(hwy PRIVATE ${HWY_FLAGS})
 set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})

+add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES})
+target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
+set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
+
 # -------------------------------------------------------- install library
 install(TARGETS hwy
   DESTINATION "${CMAKE_INSTALL_LIBDIR}")
@@ -166,9 +187,21 @@ foreach (source ${HWY_SOURCES})
   endif()
 endforeach()

-# Add a pkg-config file for libhwy and the test library.
+install(TARGETS hwy_contrib
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+# Install all the headers keeping the relative path to the current directory
+# when installing them.
+foreach (source ${HWY_CONTRIB_SOURCES})
+  if ("${source}" MATCHES "\.h$")
+    get_filename_component(dirname "${source}" DIRECTORY)
+    install(FILES "${source}"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
+  endif()
+endforeach()
+
+# Add a pkg-config file for libhwy and the contrib/test libraries.
 set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
-foreach (pc libhwy.pc libhwy-test.pc)
+foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
   configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
   install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
       DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
@@ -193,34 +226,13 @@ add_custom_command(TARGET hwy POST_BUILD
 # Avoids mismatch between GTest's static CRT and our dynamic.
 set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

-add_executable(skeleton hwy/examples/skeleton_main.cc)
-target_sources(skeleton PRIVATE
-    hwy/examples/skeleton-inl.h
-    hwy/examples/skeleton.cc
-    hwy/examples/skeleton.h
-    hwy/examples/skeleton_shared.h)
-# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
-# observe the difference in targets printed.
-target_compile_options(skeleton PRIVATE ${HWY_FLAGS})
-target_link_libraries(skeleton hwy)
-set_target_properties(skeleton
-    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
-
-# Similar: shared headers but without the runtime dispatch in skeleton.cc/h
-add_executable(skeleton_static hwy/examples/skeleton_static_main.cc)
-target_sources(skeleton_static PRIVATE
-    hwy/examples/skeleton-inl.h
-    hwy/examples/skeleton_shared.h)
-target_compile_options(skeleton_static PRIVATE ${HWY_FLAGS})
-target_link_libraries(skeleton_static hwy)
-set_target_properties(skeleton_static
-    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
-
 # Programming exercise with integrated benchmark
 add_executable(hwy_benchmark hwy/examples/benchmark.cc)
 target_sources(hwy_benchmark PRIVATE
     hwy/nanobenchmark.cc
     hwy/nanobenchmark.h)
+# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
+# observe the difference in targets printed.
 target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
 target_link_libraries(hwy_benchmark hwy)
 set_target_properties(hwy_benchmark
@@ -272,19 +284,21 @@ endif()
 endif() # HWY_SYSTEM_GTEST

 set(HWY_TEST_FILES
-  contrib/image/image_test.cc
-  # contrib/math/math_test.cc
+  hwy/contrib/image/image_test.cc
+  # hwy/contrib/math/math_test.cc
+  hwy/aligned_allocator_test.cc
+  hwy/base_test.cc
+  hwy/highway_test.cc
+  hwy/targets_test.cc
   hwy/examples/skeleton_test.cc
   hwy/tests/arithmetic_test.cc
   hwy/tests/combine_test.cc
   hwy/tests/compare_test.cc
   hwy/tests/convert_test.cc
-  hwy/tests/hwy_test.cc
   hwy/tests/logical_test.cc
   hwy/tests/memory_test.cc
   hwy/tests/swizzle_test.cc
-  hwy/aligned_allocator_test.cc
-  hwy/targets_test.cc
+  hwy/tests/test_util_test.cc
 )

 file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
@@ -293,11 +307,16 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILE
   get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
   add_executable(${TESTNAME} ${TESTFILE})
   target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})
+  # Test all targets, not just the best/baseline. This changes the default
+  # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can
+  # cause compile errors because only one may be set, and other CMakeLists.txt
+  # that include us may set them.
+  target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)

   if(HWY_SYSTEM_GTEST)
-    target_link_libraries(${TESTNAME} hwy GTest::GTest GTest::Main)
+    target_link_libraries(${TESTNAME} hwy hwy_contrib GTest::GTest GTest::Main)
   else()
-    target_link_libraries(${TESTNAME} hwy gtest gtest_main)
+    target_link_libraries(${TESTNAME} hwy hwy_contrib gtest gtest_main)
   endif()
   # Output test targets in the test directory.
   set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING
diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelog
--- chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12	2021-06-02 10:56:05.151903967 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/debian/changelog	2021-05-31 10:37:11.000000000 -0400
@@ -1,3 +1,26 @@
+highway (0.12.2-1) UNRELEASED; urgency=medium
+
+  * fix scalar-only test and Windows macro conflict with Load/StoreFence
+  * replace deprecated wasm intrinsics
+
+ -- Jan Wassenberg <janwas@google.com>  Mon, 31 May 2021 16:00:00 +0200
+
+highway (0.12.1-1) UNRELEASED; urgency=medium
+
+  * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors
+  * fix warnings, faster ARM div/sqrt, separate hwy_contrib library
+  * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC
+
+ -- Jan Wassenberg <janwas@google.com>  Wed, 19 May 2021 15:00:00 +0200
+
+highway (0.12.0-1) UNRELEASED; urgency=medium
+
+  * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
+  * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
+  * Proper IEEE rounding, reduce libstdc++ usage, inlined math
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 15 Apr 2021 20:00:00 +0200
+
 highway (0.11.1-1) UNRELEASED; urgency=medium

   * Fix clang7 asan error, finish f16 conversions and add test
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelogE.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelogE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compat.12 chromium-91.0.4472.77/third_party/highway/src/debian/compat
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/compatE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/control.12 chromium-91.0.4472.77/third_party/highway/src/debian/control
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/controlE.12 chromium-91.0.4472.77/third_party/highway/src/debian/controlE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyright.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyright
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rules.12 chromium-91.0.4472.77/third_party/highway/src/debian/rules
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rulesE.12 chromium-91.0.4472.77/third_party/highway/src/debian/rulesE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/format.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/format
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf
Binary files chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 and chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf differ
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md
--- chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12	2021-06-02 10:56:05.117903795 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md	2021-05-31 10:37:11.000000000 -0400
@@ -33,6 +33,12 @@ The public headers are:
 *   hwy/cache_control.h: defines stand-alone functions to control caching (e.g.
     prefetching) and memory barriers, independent of actual SIMD.

+*   hwy/nanobenchmark.h: library for precisely measuring elapsed time (under
+    varying inputs) for benchmarking small/medium regions of code.
+
+*   hwy/tests/test_util-inl.h: defines macros for invoking tests on all
+    available targets, plus per-target functions useful in tests (e.g. Print).
+
 SIMD implementations must be preceded and followed by the following:

 ```
@@ -61,76 +67,76 @@ HWY_AFTER_NAMESPACE();

 ## Vector and descriptor types

-Highway vectors consist of one or more 'lanes' of the same built-in type `T =
-uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `T = float##_t` for `## = 16,
-32, 64`. `float16_t` is an IEEE binary16 half-float and only supports load,
-store, and conversion to/from `float32_t`; infinity or NaN have
-implementation-defined results.
-
-Each vector has `N` lanes (a power of two, possibly unknown at compile time).
-
-Platforms such as x86 support multiple vector types, and other platforms require
-that vectors are built-in types. On RVV, vectors are sizeless and thus cannot be
-wrapped inside a class. The Highway API satisfies these constraints because it
-is designed around overloaded functions selected via a zero-sized tag parameter
-`d` of type `D = Simd<T, N>`. These are typically constructed using aliases:
-
-*   `const HWY_FULL(T[, LMUL=1]) d;` chooses an `N` that results in a native
-    vector for the current target. For targets (e.g. RVV) that support register
-    groups, the optional `LMUL` (1, 2, 4, 8) specifies the number of registers
-    in the group. This effectively multiplies the lane count in each operation
-    by `LMUL`. For mixed-precision code, `LMUL` must be at least the ratio of
-    the sizes of the largest and smallest type. `LMUL > 1` is more efficient on
-    single-issue machines, but larger values reduce the effective number of
-    registers, which may cause the compiler to spill them to memory.
+Highway vectors consist of one or more 'lanes' of the same built-in type
+`uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `float##_t` for `## = 16, 32,
+64`.
+
+In Highway, `float16_t` (an IEEE binary16 half-float) only supports load, store,
+and conversion to/from `float32_t`; the behavior of `float16_t` infinity and NaN
+are implementation-defined due to ARMv7.
+
+On RVV, vectors are sizeless and cannot be wrapped inside a class. The Highway
+API allows using built-in types as vectors because operations are expressed as
+overloaded functions. Instead of constructors, overloaded initialization
+functions such as `Set` take a zero-sized tag argument called `d` of type `D =
+Simd<T, N>` and return an actual vector of unspecified type.
+
+`T` is one of the lane types above, and may be retrieved via `TFromD<D>`.
+
+`N` is target-dependent and not directly user-specified. The actual lane count
+may not be known at compile time, but can be obtained via `Lanes(d)`. Use this
+value, which is potentially different from `N`, to increment loop counters etc.
+It is typically a power of two, but that is not guaranteed e.g. on SVE.
+
+`d` lvalues (a tag, NOT actual vector) are typically obtained using two aliases:
+
+*   Most common: pass `HWY_FULL(T[, LMUL=1]) d;` as an argument to return a
+    native vector. This is preferred because it fully utilizes vector lanes.
+
+    For targets (e.g. RVV) that support register groups, the optional `LMUL` (1,
+    2, 4, 8) specifies the number of registers in the group. This effectively
+    multiplies the lane count in each operation by `LMUL`. For mixed-precision
+    code, `LMUL` must be at least the ratio of the sizes of the largest and
+    smallest type. `LMUL > 1` is more efficient on single-issue machines, but
+    larger values reduce the effective number of registers, which may cause the
+    compiler to spill them to memory.
+
+*   Less common: pass `HWY_CAPPED(T, N) d;` as an argument to return a vector
+    which may be native width, but no more than `N` lanes have observable
+    effects such as loading/storing to memory. This is less performance-portable
+    because it may not use all available lanes. Note that the resulting lane
+    count may also be less than `N`.
+
+    For targets (e.g. RVV) that have compile-time-unknown lane counts, such
+    vectors incur additional runtime cost in `Load` etc.
+
+User-specified lane counts or tuples of vectors could cause spills on targets
+with fewer or smaller vectors. By contrast, Highway encourages vector-length
+agnostic code, which is more performance-portable.
+
+Given that lane counts are potentially compile-time-unknown, storage for vectors
+should be dynamically allocated, e.g. via `AllocateAligned(Lanes(d))`. For
+applications that require a compile-time estimate, `MaxLanes(d)` returns the `N`
+from `Simd<T, N>`, which is NOT necessarily the actual lane count. This is
+DISCOURAGED because it is not guaranteed to be an upper bound (RVV vectors may
+be very large) and some compilers are not able to interpret it as constexpr.

-*   `const HWY_CAPPED(T, N) d;` for up to `N` lanes.
-
-For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), descriptors
-for the smaller types must be obtained from those of the larger type (e.g. via
+For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), tags for
+the smaller types must be obtained from those of the larger type (e.g. via
 `Rebind<uint8_t, HWY_FULL(float)>`).

-The type `T` may be accessed as `TFromD<D>`. There are three possibilities for
-the template parameter `N`:
-
-1.  Equal to the hardware vector width, e.g. when using `HWY_FULL(T)` on a
-    target with compile-time constant vectors.
+## Using unspecified vector types

-1.  Less than the hardware vector width. This is the result of a compile-time
-    decision by the user, i.e. using `HWY_CAPPED(T, N)` to limit the number of
-    lanes, even when the hardware vector width could be greater.
-
-1.  Unrelated to the hardware vector width, e.g. when the hardware vector width
-    is not known at compile-time and may be very large.
-
-In all cases, `Lanes(d)` returns the actual number of lanes, i.e. the amount by
-which to advance loop counters. `MaxLanes(d)` returns the `N` from `Simd<T, N>`,
-which is NOT necessarily the actual vector size (see above) and some compilers
-are not able to interpret it as constexpr. Instead of `MaxLanes`, prefer to use
-alternatives, e.g. `Rebind` or `aligned_allocator.h` for dynamic allocation of
-`Lanes(d)` elements.
-
-Highway is designed to map a vector variable to a (possibly partial) hardware
-register or register group. By discouraging user-specified `N` and tuples of
-vector variables, we improve performance portability (e.g. by reducing spills to
-memory for platforms that have smaller vectors than the developer expected).
-
-To construct vectors, call factory functions (see "Initialization" below) with
-a tag parameter `d`.
-
-Local variables typically use auto for type deduction. For some generic
-functions, a template argument `V` is sufficient: `template<class V> V Squared(V
-v) { return v * v; }`. In general, functions have a `D` template argument and
-can return vectors of type `Vec<D>`.
-
-Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined
-functions reside in `project::[nested]::HWY_NAMESPACE`. Because all Highway
-functions generally take either a `Simd` or vector argument, which are also
-defined in namespace `hwy`, they will typically be found via Argument-Dependent
-Lookup and namespace qualifiers are not necessary. As an exception, Highway
-functions that are templates (e.g. because they require a compile-time argument
-such as a lane index or shift count) require a using-declaration such as
-`using hwy::HWY_NAMESPACE::ShiftLeft`.
+Because vector types are unspecified, local vector variables are typically
+defined using `auto` for type deduction. A template argument `V` suffices for
+simple generic functions: `template<class V> V Squared(V v) { return v * v; }`.
+
+Many functions will need a `D` template argument in order to initialize any
+constants. They can use a separate `V` template argument for vectors, or use
+`Vec<D>`, or where an lvalue `d` is available, `decltype(Zero(d))`. Using such
+aliases instead of auto may improve readability of mixed-type code. They can
+also be used for member variables, which are discouraged because compilers often
+have difficulty mapping them to registers.

 ## Operations

@@ -141,6 +147,14 @@ unsigned, signed, and floating-point typ
 bits per lane: 8, 16, 32, or 64. Any combination of the specified prefixes and
 bits are allowed. Abbreviations of the form `u32 = {u}{32}` may also be used.

+Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined
+functions reside in `project::[nested]::HWY_NAMESPACE`. Highway functions
+generally take either a `Simd` or vector/mask argument. For targets where
+vectors and masks are defined in namespace `hwy`, the functions will be found
+via Argument-Dependent Lookup. However, this does not work for function
+templates, and RVV and SVE both use builtin vectors. Thus we recommend a `using
+hwy::HWY_NAMESPACE;` directive inside `project::[nested]::HWY_NAMESPACE`.
+
 ### Initialization

 *   <code>V **Zero**(D)</code>: returns N-lane vector with all bits set to 0.
@@ -162,7 +176,7 @@ bits are allowed. Abbreviations of the f
 *   `V`: `{i,f}` \
     <code>V **Neg**(V a)</code>: returns `-a[i]`.

-*   `V`: `{i}{8,16,32}, {f}` \
+*   `V`: `{i,f}` \
     <code>V **Abs**(V a)</code> returns the absolute value of `a[i]`; for
     integers, `LimitsMin()` maps to `LimitsMax() + 1`.

@@ -252,23 +266,24 @@ Left-shifting signed `T` and right-shift
 shifting `MakeUnsigned<T>` and casting to `T`. Right-shifting negative signed
 `T` is the same as an unsigned shift, except that 1-bits are shifted in.

-Compile-time constant shifts, generally the most efficient variant:
+Compile-time constant shifts, generally the most efficient variant (though 8-bit
+shifts are potentially slower than other lane sizes):

-*   `V`: `{u,i}{16,32,64}` \
+*   `V`: `{u,i}` \
     <code>V **ShiftLeft**&lt;int&gt;(V a)</code> returns `a[i] << int`.

-*   `V`: `{u,i}{16,32,64}` \
+*   `V`: `{u,i}` \
     <code>V **ShiftRight**&lt;int&gt;(V a)</code> returns `a[i] >> int`.

 Shift all lanes by the same (not necessarily compile-time constant) amount:

-*   `V`: `{u,i}{16,32,64}` \
+*   `V`: `{u,i}` \
     <code>V **ShiftLeftSame**(V a, int bits)</code> returns `a[i] << bits`.

-*   `V`: `{u,i}{16,32,64}` \
+*   `V`: `{u,i}` \
     <code>V **ShiftRightSame**(V a, int bits)</code> returns `a[i] >> bits`.

-Per-lane variable shifts (slow if SSE4, or Shr i64 on AVX2):
+Per-lane variable shifts (slow if SSE4, or 16-bit, or Shr i64 on AVX2):

 *   `V`: `{u,i}{16,32,64}` \
     <code>V **operator<<**(V a, V b)</code> returns `a[i] << b[i]`.
@@ -332,12 +347,17 @@ Special functions for signed types:
     slightly more efficient; requires the first argument to be non-negative.

 *   `V`: `i32/64` \
-    <code>V **BroadcastSignBit(V a)</code> returns `a[i] < 0 ? -1 : 0`.
+    <code>V **BroadcastSignBit**(V a)</code> returns `a[i] < 0 ? -1 : 0`.

 ### Masks

 Let `M` denote a mask capable of storing true/false for each lane.

+*   <code>M **FirstN**(D, size_t N)</code>: returns mask with the first `N`
+    lanes (those with index `< N`) true. `N` larger than `Lanes(D())` result in
+    an all-true mask. Useful for implementing "masked" stores by loading `prev`
+    followed by `IfThenElse(FirstN(d, N), what_to_store, prev)`.
+
 *   <code>M1 **RebindMask**(D, M2 m)</code>: returns same mask bits as `m`, but
     reinterpreted as a mask for lanes of type `TFromD<D>`. `M1` and `M2` must
     have the same number of lanes.
@@ -389,17 +409,18 @@ Let `M` denote a mask capable of storing
 *   <code>size_t **CountTrue**(M m)</code>: returns how many of `m[i]` are true
     [0, N]. This is typically more expensive than AllTrue/False.

-*   `V`: `{u,i,f}{32,64}` \
+*   `V`: `{u,i,f}{16,32,64}` \
     <code>V **Compress**(V v, M m)</code>: returns `r` such that `r[n]` is
     `v[i]`, with `i` the n-th lane index (starting from 0) where `m[i]` is true.
     Compacts lanes whose mask is set into the lower lanes; upper lanes are
-    implementation-defined.
+    implementation-defined. Slow with 16-bit lanes.

-*   `V`: `{u,i,f}{32,64}` \
+*   `V`: `{u,i,f}{16,32,64}` \
     <code>size_t **CompressStore**(V v, M m, D, T* aligned)</code>: writes lanes
     whose mask is set into `aligned`, starting from lane 0. Returns
     `CountTrue(m)`, the number of valid lanes. All subsequent lanes may be
-    overwritten! Alignment ensures inactive lanes will not cause faults.
+    overwritten! Alignment ensures inactive lanes will not cause faults. Slow
+    with 16-bit lanes.

 ### Comparisons

@@ -429,10 +450,16 @@ Memory operands are little-endian, other
 lane configuration. Pointers are the addresses of `N` consecutive `T` values,
 either naturally-aligned (`aligned`) or possibly unaligned (`p`).

+**Note**: computations with low arithmetic intensity (FLOP/s per memory traffic
+bytes), e.g. dot product, can be *1.5 times as fast* when the memory operands
+are naturally aligned. An unaligned access may require two load ports.
+
 #### Load

 *   <code>Vec&lt;D&gt; **Load**(D, const T* aligned)</code>: returns
-    `aligned[i]`.
+    `aligned[i]`. May fault if the pointer is not aligned to the vector size.
+    Using this whenever possible improves codegen on SSE4: unlike `LoadU`,
+    `Load` can be fused into a memory operand, which reduces register pressure.
 *   <code>Vec&lt;D&gt; **LoadU**(D, const T* p)</code>: returns `p[i]`.

 *   <code>Vec&lt;D&gt; **LoadDup128**(D, const T* p)</code>: returns one 128-bit
@@ -440,19 +467,31 @@ either naturally-aligned (`aligned`) or
     be faster than broadcasting single values, and is more convenient than
     preparing constants for the actual vector length.

-#### Gather
+#### Scatter/Gather

-**Note**: Vectors must be `HWY_CAPPED(T, HWY_GATHER_LANES(T))`:
+**Note**: Offsets/indices are of type `VI = Vec<RebindToSigned<D>>` and need not
+be unique. The results are implementation-defined if any are negative.

-*   `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \
-    <code>Vec&lt;D&gt; **GatherOffset**(D, const T* base, VI offsets)</code>.
-    Returns elements of base selected by possibly repeated *byte* `offsets[i]`.
-    Results are implementation-defined if `offsets[i]` is negative.
-
-*   `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \
-    <code>Vec&lt;D&gt; **GatherIndex**(D, const T* base, VI indices)</code>.
-    Returns vector of `base[indices[i]]`. Indices need not be unique, but
-    results are implementation-defined if they are negative.
+**Note**: Where possible, applications should `Load/Store/TableLookup*` entire
+vectors, which is much faster than `Scatter/Gather`. Otherwise, code of the form
+`dst[tbl[i]] = F(src[i])` should when possible be transformed to `dst[i] =
+F(src[tbl[i]])` because `Scatter` is more expensive than `Gather`.
+
+*   `D`: `{u,i,f}{32,64}` \
+    <code>void **ScatterOffset**(Vec&lt;D&gt; v, D, const T* base, VI
+    offsets)</code>: stores `v[i]` to the base address plus *byte* `offsets[i]`.
+
+*   `D`: `{u,i,f}{32,64}` \
+    <code>void **ScatterIndex**(Vec&lt;D&gt; v, D, const T* base, VI
+    indices)</code>: stores `v[i]` to `base[indices[i]]`.
+
+*   `D`: `{u,i,f}{32,64}` \
+    <code>Vec&lt;D&gt; **GatherOffset**(D, const T* base, VI offsets)</code>:
+    returns elements of base selected by *byte* `offsets[i]`.
+
+*   `D`: `{u,i,f}{32,64}` \
+    <code>Vec&lt;D&gt; **GatherIndex**(D, const T* base, VI indices)</code>:
+    returns vector of `base[indices[i]]`.

 #### Store

@@ -462,6 +501,17 @@ either naturally-aligned (`aligned`) or
 *   <code>void **StoreU**(Vec&lt;D&gt; a, D, T* p)</code>: as Store, but without
     the alignment requirement.

+*   `D`: `u8` \
+    <code>void **StoreInterleaved3**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1,
+    Vec&lt;D&gt; v2, D, T* p)</code>: equivalent to shuffling `v0, v1, v2`
+    followed by three `StoreU()`, such that `p[0] == v0[0], p[1] == v1[0],
+    p[2] == v1[0]`. Useful for RGB samples.
+
+*   `D`: `u8` \
+    <code>void **StoreInterleaved4**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1,
+    Vec&lt;D&gt; v2, Vec&lt;D&gt; v3, D, T* p)</code>: as above, but for four
+    vectors (e.g. RGBA samples).
+
 ### Cache control

 All functions except Stream are defined in cache_control.h.
@@ -483,6 +533,9 @@ All functions except Stream are defined
 *   <code>void **Prefetch**(const T* p)</code>: begins loading the cache line
     containing "p".

+*   <code>void **Pause**()</code>: when called inside a spin-loop, may reduce
+    power consumption.
+
 ### Type conversion

 *   <code>Vec&lt;D&gt; **BitCast**(D, V)</code>: returns the bits of `V`
@@ -525,7 +578,8 @@ if the input exceeds the destination ran
     zero and converts the value to same-sized integer.

 *   `V`: `f32`; `Ret`: `i32` \
-    <code>Ret **NearestInt**(V a)</code>: returns the integer nearest to `a[i]`.
+    <code>Ret **NearestInt**(V a)</code>: returns the integer nearest to `a[i]`;
+    results are undefined for NaN.

 ### Swizzle

@@ -652,9 +706,9 @@ more expensive on AVX2/AVX-512 than with

 ### Reductions

-**Note**: the following are only available for full vectors (including scalar).
-These 'reduce' all lanes to a single result. This result is broadcasted to all
-lanes at no extra cost; you can use `GetLane` to obtain the value.
+**Note**: these 'reduce' all lanes to a single result (e.g. sum), which is
+broadcasted to all lanes at no extra cost. To obtain a scalar, you can call
+`GetLane`.

 Being a horizontal operation (across lanes of the same vector), these are slower
 than normal SIMD operations and are typically used outside critical loops.
@@ -697,9 +751,6 @@ generate such instructions (implying the
     finally reverts to `HWY_STATIC_TARGET`. Can be used in `#if` expressions to
     provide an alternative to functions which are not supported by HWY_SCALAR.

-*   `HWY_LANES(T)`: how many lanes of type `T` in a full vector (>= 1). Used by
-    HWY_FULL/CAPPED. Note: cannot be used in #if because it uses sizeof.
-
 *   `HWY_IDE` is 0 except when parsed by IDEs; adding it to conditions such as
     `#if HWY_TARGET != HWY_SCALAR || HWY_IDE` avoids code appearing greyed out.

@@ -707,26 +758,15 @@ The following signal capabilities and ex

 *   `HWY_CAP_INTEGER64`: support for 64-bit signed/unsigned integer lanes.
 *   `HWY_CAP_FLOAT64`: support for double-precision floating-point lanes.
+
+The following were used to signal the maximum number of lanes for certain
+operations, but this is no longer necessary (nor possible on SVE/RVV), so they
+are DEPRECATED:
+
+*   `HWY_GATHER_LANES(T)`.
 *   `HWY_CAP_GE256`: the current target supports vectors of >= 256 bits.
 *   `HWY_CAP_GE512`: the current target supports vectors of >= 512 bits.

-The following indicate the maximum number of lanes for certain operations. For
-targets that support the feature/operation, the macro evaluates to
-`HWY_LANES(T)`, otherwise 1. Using `HWY_CAPPED(T, HWY_GATHER_LANES(T))`
-generates the best possible code (or scalar fallback) from the same source code.
-
-*   `HWY_GATHER_LANES(T)`: supports GatherIndex/Offset.
-*   `HWY_VARIABLE_SHIFT_LANES(T)`: supports per-lane shift amounts (v1 << v2).
-    DEPRECATED, this always matches HWY_LANES(T) and will be removed.
-
-As above, but the feature implies the type so there is no T parameter, thus
-these can be used in `#if` expressions.
-
-*   `HWY_COMPARE64_LANES`: 64-bit signed integer comparisons. DEPRECATED, this
-    always matches HWY_LANES(int64_t) and will be removed.
-*   `HWY_MINMAX64_LANES`: 64-bit signed/unsigned integer min/max. DEPRECATED,
-    this always matches HWY_LANES(int64_t) and will be removed.
-
 ## Detecting supported targets

 `SupportedTargets()` returns a cached (initialized on-demand) bitfield of the
@@ -778,8 +818,10 @@ policy for selecting `HWY_TARGETS`:
     and permitted by the compiler, independently of autovectorization), which
     maximizes coverage in tests.

-If none are defined, the default is to select all attainable targets except any
-non-best baseline (typically `HWY_SCALAR`), which reduces code size.
+If none are defined, but `HWY_IS_TEST` is defined, the default is
+`HWY_COMPILE_ALL_ATTAINABLE`. Otherwise, the default is to select all attainable
+targets except any non-best baseline (typically `HWY_SCALAR`), which reduces
+code size.

 ## Compiler support

@@ -787,7 +829,8 @@ Clang and GCC require e.g. -mavx2 flags
 However, this enables AVX2 instructions in the entire translation unit, which
 may violate the one-definition rule and cause crashes. Instead, we use
 target-specific attributes introduced via #pragma. Function using SIMD must
-reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`.
+reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. Alternatively,
+individual functions or lambdas may be prefixed with `HWY_ATTR`.

 Immediates (compile-time constants) are specified as template arguments to avoid
 constant-propagation issues with Clang on ARM.
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12	2021-06-02 10:56:05.278904609 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h	2021-05-31 10:37:11.000000000 -0400
@@ -111,6 +111,32 @@ AlignedUniquePtr<T> MakeUniqueAligned(Ar
       new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
 }

+// Helpers for array allocators (avoids overflow)
+namespace detail {
+
+// Returns x such that 1u << x == n (if n is a power of two).
+static inline constexpr size_t ShiftCount(size_t n) {
+  return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
+}
+
+template <typename T>
+T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
+  constexpr size_t size = sizeof(T);
+
+  constexpr bool is_pow2 = (size & (size - 1)) == 0;
+  constexpr size_t bits = ShiftCount(size);
+  static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
+
+  const size_t bytes = is_pow2 ? items << bits : items * size;
+  const size_t check = is_pow2 ? bytes >> bits : bytes / size;
+  if (check != items) {
+    return nullptr;  // overflowed
+  }
+  return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
+}
+
+}  // namespace detail
+
 // Aligned memory equivalent of make_unique<T[]> for array types using the
 // custom allocators alloc/free. This function calls the constructor with the
 // passed Args... on every created item. The destructor of each element will be
@@ -118,10 +144,11 @@ AlignedUniquePtr<T> MakeUniqueAligned(Ar
 template <typename T, typename... Args>
 AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
     size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
-  T* ptr =
-      static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque));
-  for (size_t i = 0; i < items; i++) {
-    new (ptr + i) T(std::forward<Args>(args)...);
+  T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
+  if (ptr != nullptr) {
+    for (size_t i = 0; i < items; i++) {
+      new (ptr + i) T(std::forward<Args>(args)...);
+    }
   }
   return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
 }
@@ -165,7 +192,7 @@ template <typename T>
 AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
                                           FreePtr free, void* opaque) {
   return AlignedFreeUniquePtr<T[]>(
-      static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)),
+      detail::AllocateAlignedItems<T>(items, alloc, opaque),
       AlignedFreer(free, opaque));
 }

diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12	2021-06-02 10:56:05.273904584 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -16,6 +16,7 @@

 #include <stddef.h>

+#include <array>
 #include <new>
 #include <random>
 #include <vector>
@@ -87,13 +88,39 @@ TEST(AlignedAllocatorTest, FreeNullptr)
                    /*opaque_ptr=*/nullptr);
 }

+TEST(AlignedAllocatorTest, Log2) {
+  EXPECT_EQ(0u, detail::ShiftCount(1));
+  EXPECT_EQ(1u, detail::ShiftCount(2));
+  EXPECT_EQ(3u, detail::ShiftCount(8));
+}
+
+// Allocator returns null when it detects overflow of items * sizeof(T).
+TEST(AlignedAllocatorTest, Overflow) {
+  constexpr size_t max = ~size_t(0);
+  constexpr size_t msb = (max >> 1) + 1;
+  using Size5 = std::array<uint8_t, 5>;
+  using Size10 = std::array<uint8_t, 10>;
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<uint32_t>(max / 2, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<uint32_t>(max / 3, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<Size5>(max / 4, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<uint16_t>(msb, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<double>(msb + 1, nullptr, nullptr));
+  EXPECT_EQ(nullptr,
+            detail::AllocateAlignedItems<Size10>(msb / 4, nullptr, nullptr));
+}
+
 TEST(AlignedAllocatorTest, AllocDefaultPointers) {
   const size_t kSize = 7777;
   void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
                                    /*opaque_ptr=*/nullptr);
   ASSERT_NE(nullptr, ptr);
   // Make sure the pointer is actually aligned.
-  EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
   char* p = static_cast<char*>(ptr);
   size_t ret = 0;
   for (size_t i = 0; i < kSize; i++) {
@@ -101,7 +128,7 @@ TEST(AlignedAllocatorTest, AllocDefaultP
     p[i] = static_cast<char>(i & 0x7F);
     if (i) ret += p[i] * p[i - 1];
   }
-  EXPECT_NE(0, ret);
+  EXPECT_NE(0U, ret);
   FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
 }

@@ -123,11 +150,11 @@ TEST(AlignedAllocatorTest, CustomAlloc)
       AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
   ASSERT_NE(nullptr, ptr);
   // We should have only requested one alloc from the allocator.
-  EXPECT_EQ(1u, fake_alloc.PendingAllocs());
+  EXPECT_EQ(1U, fake_alloc.PendingAllocs());
   // Make sure the pointer is actually aligned.
-  EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
   FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
-  EXPECT_EQ(0u, fake_alloc.PendingAllocs());
+  EXPECT_EQ(0U, fake_alloc.PendingAllocs());
 }

 TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
@@ -170,7 +197,7 @@ TEST(AlignedAllocatorTest, MakeUniqueAli
 TEST(AlignedAllocatorTest, AllocSingleInt) {
   auto ptr = AllocateAligned<uint32_t>(1);
   ASSERT_NE(nullptr, ptr.get());
-  EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
   // Force delete of the unique_ptr now to check that it doesn't crash.
   ptr.reset(nullptr);
   EXPECT_EQ(nullptr, ptr.get());
@@ -180,7 +207,7 @@ TEST(AlignedAllocatorTest, AllocMultiple
   const size_t kSize = 7777;
   auto ptr = AllocateAligned<uint32_t>(kSize);
   ASSERT_NE(nullptr, ptr.get());
-  EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
   // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
   // underlying type chosen by AllocateAligned() for the std::unique_ptr.
   EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));
@@ -191,7 +218,7 @@ TEST(AlignedAllocatorTest, AllocMultiple
     ptr[i] = static_cast<uint32_t>(i);
     if (i) ret += ptr[i] * ptr[i - 1];
   }
-  EXPECT_NE(0, ret);
+  EXPECT_NE(0U, ret);
 }

 TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
@@ -215,7 +242,8 @@ TEST(AlignedAllocatorTest, MakeUniqueAli
     auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
         7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
         &counter);
-    // An array shold still only call a single allocation.
+    ASSERT_NE(nullptr, arr.get());
+    // An array should still only call a single allocation.
     EXPECT_EQ(1u, fake_alloc.PendingAllocs());
     EXPECT_EQ(7, counter);
     for (size_t i = 0; i < 7; i++) {
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12	2021-06-02 10:56:05.266904549 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/base.h	2021-05-31 10:37:11.000000000 -0400
@@ -34,7 +34,10 @@
 //------------------------------------------------------------------------------
 // Detect compiler using predefined macros

-#ifdef _MSC_VER
+// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
+// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
+// purpose.
+#if defined(_MSC_VER) && !defined(__clang__)
 #define HWY_COMPILER_MSVC _MSC_VER
 #else
 #define HWY_COMPILER_MSVC 0
@@ -200,6 +203,10 @@
 #define HWY_ARCH_X86_64 0
 #endif

+#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
+#error "Cannot have both x86-32 and x86-64"
+#endif
+
 #if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
 #define HWY_ARCH_X86 1
 #else
@@ -212,14 +219,29 @@
 #define HWY_ARCH_PPC 0
 #endif

-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__)
+#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
+#define HWY_ARCH_ARM_A64 1
+#else
+#define HWY_ARCH_ARM_A64 0
+#endif
+
+#if defined(__arm__) || defined(_M_ARM)
+#define HWY_ARCH_ARM_V7 1
+#else
+#define HWY_ARCH_ARM_V7 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
+#error "Cannot have both A64 and V7"
+#endif
+
+#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
 #define HWY_ARCH_ARM 1
 #else
 #define HWY_ARCH_ARM 0
 #endif

-// There isn't yet a standard __wasm or __wasm__.
-#ifdef __EMSCRIPTEN__
+#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
 #define HWY_ARCH_WASM 1
 #else
 #define HWY_ARCH_WASM 0
@@ -231,9 +253,11 @@
 #define HWY_ARCH_RVV 0
 #endif

+// It is an error to detect multiple architectures at the same time, but OK to
+// detect none of the above.
 #if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
-     HWY_ARCH_RVV) != 1
-#error "Must detect exactly one platform"
+     HWY_ARCH_RVV) > 1
+#error "Must not detect more than one architecture"
 #endif

 //------------------------------------------------------------------------------
@@ -308,13 +332,26 @@ static constexpr HWY_MAYBE_UNUSED size_t
 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
 // by concatenating base type and bits.

-// RVV already has a builtin type.
-#if !HWY_ARCH_RVV
+// RVV already has a builtin type and the GCC intrinsics require it.
+#if HWY_ARCH_RVV && HWY_COMPILER_GCC
+#define HWY_NATIVE_FLOAT16 1
+#else
+#define HWY_NATIVE_FLOAT16 0
+#endif
+
+#if HWY_NATIVE_FLOAT16
+using float16_t = __fp16;
+// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
+// arguments, so use a wrapper.
+// TODO(janwas): replace with _Float16 when that is supported?
+#else
+#pragma pack(push, 1)
 struct float16_t {
-  // __fp16 cannot be used as a function parameter in clang, so use a wrapper.
   uint16_t bits;
 };
+#pragma pack(pop)
 #endif
+
 using float32_t = float;
 using float64_t = double;

@@ -506,6 +543,13 @@ struct Relations<int64_t> {
   using Narrow = int32_t;
 };
 template <>
+struct Relations<float16_t> {
+  using Unsigned = uint16_t;
+  using Signed = int16_t;
+  using Float = float16_t;
+  using Wide = float;
+};
+template <>
 struct Relations<float> {
   using Unsigned = uint32_t;
   using Signed = int32_t;
@@ -551,13 +595,13 @@ constexpr inline size_t RoundUpTo(size_t

 // Undefined results for x == 0.
 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
-#ifdef _MSC_VER
+#if HWY_COMPILER_MSVC
   unsigned long index;  // NOLINT
   _BitScanForward(&index, x);
   return index;
-#else
+#else  // HWY_COMPILER_MSVC
   return static_cast<size_t>(__builtin_ctz(x));
-#endif
+#endif  // HWY_COMPILER_MSVC
 }

 HWY_API size_t PopCount(uint64_t x) {
@@ -565,7 +609,7 @@ HWY_API size_t PopCount(uint64_t x) {
   return static_cast<size_t>(__builtin_popcountll(x));
 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
   return _mm_popcnt_u64(x);
-#elif HWY_COMPILER_MSVC
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32
   return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
 #else
   x -= ((x >> 1) & 0x55555555U);
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12	2021-06-02 10:56:05.280904620 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h	2021-05-31 10:37:11.000000000 -0400
@@ -20,7 +20,9 @@

 #include "hwy/base.h"

-#ifndef __SSE2__
+// Requires SSE2; fails to compile on 32-bit Clang 7 (see
+// https://github.com/gperftools/gperftools/issues/946).
+#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
 #undef HWY_DISABLE_CACHE_CONTROL
 #define HWY_DISABLE_CACHE_CONTROL
 #endif
@@ -30,6 +32,14 @@
 #include <emmintrin.h>  // SSE2
 #endif

+// Windows.h #defines these, which causes infinite recursion. Temporarily
+// undefine them in this header; these functions are anyway deprecated.
+// TODO(janwas): remove when these functions are removed.
+#pragma push_macro("LoadFence")
+#pragma push_macro("StoreFence")
+#undef LoadFence
+#undef StoreFence
+
 namespace hwy {

 // Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
@@ -81,6 +91,17 @@ HWY_INLINE HWY_ATTR_CACHE void FlushCach
 #endif
 }

+// Reduces power consumption in spin-loops. No effect on non-x86.
+HWY_INLINE HWY_ATTR_CACHE void Pause() {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_pause();
+#endif
+}
+
 }  // namespace hwy

+// TODO(janwas): remove when these functions are removed. (See above.)
+#pragma pop_macro("StoreFence")
+#pragma pop_macro("LoadFence")
+
 #endif  // HIGHWAY_HWY_CACHE_CONTROL_H_
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12	2021-06-02 10:56:05.195904190 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc	2021-05-31 10:37:11.000000000 -0400
@@ -19,7 +19,6 @@
 #include <stddef.h>
 #include <stdio.h>

-#include <cmath>
 #include <memory>
 #include <numeric>  // iota

@@ -37,15 +36,15 @@ using hwy::HWY_NAMESPACE::CombineShiftRi

 class TwoArray {
  public:
-  // Passed to ctor as a value NOT known to the compiler. Must be a multiple of
-  // the vector lane count * 8.
+  // Must be a multiple of the vector lane count * 8.
   static size_t NumItems() { return 3456; }

-  explicit TwoArray(const size_t num_items)
-      : a_(AllocateAligned<float>(num_items * 2)), b_(a_.get() + num_items) {
-    const float init = num_items / NumItems();  // 1, but compiler doesn't know
-    std::iota(a_.get(), a_.get() + num_items, init);
-    std::iota(b_, b_ + num_items, init);
+  TwoArray()
+      : a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) {
+    // = 1, but compiler doesn't know
+    const float init = static_cast<float>(Unpredictable1());
+    std::iota(a_.get(), a_.get() + NumItems(), init);
+    std::iota(b_, b_ + NumItems(), init);
   }

  protected:
@@ -62,7 +61,7 @@ void RunBenchmark(const char* caption) {
   const FuncInput inputs[kNumInputs] = {num_items};
   Result results[kNumInputs];

-  Benchmark benchmark(num_items);
+  Benchmark benchmark;

   Params p;
   p.verbose = false;
@@ -101,7 +100,7 @@ void Intro() {
 // 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
 class BenchmarkDot : public TwoArray {
  public:
-  explicit BenchmarkDot(size_t num_items) : TwoArray(num_items), dot_{-1.0f} {}
+  BenchmarkDot() : dot_{-1.0f} {}

   FuncOutput operator()(const size_t num_items) {
     HWY_FULL(float) d;
@@ -132,7 +131,8 @@ class BenchmarkDot : public TwoArray {
         sum[i] += sum[i + power];
       }
     }
-    return dot_ = GetLane(SumOfLanes(sum[0]));
+    dot_ = GetLane(SumOfLanes(sum[0]));
+    return static_cast<FuncOutput>(dot_);
   }
   void Verify(size_t num_items) {
     if (dot_ == -1.0f) {
@@ -157,8 +157,6 @@ class BenchmarkDot : public TwoArray {
 // INTERMEDIATE: delta coding
 // 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
 struct BenchmarkDelta : public TwoArray {
-  explicit BenchmarkDelta(size_t num_items) : TwoArray(num_items) {}
-
   FuncOutput operator()(const size_t num_items) const {
 #if HWY_TARGET == HWY_SCALAR
     b_[0] = a_[0];
@@ -197,7 +195,7 @@ struct BenchmarkDelta : public TwoArray
       Store(a - shifted, df, &b_[i]);
     }
 #endif
-    return b_[num_items - 1];
+    return static_cast<FuncOutput>(b_[num_items - 1]);
   }

   void Verify(size_t num_items) {
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12	2021-06-02 10:56:05.189904159 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc	2021-05-31 10:37:11.000000000 -0400
@@ -22,27 +22,62 @@
 // For runtime dispatch, specify the name of the current file (unfortunately
 // __FILE__ is not reliable) so that foreach_target.h can re-include it.
 #define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
-// Re-include this file once per enabled target to generate code for it.
+// Generates code for each enabled target by re-including this source file.
 #include "hwy/foreach_target.h"

-#include "hwy/examples/skeleton_shared.h"
 #include "hwy/highway.h"

-// Optional: factor out parts of the implementation into *-inl.h
-#include "hwy/examples/skeleton-inl.h"
-
 // Optional, can instead add HWY_ATTR to all functions.
 HWY_BEFORE_NAMESPACE();
 namespace skeleton {
 namespace HWY_NAMESPACE {

-// Compiled once per target via multiple inclusion.
-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2,
-              float* HWY_RESTRICT out) {
-  printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET),
-         ExampleGatherStrategy());
+// Highway ops reside here; ADL does not find templates nor builtins.
+using namespace hwy::HWY_NAMESPACE;
+
+// Computes log2 by converting to a vector of floats. Compiled once per target.
+template <class DF>
+HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
+                               uint8_t* HWY_RESTRICT log2) {
+  // Type tags for converting to other element types (Rebind = same count).
+  const Rebind<int32_t, DF> d32;
+  const Rebind<uint8_t, DF> d8;
+
+  const auto u8 = Load(d8, values);
+  const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
+  const auto exponent = ShiftRight<23>(bits) - Set(d32, 127);
+  Store(DemoteTo(d8, exponent), d8, log2);
+}
+
+HWY_NOINLINE void CodepathDemo() {
+  // Highway defaults to portability, but per-target codepaths may be selected
+  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
+#if HWY_CAP_INTEGER64
+  const char* gather = "Has int64";
+#else
+  const char* gather = "No int64";
+#endif
+  printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather);
+}

-  ExampleMulAdd(in1, in2, out);
+HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
+                            uint8_t* HWY_RESTRICT log2) {
+  CodepathDemo();
+
+  // Second argument is necessary on RVV until it supports fractional lengths.
+  HWY_FULL(float, 4) df;
+
+  const size_t N = Lanes(df);
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
+    OneFloorLog2(df, values + i, log2 + i);
+  }
+  // TODO(janwas): implement
+#if HWY_TARGET != HWY_RVV
+  for (; i < count; ++i) {
+    OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i);
+  }
+#endif
 }

 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -54,22 +89,20 @@ HWY_AFTER_NAMESPACE();

 namespace skeleton {

-// This macro declares a static array SkeletonHighwayDispatchTable used for
-// dynamic dispatch. This macro should be placed in the same namespace that
-// defines the Skeleton function above.
-HWY_EXPORT(Skeleton);
+// This macro declares a static array used for dynamic dispatch; it resides in
+// the same outer namespace that contains FloorLog2.
+HWY_EXPORT(FloorLog2);

 // This function is optional and only needed in the case of exposing it in the
-// header file. Otherwise using HWY_DYNAMIC_DISPATCH(Skeleton) multiple times in
-// this module is equivalent to inlining this optional function..
-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2,
-              float* HWY_RESTRICT out) {
-  return HWY_DYNAMIC_DISPATCH(Skeleton)(in1, in2, out);
+// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
+// is equivalent to inlining this function.
+void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
+                   uint8_t* HWY_RESTRICT out) {
+  return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
 }

 // Optional: anything to compile only once, e.g. non-SIMD implementations of
-// public functions provided by this module, can go inside #if HWY_ONCE
-// (after end_target-inl.h).
+// public functions provided by this module, can go inside #if HWY_ONCE.

 }  // namespace skeleton
 #endif  // HWY_ONCE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12	2021-06-02 10:56:05.213904281 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h	2021-05-31 10:37:11.000000000 -0400
@@ -18,15 +18,17 @@
 #ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
 #define HIGHWAY_HWY_EXAMPLES_SKELETON_H_

-// Tiny subset of Highway API: essentials for declaring an interface, without
-// any implementation details.
+#include <stddef.h>
+
+// Platform-specific definitions used for declaring an interface, independent of
+// the SIMD instruction set.
 #include "hwy/base.h"  // HWY_RESTRICT

 namespace skeleton {

-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256.
-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2,
-              float* HWY_RESTRICT out);
+// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
+void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
+                   uint8_t* HWY_RESTRICT out);

 }  // namespace skeleton

diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12	2021-06-02 10:56:05.164904033 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -29,41 +29,31 @@
 // It is fine to #include normal or *-inl headers.
 #include <stddef.h>

-#include "hwy/examples/skeleton_shared.h"
 #include "hwy/highway.h"

 HWY_BEFORE_NAMESPACE();
 namespace skeleton {
 namespace HWY_NAMESPACE {

-using hwy::HWY_NAMESPACE::MulAdd;
+using namespace hwy::HWY_NAMESPACE;

-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256.
-HWY_MAYBE_UNUSED void ExampleMulAdd(const float* HWY_RESTRICT in1,
-                                    const float* HWY_RESTRICT in2,
-                                    float* HWY_RESTRICT out) {
-  // Descriptor(s) for all vector types used in this function.
-  HWY_FULL(float) df;
-
-  const auto mul = Set(df, kMultiplier);
-  for (size_t i = 0; i < 256; i += Lanes(df)) {
-    const auto result = MulAdd(mul, Load(df, in1 + i), Load(df, in2 + i));
-    Store(result, df, out + i);
+// Example of a type-agnostic (caller-specified lane type) and width-agnostic
+// (uses best available instruction set) function in a header.
+//
+// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
+template <class D, typename T>
+HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
+                                 const T* HWY_RESTRICT add_array,
+                                 const size_t size, T* HWY_RESTRICT x_array) {
+  for (size_t i = 0; i < size; i += Lanes(d)) {
+    const auto mul = Load(d, mul_array + i);
+    const auto add = Load(d, add_array + i);
+    auto x = Load(d, x_array + i);
+    x = MulAdd(mul, x, add);
+    Store(x, d, x_array + i);
   }
 }

-// (This doesn't generate SIMD instructions, so is not required here)
-HWY_MAYBE_UNUSED const char* ExampleGatherStrategy() {
-  // Highway functions generate per-target implementations from the same source
-  // code via HWY_CAPPED(type, HWY_MIN(any_LANES_constants, ..)). If needed,
-  // entirely different codepaths can also be selected like so:
-#if HWY_GATHER_LANES > 1
-  return "Has gather";
-#else
-  return "Gather is limited to one lane";
-#endif
-}
-
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace skeleton
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12	2021-06-02 10:56:05.170904063 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -12,30 +12,96 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// Example of unit test for the "skeleton" module.
+// Example of unit test for the "skeleton" library.

-#include "hwy/examples/skeleton.h"  // Skeleton
+#include "hwy/examples/skeleton.h"

 #include <stdio.h>

-#include "hwy/tests/test_util-inl.h"  // RunTest
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"

+// Optional: factor out parts of the implementation into *-inl.h
+#include "hwy/examples/skeleton-inl.h"
+
+HWY_BEFORE_NAMESPACE();
 namespace skeleton {
+namespace HWY_NAMESPACE {
+
+using namespace hwy::HWY_NAMESPACE;
+
+// Calls function defined in skeleton.cc.
+struct TestFloorLog2 {
+  template <class T, class DF>
+  HWY_NOINLINE void operator()(T /*unused*/, DF df) {
+    const size_t count = 5 * Lanes(df);
+    auto in = hwy::AllocateAligned<uint8_t>(count);
+    auto expected = hwy::AllocateAligned<uint8_t>(count);
+
+    hwy::RandomState rng;
+    for (size_t i = 0; i < count; ++i) {
+      expected[i] = Random32(&rng) & 7;
+      in[i] = static_cast<uint8_t>(1u << expected[i]);
+    }
+    auto out = hwy::AllocateAligned<uint8_t>(count);
+    CallFloorLog2(in.get(), count, out.get());
+    int sum = 0;
+    for (size_t i = 0; i < count; ++i) {
+      // TODO(janwas): implement
+#if HWY_TARGET != HWY_RVV
+      HWY_ASSERT_EQ(expected[i], out[i]);
+#endif
+      sum += out[i];
+    }
+    hwy::PreventElision(sum);
+  }
+};
+
+HWY_NOINLINE void TestAllFloorLog2() {
+  ForPartialVectors<TestFloorLog2>()(float());
+}
+
+// Calls function defined in skeleton-inl.h.
+struct TestSumMulAdd {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    hwy::RandomState rng;
+    const size_t count = 4096;
+    EXPECT_TRUE(count % Lanes(d) == 0);
+    auto mul = hwy::AllocateAligned<T>(count);
+    auto x = hwy::AllocateAligned<T>(count);
+    auto add = hwy::AllocateAligned<T>(count);
+    for (size_t i = 0; i < count; ++i) {
+      mul[i] = static_cast<T>(Random32(&rng) & 0xF);
+      x[i] = static_cast<T>(Random32(&rng) & 0xFF);
+      add[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    double expected_sum = 0.0;
+    for (size_t i = 0; i < count; ++i) {
+      expected_sum += mul[i] * x[i] + add[i];
+    }

-TEST(SkeletonTest, MainTest) {
-  HWY_ALIGN_MAX float in1[256];
-  HWY_ALIGN_MAX float in2[256];
-  HWY_ALIGN_MAX float out[256];
-  for (size_t i = 0; i < 256; ++i) {
-    in1[i] = static_cast<float>(i);
-    in2[i] = in1[i] + 300;
+    MulAddLoop(d, mul.get(), add.get(), count, x.get());
+    HWY_ASSERT_EQ(4344240.0, expected_sum);
   }
+};

-  // Tests will run for all compiled targets to ensure all are OK.
-  hwy::RunTest([&in1, &in2, &out]() {
-    Skeleton(in1, in2, out);
-    // Add EXPECT_... calls here.
-  });
+HWY_NOINLINE void TestAllSumMulAdd() {
+  ForFloatTypes(ForPartialVectors<TestSumMulAdd>());
 }

+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace skeleton {
+HWY_BEFORE_TEST(SkeletonTest);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
 }  // namespace skeleton
+#endif
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12	2021-06-02 10:56:05.269904564 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h	2021-05-31 10:37:11.000000000 -0400
@@ -25,10 +25,10 @@

 namespace hwy {

-// API version (https://semver.org/)
+// API version (https://semver.org/); keep in sync with CMakeLists.txt.
 #define HWY_MAJOR 0
-#define HWY_MINOR 11
-#define HWY_PATCH 1
+#define HWY_MINOR 12
+#define HWY_PATCH 2

 //------------------------------------------------------------------------------
 // Shorthand for descriptors (defined in shared-inl.h) used to select overloads.
@@ -49,7 +49,7 @@ namespace hwy {
   HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
 #define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)

-// Vector of up to MAX_N lanes.
+// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead.
 #define HWY_CAPPED(T, MAX_N) \
   hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>

@@ -75,6 +75,10 @@ namespace hwy {
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
 #elif HWY_STATIC_TARGET == HWY_NEON
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE2
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
 #elif HWY_STATIC_TARGET == HWY_PPC8
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
 #elif HWY_STATIC_TARGET == HWY_SSE4
@@ -143,6 +147,18 @@ FunctionCache<RetType, Args...> Function
 #define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
 #endif

+#if HWY_TARGETS & HWY_SVE
+#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE2
+#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
+#endif
+
 #if HWY_TARGETS & HWY_PPC8
 #define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME
 #else
@@ -261,8 +277,11 @@ FunctionCache<RetType, Args...> Function
 #elif HWY_TARGET == HWY_AVX3
 #include "hwy/ops/x86_512-inl.h"
 #elif HWY_TARGET == HWY_PPC8
+#error "PPC is not yet supported"
 #elif HWY_TARGET == HWY_NEON
 #include "hwy/ops/arm_neon-inl.h"
+#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
+#include "hwy/ops/arm_sve-inl.h"
 #elif HWY_TARGET == HWY_WASM
 #include "hwy/ops/wasm_128-inl.h"
 #elif HWY_TARGET == HWY_RVV
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12	2021-06-02 10:56:05.276904599 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc	2021-05-31 10:37:11.000000000 -0400
@@ -29,128 +29,43 @@
 #include <string>
 #include <vector>

+#if defined(_WIN32) || defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif  // NOMINMAX
+#include <windows.h>
+#endif
+
+#if defined(__MACH__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <OS.h>
+#endif
+
 #include "hwy/base.h"
 #if HWY_ARCH_PPC
 #include <sys/platform/ppc.h>  // NOLINT __ppc_get_timebase_freq
 #elif HWY_ARCH_X86

-#ifdef _MSC_VER
+#if HWY_COMPILER_MSVC
 #include <intrin.h>
 #else
 #include <cpuid.h>  // NOLINT
-#endif              // _MSC_VER
+#endif              // HWY_COMPILER_MSVC

 #endif  // HWY_ARCH_X86

 namespace hwy {
-namespace platform {
-namespace {
-
-#if HWY_ARCH_X86
-
-void Cpuid(const uint32_t level, const uint32_t count,
-           uint32_t* HWY_RESTRICT abcd) {
-#if HWY_COMPILER_MSVC
-  int regs[4];
-  __cpuidex(regs, level, count);
-  for (int i = 0; i < 4; ++i) {
-    abcd[i] = regs[i];
-  }
-#else
-  uint32_t a;
-  uint32_t b;
-  uint32_t c;
-  uint32_t d;
-  __cpuid_count(level, count, a, b, c, d);
-  abcd[0] = a;
-  abcd[1] = b;
-  abcd[2] = c;
-  abcd[3] = d;
-#endif
-}
-
-std::string BrandString() {
-  char brand_string[49];
-  std::array<uint32_t, 4> abcd;
-
-  // Check if brand string is supported (it is on all reasonable Intel/AMD)
-  Cpuid(0x80000000U, 0, abcd.data());
-  if (abcd[0] < 0x80000004U) {
-    return std::string();
-  }
-
-  for (size_t i = 0; i < 3; ++i) {
-    Cpuid(0x80000002U + i, 0, abcd.data());
-    memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
-  }
-  brand_string[48] = 0;
-  return brand_string;
-}
-
-// Returns the frequency quoted inside the brand string. This does not
-// account for throttling nor Turbo Boost.
-double NominalClockRate() {
-  const std::string& brand_string = BrandString();
-  // Brand strings include the maximum configured frequency. These prefixes are
-  // defined by Intel CPUID documentation.
-  const char* prefixes[3] = {"MHz", "GHz", "THz"};
-  const double multipliers[3] = {1E6, 1E9, 1E12};
-  for (size_t i = 0; i < 3; ++i) {
-    const size_t pos_prefix = brand_string.find(prefixes[i]);
-    if (pos_prefix != std::string::npos) {
-      const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
-      if (pos_space != std::string::npos) {
-        const std::string digits =
-            brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
-        return std::stod(digits) * multipliers[i];
-      }
-    }
-  }
-
-  return 0.0;
-}
-
-#endif  // HWY_ARCH_X86
-
-}  // namespace
-
-// Returns tick rate. Invariant means the tick counter frequency is independent
-// of CPU throttling or sleep. May be expensive, caller should cache the result.
-double InvariantTicksPerSecond() {
-#if HWY_ARCH_PPC
-  return __ppc_get_timebase_freq();
-#elif HWY_ARCH_X86
-  // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
-  return NominalClockRate();
-#else
-  // Fall back to clock_gettime nanoseconds.
-  return 1E9;
-#endif
-}
-
-}  // namespace platform
 namespace {
-
-// Prevents the compiler from eliding the computations that led to "output".
-template <class T>
-inline void PreventElision(T&& output) {
-#if HWY_COMPILER_MSVC == 0
-  // Works by indicating to the compiler that "output" is being read and
-  // modified. The +r constraint avoids unnecessary writes to memory, but only
-  // works for built-in types (typically FuncOutput).
-  asm volatile("" : "+r"(output) : : "memory");
-#else
-  // MSVC does not support inline assembly anymore (and never supported GCC's
-  // RTL constraints). Self-assignment with #pragma optimize("off") might be
-  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
-  // with volatile pointers generates inefficient code on MSVC 2017.
-  static std::atomic<T> dummy(T{});
-  dummy.store(output, std::memory_order_relaxed);
-#endif
-}
-
 namespace timer {

+// Ticks := platform-specific timer values (CPU cycles on x86). Must be
+// unsigned to guarantee wraparound on overflow.
+using Ticks = uint64_t;
+
 // Start/Stop return absolute timestamps and must be placed immediately before
 // and after the region to measure. We provide separate Start/Stop functions
 // because they use different fences.
@@ -202,8 +117,8 @@ namespace timer {

 // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
 // divide by InvariantTicksPerSecond.
-inline uint64_t Start64() {
-  uint64_t t;
+inline Ticks Start() {
+  Ticks t;
 #if HWY_ARCH_PPC
   asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
 #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
@@ -228,8 +143,15 @@ inline uint64_t Start64() {
       : "rdx", "memory", "cc");
 #elif HWY_ARCH_RVV
   asm volatile("rdcycle %0" : "=r"(t));
-#else
-  // Fall back to OS - unsure how to reliably query cntvct_el0 frequency.
+#elif defined(_WIN32) || defined(_WIN64)
+  LARGE_INTEGER counter;
+  (void)QueryPerformanceCounter(&counter);
+  t = counter.QuadPart;
+#elif defined(__MACH__)
+  t = mach_absolute_time();
+#elif defined(__HAIKU__)
+  t = system_time_nsecs();  // since boot
+#else  // POSIX
   timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
@@ -237,7 +159,7 @@ inline uint64_t Start64() {
   return t;
 }

-inline uint64_t Stop64() {
+inline Ticks Stop() {
   uint64_t t;
 #if HWY_ARCH_PPC
   asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
@@ -261,61 +183,7 @@ inline uint64_t Stop64() {
       // "cc" = flags modified by SHL.
       : "rcx", "rdx", "memory", "cc");
 #else
-  t = Start64();
-#endif
-  return t;
-}
-
-// Returns a 32-bit timestamp with about 4 cycles less overhead than
-// Start64. Only suitable for measuring very short regions because the
-// timestamp overflows about once a second.
-inline uint32_t Start32() {
-  uint32_t t;
-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
-  _ReadWriteBarrier();
-  _mm_lfence();
-  _ReadWriteBarrier();
-  t = static_cast<uint32_t>(__rdtsc());
-  _ReadWriteBarrier();
-  _mm_lfence();
-  _ReadWriteBarrier();
-#elif HWY_ARCH_X86_64
-  asm volatile(
-      "lfence\n\t"
-      "rdtsc\n\t"
-      "lfence"
-      : "=a"(t)
-      :
-      // "memory" avoids reordering. rdx = TSC >> 32.
-      : "rdx", "memory");
-#elif HWY_ARCH_RVV
-  asm volatile("rdcycle %0" : "=r"(t));
-#else
-  t = static_cast<uint32_t>(Start64());
-#endif
-  return t;
-}
-
-inline uint32_t Stop32() {
-  uint32_t t;
-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
-  _ReadWriteBarrier();
-  unsigned aux;
-  t = static_cast<uint32_t>(__rdtscp(&aux));
-  _ReadWriteBarrier();
-  _mm_lfence();
-  _ReadWriteBarrier();
-#elif HWY_ARCH_X86_64
-  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
-  asm volatile(
-      "rdtscp\n\t"
-      "lfence"
-      : "=a"(t)
-      :
-      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
-      : "rcx", "rdx", "memory");
-#else
-  t = static_cast<uint32_t>(Stop64());
+  t = Start();
 #endif
   return t;
 }
@@ -440,21 +308,130 @@ T MedianAbsoluteDeviation(const T* value
 }

 }  // namespace robust_statistics
+}  // namespace
+namespace platform {
+namespace {

-// Ticks := platform-specific timer values (CPU cycles on x86). Must be
-// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to
-// read than 64 bit.
-using Ticks = uint32_t;
+// Prevents the compiler from eliding the computations that led to "output".
+template <class T>
+inline void PreventElision(T&& output) {
+#if HWY_COMPILER_MSVC == 0
+  // Works by indicating to the compiler that "output" is being read and
+  // modified. The +r constraint avoids unnecessary writes to memory, but only
+  // works for built-in types (typically FuncOutput).
+  asm volatile("" : "+r"(output) : : "memory");
+#else
+  // MSVC does not support inline assembly anymore (and never supported GCC's
+  // RTL constraints). Self-assignment with #pragma optimize("off") might be
+  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
+  // with volatile pointers generates inefficient code on MSVC 2017.
+  static std::atomic<T> dummy(T{});
+  dummy.store(output, std::memory_order_relaxed);
+#endif
+}
+
+#if HWY_ARCH_X86
+
+void Cpuid(const uint32_t level, const uint32_t count,
+           uint32_t* HWY_RESTRICT abcd) {
+#if HWY_COMPILER_MSVC
+  int regs[4];
+  __cpuidex(regs, level, count);
+  for (int i = 0; i < 4; ++i) {
+    abcd[i] = regs[i];
+  }
+#else
+  uint32_t a;
+  uint32_t b;
+  uint32_t c;
+  uint32_t d;
+  __cpuid_count(level, count, a, b, c, d);
+  abcd[0] = a;
+  abcd[1] = b;
+  abcd[2] = c;
+  abcd[3] = d;
+#endif
+}
+
+std::string BrandString() {
+  char brand_string[49];
+  std::array<uint32_t, 4> abcd;
+
+  // Check if brand string is supported (it is on all reasonable Intel/AMD)
+  Cpuid(0x80000000U, 0, abcd.data());
+  if (abcd[0] < 0x80000004U) {
+    return std::string();
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd.data());
+    memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
+  }
+  brand_string[48] = 0;
+  return brand_string;
+}
+
+// Returns the frequency quoted inside the brand string. This does not
+// account for throttling nor Turbo Boost.
+double NominalClockRate() {
+  const std::string& brand_string = BrandString();
+  // Brand strings include the maximum configured frequency. These prefixes are
+  // defined by Intel CPUID documentation.
+  const char* prefixes[3] = {"MHz", "GHz", "THz"};
+  const double multipliers[3] = {1E6, 1E9, 1E12};
+  for (size_t i = 0; i < 3; ++i) {
+    const size_t pos_prefix = brand_string.find(prefixes[i]);
+    if (pos_prefix != std::string::npos) {
+      const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
+      if (pos_space != std::string::npos) {
+        const std::string digits =
+            brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
+        return std::stod(digits) * multipliers[i];
+      }
+    }
+  }
+
+  return 0.0;
+}
+
+#endif  // HWY_ARCH_X86
+
+}  // namespace
+
+double InvariantTicksPerSecond() {
+#if HWY_ARCH_PPC
+  return __ppc_get_timebase_freq();
+#elif HWY_ARCH_X86
+  // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
+  return NominalClockRate();
+#elif defined(_WIN32) || defined(_WIN64)
+  LARGE_INTEGER freq;
+  (void)QueryPerformanceFrequency(&freq);
+  return double(freq.QuadPart);
+#elif defined(__MACH__)
+  // https://developer.apple.com/library/mac/qa/qa1398/_index.html
+  mach_timebase_info_data_t timebase;
+  (void)mach_timebase_info(&timebase);
+  return double(timebase.denom) / timebase.numer * 1E9;
+#else
+  // TODO(janwas): ARM? Unclear how to reliably query cntvct_el0 frequency.
+  return 1E9;  // Haiku and clock_gettime return nanoseconds.
+#endif
+}

-// Returns timer overhead / minimum measurable difference.
-Ticks TimerResolution() {
+double Now() {
+  static const double mul = 1.0 / InvariantTicksPerSecond();
+  return static_cast<double>(timer::Start()) * mul;
+}
+
+uint64_t TimerResolution() {
   // Nested loop avoids exceeding stack/L1 capacity.
-  Ticks repetitions[Params::kTimerSamples];
+  timer::Ticks repetitions[Params::kTimerSamples];
   for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
-    Ticks samples[Params::kTimerSamples];
+    timer::Ticks samples[Params::kTimerSamples];
     for (size_t i = 0; i < Params::kTimerSamples; ++i) {
-      const Ticks t0 = timer::Start32();
-      const Ticks t1 = timer::Stop32();
+      const timer::Ticks t0 = timer::Start();
+      const timer::Ticks t1 = timer::Stop();
       samples[i] = t1 - t0;
     }
     repetitions[rep] = robust_statistics::Mode(samples);
@@ -462,18 +439,21 @@ Ticks TimerResolution() {
   return robust_statistics::Mode(repetitions);
 }

-static const Ticks timer_resolution = TimerResolution();
+}  // namespace platform
+namespace {
+
+static const timer::Ticks timer_resolution = platform::TimerResolution();

 // Estimates the expected value of "lambda" values with a variable number of
 // samples until the variability "rel_mad" is less than "max_rel_mad".
 template <class Lambda>
-Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
-                        const Params& p, const Lambda& lambda) {
+timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
+                               const Params& p, const Lambda& lambda) {
   // Choose initial samples_per_eval based on a single estimated duration.
-  Ticks t0 = timer::Start32();
+  timer::Ticks t0 = timer::Start();
   lambda();
-  Ticks t1 = timer::Stop32();
-  Ticks est = t1 - t0;
+  timer::Ticks t1 = timer::Stop();
+  timer::Ticks est = t1 - t0;
   static const double ticks_per_second = platform::InvariantTicksPerSecond();
   const size_t ticks_per_eval =
       static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
@@ -481,21 +461,21 @@ Ticks SampleUntilStable(const double max
       est == 0 ? p.min_samples_per_eval : ticks_per_eval / est;
   samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval);

-  std::vector<Ticks> samples;
+  std::vector<timer::Ticks> samples;
   samples.reserve(1 + samples_per_eval);
   samples.push_back(est);

   // Percentage is too strict for tiny differences, so also allow a small
   // absolute "median absolute deviation".
-  const Ticks max_abs_mad = (timer_resolution + 99) / 100;
+  const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100;
   *rel_mad = 0.0;  // ensure initialized

   for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
     samples.reserve(samples.size() + samples_per_eval);
     for (size_t i = 0; i < samples_per_eval; ++i) {
-      t0 = timer::Start32();
+      t0 = timer::Start();
       lambda();
-      t1 = timer::Stop32();
+      t1 = timer::Stop();
       samples.push_back(t1 - t0);
     }

@@ -508,14 +488,14 @@ Ticks SampleUntilStable(const double max
     NANOBENCHMARK_CHECK(est != 0);

     // Median absolute deviation (mad) is a robust measure of 'variability'.
-    const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
+    const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
         samples.data(), samples.size(), est);
-    *rel_mad = static_cast<double>(int(abs_mad)) / est;
+    *rel_mad = static_cast<double>(abs_mad) / static_cast<double>(est);

     if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
       if (p.verbose) {
-        printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
-               samples.size(), est, abs_mad, *rel_mad * 100.0);
+        printf("%6zu samples => %5zu (abs_mad=%4zu, rel_mad=%4.2f%%)\n",
+               samples.size(), size_t(est), size_t(abs_mad), *rel_mad * 100.0);
       }
       return est;
     }
@@ -539,29 +519,17 @@ InputVec UniqueInputs(const FuncInput* i
   return unique;
 }

-// Returns how often we need to call func for sufficient precision, or zero
-// on failure (e.g. the elapsed time is too long for a 32-bit tick count).
+// Returns how often we need to call func for sufficient precision.
 size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
                const Params& p) {
   // Min elapsed ticks for any input.
-  Ticks min_duration = ~0u;
+  timer::Ticks min_duration = ~timer::Ticks(0);

   for (const FuncInput input : unique) {
-    // Make sure a 32-bit timer is sufficient.
-    const uint64_t t0 = timer::Start64();
-    PreventElision(func(arg, input));
-    const uint64_t t1 = timer::Stop64();
-    const uint64_t elapsed = t1 - t0;
-    if (elapsed >= (1ULL << 30)) {
-      fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n",
-              input);
-      return 0;
-    }
-
     double rel_mad;
-    const Ticks total = SampleUntilStable(
+    const timer::Ticks total = SampleUntilStable(
         p.target_rel_mad, &rel_mad, p,
-        [func, arg, input]() { PreventElision(func(arg, input)); });
+        [func, arg, input]() { platform::PreventElision(func(arg, input)); });
     min_duration = std::min(min_duration, total - timer_resolution);
   }

@@ -571,8 +539,8 @@ size_t NumSkip(const Func func, const ui
   const size_t num_skip =
       min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
   if (p.verbose) {
-    printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution,
-           max_skip, min_duration, num_skip);
+    printf("res=%zu max_skip=%zu min_dur=%zu num_skip=%zu\n",
+           size_t(timer_resolution), max_skip, size_t(min_duration), num_skip);
   }
   return num_skip;
 }
@@ -637,13 +605,14 @@ void FillSubset(const InputVec& full, co
 }

 // Returns total ticks elapsed for all inputs.
-Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs,
-                    const Params& p, double* max_rel_mad) {
+timer::Ticks TotalDuration(const Func func, const uint8_t* arg,
+                           const InputVec* inputs, const Params& p,
+                           double* max_rel_mad) {
   double rel_mad;
-  const Ticks duration =
+  const timer::Ticks duration =
       SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
         for (const FuncInput input : *inputs) {
-          PreventElision(func(arg, input));
+          platform::PreventElision(func(arg, input));
         }
       });
   *max_rel_mad = std::max(*max_rel_mad, rel_mad);
@@ -657,19 +626,20 @@ HWY_NOINLINE FuncOutput EmptyFunc(const

 // Returns overhead of accessing inputs[] and calling a function; this will
 // be deducted from future TotalDuration return values.
-Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) {
+timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,
+                      const Params& p) {
   double rel_mad;
   // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
   return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
     for (const FuncInput input : *inputs) {
-      PreventElision(EmptyFunc(arg, input));
+      platform::PreventElision(EmptyFunc(arg, input));
     }
   });
 }

 }  // namespace

-int Unpredictable1() { return timer::Start64() != ~0ULL; }
+int Unpredictable1() { return timer::Start() != ~0ULL; }

 size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
                const size_t num_inputs, Result* results, const Params& p) {
@@ -685,32 +655,35 @@ size_t Measure(const Func func, const ui
       ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
   InputVec subset(full.size() - num_skip);

-  const Ticks overhead = Overhead(arg, &full, p);
-  const Ticks overhead_skip = Overhead(arg, &subset, p);
+  const timer::Ticks overhead = Overhead(arg, &full, p);
+  const timer::Ticks overhead_skip = Overhead(arg, &subset, p);
   if (overhead < overhead_skip) {
-    fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead,
-            overhead_skip);
+    fprintf(stderr, "Measurement failed: overhead %zu < %zu\n",
+            size_t(overhead), size_t(overhead_skip));
     return 0;
   }

   if (p.verbose) {
-    printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(),
-           overhead, overhead_skip);
+    printf("#inputs=%5zu,%5zu overhead=%5zu,%5zu\n", full.size(), subset.size(),
+           size_t(overhead), size_t(overhead_skip));
   }

   double max_rel_mad = 0.0;
-  const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
+  const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);

   for (size_t i = 0; i < unique.size(); ++i) {
     FillSubset(full, unique[i], num_skip, &subset);
-    const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad);
+    const timer::Ticks total_skip =
+        TotalDuration(func, arg, &subset, p, &max_rel_mad);

     if (total < total_skip) {
-      fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip);
+      fprintf(stderr, "Measurement failed: total %zu < %zu\n", size_t(total),
+              size_t(total_skip));
       return 0;
     }

-    const Ticks duration = (total - overhead) - (total_skip - overhead_skip);
+    const timer::Ticks duration =
+        (total - overhead) - (total_skip - overhead_skip);
     results[i].input = unique[i];
     results[i].ticks = static_cast<float>(duration) * mul;
     results[i].variability = static_cast<float>(max_rel_mad);
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12	2021-06-02 10:56:05.272904579 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h	2021-05-31 10:37:11.000000000 -0400
@@ -44,11 +44,6 @@
 // central tendency of the measurement samples with the "half sample mode",
 // which is more robust to outliers and skewed data than the mean or median.

-// WARNING if included from multiple translation units compiled with distinct
-// flags: this header requires textual inclusion and a predefined NB_NAMESPACE
-// macro that is unique to the current compile flags. We must also avoid
-// standard library headers such as vector and functional that define functions.
-
 #include <stddef.h>
 #include <stdint.h>

@@ -79,6 +74,16 @@ namespace platform {
 // This call may be expensive, callers should cache the result.
 double InvariantTicksPerSecond();

+// Returns current timestamp [in seconds] relative to an unspecified origin.
+// Features: monotonic (no negative elapsed time), steady (unaffected by system
+// time changes), high-resolution (on the order of microseconds).
+double Now();
+
+// Returns ticks elapsed in back to back timer calls, i.e. a function of the
+// timer resolution (minimum measurable difference) and overhead.
+// This call is expensive, callers should cache the result.
+uint64_t TimerResolution();
+
 }  // namespace platform

 // Returns 1, but without the compiler knowing what the value is. This prevents
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12	2021-06-02 10:56:05.275904594 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -15,11 +15,11 @@
 #include "hwy/nanobenchmark.h"

 #include <stdio.h>
-#include <stdlib.h>  // strtol
-#include <unistd.h>  // sleep

 #include <random>

+#include "hwy/tests/test_util-inl.h"
+
 namespace hwy {
 namespace {

@@ -31,6 +31,7 @@ FuncOutput Div(const void*, FuncInput in

 template <size_t N>
 void MeasureDiv(const FuncInput (&inputs)[N]) {
+  printf("Measuring integer division (output on final two lines)\n");
   Result results[N];
   Params params;
   params.max_evals = 4;  // avoid test timeout
@@ -66,39 +67,14 @@ void MeasureRandom(const FuncInput (&inp
   }
 }

-template <size_t N>
-void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) {
-  printf("Expect a 'measurement failed' below:\n");
-  Result results[N];
-
-  const size_t num_results = Measure(
-      [](const void*, const FuncInput input) -> FuncOutput {
-        // Loop until the sleep succeeds (not interrupted by signal). We assume
-        // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit.
-        while (sleep(2) != 0) {
-        }
-        return input;
-      },
-      nullptr, inputs, N, results);
-  NANOBENCHMARK_CHECK(num_results == 0);
-  (void)num_results;
-}
-
-void RunAll(const int argc, char** /*argv*/) {
-  // unpredictable == 1 but the compiler doesn't know that.
-  const int unpredictable = argc != 999;
+TEST(NanobenchmarkTest, RunAll) {
+  const int unpredictable = Unpredictable1();  // == 1, unknown to compiler.
   static const FuncInput inputs[] = {static_cast<FuncInput>(unpredictable) + 2,
                                      static_cast<FuncInput>(unpredictable + 9)};

   MeasureDiv(inputs);
   MeasureRandom(inputs);
-  EnsureLongMeasurementFails(inputs);
 }

 }  // namespace
 }  // namespace hwy
-
-int main(int argc, char* argv[]) {
-  hwy::RunAll(argc, argv);
-  return 0;
-}
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12	2021-06-02 10:56:05.239904412 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -26,6 +26,8 @@ HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {

+namespace detail {  // for code folding and Raw128
+
 // Macros used to define single and double function calls for multiple types
 // for full and half vectors. These macros are undefined at the end of the file.

@@ -133,7 +135,7 @@ namespace HWY_NAMESPACE {
   HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args)

 // float and double
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)   \
   HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args)  \
   HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args)     \
@@ -181,7 +183,7 @@ namespace HWY_NAMESPACE {
   HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)

 // Emulation of some intrinsics on armv7.
-#if !defined(__aarch64__)
+#if HWY_ARCH_ARM_V7
 #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
 #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
 #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
@@ -294,7 +296,7 @@ struct Raw128<float, 4> {
   using type = float32x4_t;
 };

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 template <>
 struct Raw128<double, 2> {
   using type = float64x2_t;
@@ -352,7 +354,7 @@ struct Raw128<float, 2> {
   using type = float32x2_t;
 };

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 template <>
 struct Raw128<double, 1> {
   using type = float64x1_t;
@@ -437,12 +439,14 @@ struct Raw128<int8_t, 1> {
   using type = int8x8_t;
 };

+}  // namespace detail
+
 template <typename T>
 using Full128 = Simd<T, 16 / sizeof(T)>;

 template <typename T, size_t N = 16 / sizeof(T)>
 class Vec128 {
-  using Raw = typename Raw128<T, N>::type;
+  using Raw = typename detail::Raw128<T, N>::type;

  public:
   HWY_INLINE Vec128() {}
@@ -480,7 +484,8 @@ class Vec128 {
 // FF..FF or 0, also for floating-point - see README.
 template <typename T, size_t N = 16 / sizeof(T)>
 class Mask128 {
-  using Raw = typename Raw128<T, N>::type;
+  // ARM C Language Extensions return and expect unsigned type.
+  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;

  public:
   HWY_INLINE Mask128() {}
@@ -573,7 +578,7 @@ HWY_INLINE Vec128<int64_t, 1> BitCastFro
                                               Vec128<uint8_t, 1 * 8> v) {
   return Vec128<int64_t, 1>(vreinterpret_s64_u8(v.raw));
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double, 1> BitCastFromByte(Simd<double, 1> /* tag */,
                                              Vec128<uint8_t, 1 * 8> v) {
   return Vec128<double, 1>(vreinterpret_f64_u8(v.raw));
@@ -615,7 +620,7 @@ HWY_INLINE Vec128<int64_t> BitCastFromBy
   return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
 }

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double> BitCastFromByte(Full128<double> /* tag */,
                                           Vec128<uint8_t> v) {
   return Vec128<double>(vreinterpretq_f64_u8(v.raw));
@@ -664,15 +669,25 @@ template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> Undefined(Simd<T, N> /*d*/) {
   HWY_DIAGNOSTICS(push)
   HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
-  typename Raw128<T, N>::type a;
+  typename detail::Raw128<T, N>::type a;
   return Vec128<T, N>(a);
   HWY_DIAGNOSTICS(pop)
 }

-// ------------------------------ Extract lane
+// Returns a vector with lane i=[0, N) set to "first" + i.
+template <typename T, size_t N, typename T2>
+Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
+  HWY_ALIGN T lanes[16 / sizeof(T)];
+  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
+    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
+  }
+  return Load(d, lanes);
+}
+
+// ------------------------------ GetLane

 HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, 16> v) {
-  return vget_lane_u8(vget_low_u8(v.raw), 0);
+  return vgetq_lane_u8(v.raw, 0);
 }
 template <size_t N>
 HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, N> v) {
@@ -680,7 +695,7 @@ HWY_INLINE uint8_t GetLane(const Vec128<
 }

 HWY_INLINE int8_t GetLane(const Vec128<int8_t, 16> v) {
-  return vget_lane_s8(vget_low_s8(v.raw), 0);
+  return vgetq_lane_s8(v.raw, 0);
 }
 template <size_t N>
 HWY_INLINE int8_t GetLane(const Vec128<int8_t, N> v) {
@@ -688,7 +703,7 @@ HWY_INLINE int8_t GetLane(const Vec128<i
 }

 HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, 8> v) {
-  return vget_lane_u16(vget_low_u16(v.raw), 0);
+  return vgetq_lane_u16(v.raw, 0);
 }
 template <size_t N>
 HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, N> v) {
@@ -696,7 +711,7 @@ HWY_INLINE uint16_t GetLane(const Vec128
 }

 HWY_INLINE int16_t GetLane(const Vec128<int16_t, 8> v) {
-  return vget_lane_s16(vget_low_s16(v.raw), 0);
+  return vgetq_lane_s16(v.raw, 0);
 }
 template <size_t N>
 HWY_INLINE int16_t GetLane(const Vec128<int16_t, N> v) {
@@ -704,7 +719,7 @@ HWY_INLINE int16_t GetLane(const Vec128<
 }

 HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, 4> v) {
-  return vget_lane_u32(vget_low_u32(v.raw), 0);
+  return vgetq_lane_u32(v.raw, 0);
 }
 template <size_t N>
 HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, N> v) {
@@ -712,7 +727,7 @@ HWY_INLINE uint32_t GetLane(const Vec128
 }

 HWY_INLINE int32_t GetLane(const Vec128<int32_t, 4> v) {
-  return vget_lane_s32(vget_low_s32(v.raw), 0);
+  return vgetq_lane_s32(v.raw, 0);
 }
 template <size_t N>
 HWY_INLINE int32_t GetLane(const Vec128<int32_t, N> v) {
@@ -720,20 +735,20 @@ HWY_INLINE int32_t GetLane(const Vec128<
 }

 HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 2> v) {
-  return vget_lane_u64(vget_low_u64(v.raw), 0);
+  return vgetq_lane_u64(v.raw, 0);
 }
 HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 1> v) {
   return vget_lane_u64(v.raw, 0);
 }
 HWY_INLINE int64_t GetLane(const Vec128<int64_t, 2> v) {
-  return vget_lane_s64(vget_low_s64(v.raw), 0);
+  return vgetq_lane_s64(v.raw, 0);
 }
 HWY_INLINE int64_t GetLane(const Vec128<int64_t, 1> v) {
   return vget_lane_s64(v.raw, 0);
 }

 HWY_INLINE float GetLane(const Vec128<float, 4> v) {
-  return vget_lane_f32(vget_low_f32(v.raw), 0);
+  return vgetq_lane_f32(v.raw, 0);
 }
 HWY_INLINE float GetLane(const Vec128<float, 2> v) {
   return vget_lane_f32(v.raw, 0);
@@ -741,9 +756,9 @@ HWY_INLINE float GetLane(const Vec128<fl
 HWY_INLINE float GetLane(const Vec128<float, 1> v) {
   return vget_lane_f32(v.raw, 0);
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE double GetLane(const Vec128<double, 2> v) {
-  return vget_lane_f64(vget_low_f64(v.raw), 0);
+  return vgetq_lane_f64(v.raw, 0);
 }
 HWY_INLINE double GetLane(const Vec128<double, 1> v) {
   return vget_lane_f64(v.raw, 0);
@@ -785,8 +800,6 @@ HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSu
 // ------------------------------ Average

 // Returns (a + b + 1) / 2
-
-// Unsigned
 HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
 HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)

@@ -802,6 +815,7 @@ HWY_INLINE Vec128<int16_t> Abs(const Vec
 HWY_INLINE Vec128<int32_t> Abs(const Vec128<int32_t> v) {
   return Vec128<int32_t>(vabsq_s32(v.raw));
 }
+// i64 is implemented after BroadcastSignBit.
 HWY_INLINE Vec128<float> Abs(const Vec128<float> v) {
   return Vec128<float>(vabsq_f32(v.raw));
 }
@@ -823,7 +837,7 @@ HWY_INLINE Vec128<float, N> Abs(const Ve
   return Vec128<float, N>(vabs_f32(v.raw));
 }

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double> Abs(const Vec128<double> v) {
   return Vec128<double>(vabsq_f64(v.raw));
 }
@@ -839,7 +853,7 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vn
 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1)  // i64 implemented below

 HWY_INLINE Vec128<int64_t, 1> Neg(const Vec128<int64_t, 1> v) {
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return Vec128<int64_t, 1>(vneg_s64(v.raw));
 #else
   return Zero(Simd<int64_t, 1>()) - v;
@@ -847,7 +861,7 @@ HWY_INLINE Vec128<int64_t, 1> Neg(const
 }

 HWY_INLINE Vec128<int64_t> Neg(const Vec128<int64_t> v) {
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return Vec128<int64_t>(vnegq_s64(v.raw));
 #else
   return Zero(Full128<int64_t>()) - v;
@@ -876,6 +890,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, v

 // ------------------------------ Shl

+HWY_INLINE Vec128<uint8_t> operator<<(const Vec128<uint8_t> v,
+                                      const Vec128<uint8_t> bits) {
+  return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
+}
+template <size_t N, HWY_IF_LE64(uint8_t, N)>
+HWY_INLINE Vec128<uint8_t, N> operator<<(const Vec128<uint8_t, N> v,
+                                         const Vec128<uint8_t, N> bits) {
+  return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
+}
+
 HWY_INLINE Vec128<uint16_t> operator<<(const Vec128<uint16_t> v,
                                        const Vec128<uint16_t> bits) {
   return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
@@ -905,6 +929,16 @@ HWY_INLINE Vec128<uint64_t, 1> operator<
   return Vec128<uint64_t, 1>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
 }

+HWY_INLINE Vec128<int8_t> operator<<(const Vec128<int8_t> v,
+                                     const Vec128<int8_t> bits) {
+  return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
+}
+template <size_t N, HWY_IF_LE64(int8_t, N)>
+HWY_INLINE Vec128<int8_t, N> operator<<(const Vec128<int8_t, N> v,
+                                        const Vec128<int8_t, N> bits) {
+  return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
+}
+
 HWY_INLINE Vec128<int16_t> operator<<(const Vec128<int16_t> v,
                                       const Vec128<int16_t> bits) {
   return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
@@ -936,6 +970,18 @@ HWY_INLINE Vec128<int64_t, 1> operator<<

 // ------------------------------ Shr (Neg)

+HWY_INLINE Vec128<uint8_t> operator>>(const Vec128<uint8_t> v,
+                                      const Vec128<uint8_t> bits) {
+  const int8x16_t neg_bits = Neg(BitCast(Full128<int8_t>(), bits)).raw;
+  return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
+}
+template <size_t N, HWY_IF_LE64(uint8_t, N)>
+HWY_INLINE Vec128<uint8_t, N> operator>>(const Vec128<uint8_t, N> v,
+                                         const Vec128<uint8_t, N> bits) {
+  const int8x8_t neg_bits = Neg(BitCast(Simd<int8_t, N>(), bits)).raw;
+  return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
+}
+
 HWY_INLINE Vec128<uint16_t> operator>>(const Vec128<uint16_t> v,
                                        const Vec128<uint16_t> bits) {
   const int16x8_t neg_bits = Neg(BitCast(Full128<int16_t>(), bits)).raw;
@@ -971,6 +1017,16 @@ HWY_INLINE Vec128<uint64_t, 1> operator>
   return Vec128<uint64_t, 1>(vshl_u64(v.raw, neg_bits));
 }

+HWY_INLINE Vec128<int8_t> operator>>(const Vec128<int8_t> v,
+                                     const Vec128<int8_t> bits) {
+  return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
+}
+template <size_t N, HWY_IF_LE64(int8_t, N)>
+HWY_INLINE Vec128<int8_t, N> operator>>(const Vec128<int8_t, N> v,
+                                        const Vec128<int8_t, N> bits) {
+  return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
+}
+
 HWY_INLINE Vec128<int16_t> operator>>(const Vec128<int16_t> v,
                                       const Vec128<int16_t> bits) {
   return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
@@ -1059,7 +1115,7 @@ HWY_INLINE Vec128<int32_t, N> operator*(
 HWY_INLINE Vec128<int16_t> MulHigh(const Vec128<int16_t> a,
                                    const Vec128<int16_t> b) {
   int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
 #else
   int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
@@ -1070,7 +1126,7 @@ HWY_INLINE Vec128<int16_t> MulHigh(const
 HWY_INLINE Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a,
                                     const Vec128<uint16_t> b) {
   uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
 #else
   uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
@@ -1139,24 +1195,37 @@ HWY_INLINE Vec128<float, N> ApproximateR
   return Vec128<float, N>(vrecpe_f32(v.raw));
 }

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
 #else
-// Emulated with approx reciprocal + Newton-Raphson + mul
+// Not defined on armv7: approximate
+namespace detail {
+
+HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep(
+    const Vec128<float> recip, const Vec128<float> divisor) {
+  return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
+}
+template <size_t N>
+HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep(
+    const Vec128<float, N> recip, Vec128<float, N> divisor) {
+  return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
+}
+
+}  // namespace detail
+
 template <size_t N>
 HWY_INLINE Vec128<float, N> operator/(const Vec128<float, N> a,
                                       const Vec128<float, N> b) {
   auto x = ApproximateReciprocal(b);
-  // Newton-Raphson on 1/x - b
-  const auto two = Set(Simd<float, N>(), 2);
-  x = x * (two - b * x);
-  x = x * (two - b * x);
-  x = x * (two - b * x);
+  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
+  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
+  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
   return a * x;
 }
 #endif

-// Absolute value of difference.
+// ------------------------------ Absolute value of difference.
+
 HWY_INLINE Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) {
   return Vec128<float>(vabdq_f32(a.raw, b.raw));
 }
@@ -1169,7 +1238,7 @@ HWY_INLINE Vec128<float, N> AbsDiff(cons
 // ------------------------------ Floating-point multiply-add variants

 // Returns add + mul * x
-#if defined(__aarch64__)
+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
 template <size_t N, HWY_IF_LE64(float, N)>
 HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                    const Vec128<float, N> x,
@@ -1180,6 +1249,17 @@ HWY_INLINE Vec128<float> MulAdd(const Ve
                                 const Vec128<float> add) {
   return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
 }
+#else
+// Emulate FMA for floats.
+template <size_t N>
+HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
+                                   const Vec128<float, N> x,
+                                   const Vec128<float, N> add) {
+  return mul * x + add;
+}
+#endif
+
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double, 1> MulAdd(const Vec128<double, 1> mul,
                                     const Vec128<double, 1> x,
                                     const Vec128<double, 1> add) {
@@ -1190,18 +1270,10 @@ HWY_INLINE Vec128<double> MulAdd(const V
                                  const Vec128<double> add) {
   return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
 }
-#else
-// Emulate FMA for floats.
-template <size_t N>
-HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
-                                   const Vec128<float, N> x,
-                                   const Vec128<float, N> add) {
-  return mul * x + add;
-}
 #endif

 // Returns add - mul * x
-#if defined(__aarch64__)
+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
 template <size_t N, HWY_IF_LE64(float, N)>
 HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                       const Vec128<float, N> x,
@@ -1213,7 +1285,17 @@ HWY_INLINE Vec128<float> NegMulAdd(const
                                    const Vec128<float> add) {
   return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
 }
+#else
+// Emulate FMA for floats.
+template <size_t N>
+HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
+                                      const Vec128<float, N> x,
+                                      const Vec128<float, N> add) {
+  return add - mul * x;
+}
+#endif

+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double, 1> NegMulAdd(const Vec128<double, 1> mul,
                                        const Vec128<double, 1> x,
                                        const Vec128<double, 1> add) {
@@ -1224,14 +1306,6 @@ HWY_INLINE Vec128<double> NegMulAdd(cons
                                     const Vec128<double> add) {
   return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
 }
-#else
-// Emulate FMA for floats.
-template <size_t N>
-HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
-                                      const Vec128<float, N> x,
-                                      const Vec128<float, N> add) {
-  return add - mul * x;
-}
 #endif

 // Returns mul * x - sub
@@ -1241,12 +1315,6 @@ HWY_INLINE Vec128<float, N> MulSub(const
                                    const Vec128<float, N> sub) {
   return MulAdd(mul, x, Neg(sub));
 }
-template <size_t N>
-HWY_INLINE Vec128<double, N> MulSub(const Vec128<double, N> mul,
-                                    const Vec128<double, N> x,
-                                    const Vec128<double, N> sub) {
-  return MulAdd(mul, x, Neg(sub));
-}

 // Returns -mul * x - sub
 template <size_t N>
@@ -1255,14 +1323,23 @@ HWY_INLINE Vec128<float, N> NegMulSub(co
                                       const Vec128<float, N> sub) {
   return Neg(MulAdd(mul, x, sub));
 }
+
+#if HWY_ARCH_ARM_A64
+template <size_t N>
+HWY_INLINE Vec128<double, N> MulSub(const Vec128<double, N> mul,
+                                    const Vec128<double, N> x,
+                                    const Vec128<double, N> sub) {
+  return MulAdd(mul, x, Neg(sub));
+}
 template <size_t N>
 HWY_INLINE Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
                                        const Vec128<double, N> x,
                                        const Vec128<double, N> sub) {
   return Neg(MulAdd(mul, x, sub));
 }
+#endif

-// ------------------------------ Floating-point square root
+// ------------------------------ Floating-point square root (IfThenZeroElse)

 // Approximate reciprocal square root
 HWY_INLINE Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) {
@@ -1275,80 +1352,36 @@ HWY_INLINE Vec128<float, N> ApproximateR
 }

 // Full precision square root
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
 #else
-// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt.
-template <size_t N>
-HWY_INLINE Vec128<float, N> Sqrt(const Vec128<float, N> v) {
-  auto b = v;
-  auto Y = ApproximateReciprocalSqrt(v);
-  auto x = v * Y;
-  const auto half = Set(Simd<float, N>(), 0.5);
-  const auto oneandhalf = Set(Simd<float, N>(), 1.5);
-  for (size_t i = 0; i < 3; i++) {
-    b = b * Y * Y;
-    Y = oneandhalf - half * b;
-    x = x * Y;
-  }
-  return IfThenZeroElse(v == Zero(Simd<float, N>()), x);
-}
-#endif
-
-// ================================================== COMPARE
-
-// Comparisons fill a lane with 1-bits if the condition is true, else 0.
+namespace detail {

-template <typename TFrom, typename TTo, size_t N>
-HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask128<TTo, N>{m.raw};
+HWY_INLINE Vec128<float> ReciprocalSqrtStep(const Vec128<float> root,
+                                            const Vec128<float> recip) {
+  return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
+}
+template <size_t N>
+HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(const Vec128<float, N> root,
+                                               Vec128<float, N> recip) {
+  return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
 }

-#define HWY_NEON_BUILD_TPL_HWY_COMPARE
-#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size>
-#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
-  const Vec128<type, size> a, const Vec128<type, size> b
-#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
-
-// ------------------------------ Equality
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
-#if defined(__aarch64__)
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
-#else
-// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
-#endif
+}  // namespace detail

-// ------------------------------ Strict inequality
+// Not defined on armv7: approximate
+template <size_t N>
+HWY_INLINE Vec128<float, N> Sqrt(const Vec128<float, N> v) {
+  auto recip = ApproximateReciprocalSqrt(v);

-// Signed/float < (no unsigned)
-#if defined(__aarch64__)
-HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE)
-#else
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
-#endif
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
+  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
+  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
+  recip *= detail::ReciprocalSqrtStep(v * recip, recip);

-// Signed/float > (no unsigned)
-#if defined(__aarch64__)
-HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE)
-#else
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE)
+  const auto root = v * recip;
+  return IfThenZeroElse(v == Zero(Simd<float, N>()), root);
+}
 #endif
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE)
-
-// ------------------------------ Weak inequality
-
-// Float <= >=
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE)
-
-#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
-#undef HWY_NEON_BUILD_RET_HWY_COMPARE
-#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
-#undef HWY_NEON_BUILD_ARG_HWY_COMPARE

 // ================================================== LOGICAL

@@ -1357,13 +1390,16 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operato
 // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
 template <typename T>
 HWY_INLINE Vec128<T> Not(const Vec128<T> v) {
-  const Full128<uint8_t> d8;
-  return Vec128<T>(vmvnq_u8(BitCast(d8, v).raw));
+  const Full128<T> d;
+  const Repartition<uint8_t, decltype(d)> d8;
+  return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
 }
 template <typename T, size_t N, HWY_IF_LE64(T, N)>
 HWY_INLINE Vec128<T, N> Not(const Vec128<T, N> v) {
-  const Repartition<uint8_t, Simd<T, N>> d8;
-  return Vec128<T, N>(vmvn_u8(BitCast(d8, v).raw));
+  const Simd<T, N> d;
+  const Repartition<uint8_t, decltype(d)> d8;
+  using V8 = decltype(Zero(d8));
+  return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
 }

 // ------------------------------ And
@@ -1463,33 +1499,38 @@ HWY_API Vec128<T, N> BroadcastSignBit(co
   return ShiftRight<sizeof(T) * 8 - 1>(v);
 }

-// ------------------------------ Make mask
+// ================================================== MASK

-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return (v & bit) == bit;
-}
+// ------------------------------ To/from vector

-// Mask and Vec are the same (true = FF..FF).
+// Mask and Vec have the same representation (true = FF..FF).
 template <typename T, size_t N>
 HWY_INLINE Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
-  return Mask128<T, N>(v.raw);
+  const Simd<MakeUnsigned<T>, N> du;
+  return Mask128<T, N>(BitCast(du, v).raw);
 }

+// DEPRECATED
 template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>(v.raw);
+  return BitCast(Simd<T, N>(), Vec128<MakeUnsigned<T>, N>(v.raw));
 }

 template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> VecFromMask(Simd<T, N> /* tag */,
-                                     const Mask128<T, N> v) {
-  return Vec128<T, N>(v.raw);
+HWY_INLINE Vec128<T, N> VecFromMask(Simd<T, N> d, const Mask128<T, N> v) {
+  return BitCast(d, Vec128<MakeUnsigned<T>, N>(v.raw));
 }

-// IfThenElse(mask, yes, no)
-// Returns mask ? b : a.
+// ------------------------------ RebindMask
+
+template <typename TFrom, typename TTo, size_t N>
+HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> dto, Mask128<TFrom, N> m) {
+  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
+  return MaskFromVec(BitCast(dto, VecFromMask(Simd<TFrom, N>(), m)));
+}
+
+// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a.
+
 #define HWY_NEON_BUILD_TPL_HWY_IF
 #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type, size>
 #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size)                 \
@@ -1524,7 +1565,6 @@ HWY_INLINE Vec128<T, N> ZeroIfNegative(V
   return Max(zero, v);
 }

-
 // ------------------------------ Mask logical

 template <typename T, size_t N>
@@ -1557,30 +1597,183 @@ HWY_API Mask128<T, N> Xor(const Mask128<
   return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
 }

-// ------------------------------ Min (IfThenElse, BroadcastSignBit)
+// ================================================== COMPARE

-namespace detail {
+// Comparisons fill a lane with 1-bits if the condition is true, else 0.

-#if defined(__aarch64__)
+// ------------------------------ Shuffle2301 (for i64 compares)

-HWY_INLINE Vec128<uint64_t> Gt(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  return Vec128<uint64_t>(vcgtq_u64(a.raw, b.raw));
+// Swap 32-bit halves in 64-bits
+HWY_INLINE Vec128<uint32_t, 2> Shuffle2301(const Vec128<uint32_t, 2> v) {
+  return Vec128<uint32_t, 2>(vrev64_u32(v.raw));
+}
+HWY_INLINE Vec128<int32_t, 2> Shuffle2301(const Vec128<int32_t, 2> v) {
+  return Vec128<int32_t, 2>(vrev64_s32(v.raw));
+}
+HWY_INLINE Vec128<float, 2> Shuffle2301(const Vec128<float, 2> v) {
+  return Vec128<float, 2>(vrev64_f32(v.raw));
+}
+HWY_INLINE Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
+  return Vec128<uint32_t>(vrev64q_u32(v.raw));
 }
-HWY_INLINE Vec128<uint64_t, 1> Gt(Vec128<uint64_t, 1> a,
-                                  Vec128<uint64_t, 1> b) {
-  return Vec128<uint64_t, 1>(vcgt_u64(a.raw, b.raw));
+HWY_INLINE Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
+  return Vec128<int32_t>(vrev64q_s32(v.raw));
+}
+HWY_INLINE Vec128<float> Shuffle2301(const Vec128<float> v) {
+  return Vec128<float>(vrev64q_f32(v.raw));
 }

-HWY_INLINE Vec128<int64_t> Gt(Vec128<int64_t> a, Vec128<int64_t> b) {
-  return Vec128<int64_t>(vcgtq_s64(a.raw, b.raw));
+#define HWY_NEON_BUILD_TPL_HWY_COMPARE
+#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size>
+#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
+  const Vec128<type, size> a, const Vec128<type, size> b
+#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
+
+// ------------------------------ Equality
+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
+#if HWY_ARCH_ARM_A64
+HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
+#else
+// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
+HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
+#endif
+
+// ------------------------------ Strict inequality (signed, float)
+#if HWY_ARCH_ARM_A64
+HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE)
+#else
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
+#endif
+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
+
+// ------------------------------ Weak inequality (float)
+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
+
+#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
+#undef HWY_NEON_BUILD_RET_HWY_COMPARE
+#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
+#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
+
+// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq)
+
+#if HWY_ARCH_ARM_V7
+
+template <size_t N>
+HWY_INLINE Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
+                                          const Vec128<int64_t, N> b) {
+  const Simd<int32_t, N * 2> d32;
+  const Simd<int64_t, N> d64;
+  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
+  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
+  return MaskFromVec(BitCast(d64, cmp64));
+}
+
+template <size_t N>
+HWY_INLINE Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
+                                           const Vec128<uint64_t, N> b) {
+  const Simd<uint32_t, N * 2> d32;
+  const Simd<uint64_t, N> d64;
+  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
+  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
+  return MaskFromVec(BitCast(d64, cmp64));
 }
-HWY_INLINE Vec128<int64_t, 1> Gt(Vec128<int64_t, 1> a, Vec128<int64_t, 1> b) {
-  return Vec128<int64_t, 1>(vcgt_s64(a.raw, b.raw));
+
+HWY_INLINE Mask128<int64_t> operator<(const Vec128<int64_t> a,
+                                      const Vec128<int64_t> b) {
+  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
+  return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
+}
+HWY_INLINE Mask128<int64_t, 1> operator<(const Vec128<int64_t, 1> a,
+                                         const Vec128<int64_t, 1> b) {
+  const int64x1_t sub = vqsub_s64(a.raw, b.raw);
+  return MaskFromVec(BroadcastSignBit(Vec128<int64_t, 1>(sub)));
 }

 #endif

-}  // namespace detail
+// ------------------------------ Reversed comparisons
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
+  return operator<(b, a);
+}
+template <typename T, size_t N>
+HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
+  return operator<=(b, a);
+}
+
+// ------------------------------ FirstN (Iota, Lt)
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
+  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
+  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
+}
+
+// ------------------------------ TestBit (Eq)
+
+#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
+#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type, size>
+#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
+  Vec128<type, size> v, Vec128<type, size> bit
+#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
+
+#if HWY_ARCH_ARM_A64
+HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
+#else
+// No 64-bit versions on armv7
+HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
+
+template <size_t N>
+HWY_INLINE Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
+                                        Vec128<uint64_t, N> bit) {
+  return (v & bit) == bit;
+}
+template <size_t N>
+HWY_INLINE Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
+                                       Vec128<int64_t, N> bit) {
+  return (v & bit) == bit;
+}
+
+#endif
+#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
+#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
+#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
+#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
+
+// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
+HWY_INLINE Vec128<int64_t> Abs(const Vec128<int64_t> v) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int64_t>(vabsq_s64(v.raw));
+#else
+  const auto zero = Zero(Full128<int64_t>());
+  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
+#endif
+}
+HWY_INLINE Vec128<int64_t, 1> Abs(const Vec128<int64_t, 1> v) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int64_t, 1>(vabs_s64(v.raw));
+#else
+  const auto zero = Zero(Simd<int64_t, 1>());
+  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
+#endif
+}
+
+// ------------------------------ Min (IfThenElse, BroadcastSignBit)
+
+#if HWY_ARCH_ARM_A64
+
+HWY_INLINE Mask128<uint64_t> operator<(Vec128<uint64_t> a, Vec128<uint64_t> b) {
+  return Mask128<uint64_t>(vcltq_u64(a.raw, b.raw));
+}
+HWY_INLINE Mask128<uint64_t, 1> operator<(Vec128<uint64_t, 1> a,
+                                          Vec128<uint64_t, 1> b) {
+  return Mask128<uint64_t, 1>(vclt_u64(a.raw, b.raw));
+}
+
+#endif

 // Unsigned
 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
@@ -1588,8 +1781,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min,
 template <size_t N>
 HWY_INLINE Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
                                    const Vec128<uint64_t, N> b) {
-#if defined(__aarch64__)
-  return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a);
+#if HWY_ARCH_ARM_A64
+  return IfThenElse(b < a, b, a);
 #else
   const Simd<uint64_t, N> du;
   const Simd<int64_t, N> di;
@@ -1603,8 +1796,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, v
 template <size_t N>
 HWY_INLINE Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
                                   const Vec128<int64_t, N> b) {
-#if defined(__aarch64__)
-  return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a);
+#if HWY_ARCH_ARM_A64
+  return IfThenElse(b < a, b, a);
 #else
   const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
   return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
@@ -1612,7 +1805,7 @@ HWY_INLINE Vec128<int64_t, N> Min(const
 }

 // Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN.
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2)
 #else
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
@@ -1626,8 +1819,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max,
 template <size_t N>
 HWY_INLINE Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
                                    const Vec128<uint64_t, N> b) {
-#if defined(__aarch64__)
-  return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b);
+#if HWY_ARCH_ARM_A64
+  return IfThenElse(b < a, a, b);
 #else
   const Simd<uint64_t, N> du;
   const Simd<int64_t, N> di;
@@ -1641,8 +1834,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, v
 template <size_t N>
 HWY_INLINE Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
                                   const Vec128<int64_t, N> b) {
-#if defined(__aarch64__)
-  return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b);
+#if HWY_ARCH_ARM_A64
+  return IfThenElse(b < a, a, b);
 #else
   const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
   return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
@@ -1650,7 +1843,7 @@ HWY_INLINE Vec128<int64_t, N> Max(const
 }

 // Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN.
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2)
 #else
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
@@ -1696,7 +1889,7 @@ HWY_INLINE Vec128<float> LoadU(Full128<f
                                const float* HWY_RESTRICT aligned) {
   return Vec128<float>(vld1q_f32(aligned));
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double> LoadU(Full128<double> /* tag */,
                                 const double* HWY_RESTRICT aligned) {
   return Vec128<double>(vld1q_f64(aligned));
@@ -1741,7 +1934,7 @@ HWY_INLINE Vec128<float, 2> LoadU(Simd<f
                                   const float* HWY_RESTRICT p) {
   return Vec128<float, 2>(vld1_f32(p));
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double, 1> LoadU(Simd<double, 1> /* tag */,
                                    const double* HWY_RESTRICT p) {
   return Vec128<double, 1>(vld1_f64(p));
@@ -1755,73 +1948,72 @@ HWY_INLINE Vec128<double, 1> LoadU(Simd<
 // we don't actually care what is in it, and we don't want
 // to introduce extra overhead by initializing it to something.

-HWY_INLINE Vec128<uint8_t, 4> LoadU(Simd<uint8_t, 4> d,
+HWY_INLINE Vec128<uint8_t, 4> LoadU(Simd<uint8_t, 4> /*tag*/,
                                     const uint8_t* HWY_RESTRICT p) {
-  uint32x2_t a = Undefined(d).raw;
+  uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
   uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
   return Vec128<uint8_t, 4>(vreinterpret_u8_u32(b));
 }
-HWY_INLINE Vec128<uint16_t, 2> LoadU(Simd<uint16_t, 2> d,
+HWY_INLINE Vec128<uint16_t, 2> LoadU(Simd<uint16_t, 2> /*tag*/,
                                      const uint16_t* HWY_RESTRICT p) {
-  uint32x2_t a = Undefined(d).raw;
+  uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
   uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
   return Vec128<uint16_t, 2>(vreinterpret_u16_u32(b));
 }
-HWY_INLINE Vec128<uint32_t, 1> LoadU(Simd<uint32_t, 1> d,
+HWY_INLINE Vec128<uint32_t, 1> LoadU(Simd<uint32_t, 1> /*tag*/,
                                      const uint32_t* HWY_RESTRICT p) {
-  uint32x2_t a = Undefined(d).raw;
+  uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
   uint32x2_t b = vld1_lane_u32(p, a, 0);
   return Vec128<uint32_t, 1>(b);
 }
-HWY_INLINE Vec128<int8_t, 4> LoadU(Simd<int8_t, 4> d,
+HWY_INLINE Vec128<int8_t, 4> LoadU(Simd<int8_t, 4> /*tag*/,
                                    const int8_t* HWY_RESTRICT p) {
-  int32x2_t a = Undefined(d).raw;
+  int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
   int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
   return Vec128<int8_t, 4>(vreinterpret_s8_s32(b));
 }
-HWY_INLINE Vec128<int16_t, 2> LoadU(Simd<int16_t, 2> d,
+HWY_INLINE Vec128<int16_t, 2> LoadU(Simd<int16_t, 2> /*tag*/,
                                     const int16_t* HWY_RESTRICT p) {
-  int32x2_t a = Undefined(d).raw;
+  int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
   int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
   return Vec128<int16_t, 2>(vreinterpret_s16_s32(b));
 }
-HWY_INLINE Vec128<int32_t, 1> LoadU(Simd<int32_t, 1> d,
+HWY_INLINE Vec128<int32_t, 1> LoadU(Simd<int32_t, 1> /*tag*/,
                                     const int32_t* HWY_RESTRICT p) {
-  int32x2_t a = Undefined(d).raw;
+  int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
   int32x2_t b = vld1_lane_s32(p, a, 0);
   return Vec128<int32_t, 1>(b);
 }
-HWY_INLINE Vec128<float, 1> LoadU(Simd<float, 1> d,
+HWY_INLINE Vec128<float, 1> LoadU(Simd<float, 1> /*tag*/,
                                   const float* HWY_RESTRICT p) {
-  float32x2_t a = Undefined(d).raw;
+  float32x2_t a = Undefined(Simd<float, 2>()).raw;
   float32x2_t b = vld1_lane_f32(p, a, 0);
   return Vec128<float, 1>(b);
 }

 // ------------------------------ Load 16

-HWY_INLINE Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2> d,
+HWY_INLINE Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2> /*tag*/,
                                     const uint8_t* HWY_RESTRICT p) {
-  uint16x4_t a = Undefined(d).raw;
+  uint16x4_t a = Undefined(Simd<uint16_t, 4>()).raw;
   uint16x4_t b = vld1_lane_u16(reinterpret_cast<const uint16_t*>(p), a, 0);
   return Vec128<uint8_t, 2>(vreinterpret_u8_u16(b));
 }
-HWY_INLINE Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1> d,
+HWY_INLINE Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1> /*tag*/,
                                      const uint16_t* HWY_RESTRICT p) {
-  uint16x4_t a = Undefined(d).raw;
+  uint16x4_t a = Undefined(Simd<uint16_t, 4>()).raw;
   uint16x4_t b = vld1_lane_u16(p, a, 0);
   return Vec128<uint16_t, 1>(b);
 }
-
-HWY_INLINE Vec128<int8_t, 2> LoadU(Simd<int8_t, 2> d,
+HWY_INLINE Vec128<int8_t, 2> LoadU(Simd<int8_t, 2> /*tag*/,
                                    const int8_t* HWY_RESTRICT p) {
-  int16x4_t a = Undefined(d).raw;
+  int16x4_t a = Undefined(Simd<int16_t, 4>()).raw;
   int16x4_t b = vld1_lane_s16(reinterpret_cast<const int16_t*>(p), a, 0);
   return Vec128<int8_t, 2>(vreinterpret_s8_s16(b));
 }
-HWY_INLINE Vec128<int16_t, 1> LoadU(Simd<int16_t, 1> d,
+HWY_INLINE Vec128<int16_t, 1> LoadU(Simd<int16_t, 1> /*tag*/,
                                     const int16_t* HWY_RESTRICT p) {
-  int16x4_t a = Undefined(d).raw;
+  int16x4_t a = Undefined(Simd<int16_t, 4>()).raw;
   int16x4_t b = vld1_lane_s16(p, a, 0);
   return Vec128<int16_t, 1>(b);
 }
@@ -1902,7 +2094,7 @@ HWY_INLINE void StoreU(const Vec128<floa
                        float* HWY_RESTRICT aligned) {
   vst1q_f32(aligned, v.raw);
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE void StoreU(const Vec128<double> v, Full128<double> /* tag */,
                        double* HWY_RESTRICT aligned) {
   vst1q_f64(aligned, v.raw);
@@ -1947,7 +2139,7 @@ HWY_INLINE void StoreU(const Vec128<floa
                        float* HWY_RESTRICT p) {
   vst1_f32(p, v.raw);
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE void StoreU(const Vec128<double, 1> v, Simd<double, 1> /* tag */,
                        double* HWY_RESTRICT p) {
   vst1_f64(p, v.raw);
@@ -1959,12 +2151,12 @@ HWY_INLINE void StoreU(const Vec128<doub
 HWY_INLINE void StoreU(const Vec128<uint8_t, 4> v, Simd<uint8_t, 4>,
                        uint8_t* HWY_RESTRICT p) {
   uint32x2_t a = vreinterpret_u32_u8(v.raw);
-  vst1_lane_u32(p, a, 0);
+  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
 }
 HWY_INLINE void StoreU(const Vec128<uint16_t, 2> v, Simd<uint16_t, 2>,
                        uint16_t* HWY_RESTRICT p) {
   uint32x2_t a = vreinterpret_u32_u16(v.raw);
-  vst1_lane_u32(p, a, 0);
+  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
 }
 HWY_INLINE void StoreU(const Vec128<uint32_t, 1> v, Simd<uint32_t, 1>,
                        uint32_t* HWY_RESTRICT p) {
@@ -1973,12 +2165,12 @@ HWY_INLINE void StoreU(const Vec128<uint
 HWY_INLINE void StoreU(const Vec128<int8_t, 4> v, Simd<int8_t, 4>,
                        int8_t* HWY_RESTRICT p) {
   int32x2_t a = vreinterpret_s32_s8(v.raw);
-  vst1_lane_s32(p, a, 0);
+  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
 }
 HWY_INLINE void StoreU(const Vec128<int16_t, 2> v, Simd<int16_t, 2>,
                        int16_t* HWY_RESTRICT p) {
   int32x2_t a = vreinterpret_s32_s16(v.raw);
-  vst1_lane_s32(p, a, 0);
+  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
 }
 HWY_INLINE void StoreU(const Vec128<int32_t, 1> v, Simd<int32_t, 1>,
                        int32_t* HWY_RESTRICT p) {
@@ -1994,7 +2186,7 @@ HWY_INLINE void StoreU(const Vec128<floa
 HWY_INLINE void StoreU(const Vec128<uint8_t, 2> v, Simd<uint8_t, 2>,
                        uint8_t* HWY_RESTRICT p) {
   uint16x4_t a = vreinterpret_u16_u8(v.raw);
-  vst1_lane_u16(p, a, 0);
+  vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);
 }
 HWY_INLINE void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1>,
                        uint16_t* HWY_RESTRICT p) {
@@ -2003,7 +2195,7 @@ HWY_INLINE void StoreU(const Vec128<uint
 HWY_INLINE void StoreU(const Vec128<int8_t, 2> v, Simd<int8_t, 2>,
                        int8_t* HWY_RESTRICT p) {
   int16x4_t a = vreinterpret_s16_s8(v.raw);
-  vst1_lane_s16(p, a, 0);
+  vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);
 }
 HWY_INLINE void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1>,
                        int16_t* HWY_RESTRICT p) {
@@ -2068,18 +2260,18 @@ HWY_INLINE Vec128<uint64_t> PromoteTo(Fu
                                       const Vec128<uint32_t, 2> v) {
   return Vec128<uint64_t>(vmovl_u32(v.raw));
 }
-HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */,
+HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> d,
                                      const Vec128<uint8_t, 8> v) {
-  return Vec128<int16_t>(vmovl_u8(v.raw));
+  return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
 }
-HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
+HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> d,
                                      const Vec128<uint8_t, 4> v) {
   uint16x8_t a = vmovl_u8(v.raw);
-  return Vec128<int32_t>(vreinterpretq_s32_u16(vmovl_u16(vget_low_u16(a))));
+  return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
 }
-HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
+HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> d,
                                      const Vec128<uint16_t, 4> v) {
-  return Vec128<int32_t>(vmovl_u16(v.raw));
+  return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
 }

 // Unsigned: zero-extend to half vector.
@@ -2105,9 +2297,9 @@ HWY_INLINE Vec128<uint64_t, N> PromoteTo
   return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
 }
 template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
+HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> d,
                                         const Vec128<uint8_t, N> v) {
-  return Vec128<int16_t, N>(vget_low_s16(vmovl_u8(v.raw)));
+  return BitCast(d, Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))));
 }
 template <size_t N, HWY_IF_LE64(int32_t, N)>
 HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
@@ -2170,12 +2362,14 @@ HWY_INLINE Vec128<int64_t, N> PromoteTo(

 HWY_INLINE Vec128<float> PromoteTo(Full128<float> /* tag */,
                                    const Vec128<float16_t, 4> v) {
-  return Vec128<float>(vcvt_f32_f16(v.raw));
+  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
+  return Vec128<float>(f32);
 }
 template <size_t N>
 HWY_INLINE Vec128<float, N> PromoteTo(Simd<float, N> /* tag */,
                                       const Vec128<float16_t, N> v) {
-  return Vec128<float, N>(vget_low_f32(vcvt_f32_f16(v.raw)));
+  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
+  return Vec128<float, N>(vget_low_f32(f32));
 }

 #else
@@ -2204,7 +2398,7 @@ HWY_INLINE Vec128<float, N> PromoteTo(Si

 #endif

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64

 HWY_INLINE Vec128<double> PromoteTo(Full128<double> /* tag */,
                                     const Vec128<float, 2> v) {
@@ -2298,12 +2492,13 @@ HWY_INLINE Vec128<int8_t, N> DemoteTo(Si

 HWY_INLINE Vec128<float16_t, 4> DemoteTo(Simd<float16_t, 4> /* tag */,
                                          const Vec128<float> v) {
-  return Vec128<float16_t, 4>{vcvt_f16_f32(v.raw)};
+  return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
 }
 template <size_t N>
 HWY_INLINE Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> /* tag */,
                                          const Vec128<float, N> v) {
-  return Vec128<float16_t, N>{vcvt_f16_f32(vcombine_f32(v.raw, v.raw))};
+  const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
+  return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
 }

 #else
@@ -2339,7 +2534,7 @@ HWY_INLINE Vec128<float16_t, N> DemoteTo
 }

 #endif
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64

 HWY_INLINE Vec128<float, 2> DemoteTo(Simd<float, 2> /* tag */,
                                      const Vec128<double> v) {
@@ -2397,7 +2592,7 @@ HWY_INLINE Vec128<int8_t, N> DemoteTo(Si
                                       const Vec128<int32_t> v) {
   Vec128<int16_t, N> a = DemoteTo(Simd<int16_t, N>(), v);
   Vec128<int16_t, N> b;
-  uint16x8_t c = vcombine_s16(a.raw, b.raw);
+  int16x8_t c = vcombine_s16(a.raw, b.raw);
   return Vec128<int8_t, N>(vqmovn_s16(c));
 }

@@ -2426,7 +2621,7 @@ HWY_INLINE Vec128<int32_t, N> ConvertTo(
   return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
 }

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64

 HWY_INLINE Vec128<double> ConvertTo(Full128<double> /* tag */,
                                     const Vec128<int64_t> v) {
@@ -2451,7 +2646,7 @@ HWY_INLINE Vec128<int64_t, 1> ConvertTo(

 // ------------------------------ Round (IfThenElse, mask, logical)

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 // Toward nearest integer
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)

@@ -2472,18 +2667,26 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor,
 // representation, clearing the lowest 23-exp mantissa bits. This requires 9
 // integer operations and 3 constants, which is likely more expensive.

+namespace detail {
+
+// The original value is already the desired result if NaN or the magnitude is
+// large (i.e. the value is already an integer).
+template <size_t N>
+HWY_API Mask128<float, N> UseInt(const Vec128<float, N> v) {
+  return Abs(v) < Set(Simd<float, N>(), MantissaEnd<float>());
+}
+
+}  // namespace detail
+
 template <size_t N>
 HWY_INLINE Vec128<float, N> Trunc(const Vec128<float, N> v) {
   const Simd<float, N> df;
-  const Simd<int32_t, N> di;
+  const RebindToSigned<decltype(df)> di;

   const auto integer = ConvertTo(di, v);  // round toward 0
   const auto int_f = ConvertTo(df, integer);

-  // The original value is already the desired result if NaN or the magnitude is
-  // large (i.e. the value is already an integer).
-  const auto max = Set(df, MantissaEnd<float>());
-  return IfThenElse(Abs(v) < max, int_f, v);
+  return IfThenElse(detail::UseInt(v), int_f, v);
 }

 template <size_t N>
@@ -2506,7 +2709,7 @@ HWY_INLINE Vec128<float, N> Round(const
 template <size_t N>
 HWY_INLINE Vec128<float, N> Ceil(const Vec128<float, N> v) {
   const Simd<float, N> df;
-  const Simd<int32_t, N> di;
+  const RebindToSigned<decltype(df)> di;

   const auto integer = ConvertTo(di, v);  // round toward 0
   const auto int_f = ConvertTo(df, integer);
@@ -2514,9 +2717,7 @@ HWY_INLINE Vec128<float, N> Ceil(const V
   // Truncating a positive non-integer ends up smaller; if so, add 1.
   const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));

-  // Keep original if NaN or the magnitude is large (already an int).
-  const auto max = Set(df, MantissaEnd<float>());
-  return IfThenElse(Abs(v) < max, int_f - neg1, v);
+  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
 }

 template <size_t N>
@@ -2530,16 +2731,14 @@ HWY_INLINE Vec128<float, N> Floor(const
   // Truncating a negative non-integer ends up larger; if so, subtract 1.
   const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));

-  // Keep original if NaN or the magnitude is large (already an int).
-  const auto max = Set(df, MantissaEnd<float>());
-  return IfThenElse(Abs(v) < max, int_f + neg1, v);
+  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
 }

 #endif

 // ------------------------------ NearestInt (Round)

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64

 HWY_INLINE Vec128<int32_t> NearestInt(const Vec128<float> v) {
   return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
@@ -2596,7 +2795,7 @@ HWY_INLINE Vec128<int64_t, 1> LowerHalf(
 HWY_INLINE Vec128<float, 2> LowerHalf(const Vec128<float> v) {
   return Vec128<float, 2>(vget_low_f32(v.raw));
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double, 1> LowerHalf(const Vec128<double> v) {
   return Vec128<double, 1>(vget_low_f64(v.raw));
 }
@@ -2629,7 +2828,7 @@ HWY_INLINE Vec128<int64_t, 1> UpperHalf(
 HWY_INLINE Vec128<float, 2> UpperHalf(const Vec128<float> v) {
   return Vec128<float, 2>(vget_high_f32(v.raw));
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double, 1> UpperHalf(const Vec128<double> v) {
   return Vec128<double, 1>(vget_high_f64(v.raw));
 }
@@ -2714,7 +2913,7 @@ HWY_INLINE Vec128<T, N> ShiftRightLanes(

 // ------------------------------ Broadcast/splat any lane

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 // Unsigned
 template <int kLane>
 HWY_INLINE Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
@@ -2886,7 +3085,7 @@ HWY_API Vec128<T> TableLookupBytes(const
                                    const Vec128<T> from) {
   const Full128<T> d;
   const Repartition<uint8_t, decltype(d)> d8;
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
                                                BitCast(d8, from).raw)));
 #else
@@ -2911,33 +3110,58 @@ HWY_INLINE Vec128<T, N> TableLookupBytes
                                                 BitCast(d8, from).raw)));
 }

-// ------------------------------ Hard-coded shuffles
+// ------------------------------ TableLookupLanes

-// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
-// Shuffle0321 rotates one lane to the right (the previous least-significant
-// lane is now most-significant). These could also be implemented via
-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
+// Returned by SetTableIndices for use by TableLookupLanes.
+template <typename T, size_t N>
+struct Indices128 {
+  typename detail::Raw128<T, N>::type raw;
+};

-// Swap 32-bit halves in 64-bits
-HWY_INLINE Vec128<uint32_t, 2> Shuffle2301(const Vec128<uint32_t, 2> v) {
-  return Vec128<uint32_t, 2>(vrev64_u32(v.raw));
-}
-HWY_INLINE Vec128<int32_t, 2> Shuffle2301(const Vec128<int32_t, 2> v) {
-  return Vec128<int32_t, 2>(vrev64_s32(v.raw));
-}
-HWY_INLINE Vec128<float, 2> Shuffle2301(const Vec128<float, 2> v) {
-  return Vec128<float, 2>(vrev64_f32(v.raw));
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
+#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
+  for (size_t i = 0; i < N; ++i) {
+    HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
+  }
+#endif
+
+  const Repartition<uint8_t, decltype(d)> d8;
+  alignas(16) uint8_t control[16] = {0};
+  for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
+    for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+      control[idx_lane * sizeof(T) + idx_byte] =
+          static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
+    }
+  }
+  return Indices128<T, N>{BitCast(d, Load(d8, control)).raw};
 }
-HWY_INLINE Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>(vrev64q_u32(v.raw));
+
+template <size_t N>
+HWY_INLINE Vec128<uint32_t, N> TableLookupLanes(
+    const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
+  return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
 }
-HWY_INLINE Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
-  return Vec128<int32_t>(vrev64q_s32(v.raw));
+template <size_t N>
+HWY_INLINE Vec128<int32_t, N> TableLookupLanes(
+    const Vec128<int32_t, N> v, const Indices128<int32_t, N> idx) {
+  return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
 }
-HWY_INLINE Vec128<float> Shuffle2301(const Vec128<float> v) {
-  return Vec128<float>(vrev64q_f32(v.raw));
+template <size_t N>
+HWY_INLINE Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
+                                             const Indices128<float, N> idx) {
+  const Simd<int32_t, N> di;
+  const auto idx_i = BitCast(di, Vec128<float, N>{idx.raw});
+  return BitCast(Simd<float, N>(), TableLookupBytes(BitCast(di, v), idx_i));
 }

+// ------------------------------ Other shuffles (TableLookupBytes)
+
+// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
+// Shuffle0321 rotates one lane to the right (the previous least-significant
+// lane is now most-significant). These could also be implemented via
+// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
+
 // Swap 64-bit halves
 template <typename T>
 HWY_INLINE Vec128<T> Shuffle1032(const Vec128<T> v) {
@@ -2975,49 +3199,6 @@ HWY_INLINE Vec128<T> Shuffle0123(const V
   return TableLookupBytes(v, BitCast(d, Load(d8, bytes)));
 }

-// ------------------------------ TableLookupLanes
-
-// Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T>
-struct Indices128 {
-  uint8x16_t raw;
-};
-
-template <typename T>
-HWY_INLINE Indices128<T> SetTableIndices(const Full128<T>, const int32_t* idx) {
-#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
-  const size_t N = 16 / sizeof(T);
-  for (size_t i = 0; i < N; ++i) {
-    HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
-  }
-#endif
-
-  const Full128<uint8_t> d8;
-  alignas(16) uint8_t control[16];
-  for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
-    const size_t idx_lane = idx_byte / sizeof(T);
-    const size_t mod = idx_byte % sizeof(T);
-    control[idx_byte] = idx[idx_lane] * sizeof(T) + mod;
-  }
-  return Indices128<T>{Load(d8, control).raw};
-}
-
-HWY_INLINE Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
-                                             const Indices128<uint32_t> idx) {
-  return TableLookupBytes(v, Vec128<uint32_t>(idx.raw));
-}
-HWY_INLINE Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
-                                            const Indices128<int32_t> idx) {
-  return TableLookupBytes(v, Vec128<int32_t>(idx.raw));
-}
-HWY_INLINE Vec128<float> TableLookupLanes(const Vec128<float> v,
-                                          const Indices128<float> idx) {
-  const Full128<int32_t> di;
-  const Full128<float> df;
-  return BitCast(df,
-                 TableLookupBytes(BitCast(di, v), Vec128<int32_t>(idx.raw)));
-}
-
 // ------------------------------ Interleave lanes

 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
@@ -3029,7 +3210,7 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Inter
 HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 // For 64 bit types, we only have the "q" version of the function defined as
 // interleaving 64-wide registers with 64-wide types in them makes no sense.
 HWY_INLINE Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
@@ -3079,7 +3260,7 @@ HWY_INLINE Vec128<float> InterleaveLower
                                          const Vec128<float> b) {
   return Vec128<float>(vzip1q_f32(a.raw, b.raw));
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double> InterleaveLower(const Vec128<double> a,
                                           const Vec128<double> b) {
   return Vec128<double>(vzip1q_f64(a.raw, b.raw));
@@ -3090,10 +3271,10 @@ HWY_INLINE Vec128<float> InterleaveUpper
                                          const Vec128<float> b) {
   return Vec128<float>(vzip2q_f32(a.raw, b.raw));
 }
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<double> InterleaveUpper(const Vec128<double> a,
                                           const Vec128<double> b) {
-  return Vec128<double>(vzip2q_s64(a.raw, b.raw));
+  return Vec128<double>(vzip2q_f64(a.raw, b.raw));
 }
 #endif

@@ -3105,119 +3286,125 @@ HWY_INLINE Vec128<double> InterleaveUppe
 // Full vectors
 HWY_INLINE Vec128<uint16_t> ZipLower(const Vec128<uint8_t> a,
                                      const Vec128<uint8_t> b) {
-  return Vec128<uint16_t>(vzip1q_u8(a.raw, b.raw));
+  return Vec128<uint16_t>(vreinterpretq_u16_u8(vzip1q_u8(a.raw, b.raw)));
 }
 HWY_INLINE Vec128<uint32_t> ZipLower(const Vec128<uint16_t> a,
                                      const Vec128<uint16_t> b) {
-  return Vec128<uint32_t>(vzip1q_u16(a.raw, b.raw));
+  return Vec128<uint32_t>(vreinterpretq_u32_u16(vzip1q_u16(a.raw, b.raw)));
 }
 HWY_INLINE Vec128<uint64_t> ZipLower(const Vec128<uint32_t> a,
                                      const Vec128<uint32_t> b) {
-  return Vec128<uint64_t>(vzip1q_u32(a.raw, b.raw));
+  return Vec128<uint64_t>(vreinterpretq_u64_u32(vzip1q_u32(a.raw, b.raw)));
 }

 HWY_INLINE Vec128<int16_t> ZipLower(const Vec128<int8_t> a,
                                     const Vec128<int8_t> b) {
-  return Vec128<int16_t>(vzip1q_s8(a.raw, b.raw));
+  return Vec128<int16_t>(vreinterpretq_s16_s8(vzip1q_s8(a.raw, b.raw)));
 }
 HWY_INLINE Vec128<int32_t> ZipLower(const Vec128<int16_t> a,
                                     const Vec128<int16_t> b) {
-  return Vec128<int32_t>(vzip1q_s16(a.raw, b.raw));
+  return Vec128<int32_t>(vreinterpretq_s32_s16(vzip1q_s16(a.raw, b.raw)));
 }
 HWY_INLINE Vec128<int64_t> ZipLower(const Vec128<int32_t> a,
                                     const Vec128<int32_t> b) {
-  return Vec128<int64_t>(vzip1q_s32(a.raw, b.raw));
+  return Vec128<int64_t>(vreinterpretq_s64_s32(vzip1q_s32(a.raw, b.raw)));
 }

 HWY_INLINE Vec128<uint16_t> ZipUpper(const Vec128<uint8_t> a,
                                      const Vec128<uint8_t> b) {
-  return Vec128<uint16_t>(vzip2q_u8(a.raw, b.raw));
+  return Vec128<uint16_t>(vreinterpretq_u16_u8(vzip2q_u8(a.raw, b.raw)));
 }
 HWY_INLINE Vec128<uint32_t> ZipUpper(const Vec128<uint16_t> a,
                                      const Vec128<uint16_t> b) {
-  return Vec128<uint32_t>(vzip2q_u16(a.raw, b.raw));
+  return Vec128<uint32_t>(vreinterpretq_u32_u16(vzip2q_u16(a.raw, b.raw)));
 }
 HWY_INLINE Vec128<uint64_t> ZipUpper(const Vec128<uint32_t> a,
                                      const Vec128<uint32_t> b) {
-  return Vec128<uint64_t>(vzip2q_u32(a.raw, b.raw));
+  return Vec128<uint64_t>(vreinterpretq_u64_u32(vzip2q_u32(a.raw, b.raw)));
 }

 HWY_INLINE Vec128<int16_t> ZipUpper(const Vec128<int8_t> a,
                                     const Vec128<int8_t> b) {
-  return Vec128<int16_t>(vzip2q_s8(a.raw, b.raw));
+  return Vec128<int16_t>(vreinterpretq_s16_s8(vzip2q_s8(a.raw, b.raw)));
 }
 HWY_INLINE Vec128<int32_t> ZipUpper(const Vec128<int16_t> a,
                                     const Vec128<int16_t> b) {
-  return Vec128<int32_t>(vzip2q_s16(a.raw, b.raw));
+  return Vec128<int32_t>(vreinterpretq_s32_s16(vzip2q_s16(a.raw, b.raw)));
 }
 HWY_INLINE Vec128<int64_t> ZipUpper(const Vec128<int32_t> a,
                                     const Vec128<int32_t> b) {
-  return Vec128<int64_t>(vzip2q_s32(a.raw, b.raw));
+  return Vec128<int64_t>(vreinterpretq_s64_s32(vzip2q_s32(a.raw, b.raw)));
 }

 // Half vectors or less
 template <size_t N, HWY_IF_LE64(uint8_t, N)>
 HWY_INLINE Vec128<uint16_t, (N + 1) / 2> ZipLower(const Vec128<uint8_t, N> a,
                                                   const Vec128<uint8_t, N> b) {
-  return Vec128<uint16_t, (N + 1) / 2>(vzip1_u8(a.raw, b.raw));
+  return Vec128<uint16_t, (N + 1) / 2>(
+      vreinterpret_u16_u8(vzip1_u8(a.raw, b.raw)));
 }
 template <size_t N, HWY_IF_LE64(uint16_t, N)>
 HWY_INLINE Vec128<uint32_t, (N + 1) / 2> ZipLower(const Vec128<uint16_t, N> a,
                                                   const Vec128<uint16_t, N> b) {
-  return Vec128<uint32_t, (N + 1) / 2>(vzip1_u16(a.raw, b.raw));
+  return Vec128<uint32_t, (N + 1) / 2>(
+      vreinterpret_u32_u16(vzip1_u16(a.raw, b.raw)));
 }
 template <size_t N, HWY_IF_LE64(uint32_t, N)>
 HWY_INLINE Vec128<uint64_t, (N + 1) / 2> ZipLower(const Vec128<uint32_t, N> a,
                                                   const Vec128<uint32_t, N> b) {
-  return Vec128<uint64_t, (N + 1) / 2>(vzip1_u32(a.raw, b.raw));
+  return Vec128<uint64_t, (N + 1) / 2>(
+      vreinterpret_u64_u32(vzip1_u32(a.raw, b.raw)));
 }

 template <size_t N, HWY_IF_LE64(int8_t, N)>
 HWY_INLINE Vec128<int16_t, (N + 1) / 2> ZipLower(const Vec128<int8_t, N> a,
                                                  const Vec128<int8_t, N> b) {
-  return Vec128<int16_t, (N + 1) / 2>(vzip1_s8(a.raw, b.raw));
+  return Vec128<int16_t, (N + 1) / 2>(
+      vreinterpret_s16_s8(vzip1_s8(a.raw, b.raw)));
 }
 template <size_t N, HWY_IF_LE64(int16_t, N)>
 HWY_INLINE Vec128<int32_t, (N + 1) / 2> ZipLower(const Vec128<int16_t, N> a,
                                                  const Vec128<int16_t, N> b) {
-  return Vec128<int32_t, (N + 1) / 2>(vzip1_s16(a.raw, b.raw));
+  return Vec128<int32_t, (N + 1) / 2>(
+      vreinterpret_s32_s16(vzip1_s16(a.raw, b.raw)));
 }
 template <size_t N, HWY_IF_LE64(int32_t, N)>
 HWY_INLINE Vec128<int64_t, (N + 1) / 2> ZipLower(const Vec128<int32_t, N> a,
                                                  const Vec128<int32_t, N> b) {
-  return Vec128<int64_t, (N + 1) / 2>(vzip1_s32(a.raw, b.raw));
+  return Vec128<int64_t, (N + 1) / 2>(
+      vreinterpret_s64_s32(vzip1_s32(a.raw, b.raw)));
 }

 template <size_t N, HWY_IF_LE64(uint8_t, N)>
 HWY_INLINE Vec128<uint16_t, N / 2> ZipUpper(const Vec128<uint8_t, N> a,
                                             const Vec128<uint8_t, N> b) {
-  return Vec128<uint16_t, N / 2>(vzip2_u8(a.raw, b.raw));
+  return Vec128<uint16_t, N / 2>(vreinterpret_u16_u8(vzip2_u8(a.raw, b.raw)));
 }
 template <size_t N, HWY_IF_LE64(uint16_t, N)>
 HWY_INLINE Vec128<uint32_t, N / 2> ZipUpper(const Vec128<uint16_t, N> a,
                                             const Vec128<uint16_t, N> b) {
-  return Vec128<uint32_t, N / 2>(vzip2_u16(a.raw, b.raw));
+  return Vec128<uint32_t, N / 2>(vreinterpret_u32_u16(vzip2_u16(a.raw, b.raw)));
 }
 template <size_t N, HWY_IF_LE64(uint32_t, N)>
 HWY_INLINE Vec128<uint64_t, N / 2> ZipUpper(const Vec128<uint32_t, N> a,
                                             const Vec128<uint32_t, N> b) {
-  return Vec128<uint64_t, N / 2>(vzip2_u32(a.raw, b.raw));
+  return Vec128<uint64_t, N / 2>(vreinterpret_u64_u32(vzip2_u32(a.raw, b.raw)));
 }

 template <size_t N, HWY_IF_LE64(int8_t, N)>
 HWY_INLINE Vec128<int16_t, N / 2> ZipUpper(const Vec128<int8_t, N> a,
                                            const Vec128<int8_t, N> b) {
-  return Vec128<int16_t, N / 2>(vzip2_s8(a.raw, b.raw));
+  return Vec128<int16_t, N / 2>(vreinterpret_s16_s8(vzip2_s8(a.raw, b.raw)));
 }
 template <size_t N, HWY_IF_LE64(int16_t, N)>
 HWY_INLINE Vec128<int32_t, N / 2> ZipUpper(const Vec128<int16_t, N> a,
                                            const Vec128<int16_t, N> b) {
-  return Vec128<int32_t, N / 2>(vzip2_s16(a.raw, b.raw));
+  return Vec128<int32_t, N / 2>(vreinterpret_s32_s16(vzip2_s16(a.raw, b.raw)));
 }
 template <size_t N, HWY_IF_LE64(int32_t, N)>
 HWY_INLINE Vec128<int64_t, N / 2> ZipUpper(const Vec128<int32_t, N> a,
                                            const Vec128<int32_t, N> b) {
-  return Vec128<int64_t, N / 2>(vzip2_s32(a.raw, b.raw));
+  return Vec128<int64_t, N / 2>(vreinterpret_s64_s32(vzip2_s32(a.raw, b.raw)));
 }

 // ------------------------------ Blocks
@@ -3274,84 +3461,113 @@ HWY_INLINE Vec128<T> OddEven(const Vec12

 // ================================================== MISC

-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, size_t N, typename T2>
-Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
-  HWY_ALIGN T lanes[16 / sizeof(T)];
-  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
-    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
+// ------------------------------ Scatter (Store)
+
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+                           const Vec128<Offset, N> offset) {
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+
+  alignas(16) T lanes[N];
+  Store(v, d, lanes);
+
+  alignas(16) Offset offset_lanes[N];
+  Store(offset, Simd<Offset, N>(), offset_lanes);
+
+  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
+  for (size_t i = 0; i < N; ++i) {
+    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
+  }
+}
+
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+                          const Vec128<Index, N> index) {
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+
+  alignas(16) T lanes[N];
+  Store(v, d, lanes);
+
+  alignas(16) Index index_lanes[N];
+  Store(index, Simd<Index, N>(), index_lanes);
+
+  for (size_t i = 0; i < N; ++i) {
+    base[index_lanes[i]] = lanes[i];
   }
-  return Load(d, lanes);
 }

-// ------------------------------ Gather (requires GetLane)
+// ------------------------------ Gather (Load/Store)

 template <typename T, size_t N, typename Offset>
 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
                                   const T* HWY_RESTRICT base,
                                   const Vec128<Offset, N> offset) {
-  static_assert(N == 1, "NEON does not support full gather");
-  static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
-  const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
-  T val;
-  CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
-  return Set(d, val);
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+
+  alignas(16) Offset offset_lanes[N];
+  Store(offset, Simd<Offset, N>(), offset_lanes);
+
+  alignas(16) T lanes[N];
+  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
+  for (size_t i = 0; i < N; ++i) {
+    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
+  }
+  return Load(d, lanes);
 }

 template <typename T, size_t N, typename Index>
 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
                                  const Vec128<Index, N> index) {
-  static_assert(N == 1, "NEON does not support full gather");
-  static_assert(sizeof(T) == sizeof(Index), "T must match Index");
-  return Set(d, base[GetLane(index)]);
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+
+  alignas(16) Index index_lanes[N];
+  Store(index, Simd<Index, N>(), index_lanes);
+
+  alignas(16) T lanes[N];
+  for (size_t i = 0; i < N; ++i) {
+    lanes[i] = base[index_lanes[i]];
+  }
+  return Load(d, lanes);
 }

-// ------------------------------ ARMv7 int64 comparisons (requires Shuffle2301)
+// ------------------------------ Reductions

-#if !defined(__aarch64__)
+namespace detail {

-template <size_t N>
-HWY_INLINE Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
-                                          const Vec128<int64_t, N> b) {
-  const Simd<int32_t, N * 2> d32;
-  const Simd<int64_t, N> d64;
-  const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b));
-  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
-  return MaskFromVec(BitCast(d64, cmp64));
+// N=1 for any T: no-op
+template <typename T>
+HWY_API Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
+  return v;
 }
-
-template <size_t N>
-HWY_INLINE Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
-                                           const Vec128<uint64_t, N> b) {
-  const Simd<uint32_t, N * 2> d32;
-  const Simd<uint64_t, N> d64;
-  const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b));
-  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
-  return MaskFromVec(BitCast(d64, cmp64));
+template <typename T>
+HWY_API Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+                                const Vec128<T, 1> v) {
+  return v;
+}
+template <typename T>
+HWY_API Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+                                const Vec128<T, 1> v) {
+  return v;
 }

-HWY_INLINE Mask128<int64_t> operator<(const Vec128<int64_t> a,
-                                      const Vec128<int64_t> b) {
-  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
-  return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
+// u32/i32/f32: N=2
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
+  return v10 + Shuffle2301(v10);
 }
-HWY_INLINE Mask128<int64_t, 1> operator<(const Vec128<int64_t, 1> a,
-                                         const Vec128<int64_t, 1> b) {
-  const int64x1_t sub = vqsub_s64(a.raw, b.raw);
-  return MaskFromVec(BroadcastSignBit(Vec128<int64_t, 1>(sub)));
+template <typename T>
+HWY_API Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
+                                const Vec128<T, 2> v10) {
+  return Min(v10, Shuffle2301(v10));
 }
-
-template <size_t N>
-HWY_INLINE Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
-                                         const Vec128<int64_t, N> b) {
-  return b < a;
+template <typename T>
+HWY_API Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
+                                const Vec128<T, 2> v10) {
+  return Max(v10, Shuffle2301(v10));
 }
-#endif
-
-// ------------------------------ Reductions

-#if defined(__aarch64__)
-// Supported for 32b and 64b vector types. Returns the sum in each lane.
+// full vectors
+#if HWY_ARCH_ARM_A64
 HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
   return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
 }
@@ -3398,20 +3614,15 @@ HWY_INLINE Vec128<int64_t> SumOfLanes(co
 }
 #endif

-namespace detail {
-
-// For u32/i32/f32.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
   const Vec128<T> v1032 = Shuffle1032(v3210);
   const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
   return Min(v20_31_20_31, v31_20_31_20);
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
   const Vec128<T> v1032 = Shuffle1032(v3210);
   const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
@@ -3419,15 +3630,13 @@ HWY_API Vec128<T, N> MaxOfLanes(hwy::Siz
 }

 // For u64/i64[/f64].
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
   const Vec128<T> v01 = Shuffle01(v10);
   return Min(v10, v01);
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
   const Vec128<T> v01 = Shuffle01(v10);
   return Max(v10, v01);
 }
@@ -3435,6 +3644,10 @@ HWY_API Vec128<T, N> MaxOfLanes(hwy::Siz
 }  // namespace detail

 template <typename T, size_t N>
+HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
+  return detail::SumOfLanes(v);
+}
+template <typename T, size_t N>
 HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
   return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
 }
@@ -3457,18 +3670,18 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
   const Vec128<uint8_t> values =
       BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   // Can't vaddv - we need two separate bytes (16 bits).
   const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
   const uint8x8_t x4 = vpadd_u8(x2, x2);
   const uint8x8_t x8 = vpadd_u8(x4, x4);
-  return vreinterpret_u16_u8(x8)[0];
+  return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
 #else
   // Don't have vpaddq, so keep doubling lane size.
   const uint16x8_t x2 = vpaddlq_u8(values.raw);
   const uint32x4_t x4 = vpaddlq_u16(x2);
   const uint64x2_t x8 = vpaddlq_u32(x4);
-  return (uint64_t(x8[1]) << 8) | x8[0];
+  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
 #endif
 }

@@ -3484,7 +3697,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
   const Vec128<uint8_t, N> slice(Load(Simd<uint8_t, 8>(), kSliceLanes).raw);
   const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return vaddv_u8(values.raw);
 #else
   const uint16x4_t x2 = vpaddl_u8(values.raw);
@@ -3503,7 +3716,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
   const Full128<uint16_t> du;
   const Vec128<uint16_t> values =
       BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return vaddvq_u16(values.raw);
 #else
   const uint32x4_t x2 = vpaddlq_u16(values.raw);
@@ -3522,7 +3735,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
   const Simd<uint16_t, N> du;
   const Vec128<uint16_t, N> slice(Load(Simd<uint16_t, 4>(), kSliceLanes).raw);
   const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return vaddv_u16(values.raw);
 #else
   const uint32x2_t x2 = vpaddl_u16(values.raw);
@@ -3539,7 +3752,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
   const Full128<uint32_t> du;
   const Vec128<uint32_t> values =
       BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return vaddvq_u32(values.raw);
 #else
   const uint64x2_t x2 = vpaddlq_u32(values.raw);
@@ -3557,7 +3770,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
   const Simd<uint32_t, N> du;
   const Vec128<uint32_t, N> slice(Load(Simd<uint32_t, 2>(), kSliceLanes).raw);
   const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return vaddv_u32(values.raw);
 #else
   const uint64x1_t x2 = vpaddl_u32(values.raw);
@@ -3572,7 +3785,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
   const Full128<uint64_t> du;
   const Vec128<uint64_t> values =
       BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return vaddvq_u64(values.raw);
 #else
   return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
@@ -3612,13 +3825,13 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag
   const int8x16_t ones =
       vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return vaddvq_s8(ones);
 #else
   const int16x8_t x2 = vpaddlq_s8(ones);
   const int32x4_t x4 = vpaddlq_s16(x2);
   const int64x2_t x8 = vpaddlq_s32(x4);
-  return x8[0] + x8[1];
+  return vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1);
 #endif
 }
 template <typename T>
@@ -3627,12 +3840,12 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag
   const int16x8_t ones =
       vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return vaddvq_s16(ones);
 #else
   const int32x4_t x2 = vpaddlq_s16(ones);
   const int64x2_t x4 = vpaddlq_s32(x2);
-  return x4[0] + x4[1];
+  return vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1);
 #endif
 }

@@ -3642,26 +3855,26 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag
   const int32x4_t ones =
       vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   return vaddvq_s32(ones);
 #else
   const int64x2_t x2 = vpaddlq_s32(ones);
-  return x2[0] + x2[1];
+  return vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1);
 #endif
 }

 template <typename T>
 HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
-#if defined(__aarch64__)
+#if HWY_ARCH_ARM_A64
   const Full128<int64_t> di;
   const int64x2_t ones =
       vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
   return vaddvq_s64(ones);
 #else
-  const Full128<int64_t> di;
-  const int64x2_t ones =
-      vshrq_n_u64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw, 63);
-  return ones[0] + ones[1];
+  const Full128<uint64_t> du;
+  const auto mask_u = VecFromMask(du, RebindMask(du, mask));
+  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
+  return vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1);
 #endif
 }

@@ -3690,9 +3903,15 @@ HWY_INLINE size_t StoreMaskBits(const Ma
 // Full
 template <typename T>
 HWY_INLINE bool AllFalse(const Mask128<T> m) {
+#if HWY_ARCH_ARM_A64
+  const Full128<uint32_t> d32;
+  const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(Full128<T>(), m)));
+  return (vmaxvq_u32(m32.raw) == 0);
+#else
   const auto v64 = BitCast(Full128<uint64_t>(), VecFromMask(Full128<T>(), m));
   uint32x2_t a = vqmovn_u64(v64.raw);
-  return vreinterpret_u64_u32(a)[0] == 0;
+  return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0;
+#endif
 }

 // Partial
@@ -3711,8 +3930,160 @@ HWY_INLINE bool AllTrue(const Mask128<T,

 namespace detail {

+// Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
+HWY_INLINE Vec128<uint8_t> Load8Bytes(Full128<uint8_t> /*d*/,
+                                      const uint8_t* bytes) {
+  return Vec128<uint8_t>(vreinterpretq_u8_u64(
+      vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
+}
+
+// Load 8 bytes and return half-reg with N <= 8 bytes.
+template <size_t N, HWY_IF_LE64(uint8_t, N)>
+HWY_INLINE Vec128<uint8_t, N> Load8Bytes(Simd<uint8_t, N> d,
+                                         const uint8_t* bytes) {
+  return Load(d, bytes);
+}
+
 template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
+HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
+                                    const uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 256);
+  const Simd<T, N> d;
+  const Repartition<uint8_t, decltype(d)> d8;
+  const Simd<uint16_t, N> du;
+
+  // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
+  // indices for VTBL (one vector's worth for each of 256 combinations of
+  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
+  // store lane indices and convert to byte indices (2*lane + 0..1), with the
+  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
+  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
+  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
+  // is likely more costly than the higher cache footprint from storing bytes.
+  alignas(16) constexpr uint8_t table[256 * 8] = {
+      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,
+      0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,
+      0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  2,  4,  0,  0,  0,  0,
+      0,  0,  0,  2,  4,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,
+      0,  6,  0,  0,  0,  0,  0,  0,  2,  6,  0,  0,  0,  0,  0,  0,  0,  2,
+      6,  0,  0,  0,  0,  0,  4,  6,  0,  0,  0,  0,  0,  0,  0,  4,  6,  0,
+      0,  0,  0,  0,  2,  4,  6,  0,  0,  0,  0,  0,  0,  2,  4,  6,  0,  0,
+      0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0,  0,
+      2,  8,  0,  0,  0,  0,  0,  0,  0,  2,  8,  0,  0,  0,  0,  0,  4,  8,
+      0,  0,  0,  0,  0,  0,  0,  4,  8,  0,  0,  0,  0,  0,  2,  4,  8,  0,
+      0,  0,  0,  0,  0,  2,  4,  8,  0,  0,  0,  0,  6,  8,  0,  0,  0,  0,
+      0,  0,  0,  6,  8,  0,  0,  0,  0,  0,  2,  6,  8,  0,  0,  0,  0,  0,
+      0,  2,  6,  8,  0,  0,  0,  0,  4,  6,  8,  0,  0,  0,  0,  0,  0,  4,
+      6,  8,  0,  0,  0,  0,  2,  4,  6,  8,  0,  0,  0,  0,  0,  2,  4,  6,
+      8,  0,  0,  0,  10, 0,  0,  0,  0,  0,  0,  0,  0,  10, 0,  0,  0,  0,
+      0,  0,  2,  10, 0,  0,  0,  0,  0,  0,  0,  2,  10, 0,  0,  0,  0,  0,
+      4,  10, 0,  0,  0,  0,  0,  0,  0,  4,  10, 0,  0,  0,  0,  0,  2,  4,
+      10, 0,  0,  0,  0,  0,  0,  2,  4,  10, 0,  0,  0,  0,  6,  10, 0,  0,
+      0,  0,  0,  0,  0,  6,  10, 0,  0,  0,  0,  0,  2,  6,  10, 0,  0,  0,
+      0,  0,  0,  2,  6,  10, 0,  0,  0,  0,  4,  6,  10, 0,  0,  0,  0,  0,
+      0,  4,  6,  10, 0,  0,  0,  0,  2,  4,  6,  10, 0,  0,  0,  0,  0,  2,
+      4,  6,  10, 0,  0,  0,  8,  10, 0,  0,  0,  0,  0,  0,  0,  8,  10, 0,
+      0,  0,  0,  0,  2,  8,  10, 0,  0,  0,  0,  0,  0,  2,  8,  10, 0,  0,
+      0,  0,  4,  8,  10, 0,  0,  0,  0,  0,  0,  4,  8,  10, 0,  0,  0,  0,
+      2,  4,  8,  10, 0,  0,  0,  0,  0,  2,  4,  8,  10, 0,  0,  0,  6,  8,
+      10, 0,  0,  0,  0,  0,  0,  6,  8,  10, 0,  0,  0,  0,  2,  6,  8,  10,
+      0,  0,  0,  0,  0,  2,  6,  8,  10, 0,  0,  0,  4,  6,  8,  10, 0,  0,
+      0,  0,  0,  4,  6,  8,  10, 0,  0,  0,  2,  4,  6,  8,  10, 0,  0,  0,
+      0,  2,  4,  6,  8,  10, 0,  0,  12, 0,  0,  0,  0,  0,  0,  0,  0,  12,
+      0,  0,  0,  0,  0,  0,  2,  12, 0,  0,  0,  0,  0,  0,  0,  2,  12, 0,
+      0,  0,  0,  0,  4,  12, 0,  0,  0,  0,  0,  0,  0,  4,  12, 0,  0,  0,
+      0,  0,  2,  4,  12, 0,  0,  0,  0,  0,  0,  2,  4,  12, 0,  0,  0,  0,
+      6,  12, 0,  0,  0,  0,  0,  0,  0,  6,  12, 0,  0,  0,  0,  0,  2,  6,
+      12, 0,  0,  0,  0,  0,  0,  2,  6,  12, 0,  0,  0,  0,  4,  6,  12, 0,
+      0,  0,  0,  0,  0,  4,  6,  12, 0,  0,  0,  0,  2,  4,  6,  12, 0,  0,
+      0,  0,  0,  2,  4,  6,  12, 0,  0,  0,  8,  12, 0,  0,  0,  0,  0,  0,
+      0,  8,  12, 0,  0,  0,  0,  0,  2,  8,  12, 0,  0,  0,  0,  0,  0,  2,
+      8,  12, 0,  0,  0,  0,  4,  8,  12, 0,  0,  0,  0,  0,  0,  4,  8,  12,
+      0,  0,  0,  0,  2,  4,  8,  12, 0,  0,  0,  0,  0,  2,  4,  8,  12, 0,
+      0,  0,  6,  8,  12, 0,  0,  0,  0,  0,  0,  6,  8,  12, 0,  0,  0,  0,
+      2,  6,  8,  12, 0,  0,  0,  0,  0,  2,  6,  8,  12, 0,  0,  0,  4,  6,
+      8,  12, 0,  0,  0,  0,  0,  4,  6,  8,  12, 0,  0,  0,  2,  4,  6,  8,
+      12, 0,  0,  0,  0,  2,  4,  6,  8,  12, 0,  0,  10, 12, 0,  0,  0,  0,
+      0,  0,  0,  10, 12, 0,  0,  0,  0,  0,  2,  10, 12, 0,  0,  0,  0,  0,
+      0,  2,  10, 12, 0,  0,  0,  0,  4,  10, 12, 0,  0,  0,  0,  0,  0,  4,
+      10, 12, 0,  0,  0,  0,  2,  4,  10, 12, 0,  0,  0,  0,  0,  2,  4,  10,
+      12, 0,  0,  0,  6,  10, 12, 0,  0,  0,  0,  0,  0,  6,  10, 12, 0,  0,
+      0,  0,  2,  6,  10, 12, 0,  0,  0,  0,  0,  2,  6,  10, 12, 0,  0,  0,
+      4,  6,  10, 12, 0,  0,  0,  0,  0,  4,  6,  10, 12, 0,  0,  0,  2,  4,
+      6,  10, 12, 0,  0,  0,  0,  2,  4,  6,  10, 12, 0,  0,  8,  10, 12, 0,
+      0,  0,  0,  0,  0,  8,  10, 12, 0,  0,  0,  0,  2,  8,  10, 12, 0,  0,
+      0,  0,  0,  2,  8,  10, 12, 0,  0,  0,  4,  8,  10, 12, 0,  0,  0,  0,
+      0,  4,  8,  10, 12, 0,  0,  0,  2,  4,  8,  10, 12, 0,  0,  0,  0,  2,
+      4,  8,  10, 12, 0,  0,  6,  8,  10, 12, 0,  0,  0,  0,  0,  6,  8,  10,
+      12, 0,  0,  0,  2,  6,  8,  10, 12, 0,  0,  0,  0,  2,  6,  8,  10, 12,
+      0,  0,  4,  6,  8,  10, 12, 0,  0,  0,  0,  4,  6,  8,  10, 12, 0,  0,
+      2,  4,  6,  8,  10, 12, 0,  0,  0,  2,  4,  6,  8,  10, 12, 0,  14, 0,
+      0,  0,  0,  0,  0,  0,  0,  14, 0,  0,  0,  0,  0,  0,  2,  14, 0,  0,
+      0,  0,  0,  0,  0,  2,  14, 0,  0,  0,  0,  0,  4,  14, 0,  0,  0,  0,
+      0,  0,  0,  4,  14, 0,  0,  0,  0,  0,  2,  4,  14, 0,  0,  0,  0,  0,
+      0,  2,  4,  14, 0,  0,  0,  0,  6,  14, 0,  0,  0,  0,  0,  0,  0,  6,
+      14, 0,  0,  0,  0,  0,  2,  6,  14, 0,  0,  0,  0,  0,  0,  2,  6,  14,
+      0,  0,  0,  0,  4,  6,  14, 0,  0,  0,  0,  0,  0,  4,  6,  14, 0,  0,
+      0,  0,  2,  4,  6,  14, 0,  0,  0,  0,  0,  2,  4,  6,  14, 0,  0,  0,
+      8,  14, 0,  0,  0,  0,  0,  0,  0,  8,  14, 0,  0,  0,  0,  0,  2,  8,
+      14, 0,  0,  0,  0,  0,  0,  2,  8,  14, 0,  0,  0,  0,  4,  8,  14, 0,
+      0,  0,  0,  0,  0,  4,  8,  14, 0,  0,  0,  0,  2,  4,  8,  14, 0,  0,
+      0,  0,  0,  2,  4,  8,  14, 0,  0,  0,  6,  8,  14, 0,  0,  0,  0,  0,
+      0,  6,  8,  14, 0,  0,  0,  0,  2,  6,  8,  14, 0,  0,  0,  0,  0,  2,
+      6,  8,  14, 0,  0,  0,  4,  6,  8,  14, 0,  0,  0,  0,  0,  4,  6,  8,
+      14, 0,  0,  0,  2,  4,  6,  8,  14, 0,  0,  0,  0,  2,  4,  6,  8,  14,
+      0,  0,  10, 14, 0,  0,  0,  0,  0,  0,  0,  10, 14, 0,  0,  0,  0,  0,
+      2,  10, 14, 0,  0,  0,  0,  0,  0,  2,  10, 14, 0,  0,  0,  0,  4,  10,
+      14, 0,  0,  0,  0,  0,  0,  4,  10, 14, 0,  0,  0,  0,  2,  4,  10, 14,
+      0,  0,  0,  0,  0,  2,  4,  10, 14, 0,  0,  0,  6,  10, 14, 0,  0,  0,
+      0,  0,  0,  6,  10, 14, 0,  0,  0,  0,  2,  6,  10, 14, 0,  0,  0,  0,
+      0,  2,  6,  10, 14, 0,  0,  0,  4,  6,  10, 14, 0,  0,  0,  0,  0,  4,
+      6,  10, 14, 0,  0,  0,  2,  4,  6,  10, 14, 0,  0,  0,  0,  2,  4,  6,
+      10, 14, 0,  0,  8,  10, 14, 0,  0,  0,  0,  0,  0,  8,  10, 14, 0,  0,
+      0,  0,  2,  8,  10, 14, 0,  0,  0,  0,  0,  2,  8,  10, 14, 0,  0,  0,
+      4,  8,  10, 14, 0,  0,  0,  0,  0,  4,  8,  10, 14, 0,  0,  0,  2,  4,
+      8,  10, 14, 0,  0,  0,  0,  2,  4,  8,  10, 14, 0,  0,  6,  8,  10, 14,
+      0,  0,  0,  0,  0,  6,  8,  10, 14, 0,  0,  0,  2,  6,  8,  10, 14, 0,
+      0,  0,  0,  2,  6,  8,  10, 14, 0,  0,  4,  6,  8,  10, 14, 0,  0,  0,
+      0,  4,  6,  8,  10, 14, 0,  0,  2,  4,  6,  8,  10, 14, 0,  0,  0,  2,
+      4,  6,  8,  10, 14, 0,  12, 14, 0,  0,  0,  0,  0,  0,  0,  12, 14, 0,
+      0,  0,  0,  0,  2,  12, 14, 0,  0,  0,  0,  0,  0,  2,  12, 14, 0,  0,
+      0,  0,  4,  12, 14, 0,  0,  0,  0,  0,  0,  4,  12, 14, 0,  0,  0,  0,
+      2,  4,  12, 14, 0,  0,  0,  0,  0,  2,  4,  12, 14, 0,  0,  0,  6,  12,
+      14, 0,  0,  0,  0,  0,  0,  6,  12, 14, 0,  0,  0,  0,  2,  6,  12, 14,
+      0,  0,  0,  0,  0,  2,  6,  12, 14, 0,  0,  0,  4,  6,  12, 14, 0,  0,
+      0,  0,  0,  4,  6,  12, 14, 0,  0,  0,  2,  4,  6,  12, 14, 0,  0,  0,
+      0,  2,  4,  6,  12, 14, 0,  0,  8,  12, 14, 0,  0,  0,  0,  0,  0,  8,
+      12, 14, 0,  0,  0,  0,  2,  8,  12, 14, 0,  0,  0,  0,  0,  2,  8,  12,
+      14, 0,  0,  0,  4,  8,  12, 14, 0,  0,  0,  0,  0,  4,  8,  12, 14, 0,
+      0,  0,  2,  4,  8,  12, 14, 0,  0,  0,  0,  2,  4,  8,  12, 14, 0,  0,
+      6,  8,  12, 14, 0,  0,  0,  0,  0,  6,  8,  12, 14, 0,  0,  0,  2,  6,
+      8,  12, 14, 0,  0,  0,  0,  2,  6,  8,  12, 14, 0,  0,  4,  6,  8,  12,
+      14, 0,  0,  0,  0,  4,  6,  8,  12, 14, 0,  0,  2,  4,  6,  8,  12, 14,
+      0,  0,  0,  2,  4,  6,  8,  12, 14, 0,  10, 12, 14, 0,  0,  0,  0,  0,
+      0,  10, 12, 14, 0,  0,  0,  0,  2,  10, 12, 14, 0,  0,  0,  0,  0,  2,
+      10, 12, 14, 0,  0,  0,  4,  10, 12, 14, 0,  0,  0,  0,  0,  4,  10, 12,
+      14, 0,  0,  0,  2,  4,  10, 12, 14, 0,  0,  0,  0,  2,  4,  10, 12, 14,
+      0,  0,  6,  10, 12, 14, 0,  0,  0,  0,  0,  6,  10, 12, 14, 0,  0,  0,
+      2,  6,  10, 12, 14, 0,  0,  0,  0,  2,  6,  10, 12, 14, 0,  0,  4,  6,
+      10, 12, 14, 0,  0,  0,  0,  4,  6,  10, 12, 14, 0,  0,  2,  4,  6,  10,
+      12, 14, 0,  0,  0,  2,  4,  6,  10, 12, 14, 0,  8,  10, 12, 14, 0,  0,
+      0,  0,  0,  8,  10, 12, 14, 0,  0,  0,  2,  8,  10, 12, 14, 0,  0,  0,
+      0,  2,  8,  10, 12, 14, 0,  0,  4,  8,  10, 12, 14, 0,  0,  0,  0,  4,
+      8,  10, 12, 14, 0,  0,  2,  4,  8,  10, 12, 14, 0,  0,  0,  2,  4,  8,
+      10, 12, 14, 0,  6,  8,  10, 12, 14, 0,  0,  0,  0,  6,  8,  10, 12, 14,
+      0,  0,  2,  6,  8,  10, 12, 14, 0,  0,  0,  2,  6,  8,  10, 12, 14, 0,
+      4,  6,  8,  10, 12, 14, 0,  0,  0,  4,  6,  8,  10, 12, 14, 0,  2,  4,
+      6,  8,  10, 12, 14, 0,  0,  2,  4,  6,  8,  10, 12, 14};
+
+  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
+  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+  return BitCast(d, pairs + Set(du, 0x0100));
+}
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
+                                    const uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 16);

   // There are only 4 lanes, so we can afford to load the index vector directly.
@@ -3742,7 +4113,8 @@ HWY_INLINE Vec128<T, N> Idx32x4FromBits(
 #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64

 template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
+HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
+                                    const uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 4);

   // There are only 2 lanes, so we can afford to load the index vector directly.
@@ -3761,59 +4133,15 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(

 // Helper function called by both Compress and CompressStore - avoids a
 // redundant BitsFromMask in the latter.
-
-template <size_t N>
-HWY_API Vec128<uint32_t, N> Compress(Vec128<uint32_t, N> v,
-                                     const uint64_t mask_bits) {
-  const auto idx = detail::Idx32x4FromBits<uint32_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> Compress(Vec128<int32_t, N> v,
-                                    const uint64_t mask_bits) {
-  const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-}
-
-#if HWY_CAP_INTEGER64
-
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Compress(Vec128<uint64_t, N> v,
-                                     const uint64_t mask_bits) {
-  const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> Compress(Vec128<int64_t, N> v,
-                                    const uint64_t mask_bits) {
-  const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-}
-
-#endif
-
-template <size_t N>
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v,
-                                  const uint64_t mask_bits) {
-  const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
-  const Simd<float, N> df;
-  const Simd<int32_t, N> di;
-  return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
-}
-
-#if HWY_CAP_FLOAT64
-
-template <size_t N>
-HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
-                                   const uint64_t mask_bits) {
-  const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
-  const Simd<double, N> df;
-  const Simd<int64_t, N> di;
-  return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
+  const auto idx =
+      detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
+  using D = Simd<T, N>;
+  const RebindToSigned<D> di;
+  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
 }

-#endif
-
 }  // namespace detail

 template <typename T, size_t N>
@@ -3831,6 +4159,79 @@ HWY_API size_t CompressStore(Vec128<T, N
   return PopCount(mask_bits);
 }

+// ------------------------------ StoreInterleaved3
+
+// 128 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
+                               const Vec128<uint8_t> v1,
+                               const Vec128<uint8_t> v2,
+                               Full128<uint8_t> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw};
+  vst3q_u8(unaligned, triple);
+}
+
+// 64 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
+                               const Vec128<uint8_t, 8> v1,
+                               const Vec128<uint8_t, 8> v2,
+                               Simd<uint8_t, 8> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
+  vst3_u8(unaligned, triple);
+}
+
+// <= 32 bits: avoid writing more than N bytes by copying to buffer
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
+                               const Vec128<uint8_t, N> v1,
+                               const Vec128<uint8_t, N> v2,
+                               Simd<uint8_t, N> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  alignas(16) uint8_t buf[24];
+  const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
+  vst3_u8(buf, triple);
+  CopyBytes<N * 3>(buf, unaligned);
+}
+
+// ------------------------------ StoreInterleaved4
+
+// 128 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
+                               const Vec128<uint8_t> v1,
+                               const Vec128<uint8_t> v2,
+                               const Vec128<uint8_t> v3,
+                               Full128<uint8_t> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
+  vst4q_u8(unaligned, quad);
+}
+
+// 64 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> v0,
+                               const Vec128<uint8_t, 8> v1,
+                               const Vec128<uint8_t, 8> v2,
+                               const Vec128<uint8_t, 8> v3,
+                               Simd<uint8_t, 8> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
+  vst4_u8(unaligned, quad);
+}
+
+// <= 32 bits: avoid writing more than N bytes by copying to buffer
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> v0,
+                               const Vec128<uint8_t, N> v1,
+                               const Vec128<uint8_t, N> v2,
+                               const Vec128<uint8_t, N> v3,
+                               Simd<uint8_t, N> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  alignas(16) uint8_t buf[32];
+  const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
+  vst4_u8(buf, quad);
+  CopyBytes<N * 4>(buf, unaligned);
+}
+
 // ================================================== Operator wrapper

 // These apply to all x86_*-inl.h because there are no restrictions on V.
@@ -3885,7 +4286,8 @@ HWY_API auto Le(V a, V b) -> decltype(a
   return a <= b;
 }

-#if !defined(__aarch64__)
+namespace detail {  // for code folding
+#if HWY_ARCH_ARM_V7
 #undef vuzp1_s8
 #undef vuzp1_u8
 #undef vuzp1_s16
@@ -3972,6 +4374,7 @@ HWY_API auto Le(V a, V b) -> decltype(a
 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
 #undef HWY_NEON_DEF_FUNCTION_UINTS
 #undef HWY_NEON_EVAL
+}  // namespace detail

 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12	2021-06-02 10:56:05.230904367 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -39,6 +39,11 @@ using TFromV = TFromD<DFromV<V>>;
   hwy::EnableIf<IsSigned<TFromV<V>>() && !IsFloat<TFromV<V>>()>* = nullptr
 #define HWY_IF_FLOAT_V(V) hwy::EnableIf<IsFloat<TFromV<V>>()>* = nullptr

+// kShift = log2 of multiplier: 0 for m1, 1 for m2, -2 for mf4
+template <typename T, int kShift = 0>
+using Full = Simd<T, (kShift < 0) ? (HWY_LANES(T) >> (-kShift))
+                                  : (HWY_LANES(T) << kShift)>;
+
 // ================================================== MACROS

 // Generate specializations and function definitions using X macros. Although
@@ -58,29 +63,30 @@ namespace detail {  // for code folding

 // For given SEW, iterate over all LMUL. Precompute SEW/LMUL => MLEN because the
 // preprocessor cannot easily do it.
-#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \
-  X_MACRO(BASE, CHAR, 8, 1, 8, NAME, OP)                  \
-  X_MACRO(BASE, CHAR, 8, 2, 4, NAME, OP)                  \
-  X_MACRO(BASE, CHAR, 8, 4, 2, NAME, OP)                  \
-  X_MACRO(BASE, CHAR, 8, 8, 1, NAME, OP)
-
-#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \
-  X_MACRO(BASE, CHAR, 16, 1, 16, NAME, OP)                \
-  X_MACRO(BASE, CHAR, 16, 2, 8, NAME, OP)                 \
-  X_MACRO(BASE, CHAR, 16, 4, 4, NAME, OP)                 \
-  X_MACRO(BASE, CHAR, 16, 8, 2, NAME, OP)
-
-#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \
-  X_MACRO(BASE, CHAR, 32, 1, 32, NAME, OP)                \
-  X_MACRO(BASE, CHAR, 32, 2, 16, NAME, OP)                \
-  X_MACRO(BASE, CHAR, 32, 4, 8, NAME, OP)                 \
-  X_MACRO(BASE, CHAR, 32, 8, 4, NAME, OP)
-
-#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \
-  X_MACRO(BASE, CHAR, 64, 1, 64, NAME, OP)                \
-  X_MACRO(BASE, CHAR, 64, 2, 32, NAME, OP)                \
-  X_MACRO(BASE, CHAR, 64, 4, 16, NAME, OP)                \
-  X_MACRO(BASE, CHAR, 64, 8, 8, NAME, OP)
+// TODO(janwas): GCC does not yet support fractional LMUL
+#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP)        \
+  X_MACRO(BASE, CHAR, 8, m1, /*kShift=*/0, /*MLEN=*/8, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, m2, /*kShift=*/1, /*MLEN=*/4, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, m4, /*kShift=*/2, /*MLEN=*/2, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, m8, /*kShift=*/3, /*MLEN=*/1, NAME, OP)
+
+#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP)          \
+  X_MACRO(BASE, CHAR, 16, m1, /*kShift=*/0, /*MLEN=*/16, NAME, OP) \
+  X_MACRO(BASE, CHAR, 16, m2, /*kShift=*/1, /*MLEN=*/8, NAME, OP)  \
+  X_MACRO(BASE, CHAR, 16, m4, /*kShift=*/2, /*MLEN=*/4, NAME, OP)  \
+  X_MACRO(BASE, CHAR, 16, m8, /*kShift=*/3, /*MLEN=*/2, NAME, OP)
+
+#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP)          \
+  X_MACRO(BASE, CHAR, 32, m1, /*kShift=*/0, /*MLEN=*/32, NAME, OP) \
+  X_MACRO(BASE, CHAR, 32, m2, /*kShift=*/1, /*MLEN=*/16, NAME, OP) \
+  X_MACRO(BASE, CHAR, 32, m4, /*kShift=*/2, /*MLEN=*/8, NAME, OP)  \
+  X_MACRO(BASE, CHAR, 32, m8, /*kShift=*/3, /*MLEN=*/4, NAME, OP)
+
+#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP)          \
+  X_MACRO(BASE, CHAR, 64, m1, /*kShift=*/0, /*MLEN=*/64, NAME, OP) \
+  X_MACRO(BASE, CHAR, 64, m2, /*kShift=*/1, /*MLEN=*/32, NAME, OP) \
+  X_MACRO(BASE, CHAR, 64, m4, /*kShift=*/2, /*MLEN=*/16, NAME, OP) \
+  X_MACRO(BASE, CHAR, 64, m8, /*kShift=*/3, /*MLEN=*/8, NAME, OP)

 // SEW for unsigned:
 #define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP) \
@@ -153,63 +159,61 @@ namespace detail {  // for code folding

 // Assemble types for use in x-macros
 #define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
-#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##m##LMUL
-#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##m##LMUL##_t
+#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##LMUL
+#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
 #define HWY_RVV_M(MLEN) vbool##MLEN##_t

 }  // namespace detail

 // TODO(janwas): remove typedefs and only use HWY_RVV_V etc. directly

-// TODO(janwas): do we want fractional LMUL? (can encode as negative)
-// Mixed-precision code can use LMUL 1..8 and that should be enough unless they
-// need many registers.
-#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)             \
-  using HWY_RVV_D(CHAR, SEW, LMUL) =                                      \
-      Simd<HWY_RVV_T(BASE, SEW), HWY_LANES(HWY_RVV_T(BASE, SEW)) * LMUL>; \
-  using V##CHAR##SEW##m##LMUL = HWY_RVV_V(BASE, SEW, LMUL);               \
-  template <>                                                             \
-  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> {                           \
-    using Lane = HWY_RVV_T(BASE, SEW);                                    \
-    using type = Simd<Lane, HWY_LANES(Lane) * LMUL>;                      \
+// Until we have full intrinsic support for fractional LMUL, mixed-precision
+// code can use LMUL 1..8 (adequate unless they need many registers).
+#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)    \
+  using HWY_RVV_D(CHAR, SEW, LMUL) = Full<HWY_RVV_T(BASE, SEW), SHIFT>; \
+  using V##CHAR##SEW##LMUL = HWY_RVV_V(BASE, SEW, LMUL);                \
+  template <>                                                           \
+  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> {                         \
+    using Lane = HWY_RVV_T(BASE, SEW);                                  \
+    using type = Full<Lane, SHIFT>;                                     \
   };
 using Vf16m1 = vfloat16m1_t;
 using Vf16m2 = vfloat16m2_t;
 using Vf16m4 = vfloat16m4_t;
 using Vf16m8 = vfloat16m8_t;
-using Df16m1 = Simd<float16_t, HWY_LANES(uint16_t) * 1>;
-using Df16m2 = Simd<float16_t, HWY_LANES(uint16_t) * 2>;
-using Df16m4 = Simd<float16_t, HWY_LANES(uint16_t) * 4>;
-using Df16m8 = Simd<float16_t, HWY_LANES(uint16_t) * 8>;
+using Df16m1 = Full<float16_t, 0>;
+using Df16m2 = Full<float16_t, 1>;
+using Df16m4 = Full<float16_t, 2>;
+using Df16m8 = Full<float16_t, 3>;

 HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _)
 #undef HWY_SPECIALIZE

 // vector = f(d), e.g. Zero
-#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)          \
+#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)   \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(CHAR, SEW, LMUL) d) { \
     (void)Lanes(d);                                                       \
-    return v##OP##_##CHAR##SEW##m##LMUL();                                \
+    return v##OP##_##CHAR##SEW##LMUL();                                   \
   }

 // vector = f(vector), e.g. Not
-#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)          \
+#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)   \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return v##OP##_v_##CHAR##SEW##m##LMUL(v);                             \
+    return v##OP##_v_##CHAR##SEW##LMUL(v);                                \
   }

 // vector = f(vector, scalar), e.g. detail::Add
-#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)  \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                               \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
-    return v##OP##_##CHAR##SEW##m##LMUL(a, b);                     \
+#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                     \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {       \
+    return v##OP##_##CHAR##SEW##LMUL(a, b);                              \
   }

 // vector = f(vector, vector), e.g. Add
-#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)        \
+#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                     \
       NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
-    return v##OP##_vv_##CHAR##SEW##m##LMUL(a, b);                        \
+    return v##OP##_vv_##CHAR##SEW##LMUL(a, b);                           \
   }

 // ================================================== INIT
@@ -218,9 +222,9 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _)

 // WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL!
 // vlenb is not exposed through intrinsics and vreadvl is not VLMAX.
-#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
-  HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) {  \
-    return v##OP##SEW##m##LMUL();                            \
+#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) {         \
+    return v##OP##SEW##LMUL();                                      \
   }

 HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e)
@@ -233,19 +237,31 @@ HWY_RVV_FOREACH(HWY_RVV_RETV_ARGD, Zero,
 template <class D>
 using VFromD = decltype(Zero(D()));

+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Full<T>> Zero(Simd<T, N> /*tag*/) {
+  return Zero(Full<T>());
+}
+
 // ------------------------------ Set
 // vector = f(d, scalar), e.g. Set
-#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)           \
+#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)    \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                 \
       NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, HWY_RVV_T(BASE, SEW) arg) { \
     (void)Lanes(d);                                                  \
-    return v##OP##_##CHAR##SEW##m##LMUL(arg);                        \
+    return v##OP##_##CHAR##SEW##LMUL(arg);                           \
   }

 HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x)
 HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f)
 #undef HWY_RVV_SET

+// Partial vectors
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> Set(Simd<T, N> /*tag*/, T arg) {
+  return Set(Full<T>(), arg);
+}
+
 // ------------------------------ Undefined

 // RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
@@ -265,7 +281,7 @@ HWY_API VFromD<D> Undefined(D d) {
 namespace detail {

 // u8: no change
-#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)           \
+#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)    \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                      \
       BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) {                       \
     return v;                                                             \
@@ -276,25 +292,25 @@ namespace detail {
   }

 // Other integers
-#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)            \
-  HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return v##OP##_v_##CHAR##SEW##m##LMUL##_u8m##LMUL(v);                 \
-  }                                                                       \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                     \
-      HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) {          \
-    return v##OP##_v_u8m##LMUL##_##CHAR##SEW##m##LMUL(v);                 \
+#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)    \
+  HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v);                    \
+  }                                                                      \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                    \
+      HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) {          \
+    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v);                    \
   }

 // Float: first cast to/from unsigned
-#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)             \
-  HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return v##OP##_v_u##SEW##m##LMUL##_u8m##LMUL(                         \
-        v##OP##_v_f##SEW##m##LMUL##_u##SEW##m##LMUL(v));                  \
-  }                                                                       \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                     \
-      HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) {          \
-    return v##OP##_v_u##SEW##m##LMUL##_f##SEW##m##LMUL(                   \
-        v##OP##_v_u8m##LMUL##_u##SEW##m##LMUL(v));                        \
+#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)     \
+  HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return v##OP##_v_u##SEW##LMUL##_u8##LMUL(                            \
+        v##OP##_v_f##SEW##LMUL##_u##SEW##LMUL(v));                       \
+  }                                                                      \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                    \
+      HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) {          \
+    return v##OP##_v_u##SEW##LMUL##_f##SEW##LMUL(                        \
+        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v));                           \
   }

 HWY_RVV_FOREACH_U08(HWY_RVV_CAST_NOP, _, _)
@@ -315,6 +331,12 @@ HWY_API VFromD<D> BitCast(D d, FromV v)
   return detail::BitCastFromByte(d, detail::BitCastToByte(v));
 }

+// Partial
+template <typename T, size_t N, class FromV, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> BitCast(Simd<T, N> /*tag*/, FromV v) {
+  return BitCast(Full<T>(), v);
+}
+
 namespace detail {

 template <class V, class DU = RebindToUnsigned<DFromV<V>>>
@@ -336,6 +358,12 @@ HWY_API VFromD<DU> Iota0(const D /*d*/)
   return BitCastToUnsigned(Iota0(DU()));
 }

+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> Iota0(Simd<T, N> /*tag*/) {
+  return Iota0(Full<T>());
+}
+
 }  // namespace detail

 // ================================================== LOGICAL
@@ -370,11 +398,11 @@ HWY_API V And(const V a, const V b) {
 // ------------------------------ Or

 // Scalar argument plus mask. Used by VecFromMask.
-#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)           \
+#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)    \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                     \
       NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_T(BASE, SEW) imm,       \
            HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff) { \
-    return v##OP##_##CHAR##SEW##m##LMUL##_m(mask, maskedoff, v, imm);    \
+    return v##OP##_##CHAR##SEW##LMUL##_m(mask, maskedoff, v, imm);       \
   }

 namespace detail {
@@ -466,14 +494,14 @@ HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV,
 // ------------------------------ ShiftLeft[Same]

 // Intrinsics do not define .vi forms, so use .vx instead.
-#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)               \
-  template <int kBits>                                                     \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {  \
-    return v##OP##_vx_##CHAR##SEW##m##LMUL(v, kBits);                      \
-  }                                                                        \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                       \
-      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) {                 \
-    return v##OP##_vx_##CHAR##SEW##m##LMUL(v, static_cast<uint8_t>(bits)); \
+#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)       \
+  template <int kBits>                                                    \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits);                        \
+  }                                                                       \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                      \
+      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) {                \
+    return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits));   \
   }

 HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll)
@@ -486,19 +514,18 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRi
 #undef HWY_RVV_SHIFT

 // ------------------------------ Shl
-#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)             \
+#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)      \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
       NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
-    return v##OP##_vv_##CHAR##SEW##m##LMUL(v, bits);                        \
+    return v##OP##_vv_##CHAR##SEW##LMUL(v, bits);                           \
   }

 HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll)

-#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)              \
+#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)       \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
       NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) {  \
-    return v##OP##_vv_##CHAR##SEW##m##LMUL(v,                                \
-                                           detail::BitCastToUnsigned(bits)); \
+    return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits)); \
   }

 HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll)
@@ -569,11 +596,11 @@ HWY_API V ApproximateReciprocalSqrt(cons

 // ------------------------------ MulAdd
 // Note: op is still named vv, not vvv.
-#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)               \
+#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)        \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                     \
       NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
            HWY_RVV_V(BASE, SEW, LMUL) add) {                             \
-    return v##OP##_vv_##CHAR##SEW##m##LMUL(add, mul, x);                 \
+    return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x);                    \
   }

 HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc)
@@ -596,11 +623,11 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub
 // of all bits; SLEN 8 / LMUL 4 = half of all bits.

 // mask = f(vector, vector)
-#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)        \
+#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
   HWY_API HWY_RVV_M(MLEN)                                                \
       NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
     (void)Lanes(DFromV<decltype(a)>());                                  \
-    return v##OP##_vv_##CHAR##SEW##m##LMUL##_b##MLEN(a, b);              \
+    return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b);                 \
   }

 // ------------------------------ Eq
@@ -675,11 +702,11 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xo
 #undef HWY_RVV_RETM_ARGMM

 // ------------------------------ IfThenElse
-#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                \
-      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes,       \
-           HWY_RVV_V(BASE, SEW, LMUL) no) {                         \
-    return v##OP##_vvm_##CHAR##SEW##m##LMUL(m, no, yes);            \
+#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                       \
+      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes,              \
+           HWY_RVV_V(BASE, SEW, LMUL) no) {                                \
+    return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes);                      \
   }

 HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge)
@@ -710,7 +737,7 @@ template <class D>
 using MFromD = decltype(MaskFromVec(Zero(D())));

 template <class D, typename MFrom>
-HWY_API MFromD<D> RebindMask(const D d, const MFrom mask) {
+HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
   // No need to check lane size/LMUL are the same: if not, casting MFrom to
   // MFromD<D> would fail.
   return mask;
@@ -774,17 +801,17 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _,

 // ------------------------------ Load

-#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                        \
-      NAME(HWY_RVV_D(CHAR, SEW, LMUL) d,                    \
-           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {   \
-    (void)Lanes(d);                                         \
-    return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p);          \
+#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                               \
+      NAME(HWY_RVV_D(CHAR, SEW, LMUL) d,                           \
+           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {          \
+    (void)Lanes(d);                                                \
+    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p);                    \
   }
 HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le)
 #undef HWY_RVV_LOAD

-// Partial load
+// Partial
 template <typename T, size_t N, HWY_IF_LE128(T, N)>
 HWY_API VFromD<Simd<T, N>> Load(Simd<T, N> d, const T* HWY_RESTRICT p) {
   return Load(d, p);
@@ -800,16 +827,22 @@ HWY_API VFromD<D> LoadU(D d, const TFrom

 // ------------------------------ Store

-#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
-  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                 \
-                    HWY_RVV_D(CHAR, SEW, LMUL) d,                 \
-                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {      \
-    (void)Lanes(d);                                               \
-    return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p, v);             \
+#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                        \
+                    HWY_RVV_D(CHAR, SEW, LMUL) d,                        \
+                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {             \
+    (void)Lanes(d);                                                      \
+    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v);                       \
   }
 HWY_RVV_FOREACH(HWY_RVV_RET_ARGVDP, Store, se)
 #undef HWY_RVV_RET_ARGVDP

+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API void Store(VFromD<Simd<T, N>> v, Simd<T, N> d, T* HWY_RESTRICT p) {
+  return Store(v, Full<T>(), p);
+}
+
 // ------------------------------ StoreU

 // RVV only requires lane alignment, not natural alignment of the entire vector.
@@ -825,19 +858,62 @@ HWY_API void Stream(const V v, D d, T* H
   Store(v, d, aligned);
 }

+// ------------------------------ ScatterOffset
+
+#define HWY_RVV_SCATTER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                     \
+                    HWY_RVV_D(CHAR, SEW, LMUL) /* d */,               \
+                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,         \
+                    HWY_RVV_V(int, SEW, LMUL) offset) {               \
+    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL(                      \
+        base, detail::BitCastToUnsigned(offset), v);                  \
+  }
+HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sx)
+#undef HWY_RVV_SCATTER
+
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API void ScatterOffset(VFromD<Simd<T, N>> v, Simd<T, N> d,
+                           T* HWY_RESTRICT base,
+                           VFromD<Simd<MakeSigned<T>, N>> offset) {
+  return ScatterOffset(v, Full<T>(), base, offset);
+}
+
+// ------------------------------ ScatterIndex
+
+template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
+HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
+                          const VFromD<RebindToSigned<D>> index) {
+  return ScatterOffset(v, d, base, ShiftLeft<2>(index));
+}
+
+template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
+                          const VFromD<RebindToSigned<D>> index) {
+  return ScatterOffset(v, d, base, ShiftLeft<3>(index));
+}
+
 // ------------------------------ GatherOffset

-#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                          \
-      NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */,                \
-           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,    \
-           HWY_RVV_V(int, SEW, LMUL) offset) {                \
-    return v##OP##ei##SEW##_v_##CHAR##SEW##m##LMUL(           \
-        base, detail::BitCastToUnsigned(offset));             \
+#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                 \
+      NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */,                       \
+           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,           \
+           HWY_RVV_V(int, SEW, LMUL) offset) {                       \
+    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL(                     \
+        base, detail::BitCastToUnsigned(offset));                    \
   }
 HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lx)
 #undef HWY_RVV_GATHER

+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> GatherOffset(Simd<T, N> d,
+                                        const T* HWY_RESTRICT base,
+                                        VFromD<Simd<MakeSigned<T>, N>> offset) {
+  return GatherOffset(Full<T>(), base, offset);
+}
+
 // ------------------------------ GatherIndex

 template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
@@ -852,37 +928,101 @@ HWY_API VFromD<D> GatherIndex(D d, const
   return GatherOffset(d, base, ShiftLeft<3>(index));
 }

-// ================================================== CONVERT
+// ------------------------------ StoreInterleaved3

-// ------------------------------ PromoteTo U
+#define HWY_RVV_STORE3(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)    \
+  HWY_API void NAME(                                                    \
+      HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b,       \
+      HWY_RVV_V(BASE, SEW, LMUL) c, HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
+      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) {                  \
+    const v##BASE##SEW##LMUL##x3_t triple =                             \
+        vcreate_##CHAR##SEW##LMUL##x3(a, b, c);                         \
+    return v##OP##e8_v_##CHAR##SEW##LMUL##x3(unaligned, triple);        \
+  }
+// Segments are limited to 8 registers, so we can only go up to LMUL=2.
+HWY_RVV_STORE3(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved3, sseg3)
+HWY_RVV_STORE3(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved3, sseg3)

-HWY_API Vu16m2 PromoteTo(Du16m2 /* d */, Vu8m1 v) { return vzext_vf2_u16m2(v); }
-HWY_API Vu16m4 PromoteTo(Du16m4 /* d */, Vu8m2 v) { return vzext_vf2_u16m4(v); }
-HWY_API Vu16m8 PromoteTo(Du16m8 /* d */, Vu8m4 v) { return vzext_vf2_u16m8(v); }
+#undef HWY_RVV_STORE3

-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, Vu8m1 v) { return vzext_vf4_u32m4(v); }
-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, Vu8m2 v) { return vzext_vf4_u32m8(v); }
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API void StoreInterleaved3(VFromD<Simd<T, N>> v0, VFromD<Simd<T, N>> v1,
+                               VFromD<Simd<T, N>> v2, Simd<T, N> /*tag*/,
+                               T* unaligned) {
+  return StoreInterleaved3(v0, v1, v2, Full<T>(), unaligned);
+}
+
+// ------------------------------ StoreInterleaved4
+
+#define HWY_RVV_STORE4(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API void NAME(                                                 \
+      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1,  \
+      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3,  \
+      HWY_RVV_D(CHAR, SEW, LMUL) /* d */,                            \
+      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) {                 \
+    const v##BASE##SEW##LMUL##x4_t quad =                            \
+        vcreate_##CHAR##SEW##LMUL##x4(v0, v1, v2, v3);               \
+    return v##OP##e8_v_##CHAR##SEW##LMUL##x4(aligned, quad);         \
+  }
+// Segments are limited to 8 registers, so we can only go up to LMUL=2.
+HWY_RVV_STORE4(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved4, sseg4)
+HWY_RVV_STORE4(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved4, sseg4)

-HWY_API Vu32m2 PromoteTo(Du32m2 /* d */, const Vu16m1 v) {
-  return vzext_vf2_u32m2(v);
-}
-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, const Vu16m2 v) {
-  return vzext_vf2_u32m4(v);
-}
-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, const Vu16m4 v) {
-  return vzext_vf2_u32m8(v);
-}
+#undef HWY_RVV_STORE4

-HWY_API Vu64m2 PromoteTo(Du64m2 /* d */, const Vu32m1 v) {
-  return vzext_vf2_u64m2(v);
-}
-HWY_API Vu64m4 PromoteTo(Du64m4 /* d */, const Vu32m2 v) {
-  return vzext_vf2_u64m4(v);
-}
-HWY_API Vu64m8 PromoteTo(Du64m8 /* d */, const Vu32m4 v) {
-  return vzext_vf2_u64m8(v);
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API void StoreInterleaved4(VFromD<Simd<T, N>> v0, VFromD<Simd<T, N>> v1,
+                               VFromD<Simd<T, N>> v2, VFromD<Simd<T, N>> v3,
+                               Simd<T, N> /*tag*/, T* unaligned) {
+  return StoreInterleaved4(v0, v1, v2, v3, Full<T>(), unaligned);
 }

+// ================================================== CONVERT
+
+#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN) \
+  HWY_API HWY_RVV_V(BASE, BITS, LMUL)                                          \
+      PromoteTo(HWY_RVV_D(CHAR, BITS, LMUL) /*d*/,                             \
+                HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) {                      \
+    return OP##CHAR##BITS##LMUL(v);                                            \
+  }
+
+// TODO(janwas): GCC does not yet support fractional LMUL
+#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)     \
+  /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2)*/ \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1)      \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2)      \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4)
+
+#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)     \
+  /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4)*/ \
+  /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2)*/ \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1)      \
+  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2)
+
+// ------------------------------ PromoteTo
+
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 16, uint, 8)
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 32, uint, 16)
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 64, uint, 32)
+HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8)
+
+HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 16, int, 8)
+HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 32, int, 16)
+HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 64, int, 32)
+HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8)
+
+HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 32, float, 16)
+HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 64, float, 32)
+
+// i32 to f64
+HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32)
+
+#undef HWY_RVV_PROMOTE_X4
+#undef HWY_RVV_PROMOTE_X2
+#undef HWY_RVV_PROMOTE
+
 template <size_t N>
 HWY_API VFromD<Simd<int16_t, N>> PromoteTo(Simd<int16_t, N> d,
                                            VFromD<Simd<uint8_t, N>> v) {
@@ -901,67 +1041,6 @@ HWY_API VFromD<Simd<int32_t, N>> Promote
   return BitCast(d, PromoteTo(Simd<uint32_t, N>(), v));
 }

-// ------------------------------ PromoteTo I
-
-HWY_API Vi16m2 PromoteTo(Di16m2 /* d */, Vi8m1 v) { return vsext_vf2_i16m2(v); }
-HWY_API Vi16m4 PromoteTo(Di16m4 /* d */, Vi8m2 v) { return vsext_vf2_i16m4(v); }
-HWY_API Vi16m8 PromoteTo(Di16m8 /* d */, Vi8m4 v) { return vsext_vf2_i16m8(v); }
-
-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, Vi8m1 v) { return vsext_vf4_i32m4(v); }
-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, Vi8m2 v) { return vsext_vf4_i32m8(v); }
-
-HWY_API Vi32m2 PromoteTo(Di32m2 /* d */, const Vi16m1 v) {
-  return vsext_vf2_i32m2(v);
-}
-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, const Vi16m2 v) {
-  return vsext_vf2_i32m4(v);
-}
-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, const Vi16m4 v) {
-  return vsext_vf2_i32m8(v);
-}
-
-HWY_API Vi64m2 PromoteTo(Di64m2 /* d */, const Vi32m1 v) {
-  return vsext_vf2_i64m2(v);
-}
-HWY_API Vi64m4 PromoteTo(Di64m4 /* d */, const Vi32m2 v) {
-  return vsext_vf2_i64m4(v);
-}
-HWY_API Vi64m8 PromoteTo(Di64m8 /* d */, const Vi32m4 v) {
-  return vsext_vf2_i64m8(v);
-}
-
-// ------------------------------ PromoteTo F
-
-HWY_API Vf32m2 PromoteTo(Df32m2 /* d */, const Vf16m1 v) {
-  return vfwcvt_f_f_v_f32m2(v);
-}
-HWY_API Vf32m4 PromoteTo(Df32m4 /* d */, const Vf16m2 v) {
-  return vfwcvt_f_f_v_f32m4(v);
-}
-HWY_API Vf32m8 PromoteTo(Df32m8 /* d */, const Vf16m4 v) {
-  return vfwcvt_f_f_v_f32m8(v);
-}
-
-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vf32m1 v) {
-  return vfwcvt_f_f_v_f64m2(v);
-}
-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vf32m2 v) {
-  return vfwcvt_f_f_v_f64m4(v);
-}
-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vf32m4 v) {
-  return vfwcvt_f_f_v_f64m8(v);
-}
-
-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vi32m1 v) {
-  return vfwcvt_f_x_v_f64m2(v);
-}
-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vi32m2 v) {
-  return vfwcvt_f_x_v_f64m4(v);
-}
-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vi32m4 v) {
-  return vfwcvt_f_x_v_f64m8(v);
-}
-
 // ------------------------------ DemoteTo U

 // First clamp negative numbers to zero to match x86 packus.
@@ -1062,19 +1141,19 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */,

 // ------------------------------ ConvertTo F

-#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)                 \
+#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)          \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo(                                \
       HWY_RVV_D(CHAR, SEW, LMUL) /* d */, HWY_RVV_V(int, SEW, LMUL) v) {       \
-    return vfcvt_f_x_v_f##SEW##m##LMUL(v);                                     \
+    return vfcvt_f_x_v_f##SEW##LMUL(v);                                        \
   }                                                                            \
   /* Truncates (rounds toward zero). */                                        \
   HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(i, SEW, LMUL) /* d */, \
                                               HWY_RVV_V(BASE, SEW, LMUL) v) {  \
-    return vfcvt_rtz_x_f_v_i##SEW##m##LMUL(v);                                 \
+    return vfcvt_rtz_x_f_v_i##SEW##LMUL(v);                                    \
   }                                                                            \
   /* Uses default rounding mode. */                                            \
   HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return vfcvt_x_f_v_i##SEW##m##LMUL(v);                                     \
+    return vfcvt_x_f_v_i##SEW##LMUL(v);                                        \
   }

 // API only requires f32 but we provide f64 for internal use (otherwise, it
@@ -1082,16 +1161,23 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */,
 HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _)
 #undef HWY_RVV_CONVERT

+// Partial
+template <typename T, size_t N, class FromV, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> ConvertTo(Simd<T, N> /*tag*/, FromV v) {
+  return ConvertTo(Full<T>(), v);
+}
+
 // ================================================== SWIZZLE

 // ------------------------------ Compress

-#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)  \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                             \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \
-    return v##OP##_vm_##CHAR##SEW##m##LMUL(mask, v, v);          \
+#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                   \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) {       \
+    return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v);                   \
   }

+HWY_RVV_FOREACH_UI16(HWY_RVV_COMPRESS, Compress, compress)
 HWY_RVV_FOREACH_UI32(HWY_RVV_COMPRESS, Compress, compress)
 HWY_RVV_FOREACH_UI64(HWY_RVV_COMPRESS, Compress, compress)
 HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress)
@@ -1121,10 +1207,10 @@ HWY_API VFromD<DU> SetTableIndices(D d,

 // <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX
 // to 2048! We could instead use vrgatherei16.
-#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)               \
+#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)        \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                       \
       NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
-    return v##OP##_vv_##CHAR##SEW##m##LMUL(v, idx);                        \
+    return v##OP##_vv_##CHAR##SEW##LMUL(v, idx);                           \
   }

 HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather)
@@ -1216,7 +1302,6 @@ HWY_API V OffsetsOf128BitBlocks(const D
   using T = MakeUnsigned<TFromD<D>>;
   return detail::And(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
 }
-
 }  // namespace detail

 template <class V>
@@ -1244,9 +1329,9 @@ HWY_API V Broadcast(const V v) {

 // ------------------------------ GetLane

-#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)     \
-  HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return v##OP##_s_##CHAR##SEW##m##LMUL##_##CHAR##SEW(v);         \
+#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {    \
+    return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v);               \
   }

 HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x)
@@ -1255,11 +1340,12 @@ HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetL

 // ------------------------------ ShiftLeftLanes

-// vector = f(vector, size_t)
-#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                         \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t lanes) {     \
-    return v##OP##_vx_##CHAR##SEW##m##LMUL(v, v, lanes);     \
+// vector = f(vector, vector, size_t)
+#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)        \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                       \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
+           size_t lanes) {                                                 \
+    return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes);                  \
   }

 namespace detail {
@@ -1270,7 +1356,7 @@ template <size_t kLanes, class V>
 HWY_API V ShiftLeftLanes(const V v) {
   using D = DFromV<V>;
   const RebindToSigned<D> di;
-  const auto shifted = detail::SlideUp(v, kLanes);
+  const auto shifted = detail::SlideUp(v, v, kLanes);
   // Match x86 semantics by zeroing lower lanes in 128-bit blocks
   constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
   const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1);
@@ -1300,7 +1386,7 @@ template <size_t kLanes, class V>
 HWY_API V ShiftRightLanes(const V v) {
   using D = DFromV<V>;
   const RebindToSigned<D> di;
-  const auto shifted = detail::SlideDown(v, kLanes);
+  const auto shifted = detail::SlideDown(v, v, kLanes);
   // Match x86 semantics by zeroing upper lanes in 128-bit blocks
   constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
   const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1);
@@ -1342,7 +1428,7 @@ HWY_API V ConcatUpperLower(const V hi, c
 template <class V>
 HWY_API V ConcatLowerLower(const V hi, const V lo) {
   // Move lower half into upper
-  const auto hi_up = detail::SlideUp(hi, Lanes(DFromV<V>()) / 2);
+  const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV<V>()) / 2);
   return ConcatUpperLower(hi_up, lo);
 }

@@ -1351,7 +1437,7 @@ HWY_API V ConcatLowerLower(const V hi, c
 template <class V>
 HWY_API V ConcatUpperUpper(const V hi, const V lo) {
   // Move upper half into lower
-  const auto lo_down = detail::SlideDown(lo, Lanes(DFromV<V>()) / 2);
+  const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV<V>()) / 2);
   return ConcatUpperLower(hi, lo_down);
 }

@@ -1360,8 +1446,8 @@ HWY_API V ConcatUpperUpper(const V hi, c
 template <class V>
 HWY_API V ConcatLowerUpper(const V hi, const V lo) {
   // Move half of both inputs to the other half
-  const auto hi_up = detail::SlideUp(hi, Lanes(DFromV<V>()) / 2);
-  const auto lo_down = detail::SlideDown(lo, Lanes(DFromV<V>()) / 2);
+  const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV<V>()) / 2);
+  const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV<V>()) / 2);
   return ConcatUpperLower(hi_up, lo_down);
 }

@@ -1428,61 +1514,55 @@ HWY_API V Combine(const V a, const V b)
 // ================================================== REDUCE

 // vector = f(vector, zero_m1)
-#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)             \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                      \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, 1) v0) {    \
-    vsetvlmax_e##SEW##m##LMUL();                                          \
-    return Set(HWY_RVV_D(CHAR, SEW, LMUL)(),                              \
-               GetLane(v##OP##_vs_##CHAR##SEW##m##LMUL##_##CHAR##SEW##m1( \
-                   v0, v, v0)));                                          \
+#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)         \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) {      \
+    vsetvlmax_e##SEW##LMUL();                                                \
+    return Set(                                                              \
+        HWY_RVV_D(CHAR, SEW, LMUL)(),                                        \
+        GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(v0, v, v0))); \
   }

 // ------------------------------ SumOfLanes

 namespace detail {
-
 HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum)
 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredsum)
-
 }  // namespace detail

 template <class V>
 HWY_API V SumOfLanes(const V v) {
   using T = TFromV<V>;
-  const auto v0 = Zero(Simd<T, HWY_LANES(T)>());  // always m1
+  const auto v0 = Zero(Full<T>());  // always m1
   return detail::RedSum(v, v0);
 }

 // ------------------------------ MinOfLanes
 namespace detail {
-
 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu)
 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin)
 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin)
-
 }  // namespace detail

 template <class V>
 HWY_API V MinOfLanes(const V v) {
   using T = TFromV<V>;
-  const Simd<T, HWY_LANES(T)> d1;  // always m1
+  const Full<T> d1;  // always m1
   const auto neutral = Set(d1, HighestValue<T>());
   return detail::RedMin(v, neutral);
 }

 // ------------------------------ MaxOfLanes
 namespace detail {
-
 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu)
 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax)
 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax)
-
 }  // namespace detail

 template <class V>
 HWY_API V MaxOfLanes(const V v) {
   using T = TFromV<V>;
-  const Simd<T, HWY_LANES(T)> d1;  // always m1
+  const Full<T> d1;  // always m1
   const auto neutral = Set(d1, LowestValue<T>());
   return detail::RedMax(v, neutral);
 }
@@ -1507,7 +1587,7 @@ HWY_API VFromD<D> LoadDup128(D d, const
 #define HWY_RVV_STORE_MASK_BITS(MLEN, NAME, OP)                 \
   HWY_API size_t StoreMaskBits(HWY_RVV_M(MLEN) m, uint8_t* p) { \
     /* LMUL=1 is always enough */                               \
-    Simd<uint8_t, HWY_LANES(uint8_t)> d8;                       \
+    Full<uint8_t> d8;                                           \
     const size_t num_bytes = (Lanes(d8) + MLEN - 1) / MLEN;     \
     /* TODO(janwas): how to convert vbool* to vuint?*/          \
     /*Store(m, d8, p);*/                                        \
@@ -1518,6 +1598,22 @@ HWY_API VFromD<D> LoadDup128(D d, const
 HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, _, _)
 #undef HWY_RVV_STORE_MASK_BITS

+// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
+
+// Disallow for 8-bit because Iota is likely to overflow.
+template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
+HWY_API MFromD<D> FirstN(const D d, const size_t n) {
+  const RebindToSigned<D> di;
+  return RebindMask(d, Lt(BitCast(di, detail::Iota0(d)), Set(di, n)));
+}
+
+template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
+HWY_API MFromD<D> FirstN(const D d, const size_t n) {
+  const auto zero = Zero(d);
+  const auto one = Set(d, 1);
+  return Eq(detail::SlideUp(one, zero, n), one);
+}
+
 // ------------------------------ Neg

 template <class V, HWY_IF_SIGNED_V(V)>
@@ -1526,9 +1622,9 @@ HWY_API V Neg(const V v) {
 }

 // vector = f(vector), but argument is repeated
-#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP)         \
+#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP)  \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return v##OP##_vv_##CHAR##SEW##m##LMUL(v, v);                         \
+    return v##OP##_vv_##CHAR##SEW##LMUL(v, v);                            \
   }

 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn)
@@ -1565,7 +1661,6 @@ template <class V>
 HWY_API auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
   return Lt(Abs(v), Set(DFromV<V>(), MantissaEnd<TFromV<V>>()));
 }
-
 }  // namespace detail

 template <class V>
@@ -1636,10 +1731,8 @@ HWY_API VFromD<D> Iota(const D d, TFromD
 // Using vwmul does not work for m8, so use mulh instead. Highway only provides
 // MulHigh for 16-bit, so use a private wrapper.
 namespace detail {
-
 HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu)
 HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh)
-
 }  // namespace detail

 template <class V>
@@ -1649,7 +1742,7 @@ HWY_API VFromD<RepartitionToWide<DFromV<
   const auto lo = Mul(a, b);
   const auto hi = detail::MulHigh(a, b);
   const RepartitionToWide<DFromV<V>> dw;
-  return BitCast(dw, OddEven(detail::SlideUp(hi, 1), lo));
+  return BitCast(dw, OddEven(detail::SlideUp(hi, hi, 1), lo));
 }

 // ================================================== END MACROS
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12	2021-06-02 10:56:05.237904402 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -19,7 +19,6 @@
 #include <stdint.h>

 #include <algorithm>  // std::min
-#include <cmath>

 #include "hwy/base.h"
 #include "hwy/ops/shared-inl.h"
@@ -199,7 +198,7 @@ HWY_API Vec1<T> BroadcastSignBit(const V
 template <typename TFrom, typename TTo>
 HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
   static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask1<TTo>(m.raw);
+  return Mask1<TTo>{m.bits};
 }

 // v must be 0 or FF..FF.
@@ -224,6 +223,11 @@ Vec1<T> VecFromMask(Sisd<T> /* tag */, c
   return v;
 }

+template <typename T>
+HWY_INLINE Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
+  return Mask1<T>::FromBool(n != 0);
+}
+
 // Returns mask ? yes : no.
 template <typename T>
 HWY_INLINE Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
@@ -357,9 +361,9 @@ HWY_INLINE Vec1<T> operator>>(const Vec1

 template <typename T>
 HWY_INLINE Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
-  const uint64_t a64 = static_cast<int64_t>(a.raw);
-  const uint64_t b64 = static_cast<int64_t>(b.raw);
-  return Vec1<T>((a64 + b64) & ~T(0));
+  const uint64_t a64 = static_cast<uint64_t>(a.raw);
+  const uint64_t b64 = static_cast<uint64_t>(b.raw);
+  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
 }
 HWY_INLINE Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
   return Vec1<float>(a.raw + b.raw);
@@ -370,9 +374,9 @@ HWY_INLINE Vec1<double> operator+(const

 template <typename T>
 HWY_INLINE Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
-  const uint64_t a64 = static_cast<int64_t>(a.raw);
-  const uint64_t b64 = static_cast<int64_t>(b.raw);
-  return Vec1<T>((a64 - b64) & ~T(0));
+  const uint64_t a64 = static_cast<uint64_t>(a.raw);
+  const uint64_t b64 = static_cast<uint64_t>(b.raw);
+  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
 }
 HWY_INLINE Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
   return Vec1<float>(a.raw - b.raw);
@@ -388,21 +392,25 @@ HWY_INLINE Vec1<double> operator-(const
 // Unsigned
 HWY_INLINE Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
                                       const Vec1<uint8_t> b) {
-  return Vec1<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255));
+  return Vec1<uint8_t>(
+      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
 }
 HWY_INLINE Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
                                        const Vec1<uint16_t> b) {
-  return Vec1<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535));
+  return Vec1<uint16_t>(
+      static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
 }

 // Signed
 HWY_INLINE Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a,
                                      const Vec1<int8_t> b) {
-  return Vec1<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127));
+  return Vec1<int8_t>(
+      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
 }
 HWY_INLINE Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
                                       const Vec1<int16_t> b) {
-  return Vec1<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767));
+  return Vec1<int16_t>(
+      static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
 }

 // ------------------------------ Saturating subtraction
@@ -412,21 +420,25 @@ HWY_INLINE Vec1<int16_t> SaturatedAdd(co
 // Unsigned
 HWY_INLINE Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
                                       const Vec1<uint8_t> b) {
-  return Vec1<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255));
+  return Vec1<uint8_t>(
+      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
 }
 HWY_INLINE Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
                                        const Vec1<uint16_t> b) {
-  return Vec1<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535));
+  return Vec1<uint16_t>(
+      static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
 }

 // Signed
 HWY_INLINE Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a,
                                      const Vec1<int8_t> b) {
-  return Vec1<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127));
+  return Vec1<int8_t>(
+      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
 }
 HWY_INLINE Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
                                       const Vec1<int16_t> b) {
-  return Vec1<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767));
+  return Vec1<int16_t>(
+      static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
 }

 // ------------------------------ Average
@@ -435,11 +447,11 @@ HWY_INLINE Vec1<int16_t> SaturatedSub(co

 HWY_INLINE Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
                                       const Vec1<uint8_t> b) {
-  return Vec1<uint8_t>((a.raw + b.raw + 1) / 2);
+  return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
 }
 HWY_INLINE Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
                                        const Vec1<uint16_t> b) {
-  return Vec1<uint16_t>((a.raw + b.raw + 1) / 2);
+  return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
 }

 // ------------------------------ Absolute value
@@ -514,15 +526,15 @@ HWY_INLINE Vec1<T> operator/(const Vec1<

 // Returns the upper 16 bits of a * b in each lane.
 HWY_INLINE Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
-  return Vec1<int16_t>((a.raw * b.raw) >> 16);
+  return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
 }
 HWY_INLINE Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a,
                                   const Vec1<uint16_t> b) {
   // Cast to uint32_t first to prevent overflow. Otherwise the result of
   // uint16_t * uint16_t is in "int" which may overflow. In practice the result
   // is the same but this way it is also defined.
-  return Vec1<uint16_t>(
-      (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16);
+  return Vec1<uint16_t>(static_cast<uint16_t>(
+      (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
 }

 // Multiplies even lanes (0, 2 ..) and returns the double-wide result.
@@ -617,6 +629,31 @@ HWY_INLINE Vec1<T> Round(const Vec1<T> v
   return Vec1<T>(static_cast<T>(rounded));
 }

+// Round-to-nearest even.
+HWY_INLINE Vec1<int32_t> NearestInt(const Vec1<float> v) {
+  using T = float;
+  using TI = int32_t;
+
+  const T abs = Abs(v).raw;
+  const bool signbit = std::signbit(v.raw);
+
+  if (!(abs < MantissaEnd<T>())) {  // Huge or NaN
+    // Check if too large to cast or NaN
+    if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
+      return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
+    }
+    return Vec1<int32_t>(static_cast<TI>(v.raw));
+  }
+  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
+  const TI rounded = static_cast<TI>(v.raw + bias);
+  if (rounded == 0) return Vec1<int32_t>(0);
+  // Round to even
+  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
+    return Vec1<TI>(rounded - (signbit ? -1 : 1));
+  }
+  return Vec1<TI>(rounded);
+}
+
 template <typename T>
 HWY_INLINE Vec1<T> Trunc(const Vec1<T> v) {
   using TI = MakeSigned<T>;
@@ -641,7 +678,8 @@ V Ceiling(const V v) {
   Bits bits;
   CopyBytes<sizeof(Bits)>(&v, &bits);

-  const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias;
+  const int exponent =
+      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
   // Already an integer.
   if (exponent >= kMantissaBits) return v;
   // |v| <= 1 => 0 or 1.
@@ -672,7 +710,8 @@ V Floor(const V v) {
   Bits bits;
   CopyBytes<sizeof(Bits)>(&v, &bits);

-  const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias;
+  const int exponent =
+      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
   // Already an integer.
   if (exponent >= kMantissaBits) return v;
   // |v| <= 1 => -1 or 0.
@@ -772,6 +811,26 @@ HWY_INLINE void StoreU(const Vec1<T> v,
   return Store(v, d, p);
 }

+// ------------------------------ StoreInterleaved3
+
+HWY_API void StoreInterleaved3(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
+                               const Vec1<uint8_t> v2, Sisd<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  StoreU(v0, d, unaligned + 0);
+  StoreU(v1, d, unaligned + 1);
+  StoreU(v2, d, unaligned + 2);
+}
+
+HWY_API void StoreInterleaved4(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
+                               const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
+                               Sisd<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  StoreU(v0, d, unaligned + 0);
+  StoreU(v1, d, unaligned + 1);
+  StoreU(v2, d, unaligned + 2);
+  StoreU(v3, d, unaligned + 3);
+}
+
 // ------------------------------ Stream

 template <typename T>
@@ -779,12 +838,29 @@ HWY_INLINE void Stream(const Vec1<T> v,
   return Store(v, d, aligned);
 }

+// ------------------------------ Scatter
+
+template <typename T, typename Offset>
+HWY_INLINE void ScatterOffset(Vec1<T> v, Sisd<T> d, T* base,
+                              const Vec1<Offset> offset) {
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+  uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
+  return Store(v, d, reinterpret_cast<T*>(base8));
+}
+
+template <typename T, typename Index>
+HWY_INLINE void ScatterIndex(Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT base,
+                             const Vec1<Index> index) {
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+  return Store(v, d, base + index.raw);
+}
+
 // ------------------------------ Gather

 template <typename T, typename Offset>
 HWY_INLINE Vec1<T> GatherOffset(Sisd<T> d, const T* base,
                                 const Vec1<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
   const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
   return Load(d, reinterpret_cast<const T*>(addr));
 }
@@ -792,7 +868,7 @@ HWY_INLINE Vec1<T> GatherOffset(Sisd<T>
 template <typename T, typename Index>
 HWY_INLINE Vec1<T> GatherIndex(Sisd<T> d, const T* HWY_RESTRICT base,
                                const Vec1<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
   return Load(d, base + index.raw);
 }

@@ -833,15 +909,20 @@ HWY_INLINE Vec1<ToT> DemoteTo(Sisd<ToT>

 static HWY_INLINE Vec1<float> PromoteTo(Sisd<float> /* tag */,
                                         const Vec1<float16_t> v) {
+#if HWY_NATIVE_FLOAT16
   uint16_t bits16;
   CopyBytes<2>(&v.raw, &bits16);
+#else
+  const uint16_t bits16 = v.raw.bits;
+#endif
   const uint32_t sign = bits16 >> 15;
   const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
   const uint32_t mantissa = bits16 & 0x3FF;

   // Subnormal or zero
   if (biased_exp == 0) {
-    const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
+    const float subnormal =
+        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
     return Vec1<float>(sign ? -subnormal : subnormal);
   }

@@ -867,8 +948,12 @@ static HWY_INLINE Vec1<float16_t> Demote
   // Tiny or zero => zero.
   Vec1<float16_t> out;
   if (exp < -24) {
-    bits32 = 0;
-    CopyBytes<2>(&bits32, &out);
+#if HWY_NATIVE_FLOAT16
+    const uint16_t zero = 0;
+    CopyBytes<2>(&zero, &out.raw);
+#else
+    out.raw.bits = 0;
+#endif
     return out;
   }

@@ -890,7 +975,12 @@ static HWY_INLINE Vec1<float16_t> Demote
   HWY_DASSERT(mantissa16 < 1024);
   const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
   HWY_DASSERT(bits16 < 0x10000);
-  CopyBytes<2>(&bits16, &out);
+#if HWY_NATIVE_FLOAT16
+  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
+  CopyBytes<2>(&narrowed, &out.raw);
+#else
+  out.raw.bits = static_cast<uint16_t>(bits16);
+#endif
   return out;
 }

@@ -919,18 +1009,6 @@ HWY_INLINE Vec1<uint8_t> U8FromU32(const
   return DemoteTo(Sisd<uint8_t>(), v);
 }

-// Approximation of round-to-nearest for numbers representable as int32_t.
-HWY_INLINE Vec1<int32_t> NearestInt(const Vec1<float> v) {
-  const float f = v.raw;
-  if (std::isinf(f) ||
-      std::fabs(f) > static_cast<float>(LimitsMax<int32_t>())) {
-    return Vec1<int32_t>(std::signbit(f) ? LimitsMin<int32_t>()
-                                         : LimitsMax<int32_t>());
-  }
-  const float bias = f < 0.0f ? -0.5f : 0.5f;
-  return Vec1<int32_t>(static_cast<int>(f + bias));
-}
-
 // ================================================== SWIZZLE

 // Unsupported: Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle*,
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12	2021-06-02 10:56:05.224904336 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -31,11 +31,6 @@
 #undef HWY_ALIGN
 #undef HWY_LANES

-#undef HWY_GATHER_LANES
-#undef HWY_VARIABLE_SHIFT_LANES
-#undef HWY_COMPARE64_LANES
-#undef HWY_MINMAX64_LANES
-
 #undef HWY_CAP_INTEGER64
 #undef HWY_CAP_FLOAT64
 #undef HWY_CAP_GE256
@@ -53,11 +48,6 @@
 #define HWY_ALIGN alignas(16)
 #define HWY_LANES(T) (16 / sizeof(T))

-#define HWY_GATHER_LANES(T) 1
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
-#define HWY_COMPARE64_LANES 2
-#define HWY_MINMAX64_LANES 1
-
 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 0
@@ -73,11 +63,6 @@
 #define HWY_ALIGN alignas(32)
 #define HWY_LANES(T) (32 / sizeof(T))

-#define HWY_GATHER_LANES(T) HWY_LANES(T)
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
-#define HWY_COMPARE64_LANES 4
-#define HWY_MINMAX64_LANES 1
-
 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 1
@@ -96,11 +81,6 @@
 #define HWY_ALIGN alignas(64)
 #define HWY_LANES(T) (64 / sizeof(T))

-#define HWY_GATHER_LANES(T) HWY_LANES(T)
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
-#define HWY_COMPARE64_LANES 8
-#define HWY_MINMAX64_LANES 8
-
 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 1
@@ -121,11 +101,6 @@
 #define HWY_ALIGN alignas(16)
 #define HWY_LANES(T) (16 / sizeof(T))

-#define HWY_GATHER_LANES(T) 1
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
-#define HWY_COMPARE64_LANES 2
-#define HWY_MINMAX64_LANES 2
-
 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 0
@@ -142,19 +117,14 @@
 #define HWY_ALIGN alignas(16)
 #define HWY_LANES(T) (16 / sizeof(T))

-#define HWY_GATHER_LANES(T) 1
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
-#define HWY_MINMAX64_LANES 2
-#define HWY_COMPARE64_LANES 2
-
 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0

-#ifdef __arm__
-#define HWY_CAP_FLOAT64 0
-#else
+#if HWY_ARCH_ARM_A64
 #define HWY_CAP_FLOAT64 1
+#else
+#define HWY_CAP_FLOAT64 0
 #endif

 #define HWY_NAMESPACE N_NEON
@@ -162,17 +132,34 @@
 // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.

 //-----------------------------------------------------------------------------
+// SVE[2]
+#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE
+
+// SVE only requires lane alignment, not natural alignment of the entire vector.
+#define HWY_ALIGN alignas(8)
+// Upper bound, not the actual lane count!
+#define HWY_LANES(T) (256 / sizeof(T))
+
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#if HWY_TARGET == HWY_SVE2
+#define HWY_NAMESPACE N_SVE2
+#else
+#define HWY_NAMESPACE N_SVE
+#endif
+
+// HWY_TARGET_STR remains undefined - TODO(janwas): attribute for SVE?
+
+//-----------------------------------------------------------------------------
 // WASM
 #elif HWY_TARGET == HWY_WASM

 #define HWY_ALIGN alignas(16)
 #define HWY_LANES(T) (16 / sizeof(T))

-#define HWY_GATHER_LANES(T) 1
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
-#define HWY_COMPARE64_LANES 2
-#define HWY_MINMAX64_LANES 2
-
 #define HWY_CAP_INTEGER64 0
 #define HWY_CAP_FLOAT64 0
 #define HWY_CAP_GE256 0
@@ -194,11 +181,6 @@
 // mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h.
 #define HWY_LANES(T) (4096 / sizeof(T))

-#define HWY_GATHER_LANES(T) HWY_LANES(T)
-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T)
-// Cannot use HWY_LANES/sizeof here because these are used in an #if.
-#define HWY_COMPARE64_LANES 256
-#define HWY_MINMAX64_LANES 256

 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT64 1
@@ -215,13 +197,9 @@
 #elif HWY_TARGET == HWY_SCALAR

 #define HWY_ALIGN
+// For internal use only; use Lanes(d) instead.
 #define HWY_LANES(T) 1

-#define HWY_GATHER_LANES(T) 1
-#define HWY_VARIABLE_SHIFT_LANES(T) 1
-#define HWY_COMPARE64_LANES 1
-#define HWY_MINMAX64_LANES 1
-
 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 0
@@ -265,3 +243,7 @@
 #else
 #define HWY_ATTR
 #endif
+
+// DEPRECATED
+#undef HWY_GATHER_LANES
+#define HWY_GATHER_LANES(T) HWY_LANES(T)
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12	2021-06-02 10:56:05.235904392 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -14,6 +14,8 @@

 // Per-target definitions shared by ops/*.h and user code.

+#include <cmath>
+
 // Separate header because foreach_target.h re-enables its include guard.
 #include "hwy/ops/set_macros-inl.h"

@@ -106,7 +108,7 @@ HWY_INLINE HWY_MAYBE_UNUSED constexpr si
 }

 // Targets with non-constexpr Lanes define this themselves.
-#if HWY_TARGET != HWY_RVV
+#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE

 // (Potentially) non-constant actual size of the vector at runtime, subject to
 // the limit imposed by the Simd. Useful for advancing loop counters.
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12	2021-06-02 10:56:05.242904427 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -19,8 +19,6 @@
 #include <stdint.h>
 #include <wasm_simd128.h>

-#include <cmath>
-
 #include "hwy/base.h"
 #include "hwy/ops/shared-inl.h"

@@ -177,6 +175,16 @@ HWY_API Vec128<T, N> Undefined(Simd<T, N

 HWY_DIAGNOSTICS(pop)

+// Returns a vector with lane i=[0, N) set to "first" + i.
+template <typename T, size_t N, typename T2>
+Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
+  HWY_ALIGN T lanes[16 / sizeof(T)];
+  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
+    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
+  }
+  return Load(d, lanes);
+}
+
 // ================================================== ARITHMETIC

 // ------------------------------ Addition
@@ -273,24 +281,24 @@ HWY_API Vec128<float, N> operator-(const
 template <size_t N>
 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
                                         const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_u8x16_add_saturate(a.raw, b.raw)};
+  return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
 }
 template <size_t N>
 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
                                          const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_u16x8_add_saturate(a.raw, b.raw)};
+  return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
 }

 // Signed
 template <size_t N>
 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
                                        const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_add_saturate(a.raw, b.raw)};
+  return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
 }
 template <size_t N>
 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
                                         const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{wasm_i16x8_add_saturate(a.raw, b.raw)};
+  return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
 }

 // ------------------------------ Saturating subtraction
@@ -301,24 +309,24 @@ HWY_API Vec128<int16_t, N> SaturatedAdd(
 template <size_t N>
 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
                                         const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_u8x16_sub_saturate(a.raw, b.raw)};
+  return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
 }
 template <size_t N>
 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
                                          const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_u16x8_sub_saturate(a.raw, b.raw)};
+  return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
 }

 // Signed
 template <size_t N>
 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
                                        const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_sub_saturate(a.raw, b.raw)};
+  return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
 }
 template <size_t N>
 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
                                         const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{wasm_i16x8_sub_saturate(a.raw, b.raw)};
+  return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
 }

 // ------------------------------ Average
@@ -352,6 +360,12 @@ template <size_t N>
 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
   return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
 }
+template <size_t N>
+HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
+  // TODO(janwas): use wasm_i64x2_abs when available
+  const Vec128<int64_t, N> mask = wasm_i64x2_shr(v.raw, 63);
+  return ((v ^ mask) - mask);
+}

 template <size_t N>
 HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
@@ -396,9 +410,38 @@ HWY_API Vec128<int32_t, N> ShiftRight(co
   return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
 }

+// 8-bit
+template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
+  const Simd<T, N> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
+  return kBits == 1
+             ? (v + v)
+             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
+}
+
+template <int kBits, size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
+  const Simd<uint8_t, N> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec128<uint8_t, N> shifted{
+      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
+  return shifted & Set(d8, 0xFF >> kBits);
+}
+
+template <int kBits, size_t N>
+HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
+  const Simd<int8_t, N> di;
+  const Simd<uint8_t, N> du;
+  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
+  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
+  return (shifted ^ shifted_sign) - shifted_sign;
+}
+
 // ------------------------------ Shift lanes by same variable #bits

-// Unsigned (no u8)
+// Unsigned
 template <size_t N>
 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
                                           const int bits) {
@@ -420,7 +463,7 @@ HWY_API Vec128<uint32_t, N> ShiftRightSa
   return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
 }

-// Signed (no i8)
+// Signed
 template <size_t N>
 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
                                          const int bits) {
@@ -442,6 +485,35 @@ HWY_API Vec128<int32_t, N> ShiftRightSam
   return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
 }

+// 8-bit
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
+  const Simd<T, N> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec128<T, N> shifted{
+      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
+  return shifted & Set(d8, (0xFF << bits) & 0xFF);
+}
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
+                                          const int bits) {
+  const Simd<uint8_t, N> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec128<uint8_t, N> shifted{
+      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
+  return shifted & Set(d8, 0xFF >> bits);
+}
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
+  const Simd<int8_t, N> di;
+  const Simd<uint8_t, N> du;
+  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+  return (shifted ^ shifted_sign) - shifted_sign;
+}
+
 // ------------------------------ Minimum

 // Unsigned
@@ -607,29 +679,29 @@ template <size_t N>
 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                     const Vec128<uint16_t, N> b) {
   // TODO(eustas): replace, when implemented in WASM.
-  const auto al = wasm_i32x4_widen_low_u16x8(a.raw);
-  const auto ah = wasm_i32x4_widen_high_u16x8(a.raw);
-  const auto bl = wasm_i32x4_widen_low_u16x8(b.raw);
-  const auto bh = wasm_i32x4_widen_high_u16x8(b.raw);
+  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
+  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
+  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
+  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
   const auto l = wasm_i32x4_mul(al, bl);
   const auto h = wasm_i32x4_mul(ah, bh);
   // TODO(eustas): shift-right + narrow?
   return Vec128<uint16_t, N>{
-      wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
+      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
 }
 template <size_t N>
 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                    const Vec128<int16_t, N> b) {
   // TODO(eustas): replace, when implemented in WASM.
-  const auto al = wasm_i32x4_widen_low_i16x8(a.raw);
-  const auto ah = wasm_i32x4_widen_high_i16x8(a.raw);
-  const auto bl = wasm_i32x4_widen_low_i16x8(b.raw);
-  const auto bh = wasm_i32x4_widen_high_i16x8(b.raw);
+  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
+  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
+  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
+  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
   const auto l = wasm_i32x4_mul(al, bl);
   const auto h = wasm_i32x4_mul(ah, bh);
   // TODO(eustas): shift-right + narrow?
   return Vec128<int16_t, N>{
-      wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
+      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
 }

 // Multiplies even lanes (0, 2 ..) and returns the double-width result.
@@ -765,53 +837,76 @@ HWY_API Vec128<float, N> ApproximateReci
 // Toward nearest integer, ties to even
 template <size_t N>
 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
-  // TODO(eustas): is it f32x4.nearest? (not implemented yet)
-  alignas(16) float input[4];
-  alignas(16) float output[4];
-  wasm_v128_store(input, v.raw);
-  for (size_t i = 0; i < 4; ++i) {
-    output[i] = std::nearbyint(input[i]);
-  }
-  return Vec128<float, N>{wasm_v128_load(output)};
+  // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not
+  // yet have an instruction for that (f32x4.nearest is not implemented). We
+  // rely on rounding after addition with a large value such that no mantissa
+  // bits remain (assuming the current mode is nearest-even). We may need a
+  // compiler flag for precise floating-point to prevent "optimizing" this out.
+  const Simd<float, N> df;
+  const auto max = Set(df, MantissaEnd<float>());
+  const auto large = CopySignToAbs(max, v);
+  const auto added = large + v;
+  const auto rounded = added - large;
+
+  // Keep original if NaN or the magnitude is large (already an int).
+  return IfThenElse(Abs(v) < max, rounded, v);
 }

+namespace detail {
+
+// Truncating to integer and converting back to float is correct except when the
+// input magnitude is large, in which case the input was already an integer
+// (because mantissa >> exponent is zero).
+template <size_t N>
+HWY_API Mask128<float, N> UseInt(const Vec128<float, N> v) {
+  return Abs(v) < Set(Simd<float, N>(), MantissaEnd<float>());
+}
+
+}  // namespace detail
+
 // Toward zero, aka truncate
 template <size_t N>
 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
   // TODO(eustas): is it f32x4.trunc? (not implemented yet)
-  alignas(16) float input[4];
-  alignas(16) float output[4];
-  wasm_v128_store(input, v.raw);
-  for (size_t i = 0; i < 4; ++i) {
-    output[i] = std::trunc(input[i]);
-  }
-  return Vec128<float, N>{wasm_v128_load(output)};
+  const Simd<float, N> df;
+  const RebindToSigned<decltype(df)> di;
+
+  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto int_f = ConvertTo(df, integer);
+
+  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
 }

 // Toward +infinity, aka ceiling
 template <size_t N>
-HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
+HWY_INLINE Vec128<float, N> Ceil(const Vec128<float, N> v) {
   // TODO(eustas): is it f32x4.ceil? (not implemented yet)
-  alignas(16) float input[4];
-  alignas(16) float output[4];
-  wasm_v128_store(input, v.raw);
-  for (size_t i = 0; i < 4; ++i) {
-    output[i] = std::ceil(input[i]);
-  }
-  return Vec128<float, N>{wasm_v128_load(output)};
+  const Simd<float, N> df;
+  const RebindToSigned<decltype(df)> di;
+
+  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto int_f = ConvertTo(df, integer);
+
+  // Truncating a positive non-integer ends up smaller; if so, add 1.
+  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
+
+  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
 }

 // Toward -infinity, aka floor
 template <size_t N>
-HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
+HWY_INLINE Vec128<float, N> Floor(const Vec128<float, N> v) {
   // TODO(eustas): is it f32x4.floor? (not implemented yet)
-  alignas(16) float input[4];
-  alignas(16) float output[4];
-  wasm_v128_store(input, v.raw);
-  for (size_t i = 0; i < 4; ++i) {
-    output[i] = std::floor(input[i]);
-  }
-  return Vec128<float, N>{wasm_v128_load(output)};
+  const Simd<float, N> df;
+  const RebindToSigned<decltype(df)> di;
+
+  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto int_f = ConvertTo(df, integer);
+
+  // Truncating a negative non-integer ends up larger; if so, subtract 1.
+  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
+
+  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
 }

 // ================================================== COMPARE
@@ -902,12 +997,12 @@ HWY_API Mask128<int64_t, N> operator>(co

   // Otherwise, the lower half decides.
   const auto m_eq = a32 == b32;
-  const auto lo_in_hi = wasm_v32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
+  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
   const auto lo_gt = And(m_eq, lo_in_hi);

   const auto gt = Or(lo_gt, m_gt);
   // Copy result in upper 32 bits to lower 32 bits.
-  return Mask128<int64_t, N>{wasm_v32x4_shuffle(gt, gt, 3, 3, 1, 1)};
+  return Mask128<int64_t, N>{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)};
 }

 template <size_t N>
@@ -935,6 +1030,14 @@ HWY_API Mask128<float, N> operator>=(con
   return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
 }

+// ------------------------------ FirstN (Iota, Lt)
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
+  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
+  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
+}
+
 // ================================================== LOGICAL

 // ------------------------------ Not
@@ -1015,7 +1118,7 @@ HWY_API Vec128<T, N> BroadcastSignBit(co
 }
 template <size_t N>
 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
-  return VecFromMask(v < Zero(Simd<int8_t, N>()));
+  return VecFromMask(Simd<int8_t, N>(), v < Zero(Simd<int8_t, N>()));
 }

 // ------------------------------ Mask
@@ -1278,26 +1381,73 @@ HWY_API void Stream(Vec128<T, N> v, Simd
   wasm_v128_store(aligned, v.raw);
 }

-// ------------------------------ Gather
+// ------------------------------ Scatter (Store)
+
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+                           const Vec128<Offset, N> offset) {
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+
+  alignas(16) T lanes[N];
+  Store(v, d, lanes);
+
+  alignas(16) Offset offset_lanes[N];
+  Store(offset, Simd<Offset, N>(), offset_lanes);
+
+  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
+  for (size_t i = 0; i < N; ++i) {
+    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
+  }
+}
+
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+                          const Vec128<Index, N> index) {
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+
+  alignas(16) T lanes[N];
+  Store(v, d, lanes);
+
+  alignas(16) Index index_lanes[N];
+  Store(index, Simd<Index, N>(), index_lanes);
+
+  for (size_t i = 0; i < N; ++i) {
+    base[index_lanes[i]] = lanes[i];
+  }
+}
+
+// ------------------------------ Gather (Load/Store)

 template <typename T, size_t N, typename Offset>
 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
                                   const T* HWY_RESTRICT base,
                                   const Vec128<Offset, N> offset) {
-  static_assert(N == 1, "Wasm does not support full gather");
-  static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
-  const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
-  T val;
-  CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
-  return Set(d, val);
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+
+  alignas(16) Offset offset_lanes[N];
+  Store(offset, Simd<Offset, N>(), offset_lanes);
+
+  alignas(16) T lanes[N];
+  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
+  for (size_t i = 0; i < N; ++i) {
+    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
+  }
+  return Load(d, lanes);
 }

 template <typename T, size_t N, typename Index>
 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
                                  const Vec128<Index, N> index) {
-  static_assert(N == 1, "Wasm does not support full gather");
-  static_assert(sizeof(T) == sizeof(Index), "T must match Index");
-  return Set(d, base[GetLane(index)]);
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+
+  alignas(16) Index index_lanes[N];
+  Store(index, Simd<Index, N>(), index_lanes);
+
+  alignas(16) T lanes[N];
+  for (size_t i = 0; i < N; ++i) {
+    lanes[i] = base[index_lanes[i]];
+  }
+  return Load(d, lanes);
 }

 // ================================================== SWIZZLE
@@ -1346,12 +1496,12 @@ HWY_API Vec128<T, N / 2> LowerHalf(Vec12
 template <typename T>
 HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Vec128<T> v) {
   // TODO(eustas): use swizzle?
-  return Vec128<T, 8 / sizeof(T)>{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
+  return Vec128<T, 8 / sizeof(T)>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
 }
 template <>
 HWY_INLINE Vec128<float, 2> UpperHalf(Vec128<float> v) {
   // TODO(eustas): use swizzle?
-  return Vec128<float, 2>{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
+  return Vec128<float, 2>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
 }

 // ------------------------------ Shift vector by constant #bytes
@@ -1366,64 +1516,64 @@ HWY_API Vec128<T> ShiftLeftBytes(const V
       return v;

     case 1:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
                                           7, 8, 9, 10, 11, 12, 13, 14)};

     case 2:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
                                           6, 7, 8, 9, 10, 11, 12, 13)};

     case 3:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
                                           4, 5, 6, 7, 8, 9, 10, 11, 12)};

     case 4:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
                                           3, 4, 5, 6, 7, 8, 9, 10, 11)};

     case 5:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
                                           2, 3, 4, 5, 6, 7, 8, 9, 10)};

     case 6:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};

     case 7:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};

     case 8:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};

     case 9:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};

     case 10:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};

     case 11:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};

     case 12:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};

     case 13:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};

     case 14:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           16, 16, 16, 16, 16, 16, 16, 16, 0,
                                           1)};

     case 15:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                           16, 16, 16, 16, 16, 16, 16, 16, 16,
                                           0)};
   }
@@ -1447,69 +1597,69 @@ HWY_API Vec128<T> ShiftRightBytes(const
       return v;

     case 1:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8,
                                           9, 10, 11, 12, 13, 14, 15, 16)};

     case 2:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9,
                                           10, 11, 12, 13, 14, 15, 16, 16)};

     case 3:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10,
                                           11, 12, 13, 14, 15, 16, 16, 16)};

     case 4:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11,
                                           12, 13, 14, 15, 16, 16, 16, 16)};

     case 5:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11,
                                           12, 13, 14, 15, 16, 16, 16, 16, 16)};

     case 6:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12,
                                           13, 14, 15, 16, 16, 16, 16, 16, 16)};

     case 7:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13,
                                           14, 15, 16, 16, 16, 16, 16, 16, 16)};

     case 8:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14,
                                           15, 16, 16, 16, 16, 16, 16, 16, 16)};

     case 9:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14,
                                           15, 16, 16, 16, 16, 16, 16, 16, 16,
                                           16)};

     case 10:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15,
                                           16, 16, 16, 16, 16, 16, 16, 16, 16,
                                           16)};

     case 11:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16,
                                           16, 16, 16, 16, 16, 16, 16, 16, 16,
                                           16)};

     case 12:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16,
                                           16, 16, 16, 16, 16, 16, 16, 16, 16,
                                           16)};

     case 13:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16,
                                           16, 16, 16, 16, 16, 16, 16, 16, 16,
                                           16)};

     case 14:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16,
                                           16, 16, 16, 16, 16, 16, 16, 16, 16,
                                           16)};

     case 15:
-      return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16,
                                           16, 16, 16, 16, 16, 16, 16, 16, 16,
                                           16)};
   }
@@ -1535,72 +1685,72 @@ HWY_API Vec128<T> CombineShiftRightBytes
       return lo;

     case 1:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
                                           8, 9, 10, 11, 12, 13, 14, 15, 16)};

     case 2:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
                                           9, 10, 11, 12, 13, 14, 15, 16, 17)};

     case 3:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
                                           10, 11, 12, 13, 14, 15, 16, 17, 18)};

     case 4:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
                                           11, 12, 13, 14, 15, 16, 17, 18, 19)};

     case 5:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
                                           12, 13, 14, 15, 16, 17, 18, 19, 20)};

     case 6:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
                                           12, 13, 14, 15, 16, 17, 18, 19, 20,
                                           21)};

     case 7:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
                                           13, 14, 15, 16, 17, 18, 19, 20, 21,
                                           22)};

     case 8:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
                                           14, 15, 16, 17, 18, 19, 20, 21, 22,
                                           23)};

     case 9:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
                                           15, 16, 17, 18, 19, 20, 21, 22, 23,
                                           24)};

     case 10:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
                                           15, 16, 17, 18, 19, 20, 21, 22, 23,
                                           24, 25)};

     case 11:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
                                           16, 17, 18, 19, 20, 21, 22, 23, 24,
                                           25, 26)};

     case 12:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
                                           17, 18, 19, 20, 21, 22, 23, 24, 25,
                                           26, 27)};

     case 13:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
                                           18, 19, 20, 21, 22, 23, 24, 25, 26,
                                           27, 28)};

     case 14:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
                                           19, 20, 21, 22, 23, 24, 25, 26, 27,
                                           28, 29)};

     case 15:
-      return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
+      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
                                           20, 21, 22, 23, 24, 25, 26, 27, 28,
                                           29, 30)};
   }
@@ -1613,28 +1763,28 @@ HWY_API Vec128<T> CombineShiftRightBytes
 template <int kLane, size_t N>
 HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
   static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<uint16_t, N>{wasm_v16x8_shuffle(
+  return Vec128<uint16_t, N>{wasm_i16x8_shuffle(
       v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
 }
 template <int kLane, size_t N>
 HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
   static_assert(0 <= kLane && kLane < N, "Invalid lane");
   return Vec128<uint32_t, N>{
-      wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
+      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
 }

 // Signed
 template <int kLane, size_t N>
 HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
   static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<int16_t, N>{wasm_v16x8_shuffle(
+  return Vec128<int16_t, N>{wasm_i16x8_shuffle(
       v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
 }
 template <int kLane, size_t N>
 HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
   static_assert(0 <= kLane && kLane < N, "Invalid lane");
   return Vec128<int32_t, N>{
-      wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
+      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
 }

 // Float
@@ -1642,7 +1792,7 @@ template <int kLane, size_t N>
 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
   static_assert(0 <= kLane && kLane < N, "Invalid lane");
   return Vec128<float, N>{
-      wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
+      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
 }

 // ------------------------------ Shuffle bytes with variable indices
@@ -1652,16 +1802,23 @@ HWY_API Vec128<float, N> Broadcast(const
 template <typename T, size_t N>
 HWY_API Vec128<T, N> TableLookupBytes(const Vec128<T, N> bytes,
                                       const Vec128<T, N> from) {
-  // TODO(eustas): use swizzle? (shuffle does not work for variable indices)
+// Not yet available in all engines, see
+// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
+// V8 implementation of this had a bug, fixed on 2021-04-03:
+// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
+#if 0
+  return Vec128<T, N>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
+#else
   alignas(16) uint8_t control[16];
   alignas(16) uint8_t input[16];
   alignas(16) uint8_t output[16];
   wasm_v128_store(control, from.raw);
   wasm_v128_store(input, bytes.raw);
   for (size_t i = 0; i < 16; ++i) {
-    output[i] = input[control[i]];
+    output[i] = control[i] < 16 ? input[control[i]] : 0;
   }
   return Vec128<T, N>{wasm_v128_load(output)};
+#endif
 }

 // ------------------------------ Hard-coded shuffles
@@ -1673,101 +1830,102 @@ HWY_API Vec128<T, N> TableLookupBytes(co

 // Swap 32-bit halves in 64-bit halves.
 HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
+  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
 }
 HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
+  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
 }
 HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
-  return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
+  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
 }

 // Swap 64-bit halves
 HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)};
+  return Vec128<uint32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
 }
 HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)};
+  return Vec128<int32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
 }
 HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
-  return Vec128<float>{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)};
+  return Vec128<float>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
 }

 // Rotate right 32 bits
 HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
+  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
 }
 HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
+  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
 }
 HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
-  return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
+  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
 }
 // Rotate left 32 bits
 HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
+  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
 }
 HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
+  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
 }
 HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
-  return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
+  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
 }

 // Reverse
 HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
+  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
 }
 HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
+  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
 }
 HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
-  return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
+  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
 }

 // ------------------------------ TableLookupLanes

 // Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T>
+template <typename T, size_t N>
 struct Indices128 {
   __v128_u raw;
 };

-template <typename T>
-HWY_API Indices128<T> SetTableIndices(Full128<T>, const int32_t* idx) {
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
 #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
-  const size_t N = 16 / sizeof(T);
   for (size_t i = 0; i < N; ++i) {
     HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
   }
 #endif

-  const Full128<uint8_t> d8;
-  alignas(16) uint8_t control[16];  // = Lanes()
-  for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
-    const size_t idx_lane = idx_byte / sizeof(T);
-    const size_t mod = idx_byte % sizeof(T);
-    control[idx_byte] = idx[idx_lane] * sizeof(T) + mod;
+  const Repartition<uint8_t, decltype(d)> d8;
+  alignas(16) uint8_t control[16] = {0};
+  for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
+    for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+      control[idx_lane * sizeof(T) + idx_byte] =
+          static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
+    }
   }
-  return Indices128<T>{Load(d8, control).raw};
+  return Indices128<T, N>{Load(d8, control).raw};
 }

-HWY_API Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
-                                          const Indices128<uint32_t> idx) {
-  return TableLookupBytes(v, Vec128<uint32_t>{idx.raw});
+template <size_t N>
+HWY_API Vec128<uint32_t, N> TableLookupLanes(
+    const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
+  return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
 }
-
-HWY_API Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
-                                         const Indices128<int32_t> idx) {
-  return TableLookupBytes(v, Vec128<int32_t>{idx.raw});
+template <size_t N>
+HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
+                                            const Indices128<int32_t, N> idx) {
+  return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
 }
-
-HWY_API Vec128<float> TableLookupLanes(const Vec128<float> v,
-                                       const Indices128<float> idx) {
-  const Full128<int32_t> di;
-  const Full128<float> df;
+template <size_t N>
+HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
+                                          const Indices128<float, N> idx) {
+  const Simd<int32_t, N> di;
+  const Simd<float, N> df;
   return BitCast(df,
-                 TableLookupBytes(BitCast(di, v), Vec128<int32_t>{idx.raw}));
+                 TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
 }

 // ------------------------------ Zip lanes
@@ -1778,33 +1936,33 @@ HWY_API Vec128<float> TableLookupLanes(c
 template <size_t N>
 HWY_API Vec128<uint16_t, (N + 1) / 2> ZipLower(const Vec128<uint8_t, N> a,
                                                const Vec128<uint8_t, N> b) {
-  return Vec128<uint16_t, (N + 1) / 2>{wasm_v8x16_shuffle(
+  return Vec128<uint16_t, (N + 1) / 2>{wasm_i8x16_shuffle(
       a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
 }
 template <size_t N>
 HWY_API Vec128<uint32_t, (N + 1) / 2> ZipLower(const Vec128<uint16_t, N> a,
                                                const Vec128<uint16_t, N> b) {
   return Vec128<uint32_t, (N + 1) / 2>{
-      wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
+      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
 }

 template <size_t N>
 HWY_API Vec128<int16_t, (N + 1) / 2> ZipLower(const Vec128<int8_t, N> a,
                                               const Vec128<int8_t, N> b) {
-  return Vec128<int16_t, (N + 1) / 2>{wasm_v8x16_shuffle(
+  return Vec128<int16_t, (N + 1) / 2>{wasm_i8x16_shuffle(
       a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
 }
 template <size_t N>
 HWY_API Vec128<int32_t, (N + 1) / 2> ZipLower(const Vec128<int16_t, N> a,
                                               const Vec128<int16_t, N> b) {
   return Vec128<int32_t, (N + 1) / 2>{
-      wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
+      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
 }

 template <size_t N>
 HWY_API Vec128<uint16_t, N / 2> ZipUpper(const Vec128<uint8_t, N> a,
                                          const Vec128<uint8_t, N> b) {
-  return Vec128<uint16_t, N / 2>{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
+  return Vec128<uint16_t, N / 2>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
                                                     10, 26, 11, 27, 12, 28, 13,
                                                     29, 14, 30, 15, 31)};
 }
@@ -1812,13 +1970,13 @@ template <size_t N>
 HWY_API Vec128<uint32_t, N / 2> ZipUpper(const Vec128<uint16_t, N> a,
                                          const Vec128<uint16_t, N> b) {
   return Vec128<uint32_t, N / 2>{
-      wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
+      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
 }

 template <size_t N>
 HWY_API Vec128<int16_t, N / 2> ZipUpper(const Vec128<int8_t, N> a,
                                         const Vec128<int8_t, N> b) {
-  return Vec128<int16_t, N / 2>{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
+  return Vec128<int16_t, N / 2>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
                                                    10, 26, 11, 27, 12, 28, 13,
                                                    29, 14, 30, 15, 31)};
 }
@@ -1826,7 +1984,7 @@ template <size_t N>
 HWY_API Vec128<int32_t, N / 2> ZipUpper(const Vec128<int16_t, N> a,
                                         const Vec128<int16_t, N> b) {
   return Vec128<int32_t, N / 2>{
-      wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
+      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
 }

 // ------------------------------ Interleave lanes
@@ -1842,17 +2000,17 @@ HWY_API Vec128<T> InterleaveLower(const
 template <>
 HWY_INLINE Vec128<uint32_t> InterleaveLower<uint32_t>(
     const Vec128<uint32_t> a, const Vec128<uint32_t> b) {
-  return Vec128<uint32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
+  return Vec128<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
 }
 template <>
 HWY_INLINE Vec128<int32_t> InterleaveLower<int32_t>(const Vec128<int32_t> a,
                                                     const Vec128<int32_t> b) {
-  return Vec128<int32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
+  return Vec128<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
 }
 template <>
 HWY_INLINE Vec128<float> InterleaveLower<float>(const Vec128<float> a,
                                                 const Vec128<float> b) {
-  return Vec128<float>{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
+  return Vec128<float>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
 }

 template <typename T>
@@ -1862,17 +2020,17 @@ HWY_API Vec128<T> InterleaveUpper(const
 template <>
 HWY_INLINE Vec128<uint32_t> InterleaveUpper<uint32_t>(
     const Vec128<uint32_t> a, const Vec128<uint32_t> b) {
-  return Vec128<uint32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
+  return Vec128<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
 }
 template <>
 HWY_INLINE Vec128<int32_t> InterleaveUpper<int32_t>(const Vec128<int32_t> a,
                                                     const Vec128<int32_t> b) {
-  return Vec128<int32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
+  return Vec128<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
 }
 template <>
 HWY_INLINE Vec128<float> InterleaveUpper<float>(const Vec128<float> a,
                                                 const Vec128<float> b) {
-  return Vec128<float>{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
+  return Vec128<float>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
 }

 // ------------------------------ Blocks
@@ -1880,13 +2038,13 @@ HWY_INLINE Vec128<float> InterleaveUpper
 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
 template <typename T>
 HWY_API Vec128<T> ConcatLowerLower(const Vec128<T> hi, const Vec128<T> lo) {
-  return Vec128<T>{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 2)};
+  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
 }

 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
 template <typename T>
 HWY_API Vec128<T> ConcatUpperUpper(const Vec128<T> hi, const Vec128<T> lo) {
-  return Vec128<T>{wasm_v64x2_shuffle(lo.raw, hi.raw, 1, 3)};
+  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
 }

 // hiH,hiL loH,loL |-> hiL,loH (= inner halves)
@@ -1898,7 +2056,7 @@ HWY_API Vec128<T> ConcatLowerUpper(const
 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
 template <typename T>
 HWY_API Vec128<T> ConcatUpperLower(const Vec128<T> hi, const Vec128<T> lo) {
-  return Vec128<T>{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 3)};
+  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 3)};
 }

 // ------------------------------ Odd/even lanes
@@ -1917,12 +2075,12 @@ HWY_API Vec128<T> odd_even_impl(hwy::Siz
 template <typename T>
 HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<2> /* tag */, const Vec128<T> a,
                                 const Vec128<T> b) {
-  return Vec128<T>{wasm_v16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
+  return Vec128<T>{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
 }
 template <typename T>
 HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<4> /* tag */, const Vec128<T> a,
                                 const Vec128<T> b) {
-  return Vec128<T>{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
+  return Vec128<T>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
 }
 // TODO(eustas): implement
 // template <typename T>
@@ -1939,7 +2097,7 @@ HWY_API Vec128<T> OddEven(const Vec128<T
 template <>
 HWY_INLINE Vec128<float> OddEven<float>(const Vec128<float> a,
                                         const Vec128<float> b) {
-  return Vec128<float>{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
+  return Vec128<float>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
 }

 // ================================================== CONVERT
@@ -1950,52 +2108,52 @@ HWY_INLINE Vec128<float> OddEven<float>(
 template <size_t N>
 HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
                                       const Vec128<uint8_t, N> v) {
-  return Vec128<uint16_t, N>{wasm_i16x8_widen_low_u8x16(v.raw)};
+  return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
 }
 template <size_t N>
 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
                                       const Vec128<uint8_t, N> v) {
   return Vec128<uint32_t, N>{
-      wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))};
+      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
 }
 template <size_t N>
 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
                                      const Vec128<uint8_t, N> v) {
-  return Vec128<int16_t, N>{wasm_i16x8_widen_low_u8x16(v.raw)};
+  return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
 }
 template <size_t N>
 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                      const Vec128<uint8_t, N> v) {
   return Vec128<int32_t, N>{
-      wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))};
+      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
 }
 template <size_t N>
 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
                                       const Vec128<uint16_t, N> v) {
-  return Vec128<uint32_t, N>{wasm_i32x4_widen_low_u16x8(v.raw)};
+  return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
 }
 template <size_t N>
 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                      const Vec128<uint16_t, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_widen_low_u16x8(v.raw)};
+  return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
 }

 // Signed: replicate sign bit.
 template <size_t N>
 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
                                      const Vec128<int8_t, N> v) {
-  return Vec128<int16_t, N>{wasm_i16x8_widen_low_i8x16(v.raw)};
+  return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
 }
 template <size_t N>
 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                      const Vec128<int8_t, N> v) {
   return Vec128<int32_t, N>{
-      wasm_i32x4_widen_low_i16x8(wasm_i16x8_widen_low_i8x16(v.raw))};
+      wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
 }
 template <size_t N>
 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                      const Vec128<int16_t, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_widen_low_i16x8(v.raw)};
+  return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
 }

 template <size_t N>
@@ -2122,7 +2280,7 @@ HWY_API Vec128<uint8_t, N> U8FromU32(con
       wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
 }

-// ------------------------------ Convert i32 <=> f32
+// ------------------------------ Convert i32 <=> f32 (Round)

 template <size_t N>
 HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
@@ -2133,33 +2291,16 @@ HWY_API Vec128<float, N> ConvertTo(Simd<
 template <size_t N>
 HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N> /* tag */,
                                      const Vec128<float, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_trunc_saturate_f32x4(v.raw)};
+  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
 }

 template <size_t N>
 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
-  const __f32x4 c00 = wasm_f32x4_splat(0.0f);
-  const __f32x4 corr = wasm_f32x4_convert_i32x4(wasm_f32x4_le(v.raw, c00));
-  const __f32x4 c05 = wasm_f32x4_splat(0.5f);
-  // +0.5 for non-negative lane, -0.5 for other.
-  const __f32x4 delta = wasm_f32x4_add(c05, corr);
-  // Shift input by 0.5 away from 0.
-  const __f32x4 fixed = wasm_f32x4_add(v.raw, delta);
-  return Vec128<int32_t, N>{wasm_i32x4_trunc_saturate_f32x4(fixed)};
+  return ConvertTo(Simd<int32_t, N>(), Round(v));
 }

 // ================================================== MISC

-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, size_t N, typename T2>
-Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
-  HWY_ALIGN T lanes[16 / sizeof(T)];
-  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
-    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
-  }
-  return Load(d, lanes);
-}
-
 // ------------------------------ Mask

 namespace detail {
@@ -2167,20 +2308,13 @@ namespace detail {
 template <typename T, size_t N>
 HWY_API uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
                               const Mask128<T, N> mask) {
-  const __i8x16 slice =
-      wasm_i8x16_make(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8);
-  // Each u32 lane has byte[i] = (1 << i) or 0.
-  const __i8x16 v8_4_2_1 = wasm_v128_and(mask.raw, slice);
-  // OR together 4 bytes of each u32 to get the 4 bits.
-  const __i16x8 v2_1_z_z = wasm_i32x4_shl(v8_4_2_1, 16);
-  const __i16x8 v82_41_2_1 = wasm_v128_or(v8_4_2_1, v2_1_z_z);
-  const __i16x8 v41_2_1_0 = wasm_i32x4_shl(v82_41_2_1, 8);
-  const __i16x8 v8421_421_21_10 = wasm_v128_or(v82_41_2_1, v41_2_1_0);
-  const __i16x8 nibble_per_u32 = wasm_i32x4_shr(v8421_421_21_10, 24);
-  // Assemble four nibbles into 16 bits.
-  alignas(16) uint32_t lanes[4];
-  wasm_v128_store(lanes, nibble_per_u32);
-  return lanes[0] | (lanes[1] << 4) | (lanes[2] << 8) | (lanes[3] << 12);
+  alignas(16) uint64_t lanes[2];
+  wasm_v128_store(lanes, mask.raw);
+
+  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
+  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
+  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
+  return (hi + lo);
 }

 template <typename T, size_t N>
@@ -2241,8 +2375,7 @@ constexpr __i8x16 BytesAbove() {

 template <typename T, size_t N>
 HWY_API uint64_t BitsFromMask(const Mask128<T, N> mask) {
-  return OnlyActive<T, N>(
-      BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
+  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
 }

 template <typename T>
@@ -2290,7 +2423,15 @@ HWY_API size_t CountTrue(const Mask128<T
 // Full vector, type-independent
 template <typename T>
 HWY_API bool AllFalse(const Mask128<T> m) {
-  return !wasm_i8x16_any_true(m.raw);
+#if 0
+  // Casting followed by wasm_i8x16_any_true results in wasm error:
+  // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
+  const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(Full128<T>(), m));
+  return !wasm_i8x16_any_true(v8.raw);
+#else
+  return (wasm_i64x2_extract_lane(m.raw, 0) |
+          wasm_i64x2_extract_lane(m.raw, 1)) == 0;
+#endif
 }

 // Full vector, type-dependent
@@ -2336,6 +2477,139 @@ HWY_API bool AllTrue(const Mask128<T, N>
 namespace detail {

 template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 256);
+  const Simd<T, N> d;
+  const Rebind<uint8_t, decltype(d)> d8;
+  const Simd<uint16_t, N> du;
+
+  // We need byte indices for TableLookupBytes (one vector's worth for each of
+  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
+  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
+  // with the doubling baked into the table. Unpacking nibbles is likely more
+  // costly than the higher cache footprint from storing bytes.
+  alignas(16) constexpr uint8_t table[256 * 8] = {
+      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,
+      0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,
+      0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  2,  4,  0,  0,  0,  0,
+      0,  0,  0,  2,  4,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,
+      0,  6,  0,  0,  0,  0,  0,  0,  2,  6,  0,  0,  0,  0,  0,  0,  0,  2,
+      6,  0,  0,  0,  0,  0,  4,  6,  0,  0,  0,  0,  0,  0,  0,  4,  6,  0,
+      0,  0,  0,  0,  2,  4,  6,  0,  0,  0,  0,  0,  0,  2,  4,  6,  0,  0,
+      0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0,  0,
+      2,  8,  0,  0,  0,  0,  0,  0,  0,  2,  8,  0,  0,  0,  0,  0,  4,  8,
+      0,  0,  0,  0,  0,  0,  0,  4,  8,  0,  0,  0,  0,  0,  2,  4,  8,  0,
+      0,  0,  0,  0,  0,  2,  4,  8,  0,  0,  0,  0,  6,  8,  0,  0,  0,  0,
+      0,  0,  0,  6,  8,  0,  0,  0,  0,  0,  2,  6,  8,  0,  0,  0,  0,  0,
+      0,  2,  6,  8,  0,  0,  0,  0,  4,  6,  8,  0,  0,  0,  0,  0,  0,  4,
+      6,  8,  0,  0,  0,  0,  2,  4,  6,  8,  0,  0,  0,  0,  0,  2,  4,  6,
+      8,  0,  0,  0,  10, 0,  0,  0,  0,  0,  0,  0,  0,  10, 0,  0,  0,  0,
+      0,  0,  2,  10, 0,  0,  0,  0,  0,  0,  0,  2,  10, 0,  0,  0,  0,  0,
+      4,  10, 0,  0,  0,  0,  0,  0,  0,  4,  10, 0,  0,  0,  0,  0,  2,  4,
+      10, 0,  0,  0,  0,  0,  0,  2,  4,  10, 0,  0,  0,  0,  6,  10, 0,  0,
+      0,  0,  0,  0,  0,  6,  10, 0,  0,  0,  0,  0,  2,  6,  10, 0,  0,  0,
+      0,  0,  0,  2,  6,  10, 0,  0,  0,  0,  4,  6,  10, 0,  0,  0,  0,  0,
+      0,  4,  6,  10, 0,  0,  0,  0,  2,  4,  6,  10, 0,  0,  0,  0,  0,  2,
+      4,  6,  10, 0,  0,  0,  8,  10, 0,  0,  0,  0,  0,  0,  0,  8,  10, 0,
+      0,  0,  0,  0,  2,  8,  10, 0,  0,  0,  0,  0,  0,  2,  8,  10, 0,  0,
+      0,  0,  4,  8,  10, 0,  0,  0,  0,  0,  0,  4,  8,  10, 0,  0,  0,  0,
+      2,  4,  8,  10, 0,  0,  0,  0,  0,  2,  4,  8,  10, 0,  0,  0,  6,  8,
+      10, 0,  0,  0,  0,  0,  0,  6,  8,  10, 0,  0,  0,  0,  2,  6,  8,  10,
+      0,  0,  0,  0,  0,  2,  6,  8,  10, 0,  0,  0,  4,  6,  8,  10, 0,  0,
+      0,  0,  0,  4,  6,  8,  10, 0,  0,  0,  2,  4,  6,  8,  10, 0,  0,  0,
+      0,  2,  4,  6,  8,  10, 0,  0,  12, 0,  0,  0,  0,  0,  0,  0,  0,  12,
+      0,  0,  0,  0,  0,  0,  2,  12, 0,  0,  0,  0,  0,  0,  0,  2,  12, 0,
+      0,  0,  0,  0,  4,  12, 0,  0,  0,  0,  0,  0,  0,  4,  12, 0,  0,  0,
+      0,  0,  2,  4,  12, 0,  0,  0,  0,  0,  0,  2,  4,  12, 0,  0,  0,  0,
+      6,  12, 0,  0,  0,  0,  0,  0,  0,  6,  12, 0,  0,  0,  0,  0,  2,  6,
+      12, 0,  0,  0,  0,  0,  0,  2,  6,  12, 0,  0,  0,  0,  4,  6,  12, 0,
+      0,  0,  0,  0,  0,  4,  6,  12, 0,  0,  0,  0,  2,  4,  6,  12, 0,  0,
+      0,  0,  0,  2,  4,  6,  12, 0,  0,  0,  8,  12, 0,  0,  0,  0,  0,  0,
+      0,  8,  12, 0,  0,  0,  0,  0,  2,  8,  12, 0,  0,  0,  0,  0,  0,  2,
+      8,  12, 0,  0,  0,  0,  4,  8,  12, 0,  0,  0,  0,  0,  0,  4,  8,  12,
+      0,  0,  0,  0,  2,  4,  8,  12, 0,  0,  0,  0,  0,  2,  4,  8,  12, 0,
+      0,  0,  6,  8,  12, 0,  0,  0,  0,  0,  0,  6,  8,  12, 0,  0,  0,  0,
+      2,  6,  8,  12, 0,  0,  0,  0,  0,  2,  6,  8,  12, 0,  0,  0,  4,  6,
+      8,  12, 0,  0,  0,  0,  0,  4,  6,  8,  12, 0,  0,  0,  2,  4,  6,  8,
+      12, 0,  0,  0,  0,  2,  4,  6,  8,  12, 0,  0,  10, 12, 0,  0,  0,  0,
+      0,  0,  0,  10, 12, 0,  0,  0,  0,  0,  2,  10, 12, 0,  0,  0,  0,  0,
+      0,  2,  10, 12, 0,  0,  0,  0,  4,  10, 12, 0,  0,  0,  0,  0,  0,  4,
+      10, 12, 0,  0,  0,  0,  2,  4,  10, 12, 0,  0,  0,  0,  0,  2,  4,  10,
+      12, 0,  0,  0,  6,  10, 12, 0,  0,  0,  0,  0,  0,  6,  10, 12, 0,  0,
+      0,  0,  2,  6,  10, 12, 0,  0,  0,  0,  0,  2,  6,  10, 12, 0,  0,  0,
+      4,  6,  10, 12, 0,  0,  0,  0,  0,  4,  6,  10, 12, 0,  0,  0,  2,  4,
+      6,  10, 12, 0,  0,  0,  0,  2,  4,  6,  10, 12, 0,  0,  8,  10, 12, 0,
+      0,  0,  0,  0,  0,  8,  10, 12, 0,  0,  0,  0,  2,  8,  10, 12, 0,  0,
+      0,  0,  0,  2,  8,  10, 12, 0,  0,  0,  4,  8,  10, 12, 0,  0,  0,  0,
+      0,  4,  8,  10, 12, 0,  0,  0,  2,  4,  8,  10, 12, 0,  0,  0,  0,  2,
+      4,  8,  10, 12, 0,  0,  6,  8,  10, 12, 0,  0,  0,  0,  0,  6,  8,  10,
+      12, 0,  0,  0,  2,  6,  8,  10, 12, 0,  0,  0,  0,  2,  6,  8,  10, 12,
+      0,  0,  4,  6,  8,  10, 12, 0,  0,  0,  0,  4,  6,  8,  10, 12, 0,  0,
+      2,  4,  6,  8,  10, 12, 0,  0,  0,  2,  4,  6,  8,  10, 12, 0,  14, 0,
+      0,  0,  0,  0,  0,  0,  0,  14, 0,  0,  0,  0,  0,  0,  2,  14, 0,  0,
+      0,  0,  0,  0,  0,  2,  14, 0,  0,  0,  0,  0,  4,  14, 0,  0,  0,  0,
+      0,  0,  0,  4,  14, 0,  0,  0,  0,  0,  2,  4,  14, 0,  0,  0,  0,  0,
+      0,  2,  4,  14, 0,  0,  0,  0,  6,  14, 0,  0,  0,  0,  0,  0,  0,  6,
+      14, 0,  0,  0,  0,  0,  2,  6,  14, 0,  0,  0,  0,  0,  0,  2,  6,  14,
+      0,  0,  0,  0,  4,  6,  14, 0,  0,  0,  0,  0,  0,  4,  6,  14, 0,  0,
+      0,  0,  2,  4,  6,  14, 0,  0,  0,  0,  0,  2,  4,  6,  14, 0,  0,  0,
+      8,  14, 0,  0,  0,  0,  0,  0,  0,  8,  14, 0,  0,  0,  0,  0,  2,  8,
+      14, 0,  0,  0,  0,  0,  0,  2,  8,  14, 0,  0,  0,  0,  4,  8,  14, 0,
+      0,  0,  0,  0,  0,  4,  8,  14, 0,  0,  0,  0,  2,  4,  8,  14, 0,  0,
+      0,  0,  0,  2,  4,  8,  14, 0,  0,  0,  6,  8,  14, 0,  0,  0,  0,  0,
+      0,  6,  8,  14, 0,  0,  0,  0,  2,  6,  8,  14, 0,  0,  0,  0,  0,  2,
+      6,  8,  14, 0,  0,  0,  4,  6,  8,  14, 0,  0,  0,  0,  0,  4,  6,  8,
+      14, 0,  0,  0,  2,  4,  6,  8,  14, 0,  0,  0,  0,  2,  4,  6,  8,  14,
+      0,  0,  10, 14, 0,  0,  0,  0,  0,  0,  0,  10, 14, 0,  0,  0,  0,  0,
+      2,  10, 14, 0,  0,  0,  0,  0,  0,  2,  10, 14, 0,  0,  0,  0,  4,  10,
+      14, 0,  0,  0,  0,  0,  0,  4,  10, 14, 0,  0,  0,  0,  2,  4,  10, 14,
+      0,  0,  0,  0,  0,  2,  4,  10, 14, 0,  0,  0,  6,  10, 14, 0,  0,  0,
+      0,  0,  0,  6,  10, 14, 0,  0,  0,  0,  2,  6,  10, 14, 0,  0,  0,  0,
+      0,  2,  6,  10, 14, 0,  0,  0,  4,  6,  10, 14, 0,  0,  0,  0,  0,  4,
+      6,  10, 14, 0,  0,  0,  2,  4,  6,  10, 14, 0,  0,  0,  0,  2,  4,  6,
+      10, 14, 0,  0,  8,  10, 14, 0,  0,  0,  0,  0,  0,  8,  10, 14, 0,  0,
+      0,  0,  2,  8,  10, 14, 0,  0,  0,  0,  0,  2,  8,  10, 14, 0,  0,  0,
+      4,  8,  10, 14, 0,  0,  0,  0,  0,  4,  8,  10, 14, 0,  0,  0,  2,  4,
+      8,  10, 14, 0,  0,  0,  0,  2,  4,  8,  10, 14, 0,  0,  6,  8,  10, 14,
+      0,  0,  0,  0,  0,  6,  8,  10, 14, 0,  0,  0,  2,  6,  8,  10, 14, 0,
+      0,  0,  0,  2,  6,  8,  10, 14, 0,  0,  4,  6,  8,  10, 14, 0,  0,  0,
+      0,  4,  6,  8,  10, 14, 0,  0,  2,  4,  6,  8,  10, 14, 0,  0,  0,  2,
+      4,  6,  8,  10, 14, 0,  12, 14, 0,  0,  0,  0,  0,  0,  0,  12, 14, 0,
+      0,  0,  0,  0,  2,  12, 14, 0,  0,  0,  0,  0,  0,  2,  12, 14, 0,  0,
+      0,  0,  4,  12, 14, 0,  0,  0,  0,  0,  0,  4,  12, 14, 0,  0,  0,  0,
+      2,  4,  12, 14, 0,  0,  0,  0,  0,  2,  4,  12, 14, 0,  0,  0,  6,  12,
+      14, 0,  0,  0,  0,  0,  0,  6,  12, 14, 0,  0,  0,  0,  2,  6,  12, 14,
+      0,  0,  0,  0,  0,  2,  6,  12, 14, 0,  0,  0,  4,  6,  12, 14, 0,  0,
+      0,  0,  0,  4,  6,  12, 14, 0,  0,  0,  2,  4,  6,  12, 14, 0,  0,  0,
+      0,  2,  4,  6,  12, 14, 0,  0,  8,  12, 14, 0,  0,  0,  0,  0,  0,  8,
+      12, 14, 0,  0,  0,  0,  2,  8,  12, 14, 0,  0,  0,  0,  0,  2,  8,  12,
+      14, 0,  0,  0,  4,  8,  12, 14, 0,  0,  0,  0,  0,  4,  8,  12, 14, 0,
+      0,  0,  2,  4,  8,  12, 14, 0,  0,  0,  0,  2,  4,  8,  12, 14, 0,  0,
+      6,  8,  12, 14, 0,  0,  0,  0,  0,  6,  8,  12, 14, 0,  0,  0,  2,  6,
+      8,  12, 14, 0,  0,  0,  0,  2,  6,  8,  12, 14, 0,  0,  4,  6,  8,  12,
+      14, 0,  0,  0,  0,  4,  6,  8,  12, 14, 0,  0,  2,  4,  6,  8,  12, 14,
+      0,  0,  0,  2,  4,  6,  8,  12, 14, 0,  10, 12, 14, 0,  0,  0,  0,  0,
+      0,  10, 12, 14, 0,  0,  0,  0,  2,  10, 12, 14, 0,  0,  0,  0,  0,  2,
+      10, 12, 14, 0,  0,  0,  4,  10, 12, 14, 0,  0,  0,  0,  0,  4,  10, 12,
+      14, 0,  0,  0,  2,  4,  10, 12, 14, 0,  0,  0,  0,  2,  4,  10, 12, 14,
+      0,  0,  6,  10, 12, 14, 0,  0,  0,  0,  0,  6,  10, 12, 14, 0,  0,  0,
+      2,  6,  10, 12, 14, 0,  0,  0,  0,  2,  6,  10, 12, 14, 0,  0,  4,  6,
+      10, 12, 14, 0,  0,  0,  0,  4,  6,  10, 12, 14, 0,  0,  2,  4,  6,  10,
+      12, 14, 0,  0,  0,  2,  4,  6,  10, 12, 14, 0,  8,  10, 12, 14, 0,  0,
+      0,  0,  0,  8,  10, 12, 14, 0,  0,  0,  2,  8,  10, 12, 14, 0,  0,  0,
+      0,  2,  8,  10, 12, 14, 0,  0,  4,  8,  10, 12, 14, 0,  0,  0,  0,  4,
+      8,  10, 12, 14, 0,  0,  2,  4,  8,  10, 12, 14, 0,  0,  0,  2,  4,  8,
+      10, 12, 14, 0,  6,  8,  10, 12, 14, 0,  0,  0,  0,  6,  8,  10, 12, 14,
+      0,  0,  2,  6,  8,  10, 12, 14, 0,  0,  0,  2,  6,  8,  10, 12, 14, 0,
+      4,  6,  8,  10, 12, 14, 0,  0,  0,  4,  6,  8,  10, 12, 14, 0,  2,  4,
+      6,  8,  10, 12, 14, 0,  0,  2,  4,  6,  8,  10, 12, 14};
+
+  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
+  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+  return BitCast(d, pairs + Set(du, 0x0100));
+}
+
+template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 16);

@@ -2383,57 +2657,37 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(

 #endif

-// Helper function called by both Compress and CompressStore - avoids a
+// Helper functions called by both Compress and CompressStore - avoids a
 // redundant BitsFromMask in the latter.

-template <size_t N>
-HWY_API Vec128<uint32_t, N> Compress(Vec128<uint32_t, N> v,
-                                     const uint64_t mask_bits) {
-  const auto idx = detail::Idx32x4FromBits<uint32_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
+                              const uint64_t mask_bits) {
+  const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
+  using D = Simd<T, N>;
+  const RebindToSigned<D> di;
+  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
 }
-template <size_t N>
-HWY_API Vec128<int32_t, N> Compress(Vec128<int32_t, N> v,
-                                    const uint64_t mask_bits) {
-  const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
+                              const uint64_t mask_bits) {
+  const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
+  using D = Simd<T, N>;
+  const RebindToSigned<D> di;
+  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
 }

-#if HWY_CAP_INTEGER64
+#if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64

-template <size_t N>
-HWY_API Vec128<uint64_t, N> Compress(Vec128<uint64_t, N> v,
+template <typename T, size_t N>
+HWY_API Vec128<uint64_t, N> Compress(hwy::SizeTag<8> /*tag*/,
+                                     Vec128<uint64_t, N> v,
                                      const uint64_t mask_bits) {
   const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> Compress(Vec128<int64_t, N> v,
-                                    const uint64_t mask_bits) {
-  const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-}
-
-#endif
-
-template <size_t N>
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v,
-                                  const uint64_t mask_bits) {
-  const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
-  const Simd<float, N> df;
-  const Simd<int32_t, N> di;
-  return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
-}
-
-#if HWY_CAP_FLOAT64
-
-template <size_t N>
-HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
-                                   const uint64_t mask_bits) {
-  const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
-  const Simd<double, N> df;
-  const Simd<int64_t, N> di;
-  return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
+  using D = Simd<T, N>;
+  const RebindToSigned<D> di;
+  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
 }

 #endif
@@ -2442,7 +2696,8 @@ HWY_API Vec128<double, N> Compress(Vec12

 template <typename T, size_t N>
 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
-  return detail::Compress(v, detail::BitsFromMask(mask));
+  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
+                          detail::BitsFromMask(mask));
 }

 // ------------------------------ CompressStore
@@ -2451,63 +2706,284 @@ template <typename T, size_t N>
 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
                              Simd<T, N> d, T* HWY_RESTRICT aligned) {
   const uint64_t mask_bits = detail::BitsFromMask(mask);
-  Store(detail::Compress(v, mask_bits), d, aligned);
+  Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
   return PopCount(mask_bits);
 }

+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
+// TableLookupBytes)
+
+// 128 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t> a, const Vec128<uint8_t> b,
+                               const Vec128<uint8_t> c, Full128<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const auto k5 = Set(d, 5);
+  const auto k6 = Set(d, 6);
+
+  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
+  // 0x80 so lanes to be filled from other vectors are 0 for blending.
+  alignas(16) static constexpr uint8_t tbl_r0[16] = {
+      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
+      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+  alignas(16) static constexpr uint8_t tbl_g0[16] = {
+      0x80, 0, 0x80, 0x80, 1, 0x80,  //
+      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+  const auto shuf_r0 = Load(d, tbl_r0);
+  const auto shuf_g0 = Load(d, tbl_g0);  // cannot reuse r0 due to 5 in MSB
+  const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+  const auto r0 = TableLookupBytes(a, shuf_r0);  // 5..4..3..2..1..0
+  const auto g0 = TableLookupBytes(b, shuf_g0);  // ..4..3..2..1..0.
+  const auto b0 = TableLookupBytes(c, shuf_b0);  // .4..3..2..1..0..
+  const auto int0 = r0 | g0 | b0;
+  StoreU(int0, d, unaligned + 0 * 16);
+
+  // Second vector: g10,r10, bgr[9:6], b5,g5
+  const auto shuf_r1 = shuf_b0 + k6;  // .A..9..8..7..6..
+  const auto shuf_g1 = shuf_r0 + k5;  // A..9..8..7..6..5
+  const auto shuf_b1 = shuf_g0 + k5;  // ..9..8..7..6..5.
+  const auto r1 = TableLookupBytes(a, shuf_r1);
+  const auto g1 = TableLookupBytes(b, shuf_g1);
+  const auto b1 = TableLookupBytes(c, shuf_b1);
+  const auto int1 = r1 | g1 | b1;
+  StoreU(int1, d, unaligned + 1 * 16);
+
+  // Third vector: bgr[15:11], b10
+  const auto shuf_r2 = shuf_b1 + k6;  // ..F..E..D..C..B.
+  const auto shuf_g2 = shuf_r1 + k5;  // .F..E..D..C..B..
+  const auto shuf_b2 = shuf_g1 + k5;  // F..E..D..C..B..A
+  const auto r2 = TableLookupBytes(a, shuf_r2);
+  const auto g2 = TableLookupBytes(b, shuf_g2);
+  const auto b2 = TableLookupBytes(c, shuf_b2);
+  const auto int2 = r2 | g2 | b2;
+  StoreU(int2, d, unaligned + 2 * 16);
+}
+
+// 64 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> a,
+                               const Vec128<uint8_t, 8> b,
+                               const Vec128<uint8_t, 8> c, Simd<uint8_t, 8> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors for the shuffles and first result.
+  const Full128<uint8_t> d_full;
+  const auto k5 = Set(d_full, 5);
+  const auto k6 = Set(d_full, 6);
+
+  const Vec128<uint8_t> full_a{a.raw};
+  const Vec128<uint8_t> full_b{b.raw};
+  const Vec128<uint8_t> full_c{c.raw};
+
+  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
+  // 0x80 so lanes to be filled from other vectors are 0 for blending.
+  alignas(16) static constexpr uint8_t tbl_r0[16] = {
+      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
+      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+  alignas(16) static constexpr uint8_t tbl_g0[16] = {
+      0x80, 0, 0x80, 0x80, 1, 0x80,  //
+      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+  const auto shuf_r0 = Load(d_full, tbl_r0);
+  const auto shuf_g0 = Load(d_full, tbl_g0);  // cannot reuse r0 due to 5 in MSB
+  const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+  const auto r0 = TableLookupBytes(full_a, shuf_r0);  // 5..4..3..2..1..0
+  const auto g0 = TableLookupBytes(full_b, shuf_g0);  // ..4..3..2..1..0.
+  const auto b0 = TableLookupBytes(full_c, shuf_b0);  // .4..3..2..1..0..
+  const auto int0 = r0 | g0 | b0;
+  StoreU(int0, d_full, unaligned + 0 * 16);
+
+  // Second (HALF) vector: bgr[7:6], b5,g5
+  const auto shuf_r1 = shuf_b0 + k6;  // ..7..6..
+  const auto shuf_g1 = shuf_r0 + k5;  // .7..6..5
+  const auto shuf_b1 = shuf_g0 + k5;  // 7..6..5.
+  const auto r1 = TableLookupBytes(full_a, shuf_r1);
+  const auto g1 = TableLookupBytes(full_b, shuf_g1);
+  const auto b1 = TableLookupBytes(full_c, shuf_b1);
+  const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
+  StoreU(int1, d, unaligned + 1 * 16);
+}
+
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> a,
+                               const Vec128<uint8_t, N> b,
+                               const Vec128<uint8_t, N> c,
+                               Simd<uint8_t, N> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors for the shuffles and result.
+  const Full128<uint8_t> d_full;
+
+  const Vec128<uint8_t> full_a{a.raw};
+  const Vec128<uint8_t> full_b{b.raw};
+  const Vec128<uint8_t> full_c{c.raw};
+
+  // Shuffle (a,b,c) vector bytes to bgr[3:0].
+  // 0x80 so lanes to be filled from other vectors are 0 for blending.
+  alignas(16) static constexpr uint8_t tbl_r0[16] = {
+      0,    0x80, 0x80, 1,   0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80,  //
+      0x80, 0x80, 0x80, 0x80};
+  const auto shuf_r0 = Load(d_full, tbl_r0);
+  const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0);
+  const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0);
+  const auto r0 = TableLookupBytes(full_a, shuf_r0);  // ......3..2..1..0
+  const auto g0 = TableLookupBytes(full_b, shuf_g0);  // .....3..2..1..0.
+  const auto b0 = TableLookupBytes(full_c, shuf_b0);  // ....3..2..1..0..
+  const auto int0 = r0 | g0 | b0;
+  alignas(16) uint8_t buf[16];
+  StoreU(int0, d_full, buf);
+  CopyBytes<N * 3>(buf, unaligned);
+}
+
+// ------------------------------ StoreInterleaved4
+
+// 128 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
+                               const Vec128<uint8_t> v1,
+                               const Vec128<uint8_t> v2,
+                               const Vec128<uint8_t> v3, Full128<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // let a,b,c,d denote v0..3.
+  const auto ba0 = ZipLower(v0, v1);  // b7 a7 .. b0 a0
+  const auto dc0 = ZipLower(v2, v3);  // d7 c7 .. d0 c0
+  const auto ba8 = ZipUpper(v0, v1);
+  const auto dc8 = ZipUpper(v2, v3);
+  const auto dcba_0 = ZipLower(ba0, dc0);  // d..a3 d..a0
+  const auto dcba_4 = ZipUpper(ba0, dc0);  // d..a7 d..a4
+  const auto dcba_8 = ZipLower(ba8, dc8);  // d..aB d..a8
+  const auto dcba_C = ZipUpper(ba8, dc8);  // d..aF d..aC
+  StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16);
+  StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16);
+  StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16);
+  StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16);
+}
+
+// 64 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
+                               const Vec128<uint8_t, 8> in1,
+                               const Vec128<uint8_t, 8> in2,
+                               const Vec128<uint8_t, 8> in3,
+                               Simd<uint8_t, 8> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors to reduce the number of stores.
+  const Vec128<uint8_t> v0{in0.raw};
+  const Vec128<uint8_t> v1{in1.raw};
+  const Vec128<uint8_t> v2{in2.raw};
+  const Vec128<uint8_t> v3{in3.raw};
+  // let a,b,c,d denote v0..3.
+  const auto ba0 = ZipLower(v0, v1);       // b7 a7 .. b0 a0
+  const auto dc0 = ZipLower(v2, v3);       // d7 c7 .. d0 c0
+  const auto dcba_0 = ZipLower(ba0, dc0);  // d..a3 d..a0
+  const auto dcba_4 = ZipUpper(ba0, dc0);  // d..a7 d..a4
+  const Full128<uint8_t> d_full;
+  StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16);
+  StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16);
+}
+
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
+                               const Vec128<uint8_t, N> in1,
+                               const Vec128<uint8_t, N> in2,
+                               const Vec128<uint8_t, N> in3,
+                               Simd<uint8_t, N> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors to reduce the number of stores.
+  const Vec128<uint8_t> v0{in0.raw};
+  const Vec128<uint8_t> v1{in1.raw};
+  const Vec128<uint8_t> v2{in2.raw};
+  const Vec128<uint8_t> v3{in3.raw};
+  // let a,b,c,d denote v0..3.
+  const auto ba0 = ZipLower(v0, v1);       // b3 a3 .. b0 a0
+  const auto dc0 = ZipLower(v2, v3);       // d3 c3 .. d0 c0
+  const auto dcba_0 = ZipLower(ba0, dc0);  // d..a3 d..a0
+  alignas(16) uint8_t buf[16];
+  const Full128<uint8_t> d_full;
+  StoreU(BitCast(d_full, dcba_0), d_full, buf);
+  CopyBytes<4 * N>(buf, unaligned);
+}
+
 // ------------------------------ Reductions

 namespace detail {

-// For u32/i32/f32.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T, N> v3210) {
+// N=1 for any T: no-op
+template <typename T>
+HWY_API Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+                                const Vec128<T, 1> v) {
+  return v;
+}
+template <typename T>
+HWY_API Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+                                const Vec128<T, 1> v) {
+  return v;
+}
+template <typename T>
+HWY_API Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+                                const Vec128<T, 1> v) {
+  return v;
+}
+
+// u32/i32/f32:
+
+// N=2
+template <typename T>
+HWY_API Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
+                                const Vec128<T, 2> v10) {
+  return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
+}
+template <typename T>
+HWY_API Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
+                                const Vec128<T, 2> v10) {
+  return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+}
+template <typename T>
+HWY_API Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
+                                const Vec128<T, 2> v10) {
+  return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+}
+
+// N=4 (full)
+template <typename T>
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
   const Vec128<T> v1032 = Shuffle1032(v3210);
   const Vec128<T> v31_20_31_20 = v3210 + v1032;
   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
   return v20_31_20_31 + v31_20_31_20;
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
   const Vec128<T> v1032 = Shuffle1032(v3210);
   const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
   return Min(v20_31_20_31, v31_20_31_20);
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
   const Vec128<T> v1032 = Shuffle1032(v3210);
   const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
   return Max(v20_31_20_31, v31_20_31_20);
 }

-// For u64/i64/f64.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T, N> v10) {
+// u64/i64/f64:
+
+// N=2 (full)
+template <typename T>
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
   const Vec128<T> v01 = Shuffle01(v10);
   return v10 + v01;
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
   const Vec128<T> v01 = Shuffle01(v10);
   return Min(v10, v01);
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
   const Vec128<T> v01 = Shuffle01(v10);
   return Max(v10, v01);
 }

 }  // namespace detail

-// Supported for u/i/f 32/64. Returns the sum in each lane.
+// Supported for u/i/f 32/64. Returns the same value in each lane.
 template <typename T, size_t N>
 HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
   return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12	2021-06-02 10:56:05.240904417 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -154,27 +154,28 @@ HWY_API Vec128<double, N> Zero(Simd<doub
 // Returns a vector/part with all lanes set to "t".
 template <size_t N, HWY_IF_LE128(uint8_t, N)>
 HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N> /* tag */, const uint8_t t) {
-  return Vec128<uint8_t, N>{_mm_set1_epi8(t)};
+  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
 }
 template <size_t N, HWY_IF_LE128(uint16_t, N)>
 HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N> /* tag */, const uint16_t t) {
-  return Vec128<uint16_t, N>{_mm_set1_epi16(t)};
+  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
 }
 template <size_t N, HWY_IF_LE128(uint32_t, N)>
 HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N> /* tag */, const uint32_t t) {
-  return Vec128<uint32_t, N>{_mm_set1_epi32(t)};
+  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
 }
 template <size_t N, HWY_IF_LE128(uint64_t, N)>
 HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N> /* tag */, const uint64_t t) {
-  return Vec128<uint64_t, N>{_mm_set1_epi64x(t)};
+  return Vec128<uint64_t, N>{
+      _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
 }
 template <size_t N, HWY_IF_LE128(int8_t, N)>
 HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N> /* tag */, const int8_t t) {
-  return Vec128<int8_t, N>{_mm_set1_epi8(t)};
+  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
 }
 template <size_t N, HWY_IF_LE128(int16_t, N)>
 HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N> /* tag */, const int16_t t) {
-  return Vec128<int16_t, N>{_mm_set1_epi16(t)};
+  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
 }
 template <size_t N, HWY_IF_LE128(int32_t, N)>
 HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N> /* tag */, const int32_t t) {
@@ -182,7 +183,8 @@ HWY_API Vec128<int32_t, N> Set(Simd<int3
 }
 template <size_t N, HWY_IF_LE128(int64_t, N)>
 HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N> /* tag */, const int64_t t) {
-  return Vec128<int64_t, N>{_mm_set1_epi64x(t)};
+  return Vec128<int64_t, N>{
+      _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
 }
 template <size_t N, HWY_IF_LE128(float, N)>
 HWY_API Vec128<float, N> Set(Simd<float, N> /* tag */, const float t) {
@@ -510,7 +512,8 @@ HWY_API Mask128<T, N> Xor(const Mask128<
 template <typename TFrom, typename TTo, size_t N>
 HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
   static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask128<TTo, N>{m.raw};
+  const Simd<TFrom, N> d;
+  return MaskFromVec(BitCast(Simd<TTo, N>(), VecFromMask(d, m)));
 }

 // ------------------------------ Equality
@@ -683,6 +686,14 @@ HWY_API Mask128<double, N> operator>=(co
   return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
 }

+// ------------------------------ FirstN (Iota, Lt)
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
+  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
+  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
+}
+
 // ================================================== ARITHMETIC

 // ------------------------------ Addition
@@ -894,7 +905,7 @@ template <size_t N>
 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
   return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
 }
-
+// i64 is implemented after BroadcastSignBit.
 template <size_t N>
 HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
   const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
@@ -959,7 +970,6 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> Mu

 // ------------------------------ ShiftLeft

-// Unsigned
 template <int kBits, size_t N>
 HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
   return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
@@ -988,6 +998,16 @@ HWY_API Vec128<int64_t, N> ShiftLeft(con
   return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
 }

+template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
+  const Simd<T, N> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
+  return kBits == 1
+             ? (v + v)
+             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
+}
+
 // ------------------------------ ShiftRight

 template <int kBits, size_t N>
@@ -1004,6 +1024,15 @@ HWY_API Vec128<uint64_t, N> ShiftRight(c
 }

 template <int kBits, size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
+  const Simd<uint8_t, N> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec128<uint8_t, N> shifted{
+      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
+  return shifted & Set(d8, 0xFF >> kBits);
+}
+
+template <int kBits, size_t N>
 HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
   return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
 }
@@ -1012,6 +1041,15 @@ HWY_API Vec128<int32_t, N> ShiftRight(co
   return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
 }

+template <int kBits, size_t N>
+HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
+  const Simd<int8_t, N> di;
+  const Simd<uint8_t, N> du;
+  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
+  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
+  return (shifted ^ shifted_sign) - shifted_sign;
+}
+
 // i64 is implemented after BroadcastSignBit.

 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
@@ -1039,15 +1077,24 @@ HWY_API Vec128<int64_t, N> BroadcastSign
   return VecFromMask(v < Zero(Simd<int64_t, N>()));
 #else
   // Efficient Gt() requires SSE4.2 but we only have SSE4.1. BLENDVPD requires
-  // two constants and domain crossing. 32-bit compare only requires Zero()
-  // plus a shuffle to replicate the upper 32 bits.
+  // two constants and domain crossing. 32-bit shift avoids generating a zero.
   const Simd<int32_t, N * 2> d32;
-  const auto sign = BitCast(d32, v) < Zero(d32);
+  const auto sign = ShiftRight<31>(BitCast(d32, v));
   return Vec128<int64_t, N>{
       _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
 #endif
 }

+template <size_t N>
+HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
+#if HWY_TARGET == HWY_AVX3
+  return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
+#else
+  const auto zero = Zero(Simd<int64_t,N>());
+  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
+#endif
+}
+
 template <int kBits, size_t N>
 HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
 #if HWY_TARGET == HWY_AVX3
@@ -1097,6 +1144,15 @@ HWY_API Vec128<int64_t, N> ShiftLeftSame
   return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
 }

+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
+  const Simd<T, N> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec128<T, N> shifted{
+      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
+  return shifted & Set(d8, (0xFF << bits) & 0xFF);
+}
+
 // ------------------------------ ShiftRightSame (BroadcastSignBit)

 template <size_t N>
@@ -1116,6 +1172,16 @@ HWY_API Vec128<uint64_t, N> ShiftRightSa
 }

 template <size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
+                                          const int bits) {
+  const Simd<uint8_t, N> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec128<uint8_t, N> shifted{
+      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
+  return shifted & Set(d8, 0xFF >> bits);
+}
+
+template <size_t N>
 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
                                           const int bits) {
   return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
@@ -1140,6 +1206,15 @@ HWY_API Vec128<int64_t, N> ShiftRightSam
 #endif
 }

+template <size_t N>
+HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
+  const Simd<int8_t, N> di;
+  const Simd<uint8_t, N> du;
+  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+  return (shifted ^ shifted_sign) - shifted_sign;
+}
+
 // ------------------------------ Negate

 template <typename T, size_t N, HWY_IF_FLOAT(T)>
@@ -1729,32 +1804,196 @@ HWY_API void Stream(const Vec128<double,
   _mm_stream_pd(aligned, v.raw);
 }

-// ------------------------------ Gather
+// ------------------------------ Scatter
+
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")

 // Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
 using GatherIndex64 = long long int;  // NOLINT(google-runtime-int)
 static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");

+#if HWY_TARGET == HWY_AVX3
+namespace detail {
+
+template <typename T, size_t N>
+HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
+                           Simd<T, N> /* tag */, T* HWY_RESTRICT base,
+                           const Vec128<int32_t, N> offset) {
+  if (N == 4) {
+    _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
+  } else {
+    const __mmask8 mask = (1u << N) - 1;
+    _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
+  }
+}
+template <typename T, size_t N>
+HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
+                          Simd<T, N> /* tag */, T* HWY_RESTRICT base,
+                          const Vec128<int32_t, N> index) {
+  if (N == 4) {
+    _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
+  } else {
+    const __mmask8 mask = (1u << N) - 1;
+    _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
+  }
+}
+
+template <typename T, size_t N>
+HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
+                           Simd<T, N> /* tag */, T* HWY_RESTRICT base,
+                           const Vec128<int64_t, N> offset) {
+  if (N == 2) {
+    _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
+  } else {
+    const __mmask8 mask = (1u << N) - 1;
+    _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
+  }
+}
+template <typename T, size_t N>
+HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
+                          Simd<T, N> /* tag */, T* HWY_RESTRICT base,
+                          const Vec128<int64_t, N> index) {
+  if (N == 2) {
+    _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
+  } else {
+    const __mmask8 mask = (1u << N) - 1;
+    _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
+  }
+}
+
+}  // namespace detail
+
+template <typename T, size_t N, typename Offset>
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+                           const Vec128<Offset, N> offset) {
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
+}
+template <typename T, size_t N, typename Index>
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+                          const Vec128<Index, N> index) {
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
+}
+
+template <size_t N>
+HWY_INLINE void ScatterOffset(Vec128<float, N> v, Simd<float, N> /* tag */,
+                              float* HWY_RESTRICT base,
+                              const Vec128<int32_t, N> offset) {
+  if (N == 4) {
+    _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
+  } else {
+    const __mmask8 mask = (1u << N) - 1;
+    _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
+  }
+}
+template <size_t N>
+HWY_INLINE void ScatterIndex(Vec128<float, N> v, Simd<float, N> /* tag */,
+                             float* HWY_RESTRICT base,
+                             const Vec128<int32_t, N> index) {
+  if (N == 4) {
+    _mm_i32scatter_ps(base, index.raw, v.raw, 4);
+  } else {
+    const __mmask8 mask = (1u << N) - 1;
+    _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
+  }
+}
+
+template <size_t N>
+HWY_INLINE void ScatterOffset(Vec128<double, N> v, Simd<double, N> /* tag */,
+                              double* HWY_RESTRICT base,
+                              const Vec128<int64_t, N> offset) {
+  if (N == 2) {
+    _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
+  } else {
+    const __mmask8 mask = (1u << N) - 1;
+    _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
+  }
+}
+template <size_t N>
+HWY_INLINE void ScatterIndex(Vec128<double, N> v, Simd<double, N> /* tag */,
+                             double* HWY_RESTRICT base,
+                             const Vec128<int64_t, N> index) {
+  if (N == 2) {
+    _mm_i64scatter_pd(base, index.raw, v.raw, 8);
+  } else {
+    const __mmask8 mask = (1u << N) - 1;
+    _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
+  }
+}
+#else  // HWY_TARGET == HWY_AVX3
+
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+                           const Vec128<Offset, N> offset) {
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+
+  alignas(16) T lanes[N];
+  Store(v, d, lanes);
+
+  alignas(16) Offset offset_lanes[N];
+  Store(offset, Simd<Offset, N>(), offset_lanes);
+
+  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
+  for (size_t i = 0; i < N; ++i) {
+    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
+  }
+}
+
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+                          const Vec128<Index, N> index) {
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+
+  alignas(16) T lanes[N];
+  Store(v, d, lanes);
+
+  alignas(16) Index index_lanes[N];
+  Store(index, Simd<Index, N>(), index_lanes);
+
+  for (size_t i = 0; i < N; ++i) {
+    base[index_lanes[i]] = lanes[i];
+  }
+}
+
+#endif
+
+// ------------------------------ Gather (Load/Store)
+
 #if HWY_TARGET == HWY_SSE4

 template <typename T, size_t N, typename Offset>
 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
                                   const T* HWY_RESTRICT base,
                                   const Vec128<Offset, N> offset) {
-  static_assert(N == 1, "SSE4 does not support full gather");
-  static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
-  const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
-  T val;
-  CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
-  return Set(d, val);
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+
+  alignas(16) Offset offset_lanes[N];
+  Store(offset, Simd<Offset, N>(), offset_lanes);
+
+  alignas(16) T lanes[N];
+  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
+  for (size_t i = 0; i < N; ++i) {
+    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
+  }
+  return Load(d, lanes);
 }

 template <typename T, size_t N, typename Index>
 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
                                  const Vec128<Index, N> index) {
-  static_assert(N == 1, "SSE4 does not support full gather");
-  static_assert(sizeof(T) == sizeof(Index), "T must match Index");
-  return Set(d, base[GetLane(index)]);
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+
+  alignas(16) Index index_lanes[N];
+  Store(index, Simd<Index, N>(), index_lanes);
+
+  alignas(16) T lanes[N];
+  for (size_t i = 0; i < N; ++i) {
+    lanes[i] = base[index_lanes[i]];
+  }
+  return Load(d, lanes);
 }

 #else
@@ -1832,6 +2071,8 @@ HWY_API Vec128<double, N> GatherIndex(Si

 #endif  // HWY_TARGET != HWY_SSE4

+HWY_DIAGNOSTICS(pop)
+
 // ================================================== SWIZZLE

 // ------------------------------ Extract half
@@ -1859,10 +2100,10 @@ HWY_INLINE Vec128<double, 1> UpperHalf(V
 // ------------------------------ Shift vector by constant #bytes

 // 0x01..0F, kBytes = 1 => 0x02..0F00
-template <int kBytes, typename T>
-HWY_API Vec128<T> ShiftLeftBytes(const Vec128<T> v) {
+template <int kBytes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
   static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  return Vec128<T>{_mm_slli_si128(v.raw, kBytes)};
+  return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
 }

 template <int kLanes, typename T, size_t N>
@@ -1873,10 +2114,10 @@ HWY_API Vec128<T, N> ShiftLeftLanes(cons
 }

 // 0x01..0F, kBytes = 1 => 0x0001..0E
-template <int kBytes, typename T>
-HWY_API Vec128<T> ShiftRightBytes(const Vec128<T> v) {
+template <int kBytes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
   static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  return Vec128<T>{_mm_srli_si128(v.raw, kBytes)};
+  return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
 }

 template <int kLanes, typename T, size_t N>
@@ -2041,44 +2282,47 @@ HWY_API Vec128<float> Shuffle0123(const
 // ------------------------------ TableLookupLanes

 // Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T>
+template <typename T, size_t N>
 struct Indices128 {
   __m128i raw;
 };

-template <typename T>
-HWY_API Indices128<T> SetTableIndices(Full128<T>, const int32_t* idx) {
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
 #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
-  const size_t N = 16 / sizeof(T);
   for (size_t i = 0; i < N; ++i) {
     HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
   }
 #endif

-  const Full128<uint8_t> d8;
-  alignas(16) uint8_t control[16];
-  for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
-    const size_t idx_lane = idx_byte / sizeof(T);
-    const size_t mod = idx_byte % sizeof(T);
-    control[idx_byte] = static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + mod);
+  const Repartition<uint8_t, decltype(d)> d8;
+  alignas(16) uint8_t control[16] = {0};
+  for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
+    for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+      control[idx_lane * sizeof(T) + idx_byte] =
+          static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
+    }
   }
-  return Indices128<T>{Load(d8, control).raw};
+  return Indices128<T, N>{Load(d8, control).raw};
 }

-HWY_API Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
-                                          const Indices128<uint32_t> idx) {
-  return TableLookupBytes(v, Vec128<uint32_t>{idx.raw});
+template <size_t N>
+HWY_API Vec128<uint32_t, N> TableLookupLanes(
+    const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
+  return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
 }
-HWY_API Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
-                                         const Indices128<int32_t> idx) {
-  return TableLookupBytes(v, Vec128<int32_t>{idx.raw});
+template <size_t N>
+HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
+                                            const Indices128<int32_t, N> idx) {
+  return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
 }
-HWY_API Vec128<float> TableLookupLanes(const Vec128<float> v,
-                                       const Indices128<float> idx) {
-  const Full128<int32_t> di;
-  const Full128<float> df;
+template <size_t N>
+HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
+                                          const Indices128<float, N> idx) {
+  const Simd<int32_t, N> di;
+  const Simd<float, N> df;
   return BitCast(df,
-                 TableLookupBytes(BitCast(di, v), Vec128<int32_t>{idx.raw}));
+                 TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
 }

 // ------------------------------ Interleave lanes
@@ -2286,47 +2530,47 @@ HWY_INLINE Vec128<double> ConcatUpperLow

 namespace detail {

-template <typename T>
-HWY_API Vec128<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T> a,
-                          const Vec128<T> b) {
-  const Full128<T> d;
-  const Full128<uint8_t> d8;
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
+                             const Vec128<T, N> b) {
+  const Simd<T, N> d;
+  const Repartition<uint8_t, decltype(d)> d8;
   alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                             0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
   return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
 }
-template <typename T>
-HWY_API Vec128<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T> a,
-                          const Vec128<T> b) {
-  return Vec128<T>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
+                             const Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
 }
-template <typename T>
-HWY_API Vec128<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T> a,
-                          const Vec128<T> b) {
-  return Vec128<T>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
+                             const Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
 }
-template <typename T>
-HWY_API Vec128<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T> a,
-                          const Vec128<T> b) {
-  return Vec128<T>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
+                             const Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
 }

 }  // namespace detail

-template <typename T>
-HWY_API Vec128<T> OddEven(const Vec128<T> a, const Vec128<T> b) {
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
 }
-template <>
-HWY_INLINE Vec128<float> OddEven<float>(const Vec128<float> a,
-                                        const Vec128<float> b) {
-  return Vec128<float>{_mm_blend_ps(a.raw, b.raw, 5)};
+template <size_t N>
+HWY_INLINE Vec128<float, N> OddEven(const Vec128<float, N> a,
+                                    const Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
 }

-template <>
-HWY_INLINE Vec128<double> OddEven<double>(const Vec128<double> a,
-                                          const Vec128<double> b) {
-  return Vec128<double>{_mm_blend_pd(a.raw, b.raw, 1)};
+template <size_t N>
+HWY_INLINE Vec128<double, N> OddEven(const Vec128<double, N> a,
+                                     const Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_blend_pd(a.raw, b.raw, 1)};
 }

 // ------------------------------ Shl (ZipLower, Mul)
@@ -2764,7 +3008,7 @@ HWY_API Vec128<uint8_t, N> U8FromU32(con
   return LowerHalf(LowerHalf(BitCast(d8, quad)));
 }

-// ------------------------------ Convert integer <=> floating point
+// ------------------------------ Integer <=> fp (ShiftRight, OddEven)

 template <size_t N>
 HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
@@ -2779,13 +3023,20 @@ HWY_API Vec128<double, N> ConvertTo(Simd
   (void)dd;
   return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
 #else
-  alignas(16) int64_t lanes_i[2];
-  Store(v, Simd<int64_t, N>(), lanes_i);
-  alignas(16) double lanes_d[2];
-  for (size_t i = 0; i < N; ++i) {
-    lanes_d[i] = static_cast<double>(lanes_i[i]);
-  }
-  return Load(dd, lanes_d);
+  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
+  const Repartition<uint32_t, decltype(dd)> d32;
+  const Repartition<uint64_t, decltype(dd)> d64;
+
+  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
+  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
+  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
+
+  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
+  const auto k52 = Set(d32, 0x43300000);
+  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
+
+  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
+  return (v_upper - k84_63_52) + v_lower;  // order matters!
 #endif
 }

@@ -2922,6 +3173,142 @@ HWY_API size_t CountTrue(const Mask128<T
 namespace detail {

 template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 256);
+  const Simd<T, N> d;
+  const Rebind<uint8_t, decltype(d)> d8;
+  const Simd<uint16_t, N> du;
+
+  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
+  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
+  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
+  // store lane indices and convert to byte indices (2*lane + 0..1), with the
+  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
+  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
+  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
+  // is likely more costly than the higher cache footprint from storing bytes.
+  alignas(16) constexpr uint8_t table[256 * 8] = {
+      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,
+      0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,
+      0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  2,  4,  0,  0,  0,  0,
+      0,  0,  0,  2,  4,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,
+      0,  6,  0,  0,  0,  0,  0,  0,  2,  6,  0,  0,  0,  0,  0,  0,  0,  2,
+      6,  0,  0,  0,  0,  0,  4,  6,  0,  0,  0,  0,  0,  0,  0,  4,  6,  0,
+      0,  0,  0,  0,  2,  4,  6,  0,  0,  0,  0,  0,  0,  2,  4,  6,  0,  0,
+      0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0,  0,
+      2,  8,  0,  0,  0,  0,  0,  0,  0,  2,  8,  0,  0,  0,  0,  0,  4,  8,
+      0,  0,  0,  0,  0,  0,  0,  4,  8,  0,  0,  0,  0,  0,  2,  4,  8,  0,
+      0,  0,  0,  0,  0,  2,  4,  8,  0,  0,  0,  0,  6,  8,  0,  0,  0,  0,
+      0,  0,  0,  6,  8,  0,  0,  0,  0,  0,  2,  6,  8,  0,  0,  0,  0,  0,
+      0,  2,  6,  8,  0,  0,  0,  0,  4,  6,  8,  0,  0,  0,  0,  0,  0,  4,
+      6,  8,  0,  0,  0,  0,  2,  4,  6,  8,  0,  0,  0,  0,  0,  2,  4,  6,
+      8,  0,  0,  0,  10, 0,  0,  0,  0,  0,  0,  0,  0,  10, 0,  0,  0,  0,
+      0,  0,  2,  10, 0,  0,  0,  0,  0,  0,  0,  2,  10, 0,  0,  0,  0,  0,
+      4,  10, 0,  0,  0,  0,  0,  0,  0,  4,  10, 0,  0,  0,  0,  0,  2,  4,
+      10, 0,  0,  0,  0,  0,  0,  2,  4,  10, 0,  0,  0,  0,  6,  10, 0,  0,
+      0,  0,  0,  0,  0,  6,  10, 0,  0,  0,  0,  0,  2,  6,  10, 0,  0,  0,
+      0,  0,  0,  2,  6,  10, 0,  0,  0,  0,  4,  6,  10, 0,  0,  0,  0,  0,
+      0,  4,  6,  10, 0,  0,  0,  0,  2,  4,  6,  10, 0,  0,  0,  0,  0,  2,
+      4,  6,  10, 0,  0,  0,  8,  10, 0,  0,  0,  0,  0,  0,  0,  8,  10, 0,
+      0,  0,  0,  0,  2,  8,  10, 0,  0,  0,  0,  0,  0,  2,  8,  10, 0,  0,
+      0,  0,  4,  8,  10, 0,  0,  0,  0,  0,  0,  4,  8,  10, 0,  0,  0,  0,
+      2,  4,  8,  10, 0,  0,  0,  0,  0,  2,  4,  8,  10, 0,  0,  0,  6,  8,
+      10, 0,  0,  0,  0,  0,  0,  6,  8,  10, 0,  0,  0,  0,  2,  6,  8,  10,
+      0,  0,  0,  0,  0,  2,  6,  8,  10, 0,  0,  0,  4,  6,  8,  10, 0,  0,
+      0,  0,  0,  4,  6,  8,  10, 0,  0,  0,  2,  4,  6,  8,  10, 0,  0,  0,
+      0,  2,  4,  6,  8,  10, 0,  0,  12, 0,  0,  0,  0,  0,  0,  0,  0,  12,
+      0,  0,  0,  0,  0,  0,  2,  12, 0,  0,  0,  0,  0,  0,  0,  2,  12, 0,
+      0,  0,  0,  0,  4,  12, 0,  0,  0,  0,  0,  0,  0,  4,  12, 0,  0,  0,
+      0,  0,  2,  4,  12, 0,  0,  0,  0,  0,  0,  2,  4,  12, 0,  0,  0,  0,
+      6,  12, 0,  0,  0,  0,  0,  0,  0,  6,  12, 0,  0,  0,  0,  0,  2,  6,
+      12, 0,  0,  0,  0,  0,  0,  2,  6,  12, 0,  0,  0,  0,  4,  6,  12, 0,
+      0,  0,  0,  0,  0,  4,  6,  12, 0,  0,  0,  0,  2,  4,  6,  12, 0,  0,
+      0,  0,  0,  2,  4,  6,  12, 0,  0,  0,  8,  12, 0,  0,  0,  0,  0,  0,
+      0,  8,  12, 0,  0,  0,  0,  0,  2,  8,  12, 0,  0,  0,  0,  0,  0,  2,
+      8,  12, 0,  0,  0,  0,  4,  8,  12, 0,  0,  0,  0,  0,  0,  4,  8,  12,
+      0,  0,  0,  0,  2,  4,  8,  12, 0,  0,  0,  0,  0,  2,  4,  8,  12, 0,
+      0,  0,  6,  8,  12, 0,  0,  0,  0,  0,  0,  6,  8,  12, 0,  0,  0,  0,
+      2,  6,  8,  12, 0,  0,  0,  0,  0,  2,  6,  8,  12, 0,  0,  0,  4,  6,
+      8,  12, 0,  0,  0,  0,  0,  4,  6,  8,  12, 0,  0,  0,  2,  4,  6,  8,
+      12, 0,  0,  0,  0,  2,  4,  6,  8,  12, 0,  0,  10, 12, 0,  0,  0,  0,
+      0,  0,  0,  10, 12, 0,  0,  0,  0,  0,  2,  10, 12, 0,  0,  0,  0,  0,
+      0,  2,  10, 12, 0,  0,  0,  0,  4,  10, 12, 0,  0,  0,  0,  0,  0,  4,
+      10, 12, 0,  0,  0,  0,  2,  4,  10, 12, 0,  0,  0,  0,  0,  2,  4,  10,
+      12, 0,  0,  0,  6,  10, 12, 0,  0,  0,  0,  0,  0,  6,  10, 12, 0,  0,
+      0,  0,  2,  6,  10, 12, 0,  0,  0,  0,  0,  2,  6,  10, 12, 0,  0,  0,
+      4,  6,  10, 12, 0,  0,  0,  0,  0,  4,  6,  10, 12, 0,  0,  0,  2,  4,
+      6,  10, 12, 0,  0,  0,  0,  2,  4,  6,  10, 12, 0,  0,  8,  10, 12, 0,
+      0,  0,  0,  0,  0,  8,  10, 12, 0,  0,  0,  0,  2,  8,  10, 12, 0,  0,
+      0,  0,  0,  2,  8,  10, 12, 0,  0,  0,  4,  8,  10, 12, 0,  0,  0,  0,
+      0,  4,  8,  10, 12, 0,  0,  0,  2,  4,  8,  10, 12, 0,  0,  0,  0,  2,
+      4,  8,  10, 12, 0,  0,  6,  8,  10, 12, 0,  0,  0,  0,  0,  6,  8,  10,
+      12, 0,  0,  0,  2,  6,  8,  10, 12, 0,  0,  0,  0,  2,  6,  8,  10, 12,
+      0,  0,  4,  6,  8,  10, 12, 0,  0,  0,  0,  4,  6,  8,  10, 12, 0,  0,
+      2,  4,  6,  8,  10, 12, 0,  0,  0,  2,  4,  6,  8,  10, 12, 0,  14, 0,
+      0,  0,  0,  0,  0,  0,  0,  14, 0,  0,  0,  0,  0,  0,  2,  14, 0,  0,
+      0,  0,  0,  0,  0,  2,  14, 0,  0,  0,  0,  0,  4,  14, 0,  0,  0,  0,
+      0,  0,  0,  4,  14, 0,  0,  0,  0,  0,  2,  4,  14, 0,  0,  0,  0,  0,
+      0,  2,  4,  14, 0,  0,  0,  0,  6,  14, 0,  0,  0,  0,  0,  0,  0,  6,
+      14, 0,  0,  0,  0,  0,  2,  6,  14, 0,  0,  0,  0,  0,  0,  2,  6,  14,
+      0,  0,  0,  0,  4,  6,  14, 0,  0,  0,  0,  0,  0,  4,  6,  14, 0,  0,
+      0,  0,  2,  4,  6,  14, 0,  0,  0,  0,  0,  2,  4,  6,  14, 0,  0,  0,
+      8,  14, 0,  0,  0,  0,  0,  0,  0,  8,  14, 0,  0,  0,  0,  0,  2,  8,
+      14, 0,  0,  0,  0,  0,  0,  2,  8,  14, 0,  0,  0,  0,  4,  8,  14, 0,
+      0,  0,  0,  0,  0,  4,  8,  14, 0,  0,  0,  0,  2,  4,  8,  14, 0,  0,
+      0,  0,  0,  2,  4,  8,  14, 0,  0,  0,  6,  8,  14, 0,  0,  0,  0,  0,
+      0,  6,  8,  14, 0,  0,  0,  0,  2,  6,  8,  14, 0,  0,  0,  0,  0,  2,
+      6,  8,  14, 0,  0,  0,  4,  6,  8,  14, 0,  0,  0,  0,  0,  4,  6,  8,
+      14, 0,  0,  0,  2,  4,  6,  8,  14, 0,  0,  0,  0,  2,  4,  6,  8,  14,
+      0,  0,  10, 14, 0,  0,  0,  0,  0,  0,  0,  10, 14, 0,  0,  0,  0,  0,
+      2,  10, 14, 0,  0,  0,  0,  0,  0,  2,  10, 14, 0,  0,  0,  0,  4,  10,
+      14, 0,  0,  0,  0,  0,  0,  4,  10, 14, 0,  0,  0,  0,  2,  4,  10, 14,
+      0,  0,  0,  0,  0,  2,  4,  10, 14, 0,  0,  0,  6,  10, 14, 0,  0,  0,
+      0,  0,  0,  6,  10, 14, 0,  0,  0,  0,  2,  6,  10, 14, 0,  0,  0,  0,
+      0,  2,  6,  10, 14, 0,  0,  0,  4,  6,  10, 14, 0,  0,  0,  0,  0,  4,
+      6,  10, 14, 0,  0,  0,  2,  4,  6,  10, 14, 0,  0,  0,  0,  2,  4,  6,
+      10, 14, 0,  0,  8,  10, 14, 0,  0,  0,  0,  0,  0,  8,  10, 14, 0,  0,
+      0,  0,  2,  8,  10, 14, 0,  0,  0,  0,  0,  2,  8,  10, 14, 0,  0,  0,
+      4,  8,  10, 14, 0,  0,  0,  0,  0,  4,  8,  10, 14, 0,  0,  0,  2,  4,
+      8,  10, 14, 0,  0,  0,  0,  2,  4,  8,  10, 14, 0,  0,  6,  8,  10, 14,
+      0,  0,  0,  0,  0,  6,  8,  10, 14, 0,  0,  0,  2,  6,  8,  10, 14, 0,
+      0,  0,  0,  2,  6,  8,  10, 14, 0,  0,  4,  6,  8,  10, 14, 0,  0,  0,
+      0,  4,  6,  8,  10, 14, 0,  0,  2,  4,  6,  8,  10, 14, 0,  0,  0,  2,
+      4,  6,  8,  10, 14, 0,  12, 14, 0,  0,  0,  0,  0,  0,  0,  12, 14, 0,
+      0,  0,  0,  0,  2,  12, 14, 0,  0,  0,  0,  0,  0,  2,  12, 14, 0,  0,
+      0,  0,  4,  12, 14, 0,  0,  0,  0,  0,  0,  4,  12, 14, 0,  0,  0,  0,
+      2,  4,  12, 14, 0,  0,  0,  0,  0,  2,  4,  12, 14, 0,  0,  0,  6,  12,
+      14, 0,  0,  0,  0,  0,  0,  6,  12, 14, 0,  0,  0,  0,  2,  6,  12, 14,
+      0,  0,  0,  0,  0,  2,  6,  12, 14, 0,  0,  0,  4,  6,  12, 14, 0,  0,
+      0,  0,  0,  4,  6,  12, 14, 0,  0,  0,  2,  4,  6,  12, 14, 0,  0,  0,
+      0,  2,  4,  6,  12, 14, 0,  0,  8,  12, 14, 0,  0,  0,  0,  0,  0,  8,
+      12, 14, 0,  0,  0,  0,  2,  8,  12, 14, 0,  0,  0,  0,  0,  2,  8,  12,
+      14, 0,  0,  0,  4,  8,  12, 14, 0,  0,  0,  0,  0,  4,  8,  12, 14, 0,
+      0,  0,  2,  4,  8,  12, 14, 0,  0,  0,  0,  2,  4,  8,  12, 14, 0,  0,
+      6,  8,  12, 14, 0,  0,  0,  0,  0,  6,  8,  12, 14, 0,  0,  0,  2,  6,
+      8,  12, 14, 0,  0,  0,  0,  2,  6,  8,  12, 14, 0,  0,  4,  6,  8,  12,
+      14, 0,  0,  0,  0,  4,  6,  8,  12, 14, 0,  0,  2,  4,  6,  8,  12, 14,
+      0,  0,  0,  2,  4,  6,  8,  12, 14, 0,  10, 12, 14, 0,  0,  0,  0,  0,
+      0,  10, 12, 14, 0,  0,  0,  0,  2,  10, 12, 14, 0,  0,  0,  0,  0,  2,
+      10, 12, 14, 0,  0,  0,  4,  10, 12, 14, 0,  0,  0,  0,  0,  4,  10, 12,
+      14, 0,  0,  0,  2,  4,  10, 12, 14, 0,  0,  0,  0,  2,  4,  10, 12, 14,
+      0,  0,  6,  10, 12, 14, 0,  0,  0,  0,  0,  6,  10, 12, 14, 0,  0,  0,
+      2,  6,  10, 12, 14, 0,  0,  0,  0,  2,  6,  10, 12, 14, 0,  0,  4,  6,
+      10, 12, 14, 0,  0,  0,  0,  4,  6,  10, 12, 14, 0,  0,  2,  4,  6,  10,
+      12, 14, 0,  0,  0,  2,  4,  6,  10, 12, 14, 0,  8,  10, 12, 14, 0,  0,
+      0,  0,  0,  8,  10, 12, 14, 0,  0,  0,  2,  8,  10, 12, 14, 0,  0,  0,
+      0,  2,  8,  10, 12, 14, 0,  0,  4,  8,  10, 12, 14, 0,  0,  0,  0,  4,
+      8,  10, 12, 14, 0,  0,  2,  4,  8,  10, 12, 14, 0,  0,  0,  2,  4,  8,
+      10, 12, 14, 0,  6,  8,  10, 12, 14, 0,  0,  0,  0,  6,  8,  10, 12, 14,
+      0,  0,  2,  6,  8,  10, 12, 14, 0,  0,  0,  2,  6,  8,  10, 12, 14, 0,
+      4,  6,  8,  10, 12, 14, 0,  0,  0,  4,  6,  8,  10, 12, 14, 0,  2,  4,
+      6,  8,  10, 12, 14, 0,  0,  2,  4,  6,  8,  10, 12, 14};
+
+  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
+  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+  return BitCast(d, pairs + Set(du, 0x0100));
+}
+
+template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 16);

@@ -2968,71 +3355,42 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(
 // Helper function called by both Compress and CompressStore - avoids a
 // redundant BitsFromMask in the latter.

-template <size_t N>
-HWY_API Vec128<uint32_t, N> Compress(Vec128<uint32_t, N> v,
-                                     const uint64_t mask_bits) {
-#if HWY_TARGET == HWY_AVX3
-  return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask_bits, v.raw)};
-#else
-  const auto idx = detail::Idx32x4FromBits<uint32_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> Compress(Vec128<int32_t, N> v,
-                                    const uint64_t mask_bits) {
-#if HWY_TARGET == HWY_AVX3
-  return Vec128<int32_t, N>{_mm_maskz_compress_epi32(mask_bits, v.raw)};
-#else
-  const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Compress(Vec128<uint64_t, N> v,
-                                     const uint64_t mask_bits) {
-#if HWY_TARGET == HWY_AVX3
-  return Vec128<uint64_t, N>{_mm_maskz_compress_epi64(mask_bits, v.raw)};
-#else
-  const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> Compress(Vec128<int64_t, N> v,
-                                    const uint64_t mask_bits) {
-#if HWY_TARGET == HWY_AVX3
-  return Vec128<int64_t, N>{_mm_maskz_compress_epi64(mask_bits, v.raw)};
-#else
-  const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
-  return TableLookupBytes(v, idx);
-#endif
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
+                              const uint64_t mask_bits) {
+  const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
+  using D = Simd<T, N>;
+  const RebindToSigned<D> di;
+  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
 }

-template <size_t N>
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v,
-                                  const uint64_t mask_bits) {
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
+                              const uint64_t mask_bits) {
+  using D = Simd<T, N>;
+  using TI = MakeSigned<T>;
+  const Rebind<TI, D> di;
 #if HWY_TARGET == HWY_AVX3
-  return Vec128<float, N>{_mm_maskz_compress_ps(mask_bits, v.raw)};
+  return BitCast(D(), Vec128<TI, N>{_mm_maskz_compress_epi32(
+                          mask_bits, BitCast(di, v).raw)});
 #else
-  const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
-  const Simd<float, N> df;
-  const Simd<int32_t, N> di;
-  return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
+  const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
+  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
 #endif
 }

-template <size_t N>
-HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
-                                   const uint64_t mask_bits) {
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<8> /*tag*/, Vec128<T, N> v,
+                              const uint64_t mask_bits) {
+  using D = Simd<T, N>;
+  using TI = MakeSigned<T>;
+  const Rebind<TI, D> di;
 #if HWY_TARGET == HWY_AVX3
-  return Vec128<double, N>{_mm_maskz_compress_pd(mask_bits, v.raw)};
+  return BitCast(D(), Vec128<TI, N>{_mm_maskz_compress_epi64(
+                          mask_bits, BitCast(di, v).raw)});
 #else
-  const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
-  const Simd<double, N> df;
-  const Simd<int64_t, N> di;
-  return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
+  const auto idx = detail::Idx64x2FromBits<T, N>(mask_bits);
+  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
 #endif
 }

@@ -3040,7 +3398,8 @@ HWY_API Vec128<double, N> Compress(Vec12

 template <typename T, size_t N>
 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
-  return detail::Compress(v, detail::BitsFromMask(mask));
+  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
+                          detail::BitsFromMask(mask));
 }

 // ------------------------------ CompressStore
@@ -3050,63 +3409,285 @@ HWY_API size_t CompressStore(Vec128<T, N
                              Simd<T, N> d, T* HWY_RESTRICT aligned) {
   const uint64_t mask_bits = detail::BitsFromMask(mask);
   // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
-  Store(detail::Compress(v, mask_bits), d, aligned);
+  Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
   return PopCount(mask_bits);
 }

+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
+// TableLookupBytes)
+
+// 128 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
+                               const Vec128<uint8_t> v1,
+                               const Vec128<uint8_t> v2, Full128<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const auto k5 = Set(d, 5);
+  const auto k6 = Set(d, 6);
+
+  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
+  // 0x80 so lanes to be filled from other vectors are 0 for blending.
+  alignas(16) static constexpr uint8_t tbl_r0[16] = {
+      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
+      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+  alignas(16) static constexpr uint8_t tbl_g0[16] = {
+      0x80, 0, 0x80, 0x80, 1, 0x80,  //
+      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+  const auto shuf_r0 = Load(d, tbl_r0);
+  const auto shuf_g0 = Load(d, tbl_g0);  // cannot reuse r0 due to 5 in MSB
+  const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+  const auto r0 = TableLookupBytes(v0, shuf_r0);  // 5..4..3..2..1..0
+  const auto g0 = TableLookupBytes(v1, shuf_g0);  // ..4..3..2..1..0.
+  const auto b0 = TableLookupBytes(v2, shuf_b0);  // .4..3..2..1..0..
+  const auto int0 = r0 | g0 | b0;
+  StoreU(int0, d, unaligned + 0 * 16);
+
+  // Second vector: g10,r10, bgr[9:6], b5,g5
+  const auto shuf_r1 = shuf_b0 + k6;  // .A..9..8..7..6..
+  const auto shuf_g1 = shuf_r0 + k5;  // A..9..8..7..6..5
+  const auto shuf_b1 = shuf_g0 + k5;  // ..9..8..7..6..5.
+  const auto r1 = TableLookupBytes(v0, shuf_r1);
+  const auto g1 = TableLookupBytes(v1, shuf_g1);
+  const auto b1 = TableLookupBytes(v2, shuf_b1);
+  const auto int1 = r1 | g1 | b1;
+  StoreU(int1, d, unaligned + 1 * 16);
+
+  // Third vector: bgr[15:11], b10
+  const auto shuf_r2 = shuf_b1 + k6;  // ..F..E..D..C..B.
+  const auto shuf_g2 = shuf_r1 + k5;  // .F..E..D..C..B..
+  const auto shuf_b2 = shuf_g1 + k5;  // F..E..D..C..B..A
+  const auto r2 = TableLookupBytes(v0, shuf_r2);
+  const auto g2 = TableLookupBytes(v1, shuf_g2);
+  const auto b2 = TableLookupBytes(v2, shuf_b2);
+  const auto int2 = r2 | g2 | b2;
+  StoreU(int2, d, unaligned + 2 * 16);
+}
+
+// 64 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
+                               const Vec128<uint8_t, 8> v1,
+                               const Vec128<uint8_t, 8> v2, Simd<uint8_t, 8> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors for the shuffles and first result.
+  const Full128<uint8_t> d_full;
+  const auto k5 = Set(d_full, 5);
+  const auto k6 = Set(d_full, 6);
+
+  const Vec128<uint8_t> full_a{v0.raw};
+  const Vec128<uint8_t> full_b{v1.raw};
+  const Vec128<uint8_t> full_c{v2.raw};
+
+  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
+  // 0x80 so lanes to be filled from other vectors are 0 for blending.
+  alignas(16) static constexpr uint8_t tbl_r0[16] = {
+      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
+      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+  alignas(16) static constexpr uint8_t tbl_g0[16] = {
+      0x80, 0, 0x80, 0x80, 1, 0x80,  //
+      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+  const auto shuf_r0 = Load(d_full, tbl_r0);
+  const auto shuf_g0 = Load(d_full, tbl_g0);  // cannot reuse r0 due to 5 in MSB
+  const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+  const auto r0 = TableLookupBytes(full_a, shuf_r0);  // 5..4..3..2..1..0
+  const auto g0 = TableLookupBytes(full_b, shuf_g0);  // ..4..3..2..1..0.
+  const auto b0 = TableLookupBytes(full_c, shuf_b0);  // .4..3..2..1..0..
+  const auto int0 = r0 | g0 | b0;
+  StoreU(int0, d_full, unaligned + 0 * 16);
+
+  // Second (HALF) vector: bgr[7:6], b5,g5
+  const auto shuf_r1 = shuf_b0 + k6;  // ..7..6..
+  const auto shuf_g1 = shuf_r0 + k5;  // .7..6..5
+  const auto shuf_b1 = shuf_g0 + k5;  // 7..6..5.
+  const auto r1 = TableLookupBytes(full_a, shuf_r1);
+  const auto g1 = TableLookupBytes(full_b, shuf_g1);
+  const auto b1 = TableLookupBytes(full_c, shuf_b1);
+  const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
+  StoreU(int1, d, unaligned + 1 * 16);
+}
+
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
+                               const Vec128<uint8_t, N> v1,
+                               const Vec128<uint8_t, N> v2,
+                               Simd<uint8_t, N> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors for the shuffles and result.
+  const Full128<uint8_t> d_full;
+
+  const Vec128<uint8_t> full_a{v0.raw};
+  const Vec128<uint8_t> full_b{v1.raw};
+  const Vec128<uint8_t> full_c{v2.raw};
+
+  // Shuffle (v0,v1,v2) vector bytes to bgr[3:0].
+  // 0x80 so lanes to be filled from other vectors are 0 for blending.
+  alignas(16) static constexpr uint8_t tbl_r0[16] = {
+      0,    0x80, 0x80, 1,   0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80,  //
+      0x80, 0x80, 0x80, 0x80};
+  const auto shuf_r0 = Load(d_full, tbl_r0);
+  const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0);
+  const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0);
+  const auto r0 = TableLookupBytes(full_a, shuf_r0);  // ......3..2..1..0
+  const auto g0 = TableLookupBytes(full_b, shuf_g0);  // .....3..2..1..0.
+  const auto b0 = TableLookupBytes(full_c, shuf_b0);  // ....3..2..1..0..
+  const auto int0 = r0 | g0 | b0;
+  alignas(16) uint8_t buf[16];
+  StoreU(int0, d_full, buf);
+  CopyBytes<N * 3>(buf, unaligned);
+}
+
+// ------------------------------ StoreInterleaved4
+
+// 128 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
+                               const Vec128<uint8_t> v1,
+                               const Vec128<uint8_t> v2,
+                               const Vec128<uint8_t> v3, Full128<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // let a,b,c,d denote v0..3.
+  const auto ba0 = ZipLower(v0, v1);  // b7 a7 .. b0 a0
+  const auto dc0 = ZipLower(v2, v3);  // d7 c7 .. d0 c0
+  const auto ba8 = ZipUpper(v0, v1);
+  const auto dc8 = ZipUpper(v2, v3);
+  const auto dcba_0 = ZipLower(ba0, dc0);  // d..a3 d..a0
+  const auto dcba_4 = ZipUpper(ba0, dc0);  // d..a7 d..a4
+  const auto dcba_8 = ZipLower(ba8, dc8);  // d..aB d..a8
+  const auto dcba_C = ZipUpper(ba8, dc8);  // d..aF d..aC
+  StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16);
+  StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16);
+  StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16);
+  StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16);
+}
+
+// 64 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
+                               const Vec128<uint8_t, 8> in1,
+                               const Vec128<uint8_t, 8> in2,
+                               const Vec128<uint8_t, 8> in3,
+                               Simd<uint8_t, 8> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors to reduce the number of stores.
+  const Vec128<uint8_t> v0{in0.raw};
+  const Vec128<uint8_t> v1{in1.raw};
+  const Vec128<uint8_t> v2{in2.raw};
+  const Vec128<uint8_t> v3{in3.raw};
+  // let a,b,c,d denote v0..3.
+  const auto ba0 = ZipLower(v0, v1);       // b7 a7 .. b0 a0
+  const auto dc0 = ZipLower(v2, v3);       // d7 c7 .. d0 c0
+  const auto dcba_0 = ZipLower(ba0, dc0);  // d..a3 d..a0
+  const auto dcba_4 = ZipUpper(ba0, dc0);  // d..a7 d..a4
+  const Full128<uint8_t> d_full;
+  StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16);
+  StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16);
+}
+
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
+                               const Vec128<uint8_t, N> in1,
+                               const Vec128<uint8_t, N> in2,
+                               const Vec128<uint8_t, N> in3,
+                               Simd<uint8_t, N> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors to reduce the number of stores.
+  const Vec128<uint8_t> v0{in0.raw};
+  const Vec128<uint8_t> v1{in1.raw};
+  const Vec128<uint8_t> v2{in2.raw};
+  const Vec128<uint8_t> v3{in3.raw};
+  // let a,b,c,d denote v0..3.
+  const auto ba0 = ZipLower(v0, v1);       // b3 a3 .. b0 a0
+  const auto dc0 = ZipLower(v2, v3);       // d3 c3 .. d0 c0
+  const auto dcba_0 = ZipLower(ba0, dc0);  // d..a3 d..a0
+  alignas(16) uint8_t buf[16];
+  const Full128<uint8_t> d_full;
+  StoreU(BitCast(d_full, dcba_0), d_full, buf);
+  CopyBytes<4 * N>(buf, unaligned);
+}
+
 // ------------------------------ Reductions

 namespace detail {

-// For u32/i32/f32.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T, N> v3210) {
+// N=1 for any T: no-op
+template <typename T>
+HWY_API Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+                                const Vec128<T, 1> v) {
+  return v;
+}
+template <typename T>
+HWY_API Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+                                const Vec128<T, 1> v) {
+  return v;
+}
+template <typename T>
+HWY_API Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+                                const Vec128<T, 1> v) {
+  return v;
+}
+
+// u32/i32/f32:
+
+// N=2
+template <typename T>
+HWY_API Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
+                                const Vec128<T, 2> v10) {
+  return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
+}
+template <typename T>
+HWY_API Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
+                                const Vec128<T, 2> v10) {
+  return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+}
+template <typename T>
+HWY_API Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
+                                const Vec128<T, 2> v10) {
+  return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+}
+
+// N=4 (full)
+template <typename T>
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
   const Vec128<T> v1032 = Shuffle1032(v3210);
   const Vec128<T> v31_20_31_20 = v3210 + v1032;
   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
   return v20_31_20_31 + v31_20_31_20;
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
   const Vec128<T> v1032 = Shuffle1032(v3210);
   const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
   return Min(v20_31_20_31, v31_20_31_20);
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
   const Vec128<T> v1032 = Shuffle1032(v3210);
   const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
   return Max(v20_31_20_31, v31_20_31_20);
 }

-// For u64/i64/f64.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T, N> v10) {
+// u64/i64/f64:
+
+// N=2 (full)
+template <typename T>
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
   const Vec128<T> v01 = Shuffle01(v10);
   return v10 + v01;
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
   const Vec128<T> v01 = Shuffle01(v10);
   return Min(v10, v01);
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
   const Vec128<T> v01 = Shuffle01(v10);
   return Max(v10, v01);
 }

 }  // namespace detail

-// Supported for u/i/f 32/64. Returns the sum in each lane.
+// Supported for u/i/f 32/64. Returns the same value in each lane.
 template <typename T, size_t N>
 HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
   return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12	2021-06-02 10:56:05.234904387 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -20,6 +20,20 @@
 // particular, "Broadcast", pack and zip behavior may be surprising.

 #include <immintrin.h>  // AVX2+
+
+#if defined(_MSC_VER) && defined(__clang__)
+// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
+// including these headers when _MSC_VER is defined, like when using clang-cl.
+// Include these directly here.
+#include <avxintrin.h>
+// avxintrin defines __m256i and must come before avx2intrin.
+#include <avx2intrin.h>
+#include <bmi2intrin.h>  // _pext_u64
+#include <f16cintrin.h>
+#include <fmaintrin.h>
+#include <smmintrin.h>
+#endif
+
 #include <stddef.h>
 #include <stdint.h>

@@ -148,23 +162,24 @@ HWY_API Vec256<uint16_t> Set(Full256<uin
   return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))};  // NOLINT
 }
 HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
-  return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};  // NOLINT
+  return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
 }
 HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
   return Vec256<uint64_t>{
       _mm256_set1_epi64x(static_cast<long long>(t))};  // NOLINT
 }
 HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
-  return Vec256<int8_t>{_mm256_set1_epi8(t)};
+  return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))};  // NOLINT
 }
 HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
-  return Vec256<int16_t>{_mm256_set1_epi16(t)};
+  return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))};  // NOLINT
 }
 HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
   return Vec256<int32_t>{_mm256_set1_epi32(t)};
 }
 HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
-  return Vec256<int64_t>{_mm256_set1_epi64x(t)};
+  return Vec256<int64_t>{
+      _mm256_set1_epi64x(static_cast<long long>(t))};  // NOLINT
 }
 HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
   return Vec256<float>{_mm256_set1_ps(t)};
@@ -340,6 +355,8 @@ HWY_API Vec256<T> VecFromMask(Full256<T>
   return Vec256<T>{v.raw};
 }

+// ------------------------------ IfThenElse
+
 // mask ? yes : no
 template <typename T>
 HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
@@ -412,9 +429,9 @@ HWY_API Mask256<T> Xor(const Mask256<T>
 // Comparisons fill a lane with 1-bits if the condition is true, else 0.

 template <typename TFrom, typename TTo>
-HWY_API Mask256<TTo> RebindMask(Full256<TTo> /*tag*/, Mask256<TFrom> m) {
+HWY_API Mask256<TTo> RebindMask(Full256<TTo> d_to, Mask256<TFrom> m) {
   static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask256<TTo>{m.raw};
+  return MaskFromVec(BitCast(d_to, VecFromMask(Full256<TFrom>(), m)));
 }

 // ------------------------------ Equality
@@ -670,6 +687,14 @@ HWY_API Vec256<double> Max(const Vec256<
   return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
 }

+// ------------------------------ FirstN (Iota, Lt)
+
+template <typename T>
+HWY_API Mask256<T> FirstN(const Full256<T> d, size_t n) {
+  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
+  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(n)));
+}
+
 // ================================================== ARITHMETIC

 // ------------------------------ Addition
@@ -832,7 +857,13 @@ HWY_API Vec256<uint16_t> AverageRound(co

 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
 HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
+#if HWY_COMPILER_MSVC
+  // Workaround for incorrect codegen? (wrong result)
+  const auto zero = Zero(Full256<int8_t>());
+  return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
+#else
   return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
+#endif
 }
 HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
   return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
@@ -840,6 +871,7 @@ HWY_API Vec256<int16_t> Abs(const Vec256
 HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
   return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
 }
+// i64 is implemented after BroadcastSignBit.

 HWY_API Vec256<float> Abs(const Vec256<float> v) {
   const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
@@ -925,6 +957,16 @@ HWY_API Vec256<int64_t> ShiftLeft(const
   return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
 }

+template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
+  const Full256<T> d8;
+  const RepartitionToWide<decltype(d8)> d16;
+  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
+  return kBits == 1
+             ? (v + v)
+             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
+}
+
 // ------------------------------ ShiftRight

 template <int kBits>
@@ -943,6 +985,14 @@ HWY_API Vec256<uint64_t> ShiftRight(cons
 }

 template <int kBits>
+HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
+  const Full256<uint8_t> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
+  return shifted & Set(d8, 0xFF >> kBits);
+}
+
+template <int kBits>
 HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
   return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
 }
@@ -952,6 +1002,15 @@ HWY_API Vec256<int32_t> ShiftRight(const
   return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
 }

+template <int kBits>
+HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
+  const Full256<int8_t> di;
+  const Full256<uint8_t> du;
+  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
+  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
+  return (shifted ^ shifted_sign) - shifted_sign;
+}
+
 // i64 is implemented after BroadcastSignBit.

 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
@@ -989,6 +1048,15 @@ HWY_API Vec256<int64_t> ShiftRight(const
 #endif
 }

+HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
+#if HWY_TARGET == HWY_AVX3
+  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
+#else
+  const auto zero = Zero(Full256<int64_t>());
+  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
+#endif
+}
+
 // ------------------------------ ShiftLeftSame

 HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
@@ -1016,6 +1084,14 @@ HWY_API Vec256<int64_t> ShiftLeftSame(co
   return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
 }

+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
+  const Full256<T> d8;
+  const RepartitionToWide<decltype(d8)> d16;
+  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
+  return shifted & Set(d8, (0xFF << bits) & 0xFF);
+}
+
 // ------------------------------ ShiftRightSame (BroadcastSignBit)

 HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
@@ -1031,6 +1107,13 @@ HWY_API Vec256<uint64_t> ShiftRightSame(
   return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
 }

+HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
+  const Full256<uint8_t> d8;
+  const RepartitionToWide<decltype(d8)> d16;
+  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
+  return shifted & Set(d8, 0xFF >> bits);
+}
+
 HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
                                        const int bits) {
   return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
@@ -1053,6 +1136,14 @@ HWY_API Vec256<int64_t> ShiftRightSame(c
 #endif
 }

+HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
+  const Full256<int8_t> di;
+  const Full256<uint8_t> du;
+  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+  return (shifted ^ shifted_sign) - shifted_sign;
+}
+
 // ------------------------------ Negate

 template <typename T, HWY_IF_FLOAT(T)>
@@ -1335,6 +1426,123 @@ HWY_API void Stream(const Vec256<double>
   _mm256_stream_pd(aligned, v.raw);
 }

+// ------------------------------ Scatter
+
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+
+#if HWY_TARGET == HWY_AVX3
+namespace detail {
+
+template <typename T>
+HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256<T> v,
+                           Full256<T> /* tag */, T* HWY_RESTRICT base,
+                           const Vec256<int32_t> offset) {
+  _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
+}
+template <typename T>
+HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256<T> v,
+                          Full256<T> /* tag */, T* HWY_RESTRICT base,
+                          const Vec256<int32_t> index) {
+  _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
+}
+
+template <typename T>
+HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256<T> v,
+                           Full256<T> /* tag */, T* HWY_RESTRICT base,
+                           const Vec256<int64_t> offset) {
+  _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
+}
+template <typename T>
+HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256<T> v,
+                          Full256<T> /* tag */, T* HWY_RESTRICT base,
+                          const Vec256<int64_t> index) {
+  _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
+}
+
+}  // namespace detail
+
+template <typename T, typename Offset>
+HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
+                           const Vec256<Offset> offset) {
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
+}
+template <typename T, typename Index>
+HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
+                          const Vec256<Index> index) {
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
+}
+
+template <>
+HWY_INLINE void ScatterOffset<float>(Vec256<float> v, Full256<float> /* tag */,
+                                     float* HWY_RESTRICT base,
+                                     const Vec256<int32_t> offset) {
+  _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
+}
+template <>
+HWY_INLINE void ScatterIndex<float>(Vec256<float> v, Full256<float> /* tag */,
+                                    float* HWY_RESTRICT base,
+                                    const Vec256<int32_t> index) {
+  _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
+}
+
+template <>
+HWY_INLINE void ScatterOffset<double>(Vec256<double> v,
+                                      Full256<double> /* tag */,
+                                      double* HWY_RESTRICT base,
+                                      const Vec256<int64_t> offset) {
+  _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
+}
+template <>
+HWY_INLINE void ScatterIndex<double>(Vec256<double> v,
+                                     Full256<double> /* tag */,
+                                     double* HWY_RESTRICT base,
+                                     const Vec256<int64_t> index) {
+  _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
+}
+
+#else
+
+template <typename T, typename Offset>
+HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
+                           const Vec256<Offset> offset) {
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+
+  constexpr size_t N = 32 / sizeof(T);
+  alignas(32) T lanes[N];
+  Store(v, d, lanes);
+
+  alignas(32) Offset offset_lanes[N];
+  Store(offset, Simd<Offset, N>(), offset_lanes);
+
+  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
+  for (size_t i = 0; i < N; ++i) {
+    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
+  }
+}
+
+template <typename T, typename Index>
+HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
+                          const Vec256<Index> index) {
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+
+  constexpr size_t N = 32 / sizeof(T);
+  alignas(32) T lanes[N];
+  Store(v, d, lanes);
+
+  alignas(32) Index index_lanes[N];
+  Store(index, Simd<Index, N>(), index_lanes);
+
+  for (size_t i = 0; i < N; ++i) {
+    base[index_lanes[i]] = lanes[i];
+  }
+}
+
+#endif
+
 // ------------------------------ Gather

 namespace detail {
@@ -1374,13 +1582,13 @@ HWY_API Vec256<T> GatherIndex(hwy::SizeT
 template <typename T, typename Offset>
 HWY_API Vec256<T> GatherOffset(Full256<T> d, const T* HWY_RESTRICT base,
                                const Vec256<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
   return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
 }
 template <typename T, typename Index>
 HWY_API Vec256<T> GatherIndex(Full256<T> d, const T* HWY_RESTRICT base,
                               const Vec256<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
   return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
 }

@@ -1410,6 +1618,8 @@ HWY_INLINE Vec256<double> GatherIndex<do
   return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
 }

+HWY_DIAGNOSTICS(pop)
+
 // ================================================== SWIZZLE

 template <typename T>
@@ -1861,38 +2071,26 @@ HWY_API Vec256<int64_t> ZipUpper(const V
   return Vec256<int64_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
 }

-// ------------------------------ Blocks
+// ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
+
+// _mm256_broadcastsi128_si256 has 7 cycle latency. _mm256_permute2x128_si256 is
+// slow on Zen1 (8 uops); we can avoid it for LowerLower and UpperLower, and on
+// UpperUpper at the cost of one extra cycle/instruction.

 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
 template <typename T>
 HWY_API Vec256<T> ConcatLowerLower(const Vec256<T> hi, const Vec256<T> lo) {
-  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x20)};
+  return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(hi).raw, 1)};
 }
 template <>
 HWY_INLINE Vec256<float> ConcatLowerLower(const Vec256<float> hi,
                                           const Vec256<float> lo) {
-  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x20)};
+  return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(hi).raw, 1)};
 }
 template <>
 HWY_INLINE Vec256<double> ConcatLowerLower(const Vec256<double> hi,
                                            const Vec256<double> lo) {
-  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x20)};
-}
-
-// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
-template <typename T>
-HWY_API Vec256<T> ConcatUpperUpper(const Vec256<T> hi, const Vec256<T> lo) {
-  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
-}
-template <>
-HWY_INLINE Vec256<float> ConcatUpperUpper(const Vec256<float> hi,
-                                          const Vec256<float> lo) {
-  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
-}
-template <>
-HWY_INLINE Vec256<double> ConcatUpperUpper(const Vec256<double> hi,
-                                           const Vec256<double> lo) {
-  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
+  return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(hi).raw, 1)};
 }

 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
@@ -1927,6 +2125,12 @@ HWY_INLINE Vec256<double> ConcatUpperLow
   return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
 }

+// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
+template <typename T>
+HWY_API Vec256<T> ConcatUpperUpper(const Vec256<T> hi, const Vec256<T> lo) {
+  return ConcatUpperLower(hi, ZeroExtendVector(UpperHalf(lo)));
+}
+
 // ------------------------------ Odd/even lanes

 namespace detail {
@@ -2211,11 +2415,18 @@ HWY_API Vec128<int8_t> DemoteTo(Full128<
       _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
 }

+  // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
+  // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
+
 HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> /* tag */,
                                    const Vec256<float> v) {
   return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
 }

+HWY_DIAGNOSTICS(pop)
+
 HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
                                const Vec256<double> v) {
   return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
@@ -2241,7 +2452,7 @@ HWY_API Vec128<uint8_t, 8> U8FromU32(con
   return BitCast(Simd<uint8_t, 8>(), pair);
 }

-// ------------------------------ Convert integer <=> floating point
+// ------------------------------ Integer <=> fp (ShiftRight, OddEven)

 HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
                                 const Vec256<int32_t> v) {
@@ -2253,13 +2464,20 @@ HWY_API Vec256<double> ConvertTo(Full256
   (void)dd;
   return Vec256<double>{_mm256_cvtepi64_pd(v.raw)};
 #else
-  alignas(32) int64_t lanes_i[4];
-  Store(v, Full256<int64_t>(), lanes_i);
-  alignas(32) double lanes_d[4];
-  for (size_t i = 0; i < 4; ++i) {
-    lanes_d[i] = static_cast<double>(lanes_i[i]);
-  }
-  return Load(dd, lanes_d);
+  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
+  const Repartition<uint32_t, decltype(dd)> d32;
+  const Repartition<uint64_t, decltype(dd)> d64;
+
+  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
+  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
+  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
+
+  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
+  const auto k52 = Set(d32, 0x43300000);
+  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
+
+  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
+  return (v_upper - k84_63_52) + v_lower;  // order matters!
 #endif
 }

@@ -2334,8 +2552,7 @@ HWY_API uint64_t BitsFromMask(hwy::SizeT
   const auto compressed =
       _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
   return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
-
-#endif
+#endif  // HWY_ARCH_X86_64
 }

 template <typename T>
@@ -2473,75 +2690,100 @@ HWY_INLINE Vec256<uint32_t> Idx64x4FromB
   return Load(d32, packed_array + 8 * mask_bits);
 }

-// Helper function called by both Compress and CompressStore - avoids a
+// Helper functions called by both Compress and CompressStore - avoids a
 // redundant BitsFromMask in the latter.

-HWY_API Vec256<uint32_t> Compress(Vec256<uint32_t> v,
-                                  const uint64_t mask_bits) {
-#if HWY_TARGET == HWY_AVX3
-  return Vec256<uint32_t>{
-      _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)};
-#else
-  const Vec256<uint32_t> idx = detail::Idx32x8FromBits(mask_bits);
-  return Vec256<uint32_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
-#endif
-}
-HWY_API Vec256<int32_t> Compress(Vec256<int32_t> v, const uint64_t mask_bits) {
+template <typename T>
+HWY_API Vec256<T> Compress(hwy::SizeTag<4> /*tag*/, Vec256<T> v,
+                           const uint64_t mask_bits) {
+  const auto vu = BitCast(Full256<uint32_t>(), v);
 #if HWY_TARGET == HWY_AVX3
-  return Vec256<int32_t>{
-      _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)};
+  const __m256i ret =
+      _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), vu.raw);
 #else
   const Vec256<uint32_t> idx = detail::Idx32x8FromBits(mask_bits);
-  return Vec256<int32_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
+  const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw);
 #endif
+  return BitCast(Full256<T>(), Vec256<uint32_t>{ret});
 }

-HWY_API Vec256<uint64_t> Compress(Vec256<uint64_t> v,
-                                  const uint64_t mask_bits) {
-#if HWY_TARGET == HWY_AVX3
-  return Vec256<uint64_t>{
-      _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)};
-#else
-  const Vec256<uint32_t> idx = detail::Idx64x4FromBits(mask_bits);
-  return Vec256<uint64_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
-#endif
-}
-HWY_API Vec256<int64_t> Compress(Vec256<int64_t> v, const uint64_t mask_bits) {
+template <typename T>
+HWY_API Vec256<T> Compress(hwy::SizeTag<8> /*tag*/, Vec256<T> v,
+                           const uint64_t mask_bits) {
+  const auto vu = BitCast(Full256<uint64_t>(), v);
 #if HWY_TARGET == HWY_AVX3
-  return Vec256<int64_t>{
-      _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)};
+  const __m256i ret =
+      _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), vu.raw);
 #else
   const Vec256<uint32_t> idx = detail::Idx64x4FromBits(mask_bits);
-  return Vec256<int64_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
+  const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw);
 #endif
+  return BitCast(Full256<T>(), Vec256<uint64_t>{ret});
 }

-HWY_API Vec256<float> Compress(Vec256<float> v, const uint64_t mask_bits) {
-#if HWY_TARGET == HWY_AVX3
-  return Vec256<float>{
-      _mm256_maskz_compress_ps(static_cast<__mmask8>(mask_bits), v.raw)};
-#else
-  const Vec256<uint32_t> idx = detail::Idx32x8FromBits(mask_bits);
-  return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
-#endif
-}
+// Otherwise, defined in x86_512-inl.h so it can use wider vectors.
+#if HWY_TARGET != HWY_AVX3

-HWY_API Vec256<double> Compress(Vec256<double> v, const uint64_t mask_bits) {
-#if HWY_TARGET == HWY_AVX3
-  return Vec256<double>{
-      _mm256_maskz_compress_pd(static_cast<__mmask8>(mask_bits), v.raw)};
-#else
-  const Vec256<uint32_t> idx = detail::Idx64x4FromBits(mask_bits);
-  return Vec256<double>{_mm256_castsi256_pd(
-      _mm256_permutevar8x32_epi32(_mm256_castpd_si256(v.raw), idx.raw))};
-#endif
+// LUTs are infeasible for 2^16 possible masks. Promoting to 32-bit and using
+// the native Compress is probably more efficient than 2 LUTs.
+template <typename T>
+HWY_API Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
+                           const uint64_t mask_bits) {
+  using D = Full256<T>;
+  const Rebind<uint16_t, D> du;
+  const Repartition<int32_t, D> dw;
+  const auto vu16 = BitCast(du, v);  // (required for float16_t inputs)
+  const auto promoted0 = PromoteTo(dw, LowerHalf(vu16));
+  const auto promoted1 = PromoteTo(dw, UpperHalf(vu16));
+
+  const uint64_t mask_bits0 = mask_bits & 0xFF;
+  const uint64_t mask_bits1 = mask_bits >> 8;
+  const auto compressed0 = Compress(hwy::SizeTag<4>(), promoted0, mask_bits0);
+  const auto compressed1 = Compress(hwy::SizeTag<4>(), promoted1, mask_bits1);
+
+  const Half<decltype(du)> dh;
+  const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
+  const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));
+
+  const size_t count0 = PopCount(mask_bits0);
+  // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with
+  // VPERMD for shifting at 4 byte granularity.
+  alignas(32) constexpr int32_t iota4[16] = {0, 0, 0, 0, 0, 0, 0, 0,
+                                             0, 1, 2, 3, 4, 5, 6, 7};
+  const auto indices = SetTableIndices(dw, iota4 + 8 - count0 / 2);
+  const auto shift1_multiple4 =
+      BitCast(du, TableLookupLanes(BitCast(dw, demoted1), indices));
+
+  // Whole-register unconditional shift by 2 bytes.
+  // TODO(janwas): slow on AMD, use 2 shifts + permq + OR instead?
+  const __m256i lo_zz = _mm256_permute2x128_si256(shift1_multiple4.raw,
+                                                  shift1_multiple4.raw, 0x08);
+  const auto shift1_multiple2 =
+      Vec256<uint16_t>{_mm256_alignr_epi8(shift1_multiple4.raw, lo_zz, 14)};
+
+  // Make the shift conditional on the lower bit of count0.
+  const auto m_odd = TestBit(Set(du, count0), Set(du, 1));
+  const auto shifted1 = IfThenElse(m_odd, shift1_multiple2, shift1_multiple4);
+
+  // Blend the lower and shifted upper parts.
+  constexpr uint16_t on = 0xFFFF;
+  alignas(32) constexpr uint16_t lower_lanes[32] = {HWY_REP4(on), HWY_REP4(on),
+                                                    HWY_REP4(on), HWY_REP4(on)};
+  const auto m_lower = MaskFromVec(LoadU(du, lower_lanes + 16 - count0));
+  return BitCast(D(), IfThenElse(m_lower, demoted0, shifted1));
 }

+#endif  // HWY_TARGET != HWY_AVX3
+
 }  // namespace detail

+// Otherwise, defined in x86_512-inl.h after detail::Compress.
+#if HWY_TARGET != HWY_AVX3
+
 template <typename T>
 HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
-  return detail::Compress(v, detail::BitsFromMask(mask));
+  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
+                          detail::BitsFromMask(mask));
 }

 // ------------------------------ CompressStore
@@ -2550,10 +2792,101 @@ template <typename T>
 HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
                              T* HWY_RESTRICT aligned) {
   const uint64_t mask_bits = detail::BitsFromMask(mask);
-  Store(detail::Compress(v, mask_bits), d, aligned);
+  // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but
+  // using StoreU to concatenate the results would cause page faults if
+  // `aligned` is the last valid vector. Instead rely on in-register splicing.
+  Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
   return PopCount(mask_bits);
 }

+#endif  // HWY_TARGET != HWY_AVX3
+
+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
+// TableLookupBytes, ConcatUpperLower)
+
+HWY_API void StoreInterleaved3(const Vec256<uint8_t> v0,
+                               const Vec256<uint8_t> v1,
+                               const Vec256<uint8_t> v2, Full256<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const auto k5 = Set(d, 5);
+  const auto k6 = Set(d, 6);
+
+  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
+  // 0x80 so lanes to be filled from other vectors are 0 for blending.
+  alignas(16) static constexpr uint8_t tbl_r0[16] = {
+      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
+      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+  alignas(16) static constexpr uint8_t tbl_g0[16] = {
+      0x80, 0, 0x80, 0x80, 1, 0x80,  //
+      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+  const auto shuf_r0 = LoadDup128(d, tbl_r0);
+  const auto shuf_g0 = LoadDup128(d, tbl_g0);  // cannot reuse r0 due to 5
+  const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+  const auto r0 = TableLookupBytes(v0, shuf_r0);  // 5..4..3..2..1..0
+  const auto g0 = TableLookupBytes(v1, shuf_g0);  // ..4..3..2..1..0.
+  const auto b0 = TableLookupBytes(v2, shuf_b0);  // .4..3..2..1..0..
+  const auto interleaved_10_00 = r0 | g0 | b0;
+
+  // Second vector: g10,r10, bgr[9:6], b5,g5
+  const auto shuf_r1 = shuf_b0 + k6;  // .A..9..8..7..6..
+  const auto shuf_g1 = shuf_r0 + k5;  // A..9..8..7..6..5
+  const auto shuf_b1 = shuf_g0 + k5;  // ..9..8..7..6..5.
+  const auto r1 = TableLookupBytes(v0, shuf_r1);
+  const auto g1 = TableLookupBytes(v1, shuf_g1);
+  const auto b1 = TableLookupBytes(v2, shuf_b1);
+  const auto interleaved_15_05 = r1 | g1 | b1;
+
+  // We want to write the lower halves of the interleaved vectors, then the
+  // upper halves. We could obtain 10_05 and 15_0A via ConcatUpperLower, but
+  // that would require two ununaligned stores. For the lower halves, we can
+  // merge two 128-bit stores for the same swizzling cost:
+  const auto out0 = ConcatLowerLower(interleaved_15_05, interleaved_10_00);
+  StoreU(out0, d, unaligned + 0 * 32);
+
+  // Third vector: bgr[15:11], b10
+  const auto shuf_r2 = shuf_b1 + k6;  // ..F..E..D..C..B.
+  const auto shuf_g2 = shuf_r1 + k5;  // .F..E..D..C..B..
+  const auto shuf_b2 = shuf_g1 + k5;  // F..E..D..C..B..A
+  const auto r2 = TableLookupBytes(v0, shuf_r2);
+  const auto g2 = TableLookupBytes(v1, shuf_g2);
+  const auto b2 = TableLookupBytes(v2, shuf_b2);
+  const auto interleaved_1A_0A = r2 | g2 | b2;
+
+  const auto out1 = ConcatUpperLower(interleaved_10_00, interleaved_1A_0A);
+  StoreU(out1, d, unaligned + 1 * 32);
+
+  const auto out2 = ConcatUpperUpper(interleaved_1A_0A, interleaved_15_05);
+  StoreU(out2, d, unaligned + 2 * 32);
+}
+
+// ------------------------------ StoreInterleaved4
+
+HWY_API void StoreInterleaved4(const Vec256<uint8_t> v0,
+                               const Vec256<uint8_t> v1,
+                               const Vec256<uint8_t> v2,
+                               const Vec256<uint8_t> v3, Full256<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // let a,b,c,d denote v0..3.
+  const auto ba0 = ZipLower(v0, v1);  // b7 a7 .. b0 a0
+  const auto dc0 = ZipLower(v2, v3);  // d7 c7 .. d0 c0
+  const auto ba8 = ZipUpper(v0, v1);
+  const auto dc8 = ZipUpper(v2, v3);
+  const auto dcba_0 = ZipLower(ba0, dc0);  // d..a13 d..a10 | d..a03 d..a00
+  const auto dcba_4 = ZipUpper(ba0, dc0);  // d..a17 d..a14 | d..a07 d..a04
+  const auto dcba_8 = ZipLower(ba8, dc8);  // d..a1B d..a18 | d..a0B d..a08
+  const auto dcba_C = ZipUpper(ba8, dc8);  // d..a1F d..a1C | d..a0F d..a0C
+  // Write lower halves, then upper. vperm2i128 is slow on Zen1 but we can
+  // efficiently combine two lower halves into 256 bits:
+  const auto out0 = BitCast(d, ConcatLowerLower(dcba_4, dcba_0));
+  const auto out1 = BitCast(d, ConcatLowerLower(dcba_C, dcba_8));
+  StoreU(out0, d, unaligned + 0 * 32);
+  StoreU(out1, d, unaligned + 1 * 32);
+  const auto out2 = BitCast(d, ConcatUpperUpper(dcba_4, dcba_0));
+  const auto out3 = BitCast(d, ConcatUpperUpper(dcba_C, dcba_8));
+  StoreU(out2, d, unaligned + 2 * 32);
+  StoreU(out3, d, unaligned + 3 * 32);
+}
+
 // ------------------------------ Reductions

 namespace detail {
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12	2021-06-02 10:56:05.218904306 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -19,6 +19,23 @@
 // particular, "Broadcast", pack and zip behavior may be surprising.

 #include <immintrin.h>  // AVX2+
+#if defined(_MSC_VER) && defined(__clang__)
+// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
+// including these headers when _MSC_VER is defined, like when using clang-cl.
+// Include these directly here.
+#include <smmintrin.h>
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#include <f16cintrin.h>
+#include <fmaintrin.h>
+#include <avx512fintrin.h>
+#include <avx512vlintrin.h>
+#include <avx512bwintrin.h>
+#include <avx512dqintrin.h>
+#include <avx512vlbwintrin.h>
+#include <avx512vldqintrin.h>
+#endif
+
 #include <stddef.h>
 #include <stdint.h>

@@ -100,9 +117,8 @@ struct RawMask512<8> {
 // Mask register: one bit per lane.
 template <typename T>
 class Mask512 {
-  using Raw = typename RawMask512<sizeof(T)>::type;
-
  public:
+  using Raw = typename RawMask512<sizeof(T)>::type;
   Raw raw;
 };

@@ -167,23 +183,24 @@ HWY_API Vec512<uint16_t> Set(Full512<uin
   return Vec512<uint16_t>{_mm512_set1_epi16(static_cast<short>(t))};  // NOLINT
 }
 HWY_API Vec512<uint32_t> Set(Full512<uint32_t> /* tag */, const uint32_t t) {
-  return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};  // NOLINT
+  return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};
 }
 HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
   return Vec512<uint64_t>{
       _mm512_set1_epi64(static_cast<long long>(t))};  // NOLINT
 }
 HWY_API Vec512<int8_t> Set(Full512<int8_t> /* tag */, const int8_t t) {
-  return Vec512<int8_t>{_mm512_set1_epi8(t)};
+  return Vec512<int8_t>{_mm512_set1_epi8(static_cast<char>(t))};  // NOLINT
 }
 HWY_API Vec512<int16_t> Set(Full512<int16_t> /* tag */, const int16_t t) {
-  return Vec512<int16_t>{_mm512_set1_epi16(t)};
+  return Vec512<int16_t>{_mm512_set1_epi16(static_cast<short>(t))};  // NOLINT
 }
 HWY_API Vec512<int32_t> Set(Full512<int32_t> /* tag */, const int32_t t) {
   return Vec512<int32_t>{_mm512_set1_epi32(t)};
 }
 HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
-  return Vec512<int64_t>{_mm512_set1_epi64(t)};
+  return Vec512<int64_t>{
+      _mm512_set1_epi64(static_cast<long long>(t))};  // NOLINT
 }
 HWY_API Vec512<float> Set(Full512<float> /* tag */, const float t) {
   return Vec512<float>{_mm512_set1_ps(t)};
@@ -329,7 +346,45 @@ HWY_API Vec512<T> CopySignToAbs(const Ve
   return CopySign(abs, sign);
 }

-// ------------------------------ Select/blend
+// ------------------------------ FirstN
+
+// Possibilities for constructing a bitmask of N ones:
+// - kshift* only consider the lowest byte of the shift count, so they would
+//   not correctly handle large n.
+// - Scalar shifts >= 64 are UB.
+// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
+//   we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
+
+#if HWY_ARCH_X86_32
+namespace detail {
+
+// 32 bit mask is sufficient for lane size >= 2.
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
+HWY_API Mask512<T> FirstN(size_t n) {
+  using Bits = typename Mask512<T>::Raw;
+  return Mask512<T>{static_cast<Bits>(_bzhi_u32(~uint32_t(0), n))};
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Mask512<T> FirstN(size_t n) {
+  const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
+  return Mask512<T>{static_cast<__mmask64>(bits)};
+}
+
+}  // namespace detail
+#endif  // HWY_ARCH_X86_32
+
+template <typename T>
+HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
+#if HWY_ARCH_X86_64
+  using Bits = typename Mask512<T>::Raw;
+  return Mask512<T>{static_cast<Bits>(_bzhi_u64(~uint64_t(0), n))};
+#else
+  return detail::FirstN<T>(n);
+#endif  // HWY_ARCH_X86_64
+}
+
+// ------------------------------ IfThenElse

 // Returns mask ? b : a.

@@ -626,7 +681,13 @@ HWY_API Vec512<uint16_t> AverageRound(co

 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
 HWY_API Vec512<int8_t> Abs(const Vec512<int8_t> v) {
+#if HWY_COMPILER_MSVC
+  // Workaround for incorrect codegen? (untested due to internal compiler error)
+  const auto zero = Zero(Full512<int8_t>());
+  return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
+#else
   return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
+#endif
 }
 HWY_API Vec512<int16_t> Abs(const Vec512<int16_t> v) {
   return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
@@ -634,6 +695,9 @@ HWY_API Vec512<int16_t> Abs(const Vec512
 HWY_API Vec512<int32_t> Abs(const Vec512<int32_t> v) {
   return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
 }
+HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) {
+  return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
+}

 // These aren't native instructions, they also involve AND with constant.
 HWY_API Vec512<float> Abs(const Vec512<float> v) {
@@ -675,6 +739,16 @@ HWY_API Vec512<int64_t> ShiftLeft(const
   return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
 }

+template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec512<T> ShiftLeft(const Vec512<T> v) {
+  const Full512<T> d8;
+  const RepartitionToWide<decltype(d8)> d16;
+  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
+  return kBits == 1
+             ? (v + v)
+             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
+}
+
 // ------------------------------ ShiftRight

 template <int kBits>
@@ -693,6 +767,14 @@ HWY_API Vec512<uint64_t> ShiftRight(cons
 }

 template <int kBits>
+HWY_API Vec512<uint8_t> ShiftRight(const Vec512<uint8_t> v) {
+  const Full512<uint8_t> d8;
+  // Use raw instead of BitCast to support N=1.
+  const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
+  return shifted & Set(d8, 0xFF >> kBits);
+}
+
+template <int kBits>
 HWY_API Vec512<int16_t> ShiftRight(const Vec512<int16_t> v) {
   return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
 }
@@ -707,6 +789,15 @@ HWY_API Vec512<int64_t> ShiftRight(const
   return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
 }

+template <int kBits>
+HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
+  const Full512<int8_t> di;
+  const Full512<uint8_t> du;
+  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
+  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
+  return (shifted ^ shifted_sign) - shifted_sign;
+}
+
 // ------------------------------ ShiftLeftSame

 HWY_API Vec512<uint16_t> ShiftLeftSame(const Vec512<uint16_t> v,
@@ -734,6 +825,14 @@ HWY_API Vec512<int64_t> ShiftLeftSame(co
   return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
 }

+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
+  const Full512<T> d8;
+  const RepartitionToWide<decltype(d8)> d16;
+  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
+  return shifted & Set(d8, (0xFF << bits) & 0xFF);
+}
+
 // ------------------------------ ShiftRightSame

 HWY_API Vec512<uint16_t> ShiftRightSame(const Vec512<uint16_t> v,
@@ -749,6 +848,13 @@ HWY_API Vec512<uint64_t> ShiftRightSame(
   return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
 }

+HWY_API Vec512<uint8_t> ShiftRightSame(Vec512<uint8_t> v, const int bits) {
+  const Full512<uint8_t> d8;
+  const RepartitionToWide<decltype(d8)> d16;
+  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
+  return shifted & Set(d8, 0xFF >> bits);
+}
+
 HWY_API Vec512<int16_t> ShiftRightSame(const Vec512<int16_t> v,
                                        const int bits) {
   return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
@@ -763,6 +869,14 @@ HWY_API Vec512<int64_t> ShiftRightSame(c
   return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
 }

+HWY_API Vec512<int8_t> ShiftRightSame(Vec512<int8_t> v, const int bits) {
+  const Full512<int8_t> di;
+  const Full512<uint8_t> du;
+  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+  return (shifted ^ shifted_sign) - shifted_sign;
+}
+
 // ------------------------------ Shl

 HWY_API Vec512<uint16_t> operator<<(const Vec512<uint16_t> v,
@@ -1046,6 +1160,10 @@ HWY_API Vec512<float> ApproximateRecipro

 // ------------------------------ Floating-point rounding

+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+
 // Toward nearest integer, tie to even
 HWY_API Vec512<float> Round(const Vec512<float> v) {
   return Vec512<float>{_mm512_roundscale_ps(
@@ -1086,6 +1204,8 @@ HWY_API Vec512<double> Floor(const Vec51
       _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
 }

+HWY_DIAGNOSTICS(pop)
+
 // ================================================== COMPARE

 // Comparisons set a mask bit to 1 if the condition is true, else 0.
@@ -1678,6 +1798,83 @@ HWY_API void Stream(const Vec512<double>
   _mm512_stream_pd(aligned, v.raw);
 }

+// ------------------------------ Scatter
+
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+
+namespace detail {
+
+template <typename T>
+HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512<T> v,
+                           Full512<T> /* tag */, T* HWY_RESTRICT base,
+                           const Vec512<int32_t> offset) {
+  _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
+}
+template <typename T>
+HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512<T> v,
+                          Full512<T> /* tag */, T* HWY_RESTRICT base,
+                          const Vec512<int32_t> index) {
+  _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
+}
+
+template <typename T>
+HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512<T> v,
+                           Full512<T> /* tag */, T* HWY_RESTRICT base,
+                           const Vec512<int64_t> offset) {
+  _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
+}
+template <typename T>
+HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512<T> v,
+                          Full512<T> /* tag */, T* HWY_RESTRICT base,
+                          const Vec512<int64_t> index) {
+  _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
+}
+
+}  // namespace detail
+
+template <typename T, typename Offset>
+HWY_API void ScatterOffset(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
+                           const Vec512<Offset> offset) {
+  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
+}
+template <typename T, typename Index>
+HWY_API void ScatterIndex(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
+                          const Vec512<Index> index) {
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
+}
+
+template <>
+HWY_INLINE void ScatterOffset<float>(Vec512<float> v, Full512<float> /* tag */,
+                                     float* HWY_RESTRICT base,
+                                     const Vec512<int32_t> offset) {
+  _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
+}
+template <>
+HWY_INLINE void ScatterIndex<float>(Vec512<float> v, Full512<float> /* tag */,
+                                    float* HWY_RESTRICT base,
+                                    const Vec512<int32_t> index) {
+  _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
+}
+
+template <>
+HWY_INLINE void ScatterOffset<double>(Vec512<double> v,
+                                      Full512<double> /* tag */,
+                                      double* HWY_RESTRICT base,
+                                      const Vec512<int64_t> offset) {
+  _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
+}
+template <>
+HWY_INLINE void ScatterIndex<double>(Vec512<double> v,
+                                     Full512<double> /* tag */,
+                                     double* HWY_RESTRICT base,
+                                     const Vec512<int64_t> index) {
+  _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
+}
+
 // ------------------------------ Gather

 namespace detail {
@@ -1713,13 +1910,13 @@ HWY_API Vec512<T> GatherIndex(hwy::SizeT
 template <typename T, typename Offset>
 HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base,
                                const Vec512<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
+static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
   return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
 }
 template <typename T, typename Index>
 HWY_API Vec512<T> GatherIndex(Full512<T> d, const T* HWY_RESTRICT base,
                               const Vec512<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
+  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
   return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
 }

@@ -1749,6 +1946,8 @@ HWY_INLINE Vec512<double> GatherIndex<do
   return Vec512<double>{_mm512_i64gather_pd(index.raw, base, 8)};
 }

+HWY_DIAGNOSTICS(pop)
+
 // ================================================== SWIZZLE

 template <typename T>
@@ -2439,7 +2638,11 @@ HWY_API Vec256<int8_t> DemoteTo(Full256<

 HWY_API Vec256<float16_t> DemoteTo(Full256<float16_t> /* tag */,
                                    const Vec512<float> v) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
+  HWY_DIAGNOSTICS(pop)
 }

 HWY_API Vec256<float> DemoteTo(Full256<float> /* tag */,
@@ -2633,8 +2836,81 @@ HWY_API Vec512<double> Compress(Vec512<d
   return Vec512<double>{_mm512_maskz_compress_pd(mask.raw, v.raw)};
 }

+namespace detail {
+
+// Ignore IDE redefinition error for these two functions: if this header is
+// included, then the functions weren't actually defined in x86_256-inl.h.
+template <typename T>
+HWY_API Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
+                           const uint64_t mask_bits) {
+  using D = Full256<T>;
+  const Rebind<uint16_t, D> du;
+  const Rebind<int32_t, D> dw;       // 512-bit, not 256!
+  const auto vu16 = BitCast(du, v);  // (required for float16_t inputs)
+  const Mask512<int32_t> mask{static_cast<__mmask16>(mask_bits)};
+  return BitCast(D(), DemoteTo(du, Compress(PromoteTo(dw, vu16), mask)));
+}
+
+}  // namespace detail
+
+template <typename T>
+HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
+  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
+                          detail::BitsFromMask(mask));
+}
+
+// Expands to 32-bit, compresses, concatenate demoted halves.
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
+  using D = Full512<T>;
+  const Rebind<uint16_t, D> du;
+  const Repartition<int32_t, D> dw;
+  const auto vu16 = BitCast(du, v);  // (required for float16_t inputs)
+  const auto promoted0 = PromoteTo(dw, LowerHalf(vu16));
+  const auto promoted1 = PromoteTo(dw, UpperHalf(vu16));
+
+  const Mask512<int32_t> mask0{static_cast<__mmask16>(mask.raw & 0xFFFF)};
+  const Mask512<int32_t> mask1{static_cast<__mmask16>(mask.raw >> 16)};
+  const auto compressed0 = Compress(promoted0, mask0);
+  const auto compressed1 = Compress(promoted1, mask1);
+
+  const Half<decltype(du)> dh;
+  const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
+  const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));
+
+  // Concatenate into single vector by shifting upper with writemask.
+  const size_t num0 = CountTrue(mask0);
+  const __mmask32 m_upper = ~((1u << num0) - 1);
+  alignas(64) uint16_t iota[64] = {
+      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  const auto idx = LoadU(du, iota + 32 - num0);
+  return Vec512<T>{_mm512_mask_permutexvar_epi16(demoted0.raw, m_upper, idx.raw,
+                                                 demoted1.raw)};
+}
+
 // ------------------------------ CompressStore

+template <typename T>
+HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
+                             T* HWY_RESTRICT aligned) {
+  const uint64_t mask_bits = detail::BitsFromMask(mask);
+  Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
+  return PopCount(mask_bits);
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API size_t CompressStore(Vec512<T> v, const Mask512<T> mask, Full512<T> d,
+                             T* HWY_RESTRICT aligned) {
+  // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but
+  // using StoreU to concatenate the results would cause page faults if
+  // `aligned` is the last valid vector. Instead rely on in-register splicing.
+  Store(Compress(v, mask), d, aligned);
+  return CountTrue(mask);
+}
+
 HWY_API size_t CompressStore(Vec512<uint32_t> v, const Mask512<uint32_t> mask,
                              Full512<uint32_t> /* tag */,
                              uint32_t* HWY_RESTRICT aligned) {
@@ -2675,6 +2951,98 @@ HWY_API size_t CompressStore(Vec512<doub
   return CountTrue(mask);
 }

+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
+// TableLookupBytes)
+
+HWY_API void StoreInterleaved3(const Vec512<uint8_t> a, const Vec512<uint8_t> b,
+                               const Vec512<uint8_t> c, Full512<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const auto k5 = Set(d, 5);
+  const auto k6 = Set(d, 6);
+
+  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
+  // 0x80 so lanes to be filled from other vectors are 0 for blending.
+  alignas(16) static constexpr uint8_t tbl_r0[16] = {
+      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
+      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+  alignas(16) static constexpr uint8_t tbl_g0[16] = {
+      0x80, 0, 0x80, 0x80, 1, 0x80,  //
+      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+  const auto shuf_r0 = LoadDup128(d, tbl_r0);
+  const auto shuf_g0 = LoadDup128(d, tbl_g0);  // cannot reuse r0 due to 5
+  const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+  const auto r0 = TableLookupBytes(a, shuf_r0);  // 5..4..3..2..1..0
+  const auto g0 = TableLookupBytes(b, shuf_g0);  // ..4..3..2..1..0.
+  const auto b0 = TableLookupBytes(c, shuf_b0);  // .4..3..2..1..0..
+  const auto i = (r0 | g0 | b0).raw;  // low byte in each 128bit: 30 20 10 00
+
+  // Second vector: g10,r10, bgr[9:6], b5,g5
+  const auto shuf_r1 = shuf_b0 + k6;  // .A..9..8..7..6..
+  const auto shuf_g1 = shuf_r0 + k5;  // A..9..8..7..6..5
+  const auto shuf_b1 = shuf_g0 + k5;  // ..9..8..7..6..5.
+  const auto r1 = TableLookupBytes(a, shuf_r1);
+  const auto g1 = TableLookupBytes(b, shuf_g1);
+  const auto b1 = TableLookupBytes(c, shuf_b1);
+  const auto j = (r1 | g1 | b1).raw;  // low byte in each 128bit: 35 25 15 05
+
+  // Third vector: bgr[15:11], b10
+  const auto shuf_r2 = shuf_b1 + k6;  // ..F..E..D..C..B.
+  const auto shuf_g2 = shuf_r1 + k5;  // .F..E..D..C..B..
+  const auto shuf_b2 = shuf_g1 + k5;  // F..E..D..C..B..A
+  const auto r2 = TableLookupBytes(a, shuf_r2);
+  const auto g2 = TableLookupBytes(b, shuf_g2);
+  const auto b2 = TableLookupBytes(c, shuf_b2);
+  const auto k = (r2 | g2 | b2).raw;  // low byte in each 128bit: 3A 2A 1A 0A
+
+  // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns".
+  const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0));
+  const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1));
+  const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2));
+
+  // Alternating order, most-significant 128 bits from the second arg.
+  const __mmask8 m = 0xCC;
+  const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1);
+  const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2);
+  const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0);
+
+  StoreU(Vec512<uint8_t>{i1_k0_j0_i0}, d, unaligned + 0 * 64);  //  10 0A 05 00
+  StoreU(Vec512<uint8_t>{j2_i2_k1_j1}, d, unaligned + 1 * 64);  //  25 20 1A 15
+  StoreU(Vec512<uint8_t>{k3_j3_i3_k2}, d, unaligned + 2 * 64);  //  3A 35 30 2A
+}
+
+// ------------------------------ StoreInterleaved4
+
+HWY_API void StoreInterleaved4(const Vec512<uint8_t> v0,
+                               const Vec512<uint8_t> v1,
+                               const Vec512<uint8_t> v2,
+                               const Vec512<uint8_t> v3, Full512<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // let a,b,c,d denote v0..3.
+  const auto ba0 = ZipLower(v0, v1);  // b7 a7 .. b0 a0
+  const auto dc0 = ZipLower(v2, v3);  // d7 c7 .. d0 c0
+  const auto ba8 = ZipUpper(v0, v1);
+  const auto dc8 = ZipUpper(v2, v3);
+  const auto i = ZipLower(ba0, dc0).raw;  // 4x128bit: d..a3 d..a0
+  const auto j = ZipUpper(ba0, dc0).raw;  // 4x128bit: d..a7 d..a4
+  const auto k = ZipLower(ba8, dc8).raw;  // 4x128bit: d..aB d..a8
+  const auto l = ZipUpper(ba8, dc8).raw;  // 4x128bit: d..aF d..aC
+  // 128-bit blocks were independent until now; transpose 4x4.
+  const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0));
+  const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0));
+  const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2));
+  const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2));
+  constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0);
+  constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1);
+  const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
+  const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
+  const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);
+  const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31);
+  StoreU(Vec512<uint8_t>{l0_k0_j0_i0}, d, unaligned + 0 * 64);
+  StoreU(Vec512<uint8_t>{l1_k1_j1_i1}, d, unaligned + 1 * 64);
+  StoreU(Vec512<uint8_t>{l2_k2_j2_i2}, d, unaligned + 2 * 64);
+  StoreU(Vec512<uint8_t>{l3_k3_j3_i3}, d, unaligned + 3 * 64);
+}
+
 // ------------------------------ Reductions

 // Returns the sum in each lane.
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12	2021-06-02 10:56:05.281904625 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc	2021-05-31 10:37:11.000000000 -0400
@@ -28,12 +28,12 @@

 #if HWY_ARCH_X86
 #include <xmmintrin.h>
-#ifdef _MSC_VER
+#if HWY_COMPILER_MSVC
 #include <intrin.h>
-#else
+#else  // HWY_COMPILER_MSVC
 #include <cpuid.h>
-#endif
-#endif
+#endif  // HWY_COMPILER_MSVC
+#endif  // HWY_ARCH_X86

 namespace hwy {
 namespace {
@@ -48,13 +48,13 @@ bool IsBitSet(const uint32_t reg, const
 // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
 void Cpuid(const uint32_t level, const uint32_t count,
            uint32_t* HWY_RESTRICT abcd) {
-#ifdef _MSC_VER
+#if HWY_COMPILER_MSVC
   int regs[4];
   __cpuidex(regs, level, count);
   for (int i = 0; i < 4; ++i) {
     abcd[i] = regs[i];
   }
-#else
+#else  // HWY_COMPILER_MSVC
   uint32_t a;
   uint32_t b;
   uint32_t c;
@@ -64,22 +64,22 @@ void Cpuid(const uint32_t level, const u
   abcd[1] = b;
   abcd[2] = c;
   abcd[3] = d;
-#endif
+#endif  // HWY_COMPILER_MSVC
 }

 // Returns the lower 32 bits of extended control register 0.
 // Requires CPU support for "OSXSAVE" (see below).
 uint32_t ReadXCR0() {
-#ifdef _MSC_VER
+#if HWY_COMPILER_MSVC
   return static_cast<uint32_t>(_xgetbv(0));
-#else
+#else  // HWY_COMPILER_MSVC
   uint32_t xcr0, xcr0_high;
   const uint32_t index = 0;
   asm volatile(".byte 0x0F, 0x01, 0xD0"
                : "=a"(xcr0), "=d"(xcr0_high)
                : "c"(index));
   return xcr0;
-#endif
+#endif  // HWY_COMPILER_MSVC
 }

 #endif  // HWY_ARCH_X86
@@ -126,7 +126,7 @@ constexpr uint32_t kAVX512VL = 1u << 13;
 constexpr uint32_t kAVX512DQ = 1u << 14;
 constexpr uint32_t kAVX512BW = 1u << 15;
 constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW;
-#endif
+#endif  // HWY_ARCH_X86

 }  // namespace

diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12	2021-06-02 10:56:05.267904554 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h	2021-05-31 10:37:11.000000000 -0400
@@ -65,7 +65,9 @@
 // HWY_MAX_DYNAMIC_TARGETS in total.
 #define HWY_HIGHEST_TARGET_BIT_X86 9

-// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium
+#define HWY_SVE2 0x400
+#define HWY_SVE 0x800
+// 0x1000 reserved for Helium
 #define HWY_NEON 0x2000

 #define HWY_HIGHEST_TARGET_BIT_ARM 13
@@ -90,6 +92,9 @@
 // 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved

 #define HWY_SCALAR 0x20000000
+
+#define HWY_HIGHEST_TARGET_BIT_SCALAR 29
+
 // Cannot use higher values, otherwise HWY_TARGETS computation might overflow.

 //------------------------------------------------------------------------------
@@ -106,25 +111,26 @@
 #ifndef HWY_BROKEN_TARGETS

 // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
-// SSE4 codegen (msan failure), so disable all those targets.
+// SSE4 codegen (possibly only for msan), so disable all those targets.
 #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
-// TODO: Disable all non-scalar targets for every build target once we have
-// clang-7 enabled in our builders.
-#ifdef MEMORY_SANITIZER
 #define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
-#else
-#define HWY_BROKEN_TARGETS 0
-#endif
 // This entails a major speed reduction, so warn unless the user explicitly
 // opts in to scalar-only.
 #if !defined(HWY_COMPILE_ONLY_SCALAR)
 #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
 #endif

-// MSVC, or 32-bit may fail to compile AVX2/3.
-#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32
+// 32-bit may fail to compile AVX2/3.
+#elif HWY_ARCH_X86_32
 #define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3)
-#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds")
+
+// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
+#elif HWY_COMPILER_MSVC != 0
+#define HWY_BROKEN_TARGETS (HWY_AVX3)
+
+// armv7be has not been tested and is not yet supported.
+#elif HWY_ARCH_ARM_V7 && (defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN))
+#define HWY_BROKEN_TARGETS (HWY_NEON)

 #else
 #define HWY_BROKEN_TARGETS 0
@@ -145,53 +151,74 @@
 // user to override this without any guarantee of success.
 #ifndef HWY_BASELINE_TARGETS

-#ifdef __wasm_simd128__
+// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
+// HWY_TARGET == HWY_SCALAR.
+
+#if HWY_ARCH_WASM && defined(__wasm_simd128__)
 #define HWY_BASELINE_WASM HWY_WASM
 #else
 #define HWY_BASELINE_WASM 0
 #endif

-#ifdef __VSX__
+// Avoid choosing the PPC target until we have an implementation.
+#if HWY_ARCH_PPC && defined(__VSX__) && 0
 #define HWY_BASELINE_PPC8 HWY_PPC8
 #else
 #define HWY_BASELINE_PPC8 0
 #endif

-// GCC 4.5.4 only defines the former; 5.4 defines both.
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+// Avoid choosing the SVE[2] targets the implementation is ready.
+#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) && 0
+#define HWY_BASELINE_SVE2 HWY_SVE2
+#else
+#define HWY_BASELINE_SVE2 0
+#endif
+
+#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE) && 0
+#define HWY_BASELINE_SVE HWY_SVE
+#else
+#define HWY_BASELINE_SVE 0
+#endif
+
+// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
+#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
 #define HWY_BASELINE_NEON HWY_NEON
 #else
 #define HWY_BASELINE_NEON 0
 #endif

-#ifdef __SSE4_1__
+// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means
+// we at least get SSE4 on machines supporting AVX but not AVX2.
+// https://stackoverflow.com/questions/18563978/
+#if HWY_ARCH_X86 && \
+    (defined(__SSE4_1__) || (HWY_COMPILER_MSVC != 0 && defined(__AVX__)))
 #define HWY_BASELINE_SSE4 HWY_SSE4
 #else
 #define HWY_BASELINE_SSE4 0
 #endif

-#ifdef __AVX2__
+#if HWY_ARCH_X86 && defined(__AVX2__)
 #define HWY_BASELINE_AVX2 HWY_AVX2
 #else
 #define HWY_BASELINE_AVX2 0
 #endif

-#ifdef __AVX512F__
+#if HWY_ARCH_X86 && defined(__AVX512F__)
 #define HWY_BASELINE_AVX3 HWY_AVX3
 #else
 #define HWY_BASELINE_AVX3 0
 #endif

-#ifdef __riscv_vector
+#if HWY_ARCH_RVV && defined(__riscv_vector)
 #define HWY_BASELINE_RVV HWY_RVV
 #else
 #define HWY_BASELINE_RVV 0
 #endif

 #define HWY_BASELINE_TARGETS                                                \
-  (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \
-   HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 |              \
-   HWY_BASELINE_RVV)
+  (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
+   HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE4 |               \
+   HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | HWY_BASELINE_RVV)

 #endif  // HWY_BASELINE_TARGETS

@@ -242,13 +269,12 @@
 #define HWY_TARGETS HWY_STATIC_TARGET

 // 3) For tests: include all attainable targets (in particular: scalar)
-#elif defined(HWY_COMPILE_ALL_ATTAINABLE)
+#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
 #define HWY_TARGETS HWY_ATTAINABLE_TARGETS

 // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
 // excluding superseded targets, in particular scalar.
 #else
-
 #define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))

 #endif  // target policy
@@ -323,6 +349,10 @@ static inline HWY_MAYBE_UNUSED const cha
 #endif

 #if HWY_ARCH_ARM
+    case HWY_SVE2:
+      return "SVE2";
+    case HWY_SVE:
+      return "SVE";
     case HWY_NEON:
       return "Neon";
 #endif
@@ -346,7 +376,7 @@ static inline HWY_MAYBE_UNUSED const cha
       return "Scalar";

     default:
-      return "?";
+      return "Unknown";  // must satisfy gtest IsValidParamName()
   }
 }

@@ -405,21 +435,17 @@ static inline HWY_MAYBE_UNUSED const cha
       nullptr,                    /* SSE3 */     \
       nullptr                     /* SSE2 */

-#endif  // HWY_ARCH_X86
-
-#if HWY_ARCH_ARM
+#elif HWY_ARCH_ARM
 // See HWY_ARCH_X86 above for details.
 #define HWY_MAX_DYNAMIC_TARGETS 4
 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
 #define HWY_CHOOSE_TARGET_LIST(func_name)       \
-  nullptr,                       /* reserved */ \
-      nullptr,                   /* reserved */ \
+  HWY_CHOOSE_SVE2(func_name),    /* SVE2 */     \
+      HWY_CHOOSE_SVE(func_name), /* SVE */      \
       nullptr,                   /* reserved */ \
       HWY_CHOOSE_NEON(func_name) /* NEON */

-#endif  // HWY_ARCH_ARM
-
-#if HWY_ARCH_PPC
+#elif HWY_ARCH_PPC
 // See HWY_ARCH_X86 above for details.
 #define HWY_MAX_DYNAMIC_TARGETS 5
 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
@@ -430,9 +456,7 @@ static inline HWY_MAYBE_UNUSED const cha
       nullptr,                    /* VSX */      \
       nullptr                     /* AltiVec */

-#endif  // HWY_ARCH_PPC
-
-#if HWY_ARCH_WASM
+#elif HWY_ARCH_WASM
 // See HWY_ARCH_X86 above for details.
 #define HWY_MAX_DYNAMIC_TARGETS 4
 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
@@ -442,9 +466,7 @@ static inline HWY_MAYBE_UNUSED const cha
       nullptr,                   /* reserved */ \
       HWY_CHOOSE_WASM(func_name) /* WASM */

-#endif  // HWY_ARCH_WASM
-
-#if HWY_ARCH_RVV
+#elif HWY_ARCH_RVV
 // See HWY_ARCH_X86 above for details.
 #define HWY_MAX_DYNAMIC_TARGETS 4
 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
@@ -454,7 +476,12 @@ static inline HWY_MAYBE_UNUSED const cha
       nullptr,                   /* reserved */ \
       HWY_CHOOSE_RVV(func_name) /* RVV */

-#endif  // HWY_ARCH_RVV
+#else
+// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
+// still creating single-entry tables in HWY_EXPORT to ensure portability.
+#define HWY_MAX_DYNAMIC_TARGETS 1
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
+#endif

 struct ChosenTarget {
  public:
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12	2021-06-02 10:56:05.264904539 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -35,19 +35,19 @@ DECLARE_FUNCTION(SCALAR)
 HWY_EXPORT(FakeFunction);

 void CheckFakeFunction() {
-#define CHECK_ARRAY_ENTRY(TGT)                                          \
-  if ((HWY_TARGETS & HWY_##TGT) != 0) {                                 \
-    hwy::SetSupportedTargetsForTest(HWY_##TGT);                         \
-    /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \
-    /* the pointer to the already cached function. */                   \
-    hwy::chosen_target.Update();                                        \
-    EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42));       \
-    /* Calling DeInit() will test that the initializer function */      \
-    /* also calls the right function. */                                \
-    hwy::chosen_target.DeInit();                                        \
-    EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42));       \
-    /* Second call uses the cached value from the previous call. */     \
-    EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42));       \
+#define CHECK_ARRAY_ENTRY(TGT)                                              \
+  if ((HWY_TARGETS & HWY_##TGT) != 0) {                                     \
+    hwy::SetSupportedTargetsForTest(HWY_##TGT);                             \
+    /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */     \
+    /* the pointer to the already cached function. */                       \
+    hwy::chosen_target.Update();                                            \
+    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
+    /* Calling DeInit() will test that the initializer function */          \
+    /* also calls the right function. */                                    \
+    hwy::chosen_target.DeInit();                                            \
+    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
+    /* Second call uses the cached value from the previous call. */         \
+    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
   }
   CHECK_ARRAY_ENTRY(AVX3)
   CHECK_ARRAY_ENTRY(AVX2)
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12	2021-06-02 10:56:05.251904473 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -16,7 +16,6 @@
 #include <stdint.h>

 #include <algorithm>
-#include <cmath>
 #include <limits>

 #undef HWY_TARGET_INCLUDE
@@ -173,16 +172,8 @@ struct TestFloatAbs {
 };

 HWY_NOINLINE void TestAllAbs() {
-  const ForPartialVectors<TestAbs> test;
-  test(int8_t());
-  test(int16_t());
-  test(int32_t());
-
-  const ForPartialVectors<TestFloatAbs> test_float;
-  test_float(float());
-#if HWY_CAP_FLOAT64
-  test_float(double());
-#endif
+  ForSignedTypes(ForPartialVectors<TestAbs>());
+  ForFloatTypes(ForPartialVectors<TestFloatAbs>());
 }

 template <bool kSigned>
@@ -199,6 +190,45 @@ struct TestLeftShifts {
     const size_t N = Lanes(d);
     auto expected = AllocateAligned<T>(N);

+    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
+    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+    // 0
+    HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
+    HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
+
+    // 1
+    for (size_t i = 0; i < N; ++i) {
+      const T value = kSigned ? T(i) - T(N) : T(i);
+      expected[i] = T(TU(value) << 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
+
+    // max
+    for (size_t i = 0; i < N; ++i) {
+      const T value = kSigned ? T(i) - T(N) : T(i);
+      expected[i] = T(TU(value) << kMaxShift);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
+  }
+};
+
+template <bool kSigned>
+struct TestVariableLeftShifts {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    if (kSigned) {
+      // Also test positive values
+      TestVariableLeftShifts</*kSigned=*/false>()(t, d);
+    }
+
+    using TI = MakeSigned<T>;
+    using TU = MakeUnsigned<T>;
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
     const auto v0 = Zero(d);
     const auto v1 = Set(d, 1);
     const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
@@ -209,8 +239,6 @@ struct TestLeftShifts {
     const auto large_shifts = max_shift - small_shifts;

     // Same: 0
-    HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
-    HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
     HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));

     // Same: 1
@@ -218,8 +246,6 @@ struct TestLeftShifts {
       const T value = kSigned ? T(i) - T(N) : T(i);
       expected[i] = T(TU(value) << 1);
     }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));

     // Same: max
@@ -227,8 +253,6 @@ struct TestLeftShifts {
       const T value = kSigned ? T(i) - T(N) : T(i);
       expected[i] = T(TU(value) << kMaxShift);
     }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));

     // Variable: small
@@ -252,6 +276,37 @@ struct TestUnsignedRightShifts {
     const size_t N = Lanes(d);
     auto expected = AllocateAligned<T>(N);

+    const auto values = Iota(d, 0);
+
+    const T kMax = LimitsMax<T>();
+    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+    // Shift by 0
+    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
+    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
+
+    // Shift by 1
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(T(i & kMax) >> 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
+
+    // max
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(T(i & kMax) >> kMaxShift);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
+  }
+};
+
+struct TestVariableUnsignedRightShifts {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
     const auto v0 = Zero(d);
     const auto v1 = Set(d, 1);
     const auto values = Iota(d, 0);
@@ -265,21 +320,15 @@ struct TestUnsignedRightShifts {
     const auto large_shifts = max_shift - small_shifts;

     // Same: 0
-    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
-    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
     HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));

     // Same: 1
     for (size_t i = 0; i < N; ++i) {
-      expected[i] = T(i >> 1);
+      expected[i] = T(T(i & kMax) >> 1);
     }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));

     // Same: max
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
     HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));

     // Variable: small
@@ -296,33 +345,120 @@ struct TestUnsignedRightShifts {
   }
 };

-struct TestSignedRightShifts {
+template <int kAmount, typename T>
+T RightShiftNegative(T val) {
+  // C++ shifts are implementation-defined for negative numbers, and we have
+  // seen divisions replaced with shifts, so resort to bit operations.
+  using TU = hwy::MakeUnsigned<T>;
+  TU bits;
+  CopyBytes<sizeof(T)>(&val, &bits);
+
+  const TU shifted = bits >> kAmount;
+
+  const TU all = ~TU(0);
+  const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
+  const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
+
+  bits = shifted | sign_extended;
+  CopyBytes<sizeof(T)>(&bits, &val);
+  return val;
+}
+
+class TestSignedRightShifts {
+ public:
   template <typename T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    // Also test positive values
-    TestUnsignedRightShifts()(t, d);
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    constexpr T kMin = LimitsMin<T>();
+    constexpr T kMax = LimitsMax<T>();
+    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+    // First test positive values, negative are checked below.
+    const auto v0 = Zero(d);
+    const auto values = Iota(d, 0) & Set(d, kMax);
+
+    // Shift by 0
+    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
+    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
+
+    // Shift by 1
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(T(i & kMax) >> 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
+
+    // max
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
+
+    // Even negative value
+    Test<0>(kMin, d, __LINE__);
+    Test<1>(kMin, d, __LINE__);
+    Test<2>(kMin, d, __LINE__);
+    Test<kMaxShift>(kMin, d, __LINE__);
+
+    const T odd = static_cast<T>(kMin + 1);
+    Test<0>(odd, d, __LINE__);
+    Test<1>(odd, d, __LINE__);
+    Test<2>(odd, d, __LINE__);
+    Test<kMaxShift>(odd, d, __LINE__);
+  }
+
+ private:
+  template <int kAmount, typename T, class D>
+  void Test(T val, D d, int line) {
+    const auto expected = Set(d, RightShiftNegative<kAmount>(val));
+    const auto in = Set(d, val);
+    const char* file = __FILE__;
+    AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
+    AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
+  }
+};

+struct TestVariableSignedRightShifts {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
     using TU = MakeUnsigned<T>;
     const size_t N = Lanes(d);
     auto expected = AllocateAligned<T>(N);

     constexpr T kMin = LimitsMin<T>();
-    const auto values = Iota(d, kMin);
+    constexpr T kMax = LimitsMax<T>();

     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+
+    // First test positive values, negative are checked below.
+    const auto v0 = Zero(d);
+    const auto positive = Iota(d, 0) & Set(d, kMax);
+
+    // Shift by 0
+    HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
+    HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));
+
+    // Shift by 1
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(T(i & kMax) >> 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));
+
+    // max
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));
+
     const auto max_shift = Set(d, kMaxShift);
     const auto small_shifts = And(Iota(d, 0), max_shift);
     const auto large_shifts = max_shift - small_shifts;

-    // Test varying values to shift
+    const auto negative = Iota(d, kMin);
+
+    // Test varying negative to shift
     for (size_t i = 0; i < N; ++i) {
-      // We want a right-shift here, which is undefined behavior for negative
-      // numbers. Since we want (-1)>>1 to be -1, we need to adjust rounding if
-      // minT is odd and negative.
-      T minT = static_cast<T>(kMin + i);
-      expected[i] = T(minT / 2 + (minT < 0 ? minT % 2 : 0));
+      expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
     }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, Set(d, 1)));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));

     // Shift MSB right by small amounts
     for (size_t i = 0; i < N; ++i) {
@@ -343,6 +479,13 @@ struct TestSignedRightShifts {
 };

 HWY_NOINLINE void TestAllShifts() {
+  ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
+  ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
+  ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
+  ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
+}
+
+HWY_NOINLINE void TestAllVariableShifts() {
   const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
   const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
   const ForPartialVectors<TestUnsignedRightShifts> shr_u;
@@ -821,6 +964,40 @@ HWY_NOINLINE void TestAllRound() {
   ForFloatTypes(ForPartialVectors<TestRound>());
 }

+struct TestNearestInt {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF tf, const DF df) {
+    using TI = MakeSigned<TF>;
+    const RebindToSigned<DF> di;
+
+    size_t padded;
+    auto in = RoundTestCases(tf, df, padded);
+    auto expected = AllocateAligned<TI>(padded);
+
+    constexpr double max = static_cast<double>(LimitsMax<TI>());
+    for (size_t i = 0; i < padded; ++i) {
+      if (std::isnan(in[i])) {
+        // We replace NaN with 0 below (no_nan)
+        expected[i] = 0;
+      } else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) {
+        // Avoid undefined result for lrintf
+        expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
+      } else {
+        expected[i] = lrintf(in[i]);
+      }
+    }
+    for (size_t i = 0; i < padded; i += Lanes(df)) {
+      const auto v = Load(df, &in[i]);
+      const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df));
+      HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllNearestInt() {
+  ForPartialVectors<TestNearestInt>()(float());
+}
+
 struct TestTrunc {
   template <typename T, class D>
   HWY_NOINLINE void operator()(T t, D d) {
@@ -909,8 +1086,7 @@ struct TestSumOfLanes {
 };

 HWY_NOINLINE void TestAllSumOfLanes() {
-  // Only full vectors because lanes in partial vectors are undefined.
-  const ForFullVectors<TestSumOfLanes> sum;
+  const ForPartialVectors<TestSumOfLanes> sum;

   // No u8/u16/i8/i16.
   sum(uint32_t());
@@ -976,9 +1152,8 @@ struct TestMaxOfLanes {
 };

 HWY_NOINLINE void TestAllMinMaxOfLanes() {
-  // Only full vectors because lanes in partial vectors are undefined.
-  const ForFullVectors<TestMinOfLanes> min;
-  const ForFullVectors<TestMaxOfLanes> max;
+  const ForPartialVectors<TestMinOfLanes> min;
+  const ForPartialVectors<TestMaxOfLanes> max;

   // No u8/u16/i8/i16.
   min(uint32_t());
@@ -1044,10 +1219,12 @@ HWY_NOINLINE void TestAllNeg() {
 HWY_AFTER_NAMESPACE();

 #if HWY_ONCE
+namespace hwy {
 HWY_BEFORE_TEST(HwyArithmeticTest);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
@@ -1062,10 +1239,11 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest,
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRound);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNearestInt);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
-HWY_AFTER_TEST();
+}  // namespace hwy
 #endif
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12	2021-06-02 10:56:05.252904478 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -272,13 +272,14 @@ HWY_NOINLINE void TestAllCombineShiftRig
 HWY_AFTER_NAMESPACE();

 #if HWY_ONCE
+namespace hwy {
 HWY_BEFORE_TEST(HwyCombineTest);
 HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
 HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
 HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
 HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
 HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight);
-HWY_AFTER_TEST();
+}  // namespace hwy
 #endif

 #else
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12	2021-06-02 10:56:05.249904463 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -206,11 +206,12 @@ HWY_NOINLINE void TestAllWeakFloat() {
 HWY_AFTER_NAMESPACE();

 #if HWY_ONCE
+namespace hwy {
 HWY_BEFORE_TEST(HwyCompareTest);
 HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask);
 HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
 HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
 HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
 HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
-HWY_AFTER_TEST();
+}  // namespace hwy
 #endif
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12	2021-06-02 10:56:05.261904523 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -16,8 +16,6 @@
 #include <stdint.h>
 #include <string.h>

-#include <cmath>
-
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/convert_test.cc"
 #include "hwy/foreach_target.h"
@@ -547,37 +545,6 @@ HWY_NOINLINE void TestAllI32F64() {
 #endif
 }

-struct TestNearestInt {
-  template <typename TI, class DI>
-  HWY_NOINLINE void operator()(TI /*unused*/, const DI di) {
-    using TF = MakeFloat<TI>;
-    const Rebind<TF, DI> df;
-    const size_t N = Lanes(df);
-
-    // Integer positive
-    HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 4.0f)));
-
-    // Integer negative
-    HWY_ASSERT_VEC_EQ(di, Iota(di, -32), NearestInt(Iota(df, -32.0f)));
-
-    // Above positive
-    HWY_ASSERT_VEC_EQ(di, Iota(di, 2), NearestInt(Iota(df, 2.001f)));
-
-    // Below positive
-    HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 3.9999f)));
-
-    const TF eps = static_cast<TF>(0.0001);
-    // Above negative
-    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) + eps)));
-
-    // Below negative
-    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) - eps)));
-  }
-};
-
-HWY_NOINLINE void TestAllNearestInt() {
-  ForPartialVectors<TestNearestInt>()(int32_t());
-}

 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
@@ -585,6 +552,7 @@ HWY_NOINLINE void TestAllNearestInt() {
 HWY_AFTER_NAMESPACE();

 #if HWY_ONCE
+namespace hwy {
 HWY_BEFORE_TEST(HwyConvertTest);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
@@ -596,6 +564,5 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, Te
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllNearestInt);
-HWY_AFTER_TEST();
+}  // namespace hwy
 #endif
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12	2021-06-02 10:56:05.245904442 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -14,6 +14,7 @@

 #include <stddef.h>
 #include <stdint.h>
+#include <string.h>  // memcmp

 #include "hwy/base.h"

@@ -159,6 +160,30 @@ HWY_NOINLINE void TestAllCopySign() {
   ForFloatTypes(ForPartialVectors<TestCopySign>());
 }

+struct TestFirstN {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto mask_lanes = AllocateAligned<T>(N);
+
+    // NOTE: reverse polarity (mask is true iff mask_lanes[i] == 0) because we
+    // cannot reliably compare against all bits set (NaN for float types).
+    const T off = 1;
+
+    for (size_t len = 0; len <= N; ++len) {
+      for (size_t i = 0; i < N; ++i) {
+        mask_lanes[i] = i < len ? T(0) : off;
+      }
+      const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
+      HWY_ASSERT_MASK_EQ(d, mask, FirstN(d, len));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllFirstN() {
+  ForAllTypes(ForPartialVectors<TestFirstN>());
+}
+
 struct TestIfThenElse {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -208,15 +233,56 @@ HWY_NOINLINE void TestAllIfThenElse() {
   ForAllTypes(ForPartialVectors<TestIfThenElse>());
 }

-// Also tests MaskFromVec/VecFromMask
+struct TestMaskVec {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    const size_t N = Lanes(d);
+    auto mask_lanes = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
+      }
+
+      const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d)));
+      HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMaskVec() {
+  const ForPartialVectors<TestMaskVec> test;
+
+  test(uint16_t());
+  test(int16_t());
+  // TODO(janwas): float16_t - cannot compare yet
+
+  test(uint32_t());
+  test(int32_t());
+  test(float());
+
+#if HWY_CAP_INTEGER64
+  test(uint64_t());
+  test(int64_t());
+#endif
+#if HWY_CAP_FLOAT64
+  test(double());
+#endif
+}
+
 struct TestCompress {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
     RandomState rng;

+    using TU = MakeUnsigned<T>;
+    const Rebind<TU, D> du;
     const size_t N = Lanes(d);
     auto in_lanes = AllocateAligned<T>(N);
-    auto mask_lanes = AllocateAligned<T>(N);
+    auto mask_lanes = AllocateAligned<TU>(N);
     auto expected = AllocateAligned<T>(N);
     auto actual = AllocateAligned<T>(N);

@@ -224,35 +290,56 @@ struct TestCompress {
     for (size_t rep = 0; rep < 100; ++rep) {
       size_t expected_pos = 0;
       for (size_t i = 0; i < N; ++i) {
-        in_lanes[i] = static_cast<T>(Random32(&rng));
-        mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
+        const uint64_t bits = Random32(&rng);
+        in_lanes[i] = T();  // cannot initialize float16_t directly.
+        CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
+        mask_lanes[i] = static_cast<TU>(Random32(&rng) & 1);
         if (mask_lanes[i] == 0) {  // Zero means true (easier to compare)
           expected[expected_pos++] = in_lanes[i];
         }
       }

       const auto in = Load(d, in_lanes.get());
-      const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
+      const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du)));

-      HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
       Store(Compress(in, mask), d, actual.get());
       // Upper lanes are undefined.
       for (size_t i = 0; i < expected_pos; ++i) {
-        HWY_ASSERT(actual[i] == expected[i]);
+        HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
       }

       // Also check CompressStore in the same way.
-      std::fill(actual.get(), actual.get() + N, T(0));
+      memset(actual.get(), 0, N * sizeof(T));
       const size_t num_written = CompressStore(in, mask, d, actual.get());
       HWY_ASSERT_EQ(expected_pos, num_written);
       for (size_t i = 0; i < expected_pos; ++i) {
-        HWY_ASSERT_EQ(expected[i], actual[i]);
+        HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
       }
     }
   }
 };

 #if 0
+namespace detail {  // for code folding
+void PrintCompress16x8Tables() {
+  constexpr size_t N = 8;  // 128-bit SIMD
+  for (uint64_t code = 0; code < 1ull << N; ++code) {
+    std::array<uint8_t, N> indices{0};
+    size_t pos = 0;
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+
+    // Doubled (for converting lane to byte indices)
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,", 2 * indices[i]);
+    }
+  }
+  printf("\n");
+}
+
 // Compressed to nibbles
 void PrintCompress32x8Tables() {
   constexpr size_t N = 8;  // AVX2
@@ -340,16 +427,22 @@ void PrintCompress64x2Tables() {
   }
   printf("\n");
 }
-
+}  // namespace detail
 #endif

 HWY_NOINLINE void TestAllCompress() {
-  // PrintCompress32x8Tables();
-  // PrintCompress64x4Tables();
-  // PrintCompress32x4Tables();
-  // PrintCompress64x2Tables();
+  // detail::PrintCompress32x8Tables();
+  // detail::PrintCompress64x4Tables();
+  // detail::PrintCompress32x4Tables();
+  // detail::PrintCompress64x2Tables();
+  // detail::PrintCompress16x8Tables();

   const ForPartialVectors<TestCompress> test;
+
+  test(uint16_t());
+  test(int16_t());
+  test(float16_t());
+
   test(uint32_t());
   test(int32_t());
   test(float());
@@ -358,7 +451,6 @@ HWY_NOINLINE void TestAllCompress() {
   test(uint64_t());
   test(int64_t());
 #endif
-
 #if HWY_CAP_FLOAT64
   test(double());
 #endif
@@ -432,7 +524,7 @@ struct TestTestBit {
 };

 HWY_NOINLINE void TestAllTestBit() {
-  ForIntegerTypes(ForFullVectors<TestTestBit>());
+  ForIntegerTypes(ForPartialVectors<TestTestBit>());
 }

 struct TestAllTrueFalse {
@@ -445,6 +537,8 @@ struct TestAllTrueFalse {
     auto lanes = AllocateAligned<T>(N);
     std::fill(lanes.get(), lanes.get() + N, T(0));

+    auto mask_lanes = AllocateAligned<T>(N);
+
     HWY_ASSERT(AllTrue(Eq(v, zero)));
     HWY_ASSERT(!AllFalse(Eq(v, zero)));

@@ -456,7 +550,13 @@ struct TestAllTrueFalse {
     for (size_t i = 0; i < N; ++i) {
       lanes[i] = T(1);
       v = Load(d, lanes.get());
-      HWY_ASSERT(!AllTrue(Eq(v, zero)));
+
+      // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
+      // Assigning to an lvalue is insufficient but storing to memory prevents
+      // the bug; so does Print of VecFromMask(d, Eq(v, zero)).
+      Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
+      HWY_ASSERT(!AllTrue(MaskFromVec(Load(d, mask_lanes.get()))));
+
       HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));

       lanes[i] = T(-1);
@@ -596,7 +696,7 @@ struct TestLogicalMask {
 };

 HWY_NOINLINE void TestAllLogicalMask() {
-  ForAllTypes(ForFullVectors<TestLogicalMask>());
+  ForAllTypes(ForPartialVectors<TestLogicalMask>());
 }
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
@@ -604,11 +704,14 @@ HWY_NOINLINE void TestAllLogicalMask() {
 HWY_AFTER_NAMESPACE();

 #if HWY_ONCE
+namespace hwy {
 HWY_BEFORE_TEST(HwyLogicalTest);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllFirstN);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
@@ -617,5 +720,5 @@ HWY_EXPORT_AND_TEST_P(HwyLogicalTest, Te
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask);
-HWY_AFTER_TEST();
+}  // namespace hwy
 #endif
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12	2021-06-02 10:56:05.247904453 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -12,6 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+// Ensure incompabilities with Windows macros (e.g. #define StoreFence) are
+// detected. Must come before Highway headers.
+#if defined(_WIN32) || defined(_WIN64)
+#include <Windows.h>
+#endif
+
 #include <stddef.h>
 #include <stdint.h>

@@ -76,6 +82,119 @@ HWY_NOINLINE void TestAllLoadStore() {
   ForAllTypes(ForPartialVectors<TestLoadStore>());
 }

+struct TestStoreInterleaved3 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<uint8_t>(3 * N);
+    for (size_t i = 0; i < 3 * N; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+    const auto in2 = Load(d, &bytes[2 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(4 * N);
+    auto actual_aligned = AllocateAligned<T>(4 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[3 * i + 0] = bytes[0 * N + i];
+        expected[3 * i + 1] = bytes[1 * N + i];
+        expected[3 * i + 2] = bytes[2 * N + i];
+        // Ensure we do not write more than 3*N bytes
+        expected[3 * N + i] = actual[3 * N + i] = 0;
+      }
+      StoreInterleaved3(in0, in1, in2, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 4 * N, &pos)) {
+        Print(d, "in0", in0, pos / 3);
+        Print(d, "in1", in1, pos / 3);
+        Print(d, "in2", in2, pos / 3);
+        const size_t i = pos - pos % 3;
+        fprintf(stderr, "interleaved %d %d %d  %d %d %d\n", actual[i],
+                actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
+                actual[i + 5]);
+        HWY_ASSERT(false);
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllStoreInterleaved3() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestStoreInterleaved3, 4> test;
+#else
+  const ForPartialVectors<TestStoreInterleaved3> test;
+#endif
+  test(uint8_t());
+}
+
+struct TestStoreInterleaved4 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<uint8_t>(4 * N);
+    for (size_t i = 0; i < 4 * N; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+    const auto in2 = Load(d, &bytes[2 * N]);
+    const auto in3 = Load(d, &bytes[3 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(5 * N);
+    auto actual_aligned = AllocateAligned<T>(5 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[4 * i + 0] = bytes[0 * N + i];
+        expected[4 * i + 1] = bytes[1 * N + i];
+        expected[4 * i + 2] = bytes[2 * N + i];
+        expected[4 * i + 3] = bytes[3 * N + i];
+        // Ensure we do not write more than 4*N bytes
+        expected[4 * N + i] = actual[4 * N + i] = 0;
+      }
+      StoreInterleaved4(in0, in1, in2, in3, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 5 * N, &pos)) {
+        Print(d, "in0", in0, pos / 4);
+        Print(d, "in1", in1, pos / 4);
+        Print(d, "in2", in2, pos / 4);
+        Print(d, "in3", in3, pos / 4);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved %d %d %d %d  %d %d %d %d\n", actual[i],
+                actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
+                actual[i + 5], actual[i + 6], actual[i + 7]);
+        HWY_ASSERT(false);
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllStoreInterleaved4() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestStoreInterleaved4, 4> test;
+#else
+  const ForPartialVectors<TestStoreInterleaved4> test;
+#endif
+  test(uint8_t());
+}
+
 struct TestLoadDup128 {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -86,13 +205,14 @@ struct TestLoadDup128 {
     for (size_t i = 0; i < N128; ++i) {
       lanes[i] = static_cast<T>(1 + i);
     }
-    const auto v = LoadDup128(d, lanes);
+
     const size_t N = Lanes(d);
-    auto out = AllocateAligned<T>(N);
-    Store(v, d, out.get());
+    auto expected = AllocateAligned<T>(N);
     for (size_t i = 0; i < N; ++i) {
-      HWY_ASSERT_EQ(T(i % N128 + 1), out[i]);
+      expected[i] = static_cast<T>(i % N128 + 1);
     }
+
+    HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes));
 #else
     (void)d;
 #endif
@@ -136,6 +256,84 @@ HWY_NOINLINE void TestAllStream() {
   ForFloatTypes(test);
 }

+// Assumes little-endian byte order!
+struct TestScatter {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using Offset = MakeSigned<T>;
+
+    const size_t N = Lanes(d);
+    const size_t range = 4 * N;                  // number of items to scatter
+    const size_t max_bytes = range * sizeof(T);  // upper bound on offset
+
+    RandomState rng;
+
+    // Data to be scattered
+    auto bytes = AllocateAligned<uint8_t>(max_bytes);
+    for (size_t i = 0; i < max_bytes; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+    const auto data = Load(d, reinterpret_cast<const T*>(bytes.get()));
+
+    // Scatter into these regions, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(range);
+    auto actual = AllocateAligned<T>(range);
+
+    const Rebind<Offset, D> d_offsets;
+    auto offsets = AllocateAligned<Offset>(N);  // or indices
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      // Byte offsets
+      std::fill(expected.get(), expected.get() + range, T(0));
+      std::fill(actual.get(), actual.get() + range, T(0));
+      for (size_t i = 0; i < N; ++i) {
+        offsets[i] =
+            static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
+        CopyBytes<sizeof(T)>(
+            bytes.get() + i * sizeof(T),
+            reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
+      }
+      const auto voffsets = Load(d_offsets, offsets.get());
+      ScatterOffset(data, d, actual.get(), voffsets);
+      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+        Print(d, "Data", data);
+        Print(d_offsets, "Offsets", voffsets);
+        HWY_ASSERT(false);
+      }
+
+      // Indices
+      std::fill(expected.get(), expected.get() + range, T(0));
+      std::fill(actual.get(), actual.get() + range, T(0));
+      for (size_t i = 0; i < N; ++i) {
+        offsets[i] = static_cast<Offset>(Random32(&rng) % range);
+        CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
+                             &expected[offsets[i]]);
+      }
+      const auto vindices = Load(d_offsets, offsets.get());
+      ScatterIndex(data, d, actual.get(), vindices);
+      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+        Print(d, "Data", data);
+        Print(d_offsets, "Indices", vindices);
+        HWY_ASSERT(false);
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllScatter() {
+  // No u8,u16,i8,i16.
+  const ForPartialVectors<TestScatter> test;
+  test(uint32_t());
+  test(int32_t());
+
+#if HWY_CAP_INTEGER64
+  test(uint64_t());
+  test(int64_t());
+#endif
+
+  ForFloatTypes(test);
+}
+
 struct TestGather {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -183,21 +381,15 @@ struct TestGather {

 HWY_NOINLINE void TestAllGather() {
   // No u8,u16,i8,i16.
-  const ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(uint32_t)> test32;
-  test32(uint32_t());
-  test32(int32_t());
+  const ForPartialVectors<TestGather> test;
+  test(uint32_t());
+  test(int32_t());

 #if HWY_CAP_INTEGER64
-  const ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(uint64_t)> test64;
-  test64(uint64_t());
-  test64(int64_t());
-#endif
-
-  ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(float)>()(float());
-
-#if HWY_CAP_FLOAT64
-  ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(double)>()(double());
+  test(uint64_t());
+  test(int64_t());
 #endif
+  ForFloatTypes(test);
 }

 HWY_NOINLINE void TestAllCache() {
@@ -206,6 +398,7 @@ HWY_NOINLINE void TestAllCache() {
   int test = 0;
   Prefetch(&test);
   FlushCacheline(&test);
+  Pause();
 }

 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -214,11 +407,15 @@ HWY_NOINLINE void TestAllCache() {
 HWY_AFTER_NAMESPACE();

 #if HWY_ONCE
+namespace hwy {
 HWY_BEFORE_TEST(HwyMemoryTest);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
-HWY_AFTER_TEST();
+}  // namespace hwy
 #endif
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12	2021-06-02 10:56:05.259904513 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc	2021-05-31 10:37:11.000000000 -0400
@@ -223,6 +223,7 @@ struct TestTableLookupBytes {
 HWY_NOINLINE void TestAllTableLookupBytes() {
   ForIntegerTypes(ForPartialVectors<TestTableLookupBytes>());
 }
+
 struct TestTableLookupLanes {
 #if HWY_TARGET == HWY_RVV
   using Index = uint32_t;
@@ -242,12 +243,13 @@ struct TestTableLookupLanes {
     if (N <= 8) {  // Test all permutations
       for (size_t i0 = 0; i0 < N; ++i0) {
         idx[0] = static_cast<Index>(i0);
+
         for (size_t i1 = 0; i1 < N; ++i1) {
-          idx[1] = static_cast<Index>(i1);
+          if (N >= 2) idx[1] = static_cast<Index>(i1);
           for (size_t i2 = 0; i2 < N; ++i2) {
-            idx[2] = static_cast<Index>(i2);
+            if (N >= 4) idx[2] = static_cast<Index>(i2);
             for (size_t i3 = 0; i3 < N; ++i3) {
-              idx[3] = static_cast<Index>(i3);
+              if (N >= 4) idx[3] = static_cast<Index>(i3);

               for (size_t i = 0; i < N; ++i) {
                 expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
@@ -286,7 +288,7 @@ struct TestTableLookupLanes {
 };

 HWY_NOINLINE void TestAllTableLookupLanes() {
-  const ForFullVectors<TestTableLookupLanes> test;
+  const ForPartialVectors<TestTableLookupLanes> test;
   test(uint32_t());
   test(int32_t());
   test(float());
@@ -624,6 +626,7 @@ HWY_NOINLINE void TestAllOddEven() {
 HWY_AFTER_NAMESPACE();

 #if HWY_ONCE
+namespace hwy {
 HWY_BEFORE_TEST(HwySwizzleTest);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftBytes);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftLanes);
@@ -637,5 +640,5 @@ HWY_EXPORT_AND_TEST_P(HwySwizzleTest, Te
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatLowerUpper);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatUpperLower);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
-HWY_AFTER_TEST();
+}  // namespace hwy
 #endif
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12	2021-06-02 10:56:05.254904488 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h	2021-05-31 10:37:11.000000000 -0400
@@ -23,7 +23,6 @@
 #include <stdio.h>
 #include <string.h>

-#include <cmath>  // isfinite
 #include <cstddef>
 #include <string>
 #include <utility>  // std::forward
@@ -73,7 +72,8 @@ class TestWithParamTarget : public testi

 // Function to convert the test parameter of a TestWithParamTarget for
 // displaying it in the gtest test name.
-std::string TestParamTargetName(const testing::TestParamInfo<uint32_t>& info) {
+static inline std::string TestParamTargetName(
+    const testing::TestParamInfo<uint32_t>& info) {
   return TargetName(info.param);
 }

@@ -157,31 +157,10 @@ std::string TestParamTargetNameAndT(
   static_assert(true, "For requiring trailing semicolon")

 #define HWY_BEFORE_TEST(suite)                      \
-  namespace hwy {                                   \
   class suite : public hwy::TestWithParamTarget {}; \
   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite);       \
   static_assert(true, "For requiring trailing semicolon")

-#define HWY_AFTER_TEST() \
-  } /* namespace hwy */       \
-  static_assert(true, "For requiring trailing semicolon")
-
-// Calls test for each enabled and available target.
-template <class Func, typename... Args>
-HWY_NOINLINE void RunTest(const Func& func, Args&&... args) {
-  SetSupportedTargetsForTest(0);
-  auto targets = SupportedAndGeneratedTargets();
-
-  for (uint32_t target : targets) {
-    SetSupportedTargetsForTest(target);
-    fprintf(stderr, "Testing for target %s.\n",
-            TargetName(static_cast<int>(target)));
-    func(std::forward<Args>(args)...);
-  }
-  // Disable the mask after the test.
-  SetSupportedTargetsForTest(0);
-}
-
 // 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
 // which triggers a compiler bug.
 class RandomState {
@@ -223,9 +202,11 @@ static HWY_INLINE uint32_t Random32(Rand
 // built-in types.
 template <class T>
 inline void PreventElision(T&& output) {
-#ifndef _MSC_VER
+#if HWY_COMPILER_MSVC
+  (void)output;
+#else   // HWY_COMPILER_MSVC
   asm volatile("" : "+r"(output) : : "memory");
-#endif
+#endif  // HWY_COMPILER_MSVC
 }

 // Returns a name for the vector/part/scalar. The type prefix is u/i/f for
@@ -234,23 +215,34 @@ inline void PreventElision(T&& output) {
 // understanding which instantiation of a generic test failed.
 template <typename T>
 static inline std::string TypeName(T /*unused*/, size_t N) {
-  std::string prefix(IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u"));
-  prefix += std::to_string(sizeof(T) * 8);
-
-  // Scalars: omit the xN suffix.
-  if (N == 1) return prefix;
-
-  return prefix + 'x' + std::to_string(N);
+  const char prefix = IsFloat<T>() ? 'f' : (IsSigned<T>() ? 'i' : 'u');
+  char name[64];
+  // Omit the xN suffix for scalars.
+  if (N == 1) {
+    snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8);
+  } else {
+    snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N);
+  }
+  return name;
 }

 // String comparison

 template <typename T1, typename T2>
-inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size) {
+inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size,
+                       size_t* pos = nullptr) {
   const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
   const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
   for (size_t i = 0; i < size; ++i) {
-    if (bytes1[i] != bytes2[i]) return false;
+    if (bytes1[i] != bytes2[i]) {
+      fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i,
+              size, bytes1[i], bytes2[i], TypeName(T1(), 1).c_str(),
+              TypeName(T2(), 1).c_str());
+      if (pos != nullptr) {
+        *pos = i;
+      }
+      return false;
+    }
   }
   return true;
 }
@@ -287,11 +279,11 @@ HWY_NOINLINE void Print(const D d, const
   auto lanes = AllocateAligned<T>(N);
   Store(v, d, lanes.get());
   const size_t begin = static_cast<size_t>(std::max<intptr_t>(0, lane - 2));
-  const size_t end = std::min(begin + 5, N);
+  const size_t end = std::min(begin + 7, N);
   fprintf(stderr, "%s %s [%zu+ ->]:\n  ", TypeName(T(), N).c_str(), caption,
           begin);
   for (size_t i = begin; i < end; ++i) {
-    fprintf(stderr, "%s,", std::to_string(lanes[i]).c_str());
+    fprintf(stderr, "%g,", double(lanes[i]));
   }
   if (begin >= end) fprintf(stderr, "(out of bounds)");
   fprintf(stderr, "\n");
@@ -352,10 +344,12 @@ HWY_NOINLINE void AssertEqual(const T ex
                               const char* filename = "", const int line = -1,
                               const size_t lane = 0) {
   if (!IsEqual(expected, actual)) {
-    const std::string expected_str = std::to_string(expected);
-    const std::string actual_str = std::to_string(actual);
-    NotifyFailure(filename, line, type_name.c_str(), lane, expected_str.c_str(),
-                  actual_str.c_str());
+    char expected_str[100];
+    snprintf(expected_str, sizeof(expected_str), "%g", double(expected));
+    char actual_str[100];
+    snprintf(actual_str, sizeof(actual_str), "%g", double(actual));
+    NotifyFailure(filename, line, type_name.c_str(), lane, expected_str,
+                  actual_str);
   }
 }

@@ -382,9 +376,15 @@ HWY_NOINLINE void AssertVecEqual(D d, co
       fprintf(stderr, "\n\n");
       Print(d, "expect", expected, i);
       Print(d, "actual", actual, i);
+
+      char expected_str[100];
+      snprintf(expected_str, sizeof(expected_str), "%g",
+               double(expected_lanes[i]));
+      char actual_str[100];
+      snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i]));
+
       NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i,
-                    std::to_string(expected_lanes[i]).c_str(),
-                    std::to_string(actual_lanes[i]).c_str());
+                    expected_str, actual_str);
     }
   }
 }
@@ -458,11 +458,8 @@ struct ForeachSizeR<T, 0, kMinLanes, Tes

 // These adapters may be called directly, or via For*Types:

-// Calls Test for all powers of two in [kMinLanes, kMaxLanes / kDivLanes].
-// kMaxLanes is used for HWY_GATHER_LANES etc; use a large default because we
-// don't have access to T in the template argument list.
-template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1,
-          size_t kMaxLanes = 1ul << 30>
+// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes].
+template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1>
 struct ForPartialVectors {
   template <typename T>
   void operator()(T /*unused*/) const {
@@ -470,8 +467,8 @@ struct ForPartialVectors {
     // Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full.
     ForeachSizeR<T, 8 / kDivLanes, HWY_LANES(T), Test>::Do();
 #else
-    ForeachSizeR<T, HWY_MIN(kMaxLanes, HWY_LANES(T)) / kDivLanes / kMinLanes,
-                 kMinLanes, Test>::Do();
+    ForeachSizeR<T, HWY_LANES(T) / kDivLanes / kMinLanes, kMinLanes,
+                 Test>::Do();
 #endif
   }
 };
@@ -505,33 +502,19 @@ struct ForGE128Vectors {
   }
 };

-// Calls Test for all powers of two in [128 bits, max bits/2].
-template <class Test>
+// Calls Test for all vectors that can be expanded by kFactor.
+template <class Test, size_t kFactor = 2>
 struct ForExtendableVectors {
   template <typename T>
   void operator()(T /*unused*/) const {
 #if HWY_TARGET == HWY_RVV
-    ForeachSizeR<T, 4, HWY_LANES(T), Test>::Do();
+    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test>::Do();
 #else
-    ForeachSizeR<T, HWY_LANES(T) / 2 / (16 / sizeof(T)), (16 / sizeof(T)),
+    ForeachSizeR<T, HWY_LANES(T) / kFactor / (16 / sizeof(T)), (16 / sizeof(T)),
                  Test>::Do();
 #endif
   }
 };
-
-// Calls Test for full vectors only.
-template <class Test>
-struct ForFullVectors {
-  template <typename T>
-  void operator()(T t) const {
-#if HWY_TARGET == HWY_RVV
-    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
-    (void)t;
-#else
-    Test()(t, HWY_FULL(T)());
-#endif
-  }
-};

 // Type lists to shorten call sites:

diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE
diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSE
diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSEE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSEE
diff -up chromium-91.0.4472.77/third_party/highway/src/Makefile.12 chromium-91.0.4472.77/third_party/highway/src/Makefile
diff -up chromium-91.0.4472.77/third_party/highway/src/MakefileE.12 chromium-91.0.4472.77/third_party/highway/src/MakefileE
diff -up chromium-91.0.4472.77/third_party/highway/src/README.md.12 chromium-91.0.4472.77/third_party/highway/src/README.md
--- chromium-91.0.4472.77/third_party/highway/src/README.md.12	2021-06-02 10:56:05.295904696 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/README.md	2021-05-31 10:37:11.000000000 -0400
@@ -15,15 +15,19 @@ applying the same operation to 'lanes'.
 ## Current status

 Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD.
-A port to RVV is in progress.
+Ports to RVV and SVE/SVE2 are in progress.

 Version 0.11 is considered stable enough to use in other projects, and is
 expected to remain backwards compatible unless serious issues are discovered
 while implementing SVE/RVV targets. After these targets are added, Highway will
 reach version 1.0.

-Continuous integration tests use a recent version of Clang and older version of
-MSVC (VS2015). Also periodically tested on Clang 7-11 and GCC 8, 9 and 10.2.1.
+Continuous integration tests build with a recent version of Clang (running on
+x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).
+
+Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
+GCC cross-compile and QEMU. See the
+[testing process](g3doc/release_testing_process.md) for details.

 The `contrib` directory contains SIMD-related utilities: an image class with
 aligned rows, and a math library (16 functions already implemented, mostly
@@ -62,9 +66,11 @@ To test on all the attainable targets fo
 default configuration skips baseline targets (e.g. scalar) that are superseded
 by another baseline target.

+Bazel is also supported for building, but it is not as widely used/tested.
+
 ## Quick start

-You can use the `skeleton` examples inside examples/ as a starting point.
+You can use the `benchmark` inside examples/ as a starting point.

 A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
 and their parameters, and the [instruction_matrix][instmtx] indicates the
diff -up chromium-91.0.4472.77/third_party/highway/src/README.mdE.12 chromium-91.0.4472.77/third_party/highway/src/README.mdE
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.bat
--- chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12	2021-06-02 10:56:05.293904685 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/run_tests.bat	2021-05-31 10:37:11.000000000 -0400
@@ -2,9 +2,9 @@
 REM Switch directory of this batch file
 cd %~dp0

-if not exist build mkdir build
+if not exist build_win mkdir build_win

-cd build
+cd build_win
 cmake .. -G Ninja || goto error
 ninja || goto error
 ctest -j || goto error
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.batE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.batE
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.sh.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.sh
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.shE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.shE
diff -up chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time
diff -up chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1.12 chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1