From 0ae19f4f97707e0fc120d17cfc7c0d3979928b36 Mon Sep 17 00:00:00 2001 From: Tom spot Callaway Date: Wed, 2 Jun 2021 16:11:50 -0400 Subject: [PATCH] 91.0.4472.77 --- chromium-91-libyuv-aarch64.patch | 91 + chromium-91-pcscan-vector-types.patch | 47 + ....0.4472.77-aarch64-cxxflags-addition.patch | 14 + ...72.77-disable-fontconfig-cache-magic.patch | 13 + ...-91.0.4472.77-initial_prefs-etc-path.patch | 15 + ...-java-only-allowed-in-android-builds.patch | 13 + ...m-91.0.4472.77-update-highway-0.12.2.patch | 12203 ++++++++++++++++ chromium.spec | 57 +- clean_ffmpeg.sh | 2 +- sources | 2 +- 10 files changed, 12432 insertions(+), 25 deletions(-) create mode 100644 chromium-91-libyuv-aarch64.patch create mode 100644 chromium-91-pcscan-vector-types.patch create mode 100644 chromium-91.0.4472.77-aarch64-cxxflags-addition.patch create mode 100644 chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch create mode 100644 chromium-91.0.4472.77-initial_prefs-etc-path.patch create mode 100644 chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch create mode 100644 chromium-91.0.4472.77-update-highway-0.12.2.patch diff --git a/chromium-91-libyuv-aarch64.patch b/chromium-91-libyuv-aarch64.patch new file mode 100644 index 0000000..77b8f4c --- /dev/null +++ b/chromium-91-libyuv-aarch64.patch @@ -0,0 +1,91 @@ +diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc +index 350c964..2aab413 100644 +--- a/third_party/libyuv/source/row_neon64.cc ++++ b/third_party/libyuv/source/row_neon64.cc +@@ -1835,7 +1835,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 +- : "m"(kShuffleARGBToABGR) // %3 ++ : "Q"(kShuffleARGBToABGR) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + } + +@@ -1859,7 +1859,7 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 +- : "m"(kShuffleAR64ToARGB) // %3 ++ : "Q"(kShuffleAR64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + } + +@@ -1883,7 +1883,7 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 +- : "m"(kShuffleAB64ToARGB) // %3 ++ : "Q"(kShuffleAB64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + } + +diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc +index 8656fec..9f9636e 100644 +--- a/third_party/libyuv/source/scale_neon64.cc ++++ b/third_party/libyuv/source/scale_neon64.cc +@@ -601,8 +601,8 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) + "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) + +- "mov v0.8h, v4.8h \n" +- "mov v1.8h, v5.8h \n" ++ "mov v0.16b, v4.16b \n" ++ "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) +@@ -642,7 +642,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + +- "mov v2.8h, v0.8h \n" ++ "mov v2.16b, v0.16b \n" + "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd) + "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even) + +@@ -679,7 +679,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // 
prefetch 7 lines ahead + +- "mov v0.8h, v2.8h \n" ++ "mov v0.16b, v2.16b \n" + "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd) + "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even) + +@@ -687,12 +687,12 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + +- "mov v0.8h, v4.8h \n" ++ "mov v0.16b, v4.16b \n" + "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd) + "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even) + +- "mov v0.8h, v4.8h \n" +- "mov v1.8h, v5.8h \n" ++ "mov v0.16b, v4.16b \n" ++ "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd) +@@ -887,8 +887,8 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) + "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) + +- "mov v0.8h, v4.8h \n" +- "mov v1.8h, v5.8h \n" ++ "mov v0.16b, v4.16b \n" ++ "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) diff --git a/chromium-91-pcscan-vector-types.patch b/chromium-91-pcscan-vector-types.patch new file mode 100644 index 0000000..33fc89e --- /dev/null +++ b/chromium-91-pcscan-vector-types.patch @@ -0,0 +1,47 @@ +From 429e6f78a88473208e96689afa2f6e91f07a4f8c Mon Sep 17 00:00:00 2001 +From: Stephan Hartmann +Date: Sat, 10 Apr 2021 17:02:49 +0000 +Subject: [PATCH] GCC: fix vector types in pcscan + + * _mm_cmpeq_epi64 result is __m128i + * maybe_ptrs is __m128i already and doesn't require cast + +Bug: 819294 +Change-Id: I3f8c6cc327191827838e80aea1431ac09315fe88 +Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2817544 +Reviewed-by: Anton Bikineev +Commit-Queue: Stephan Hartmann +Cr-Commit-Position: refs/heads/master@{#871265} +--- + +diff --git a/base/allocator/partition_allocator/starscan/pcscan.cc b/base/allocator/partition_allocator/starscan/pcscan.cc +index c7854ff..d5c0aea 100644 +--- a/base/allocator/partition_allocator/starscan/pcscan.cc ++++ b/base/allocator/partition_allocator/starscan/pcscan.cc +@@ -1143,7 +1143,7 @@ + const __m128i maybe_ptrs = + _mm_loadu_si128(reinterpret_cast<__m128i*>(payload)); + const __m128i vand = _mm_and_si128(maybe_ptrs, cage_mask); +- const __m128d vcmp = _mm_cmpeq_epi64(vand, vbase); ++ const __m128i vcmp = _mm_cmpeq_epi64(vand, vbase); + const int mask = _mm_movemask_pd(_mm_castsi128_pd(vcmp)); + if (LIKELY(!mask)) + continue; +@@ -1153,15 +1153,14 @@ + if (mask & 0b01) { + quarantine_size += + pcscan_task_.TryMarkObjectInNormalBuckets( +- _mm_cvtsi128_si64(_mm_castpd_si128(maybe_ptrs))); ++ _mm_cvtsi128_si64(maybe_ptrs)); + } + if (mask & 0b10) { + // Extraction intrinsics for qwords are only supported in SSE4.1, so + // instead we reshuffle dwords with pshufd. The mask is used to move the + // 4th and 3rd dwords into the second and first position. 
+ static constexpr int kSecondWordMask = (3 << 2) | (2 << 0); +- const __m128i shuffled = +- _mm_shuffle_epi32(_mm_castpd_si128(maybe_ptrs), kSecondWordMask); ++ const __m128i shuffled = _mm_shuffle_epi32(maybe_ptrs, kSecondWordMask); + quarantine_size += + pcscan_task_.TryMarkObjectInNormalBuckets( + _mm_cvtsi128_si64(shuffled)); diff --git a/chromium-91.0.4472.77-aarch64-cxxflags-addition.patch b/chromium-91.0.4472.77-aarch64-cxxflags-addition.patch new file mode 100644 index 0000000..249adf4 --- /dev/null +++ b/chromium-91.0.4472.77-aarch64-cxxflags-addition.patch @@ -0,0 +1,14 @@ +diff -up chromium-91.0.4472.77/build/config/compiler/BUILD.gn.aarch-cxxflags chromium-91.0.4472.77/build/config/compiler/BUILD.gn +--- chromium-91.0.4472.77/build/config/compiler/BUILD.gn.aarch-cxxflags 2021-06-02 12:58:21.998750145 -0400 ++++ chromium-91.0.4472.77/build/config/compiler/BUILD.gn 2021-06-02 12:59:29.762092189 -0400 +@@ -1511,6 +1511,10 @@ config("default_warnings") { + cflags += [ "-Wno-psabi" ] + } + ++ if (current_cpu == "arm" && !is_clang) { ++ cflags_cc += [ "-flax-vector-conversions" ] ++ } ++ + if (!is_clang) { + cflags_cc += [ + # See comment for -Wno-c++11-narrowing. diff --git a/chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch b/chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch new file mode 100644 index 0000000..f3f362c --- /dev/null +++ b/chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch @@ -0,0 +1,13 @@ +diff -up chromium-91.0.4472.77/base/test/BUILD.gn.nofontconfigcache chromium-91.0.4472.77/base/test/BUILD.gn +--- chromium-91.0.4472.77/base/test/BUILD.gn.nofontconfigcache 2021-06-01 16:41:40.094756454 -0400 ++++ chromium-91.0.4472.77/base/test/BUILD.gn 2021-06-01 16:42:47.736100516 -0400 +@@ -198,9 +198,6 @@ static_library("test_support") { + sources += [ "test_file_util_linux.cc" ] + public_deps += [ ":fontconfig_util_linux" ] + data_deps += [ "//third_party/test_fonts" ] +- if (current_toolchain == host_toolchain) { +- data_deps += [ ":do_generate_fontconfig_caches" ] +- } + } + + if (is_mac) { diff --git a/chromium-91.0.4472.77-initial_prefs-etc-path.patch b/chromium-91.0.4472.77-initial_prefs-etc-path.patch new file mode 100644 index 0000000..96825e2 --- /dev/null +++ b/chromium-91.0.4472.77-initial_prefs-etc-path.patch @@ -0,0 +1,15 @@ +diff -up chromium-91.0.4472.77/chrome/browser/first_run/first_run_internal_linux.cc.etc chromium-91.0.4472.77/chrome/browser/first_run/first_run_internal_linux.cc +--- chromium-91.0.4472.77/chrome/browser/first_run/first_run_internal_linux.cc.etc 2021-06-01 16:37:39.182531036 -0400 ++++ chromium-91.0.4472.77/chrome/browser/first_run/first_run_internal_linux.cc 2021-06-01 16:39:31.590102809 -0400 +@@ -20,9 +20,9 @@ bool IsOrganicFirstRun() { + + base::FilePath InitialPrefsPath() { + // The standard location of the initial prefs is next to the chrome binary. 
++ // ...but we patch it to use /etc/chromium + base::FilePath initial_prefs; +- if (!base::PathService::Get(base::DIR_EXE, &initial_prefs)) +- return base::FilePath(); ++ initial_prefs = base::FilePath("/etc/chromium"); + + base::FilePath new_path = initial_prefs.AppendASCII(installer::kInitialPrefs); + if (base::PathIsReadable(new_path)) diff --git a/chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch b/chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch new file mode 100644 index 0000000..a4748d9 --- /dev/null +++ b/chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch @@ -0,0 +1,13 @@ +diff -up chromium-91.0.4472.77/third_party/closure_compiler/compiler.py.java-allowed chromium-91.0.4472.77/third_party/closure_compiler/compiler.py +--- chromium-91.0.4472.77/third_party/closure_compiler/compiler.py.java-allowed 2021-06-02 17:14:48.445064647 +0000 ++++ chromium-91.0.4472.77/third_party/closure_compiler/compiler.py 2021-06-02 17:15:12.994836949 +0000 +@@ -13,8 +13,7 @@ import subprocess + + + _CURRENT_DIR = os.path.join(os.path.dirname(__file__)) +-_JAVA_PATH = os.path.join(_CURRENT_DIR, "..", "jdk", "current", "bin", "java") +-assert os.path.isfile(_JAVA_PATH), "java only allowed in android builds" ++_JAVA_PATH = "java" + + class Compiler(object): + """Runs the Closure compiler on given source files to typecheck them diff --git a/chromium-91.0.4472.77-update-highway-0.12.2.patch b/chromium-91.0.4472.77-update-highway-0.12.2.patch new file mode 100644 index 0000000..7b53aac --- /dev/null +++ b/chromium-91.0.4472.77-update-highway-0.12.2.patch @@ -0,0 +1,12203 @@ +diff -up chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time +diff -up chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10.12 chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10 +diff -up chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6.12 chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6 +diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt +--- chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 2021-06-02 10:56:05.305904746 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt 2021-05-31 10:37:11.000000000 -0400 +@@ -19,7 +19,7 @@ if(POLICY CMP0083) + cmake_policy(SET CMP0083 NEW) + endif() + +-project(hwy VERSION 0.1) ++project(hwy VERSION 0.12.2) # Keep in sync with highway.h version + + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_EXTENSIONS OFF) +@@ -40,6 +40,8 @@ if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) + endif() + ++set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?") ++ + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + "int main() { +@@ -51,10 +53,13 @@ check_cxx_source_compiles( + HWY_EMSCRIPTEN + ) + ++set(HWY_CONTRIB_SOURCES ++ hwy/contrib/image/image.cc ++ hwy/contrib/image/image.h ++ hwy/contrib/math/math-inl.h ++) ++ + set(HWY_SOURCES +- contrib/image/image.cc +- contrib/image/image.h +- contrib/math/math-inl.h + hwy/aligned_allocator.cc + hwy/aligned_allocator.h + hwy/base.h +@@ -64,6 +69,7 @@ set(HWY_SOURCES + hwy/nanobenchmark.cc + hwy/nanobenchmark.h + hwy/ops/arm_neon-inl.h ++ 
hwy/ops/arm_sve-inl.h + hwy/ops/scalar-inl.h + hwy/ops/set_macros-inl.h + hwy/ops/shared-inl.h +@@ -146,13 +152,28 @@ else() + -fno-exceptions + ) + endif() +-endif() ++ ++ if (HWY_CMAKE_ARM7) ++ list(APPEND HWY_FLAGS ++ -march=armv7-a ++ -mfpu=neon-vfpv4 ++ -mfloat-abi=hard # must match the toolchain specified as CXX= ++ -mfp16-format=ieee # required for vcvt_f32_f16 ++ ) ++ endif() # HWY_CMAKE_ARM7 ++ ++endif() # !MSVC + + add_library(hwy STATIC ${HWY_SOURCES}) + target_compile_options(hwy PRIVATE ${HWY_FLAGS}) + set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON) + target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR}) + ++add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES}) ++target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS}) ++set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON) ++target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR}) ++ + # -------------------------------------------------------- install library + install(TARGETS hwy + DESTINATION "${CMAKE_INSTALL_LIBDIR}") +@@ -166,9 +187,21 @@ foreach (source ${HWY_SOURCES}) + endif() + endforeach() + +-# Add a pkg-config file for libhwy and the test library. ++install(TARGETS hwy_contrib ++ DESTINATION "${CMAKE_INSTALL_LIBDIR}") ++# Install all the headers keeping the relative path to the current directory ++# when installing them. ++foreach (source ${HWY_CONTRIB_SOURCES}) ++ if ("${source}" MATCHES "\.h$") ++ get_filename_component(dirname "${source}" DIRECTORY) ++ install(FILES "${source}" ++ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}") ++ endif() ++endforeach() ++ ++# Add a pkg-config file for libhwy and the contrib/test libraries. + set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}") +-foreach (pc libhwy.pc libhwy-test.pc) ++foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") +@@ -193,34 +226,13 @@ add_custom_command(TARGET hwy POST_BUILD + # Avoids mismatch between GTest's static CRT and our dynamic. + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +-add_executable(skeleton hwy/examples/skeleton_main.cc) +-target_sources(skeleton PRIVATE +- hwy/examples/skeleton-inl.h +- hwy/examples/skeleton.cc +- hwy/examples/skeleton.h +- hwy/examples/skeleton_shared.h) +-# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to +-# observe the difference in targets printed. +-target_compile_options(skeleton PRIVATE ${HWY_FLAGS}) +-target_link_libraries(skeleton hwy) +-set_target_properties(skeleton +- PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/") +- +-# Similar: shared headers but without the runtime dispatch in skeleton.cc/h +-add_executable(skeleton_static hwy/examples/skeleton_static_main.cc) +-target_sources(skeleton_static PRIVATE +- hwy/examples/skeleton-inl.h +- hwy/examples/skeleton_shared.h) +-target_compile_options(skeleton_static PRIVATE ${HWY_FLAGS}) +-target_link_libraries(skeleton_static hwy) +-set_target_properties(skeleton_static +- PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/") +- + # Programming exercise with integrated benchmark + add_executable(hwy_benchmark hwy/examples/benchmark.cc) + target_sources(hwy_benchmark PRIVATE + hwy/nanobenchmark.cc + hwy/nanobenchmark.h) ++# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to ++# observe the difference in targets printed. 
+ target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS}) + target_link_libraries(hwy_benchmark hwy) + set_target_properties(hwy_benchmark +@@ -272,19 +284,21 @@ endif() + endif() # HWY_SYSTEM_GTEST + + set(HWY_TEST_FILES +- contrib/image/image_test.cc +- # contrib/math/math_test.cc ++ hwy/contrib/image/image_test.cc ++ # hwy/contrib/math/math_test.cc ++ hwy/aligned_allocator_test.cc ++ hwy/base_test.cc ++ hwy/highway_test.cc ++ hwy/targets_test.cc + hwy/examples/skeleton_test.cc + hwy/tests/arithmetic_test.cc + hwy/tests/combine_test.cc + hwy/tests/compare_test.cc + hwy/tests/convert_test.cc +- hwy/tests/hwy_test.cc + hwy/tests/logical_test.cc + hwy/tests/memory_test.cc + hwy/tests/swizzle_test.cc +- hwy/aligned_allocator_test.cc +- hwy/targets_test.cc ++ hwy/tests/test_util_test.cc + ) + + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests) +@@ -293,11 +307,16 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILE + get_filename_component(TESTNAME ${TESTFILE} NAME_WE) + add_executable(${TESTNAME} ${TESTFILE}) + target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS}) ++ # Test all targets, not just the best/baseline. This changes the default ++ # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can ++ # cause compile errors because only one may be set, and other CMakeLists.txt ++ # that include us may set them. ++ target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1) + + if(HWY_SYSTEM_GTEST) +- target_link_libraries(${TESTNAME} hwy GTest::GTest GTest::Main) ++ target_link_libraries(${TESTNAME} hwy hwy_contrib GTest::GTest GTest::Main) + else() +- target_link_libraries(${TESTNAME} hwy gtest gtest_main) ++ target_link_libraries(${TESTNAME} hwy hwy_contrib gtest gtest_main) + endif() + # Output test targets in the test directory. 
+ set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/") +diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE +diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in +diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING +diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelog +--- chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 2021-06-02 10:56:05.151903967 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/debian/changelog 2021-05-31 10:37:11.000000000 -0400 +@@ -1,3 +1,26 @@ ++highway (0.12.2-1) UNRELEASED; urgency=medium ++ ++ * fix scalar-only test and Windows macro conflict with Load/StoreFence ++ * replace deprecated wasm intrinsics ++ ++ -- Jan Wassenberg Mon, 31 May 2021 16:00:00 +0200 ++ ++highway (0.12.1-1) UNRELEASED; urgency=medium ++ ++ * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors ++ * fix warnings, faster ARM div/sqrt, separate hwy_contrib library ++ * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC ++ ++ -- Jan Wassenberg Wed, 19 May 2021 15:00:00 +0200 ++ ++highway (0.12.0-1) UNRELEASED; urgency=medium ++ ++ * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4 ++ * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES ++ * Proper IEEE rounding, reduce libstdc++ usage, inlined math ++ ++ -- Jan Wassenberg Thu, 15 Apr 2021 20:00:00 +0200 ++ + highway (0.11.1-1) UNRELEASED; 
urgency=medium + + * Fix clang7 asan error, finish f16 conversions and add test +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelogE.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelogE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compat.12 chromium-91.0.4472.77/third_party/highway/src/debian/compat +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/compatE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/control.12 chromium-91.0.4472.77/third_party/highway/src/debian/control +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/controlE.12 chromium-91.0.4472.77/third_party/highway/src/debian/controlE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyright.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyright +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rules.12 chromium-91.0.4472.77/third_party/highway/src/debian/rules +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rulesE.12 chromium-91.0.4472.77/third_party/highway/src/debian/rulesE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/format.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/format +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf +Binary files chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 and chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf differ +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md +--- chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12 2021-06-02 10:56:05.117903795 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md 2021-05-31 10:37:11.000000000 -0400 +@@ -33,6 +33,12 @@ The public headers are: + * hwy/cache_control.h: defines stand-alone functions to control caching (e.g. + prefetching) and memory barriers, independent of actual SIMD. + ++* hwy/nanobenchmark.h: library for precisely measuring elapsed time (under ++ varying inputs) for benchmarking small/medium regions of code. ++ ++* hwy/tests/test_util-inl.h: defines macros for invoking tests on all ++ available targets, plus per-target functions useful in tests (e.g. Print). 
++ + SIMD implementations must be preceded and followed by the following: + + ``` +@@ -61,76 +67,76 @@ HWY_AFTER_NAMESPACE(); + + ## Vector and descriptor types + +-Highway vectors consist of one or more 'lanes' of the same built-in type `T = +-uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `T = float##_t` for `## = 16, +-32, 64`. `float16_t` is an IEEE binary16 half-float and only supports load, +-store, and conversion to/from `float32_t`; infinity or NaN have +-implementation-defined results. +- +-Each vector has `N` lanes (a power of two, possibly unknown at compile time). +- +-Platforms such as x86 support multiple vector types, and other platforms require +-that vectors are built-in types. On RVV, vectors are sizeless and thus cannot be +-wrapped inside a class. The Highway API satisfies these constraints because it +-is designed around overloaded functions selected via a zero-sized tag parameter +-`d` of type `D = Simd`. These are typically constructed using aliases: +- +-* `const HWY_FULL(T[, LMUL=1]) d;` chooses an `N` that results in a native +- vector for the current target. For targets (e.g. RVV) that support register +- groups, the optional `LMUL` (1, 2, 4, 8) specifies the number of registers +- in the group. This effectively multiplies the lane count in each operation +- by `LMUL`. For mixed-precision code, `LMUL` must be at least the ratio of +- the sizes of the largest and smallest type. `LMUL > 1` is more efficient on +- single-issue machines, but larger values reduce the effective number of +- registers, which may cause the compiler to spill them to memory. ++Highway vectors consist of one or more 'lanes' of the same built-in type ++`uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `float##_t` for `## = 16, 32, ++64`. ++ ++In Highway, `float16_t` (an IEEE binary16 half-float) only supports load, store, ++and conversion to/from `float32_t`; the behavior of `float16_t` infinity and NaN ++are implementation-defined due to ARMv7. ++ ++On RVV, vectors are sizeless and cannot be wrapped inside a class. The Highway ++API allows using built-in types as vectors because operations are expressed as ++overloaded functions. Instead of constructors, overloaded initialization ++functions such as `Set` take a zero-sized tag argument called `d` of type `D = ++Simd` and return an actual vector of unspecified type. ++ ++`T` is one of the lane types above, and may be retrieved via `TFromD`. ++ ++`N` is target-dependent and not directly user-specified. The actual lane count ++may not be known at compile time, but can be obtained via `Lanes(d)`. Use this ++value, which is potentially different from `N`, to increment loop counters etc. ++It is typically a power of two, but that is not guaranteed e.g. on SVE. ++ ++`d` lvalues (a tag, NOT actual vector) are typically obtained using two aliases: ++ ++* Most common: pass `HWY_FULL(T[, LMUL=1]) d;` as an argument to return a ++ native vector. This is preferred because it fully utilizes vector lanes. ++ ++ For targets (e.g. RVV) that support register groups, the optional `LMUL` (1, ++ 2, 4, 8) specifies the number of registers in the group. This effectively ++ multiplies the lane count in each operation by `LMUL`. For mixed-precision ++ code, `LMUL` must be at least the ratio of the sizes of the largest and ++ smallest type. `LMUL > 1` is more efficient on single-issue machines, but ++ larger values reduce the effective number of registers, which may cause the ++ compiler to spill them to memory. 
++ ++* Less common: pass `HWY_CAPPED(T, N) d;` as an argument to return a vector ++ which may be native width, but no more than `N` lanes have observable ++ effects such as loading/storing to memory. This is less performance-portable ++ because it may not use all available lanes. Note that the resulting lane ++ count may also be less than `N`. ++ ++ For targets (e.g. RVV) that have compile-time-unknown lane counts, such ++ vectors incur additional runtime cost in `Load` etc. ++ ++User-specified lane counts or tuples of vectors could cause spills on targets ++with fewer or smaller vectors. By contrast, Highway encourages vector-length ++agnostic code, which is more performance-portable. ++ ++Given that lane counts are potentially compile-time-unknown, storage for vectors ++should be dynamically allocated, e.g. via `AllocateAligned(Lanes(d))`. For ++applications that require a compile-time estimate, `MaxLanes(d)` returns the `N` ++from `Simd`, which is NOT necessarily the actual lane count. This is ++DISCOURAGED because it is not guaranteed to be an upper bound (RVV vectors may ++be very large) and some compilers are not able to interpret it as constexpr. + +-* `const HWY_CAPPED(T, N) d;` for up to `N` lanes. +- +-For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), descriptors +-for the smaller types must be obtained from those of the larger type (e.g. via ++For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), tags for ++the smaller types must be obtained from those of the larger type (e.g. via + `Rebind`). + +-The type `T` may be accessed as `TFromD`. There are three possibilities for +-the template parameter `N`: +- +-1. Equal to the hardware vector width, e.g. when using `HWY_FULL(T)` on a +- target with compile-time constant vectors. ++## Using unspecified vector types + +-1. Less than the hardware vector width. This is the result of a compile-time +- decision by the user, i.e. using `HWY_CAPPED(T, N)` to limit the number of +- lanes, even when the hardware vector width could be greater. +- +-1. Unrelated to the hardware vector width, e.g. when the hardware vector width +- is not known at compile-time and may be very large. +- +-In all cases, `Lanes(d)` returns the actual number of lanes, i.e. the amount by +-which to advance loop counters. `MaxLanes(d)` returns the `N` from `Simd`, +-which is NOT necessarily the actual vector size (see above) and some compilers +-are not able to interpret it as constexpr. Instead of `MaxLanes`, prefer to use +-alternatives, e.g. `Rebind` or `aligned_allocator.h` for dynamic allocation of +-`Lanes(d)` elements. +- +-Highway is designed to map a vector variable to a (possibly partial) hardware +-register or register group. By discouraging user-specified `N` and tuples of +-vector variables, we improve performance portability (e.g. by reducing spills to +-memory for platforms that have smaller vectors than the developer expected). +- +-To construct vectors, call factory functions (see "Initialization" below) with +-a tag parameter `d`. +- +-Local variables typically use auto for type deduction. For some generic +-functions, a template argument `V` is sufficient: `template V Squared(V +-v) { return v * v; }`. In general, functions have a `D` template argument and +-can return vectors of type `Vec`. +- +-Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined +-functions reside in `project::[nested]::HWY_NAMESPACE`. 
Because all Highway +-functions generally take either a `Simd` or vector argument, which are also +-defined in namespace `hwy`, they will typically be found via Argument-Dependent +-Lookup and namespace qualifiers are not necessary. As an exception, Highway +-functions that are templates (e.g. because they require a compile-time argument +-such as a lane index or shift count) require a using-declaration such as +-`using hwy::HWY_NAMESPACE::ShiftLeft`. ++Because vector types are unspecified, local vector variables are typically ++defined using `auto` for type deduction. A template argument `V` suffices for ++simple generic functions: `template V Squared(V v) { return v * v; }`. ++ ++Many functions will need a `D` template argument in order to initialize any ++constants. They can use a separate `V` template argument for vectors, or use ++`Vec`, or where an lvalue `d` is available, `decltype(Zero(d))`. Using such ++aliases instead of auto may improve readability of mixed-type code. They can ++also be used for member variables, which are discouraged because compilers often ++have difficulty mapping them to registers. + + ## Operations + +@@ -141,6 +147,14 @@ unsigned, signed, and floating-point typ + bits per lane: 8, 16, 32, or 64. Any combination of the specified prefixes and + bits are allowed. Abbreviations of the form `u32 = {u}{32}` may also be used. + ++Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined ++functions reside in `project::[nested]::HWY_NAMESPACE`. Highway functions ++generally take either a `Simd` or vector/mask argument. For targets where ++vectors and masks are defined in namespace `hwy`, the functions will be found ++via Argument-Dependent Lookup. However, this does not work for function ++templates, and RVV and SVE both use builtin vectors. Thus we recommend a `using ++hwy::HWY_NAMESPACE;` directive inside `project::[nested]::HWY_NAMESPACE`. ++ + ### Initialization + + * V **Zero**(D): returns N-lane vector with all bits set to 0. +@@ -162,7 +176,7 @@ bits are allowed. Abbreviations of the f + * `V`: `{i,f}` \ + V **Neg**(V a): returns `-a[i]`. + +-* `V`: `{i}{8,16,32}, {f}` \ ++* `V`: `{i,f}` \ + V **Abs**(V a) returns the absolute value of `a[i]`; for + integers, `LimitsMin()` maps to `LimitsMax() + 1`. + +@@ -252,23 +266,24 @@ Left-shifting signed `T` and right-shift + shifting `MakeUnsigned` and casting to `T`. Right-shifting negative signed + `T` is the same as an unsigned shift, except that 1-bits are shifted in. + +-Compile-time constant shifts, generally the most efficient variant: ++Compile-time constant shifts, generally the most efficient variant (though 8-bit ++shifts are potentially slower than other lane sizes): + +-* `V`: `{u,i}{16,32,64}` \ ++* `V`: `{u,i}` \ + V **ShiftLeft**<int>(V a) returns `a[i] << int`. + +-* `V`: `{u,i}{16,32,64}` \ ++* `V`: `{u,i}` \ + V **ShiftRight**<int>(V a) returns `a[i] >> int`. + + Shift all lanes by the same (not necessarily compile-time constant) amount: + +-* `V`: `{u,i}{16,32,64}` \ ++* `V`: `{u,i}` \ + V **ShiftLeftSame**(V a, int bits) returns `a[i] << bits`. + +-* `V`: `{u,i}{16,32,64}` \ ++* `V`: `{u,i}` \ + V **ShiftRightSame**(V a, int bits) returns `a[i] >> bits`. + +-Per-lane variable shifts (slow if SSE4, or Shr i64 on AVX2): ++Per-lane variable shifts (slow if SSE4, or 16-bit, or Shr i64 on AVX2): + + * `V`: `{u,i}{16,32,64}` \ + V **operator<<**(V a, V b) returns `a[i] << b[i]`. 
+@@ -332,12 +347,17 @@ Special functions for signed types: + slightly more efficient; requires the first argument to be non-negative. + + * `V`: `i32/64` \ +- V **BroadcastSignBit(V a) returns `a[i] < 0 ? -1 : 0`. ++ V **BroadcastSignBit**(V a) returns `a[i] < 0 ? -1 : 0`. + + ### Masks + + Let `M` denote a mask capable of storing true/false for each lane. + ++* M **FirstN**(D, size_t N): returns mask with the first `N` ++ lanes (those with index `< N`) true. `N` larger than `Lanes(D())` result in ++ an all-true mask. Useful for implementing "masked" stores by loading `prev` ++ followed by `IfThenElse(FirstN(d, N), what_to_store, prev)`. ++ + * M1 **RebindMask**(D, M2 m): returns same mask bits as `m`, but + reinterpreted as a mask for lanes of type `TFromD`. `M1` and `M2` must + have the same number of lanes. +@@ -389,17 +409,18 @@ Let `M` denote a mask capable of storing + * size_t **CountTrue**(M m): returns how many of `m[i]` are true + [0, N]. This is typically more expensive than AllTrue/False. + +-* `V`: `{u,i,f}{32,64}` \ ++* `V`: `{u,i,f}{16,32,64}` \ + V **Compress**(V v, M m): returns `r` such that `r[n]` is + `v[i]`, with `i` the n-th lane index (starting from 0) where `m[i]` is true. + Compacts lanes whose mask is set into the lower lanes; upper lanes are +- implementation-defined. ++ implementation-defined. Slow with 16-bit lanes. + +-* `V`: `{u,i,f}{32,64}` \ ++* `V`: `{u,i,f}{16,32,64}` \ + size_t **CompressStore**(V v, M m, D, T* aligned): writes lanes + whose mask is set into `aligned`, starting from lane 0. Returns + `CountTrue(m)`, the number of valid lanes. All subsequent lanes may be +- overwritten! Alignment ensures inactive lanes will not cause faults. ++ overwritten! Alignment ensures inactive lanes will not cause faults. Slow ++ with 16-bit lanes. + + ### Comparisons + +@@ -429,10 +450,16 @@ Memory operands are little-endian, other + lane configuration. Pointers are the addresses of `N` consecutive `T` values, + either naturally-aligned (`aligned`) or possibly unaligned (`p`). + ++**Note**: computations with low arithmetic intensity (FLOP/s per memory traffic ++bytes), e.g. dot product, can be *1.5 times as fast* when the memory operands ++are naturally aligned. An unaligned access may require two load ports. ++ + #### Load + + * Vec<D> **Load**(D, const T* aligned): returns +- `aligned[i]`. ++ `aligned[i]`. May fault if the pointer is not aligned to the vector size. ++ Using this whenever possible improves codegen on SSE4: unlike `LoadU`, ++ `Load` can be fused into a memory operand, which reduces register pressure. + * Vec<D> **LoadU**(D, const T* p): returns `p[i]`. + + * Vec<D> **LoadDup128**(D, const T* p): returns one 128-bit +@@ -440,19 +467,31 @@ either naturally-aligned (`aligned`) or + be faster than broadcasting single values, and is more convenient than + preparing constants for the actual vector length. + +-#### Gather ++#### Scatter/Gather + +-**Note**: Vectors must be `HWY_CAPPED(T, HWY_GATHER_LANES(T))`: ++**Note**: Offsets/indices are of type `VI = Vec>` and need not ++be unique. The results are implementation-defined if any are negative. + +-* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \ +- Vec<D> **GatherOffset**(D, const T* base, VI offsets). +- Returns elements of base selected by possibly repeated *byte* `offsets[i]`. +- Results are implementation-defined if `offsets[i]` is negative. +- +-* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \ +- Vec<D> **GatherIndex**(D, const T* base, VI indices). 
+- Returns vector of `base[indices[i]]`. Indices need not be unique, but +- results are implementation-defined if they are negative. ++**Note**: Where possible, applications should `Load/Store/TableLookup*` entire ++vectors, which is much faster than `Scatter/Gather`. Otherwise, code of the form ++`dst[tbl[i]] = F(src[i])` should when possible be transformed to `dst[i] = ++F(src[tbl[i]])` because `Scatter` is more expensive than `Gather`. ++ ++* `D`: `{u,i,f}{32,64}` \ ++ void **ScatterOffset**(Vec<D> v, D, const T* base, VI ++ offsets): stores `v[i]` to the base address plus *byte* `offsets[i]`. ++ ++* `D`: `{u,i,f}{32,64}` \ ++ void **ScatterIndex**(Vec<D> v, D, const T* base, VI ++ indices): stores `v[i]` to `base[indices[i]]`. ++ ++* `D`: `{u,i,f}{32,64}` \ ++ Vec<D> **GatherOffset**(D, const T* base, VI offsets): ++ returns elements of base selected by *byte* `offsets[i]`. ++ ++* `D`: `{u,i,f}{32,64}` \ ++ Vec<D> **GatherIndex**(D, const T* base, VI indices): ++ returns vector of `base[indices[i]]`. + + #### Store + +@@ -462,6 +501,17 @@ either naturally-aligned (`aligned`) or + * void **StoreU**(Vec<D> a, D, T* p): as Store, but without + the alignment requirement. + ++* `D`: `u8` \ ++ void **StoreInterleaved3**(Vec<D> v0, Vec<D> v1, ++ Vec<D> v2, D, T* p): equivalent to shuffling `v0, v1, v2` ++ followed by three `StoreU()`, such that `p[0] == v0[0], p[1] == v1[0], ++ p[2] == v1[0]`. Useful for RGB samples. ++ ++* `D`: `u8` \ ++ void **StoreInterleaved4**(Vec<D> v0, Vec<D> v1, ++ Vec<D> v2, Vec<D> v3, D, T* p): as above, but for four ++ vectors (e.g. RGBA samples). ++ + ### Cache control + + All functions except Stream are defined in cache_control.h. +@@ -483,6 +533,9 @@ All functions except Stream are defined + * void **Prefetch**(const T* p): begins loading the cache line + containing "p". + ++* void **Pause**(): when called inside a spin-loop, may reduce ++ power consumption. ++ + ### Type conversion + + * Vec<D> **BitCast**(D, V): returns the bits of `V` +@@ -525,7 +578,8 @@ if the input exceeds the destination ran + zero and converts the value to same-sized integer. + + * `V`: `f32`; `Ret`: `i32` \ +- Ret **NearestInt**(V a): returns the integer nearest to `a[i]`. ++ Ret **NearestInt**(V a): returns the integer nearest to `a[i]`; ++ results are undefined for NaN. + + ### Swizzle + +@@ -652,9 +706,9 @@ more expensive on AVX2/AVX-512 than with + + ### Reductions + +-**Note**: the following are only available for full vectors (including scalar). +-These 'reduce' all lanes to a single result. This result is broadcasted to all +-lanes at no extra cost; you can use `GetLane` to obtain the value. ++**Note**: these 'reduce' all lanes to a single result (e.g. sum), which is ++broadcasted to all lanes at no extra cost. To obtain a scalar, you can call ++`GetLane`. + + Being a horizontal operation (across lanes of the same vector), these are slower + than normal SIMD operations and are typically used outside critical loops. +@@ -697,9 +751,6 @@ generate such instructions (implying the + finally reverts to `HWY_STATIC_TARGET`. Can be used in `#if` expressions to + provide an alternative to functions which are not supported by HWY_SCALAR. + +-* `HWY_LANES(T)`: how many lanes of type `T` in a full vector (>= 1). Used by +- HWY_FULL/CAPPED. Note: cannot be used in #if because it uses sizeof. +- + * `HWY_IDE` is 0 except when parsed by IDEs; adding it to conditions such as + `#if HWY_TARGET != HWY_SCALAR || HWY_IDE` avoids code appearing greyed out. 
+ +@@ -707,26 +758,15 @@ The following signal capabilities and ex + + * `HWY_CAP_INTEGER64`: support for 64-bit signed/unsigned integer lanes. + * `HWY_CAP_FLOAT64`: support for double-precision floating-point lanes. ++ ++The following were used to signal the maximum number of lanes for certain ++operations, but this is no longer necessary (nor possible on SVE/RVV), so they ++are DEPRECATED: ++ ++* `HWY_GATHER_LANES(T)`. + * `HWY_CAP_GE256`: the current target supports vectors of >= 256 bits. + * `HWY_CAP_GE512`: the current target supports vectors of >= 512 bits. + +-The following indicate the maximum number of lanes for certain operations. For +-targets that support the feature/operation, the macro evaluates to +-`HWY_LANES(T)`, otherwise 1. Using `HWY_CAPPED(T, HWY_GATHER_LANES(T))` +-generates the best possible code (or scalar fallback) from the same source code. +- +-* `HWY_GATHER_LANES(T)`: supports GatherIndex/Offset. +-* `HWY_VARIABLE_SHIFT_LANES(T)`: supports per-lane shift amounts (v1 << v2). +- DEPRECATED, this always matches HWY_LANES(T) and will be removed. +- +-As above, but the feature implies the type so there is no T parameter, thus +-these can be used in `#if` expressions. +- +-* `HWY_COMPARE64_LANES`: 64-bit signed integer comparisons. DEPRECATED, this +- always matches HWY_LANES(int64_t) and will be removed. +-* `HWY_MINMAX64_LANES`: 64-bit signed/unsigned integer min/max. DEPRECATED, +- this always matches HWY_LANES(int64_t) and will be removed. +- + ## Detecting supported targets + + `SupportedTargets()` returns a cached (initialized on-demand) bitfield of the +@@ -778,8 +818,10 @@ policy for selecting `HWY_TARGETS`: + and permitted by the compiler, independently of autovectorization), which + maximizes coverage in tests. + +-If none are defined, the default is to select all attainable targets except any +-non-best baseline (typically `HWY_SCALAR`), which reduces code size. ++If none are defined, but `HWY_IS_TEST` is defined, the default is ++`HWY_COMPILE_ALL_ATTAINABLE`. Otherwise, the default is to select all attainable ++targets except any non-best baseline (typically `HWY_SCALAR`), which reduces ++code size. + + ## Compiler support + +@@ -787,7 +829,8 @@ Clang and GCC require e.g. -mavx2 flags + However, this enables AVX2 instructions in the entire translation unit, which + may violate the one-definition rule and cause crashes. Instead, we use + target-specific attributes introduced via #pragma. Function using SIMD must +-reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. ++reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. Alternatively, ++individual functions or lambdas may be prefixed with `HWY_ATTR`. + + Immediates (compile-time constants) are specified as template arguments to avoid + constant-propagation issues with Clang on ARM. 
+diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 2021-06-02 10:56:05.278904609 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h 2021-05-31 10:37:11.000000000 -0400 +@@ -111,6 +111,32 @@ AlignedUniquePtr MakeUniqueAligned(Ar + new (ptr) T(std::forward(args)...), AlignedDeleter()); + } + ++// Helpers for array allocators (avoids overflow) ++namespace detail { ++ ++// Returns x such that 1u << x == n (if n is a power of two). ++static inline constexpr size_t ShiftCount(size_t n) { ++ return (n <= 1) ? 0 : 1 + ShiftCount(n / 2); ++} ++ ++template ++T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) { ++ constexpr size_t size = sizeof(T); ++ ++ constexpr bool is_pow2 = (size & (size - 1)) == 0; ++ constexpr size_t bits = ShiftCount(size); ++ static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect"); ++ ++ const size_t bytes = is_pow2 ? items << bits : items * size; ++ const size_t check = is_pow2 ? bytes >> bits : bytes / size; ++ if (check != items) { ++ return nullptr; // overflowed ++ } ++ return static_cast(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr)); ++} ++ ++} // namespace detail ++ + // Aligned memory equivalent of make_unique for array types using the + // custom allocators alloc/free. This function calls the constructor with the + // passed Args... on every created item. The destructor of each element will be +@@ -118,10 +144,11 @@ AlignedUniquePtr MakeUniqueAligned(Ar + template + AlignedUniquePtr MakeUniqueAlignedArrayWithAlloc( + size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... 
args) { +- T* ptr = +- static_cast(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)); +- for (size_t i = 0; i < items; i++) { +- new (ptr + i) T(std::forward(args)...); ++ T* ptr = detail::AllocateAlignedItems(items, alloc, opaque); ++ if (ptr != nullptr) { ++ for (size_t i = 0; i < items; i++) { ++ new (ptr + i) T(std::forward(args)...); ++ } + } + return AlignedUniquePtr(ptr, AlignedDeleter(free, opaque)); + } +@@ -165,7 +192,7 @@ template + AlignedFreeUniquePtr AllocateAligned(const size_t items, AllocPtr alloc, + FreePtr free, void* opaque) { + return AlignedFreeUniquePtr( +- static_cast(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)), ++ detail::AllocateAlignedItems(items, alloc, opaque), + AlignedFreer(free, opaque)); + } + +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 2021-06-02 10:56:05.273904584 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -16,6 +16,7 @@ + + #include + ++#include + #include + #include + #include +@@ -87,13 +88,39 @@ TEST(AlignedAllocatorTest, FreeNullptr) + /*opaque_ptr=*/nullptr); + } + ++TEST(AlignedAllocatorTest, Log2) { ++ EXPECT_EQ(0u, detail::ShiftCount(1)); ++ EXPECT_EQ(1u, detail::ShiftCount(2)); ++ EXPECT_EQ(3u, detail::ShiftCount(8)); ++} ++ ++// Allocator returns null when it detects overflow of items * sizeof(T). ++TEST(AlignedAllocatorTest, Overflow) { ++ constexpr size_t max = ~size_t(0); ++ constexpr size_t msb = (max >> 1) + 1; ++ using Size5 = std::array; ++ using Size10 = std::array; ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(max / 2, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(max / 3, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(max / 4, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(msb, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(msb + 1, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(msb / 4, nullptr, nullptr)); ++} ++ + TEST(AlignedAllocatorTest, AllocDefaultPointers) { + const size_t kSize = 7777; + void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr, + /*opaque_ptr=*/nullptr); + ASSERT_NE(nullptr, ptr); + // Make sure the pointer is actually aligned. +- EXPECT_EQ(0, reinterpret_cast(ptr) % kMaxVectorSize); ++ EXPECT_EQ(0U, reinterpret_cast(ptr) % kMaxVectorSize); + char* p = static_cast(ptr); + size_t ret = 0; + for (size_t i = 0; i < kSize; i++) { +@@ -101,7 +128,7 @@ TEST(AlignedAllocatorTest, AllocDefaultP + p[i] = static_cast(i & 0x7F); + if (i) ret += p[i] * p[i - 1]; + } +- EXPECT_NE(0, ret); ++ EXPECT_NE(0U, ret); + FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr); + } + +@@ -123,11 +150,11 @@ TEST(AlignedAllocatorTest, CustomAlloc) + AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc); + ASSERT_NE(nullptr, ptr); + // We should have only requested one alloc from the allocator. +- EXPECT_EQ(1u, fake_alloc.PendingAllocs()); ++ EXPECT_EQ(1U, fake_alloc.PendingAllocs()); + // Make sure the pointer is actually aligned. 
+- EXPECT_EQ(0, reinterpret_cast(ptr) % kMaxVectorSize); ++ EXPECT_EQ(0U, reinterpret_cast(ptr) % kMaxVectorSize); + FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc); +- EXPECT_EQ(0u, fake_alloc.PendingAllocs()); ++ EXPECT_EQ(0U, fake_alloc.PendingAllocs()); + } + + TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) { +@@ -170,7 +197,7 @@ TEST(AlignedAllocatorTest, MakeUniqueAli + TEST(AlignedAllocatorTest, AllocSingleInt) { + auto ptr = AllocateAligned(1); + ASSERT_NE(nullptr, ptr.get()); +- EXPECT_EQ(0, reinterpret_cast(ptr.get()) % kMaxVectorSize); ++ EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % kMaxVectorSize); + // Force delete of the unique_ptr now to check that it doesn't crash. + ptr.reset(nullptr); + EXPECT_EQ(nullptr, ptr.get()); +@@ -180,7 +207,7 @@ TEST(AlignedAllocatorTest, AllocMultiple + const size_t kSize = 7777; + auto ptr = AllocateAligned(kSize); + ASSERT_NE(nullptr, ptr.get()); +- EXPECT_EQ(0, reinterpret_cast(ptr.get()) % kMaxVectorSize); ++ EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % kMaxVectorSize); + // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the + // underlying type chosen by AllocateAligned() for the std::unique_ptr. + EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1])); +@@ -191,7 +218,7 @@ TEST(AlignedAllocatorTest, AllocMultiple + ptr[i] = static_cast(i); + if (i) ret += ptr[i] * ptr[i - 1]; + } +- EXPECT_NE(0, ret); ++ EXPECT_NE(0U, ret); + } + + TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) { +@@ -215,7 +242,8 @@ TEST(AlignedAllocatorTest, MakeUniqueAli + auto arr = MakeUniqueAlignedArrayWithAlloc>( + 7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc, + &counter); +- // An array shold still only call a single allocation. ++ ASSERT_NE(nullptr, arr.get()); ++ // An array should still only call a single allocation. + EXPECT_EQ(1u, fake_alloc.PendingAllocs()); + EXPECT_EQ(7, counter); + for (size_t i = 0; i < 7; i++) { +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 2021-06-02 10:56:05.266904549 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/base.h 2021-05-31 10:37:11.000000000 -0400 +@@ -34,7 +34,10 @@ + //------------------------------------------------------------------------------ + // Detect compiler using predefined macros + +-#ifdef _MSC_VER ++// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like ++// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that ++// purpose. 
++#if defined(_MSC_VER) && !defined(__clang__) + #define HWY_COMPILER_MSVC _MSC_VER + #else + #define HWY_COMPILER_MSVC 0 +@@ -200,6 +203,10 @@ + #define HWY_ARCH_X86_64 0 + #endif + ++#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64 ++#error "Cannot have both x86-32 and x86-64" ++#endif ++ + #if HWY_ARCH_X86_32 || HWY_ARCH_X86_64 + #define HWY_ARCH_X86 1 + #else +@@ -212,14 +219,29 @@ + #define HWY_ARCH_PPC 0 + #endif + +-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) ++#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64) ++#define HWY_ARCH_ARM_A64 1 ++#else ++#define HWY_ARCH_ARM_A64 0 ++#endif ++ ++#if defined(__arm__) || defined(_M_ARM) ++#define HWY_ARCH_ARM_V7 1 ++#else ++#define HWY_ARCH_ARM_V7 0 ++#endif ++ ++#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7 ++#error "Cannot have both A64 and V7" ++#endif ++ ++#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7 + #define HWY_ARCH_ARM 1 + #else + #define HWY_ARCH_ARM 0 + #endif + +-// There isn't yet a standard __wasm or __wasm__. +-#ifdef __EMSCRIPTEN__ ++#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__) + #define HWY_ARCH_WASM 1 + #else + #define HWY_ARCH_WASM 0 +@@ -231,9 +253,11 @@ + #define HWY_ARCH_RVV 0 + #endif + ++// It is an error to detect multiple architectures at the same time, but OK to ++// detect none of the above. + #if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \ +- HWY_ARCH_RVV) != 1 +-#error "Must detect exactly one platform" ++ HWY_ARCH_RVV) > 1 ++#error "Must not detect more than one architecture" + #endif + + //------------------------------------------------------------------------------ +@@ -308,13 +332,26 @@ static constexpr HWY_MAYBE_UNUSED size_t + // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name + // by concatenating base type and bits. + +-// RVV already has a builtin type. +-#if !HWY_ARCH_RVV ++// RVV already has a builtin type and the GCC intrinsics require it. ++#if HWY_ARCH_RVV && HWY_COMPILER_GCC ++#define HWY_NATIVE_FLOAT16 1 ++#else ++#define HWY_NATIVE_FLOAT16 0 ++#endif ++ ++#if HWY_NATIVE_FLOAT16 ++using float16_t = __fp16; ++// Clang does not allow __fp16 arguments, but scalar.h requires LaneType ++// arguments, so use a wrapper. ++// TODO(janwas): replace with _Float16 when that is supported? ++#else ++#pragma pack(push, 1) + struct float16_t { +- // __fp16 cannot be used as a function parameter in clang, so use a wrapper. + uint16_t bits; + }; ++#pragma pack(pop) + #endif ++ + using float32_t = float; + using float64_t = double; + +@@ -506,6 +543,13 @@ struct Relations { + using Narrow = int32_t; + }; + template <> ++struct Relations { ++ using Unsigned = uint16_t; ++ using Signed = int16_t; ++ using Float = float16_t; ++ using Wide = float; ++}; ++template <> + struct Relations { + using Unsigned = uint32_t; + using Signed = int32_t; +@@ -551,13 +595,13 @@ constexpr inline size_t RoundUpTo(size_t + + // Undefined results for x == 0. 
+ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) { +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + unsigned long index; // NOLINT + _BitScanForward(&index, x); + return index; +-#else ++#else // HWY_COMPILER_MSVC + return static_cast(__builtin_ctz(x)); +-#endif ++#endif // HWY_COMPILER_MSVC + } + + HWY_API size_t PopCount(uint64_t x) { +@@ -565,7 +609,7 @@ HWY_API size_t PopCount(uint64_t x) { + return static_cast(__builtin_popcountll(x)); + #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 + return _mm_popcnt_u64(x); +-#elif HWY_COMPILER_MSVC ++#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 + return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32)); + #else + x -= ((x >> 1) & 0x55555555U); +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 2021-06-02 10:56:05.280904620 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h 2021-05-31 10:37:11.000000000 -0400 +@@ -20,7 +20,9 @@ + + #include "hwy/base.h" + +-#ifndef __SSE2__ ++// Requires SSE2; fails to compile on 32-bit Clang 7 (see ++// https://github.com/gperftools/gperftools/issues/946). ++#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32) + #undef HWY_DISABLE_CACHE_CONTROL + #define HWY_DISABLE_CACHE_CONTROL + #endif +@@ -30,6 +32,14 @@ + #include // SSE2 + #endif + ++// Windows.h #defines these, which causes infinite recursion. Temporarily ++// undefine them in this header; these functions are anyway deprecated. ++// TODO(janwas): remove when these functions are removed. ++#pragma push_macro("LoadFence") ++#pragma push_macro("StoreFence") ++#undef LoadFence ++#undef StoreFence ++ + namespace hwy { + + // Even if N*sizeof(T) is smaller, Stream may write a multiple of this size. +@@ -81,6 +91,17 @@ HWY_INLINE HWY_ATTR_CACHE void FlushCach + #endif + } + ++// Reduces power consumption in spin-loops. No effect on non-x86. ++HWY_INLINE HWY_ATTR_CACHE void Pause() { ++#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) ++ _mm_pause(); ++#endif ++} ++ + } // namespace hwy + ++// TODO(janwas): remove when these functions are removed. (See above.) ++#pragma pop_macro("StoreFence") ++#pragma pop_macro("LoadFence") ++ + #endif // HIGHWAY_HWY_CACHE_CONTROL_H_ +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 2021-06-02 10:56:05.195904190 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -19,7 +19,6 @@ + #include + #include + +-#include + #include + #include // iota + +@@ -37,15 +36,15 @@ using hwy::HWY_NAMESPACE::CombineShiftRi + + class TwoArray { + public: +- // Passed to ctor as a value NOT known to the compiler. Must be a multiple of +- // the vector lane count * 8. ++ // Must be a multiple of the vector lane count * 8. 
+ static size_t NumItems() { return 3456; } + +- explicit TwoArray(const size_t num_items) +- : a_(AllocateAligned(num_items * 2)), b_(a_.get() + num_items) { +- const float init = num_items / NumItems(); // 1, but compiler doesn't know +- std::iota(a_.get(), a_.get() + num_items, init); +- std::iota(b_, b_ + num_items, init); ++ TwoArray() ++ : a_(AllocateAligned(NumItems() * 2)), b_(a_.get() + NumItems()) { ++ // = 1, but compiler doesn't know ++ const float init = static_cast(Unpredictable1()); ++ std::iota(a_.get(), a_.get() + NumItems(), init); ++ std::iota(b_, b_ + NumItems(), init); + } + + protected: +@@ -62,7 +61,7 @@ void RunBenchmark(const char* caption) { + const FuncInput inputs[kNumInputs] = {num_items}; + Result results[kNumInputs]; + +- Benchmark benchmark(num_items); ++ Benchmark benchmark; + + Params p; + p.verbose = false; +@@ -101,7 +100,7 @@ void Intro() { + // 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold! + class BenchmarkDot : public TwoArray { + public: +- explicit BenchmarkDot(size_t num_items) : TwoArray(num_items), dot_{-1.0f} {} ++ BenchmarkDot() : dot_{-1.0f} {} + + FuncOutput operator()(const size_t num_items) { + HWY_FULL(float) d; +@@ -132,7 +131,8 @@ class BenchmarkDot : public TwoArray { + sum[i] += sum[i + power]; + } + } +- return dot_ = GetLane(SumOfLanes(sum[0])); ++ dot_ = GetLane(SumOfLanes(sum[0])); ++ return static_cast(dot_); + } + void Verify(size_t num_items) { + if (dot_ == -1.0f) { +@@ -157,8 +157,6 @@ class BenchmarkDot : public TwoArray { + // INTERMEDIATE: delta coding + // 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold! + struct BenchmarkDelta : public TwoArray { +- explicit BenchmarkDelta(size_t num_items) : TwoArray(num_items) {} +- + FuncOutput operator()(const size_t num_items) const { + #if HWY_TARGET == HWY_SCALAR + b_[0] = a_[0]; +@@ -197,7 +195,7 @@ struct BenchmarkDelta : public TwoArray + Store(a - shifted, df, &b_[i]); + } + #endif +- return b_[num_items - 1]; ++ return static_cast(b_[num_items - 1]); + } + + void Verify(size_t num_items) { +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 2021-06-02 10:56:05.189904159 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -22,27 +22,62 @@ + // For runtime dispatch, specify the name of the current file (unfortunately + // __FILE__ is not reliable) so that foreach_target.h can re-include it. + #define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc" +-// Re-include this file once per enabled target to generate code for it. ++// Generates code for each enabled target by re-including this source file. + #include "hwy/foreach_target.h" + +-#include "hwy/examples/skeleton_shared.h" + #include "hwy/highway.h" + +-// Optional: factor out parts of the implementation into *-inl.h +-#include "hwy/examples/skeleton-inl.h" +- + // Optional, can instead add HWY_ATTR to all functions. + HWY_BEFORE_NAMESPACE(); + namespace skeleton { + namespace HWY_NAMESPACE { + +-// Compiled once per target via multiple inclusion. 
+-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2, +- float* HWY_RESTRICT out) { +- printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), +- ExampleGatherStrategy()); ++// Highway ops reside here; ADL does not find templates nor builtins. ++using namespace hwy::HWY_NAMESPACE; ++ ++// Computes log2 by converting to a vector of floats. Compiled once per target. ++template ++HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values, ++ uint8_t* HWY_RESTRICT log2) { ++ // Type tags for converting to other element types (Rebind = same count). ++ const Rebind d32; ++ const Rebind d8; ++ ++ const auto u8 = Load(d8, values); ++ const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8))); ++ const auto exponent = ShiftRight<23>(bits) - Set(d32, 127); ++ Store(DemoteTo(d8, exponent), d8, log2); ++} ++ ++HWY_NOINLINE void CodepathDemo() { ++ // Highway defaults to portability, but per-target codepaths may be selected ++ // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros: ++#if HWY_CAP_INTEGER64 ++ const char* gather = "Has int64"; ++#else ++ const char* gather = "No int64"; ++#endif ++ printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather); ++} + +- ExampleMulAdd(in1, in2, out); ++HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count, ++ uint8_t* HWY_RESTRICT log2) { ++ CodepathDemo(); ++ ++ // Second argument is necessary on RVV until it supports fractional lengths. ++ HWY_FULL(float, 4) df; ++ ++ const size_t N = Lanes(df); ++ size_t i = 0; ++ for (; i + N <= count; i += N) { ++ OneFloorLog2(df, values + i, log2 + i); ++ } ++ // TODO(janwas): implement ++#if HWY_TARGET != HWY_RVV ++ for (; i < count; ++i) { ++ OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i); ++ } ++#endif + } + + // NOLINTNEXTLINE(google-readability-namespace-comments) +@@ -54,22 +89,20 @@ HWY_AFTER_NAMESPACE(); + + namespace skeleton { + +-// This macro declares a static array SkeletonHighwayDispatchTable used for +-// dynamic dispatch. This macro should be placed in the same namespace that +-// defines the Skeleton function above. +-HWY_EXPORT(Skeleton); ++// This macro declares a static array used for dynamic dispatch; it resides in ++// the same outer namespace that contains FloorLog2. ++HWY_EXPORT(FloorLog2); + + // This function is optional and only needed in the case of exposing it in the +-// header file. Otherwise using HWY_DYNAMIC_DISPATCH(Skeleton) multiple times in +-// this module is equivalent to inlining this optional function.. +-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2, +- float* HWY_RESTRICT out) { +- return HWY_DYNAMIC_DISPATCH(Skeleton)(in1, in2, out); ++// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module ++// is equivalent to inlining this function. ++void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count, ++ uint8_t* HWY_RESTRICT out) { ++ return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out); + } + + // Optional: anything to compile only once, e.g. non-SIMD implementations of +-// public functions provided by this module, can go inside #if HWY_ONCE +-// (after end_target-inl.h). ++// public functions provided by this module, can go inside #if HWY_ONCE. 
+ + } // namespace skeleton + #endif // HWY_ONCE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 2021-06-02 10:56:05.213904281 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h 2021-05-31 10:37:11.000000000 -0400 +@@ -18,15 +18,17 @@ + #ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_ + #define HIGHWAY_HWY_EXAMPLES_SKELETON_H_ + +-// Tiny subset of Highway API: essentials for declaring an interface, without +-// any implementation details. ++#include ++ ++// Platform-specific definitions used for declaring an interface, independent of ++// the SIMD instruction set. + #include "hwy/base.h" // HWY_RESTRICT + + namespace skeleton { + +-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256. +-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2, +- float* HWY_RESTRICT out); ++// Computes base-2 logarithm by converting to float. Supports dynamic dispatch. ++void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count, ++ uint8_t* HWY_RESTRICT out); + + } // namespace skeleton + +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 2021-06-02 10:56:05.164904033 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -29,41 +29,31 @@ + // It is fine to #include normal or *-inl headers. + #include + +-#include "hwy/examples/skeleton_shared.h" + #include "hwy/highway.h" + + HWY_BEFORE_NAMESPACE(); + namespace skeleton { + namespace HWY_NAMESPACE { + +-using hwy::HWY_NAMESPACE::MulAdd; ++using namespace hwy::HWY_NAMESPACE; + +-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256. +-HWY_MAYBE_UNUSED void ExampleMulAdd(const float* HWY_RESTRICT in1, +- const float* HWY_RESTRICT in2, +- float* HWY_RESTRICT out) { +- // Descriptor(s) for all vector types used in this function. +- HWY_FULL(float) df; +- +- const auto mul = Set(df, kMultiplier); +- for (size_t i = 0; i < 256; i += Lanes(df)) { +- const auto result = MulAdd(mul, Load(df, in1 + i), Load(df, in2 + i)); +- Store(result, df, out + i); ++// Example of a type-agnostic (caller-specified lane type) and width-agnostic ++// (uses best available instruction set) function in a header. ++// ++// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size. 
++template ++HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array, ++ const T* HWY_RESTRICT add_array, ++ const size_t size, T* HWY_RESTRICT x_array) { ++ for (size_t i = 0; i < size; i += Lanes(d)) { ++ const auto mul = Load(d, mul_array + i); ++ const auto add = Load(d, add_array + i); ++ auto x = Load(d, x_array + i); ++ x = MulAdd(mul, x, add); ++ Store(x, d, x_array + i); + } + } + +-// (This doesn't generate SIMD instructions, so is not required here) +-HWY_MAYBE_UNUSED const char* ExampleGatherStrategy() { +- // Highway functions generate per-target implementations from the same source +- // code via HWY_CAPPED(type, HWY_MIN(any_LANES_constants, ..)). If needed, +- // entirely different codepaths can also be selected like so: +-#if HWY_GATHER_LANES > 1 +- return "Has gather"; +-#else +- return "Gather is limited to one lane"; +-#endif +-} +- + // NOLINTNEXTLINE(google-readability-namespace-comments) + } // namespace HWY_NAMESPACE + } // namespace skeleton +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 2021-06-02 10:56:05.170904063 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -12,30 +12,96 @@ + // See the License for the specific language governing permissions and + // limitations under the License. + +-// Example of unit test for the "skeleton" module. ++// Example of unit test for the "skeleton" library. 
+ +-#include "hwy/examples/skeleton.h" // Skeleton ++#include "hwy/examples/skeleton.h" + + #include + +-#include "hwy/tests/test_util-inl.h" // RunTest ++#undef HWY_TARGET_INCLUDE ++#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc" ++#include "hwy/foreach_target.h" ++#include "hwy/highway.h" ++#include "hwy/tests/test_util-inl.h" + ++// Optional: factor out parts of the implementation into *-inl.h ++#include "hwy/examples/skeleton-inl.h" ++ ++HWY_BEFORE_NAMESPACE(); + namespace skeleton { ++namespace HWY_NAMESPACE { ++ ++using namespace hwy::HWY_NAMESPACE; ++ ++// Calls function defined in skeleton.cc. ++struct TestFloorLog2 { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, DF df) { ++ const size_t count = 5 * Lanes(df); ++ auto in = hwy::AllocateAligned(count); ++ auto expected = hwy::AllocateAligned(count); ++ ++ hwy::RandomState rng; ++ for (size_t i = 0; i < count; ++i) { ++ expected[i] = Random32(&rng) & 7; ++ in[i] = static_cast(1u << expected[i]); ++ } ++ auto out = hwy::AllocateAligned(count); ++ CallFloorLog2(in.get(), count, out.get()); ++ int sum = 0; ++ for (size_t i = 0; i < count; ++i) { ++ // TODO(janwas): implement ++#if HWY_TARGET != HWY_RVV ++ HWY_ASSERT_EQ(expected[i], out[i]); ++#endif ++ sum += out[i]; ++ } ++ hwy::PreventElision(sum); ++ } ++}; ++ ++HWY_NOINLINE void TestAllFloorLog2() { ++ ForPartialVectors()(float()); ++} ++ ++// Calls function defined in skeleton-inl.h. ++struct TestSumMulAdd { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ hwy::RandomState rng; ++ const size_t count = 4096; ++ EXPECT_TRUE(count % Lanes(d) == 0); ++ auto mul = hwy::AllocateAligned(count); ++ auto x = hwy::AllocateAligned(count); ++ auto add = hwy::AllocateAligned(count); ++ for (size_t i = 0; i < count; ++i) { ++ mul[i] = static_cast(Random32(&rng) & 0xF); ++ x[i] = static_cast(Random32(&rng) & 0xFF); ++ add[i] = static_cast(Random32(&rng) & 0xFF); ++ } ++ double expected_sum = 0.0; ++ for (size_t i = 0; i < count; ++i) { ++ expected_sum += mul[i] * x[i] + add[i]; ++ } + +-TEST(SkeletonTest, MainTest) { +- HWY_ALIGN_MAX float in1[256]; +- HWY_ALIGN_MAX float in2[256]; +- HWY_ALIGN_MAX float out[256]; +- for (size_t i = 0; i < 256; ++i) { +- in1[i] = static_cast(i); +- in2[i] = in1[i] + 300; ++ MulAddLoop(d, mul.get(), add.get(), count, x.get()); ++ HWY_ASSERT_EQ(4344240.0, expected_sum); + } ++}; + +- // Tests will run for all compiled targets to ensure all are OK. +- hwy::RunTest([&in1, &in2, &out]() { +- Skeleton(in1, in2, out); +- // Add EXPECT_... calls here. 
+- }); ++HWY_NOINLINE void TestAllSumMulAdd() { ++ ForFloatTypes(ForPartialVectors()); + } + ++// NOLINTNEXTLINE(google-readability-namespace-comments) ++} // namespace HWY_NAMESPACE ++} // namespace skeleton ++HWY_AFTER_NAMESPACE(); ++ ++#if HWY_ONCE ++namespace skeleton { ++HWY_BEFORE_TEST(SkeletonTest); ++HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2); ++HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd); + } // namespace skeleton ++#endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 2021-06-02 10:56:05.269904564 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h 2021-05-31 10:37:11.000000000 -0400 +@@ -25,10 +25,10 @@ + + namespace hwy { + +-// API version (https://semver.org/) ++// API version (https://semver.org/); keep in sync with CMakeLists.txt. + #define HWY_MAJOR 0 +-#define HWY_MINOR 11 +-#define HWY_PATCH 1 ++#define HWY_MINOR 12 ++#define HWY_PATCH 2 + + //------------------------------------------------------------------------------ + // Shorthand for descriptors (defined in shared-inl.h) used to select overloads. +@@ -49,7 +49,7 @@ namespace hwy { + HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, )) + #define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__) + +-// Vector of up to MAX_N lanes. ++// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead. 
+ #define HWY_CAPPED(T, MAX_N) \ + hwy::HWY_NAMESPACE::Simd + +@@ -75,6 +75,10 @@ namespace hwy { + #define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME + #elif HWY_STATIC_TARGET == HWY_NEON + #define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME ++#elif HWY_STATIC_TARGET == HWY_SVE ++#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME ++#elif HWY_STATIC_TARGET == HWY_SVE2 ++#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME + #elif HWY_STATIC_TARGET == HWY_PPC8 + #define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME + #elif HWY_STATIC_TARGET == HWY_SSE4 +@@ -143,6 +147,18 @@ FunctionCache Function + #define HWY_CHOOSE_NEON(FUNC_NAME) nullptr + #endif + ++#if HWY_TARGETS & HWY_SVE ++#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME ++#else ++#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr ++#endif ++ ++#if HWY_TARGETS & HWY_SVE2 ++#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME ++#else ++#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr ++#endif ++ + #if HWY_TARGETS & HWY_PPC8 + #define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME + #else +@@ -261,8 +277,11 @@ FunctionCache Function + #elif HWY_TARGET == HWY_AVX3 + #include "hwy/ops/x86_512-inl.h" + #elif HWY_TARGET == HWY_PPC8 ++#error "PPC is not yet supported" + #elif HWY_TARGET == HWY_NEON + #include "hwy/ops/arm_neon-inl.h" ++#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 ++#include "hwy/ops/arm_sve-inl.h" + #elif HWY_TARGET == HWY_WASM + #include "hwy/ops/wasm_128-inl.h" + #elif HWY_TARGET == HWY_RVV +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 2021-06-02 10:56:05.276904599 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -29,128 +29,43 @@ + #include + #include + ++#if defined(_WIN32) || defined(_WIN64) ++#ifndef NOMINMAX ++#define NOMINMAX ++#endif // NOMINMAX ++#include ++#endif ++ ++#if defined(__MACH__) ++#include ++#include ++#endif ++ ++#if defined(__HAIKU__) ++#include ++#endif ++ + #include "hwy/base.h" + #if HWY_ARCH_PPC + #include // NOLINT __ppc_get_timebase_freq + #elif HWY_ARCH_X86 + +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + #include + #else + #include // NOLINT +-#endif // _MSC_VER ++#endif // HWY_COMPILER_MSVC + + #endif // HWY_ARCH_X86 + + namespace hwy { +-namespace platform { +-namespace { +- +-#if HWY_ARCH_X86 +- +-void Cpuid(const uint32_t level, const uint32_t count, +- uint32_t* HWY_RESTRICT abcd) { +-#if HWY_COMPILER_MSVC +- int regs[4]; +- __cpuidex(regs, level, count); +- for (int i = 0; i < 4; ++i) { +- abcd[i] = regs[i]; +- } +-#else +- uint32_t a; +- uint32_t b; +- uint32_t c; +- uint32_t d; +- __cpuid_count(level, count, a, b, c, d); +- abcd[0] = a; +- abcd[1] = b; +- abcd[2] = c; +- abcd[3] = d; +-#endif +-} +- +-std::string BrandString() { +- char brand_string[49]; +- std::array abcd; +- +- // Check if brand string is supported (it is on all reasonable Intel/AMD) +- Cpuid(0x80000000U, 0, abcd.data()); +- if (abcd[0] < 0x80000004U) { +- return std::string(); +- } +- +- for (size_t i = 0; i < 3; ++i) { +- Cpuid(0x80000002U + i, 0, abcd.data()); +- memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd)); +- } +- brand_string[48] = 0; +- return brand_string; +-} +- +-// Returns the frequency quoted inside 
the brand string. This does not +-// account for throttling nor Turbo Boost. +-double NominalClockRate() { +- const std::string& brand_string = BrandString(); +- // Brand strings include the maximum configured frequency. These prefixes are +- // defined by Intel CPUID documentation. +- const char* prefixes[3] = {"MHz", "GHz", "THz"}; +- const double multipliers[3] = {1E6, 1E9, 1E12}; +- for (size_t i = 0; i < 3; ++i) { +- const size_t pos_prefix = brand_string.find(prefixes[i]); +- if (pos_prefix != std::string::npos) { +- const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); +- if (pos_space != std::string::npos) { +- const std::string digits = +- brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); +- return std::stod(digits) * multipliers[i]; +- } +- } +- } +- +- return 0.0; +-} +- +-#endif // HWY_ARCH_X86 +- +-} // namespace +- +-// Returns tick rate. Invariant means the tick counter frequency is independent +-// of CPU throttling or sleep. May be expensive, caller should cache the result. +-double InvariantTicksPerSecond() { +-#if HWY_ARCH_PPC +- return __ppc_get_timebase_freq(); +-#elif HWY_ARCH_X86 +- // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. +- return NominalClockRate(); +-#else +- // Fall back to clock_gettime nanoseconds. +- return 1E9; +-#endif +-} +- +-} // namespace platform + namespace { +- +-// Prevents the compiler from eliding the computations that led to "output". +-template +-inline void PreventElision(T&& output) { +-#if HWY_COMPILER_MSVC == 0 +- // Works by indicating to the compiler that "output" is being read and +- // modified. The +r constraint avoids unnecessary writes to memory, but only +- // works for built-in types (typically FuncOutput). +- asm volatile("" : "+r"(output) : : "memory"); +-#else +- // MSVC does not support inline assembly anymore (and never supported GCC's +- // RTL constraints). Self-assignment with #pragma optimize("off") might be +- // expected to prevent elision, but it does not with MSVC 2015. Type-punning +- // with volatile pointers generates inefficient code on MSVC 2017. +- static std::atomic dummy(T{}); +- dummy.store(output, std::memory_order_relaxed); +-#endif +-} +- + namespace timer { + ++// Ticks := platform-specific timer values (CPU cycles on x86). Must be ++// unsigned to guarantee wraparound on overflow. ++using Ticks = uint64_t; ++ + // Start/Stop return absolute timestamps and must be placed immediately before + // and after the region to measure. We provide separate Start/Stop functions + // because they use different fences. +@@ -202,8 +117,8 @@ namespace timer { + + // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, + // divide by InvariantTicksPerSecond. +-inline uint64_t Start64() { +- uint64_t t; ++inline Ticks Start() { ++ Ticks t; + #if HWY_ARCH_PPC + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); + #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC +@@ -228,8 +143,15 @@ inline uint64_t Start64() { + : "rdx", "memory", "cc"); + #elif HWY_ARCH_RVV + asm volatile("rdcycle %0" : "=r"(t)); +-#else +- // Fall back to OS - unsure how to reliably query cntvct_el0 frequency. 
++#elif defined(_WIN32) || defined(_WIN64) ++ LARGE_INTEGER counter; ++ (void)QueryPerformanceCounter(&counter); ++ t = counter.QuadPart; ++#elif defined(__MACH__) ++ t = mach_absolute_time(); ++#elif defined(__HAIKU__) ++ t = system_time_nsecs(); // since boot ++#else // POSIX + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + t = ts.tv_sec * 1000000000LL + ts.tv_nsec; +@@ -237,7 +159,7 @@ inline uint64_t Start64() { + return t; + } + +-inline uint64_t Stop64() { ++inline Ticks Stop() { + uint64_t t; + #if HWY_ARCH_PPC + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +@@ -261,61 +183,7 @@ inline uint64_t Stop64() { + // "cc" = flags modified by SHL. + : "rcx", "rdx", "memory", "cc"); + #else +- t = Start64(); +-#endif +- return t; +-} +- +-// Returns a 32-bit timestamp with about 4 cycles less overhead than +-// Start64. Only suitable for measuring very short regions because the +-// timestamp overflows about once a second. +-inline uint32_t Start32() { +- uint32_t t; +-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC +- _ReadWriteBarrier(); +- _mm_lfence(); +- _ReadWriteBarrier(); +- t = static_cast(__rdtsc()); +- _ReadWriteBarrier(); +- _mm_lfence(); +- _ReadWriteBarrier(); +-#elif HWY_ARCH_X86_64 +- asm volatile( +- "lfence\n\t" +- "rdtsc\n\t" +- "lfence" +- : "=a"(t) +- : +- // "memory" avoids reordering. rdx = TSC >> 32. +- : "rdx", "memory"); +-#elif HWY_ARCH_RVV +- asm volatile("rdcycle %0" : "=r"(t)); +-#else +- t = static_cast(Start64()); +-#endif +- return t; +-} +- +-inline uint32_t Stop32() { +- uint32_t t; +-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC +- _ReadWriteBarrier(); +- unsigned aux; +- t = static_cast(__rdtscp(&aux)); +- _ReadWriteBarrier(); +- _mm_lfence(); +- _ReadWriteBarrier(); +-#elif HWY_ARCH_X86_64 +- // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). +- asm volatile( +- "rdtscp\n\t" +- "lfence" +- : "=a"(t) +- : +- // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. +- : "rcx", "rdx", "memory"); +-#else +- t = static_cast(Stop64()); ++ t = Start(); + #endif + return t; + } +@@ -440,21 +308,130 @@ T MedianAbsoluteDeviation(const T* value + } + + } // namespace robust_statistics ++} // namespace ++namespace platform { ++namespace { + +-// Ticks := platform-specific timer values (CPU cycles on x86). Must be +-// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to +-// read than 64 bit. +-using Ticks = uint32_t; ++// Prevents the compiler from eliding the computations that led to "output". ++template ++inline void PreventElision(T&& output) { ++#if HWY_COMPILER_MSVC == 0 ++ // Works by indicating to the compiler that "output" is being read and ++ // modified. The +r constraint avoids unnecessary writes to memory, but only ++ // works for built-in types (typically FuncOutput). ++ asm volatile("" : "+r"(output) : : "memory"); ++#else ++ // MSVC does not support inline assembly anymore (and never supported GCC's ++ // RTL constraints). Self-assignment with #pragma optimize("off") might be ++ // expected to prevent elision, but it does not with MSVC 2015. Type-punning ++ // with volatile pointers generates inefficient code on MSVC 2017. 
++ static std::atomic dummy(T{}); ++ dummy.store(output, std::memory_order_relaxed); ++#endif ++} ++ ++#if HWY_ARCH_X86 ++ ++void Cpuid(const uint32_t level, const uint32_t count, ++ uint32_t* HWY_RESTRICT abcd) { ++#if HWY_COMPILER_MSVC ++ int regs[4]; ++ __cpuidex(regs, level, count); ++ for (int i = 0; i < 4; ++i) { ++ abcd[i] = regs[i]; ++ } ++#else ++ uint32_t a; ++ uint32_t b; ++ uint32_t c; ++ uint32_t d; ++ __cpuid_count(level, count, a, b, c, d); ++ abcd[0] = a; ++ abcd[1] = b; ++ abcd[2] = c; ++ abcd[3] = d; ++#endif ++} ++ ++std::string BrandString() { ++ char brand_string[49]; ++ std::array abcd; ++ ++ // Check if brand string is supported (it is on all reasonable Intel/AMD) ++ Cpuid(0x80000000U, 0, abcd.data()); ++ if (abcd[0] < 0x80000004U) { ++ return std::string(); ++ } ++ ++ for (size_t i = 0; i < 3; ++i) { ++ Cpuid(static_cast(0x80000002U + i), 0, abcd.data()); ++ memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd)); ++ } ++ brand_string[48] = 0; ++ return brand_string; ++} ++ ++// Returns the frequency quoted inside the brand string. This does not ++// account for throttling nor Turbo Boost. ++double NominalClockRate() { ++ const std::string& brand_string = BrandString(); ++ // Brand strings include the maximum configured frequency. These prefixes are ++ // defined by Intel CPUID documentation. ++ const char* prefixes[3] = {"MHz", "GHz", "THz"}; ++ const double multipliers[3] = {1E6, 1E9, 1E12}; ++ for (size_t i = 0; i < 3; ++i) { ++ const size_t pos_prefix = brand_string.find(prefixes[i]); ++ if (pos_prefix != std::string::npos) { ++ const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); ++ if (pos_space != std::string::npos) { ++ const std::string digits = ++ brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); ++ return std::stod(digits) * multipliers[i]; ++ } ++ } ++ } ++ ++ return 0.0; ++} ++ ++#endif // HWY_ARCH_X86 ++ ++} // namespace ++ ++double InvariantTicksPerSecond() { ++#if HWY_ARCH_PPC ++ return __ppc_get_timebase_freq(); ++#elif HWY_ARCH_X86 ++ // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. ++ return NominalClockRate(); ++#elif defined(_WIN32) || defined(_WIN64) ++ LARGE_INTEGER freq; ++ (void)QueryPerformanceFrequency(&freq); ++ return double(freq.QuadPart); ++#elif defined(__MACH__) ++ // https://developer.apple.com/library/mac/qa/qa1398/_index.html ++ mach_timebase_info_data_t timebase; ++ (void)mach_timebase_info(&timebase); ++ return double(timebase.denom) / timebase.numer * 1E9; ++#else ++ // TODO(janwas): ARM? Unclear how to reliably query cntvct_el0 frequency. ++ return 1E9; // Haiku and clock_gettime return nanoseconds. ++#endif ++} + +-// Returns timer overhead / minimum measurable difference. +-Ticks TimerResolution() { ++double Now() { ++ static const double mul = 1.0 / InvariantTicksPerSecond(); ++ return static_cast(timer::Start()) * mul; ++} ++ ++uint64_t TimerResolution() { + // Nested loop avoids exceeding stack/L1 capacity. 
+- Ticks repetitions[Params::kTimerSamples]; ++ timer::Ticks repetitions[Params::kTimerSamples]; + for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) { +- Ticks samples[Params::kTimerSamples]; ++ timer::Ticks samples[Params::kTimerSamples]; + for (size_t i = 0; i < Params::kTimerSamples; ++i) { +- const Ticks t0 = timer::Start32(); +- const Ticks t1 = timer::Stop32(); ++ const timer::Ticks t0 = timer::Start(); ++ const timer::Ticks t1 = timer::Stop(); + samples[i] = t1 - t0; + } + repetitions[rep] = robust_statistics::Mode(samples); +@@ -462,18 +439,21 @@ Ticks TimerResolution() { + return robust_statistics::Mode(repetitions); + } + +-static const Ticks timer_resolution = TimerResolution(); ++} // namespace platform ++namespace { ++ ++static const timer::Ticks timer_resolution = platform::TimerResolution(); + + // Estimates the expected value of "lambda" values with a variable number of + // samples until the variability "rel_mad" is less than "max_rel_mad". + template +-Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, +- const Params& p, const Lambda& lambda) { ++timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, ++ const Params& p, const Lambda& lambda) { + // Choose initial samples_per_eval based on a single estimated duration. +- Ticks t0 = timer::Start32(); ++ timer::Ticks t0 = timer::Start(); + lambda(); +- Ticks t1 = timer::Stop32(); +- Ticks est = t1 - t0; ++ timer::Ticks t1 = timer::Stop(); ++ timer::Ticks est = t1 - t0; + static const double ticks_per_second = platform::InvariantTicksPerSecond(); + const size_t ticks_per_eval = + static_cast(ticks_per_second * p.seconds_per_eval); +@@ -481,21 +461,21 @@ Ticks SampleUntilStable(const double max + est == 0 ? p.min_samples_per_eval : ticks_per_eval / est; + samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval); + +- std::vector samples; ++ std::vector samples; + samples.reserve(1 + samples_per_eval); + samples.push_back(est); + + // Percentage is too strict for tiny differences, so also allow a small + // absolute "median absolute deviation". +- const Ticks max_abs_mad = (timer_resolution + 99) / 100; ++ const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100; + *rel_mad = 0.0; // ensure initialized + + for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) { + samples.reserve(samples.size() + samples_per_eval); + for (size_t i = 0; i < samples_per_eval; ++i) { +- t0 = timer::Start32(); ++ t0 = timer::Start(); + lambda(); +- t1 = timer::Stop32(); ++ t1 = timer::Stop(); + samples.push_back(t1 - t0); + } + +@@ -508,14 +488,14 @@ Ticks SampleUntilStable(const double max + NANOBENCHMARK_CHECK(est != 0); + + // Median absolute deviation (mad) is a robust measure of 'variability'. 
+- const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( ++ const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( + samples.data(), samples.size(), est); +- *rel_mad = static_cast(int(abs_mad)) / est; ++ *rel_mad = static_cast(abs_mad) / static_cast(est); + + if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) { + if (p.verbose) { +- printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n", +- samples.size(), est, abs_mad, *rel_mad * 100.0); ++ printf("%6zu samples => %5zu (abs_mad=%4zu, rel_mad=%4.2f%%)\n", ++ samples.size(), size_t(est), size_t(abs_mad), *rel_mad * 100.0); + } + return est; + } +@@ -539,29 +519,17 @@ InputVec UniqueInputs(const FuncInput* i + return unique; + } + +-// Returns how often we need to call func for sufficient precision, or zero +-// on failure (e.g. the elapsed time is too long for a 32-bit tick count). ++// Returns how often we need to call func for sufficient precision. + size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, + const Params& p) { + // Min elapsed ticks for any input. +- Ticks min_duration = ~0u; ++ timer::Ticks min_duration = ~timer::Ticks(0); + + for (const FuncInput input : unique) { +- // Make sure a 32-bit timer is sufficient. +- const uint64_t t0 = timer::Start64(); +- PreventElision(func(arg, input)); +- const uint64_t t1 = timer::Stop64(); +- const uint64_t elapsed = t1 - t0; +- if (elapsed >= (1ULL << 30)) { +- fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n", +- input); +- return 0; +- } +- + double rel_mad; +- const Ticks total = SampleUntilStable( ++ const timer::Ticks total = SampleUntilStable( + p.target_rel_mad, &rel_mad, p, +- [func, arg, input]() { PreventElision(func(arg, input)); }); ++ [func, arg, input]() { platform::PreventElision(func(arg, input)); }); + min_duration = std::min(min_duration, total - timer_resolution); + } + +@@ -571,8 +539,8 @@ size_t NumSkip(const Func func, const ui + const size_t num_skip = + min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration; + if (p.verbose) { +- printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution, +- max_skip, min_duration, num_skip); ++ printf("res=%zu max_skip=%zu min_dur=%zu num_skip=%zu\n", ++ size_t(timer_resolution), max_skip, size_t(min_duration), num_skip); + } + return num_skip; + } +@@ -637,13 +605,14 @@ void FillSubset(const InputVec& full, co + } + + // Returns total ticks elapsed for all inputs. +-Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs, +- const Params& p, double* max_rel_mad) { ++timer::Ticks TotalDuration(const Func func, const uint8_t* arg, ++ const InputVec* inputs, const Params& p, ++ double* max_rel_mad) { + double rel_mad; +- const Ticks duration = ++ const timer::Ticks duration = + SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() { + for (const FuncInput input : *inputs) { +- PreventElision(func(arg, input)); ++ platform::PreventElision(func(arg, input)); + } + }); + *max_rel_mad = std::max(*max_rel_mad, rel_mad); +@@ -657,19 +626,20 @@ HWY_NOINLINE FuncOutput EmptyFunc(const + + // Returns overhead of accessing inputs[] and calling a function; this will + // be deducted from future TotalDuration return values. +-Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) { ++timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs, ++ const Params& p) { + double rel_mad; + // Zero tolerance because repeatability is crucial and EmptyFunc is fast. 
+ return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() { + for (const FuncInput input : *inputs) { +- PreventElision(EmptyFunc(arg, input)); ++ platform::PreventElision(EmptyFunc(arg, input)); + } + }); + } + + } // namespace + +-int Unpredictable1() { return timer::Start64() != ~0ULL; } ++int Unpredictable1() { return timer::Start() != ~0ULL; } + + size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, + const size_t num_inputs, Result* results, const Params& p) { +@@ -685,32 +655,35 @@ size_t Measure(const Func func, const ui + ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p); + InputVec subset(full.size() - num_skip); + +- const Ticks overhead = Overhead(arg, &full, p); +- const Ticks overhead_skip = Overhead(arg, &subset, p); ++ const timer::Ticks overhead = Overhead(arg, &full, p); ++ const timer::Ticks overhead_skip = Overhead(arg, &subset, p); + if (overhead < overhead_skip) { +- fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead, +- overhead_skip); ++ fprintf(stderr, "Measurement failed: overhead %zu < %zu\n", ++ size_t(overhead), size_t(overhead_skip)); + return 0; + } + + if (p.verbose) { +- printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(), +- overhead, overhead_skip); ++ printf("#inputs=%5zu,%5zu overhead=%5zu,%5zu\n", full.size(), subset.size(), ++ size_t(overhead), size_t(overhead_skip)); + } + + double max_rel_mad = 0.0; +- const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); ++ const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); + + for (size_t i = 0; i < unique.size(); ++i) { + FillSubset(full, unique[i], num_skip, &subset); +- const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad); ++ const timer::Ticks total_skip = ++ TotalDuration(func, arg, &subset, p, &max_rel_mad); + + if (total < total_skip) { +- fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip); ++ fprintf(stderr, "Measurement failed: total %zu < %zu\n", size_t(total), ++ size_t(total_skip)); + return 0; + } + +- const Ticks duration = (total - overhead) - (total_skip - overhead_skip); ++ const timer::Ticks duration = ++ (total - overhead) - (total_skip - overhead_skip); + results[i].input = unique[i]; + results[i].ticks = static_cast(duration) * mul; + results[i].variability = static_cast(max_rel_mad); +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 2021-06-02 10:56:05.272904579 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h 2021-05-31 10:37:11.000000000 -0400 +@@ -44,11 +44,6 @@ + // central tendency of the measurement samples with the "half sample mode", + // which is more robust to outliers and skewed data than the mean or median. + +-// WARNING if included from multiple translation units compiled with distinct +-// flags: this header requires textual inclusion and a predefined NB_NAMESPACE +-// macro that is unique to the current compile flags. We must also avoid +-// standard library headers such as vector and functional that define functions. +- + #include + #include + +@@ -79,6 +74,16 @@ namespace platform { + // This call may be expensive, callers should cache the result. 
+ double InvariantTicksPerSecond(); + ++// Returns current timestamp [in seconds] relative to an unspecified origin. ++// Features: monotonic (no negative elapsed time), steady (unaffected by system ++// time changes), high-resolution (on the order of microseconds). ++double Now(); ++ ++// Returns ticks elapsed in back to back timer calls, i.e. a function of the ++// timer resolution (minimum measurable difference) and overhead. ++// This call is expensive, callers should cache the result. ++uint64_t TimerResolution(); ++ + } // namespace platform + + // Returns 1, but without the compiler knowing what the value is. This prevents +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 2021-06-02 10:56:05.275904594 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -15,11 +15,11 @@ + #include "hwy/nanobenchmark.h" + + #include +-#include // strtol +-#include // sleep + + #include + ++#include "hwy/tests/test_util-inl.h" ++ + namespace hwy { + namespace { + +@@ -31,6 +31,7 @@ FuncOutput Div(const void*, FuncInput in + + template + void MeasureDiv(const FuncInput (&inputs)[N]) { ++ printf("Measuring integer division (output on final two lines)\n"); + Result results[N]; + Params params; + params.max_evals = 4; // avoid test timeout +@@ -66,39 +67,14 @@ void MeasureRandom(const FuncInput (&inp + } + } + +-template +-void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) { +- printf("Expect a 'measurement failed' below:\n"); +- Result results[N]; +- +- const size_t num_results = Measure( +- [](const void*, const FuncInput input) -> FuncOutput { +- // Loop until the sleep succeeds (not interrupted by signal). We assume +- // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit. +- while (sleep(2) != 0) { +- } +- return input; +- }, +- nullptr, inputs, N, results); +- NANOBENCHMARK_CHECK(num_results == 0); +- (void)num_results; +-} +- +-void RunAll(const int argc, char** /*argv*/) { +- // unpredictable == 1 but the compiler doesn't know that. +- const int unpredictable = argc != 999; ++TEST(NanobenchmarkTest, RunAll) { ++ const int unpredictable = Unpredictable1(); // == 1, unknown to compiler. 
+ static const FuncInput inputs[] = {static_cast(unpredictable) + 2, + static_cast(unpredictable + 9)}; + + MeasureDiv(inputs); + MeasureRandom(inputs); +- EnsureLongMeasurementFails(inputs); + } + + } // namespace + } // namespace hwy +- +-int main(int argc, char* argv[]) { +- hwy::RunAll(argc, argv); +- return 0; +-} +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 2021-06-02 10:56:05.239904412 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -26,6 +26,8 @@ HWY_BEFORE_NAMESPACE(); + namespace hwy { + namespace HWY_NAMESPACE { + ++namespace detail { // for code folding and Raw128 ++ + // Macros used to define single and double function calls for multiple types + // for full and half vectors. These macros are undefined at the end of the file. + +@@ -133,7 +135,7 @@ namespace HWY_NAMESPACE { + HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args) + + // float and double +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \ + HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \ +@@ -181,7 +183,7 @@ namespace HWY_NAMESPACE { + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) + + // Emulation of some intrinsics on armv7. +-#if !defined(__aarch64__) ++#if HWY_ARCH_ARM_V7 + #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0] + #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0] + #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0] +@@ -294,7 +296,7 @@ struct Raw128 { + using type = float32x4_t; + }; + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + template <> + struct Raw128 { + using type = float64x2_t; +@@ -352,7 +354,7 @@ struct Raw128 { + using type = float32x2_t; + }; + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + template <> + struct Raw128 { + using type = float64x1_t; +@@ -437,12 +439,14 @@ struct Raw128 { + using type = int8x8_t; + }; + ++} // namespace detail ++ + template + using Full128 = Simd; + + template + class Vec128 { +- using Raw = typename Raw128::type; ++ using Raw = typename detail::Raw128::type; + + public: + HWY_INLINE Vec128() {} +@@ -480,7 +484,8 @@ class Vec128 { + // FF..FF or 0, also for floating-point - see README. + template + class Mask128 { +- using Raw = typename Raw128::type; ++ // ARM C Language Extensions return and expect unsigned type. 
++ using Raw = typename detail::Raw128, N>::type; + + public: + HWY_INLINE Mask128() {} +@@ -573,7 +578,7 @@ HWY_INLINE Vec128 BitCastFro + Vec128 v) { + return Vec128(vreinterpret_s64_u8(v.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128(vreinterpret_f64_u8(v.raw)); +@@ -615,7 +620,7 @@ HWY_INLINE Vec128 BitCastFromBy + return Vec128(vreinterpretq_s64_u8(v.raw)); + } + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_f64_u8(v.raw)); +@@ -664,15 +669,25 @@ template + HWY_INLINE Vec128 Undefined(Simd /*d*/) { + HWY_DIAGNOSTICS(push) + HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") +- typename Raw128::type a; ++ typename detail::Raw128::type a; + return Vec128(a); + HWY_DIAGNOSTICS(pop) + } + +-// ------------------------------ Extract lane ++// Returns a vector with lane i=[0, N) set to "first" + i. ++template ++Vec128 Iota(const Simd d, const T2 first) { ++ HWY_ALIGN T lanes[16 / sizeof(T)]; ++ for (size_t i = 0; i < 16 / sizeof(T); ++i) { ++ lanes[i] = static_cast(first + static_cast(i)); ++ } ++ return Load(d, lanes); ++} ++ ++// ------------------------------ GetLane + + HWY_INLINE uint8_t GetLane(const Vec128 v) { +- return vget_lane_u8(vget_low_u8(v.raw), 0); ++ return vgetq_lane_u8(v.raw, 0); + } + template + HWY_INLINE uint8_t GetLane(const Vec128 v) { +@@ -680,7 +695,7 @@ HWY_INLINE uint8_t GetLane(const Vec128< + } + + HWY_INLINE int8_t GetLane(const Vec128 v) { +- return vget_lane_s8(vget_low_s8(v.raw), 0); ++ return vgetq_lane_s8(v.raw, 0); + } + template + HWY_INLINE int8_t GetLane(const Vec128 v) { +@@ -688,7 +703,7 @@ HWY_INLINE int8_t GetLane(const Vec128 v) { +- return vget_lane_u16(vget_low_u16(v.raw), 0); ++ return vgetq_lane_u16(v.raw, 0); + } + template + HWY_INLINE uint16_t GetLane(const Vec128 v) { +@@ -696,7 +711,7 @@ HWY_INLINE uint16_t GetLane(const Vec128 + } + + HWY_INLINE int16_t GetLane(const Vec128 v) { +- return vget_lane_s16(vget_low_s16(v.raw), 0); ++ return vgetq_lane_s16(v.raw, 0); + } + template + HWY_INLINE int16_t GetLane(const Vec128 v) { +@@ -704,7 +719,7 @@ HWY_INLINE int16_t GetLane(const Vec128< + } + + HWY_INLINE uint32_t GetLane(const Vec128 v) { +- return vget_lane_u32(vget_low_u32(v.raw), 0); ++ return vgetq_lane_u32(v.raw, 0); + } + template + HWY_INLINE uint32_t GetLane(const Vec128 v) { +@@ -712,7 +727,7 @@ HWY_INLINE uint32_t GetLane(const Vec128 + } + + HWY_INLINE int32_t GetLane(const Vec128 v) { +- return vget_lane_s32(vget_low_s32(v.raw), 0); ++ return vgetq_lane_s32(v.raw, 0); + } + template + HWY_INLINE int32_t GetLane(const Vec128 v) { +@@ -720,20 +735,20 @@ HWY_INLINE int32_t GetLane(const Vec128< + } + + HWY_INLINE uint64_t GetLane(const Vec128 v) { +- return vget_lane_u64(vget_low_u64(v.raw), 0); ++ return vgetq_lane_u64(v.raw, 0); + } + HWY_INLINE uint64_t GetLane(const Vec128 v) { + return vget_lane_u64(v.raw, 0); + } + HWY_INLINE int64_t GetLane(const Vec128 v) { +- return vget_lane_s64(vget_low_s64(v.raw), 0); ++ return vgetq_lane_s64(v.raw, 0); + } + HWY_INLINE int64_t GetLane(const Vec128 v) { + return vget_lane_s64(v.raw, 0); + } + + HWY_INLINE float GetLane(const Vec128 v) { +- return vget_lane_f32(vget_low_f32(v.raw), 0); ++ return vgetq_lane_f32(v.raw, 0); + } + HWY_INLINE float GetLane(const Vec128 v) { + return vget_lane_f32(v.raw, 0); +@@ -741,9 +756,9 @@ HWY_INLINE float GetLane(const Vec128 v) { + return 
vget_lane_f32(v.raw, 0); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE double GetLane(const Vec128 v) { +- return vget_lane_f64(vget_low_f64(v.raw), 0); ++ return vgetq_lane_f64(v.raw, 0); + } + HWY_INLINE double GetLane(const Vec128 v) { + return vget_lane_f64(v.raw, 0); +@@ -785,8 +800,6 @@ HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSu + // ------------------------------ Average + + // Returns (a + b + 1) / 2 +- +-// Unsigned + HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) + HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) + +@@ -802,6 +815,7 @@ HWY_INLINE Vec128 Abs(const Vec + HWY_INLINE Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_s32(v.raw)); + } ++// i64 is implemented after BroadcastSignBit. + HWY_INLINE Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_f32(v.raw)); + } +@@ -823,7 +837,7 @@ HWY_INLINE Vec128 Abs(const Ve + return Vec128(vabs_f32(v.raw)); + } + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_f64(v.raw)); + } +@@ -839,7 +853,7 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vn + HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below + + HWY_INLINE Vec128 Neg(const Vec128 v) { +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return Vec128(vneg_s64(v.raw)); + #else + return Zero(Simd()) - v; +@@ -847,7 +861,7 @@ HWY_INLINE Vec128 Neg(const + } + + HWY_INLINE Vec128 Neg(const Vec128 v) { +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return Vec128(vnegq_s64(v.raw)); + #else + return Zero(Full128()) - v; +@@ -876,6 +890,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, v + + // ------------------------------ Shl + ++HWY_INLINE Vec128 operator<<(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw))); ++} ++template ++HWY_INLINE Vec128 operator<<(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw))); ++} ++ + HWY_INLINE Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw))); +@@ -905,6 +929,16 @@ HWY_INLINE Vec128 operator< + return Vec128(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); + } + ++HWY_INLINE Vec128 operator<<(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshlq_s8(v.raw, bits.raw)); ++} ++template ++HWY_INLINE Vec128 operator<<(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshl_s8(v.raw, bits.raw)); ++} ++ + HWY_INLINE Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s16(v.raw, bits.raw)); +@@ -936,6 +970,18 @@ HWY_INLINE Vec128 operator<< + + // ------------------------------ Shr (Neg) + ++HWY_INLINE Vec128 operator>>(const Vec128 v, ++ const Vec128 bits) { ++ const int8x16_t neg_bits = Neg(BitCast(Full128(), bits)).raw; ++ return Vec128(vshlq_u8(v.raw, neg_bits)); ++} ++template ++HWY_INLINE Vec128 operator>>(const Vec128 v, ++ const Vec128 bits) { ++ const int8x8_t neg_bits = Neg(BitCast(Simd(), bits)).raw; ++ return Vec128(vshl_u8(v.raw, neg_bits)); ++} ++ + HWY_INLINE Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int16x8_t neg_bits = Neg(BitCast(Full128(), bits)).raw; +@@ -971,6 +1017,16 @@ HWY_INLINE Vec128 operator> + return Vec128(vshl_u64(v.raw, neg_bits)); + } + ++HWY_INLINE Vec128 operator>>(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshlq_s8(v.raw, Neg(bits).raw)); ++} ++template ++HWY_INLINE Vec128 operator>>(const Vec128 v, ++ const Vec128 bits) { ++ return 
Vec128(vshl_s8(v.raw, Neg(bits).raw)); ++} ++ + HWY_INLINE Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s16(v.raw, Neg(bits).raw)); +@@ -1059,7 +1115,7 @@ HWY_INLINE Vec128 operator*( + HWY_INLINE Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + int32x4_t rhi = vmull_high_s16(a.raw, b.raw); + #else + int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); +@@ -1070,7 +1126,7 @@ HWY_INLINE Vec128 MulHigh(const + HWY_INLINE Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); + #else + uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); +@@ -1139,24 +1195,37 @@ HWY_INLINE Vec128 ApproximateR + return Vec128(vrecpe_f32(v.raw)); + } + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) + #else +-// Emulated with approx reciprocal + Newton-Raphson + mul ++// Not defined on armv7: approximate ++namespace detail { ++ ++HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( ++ const Vec128 recip, const Vec128 divisor) { ++ return Vec128(vrecpsq_f32(recip.raw, divisor.raw)); ++} ++template ++HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( ++ const Vec128 recip, Vec128 divisor) { ++ return Vec128(vrecps_f32(recip.raw, divisor.raw)); ++} ++ ++} // namespace detail ++ + template + HWY_INLINE Vec128 operator/(const Vec128 a, + const Vec128 b) { + auto x = ApproximateReciprocal(b); +- // Newton-Raphson on 1/x - b +- const auto two = Set(Simd(), 2); +- x = x * (two - b * x); +- x = x * (two - b * x); +- x = x * (two - b * x); ++ x *= detail::ReciprocalNewtonRaphsonStep(x, b); ++ x *= detail::ReciprocalNewtonRaphsonStep(x, b); ++ x *= detail::ReciprocalNewtonRaphsonStep(x, b); + return a * x; + } + #endif + +-// Absolute value of difference. ++// ------------------------------ Absolute value of difference. ++ + HWY_INLINE Vec128 AbsDiff(const Vec128 a, const Vec128 b) { + return Vec128(vabdq_f32(a.raw, b.raw)); + } +@@ -1169,7 +1238,7 @@ HWY_INLINE Vec128 AbsDiff(cons + // ------------------------------ Floating-point multiply-add variants + + // Returns add + mul * x +-#if defined(__aarch64__) ++#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 + template + HWY_INLINE Vec128 MulAdd(const Vec128 mul, + const Vec128 x, +@@ -1180,6 +1249,17 @@ HWY_INLINE Vec128 MulAdd(const Ve + const Vec128 add) { + return Vec128(vfmaq_f32(add.raw, mul.raw, x.raw)); + } ++#else ++// Emulate FMA for floats. ++template ++HWY_INLINE Vec128 MulAdd(const Vec128 mul, ++ const Vec128 x, ++ const Vec128 add) { ++ return mul * x + add; ++} ++#endif ++ ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 MulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { +@@ -1190,18 +1270,10 @@ HWY_INLINE Vec128 MulAdd(const V + const Vec128 add) { + return Vec128(vfmaq_f64(add.raw, mul.raw, x.raw)); + } +-#else +-// Emulate FMA for floats. 
+-template +-HWY_INLINE Vec128 MulAdd(const Vec128 mul, +- const Vec128 x, +- const Vec128 add) { +- return mul * x + add; +-} + #endif + + // Returns add - mul * x +-#if defined(__aarch64__) ++#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 + template + HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, +@@ -1213,7 +1285,17 @@ HWY_INLINE Vec128 NegMulAdd(const + const Vec128 add) { + return Vec128(vfmsq_f32(add.raw, mul.raw, x.raw)); + } ++#else ++// Emulate FMA for floats. ++template ++HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, ++ const Vec128 x, ++ const Vec128 add) { ++ return add - mul * x; ++} ++#endif + ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { +@@ -1224,14 +1306,6 @@ HWY_INLINE Vec128 NegMulAdd(cons + const Vec128 add) { + return Vec128(vfmsq_f64(add.raw, mul.raw, x.raw)); + } +-#else +-// Emulate FMA for floats. +-template +-HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, +- const Vec128 x, +- const Vec128 add) { +- return add - mul * x; +-} + #endif + + // Returns mul * x - sub +@@ -1241,12 +1315,6 @@ HWY_INLINE Vec128 MulSub(const + const Vec128 sub) { + return MulAdd(mul, x, Neg(sub)); + } +-template +-HWY_INLINE Vec128 MulSub(const Vec128 mul, +- const Vec128 x, +- const Vec128 sub) { +- return MulAdd(mul, x, Neg(sub)); +-} + + // Returns -mul * x - sub + template +@@ -1255,14 +1323,23 @@ HWY_INLINE Vec128 NegMulSub(co + const Vec128 sub) { + return Neg(MulAdd(mul, x, sub)); + } ++ ++#if HWY_ARCH_ARM_A64 ++template ++HWY_INLINE Vec128 MulSub(const Vec128 mul, ++ const Vec128 x, ++ const Vec128 sub) { ++ return MulAdd(mul, x, Neg(sub)); ++} + template + HWY_INLINE Vec128 NegMulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { + return Neg(MulAdd(mul, x, sub)); + } ++#endif + +-// ------------------------------ Floating-point square root ++// ------------------------------ Floating-point square root (IfThenZeroElse) + + // Approximate reciprocal square root + HWY_INLINE Vec128 ApproximateReciprocalSqrt(const Vec128 v) { +@@ -1275,80 +1352,36 @@ HWY_INLINE Vec128 ApproximateR + } + + // Full precision square root +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) + #else +-// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt. +-template +-HWY_INLINE Vec128 Sqrt(const Vec128 v) { +- auto b = v; +- auto Y = ApproximateReciprocalSqrt(v); +- auto x = v * Y; +- const auto half = Set(Simd(), 0.5); +- const auto oneandhalf = Set(Simd(), 1.5); +- for (size_t i = 0; i < 3; i++) { +- b = b * Y * Y; +- Y = oneandhalf - half * b; +- x = x * Y; +- } +- return IfThenZeroElse(v == Zero(Simd()), x); +-} +-#endif +- +-// ================================================== COMPARE +- +-// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
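// Illustration (minimal scalar sketch, not NEON code): the ARMv7 operator/
// fallback above starts from the rough 1/b estimate that vrecpe_f32 provides
// and refines it, each vrecps step computing (2 - b*x), i.e. one
// Newton-Raphson iteration x = x*(2 - b*x) with quadratic convergence.
// The starting estimate and values below are illustrative only.
#include <cstdio>

static float RefineReciprocal(float b, float x, int steps) {
  for (int i = 0; i < steps; ++i) {
    x = x * (2.0f - b * x);  // one Newton-Raphson step for f(x) = 1/x - b
  }
  return x;
}

int main() {
  const float a = 2.0f, b = 3.7f;
  float x = 0.25f;  // stand-in for the low-precision vrecpe estimate
  x = RefineReciprocal(b, x, 3);
  std::printf("a/b ~= %g (exact %g)\n", a * x, a / b);
  return 0;
}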
++namespace detail { + +-template +-HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { +- static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); +- return Mask128{m.raw}; ++HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, ++ const Vec128 recip) { ++ return Vec128(vrsqrtsq_f32(root.raw, recip.raw)); ++} ++template ++HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, ++ Vec128 recip) { ++ return Vec128(vrsqrts_f32(root.raw, recip.raw)); + } + +-#define HWY_NEON_BUILD_TPL_HWY_COMPARE +-#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 +-#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ +- const Vec128 a, const Vec128 b +-#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw +- +-// ------------------------------ Equality +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) +-#if defined(__aarch64__) +-HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) +-#else +-// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. +-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) +-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) +-#endif ++} // namespace detail + +-// ------------------------------ Strict inequality ++// Not defined on armv7: approximate ++template ++HWY_INLINE Vec128 Sqrt(const Vec128 v) { ++ auto recip = ApproximateReciprocalSqrt(v); + +-// Signed/float < (no unsigned) +-#if defined(__aarch64__) +-HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) +-#else +-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) +-#endif +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) ++ recip *= detail::ReciprocalSqrtStep(v * recip, recip); ++ recip *= detail::ReciprocalSqrtStep(v * recip, recip); ++ recip *= detail::ReciprocalSqrtStep(v * recip, recip); + +-// Signed/float > (no unsigned) +-#if defined(__aarch64__) +-HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE) +-#else +-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE) ++ const auto root = v * recip; ++ return IfThenZeroElse(v == Zero(Simd()), root); ++} + #endif +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE) +- +-// ------------------------------ Weak inequality +- +-// Float <= >= +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE) +- +-#undef HWY_NEON_BUILD_TPL_HWY_COMPARE +-#undef HWY_NEON_BUILD_RET_HWY_COMPARE +-#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE +-#undef HWY_NEON_BUILD_ARG_HWY_COMPARE + + // ================================================== LOGICAL + +@@ -1357,13 +1390,16 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operato + // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. 
+ template + HWY_INLINE Vec128 Not(const Vec128 v) { +- const Full128 d8; +- return Vec128(vmvnq_u8(BitCast(d8, v).raw)); ++ const Full128 d; ++ const Repartition d8; ++ return BitCast(d, Vec128(vmvnq_u8(BitCast(d8, v).raw))); + } + template + HWY_INLINE Vec128 Not(const Vec128 v) { +- const Repartition> d8; +- return Vec128(vmvn_u8(BitCast(d8, v).raw)); ++ const Simd d; ++ const Repartition d8; ++ using V8 = decltype(Zero(d8)); ++ return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); + } + + // ------------------------------ And +@@ -1463,33 +1499,38 @@ HWY_API Vec128 BroadcastSignBit(co + return ShiftRight(v); + } + +-// ------------------------------ Make mask ++// ================================================== MASK + +-template +-HWY_INLINE Mask128 TestBit(Vec128 v, Vec128 bit) { +- static_assert(!hwy::IsFloat(), "Only integer vectors supported"); +- return (v & bit) == bit; +-} ++// ------------------------------ To/from vector + +-// Mask and Vec are the same (true = FF..FF). ++// Mask and Vec have the same representation (true = FF..FF). + template + HWY_INLINE Mask128 MaskFromVec(const Vec128 v) { +- return Mask128(v.raw); ++ const Simd, N> du; ++ return Mask128(BitCast(du, v).raw); + } + ++// DEPRECATED + template + HWY_INLINE Vec128 VecFromMask(const Mask128 v) { +- return Vec128(v.raw); ++ return BitCast(Simd(), Vec128, N>(v.raw)); + } + + template +-HWY_INLINE Vec128 VecFromMask(Simd /* tag */, +- const Mask128 v) { +- return Vec128(v.raw); ++HWY_INLINE Vec128 VecFromMask(Simd d, const Mask128 v) { ++ return BitCast(d, Vec128, N>(v.raw)); + } + +-// IfThenElse(mask, yes, no) +-// Returns mask ? b : a. ++// ------------------------------ RebindMask ++ ++template ++HWY_API Mask128 RebindMask(Simd dto, Mask128 m) { ++ static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); ++ return MaskFromVec(BitCast(dto, VecFromMask(Simd(), m))); ++} ++ ++// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a. ++ + #define HWY_NEON_BUILD_TPL_HWY_IF + #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128 + #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ +@@ -1524,7 +1565,6 @@ HWY_INLINE Vec128 ZeroIfNegative(V + return Max(zero, v); + } + +- + // ------------------------------ Mask logical + + template +@@ -1557,30 +1597,183 @@ HWY_API Mask128 Xor(const Mask128< + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); + } + +-// ------------------------------ Min (IfThenElse, BroadcastSignBit) ++// ================================================== COMPARE + +-namespace detail { ++// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
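// Illustration (scalar sketch): with the all-ones / all-zeros mask
// representation noted above, a per-lane select such as IfThenElse can be
// pictured as a plain bitwise blend of the two inputs with the mask.
// Values below are illustrative only.
#include <cstdint>
#include <cstdio>

static uint32_t SelectBits(uint32_t mask_lane, uint32_t yes, uint32_t no) {
  return (yes & mask_lane) | (no & ~mask_lane);  // mask bits pick 'yes' bits
}

int main() {
  std::printf("%u %u\n", SelectBits(0xFFFFFFFFu, 7u, 9u), SelectBits(0u, 7u, 9u));
  return 0;
}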
+ +-#if defined(__aarch64__) ++// ------------------------------ Shuffle2301 (for i64 compares) + +-HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { +- return Vec128(vcgtq_u64(a.raw, b.raw)); ++// Swap 32-bit halves in 64-bits ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64_u32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64_s32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64_f32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64q_u32(v.raw)); + } +-HWY_INLINE Vec128 Gt(Vec128 a, +- Vec128 b) { +- return Vec128(vcgt_u64(a.raw, b.raw)); ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64q_s32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64q_f32(v.raw)); + } + +-HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { +- return Vec128(vcgtq_s64(a.raw, b.raw)); ++#define HWY_NEON_BUILD_TPL_HWY_COMPARE ++#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 ++#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ ++ const Vec128 a, const Vec128 b ++#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw ++ ++// ------------------------------ Equality ++HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) ++#if HWY_ARCH_ARM_A64 ++HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) ++#else ++// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. ++HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) ++HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) ++#endif ++ ++// ------------------------------ Strict inequality (signed, float) ++#if HWY_ARCH_ARM_A64 ++HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) ++#else ++HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) ++#endif ++HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) ++ ++// ------------------------------ Weak inequality (float) ++HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) ++ ++#undef HWY_NEON_BUILD_TPL_HWY_COMPARE ++#undef HWY_NEON_BUILD_RET_HWY_COMPARE ++#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE ++#undef HWY_NEON_BUILD_ARG_HWY_COMPARE ++ ++// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq) ++ ++#if HWY_ARCH_ARM_V7 ++ ++template ++HWY_INLINE Mask128 operator==(const Vec128 a, ++ const Vec128 b) { ++ const Simd d32; ++ const Simd d64; ++ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); ++ const auto cmp64 = cmp32 & Shuffle2301(cmp32); ++ return MaskFromVec(BitCast(d64, cmp64)); ++} ++ ++template ++HWY_INLINE Mask128 operator==(const Vec128 a, ++ const Vec128 b) { ++ const Simd d32; ++ const Simd d64; ++ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); ++ const auto cmp64 = cmp32 & Shuffle2301(cmp32); ++ return MaskFromVec(BitCast(d64, cmp64)); + } +-HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { +- return Vec128(vcgt_s64(a.raw, b.raw)); ++ ++HWY_INLINE Mask128 operator<(const Vec128 a, ++ const Vec128 b) { ++ const int64x2_t sub = vqsubq_s64(a.raw, b.raw); ++ return MaskFromVec(BroadcastSignBit(Vec128(sub))); ++} ++HWY_INLINE Mask128 operator<(const Vec128 a, ++ const Vec128 b) { ++ const int64x1_t sub = vqsub_s64(a.raw, b.raw); ++ return MaskFromVec(BroadcastSignBit(Vec128(sub))); + } + + #endif + +-} // namespace detail ++// ------------------------------ Reversed comparisons ++ ++template ++HWY_API Mask128 operator>(Vec128 a, Vec128 b) { ++ return 
operator<(b, a); ++} ++template ++HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { ++ return operator<=(b, a); ++} ++ ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask128 FirstN(const Simd d, size_t num) { ++ const RebindToSigned di; // Signed comparisons are cheaper. ++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); ++} ++ ++// ------------------------------ TestBit (Eq) ++ ++#define HWY_NEON_BUILD_TPL_HWY_TESTBIT ++#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128 ++#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ ++ Vec128 v, Vec128 bit ++#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw ++ ++#if HWY_ARCH_ARM_A64 ++HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) ++#else ++// No 64-bit versions on armv7 ++HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) ++HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) ++ ++template ++HWY_INLINE Mask128 TestBit(Vec128 v, ++ Vec128 bit) { ++ return (v & bit) == bit; ++} ++template ++HWY_INLINE Mask128 TestBit(Vec128 v, ++ Vec128 bit) { ++ return (v & bit) == bit; ++} ++ ++#endif ++#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT ++#undef HWY_NEON_BUILD_RET_HWY_TESTBIT ++#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT ++#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT ++ ++// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) ++HWY_INLINE Vec128 Abs(const Vec128 v) { ++#if HWY_ARCH_ARM_A64 ++ return Vec128(vabsq_s64(v.raw)); ++#else ++ const auto zero = Zero(Full128()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++HWY_INLINE Vec128 Abs(const Vec128 v) { ++#if HWY_ARCH_ARM_A64 ++ return Vec128(vabs_s64(v.raw)); ++#else ++ const auto zero = Zero(Simd()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++ ++// ------------------------------ Min (IfThenElse, BroadcastSignBit) ++ ++#if HWY_ARCH_ARM_A64 ++ ++HWY_INLINE Mask128 operator<(Vec128 a, Vec128 b) { ++ return Mask128(vcltq_u64(a.raw, b.raw)); ++} ++HWY_INLINE Mask128 operator<(Vec128 a, ++ Vec128 b) { ++ return Mask128(vclt_u64(a.raw, b.raw)); ++} ++ ++#endif + + // Unsigned + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2) +@@ -1588,8 +1781,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, + template + HWY_INLINE Vec128 Min(const Vec128 a, + const Vec128 b) { +-#if defined(__aarch64__) +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); ++#if HWY_ARCH_ARM_A64 ++ return IfThenElse(b < a, b, a); + #else + const Simd du; + const Simd di; +@@ -1603,8 +1796,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, v + template + HWY_INLINE Vec128 Min(const Vec128 a, + const Vec128 b) { +-#if defined(__aarch64__) +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); ++#if HWY_ARCH_ARM_A64 ++ return IfThenElse(b < a, b, a); + #else + const Vec128 sign = detail::SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b); +@@ -1612,7 +1805,7 @@ HWY_INLINE Vec128 Min(const + } + + // Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN. 
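// Illustration (scalar sketch of the ARMv7 64-bit equality emulation above):
// compare the 32-bit halves independently, then AND each half's mask with its
// swapped neighbour (the Shuffle2301 step), so the 64-bit lane is all-ones
// only when both halves matched. Inputs below are illustrative only.
#include <cstdint>
#include <cstdio>

static uint64_t Eq64ViaHalves(uint64_t a, uint64_t b) {
  const uint32_t lo = (uint32_t)a == (uint32_t)b ? 0xFFFFFFFFu : 0u;
  const uint32_t hi = (uint32_t)(a >> 32) == (uint32_t)(b >> 32) ? 0xFFFFFFFFu : 0u;
  const uint32_t both = lo & hi;  // cmp32 & Shuffle2301(cmp32), collapsed
  return ((uint64_t)both << 32) | both;
}

int main() {
  std::printf("%llx %llx\n",
              (unsigned long long)Eq64ViaHalves(5, 5),
              (unsigned long long)Eq64ViaHalves(5, 1ULL << 40));
  return 0;
}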
+-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2) + #else + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) +@@ -1626,8 +1819,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, + template + HWY_INLINE Vec128 Max(const Vec128 a, + const Vec128 b) { +-#if defined(__aarch64__) +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); ++#if HWY_ARCH_ARM_A64 ++ return IfThenElse(b < a, a, b); + #else + const Simd du; + const Simd di; +@@ -1641,8 +1834,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, v + template + HWY_INLINE Vec128 Max(const Vec128 a, + const Vec128 b) { +-#if defined(__aarch64__) +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); ++#if HWY_ARCH_ARM_A64 ++ return IfThenElse(b < a, a, b); + #else + const Vec128 sign = detail::SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a); +@@ -1650,7 +1843,7 @@ HWY_INLINE Vec128 Max(const + } + + // Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN. +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2) + #else + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) +@@ -1696,7 +1889,7 @@ HWY_INLINE Vec128 LoadU(Full128(vld1q_f32(aligned)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 LoadU(Full128 /* tag */, + const double* HWY_RESTRICT aligned) { + return Vec128(vld1q_f64(aligned)); +@@ -1741,7 +1934,7 @@ HWY_INLINE Vec128 LoadU(Simd(vld1_f32(p)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 LoadU(Simd /* tag */, + const double* HWY_RESTRICT p) { + return Vec128(vld1_f64(p)); +@@ -1755,73 +1948,72 @@ HWY_INLINE Vec128 LoadU(Simd< + // we don't actually care what is in it, and we don't want + // to introduce extra overhead by initializing it to something. 
+ +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint8_t* HWY_RESTRICT p) { +- uint32x2_t a = Undefined(d).raw; ++ uint32x2_t a = Undefined(Simd()).raw; + uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_u8_u32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint16_t* HWY_RESTRICT p) { +- uint32x2_t a = Undefined(d).raw; ++ uint32x2_t a = Undefined(Simd()).raw; + uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_u16_u32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint32_t* HWY_RESTRICT p) { +- uint32x2_t a = Undefined(d).raw; ++ uint32x2_t a = Undefined(Simd()).raw; + uint32x2_t b = vld1_lane_u32(p, a, 0); + return Vec128(b); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int8_t* HWY_RESTRICT p) { +- int32x2_t a = Undefined(d).raw; ++ int32x2_t a = Undefined(Simd()).raw; + int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_s8_s32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int16_t* HWY_RESTRICT p) { +- int32x2_t a = Undefined(d).raw; ++ int32x2_t a = Undefined(Simd()).raw; + int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_s16_s32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int32_t* HWY_RESTRICT p) { +- int32x2_t a = Undefined(d).raw; ++ int32x2_t a = Undefined(Simd()).raw; + int32x2_t b = vld1_lane_s32(p, a, 0); + return Vec128(b); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const float* HWY_RESTRICT p) { +- float32x2_t a = Undefined(d).raw; ++ float32x2_t a = Undefined(Simd()).raw; + float32x2_t b = vld1_lane_f32(p, a, 0); + return Vec128(b); + } + + // ------------------------------ Load 16 + +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint8_t* HWY_RESTRICT p) { +- uint16x4_t a = Undefined(d).raw; ++ uint16x4_t a = Undefined(Simd()).raw; + uint16x4_t b = vld1_lane_u16(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_u8_u16(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint16_t* HWY_RESTRICT p) { +- uint16x4_t a = Undefined(d).raw; ++ uint16x4_t a = Undefined(Simd()).raw; + uint16x4_t b = vld1_lane_u16(p, a, 0); + return Vec128(b); + } +- +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int8_t* HWY_RESTRICT p) { +- int16x4_t a = Undefined(d).raw; ++ int16x4_t a = Undefined(Simd()).raw; + int16x4_t b = vld1_lane_s16(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_s8_s16(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int16_t* HWY_RESTRICT p) { +- int16x4_t a = Undefined(d).raw; ++ int16x4_t a = Undefined(Simd()).raw; + int16x4_t b = vld1_lane_s16(p, a, 0); + return Vec128(b); + } +@@ -1902,7 +2094,7 @@ HWY_INLINE void StoreU(const Vec128 v, Full128 /* tag */, + double* HWY_RESTRICT aligned) { + vst1q_f64(aligned, v.raw); +@@ -1947,7 +2139,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd /* tag */, + double* HWY_RESTRICT p) { + vst1_f64(p, v.raw); +@@ -1959,12 +2151,12 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + uint8_t* HWY_RESTRICT p) { + uint32x2_t a = vreinterpret_u32_u8(v.raw); +- vst1_lane_u32(p, a, 0); ++ vst1_lane_u32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void 
StoreU(const Vec128 v, Simd, + uint16_t* HWY_RESTRICT p) { + uint32x2_t a = vreinterpret_u32_u16(v.raw); +- vst1_lane_u32(p, a, 0); ++ vst1_lane_u32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + uint32_t* HWY_RESTRICT p) { +@@ -1973,12 +2165,12 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + int8_t* HWY_RESTRICT p) { + int32x2_t a = vreinterpret_s32_s8(v.raw); +- vst1_lane_s32(p, a, 0); ++ vst1_lane_s32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + int16_t* HWY_RESTRICT p) { + int32x2_t a = vreinterpret_s32_s16(v.raw); +- vst1_lane_s32(p, a, 0); ++ vst1_lane_s32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + int32_t* HWY_RESTRICT p) { +@@ -1994,7 +2186,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + uint8_t* HWY_RESTRICT p) { + uint16x4_t a = vreinterpret_u16_u8(v.raw); +- vst1_lane_u16(p, a, 0); ++ vst1_lane_u16(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + uint16_t* HWY_RESTRICT p) { +@@ -2003,7 +2195,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + int8_t* HWY_RESTRICT p) { + int16x4_t a = vreinterpret_s16_s8(v.raw); +- vst1_lane_s16(p, a, 0); ++ vst1_lane_s16(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + int16_t* HWY_RESTRICT p) { +@@ -2068,18 +2260,18 @@ HWY_INLINE Vec128 PromoteTo(Fu + const Vec128 v) { + return Vec128(vmovl_u32(v.raw)); + } +-HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, ++HWY_INLINE Vec128 PromoteTo(Full128 d, + const Vec128 v) { +- return Vec128(vmovl_u8(v.raw)); ++ return BitCast(d, Vec128(vmovl_u8(v.raw))); + } +-HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, ++HWY_INLINE Vec128 PromoteTo(Full128 d, + const Vec128 v) { + uint16x8_t a = vmovl_u8(v.raw); +- return Vec128(vreinterpretq_s32_u16(vmovl_u16(vget_low_u16(a)))); ++ return BitCast(d, Vec128(vmovl_u16(vget_low_u16(a)))); + } +-HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, ++HWY_INLINE Vec128 PromoteTo(Full128 d, + const Vec128 v) { +- return Vec128(vmovl_u16(v.raw)); ++ return BitCast(d, Vec128(vmovl_u16(v.raw))); + } + + // Unsigned: zero-extend to half vector. 
+@@ -2105,9 +2297,9 @@ HWY_INLINE Vec128 PromoteTo + return Vec128(vget_low_u64(vmovl_u32(v.raw))); + } + template +-HWY_INLINE Vec128 PromoteTo(Simd /* tag */, ++HWY_INLINE Vec128 PromoteTo(Simd d, + const Vec128 v) { +- return Vec128(vget_low_s16(vmovl_u8(v.raw))); ++ return BitCast(d, Vec128(vget_low_u16(vmovl_u8(v.raw)))); + } + template + HWY_INLINE Vec128 PromoteTo(Simd /* tag */, +@@ -2170,12 +2362,14 @@ HWY_INLINE Vec128 PromoteTo( + + HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, + const Vec128 v) { +- return Vec128(vcvt_f32_f16(v.raw)); ++ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); ++ return Vec128(f32); + } + template + HWY_INLINE Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128(vget_low_f32(vcvt_f32_f16(v.raw))); ++ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); ++ return Vec128(vget_low_f32(f32)); + } + + #else +@@ -2204,7 +2398,7 @@ HWY_INLINE Vec128 PromoteTo(Si + + #endif + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + + HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, + const Vec128 v) { +@@ -2298,12 +2492,13 @@ HWY_INLINE Vec128 DemoteTo(Si + + HWY_INLINE Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{vcvt_f16_f32(v.raw)}; ++ return Vec128{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))}; + } + template + HWY_INLINE Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{vcvt_f16_f32(vcombine_f32(v.raw, v.raw))}; ++ const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw)); ++ return Vec128(vreinterpret_u16_f16(f16)); + } + + #else +@@ -2339,7 +2534,7 @@ HWY_INLINE Vec128 DemoteTo + } + + #endif +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + + HWY_INLINE Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { +@@ -2397,7 +2592,7 @@ HWY_INLINE Vec128 DemoteTo(Si + const Vec128 v) { + Vec128 a = DemoteTo(Simd(), v); + Vec128 b; +- uint16x8_t c = vcombine_s16(a.raw, b.raw); ++ int16x8_t c = vcombine_s16(a.raw, b.raw); + return Vec128(vqmovn_s16(c)); + } + +@@ -2426,7 +2621,7 @@ HWY_INLINE Vec128 ConvertTo( + return Vec128(vcvt_s32_f32(v.raw)); + } + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + + HWY_INLINE Vec128 ConvertTo(Full128 /* tag */, + const Vec128 v) { +@@ -2451,7 +2646,7 @@ HWY_INLINE Vec128 ConvertTo( + + // ------------------------------ Round (IfThenElse, mask, logical) + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + // Toward nearest integer + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1) + +@@ -2472,18 +2667,26 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, + // representation, clearing the lowest 23-exp mantissa bits. This requires 9 + // integer operations and 3 constants, which is likely more expensive. + ++namespace detail { ++ ++// The original value is already the desired result if NaN or the magnitude is ++// large (i.e. the value is already an integer). ++template ++HWY_API Mask128 UseInt(const Vec128 v) { ++ return Abs(v) < Set(Simd(), MantissaEnd()); ++} ++ ++} // namespace detail ++ + template + HWY_INLINE Vec128 Trunc(const Vec128 v) { + const Simd df; +- const Simd di; ++ const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + +- // The original value is already the desired result if NaN or the magnitude is +- // large (i.e. the value is already an integer). 
+- const auto max = Set(df, MantissaEnd()); +- return IfThenElse(Abs(v) < max, int_f, v); ++ return IfThenElse(detail::UseInt(v), int_f, v); + } + + template +@@ -2506,7 +2709,7 @@ HWY_INLINE Vec128 Round(const + template + HWY_INLINE Vec128 Ceil(const Vec128 v) { + const Simd df; +- const Simd di; ++ const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); +@@ -2514,9 +2717,7 @@ HWY_INLINE Vec128 Ceil(const V + // Truncating a positive non-integer ends up smaller; if so, add 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); + +- // Keep original if NaN or the magnitude is large (already an int). +- const auto max = Set(df, MantissaEnd()); +- return IfThenElse(Abs(v) < max, int_f - neg1, v); ++ return IfThenElse(detail::UseInt(v), int_f - neg1, v); + } + + template +@@ -2530,16 +2731,14 @@ HWY_INLINE Vec128 Floor(const + // Truncating a negative non-integer ends up larger; if so, subtract 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); + +- // Keep original if NaN or the magnitude is large (already an int). +- const auto max = Set(df, MantissaEnd()); +- return IfThenElse(Abs(v) < max, int_f + neg1, v); ++ return IfThenElse(detail::UseInt(v), int_f + neg1, v); + } + + #endif + + // ------------------------------ NearestInt (Round) + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + + HWY_INLINE Vec128 NearestInt(const Vec128 v) { + return Vec128(vcvtnq_s32_f32(v.raw)); +@@ -2596,7 +2795,7 @@ HWY_INLINE Vec128 LowerHalf( + HWY_INLINE Vec128 LowerHalf(const Vec128 v) { + return Vec128(vget_low_f32(v.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 LowerHalf(const Vec128 v) { + return Vec128(vget_low_f64(v.raw)); + } +@@ -2629,7 +2828,7 @@ HWY_INLINE Vec128 UpperHalf( + HWY_INLINE Vec128 UpperHalf(const Vec128 v) { + return Vec128(vget_high_f32(v.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 UpperHalf(const Vec128 v) { + return Vec128(vget_high_f64(v.raw)); + } +@@ -2714,7 +2913,7 @@ HWY_INLINE Vec128 ShiftRightLanes( + + // ------------------------------ Broadcast/splat any lane + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + // Unsigned + template + HWY_INLINE Vec128 Broadcast(const Vec128 v) { +@@ -2886,7 +3085,7 @@ HWY_API Vec128 TableLookupBytes(const + const Vec128 from) { + const Full128 d; + const Repartition d8; +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return BitCast(d, Vec128(vqtbl1q_u8(BitCast(d8, bytes).raw, + BitCast(d8, from).raw))); + #else +@@ -2911,33 +3110,58 @@ HWY_INLINE Vec128 TableLookupBytes + BitCast(d8, from).raw))); + } + +-// ------------------------------ Hard-coded shuffles ++// ------------------------------ TableLookupLanes + +-// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). +-// Shuffle0321 rotates one lane to the right (the previous least-significant +-// lane is now most-significant). These could also be implemented via +-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. ++// Returned by SetTableIndices for use by TableLookupLanes. 
++template ++struct Indices128 { ++ typename detail::Raw128::type raw; ++}; + +-// Swap 32-bit halves in 64-bits +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64_u32(v.raw)); +-} +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64_s32(v.raw)); +-} +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64_f32(v.raw)); ++template ++HWY_INLINE Indices128 SetTableIndices(Simd d, const int32_t* idx) { ++#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) ++ for (size_t i = 0; i < N; ++i) { ++ HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); ++ } ++#endif ++ ++ const Repartition d8; ++ alignas(16) uint8_t control[16] = {0}; ++ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { ++ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { ++ control[idx_lane * sizeof(T) + idx_byte] = ++ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); ++ } ++ } ++ return Indices128{BitCast(d, Load(d8, control)).raw}; + } +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64q_u32(v.raw)); ++ ++template ++HWY_INLINE Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64q_s32(v.raw)); ++template ++HWY_INLINE Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64q_f32(v.raw)); ++template ++HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ const Simd di; ++ const auto idx_i = BitCast(di, Vec128{idx.raw}); ++ return BitCast(Simd(), TableLookupBytes(BitCast(di, v), idx_i)); + } + ++// ------------------------------ Other shuffles (TableLookupBytes) ++ ++// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). ++// Shuffle0321 rotates one lane to the right (the previous least-significant ++// lane is now most-significant). These could also be implemented via ++// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. ++ + // Swap 64-bit halves + template + HWY_INLINE Vec128 Shuffle1032(const Vec128 v) { +@@ -2975,49 +3199,6 @@ HWY_INLINE Vec128 Shuffle0123(const V + return TableLookupBytes(v, BitCast(d, Load(d8, bytes))); + } + +-// ------------------------------ TableLookupLanes +- +-// Returned by SetTableIndices for use by TableLookupLanes. 
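// Illustration (scalar sketch of the SetTableIndices expansion above): each
// 32-bit lane index contributes sizeof(T) consecutive byte offsets
// idx*sizeof(T) + 0..sizeof(T)-1, which is the form VTBL-based
// TableLookupBytes consumes. The permutation below is a made-up example.
#include <cstdint>
#include <cstdio>

int main() {
  const int32_t lane_idx[4] = {3, 0, 2, 1};  // hypothetical 32-bit lane order
  uint8_t byte_idx[16];
  for (int lane = 0; lane < 4; ++lane) {
    for (int b = 0; b < 4; ++b) {
      byte_idx[lane * 4 + b] = (uint8_t)(lane_idx[lane] * 4 + b);
    }
  }
  for (int i = 0; i < 16; ++i) std::printf("%d ", byte_idx[i]);
  std::printf("\n");
  return 0;
}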
+-template +-struct Indices128 { +- uint8x16_t raw; +-}; +- +-template +-HWY_INLINE Indices128 SetTableIndices(const Full128, const int32_t* idx) { +-#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) +- const size_t N = 16 / sizeof(T); +- for (size_t i = 0; i < N; ++i) { +- HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); +- } +-#endif +- +- const Full128 d8; +- alignas(16) uint8_t control[16]; +- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { +- const size_t idx_lane = idx_byte / sizeof(T); +- const size_t mod = idx_byte % sizeof(T); +- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; +- } +- return Indices128{Load(d8, control).raw}; +-} +- +-HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128(idx.raw)); +-} +-HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128(idx.raw)); +-} +-HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- const Full128 di; +- const Full128 df; +- return BitCast(df, +- TableLookupBytes(BitCast(di, v), Vec128(idx.raw))); +-} +- + // ------------------------------ Interleave lanes + + // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +@@ -3029,7 +3210,7 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Inter + HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2) + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2) + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + // For 64 bit types, we only have the "q" version of the function defined as + // interleaving 64-wide registers with 64-wide types in them makes no sense. + HWY_INLINE Vec128 InterleaveLower(const Vec128 a, +@@ -3079,7 +3260,7 @@ HWY_INLINE Vec128 InterleaveLower + const Vec128 b) { + return Vec128(vzip1q_f32(a.raw, b.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128(vzip1q_f64(a.raw, b.raw)); +@@ -3090,10 +3271,10 @@ HWY_INLINE Vec128 InterleaveUpper + const Vec128 b) { + return Vec128(vzip2q_f32(a.raw, b.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_s64(a.raw, b.raw)); ++ return Vec128(vzip2q_f64(a.raw, b.raw)); + } + #endif + +@@ -3105,119 +3286,125 @@ HWY_INLINE Vec128 InterleaveUppe + // Full vectors + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_u8(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u16_u8(vzip1q_u8(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_u16(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u32_u16(vzip1q_u16(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_u32(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u64_u32(vzip1q_u32(a.raw, b.raw))); + } + + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_s8(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s16_s8(vzip1q_s8(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_s16(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s32_s16(vzip1q_s16(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_s32(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s64_s32(vzip1q_s32(a.raw, b.raw))); + } + + HWY_INLINE Vec128 
ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_u8(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u16_u8(vzip2q_u8(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_u16(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u32_u16(vzip2q_u16(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_u32(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u64_u32(vzip2q_u32(a.raw, b.raw))); + } + + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_s8(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s16_s8(vzip2q_s8(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_s16(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s32_s16(vzip2q_s16(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_s32(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s64_s32(vzip2q_s32(a.raw, b.raw))); + } + + // Half vectors or less + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_u8(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_u16_u8(vzip1_u8(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_u16(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_u32_u16(vzip1_u16(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_u32(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_u64_u32(vzip1_u32(a.raw, b.raw))); + } + + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_s8(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_s16_s8(vzip1_s8(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_s16(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_s32_s16(vzip1_s16(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_s32(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_s64_s32(vzip1_s32(a.raw, b.raw))); + } + + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_u8(a.raw, b.raw)); ++ return Vec128(vreinterpret_u16_u8(vzip2_u8(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_u16(a.raw, b.raw)); ++ return Vec128(vreinterpret_u32_u16(vzip2_u16(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_u32(a.raw, b.raw)); ++ return Vec128(vreinterpret_u64_u32(vzip2_u32(a.raw, b.raw))); + } + + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_s8(a.raw, b.raw)); ++ return Vec128(vreinterpret_s16_s8(vzip2_s8(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_s16(a.raw, b.raw)); ++ return Vec128(vreinterpret_s32_s16(vzip2_s16(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_s32(a.raw, b.raw)); ++ return Vec128(vreinterpret_s64_s32(vzip2_s32(a.raw, b.raw))); + } + + // ------------------------------ Blocks +@@ -3274,84 +3461,113 @@ HWY_INLINE Vec128 OddEven(const Vec12 + + // ================================================== MISC + +-// Returns a vector 
with lane i=[0, N) set to "first" + i. +-template +-Vec128 Iota(const Simd d, const T2 first) { +- HWY_ALIGN T lanes[16 / sizeof(T)]; +- for (size_t i = 0; i < 16 / sizeof(T); ++i) { +- lanes[i] = static_cast(first + static_cast(i)); ++// ------------------------------ Scatter (Store) ++ ++template ++HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); ++ } ++} ++ ++template ++HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ for (size_t i = 0; i < N; ++i) { ++ base[index_lanes[i]] = lanes[i]; + } +- return Load(d, lanes); + } + +-// ------------------------------ Gather (requires GetLane) ++// ------------------------------ Gather (Load/Store) + + template + HWY_API Vec128 GatherOffset(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 offset) { +- static_assert(N == 1, "NEON does not support full gather"); +- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); +- const uintptr_t address = reinterpret_cast(base) + GetLane(offset); +- T val; +- CopyBytes(reinterpret_cast(address), &val); +- return Set(d, val); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ alignas(16) T lanes[N]; ++ const uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); ++ } ++ return Load(d, lanes); + } + + template + HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, + const Vec128 index) { +- static_assert(N == 1, "NEON does not support full gather"); +- static_assert(sizeof(T) == sizeof(Index), "T must match Index"); +- return Set(d, base[GetLane(index)]); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ alignas(16) T lanes[N]; ++ for (size_t i = 0; i < N; ++i) { ++ lanes[i] = base[index_lanes[i]]; ++ } ++ return Load(d, lanes); + } + +-// ------------------------------ ARMv7 int64 comparisons (requires Shuffle2301) ++// ------------------------------ Reductions + +-#if !defined(__aarch64__) ++namespace detail { + +-template +-HWY_INLINE Mask128 operator==(const Vec128 a, +- const Vec128 b) { +- const Simd d32; +- const Simd d64; +- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); +- const auto cmp64 = cmp32 & Shuffle2301(cmp32); +- return MaskFromVec(BitCast(d64, cmp64)); ++// N=1 for any T: no-op ++template ++HWY_API Vec128 SumOfLanes(const Vec128 v) { ++ return v; + } +- +-template +-HWY_INLINE Mask128 operator==(const Vec128 a, +- const Vec128 b) { +- const Simd d32; +- const Simd d64; +- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); +- const auto cmp64 = cmp32 & Shuffle2301(cmp32); +- return MaskFromVec(BitCast(d64, cmp64)); ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag 
/* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; + } + +-HWY_INLINE Mask128 operator<(const Vec128 a, +- const Vec128 b) { +- const int64x2_t sub = vqsubq_s64(a.raw, b.raw); +- return MaskFromVec(BroadcastSignBit(Vec128(sub))); ++// u32/i32/f32: N=2 ++template ++HWY_API Vec128 SumOfLanes(const Vec128 v10) { ++ return v10 + Shuffle2301(v10); + } +-HWY_INLINE Mask128 operator<(const Vec128 a, +- const Vec128 b) { +- const int64x1_t sub = vqsub_s64(a.raw, b.raw); +- return MaskFromVec(BroadcastSignBit(Vec128(sub))); ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Min(v10, Shuffle2301(v10)); + } +- +-template +-HWY_INLINE Mask128 operator>(const Vec128 a, +- const Vec128 b) { +- return b < a; ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Max(v10, Shuffle2301(v10)); + } +-#endif +- +-// ------------------------------ Reductions + +-#if defined(__aarch64__) +-// Supported for 32b and 64b vector types. Returns the sum in each lane. ++// full vectors ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 SumOfLanes(const Vec128 v) { + return Vec128(vdupq_n_u32(vaddvq_u32(v.raw))); + } +@@ -3398,20 +3614,15 @@ HWY_INLINE Vec128 SumOfLanes(co + } + #endif + +-namespace detail { +- +-// For u32/i32/f32. +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); +@@ -3419,15 +3630,13 @@ HWY_API Vec128 MaxOfLanes(hwy::Siz + } + + // For u64/i64[/f64]. +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); + } +@@ -3435,6 +3644,10 @@ HWY_API Vec128 MaxOfLanes(hwy::Siz + } // namespace detail + + template ++HWY_API Vec128 SumOfLanes(const Vec128 v) { ++ return detail::SumOfLanes(v); ++} ++template + HWY_API Vec128 MinOfLanes(const Vec128 v) { + return detail::MinOfLanes(hwy::SizeTag(), v); + } +@@ -3457,18 +3670,18 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Vec128 values = + BitCast(du, VecFromMask(Full128(), mask)) & Load(du, kSliceLanes); + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + // Can't vaddv - we need two separate bytes (16 bits). + const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); + const uint8x8_t x4 = vpadd_u8(x2, x2); + const uint8x8_t x8 = vpadd_u8(x4, x4); +- return vreinterpret_u16_u8(x8)[0]; ++ return vget_lane_u64(vreinterpret_u64_u8(x8), 0); + #else + // Don't have vpaddq, so keep doubling lane size. 
+ const uint16x8_t x2 = vpaddlq_u8(values.raw); + const uint32x4_t x4 = vpaddlq_u16(x2); + const uint64x2_t x8 = vpaddlq_u32(x4); +- return (uint64_t(x8[1]) << 8) | x8[0]; ++ return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); + #endif + } + +@@ -3484,7 +3697,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Vec128 slice(Load(Simd(), kSliceLanes).raw); + const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddv_u8(values.raw); + #else + const uint16x4_t x2 = vpaddl_u8(values.raw); +@@ -3503,7 +3716,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_u16(values.raw); + #else + const uint32x4_t x2 = vpaddlq_u16(values.raw); +@@ -3522,7 +3735,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Simd du; + const Vec128 slice(Load(Simd(), kSliceLanes).raw); + const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddv_u16(values.raw); + #else + const uint32x2_t x2 = vpaddl_u16(values.raw); +@@ -3539,7 +3752,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_u32(values.raw); + #else + const uint64x2_t x2 = vpaddlq_u32(values.raw); +@@ -3557,7 +3770,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Simd du; + const Vec128 slice(Load(Simd(), kSliceLanes).raw); + const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddv_u32(values.raw); + #else + const uint64x1_t x2 = vpaddl_u32(values.raw); +@@ -3572,7 +3785,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_u64(values.raw); + #else + return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); +@@ -3612,13 +3825,13 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + const int8x16_t ones = + vnegq_s8(BitCast(di, VecFromMask(Full128(), mask)).raw); + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_s8(ones); + #else + const int16x8_t x2 = vpaddlq_s8(ones); + const int32x4_t x4 = vpaddlq_s16(x2); + const int64x2_t x8 = vpaddlq_s32(x4); +- return x8[0] + x8[1]; ++ return vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1); + #endif + } + template +@@ -3627,12 +3840,12 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + const int16x8_t ones = + vnegq_s16(BitCast(di, VecFromMask(Full128(), mask)).raw); + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_s16(ones); + #else + const int32x4_t x2 = vpaddlq_s16(ones); + const int64x2_t x4 = vpaddlq_s32(x2); +- return x4[0] + x4[1]; ++ return vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1); + #endif + } + +@@ -3642,26 +3855,26 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + const int32x4_t ones = + vnegq_s32(BitCast(di, VecFromMask(Full128(), mask)).raw); + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_s32(ones); + #else + const int64x2_t x2 = vpaddlq_s32(ones); +- return x2[0] + x2[1]; ++ return vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1); + #endif + } + + template + HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 mask) { +-#if 
defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + const Full128 di; + const int64x2_t ones = + vnegq_s64(BitCast(di, VecFromMask(Full128(), mask)).raw); + return vaddvq_s64(ones); + #else +- const Full128 di; +- const int64x2_t ones = +- vshrq_n_u64(BitCast(di, VecFromMask(Full128(), mask)).raw, 63); +- return ones[0] + ones[1]; ++ const Full128 du; ++ const auto mask_u = VecFromMask(du, RebindMask(du, mask)); ++ const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); ++ return vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1); + #endif + } + +@@ -3690,9 +3903,15 @@ HWY_INLINE size_t StoreMaskBits(const Ma + // Full + template + HWY_INLINE bool AllFalse(const Mask128 m) { ++#if HWY_ARCH_ARM_A64 ++ const Full128 d32; ++ const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(Full128(), m))); ++ return (vmaxvq_u32(m32.raw) == 0); ++#else + const auto v64 = BitCast(Full128(), VecFromMask(Full128(), m)); + uint32x2_t a = vqmovn_u64(v64.raw); +- return vreinterpret_u64_u32(a)[0] == 0; ++ return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0; ++#endif + } + + // Partial +@@ -3711,8 +3930,160 @@ HWY_INLINE bool AllTrue(const Mask128 Load8Bytes(Full128 /*d*/, ++ const uint8_t* bytes) { ++ return Vec128(vreinterpretq_u8_u64( ++ vld1q_dup_u64(reinterpret_cast(bytes)))); ++} ++ ++// Load 8 bytes and return half-reg with N <= 8 bytes. ++template ++HWY_INLINE Vec128 Load8Bytes(Simd d, ++ const uint8_t* bytes) { ++ return Load(d, bytes); ++} ++ + template +-HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { ++HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<2> /*tag*/, ++ const uint64_t mask_bits) { ++ HWY_DASSERT(mask_bits < 256); ++ const Simd d; ++ const Repartition d8; ++ const Simd du; ++ ++ // ARM does not provide an equivalent of AVX2 permutevar, so we need byte ++ // indices for VTBL (one vector's worth for each of 256 combinations of ++ // 8 mask bits). Loading them directly would require 4 KiB. We can instead ++ // store lane indices and convert to byte indices (2*lane + 0..1), with the ++ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane ++ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. ++ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles ++ // is likely more costly than the higher cache footprint from storing bytes. 
++ alignas(16) constexpr uint8_t table[256 * 8] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, ++ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, ++ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, ++ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, ++ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, ++ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, ++ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, ++ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, ++ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, ++ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, ++ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, ++ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, ++ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, ++ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, ++ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, ++ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, ++ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, ++ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, ++ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, ++ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, ++ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, ++ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, ++ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, ++ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, ++ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, ++ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, ++ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, ++ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, ++ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, ++ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, ++ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, ++ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, ++ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, ++ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, ++ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, ++ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, ++ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, ++ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, ++ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, ++ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, ++ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, ++ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, ++ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, ++ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, ++ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, ++ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, ++ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, ++ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, ++ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, ++ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, ++ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, ++ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, ++ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, ++ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, ++ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, ++ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, ++ 0, 
0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, ++ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, ++ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, ++ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, ++ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, ++ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, ++ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, ++ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, ++ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, ++ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, ++ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, ++ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, ++ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, ++ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, ++ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, ++ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, ++ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, ++ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, ++ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, ++ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, ++ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, ++ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, ++ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, ++ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, ++ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, ++ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, ++ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, ++ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, ++ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, ++ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, ++ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, ++ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, ++ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, ++ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, ++ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, ++ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, ++ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, ++ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, ++ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, ++ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, ++ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, ++ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, ++ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, ++ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, ++ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, ++ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, ++ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, ++ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, ++ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, ++ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, ++ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, ++ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, ++ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, ++ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, ++ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; ++ ++ const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); ++ const Vec128 pairs = ZipLower(byte_idx, byte_idx); ++ return BitCast(d, pairs + Set(du, 0x0100)); ++} ++ 
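// Illustration (scalar sketch of the 16-bit Compress indexing above): for a
// given 8-bit mask the table stores the doubled indices (2*lane) of the set
// lanes; zipping each byte with itself and adding 0x0100 per 16-bit pair then
// yields the byte pairs (2*lane, 2*lane+1) that TableLookupBytes consumes.
// The example mask below is arbitrary.
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned mask_bits = 0xB2;  // lanes 1, 4, 5, 7 selected
  uint8_t doubled[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  int out = 0;
  for (int lane = 0; lane < 8; ++lane) {
    if (mask_bits & (1u << lane)) doubled[out++] = (uint8_t)(2 * lane);
  }
  for (int i = 0; i < out; ++i) {
    std::printf("(%d,%d) ", doubled[i], doubled[i] + 1);  // ZipLower + 0x0100
  }
  std::printf("\n");
  return 0;
}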
++template ++HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<4> /*tag*/, ++ const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. +@@ -3742,7 +4113,8 @@ HWY_INLINE Vec128 Idx32x4FromBits( + #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64 + + template +-HWY_INLINE Vec128 Idx64x2FromBits(const uint64_t mask_bits) { ++HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<8> /*tag*/, ++ const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. +@@ -3761,59 +4133,15 @@ HWY_INLINE Vec128 Idx64x2FromBits( + + // Helper function called by both Compress and CompressStore - avoids a + // redundant BitsFromMask in the latter. +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +- +-#if HWY_CAP_INTEGER64 +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +- +-#endif +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); +-} +- +-#if HWY_CAP_FLOAT64 +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); ++template ++HWY_API Vec128 Compress(Vec128 v, const uint64_t mask_bits) { ++ const auto idx = ++ detail::IdxFromBits(hwy::SizeTag(), mask_bits); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } + +-#endif +- + } // namespace detail + + template +@@ -3831,6 +4159,79 @@ HWY_API size_t CompressStore(Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ Full128 /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw}; ++ vst3q_u8(unaligned, triple); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; ++ vst3_u8(unaligned, triple); ++} ++ ++// <= 32 bits: avoid writing more than N bytes by copying to buffer ++template ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ alignas(16) uint8_t buf[24]; ++ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; ++ vst3_u8(buf, triple); ++ CopyBytes(buf, unaligned); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++// 128 bits ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, ++ Full128 /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; ++ vst4q_u8(unaligned, quad); 
++} ++ ++// 64 bits ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; ++ vst4_u8(unaligned, quad); ++} ++ ++// <= 32 bits: avoid writing more than N bytes by copying to buffer ++template ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ alignas(16) uint8_t buf[32]; ++ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; ++ vst4_u8(buf, quad); ++ CopyBytes(buf, unaligned); ++} ++ + // ================================================== Operator wrapper + + // These apply to all x86_*-inl.h because there are no restrictions on V. +@@ -3885,7 +4286,8 @@ HWY_API auto Le(V a, V b) -> decltype(a + return a <= b; + } + +-#if !defined(__aarch64__) ++namespace detail { // for code folding ++#if HWY_ARCH_ARM_V7 + #undef vuzp1_s8 + #undef vuzp1_u8 + #undef vuzp1_s16 +@@ -3972,6 +4374,7 @@ HWY_API auto Le(V a, V b) -> decltype(a + #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 + #undef HWY_NEON_DEF_FUNCTION_UINTS + #undef HWY_NEON_EVAL ++} // namespace detail + + // NOLINTNEXTLINE(google-readability-namespace-comments) + } // namespace HWY_NAMESPACE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 2021-06-02 10:56:05.230904367 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -39,6 +39,11 @@ using TFromV = TFromD>; + hwy::EnableIf>() && !IsFloat>()>* = nullptr + #define HWY_IF_FLOAT_V(V) hwy::EnableIf>()>* = nullptr + ++// kShift = log2 of multiplier: 0 for m1, 1 for m2, -2 for mf4 ++template ++using Full = Simd> (-kShift)) ++ : (HWY_LANES(T) << kShift)>; ++ + // ================================================== MACROS + + // Generate specializations and function definitions using X macros. Although +@@ -58,29 +63,30 @@ namespace detail { // for code folding + + // For given SEW, iterate over all LMUL. Precompute SEW/LMUL => MLEN because the + // preprocessor cannot easily do it. 
+-#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 1, 8, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 2, 4, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 4, 2, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 8, 1, NAME, OP) +- +-#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 1, 16, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 2, 8, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 4, 4, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 8, 2, NAME, OP) +- +-#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 1, 32, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 2, 16, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 4, 8, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 8, 4, NAME, OP) +- +-#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 1, 64, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 2, 32, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 4, 16, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 8, 8, NAME, OP) ++// TODO(janwas): GCC does not yet support fractional LMUL ++#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m1, /*kShift=*/0, /*MLEN=*/8, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m2, /*kShift=*/1, /*MLEN=*/4, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m4, /*kShift=*/2, /*MLEN=*/2, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m8, /*kShift=*/3, /*MLEN=*/1, NAME, OP) ++ ++#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m1, /*kShift=*/0, /*MLEN=*/16, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m2, /*kShift=*/1, /*MLEN=*/8, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m4, /*kShift=*/2, /*MLEN=*/4, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m8, /*kShift=*/3, /*MLEN=*/2, NAME, OP) ++ ++#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m1, /*kShift=*/0, /*MLEN=*/32, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m2, /*kShift=*/1, /*MLEN=*/16, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m4, /*kShift=*/2, /*MLEN=*/8, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m8, /*kShift=*/3, /*MLEN=*/4, NAME, OP) ++ ++#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m1, /*kShift=*/0, /*MLEN=*/64, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m2, /*kShift=*/1, /*MLEN=*/32, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m4, /*kShift=*/2, /*MLEN=*/16, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m8, /*kShift=*/3, /*MLEN=*/8, NAME, OP) + + // SEW for unsigned: + #define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP) \ +@@ -153,63 +159,61 @@ namespace detail { // for code folding + + // Assemble types for use in x-macros + #define HWY_RVV_T(BASE, SEW) BASE##SEW##_t +-#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##m##LMUL +-#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##m##LMUL##_t ++#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##LMUL ++#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t + #define HWY_RVV_M(MLEN) vbool##MLEN##_t + + } // namespace detail + + // TODO(janwas): remove typedefs and only use HWY_RVV_V etc. directly + +-// TODO(janwas): do we want fractional LMUL? (can encode as negative) +-// Mixed-precision code can use LMUL 1..8 and that should be enough unless they +-// need many registers. 
+-#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- using HWY_RVV_D(CHAR, SEW, LMUL) = \ +- Simd; \ +- using V##CHAR##SEW##m##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \ +- template <> \ +- struct DFromV_t { \ +- using Lane = HWY_RVV_T(BASE, SEW); \ +- using type = Simd; \ ++// Until we have full intrinsic support for fractional LMUL, mixed-precision ++// code can use LMUL 1..8 (adequate unless they need many registers). ++#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ using HWY_RVV_D(CHAR, SEW, LMUL) = Full; \ ++ using V##CHAR##SEW##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \ ++ template <> \ ++ struct DFromV_t { \ ++ using Lane = HWY_RVV_T(BASE, SEW); \ ++ using type = Full; \ + }; + using Vf16m1 = vfloat16m1_t; + using Vf16m2 = vfloat16m2_t; + using Vf16m4 = vfloat16m4_t; + using Vf16m8 = vfloat16m8_t; +-using Df16m1 = Simd; +-using Df16m2 = Simd; +-using Df16m4 = Simd; +-using Df16m8 = Simd; ++using Df16m1 = Full; ++using Df16m2 = Full; ++using Df16m4 = Full; ++using Df16m8 = Full; + + HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _) + #undef HWY_SPECIALIZE + + // vector = f(d), e.g. Zero +-#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(CHAR, SEW, LMUL) d) { \ + (void)Lanes(d); \ +- return v##OP##_##CHAR##SEW##m##LMUL(); \ ++ return v##OP##_##CHAR##SEW##LMUL(); \ + } + + // vector = f(vector), e.g. Not +-#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_v_##CHAR##SEW##m##LMUL(v); \ ++ return v##OP##_v_##CHAR##SEW##LMUL(v); \ + } + + // vector = f(vector, scalar), e.g. detail::Add +-#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ +- return v##OP##_##CHAR##SEW##m##LMUL(a, b); \ ++#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ ++ return v##OP##_##CHAR##SEW##LMUL(a, b); \ + } + + // vector = f(vector, vector), e.g. Add +-#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(a, b); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(a, b); \ + } + + // ================================================== INIT +@@ -218,9 +222,9 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _) + + // WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL! + // vlenb is not exposed through intrinsics and vreadvl is not VLMAX. 
+-#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \ +- return v##OP##SEW##m##LMUL(); \ ++#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \ ++ return v##OP##SEW##LMUL(); \ + } + + HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e) +@@ -233,19 +237,31 @@ HWY_RVV_FOREACH(HWY_RVV_RETV_ARGD, Zero, + template + using VFromD = decltype(Zero(D())); + ++// Partial ++template ++HWY_API VFromD> Zero(Simd /*tag*/) { ++ return Zero(Full()); ++} ++ + // ------------------------------ Set + // vector = f(d, scalar), e.g. Set +-#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, HWY_RVV_T(BASE, SEW) arg) { \ + (void)Lanes(d); \ +- return v##OP##_##CHAR##SEW##m##LMUL(arg); \ ++ return v##OP##_##CHAR##SEW##LMUL(arg); \ + } + + HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x) + HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f) + #undef HWY_RVV_SET + ++// Partial vectors ++template ++HWY_API VFromD> Set(Simd /*tag*/, T arg) { ++ return Set(Full(), arg); ++} ++ + // ------------------------------ Undefined + + // RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized +@@ -265,7 +281,7 @@ HWY_API VFromD Undefined(D d) { + namespace detail { + + // u8: no change +-#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v; \ +@@ -276,25 +292,25 @@ namespace detail { + } + + // Other integers +-#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_v_##CHAR##SEW##m##LMUL##_u8m##LMUL(v); \ +- } \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ +- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \ +- return v##OP##_v_u8m##LMUL##_##CHAR##SEW##m##LMUL(v); \ ++#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \ ++ } \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \ ++ return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \ + } + + // Float: first cast to/from unsigned +-#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_v_u##SEW##m##LMUL##_u8m##LMUL( \ +- v##OP##_v_f##SEW##m##LMUL##_u##SEW##m##LMUL(v)); \ +- } \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ +- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \ +- return v##OP##_v_u##SEW##m##LMUL##_f##SEW##m##LMUL( \ +- v##OP##_v_u8m##LMUL##_u##SEW##m##LMUL(v)); \ ++#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \ ++ v##OP##_v_f##SEW##LMUL##_u##SEW##LMUL(v)); \ ++ } \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \ ++ return v##OP##_v_u##SEW##LMUL##_f##SEW##LMUL( \ ++ 
v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \ + } + + HWY_RVV_FOREACH_U08(HWY_RVV_CAST_NOP, _, _) +@@ -315,6 +331,12 @@ HWY_API VFromD BitCast(D d, FromV v) + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); + } + ++// Partial ++template ++HWY_API VFromD> BitCast(Simd /*tag*/, FromV v) { ++ return BitCast(Full(), v); ++} ++ + namespace detail { + + template >> +@@ -336,6 +358,12 @@ HWY_API VFromD Iota0(const D /*d*/) + return BitCastToUnsigned(Iota0(DU())); + } + ++// Partial ++template ++HWY_API VFromD> Iota0(Simd /*tag*/) { ++ return Iota0(Full()); ++} ++ + } // namespace detail + + // ================================================== LOGICAL +@@ -370,11 +398,11 @@ HWY_API V And(const V a, const V b) { + // ------------------------------ Or + + // Scalar argument plus mask. Used by VecFromMask. +-#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_T(BASE, SEW) imm, \ + HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff) { \ +- return v##OP##_##CHAR##SEW##m##LMUL##_m(mask, maskedoff, v, imm); \ ++ return v##OP##_##CHAR##SEW##LMUL##_m(mask, maskedoff, v, imm); \ + } + + namespace detail { +@@ -466,14 +494,14 @@ HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, + // ------------------------------ ShiftLeft[Same] + + // Intrinsics do not define .vi forms, so use .vx instead. +-#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- template \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, kBits); \ +- } \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ +- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, static_cast(bits)); \ ++#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ template \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits); \ ++ } \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ ++ return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast(bits)); \ + } + + HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll) +@@ -486,19 +514,18 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRi + #undef HWY_RVV_SHIFT + + // ------------------------------ Shl +-#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, bits); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, bits); \ + } + + HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll) + +-#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, \ +- detail::BitCastToUnsigned(bits)); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits)); \ + } + + HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll) +@@ -569,11 +596,11 @@ HWY_API V ApproximateReciprocalSqrt(cons + + // ------------------------------ MulAdd + // Note: op is still named vv, not vvv. 
+-#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \ + HWY_RVV_V(BASE, SEW, LMUL) add) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(add, mul, x); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x); \ + } + + HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc) +@@ -596,11 +623,11 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub + // of all bits; SLEN 8 / LMUL 4 = half of all bits. + + // mask = f(vector, vector) +-#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_M(MLEN) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ + (void)Lanes(DFromV()); \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL##_b##MLEN(a, b); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b); \ + } + + // ------------------------------ Eq +@@ -675,11 +702,11 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xo + #undef HWY_RVV_RETM_ARGMM + + // ------------------------------ IfThenElse +-#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ +- HWY_RVV_V(BASE, SEW, LMUL) no) { \ +- return v##OP##_vvm_##CHAR##SEW##m##LMUL(m, no, yes); \ ++#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ ++ HWY_RVV_V(BASE, SEW, LMUL) no) { \ ++ return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes); \ + } + + HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge) +@@ -710,7 +737,7 @@ template + using MFromD = decltype(MaskFromVec(Zero(D()))); + + template +-HWY_API MFromD RebindMask(const D d, const MFrom mask) { ++HWY_API MFromD RebindMask(const D /*d*/, const MFrom mask) { + // No need to check lane size/LMUL are the same: if not, casting MFrom to + // MFromD would fail. 
+ return mask; +@@ -774,17 +801,17 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, + + // ------------------------------ Load + +-#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \ +- const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ +- (void)Lanes(d); \ +- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p); \ ++#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \ ++ const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ ++ (void)Lanes(d); \ ++ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p); \ + } + HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le) + #undef HWY_RVV_LOAD + +-// Partial load ++// Partial + template + HWY_API VFromD> Load(Simd d, const T* HWY_RESTRICT p) { + return Load(d, p); +@@ -800,16 +827,22 @@ HWY_API VFromD LoadU(D d, const TFrom + + // ------------------------------ Store + +-#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ +- HWY_RVV_D(CHAR, SEW, LMUL) d, \ +- HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ +- (void)Lanes(d); \ +- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p, v); \ ++#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ ++ HWY_RVV_D(CHAR, SEW, LMUL) d, \ ++ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ ++ (void)Lanes(d); \ ++ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v); \ + } + HWY_RVV_FOREACH(HWY_RVV_RET_ARGVDP, Store, se) + #undef HWY_RVV_RET_ARGVDP + ++// Partial ++template ++HWY_API void Store(VFromD> v, Simd d, T* HWY_RESTRICT p) { ++ return Store(v, Full(), p); ++} ++ + // ------------------------------ StoreU + + // RVV only requires lane alignment, not natural alignment of the entire vector. 
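Side note on the kShift parameter now threaded through these x-macros: it encodes log2(LMUL) (0 for m1, 1 for m2, ..., negative values reserved for fractional LMUL), and the Full alias introduced earlier scales HWY_LANES(T) accordingly. A small self-contained sketch of that mapping; LanesForShift and the base lane count of 64 are illustrative stand-ins, not Highway names:

#include <cstddef>

constexpr size_t LanesForShift(size_t base_lanes, int kShift) {
  // kShift = log2(LMUL): m1 -> 0, m2 -> 1, m4 -> 2, m8 -> 3; fractional LMUL
  // (mf2, mf4) would use negative shifts once the intrinsics support them.
  return kShift < 0 ? (base_lanes >> -kShift) : (base_lanes << kShift);
}

static_assert(LanesForShift(64, 0) == 64, "m1 keeps the base lane count");
static_assert(LanesForShift(64, 3) == 512, "m8 multiplies lanes by 8");
static_assert(LanesForShift(64, -2) == 16, "mf4 divides lanes by 4");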
+@@ -825,19 +858,62 @@ HWY_API void Stream(const V v, D d, T* H + Store(v, d, aligned); + } + ++// ------------------------------ ScatterOffset ++ ++#define HWY_RVV_SCATTER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ ++ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ ++ HWY_RVV_V(int, SEW, LMUL) offset) { \ ++ return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ ++ base, detail::BitCastToUnsigned(offset), v); \ ++ } ++HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sx) ++#undef HWY_RVV_SCATTER ++ ++// Partial ++template ++HWY_API void ScatterOffset(VFromD> v, Simd d, ++ T* HWY_RESTRICT base, ++ VFromD, N>> offset) { ++ return ScatterOffset(v, Full(), base, offset); ++} ++ ++// ------------------------------ ScatterIndex ++ ++template ++HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, ++ const VFromD> index) { ++ return ScatterOffset(v, d, base, ShiftLeft<2>(index)); ++} ++ ++template ++HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, ++ const VFromD> index) { ++ return ScatterOffset(v, d, base, ShiftLeft<3>(index)); ++} ++ + // ------------------------------ GatherOffset + +-#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ +- const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ +- HWY_RVV_V(int, SEW, LMUL) offset) { \ +- return v##OP##ei##SEW##_v_##CHAR##SEW##m##LMUL( \ +- base, detail::BitCastToUnsigned(offset)); \ ++#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ ++ const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ ++ HWY_RVV_V(int, SEW, LMUL) offset) { \ ++ return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ ++ base, detail::BitCastToUnsigned(offset)); \ + } + HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lx) + #undef HWY_RVV_GATHER + ++// Partial ++template ++HWY_API VFromD> GatherOffset(Simd d, ++ const T* HWY_RESTRICT base, ++ VFromD, N>> offset) { ++ return GatherOffset(Full(), base, offset); ++} ++ + // ------------------------------ GatherIndex + + template +@@ -852,37 +928,101 @@ HWY_API VFromD GatherIndex(D d, const + return GatherOffset(d, base, ShiftLeft<3>(index)); + } + +-// ================================================== CONVERT ++// ------------------------------ StoreInterleaved3 + +-// ------------------------------ PromoteTo U ++#define HWY_RVV_STORE3(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API void NAME( \ ++ HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b, \ ++ HWY_RVV_V(BASE, SEW, LMUL) c, HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ ++ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ ++ const v##BASE##SEW##LMUL##x3_t triple = \ ++ vcreate_##CHAR##SEW##LMUL##x3(a, b, c); \ ++ return v##OP##e8_v_##CHAR##SEW##LMUL##x3(unaligned, triple); \ ++ } ++// Segments are limited to 8 registers, so we can only go up to LMUL=2. 
++HWY_RVV_STORE3(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved3, sseg3) ++HWY_RVV_STORE3(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved3, sseg3) + +-HWY_API Vu16m2 PromoteTo(Du16m2 /* d */, Vu8m1 v) { return vzext_vf2_u16m2(v); } +-HWY_API Vu16m4 PromoteTo(Du16m4 /* d */, Vu8m2 v) { return vzext_vf2_u16m4(v); } +-HWY_API Vu16m8 PromoteTo(Du16m8 /* d */, Vu8m4 v) { return vzext_vf2_u16m8(v); } ++#undef HWY_RVV_STORE3 + +-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, Vu8m1 v) { return vzext_vf4_u32m4(v); } +-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, Vu8m2 v) { return vzext_vf4_u32m8(v); } ++// Partial ++template ++HWY_API void StoreInterleaved3(VFromD> v0, VFromD> v1, ++ VFromD> v2, Simd /*tag*/, ++ T* unaligned) { ++ return StoreInterleaved3(v0, v1, v2, Full(), unaligned); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++#define HWY_RVV_STORE4(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API void NAME( \ ++ HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ ++ HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ ++ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \ ++ const v##BASE##SEW##LMUL##x4_t quad = \ ++ vcreate_##CHAR##SEW##LMUL##x4(v0, v1, v2, v3); \ ++ return v##OP##e8_v_##CHAR##SEW##LMUL##x4(aligned, quad); \ ++ } ++// Segments are limited to 8 registers, so we can only go up to LMUL=2. ++HWY_RVV_STORE4(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved4, sseg4) ++HWY_RVV_STORE4(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved4, sseg4) + +-HWY_API Vu32m2 PromoteTo(Du32m2 /* d */, const Vu16m1 v) { +- return vzext_vf2_u32m2(v); +-} +-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, const Vu16m2 v) { +- return vzext_vf2_u32m4(v); +-} +-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, const Vu16m4 v) { +- return vzext_vf2_u32m8(v); +-} ++#undef HWY_RVV_STORE4 + +-HWY_API Vu64m2 PromoteTo(Du64m2 /* d */, const Vu32m1 v) { +- return vzext_vf2_u64m2(v); +-} +-HWY_API Vu64m4 PromoteTo(Du64m4 /* d */, const Vu32m2 v) { +- return vzext_vf2_u64m4(v); +-} +-HWY_API Vu64m8 PromoteTo(Du64m8 /* d */, const Vu32m4 v) { +- return vzext_vf2_u64m8(v); ++// Partial ++template ++HWY_API void StoreInterleaved4(VFromD> v0, VFromD> v1, ++ VFromD> v2, VFromD> v3, ++ Simd /*tag*/, T* unaligned) { ++ return StoreInterleaved4(v0, v1, v2, v3, Full(), unaligned); + } + ++// ================================================== CONVERT ++ ++#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN) \ ++ HWY_API HWY_RVV_V(BASE, BITS, LMUL) \ ++ PromoteTo(HWY_RVV_D(CHAR, BITS, LMUL) /*d*/, \ ++ HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \ ++ return OP##CHAR##BITS##LMUL(v); \ ++ } ++ ++// TODO(janwas): GCC does not yet support fractional LMUL ++#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ ++ /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2)*/ \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1) \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2) \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4) ++ ++#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ ++ /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4)*/ \ ++ /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2)*/ \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1) \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2) ++ ++// ------------------------------ PromoteTo ++ ++HWY_RVV_PROMOTE_X2(vzext_vf2_, 
uint, u, 16, uint, 8) ++HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 32, uint, 16) ++HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 64, uint, 32) ++HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8) ++ ++HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 16, int, 8) ++HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 32, int, 16) ++HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 64, int, 32) ++HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8) ++ ++HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 32, float, 16) ++HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 64, float, 32) ++ ++// i32 to f64 ++HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32) ++ ++#undef HWY_RVV_PROMOTE_X4 ++#undef HWY_RVV_PROMOTE_X2 ++#undef HWY_RVV_PROMOTE ++ + template + HWY_API VFromD> PromoteTo(Simd d, + VFromD> v) { +@@ -901,67 +1041,6 @@ HWY_API VFromD> Promote + return BitCast(d, PromoteTo(Simd(), v)); + } + +-// ------------------------------ PromoteTo I +- +-HWY_API Vi16m2 PromoteTo(Di16m2 /* d */, Vi8m1 v) { return vsext_vf2_i16m2(v); } +-HWY_API Vi16m4 PromoteTo(Di16m4 /* d */, Vi8m2 v) { return vsext_vf2_i16m4(v); } +-HWY_API Vi16m8 PromoteTo(Di16m8 /* d */, Vi8m4 v) { return vsext_vf2_i16m8(v); } +- +-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, Vi8m1 v) { return vsext_vf4_i32m4(v); } +-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, Vi8m2 v) { return vsext_vf4_i32m8(v); } +- +-HWY_API Vi32m2 PromoteTo(Di32m2 /* d */, const Vi16m1 v) { +- return vsext_vf2_i32m2(v); +-} +-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, const Vi16m2 v) { +- return vsext_vf2_i32m4(v); +-} +-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, const Vi16m4 v) { +- return vsext_vf2_i32m8(v); +-} +- +-HWY_API Vi64m2 PromoteTo(Di64m2 /* d */, const Vi32m1 v) { +- return vsext_vf2_i64m2(v); +-} +-HWY_API Vi64m4 PromoteTo(Di64m4 /* d */, const Vi32m2 v) { +- return vsext_vf2_i64m4(v); +-} +-HWY_API Vi64m8 PromoteTo(Di64m8 /* d */, const Vi32m4 v) { +- return vsext_vf2_i64m8(v); +-} +- +-// ------------------------------ PromoteTo F +- +-HWY_API Vf32m2 PromoteTo(Df32m2 /* d */, const Vf16m1 v) { +- return vfwcvt_f_f_v_f32m2(v); +-} +-HWY_API Vf32m4 PromoteTo(Df32m4 /* d */, const Vf16m2 v) { +- return vfwcvt_f_f_v_f32m4(v); +-} +-HWY_API Vf32m8 PromoteTo(Df32m8 /* d */, const Vf16m4 v) { +- return vfwcvt_f_f_v_f32m8(v); +-} +- +-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vf32m1 v) { +- return vfwcvt_f_f_v_f64m2(v); +-} +-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vf32m2 v) { +- return vfwcvt_f_f_v_f64m4(v); +-} +-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vf32m4 v) { +- return vfwcvt_f_f_v_f64m8(v); +-} +- +-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vi32m1 v) { +- return vfwcvt_f_x_v_f64m2(v); +-} +-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vi32m2 v) { +- return vfwcvt_f_x_v_f64m4(v); +-} +-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vi32m4 v) { +- return vfwcvt_f_x_v_f64m8(v); +-} +- + // ------------------------------ DemoteTo U + + // First clamp negative numbers to zero to match x86 packus. +@@ -1062,19 +1141,19 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */, + + // ------------------------------ ConvertTo F + +-#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ + HWY_RVV_D(CHAR, SEW, LMUL) /* d */, HWY_RVV_V(int, SEW, LMUL) v) { \ +- return vfcvt_f_x_v_f##SEW##m##LMUL(v); \ ++ return vfcvt_f_x_v_f##SEW##LMUL(v); \ + } \ + /* Truncates (rounds toward zero). 
*/ \ + HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(i, SEW, LMUL) /* d */, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return vfcvt_rtz_x_f_v_i##SEW##m##LMUL(v); \ ++ return vfcvt_rtz_x_f_v_i##SEW##LMUL(v); \ + } \ + /* Uses default rounding mode. */ \ + HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return vfcvt_x_f_v_i##SEW##m##LMUL(v); \ ++ return vfcvt_x_f_v_i##SEW##LMUL(v); \ + } + + // API only requires f32 but we provide f64 for internal use (otherwise, it +@@ -1082,16 +1161,23 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */, + HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _) + #undef HWY_RVV_CONVERT + ++// Partial ++template ++HWY_API VFromD> ConvertTo(Simd /*tag*/, FromV v) { ++ return ConvertTo(Full(), v); ++} ++ + // ================================================== SWIZZLE + + // ------------------------------ Compress + +-#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ +- return v##OP##_vm_##CHAR##SEW##m##LMUL(mask, v, v); \ ++#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ ++ return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v); \ + } + ++HWY_RVV_FOREACH_UI16(HWY_RVV_COMPRESS, Compress, compress) + HWY_RVV_FOREACH_UI32(HWY_RVV_COMPRESS, Compress, compress) + HWY_RVV_FOREACH_UI64(HWY_RVV_COMPRESS, Compress, compress) + HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress) +@@ -1121,10 +1207,10 @@ HWY_API VFromD SetTableIndices(D d, + + // <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX + // to 2048! We could instead use vrgatherei16. +-#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, idx); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, idx); \ + } + + HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather) +@@ -1216,7 +1302,6 @@ HWY_API V OffsetsOf128BitBlocks(const D + using T = MakeUnsigned>; + return detail::And(iota0, static_cast(~(LanesPerBlock(d) - 1))); + } +- + } // namespace detail + + template +@@ -1244,9 +1329,9 @@ HWY_API V Broadcast(const V v) { + + // ------------------------------ GetLane + +-#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_s_##CHAR##SEW##m##LMUL##_##CHAR##SEW(v); \ ++#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); \ + } + + HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x) +@@ -1255,11 +1340,12 @@ HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetL + + // ------------------------------ ShiftLeftLanes + +-// vector = f(vector, size_t) +-#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t lanes) { \ +- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, v, lanes); \ ++// vector = f(vector, vector, size_t) ++#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) 
src, \ ++ size_t lanes) { \ ++ return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes); \ + } + + namespace detail { +@@ -1270,7 +1356,7 @@ template + HWY_API V ShiftLeftLanes(const V v) { + using D = DFromV; + const RebindToSigned di; +- const auto shifted = detail::SlideUp(v, kLanes); ++ const auto shifted = detail::SlideUp(v, v, kLanes); + // Match x86 semantics by zeroing lower lanes in 128-bit blocks + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di); + const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1); +@@ -1300,7 +1386,7 @@ template + HWY_API V ShiftRightLanes(const V v) { + using D = DFromV; + const RebindToSigned di; +- const auto shifted = detail::SlideDown(v, kLanes); ++ const auto shifted = detail::SlideDown(v, v, kLanes); + // Match x86 semantics by zeroing upper lanes in 128-bit blocks + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di); + const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1); +@@ -1342,7 +1428,7 @@ HWY_API V ConcatUpperLower(const V hi, c + template + HWY_API V ConcatLowerLower(const V hi, const V lo) { + // Move lower half into upper +- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV()) / 2); ++ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV()) / 2); + return ConcatUpperLower(hi_up, lo); + } + +@@ -1351,7 +1437,7 @@ HWY_API V ConcatLowerLower(const V hi, c + template + HWY_API V ConcatUpperUpper(const V hi, const V lo) { + // Move upper half into lower +- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV()) / 2); ++ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV()) / 2); + return ConcatUpperLower(hi, lo_down); + } + +@@ -1360,8 +1446,8 @@ HWY_API V ConcatUpperUpper(const V hi, c + template + HWY_API V ConcatLowerUpper(const V hi, const V lo) { + // Move half of both inputs to the other half +- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV()) / 2); +- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV()) / 2); ++ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV()) / 2); ++ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV()) / 2); + return ConcatUpperLower(hi_up, lo_down); + } + +@@ -1428,61 +1514,55 @@ HWY_API V Combine(const V a, const V b) + // ================================================== REDUCE + + // vector = f(vector, zero_m1) +-#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, 1) v0) { \ +- vsetvlmax_e##SEW##m##LMUL(); \ +- return Set(HWY_RVV_D(CHAR, SEW, LMUL)(), \ +- GetLane(v##OP##_vs_##CHAR##SEW##m##LMUL##_##CHAR##SEW##m1( \ +- v0, v, v0))); \ ++#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \ ++ vsetvlmax_e##SEW##LMUL(); \ ++ return Set( \ ++ HWY_RVV_D(CHAR, SEW, LMUL)(), \ ++ GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(v0, v, v0))); \ + } + + // ------------------------------ SumOfLanes + + namespace detail { +- + HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum) + HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredsum) +- + } // namespace detail + + template + HWY_API V SumOfLanes(const V v) { + using T = TFromV; +- const auto v0 = Zero(Simd()); // always m1 ++ const auto v0 = Zero(Full()); // always m1 + return detail::RedSum(v, v0); + } + + // ------------------------------ MinOfLanes + namespace detail { +- + HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu) + 
HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin) + HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin) +- + } // namespace detail + + template + HWY_API V MinOfLanes(const V v) { + using T = TFromV; +- const Simd d1; // always m1 ++ const Full d1; // always m1 + const auto neutral = Set(d1, HighestValue()); + return detail::RedMin(v, neutral); + } + + // ------------------------------ MaxOfLanes + namespace detail { +- + HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu) + HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax) + HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax) +- + } // namespace detail + + template + HWY_API V MaxOfLanes(const V v) { + using T = TFromV; +- const Simd d1; // always m1 ++ const Full d1; // always m1 + const auto neutral = Set(d1, LowestValue()); + return detail::RedMax(v, neutral); + } +@@ -1507,7 +1587,7 @@ HWY_API VFromD LoadDup128(D d, const + #define HWY_RVV_STORE_MASK_BITS(MLEN, NAME, OP) \ + HWY_API size_t StoreMaskBits(HWY_RVV_M(MLEN) m, uint8_t* p) { \ + /* LMUL=1 is always enough */ \ +- Simd d8; \ ++ Full d8; \ + const size_t num_bytes = (Lanes(d8) + MLEN - 1) / MLEN; \ + /* TODO(janwas): how to convert vbool* to vuint?*/ \ + /*Store(m, d8, p);*/ \ +@@ -1518,6 +1598,22 @@ HWY_API VFromD LoadDup128(D d, const + HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, _, _) + #undef HWY_RVV_STORE_MASK_BITS + ++// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp) ++ ++// Disallow for 8-bit because Iota is likely to overflow. ++template ++HWY_API MFromD FirstN(const D d, const size_t n) { ++ const RebindToSigned di; ++ return RebindMask(d, Lt(BitCast(di, detail::Iota0(d)), Set(di, n))); ++} ++ ++template ++HWY_API MFromD FirstN(const D d, const size_t n) { ++ const auto zero = Zero(d); ++ const auto one = Set(d, 1); ++ return Eq(detail::SlideUp(one, zero, n), one); ++} ++ + // ------------------------------ Neg + + template +@@ -1526,9 +1622,9 @@ HWY_API V Neg(const V v) { + } + + // vector = f(vector), but argument is repeated +-#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, v); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, v); \ + } + + HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn) +@@ -1565,7 +1661,6 @@ template + HWY_API auto UseInt(const V v) -> decltype(MaskFromVec(v)) { + return Lt(Abs(v), Set(DFromV(), MantissaEnd>())); + } +- + } // namespace detail + + template +@@ -1636,10 +1731,8 @@ HWY_API VFromD Iota(const D d, TFromD + // Using vwmul does not work for m8, so use mulh instead. Highway only provides + // MulHigh for 16-bit, so use a private wrapper. 
+ namespace detail { +- + HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu) + HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh) +- + } // namespace detail + + template +@@ -1649,7 +1742,7 @@ HWY_API VFromD> dw; +- return BitCast(dw, OddEven(detail::SlideUp(hi, 1), lo)); ++ return BitCast(dw, OddEven(detail::SlideUp(hi, hi, 1), lo)); + } + + // ================================================== END MACROS +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 2021-06-02 10:56:05.237904402 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -19,7 +19,6 @@ + #include + + #include // std::min +-#include + + #include "hwy/base.h" + #include "hwy/ops/shared-inl.h" +@@ -199,7 +198,7 @@ HWY_API Vec1 BroadcastSignBit(const V + template + HWY_API Mask1 RebindMask(Sisd /*tag*/, Mask1 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); +- return Mask1(m.raw); ++ return Mask1{m.bits}; + } + + // v must be 0 or FF..FF. +@@ -224,6 +223,11 @@ Vec1 VecFromMask(Sisd /* tag */, c + return v; + } + ++template ++HWY_INLINE Mask1 FirstN(Sisd /*tag*/, size_t n) { ++ return Mask1::FromBool(n != 0); ++} ++ + // Returns mask ? yes : no. + template + HWY_INLINE Vec1 IfThenElse(const Mask1 mask, const Vec1 yes, +@@ -357,9 +361,9 @@ HWY_INLINE Vec1 operator>>(const Vec1 + + template + HWY_INLINE Vec1 operator+(Vec1 a, Vec1 b) { +- const uint64_t a64 = static_cast(a.raw); +- const uint64_t b64 = static_cast(b.raw); +- return Vec1((a64 + b64) & ~T(0)); ++ const uint64_t a64 = static_cast(a.raw); ++ const uint64_t b64 = static_cast(b.raw); ++ return Vec1(static_cast((a64 + b64) & static_cast(~T(0)))); + } + HWY_INLINE Vec1 operator+(const Vec1 a, const Vec1 b) { + return Vec1(a.raw + b.raw); +@@ -370,9 +374,9 @@ HWY_INLINE Vec1 operator+(const + + template + HWY_INLINE Vec1 operator-(Vec1 a, Vec1 b) { +- const uint64_t a64 = static_cast(a.raw); +- const uint64_t b64 = static_cast(b.raw); +- return Vec1((a64 - b64) & ~T(0)); ++ const uint64_t a64 = static_cast(a.raw); ++ const uint64_t b64 = static_cast(b.raw); ++ return Vec1(static_cast((a64 - b64) & static_cast(~T(0)))); + } + HWY_INLINE Vec1 operator-(const Vec1 a, const Vec1 b) { + return Vec1(a.raw - b.raw); +@@ -388,21 +392,25 @@ HWY_INLINE Vec1 operator-(const + // Unsigned + HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); + } + HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535))); + } + + // Signed + HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); + } + HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767))); + } + + // ------------------------------ Saturating subtraction +@@ -412,21 
+420,25 @@ HWY_INLINE Vec1 SaturatedAdd(co + // Unsigned + HWY_INLINE Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); + } + HWY_INLINE Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535))); + } + + // Signed + HWY_INLINE Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); + } + HWY_INLINE Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767))); + } + + // ------------------------------ Average +@@ -435,11 +447,11 @@ HWY_INLINE Vec1 SaturatedSub(co + + HWY_INLINE Vec1 AverageRound(const Vec1 a, + const Vec1 b) { +- return Vec1((a.raw + b.raw + 1) / 2); ++ return Vec1(static_cast((a.raw + b.raw + 1) / 2)); + } + HWY_INLINE Vec1 AverageRound(const Vec1 a, + const Vec1 b) { +- return Vec1((a.raw + b.raw + 1) / 2); ++ return Vec1(static_cast((a.raw + b.raw + 1) / 2)); + } + + // ------------------------------ Absolute value +@@ -514,15 +526,15 @@ HWY_INLINE Vec1 operator/(const Vec1< + + // Returns the upper 16 bits of a * b in each lane. + HWY_INLINE Vec1 MulHigh(const Vec1 a, const Vec1 b) { +- return Vec1((a.raw * b.raw) >> 16); ++ return Vec1(static_cast((a.raw * b.raw) >> 16)); + } + HWY_INLINE Vec1 MulHigh(const Vec1 a, + const Vec1 b) { + // Cast to uint32_t first to prevent overflow. Otherwise the result of + // uint16_t * uint16_t is in "int" which may overflow. In practice the result + // is the same but this way it is also defined. +- return Vec1( +- (static_cast(a.raw) * static_cast(b.raw)) >> 16); ++ return Vec1(static_cast( ++ (static_cast(a.raw) * static_cast(b.raw)) >> 16)); + } + + // Multiplies even lanes (0, 2 ..) and returns the double-wide result. +@@ -617,6 +629,31 @@ HWY_INLINE Vec1 Round(const Vec1 v + return Vec1(static_cast(rounded)); + } + ++// Round-to-nearest even. ++HWY_INLINE Vec1 NearestInt(const Vec1 v) { ++ using T = float; ++ using TI = int32_t; ++ ++ const T abs = Abs(v).raw; ++ const bool signbit = std::signbit(v.raw); ++ ++ if (!(abs < MantissaEnd())) { // Huge or NaN ++ // Check if too large to cast or NaN ++ if (!(abs <= static_cast(LimitsMax()))) { ++ return Vec1(signbit ? LimitsMin() : LimitsMax()); ++ } ++ return Vec1(static_cast(v.raw)); ++ } ++ const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5); ++ const TI rounded = static_cast(v.raw + bias); ++ if (rounded == 0) return Vec1(0); ++ // Round to even ++ if ((rounded & 1) && std::abs(static_cast(rounded) - v.raw) == T(0.5)) { ++ return Vec1(rounded - (signbit ? -1 : 1)); ++ } ++ return Vec1(rounded); ++} ++ + template + HWY_INLINE Vec1 Trunc(const Vec1 v) { + using TI = MakeSigned; +@@ -641,7 +678,8 @@ V Ceiling(const V v) { + Bits bits; + CopyBytes(&v, &bits); + +- const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias; ++ const int exponent = ++ static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) return v; + // |v| <= 1 => 0 or 1. 
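The NearestInt added in the hunk above replaces the earlier truncating approximation with true round-to-nearest-even. A condensed scalar restatement of its tie-breaking step; the overflow/NaN guards present in the patch are omitted here for brevity:

#include <cmath>
#include <cstdint>

int32_t RoundNearestEven(float v) {
  const float bias = v < 0.0f ? -0.5f : 0.5f;
  const int32_t rounded = static_cast<int32_t>(v + bias);
  if (rounded == 0) return 0;
  // A tie (exactly 0.5 away) that landed on an odd value is pulled back to even.
  if ((rounded & 1) && std::fabs(static_cast<float>(rounded) - v) == 0.5f) {
    return rounded - (std::signbit(v) ? -1 : 1);
  }
  return rounded;  // e.g. 2.5f -> 2, 3.5f -> 4, -2.5f -> -2
}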
+@@ -672,7 +710,8 @@ V Floor(const V v) { + Bits bits; + CopyBytes(&v, &bits); + +- const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias; ++ const int exponent = ++ static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) return v; + // |v| <= 1 => -1 or 0. +@@ -772,6 +811,26 @@ HWY_INLINE void StoreU(const Vec1 v, + return Store(v, d, p); + } + ++// ------------------------------ StoreInterleaved3 ++ ++HWY_API void StoreInterleaved3(const Vec1 v0, const Vec1 v1, ++ const Vec1 v2, Sisd d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ StoreU(v0, d, unaligned + 0); ++ StoreU(v1, d, unaligned + 1); ++ StoreU(v2, d, unaligned + 2); ++} ++ ++HWY_API void StoreInterleaved4(const Vec1 v0, const Vec1 v1, ++ const Vec1 v2, const Vec1 v3, ++ Sisd d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ StoreU(v0, d, unaligned + 0); ++ StoreU(v1, d, unaligned + 1); ++ StoreU(v2, d, unaligned + 2); ++ StoreU(v3, d, unaligned + 3); ++} ++ + // ------------------------------ Stream + + template +@@ -779,12 +838,29 @@ HWY_INLINE void Stream(const Vec1 v, + return Store(v, d, aligned); + } + ++// ------------------------------ Scatter ++ ++template ++HWY_INLINE void ScatterOffset(Vec1 v, Sisd d, T* base, ++ const Vec1 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ uint8_t* const base8 = reinterpret_cast(base) + offset.raw; ++ return Store(v, d, reinterpret_cast(base8)); ++} ++ ++template ++HWY_INLINE void ScatterIndex(Vec1 v, Sisd d, T* HWY_RESTRICT base, ++ const Vec1 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ return Store(v, d, base + index.raw); ++} ++ + // ------------------------------ Gather + + template + HWY_INLINE Vec1 GatherOffset(Sisd d, const T* base, + const Vec1 offset) { +- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs"); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + const uintptr_t addr = reinterpret_cast(base) + offset.raw; + return Load(d, reinterpret_cast(addr)); + } +@@ -792,7 +868,7 @@ HWY_INLINE Vec1 GatherOffset(Sisd + template + HWY_INLINE Vec1 GatherIndex(Sisd d, const T* HWY_RESTRICT base, + const Vec1 index) { +- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx"); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return Load(d, base + index.raw); + } + +@@ -833,15 +909,20 @@ HWY_INLINE Vec1 DemoteTo(Sisd + + static HWY_INLINE Vec1 PromoteTo(Sisd /* tag */, + const Vec1 v) { ++#if HWY_NATIVE_FLOAT16 + uint16_t bits16; + CopyBytes<2>(&v.raw, &bits16); ++#else ++ const uint16_t bits16 = v.raw.bits; ++#endif + const uint32_t sign = bits16 >> 15; + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { +- const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024)); ++ const float subnormal = ++ (1.0f / 16384) * (static_cast(mantissa) * (1.0f / 1024)); + return Vec1(sign ? -subnormal : subnormal); + } + +@@ -867,8 +948,12 @@ static HWY_INLINE Vec1 Demote + // Tiny or zero => zero. 
+ Vec1 out; + if (exp < -24) { +- bits32 = 0; +- CopyBytes<2>(&bits32, &out); ++#if HWY_NATIVE_FLOAT16 ++ const uint16_t zero = 0; ++ CopyBytes<2>(&zero, &out.raw); ++#else ++ out.raw.bits = 0; ++#endif + return out; + } + +@@ -890,7 +975,12 @@ static HWY_INLINE Vec1 Demote + HWY_DASSERT(mantissa16 < 1024); + const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; + HWY_DASSERT(bits16 < 0x10000); +- CopyBytes<2>(&bits16, &out); ++#if HWY_NATIVE_FLOAT16 ++ const uint16_t narrowed = static_cast(bits16); // big-endian safe ++ CopyBytes<2>(&narrowed, &out.raw); ++#else ++ out.raw.bits = static_cast(bits16); ++#endif + return out; + } + +@@ -919,18 +1009,6 @@ HWY_INLINE Vec1 U8FromU32(const + return DemoteTo(Sisd(), v); + } + +-// Approximation of round-to-nearest for numbers representable as int32_t. +-HWY_INLINE Vec1 NearestInt(const Vec1 v) { +- const float f = v.raw; +- if (std::isinf(f) || +- std::fabs(f) > static_cast(LimitsMax())) { +- return Vec1(std::signbit(f) ? LimitsMin() +- : LimitsMax()); +- } +- const float bias = f < 0.0f ? -0.5f : 0.5f; +- return Vec1(static_cast(f + bias)); +-} +- + // ================================================== SWIZZLE + + // Unsupported: Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle*, +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 2021-06-02 10:56:05.224904336 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -31,11 +31,6 @@ + #undef HWY_ALIGN + #undef HWY_LANES + +-#undef HWY_GATHER_LANES +-#undef HWY_VARIABLE_SHIFT_LANES +-#undef HWY_COMPARE64_LANES +-#undef HWY_MINMAX64_LANES +- + #undef HWY_CAP_INTEGER64 + #undef HWY_CAP_FLOAT64 + #undef HWY_CAP_GE256 +@@ -53,11 +48,6 @@ + #define HWY_ALIGN alignas(16) + #define HWY_LANES(T) (16 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) 1 +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 2 +-#define HWY_MINMAX64_LANES 1 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 0 +@@ -73,11 +63,6 @@ + #define HWY_ALIGN alignas(32) + #define HWY_LANES(T) (32 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) HWY_LANES(T) +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 4 +-#define HWY_MINMAX64_LANES 1 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 1 +@@ -96,11 +81,6 @@ + #define HWY_ALIGN alignas(64) + #define HWY_LANES(T) (64 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) HWY_LANES(T) +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 8 +-#define HWY_MINMAX64_LANES 8 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 1 +@@ -121,11 +101,6 @@ + #define HWY_ALIGN alignas(16) + #define HWY_LANES(T) (16 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) 1 +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 2 +-#define HWY_MINMAX64_LANES 2 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 0 +@@ -142,19 +117,14 @@ + #define HWY_ALIGN alignas(16) + #define HWY_LANES(T) (16 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) 1 +-#define 
HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_MINMAX64_LANES 2 +-#define HWY_COMPARE64_LANES 2 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_GE256 0 + #define HWY_CAP_GE512 0 + +-#ifdef __arm__ +-#define HWY_CAP_FLOAT64 0 +-#else ++#if HWY_ARCH_ARM_A64 + #define HWY_CAP_FLOAT64 1 ++#else ++#define HWY_CAP_FLOAT64 0 + #endif + + #define HWY_NAMESPACE N_NEON +@@ -162,17 +132,34 @@ + // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. + + //----------------------------------------------------------------------------- ++// SVE[2] ++#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE ++ ++// SVE only requires lane alignment, not natural alignment of the entire vector. ++#define HWY_ALIGN alignas(8) ++// Upper bound, not the actual lane count! ++#define HWY_LANES(T) (256 / sizeof(T)) ++ ++#define HWY_CAP_INTEGER64 1 ++#define HWY_CAP_FLOAT64 1 ++#define HWY_CAP_GE256 0 ++#define HWY_CAP_GE512 0 ++ ++#if HWY_TARGET == HWY_SVE2 ++#define HWY_NAMESPACE N_SVE2 ++#else ++#define HWY_NAMESPACE N_SVE ++#endif ++ ++// HWY_TARGET_STR remains undefined - TODO(janwas): attribute for SVE? ++ ++//----------------------------------------------------------------------------- + // WASM + #elif HWY_TARGET == HWY_WASM + + #define HWY_ALIGN alignas(16) + #define HWY_LANES(T) (16 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) 1 +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 2 +-#define HWY_MINMAX64_LANES 2 +- + #define HWY_CAP_INTEGER64 0 + #define HWY_CAP_FLOAT64 0 + #define HWY_CAP_GE256 0 +@@ -194,11 +181,6 @@ + // mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h. + #define HWY_LANES(T) (4096 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) HWY_LANES(T) +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-// Cannot use HWY_LANES/sizeof here because these are used in an #if. +-#define HWY_COMPARE64_LANES 256 +-#define HWY_MINMAX64_LANES 256 + + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 +@@ -215,13 +197,9 @@ + #elif HWY_TARGET == HWY_SCALAR + + #define HWY_ALIGN ++// For internal use only; use Lanes(d) instead. + #define HWY_LANES(T) 1 + +-#define HWY_GATHER_LANES(T) 1 +-#define HWY_VARIABLE_SHIFT_LANES(T) 1 +-#define HWY_COMPARE64_LANES 1 +-#define HWY_MINMAX64_LANES 1 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 0 +@@ -265,3 +243,7 @@ + #else + #define HWY_ATTR + #endif ++ ++// DEPRECATED ++#undef HWY_GATHER_LANES ++#define HWY_GATHER_LANES(T) HWY_LANES(T) +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 2021-06-02 10:56:05.235904392 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -14,6 +14,8 @@ + + // Per-target definitions shared by ops/*.h and user code. + ++#include ++ + // Separate header because foreach_target.h re-enables its include guard. + #include "hwy/ops/set_macros-inl.h" + +@@ -106,7 +108,7 @@ HWY_INLINE HWY_MAYBE_UNUSED constexpr si + } + + // Targets with non-constexpr Lanes define this themselves. 
+-#if HWY_TARGET != HWY_RVV ++#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE + + // (Potentially) non-constant actual size of the vector at runtime, subject to + // the limit imposed by the Simd. Useful for advancing loop counters. +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 2021-06-02 10:56:05.242904427 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -19,8 +19,6 @@ + #include + #include + +-#include +- + #include "hwy/base.h" + #include "hwy/ops/shared-inl.h" + +@@ -177,6 +175,16 @@ HWY_API Vec128 Undefined(Simd ++Vec128 Iota(const Simd d, const T2 first) { ++ HWY_ALIGN T lanes[16 / sizeof(T)]; ++ for (size_t i = 0; i < 16 / sizeof(T); ++i) { ++ lanes[i] = static_cast(first + static_cast(i)); ++ } ++ return Load(d, lanes); ++} ++ + // ================================================== ARITHMETIC + + // ------------------------------ Addition +@@ -273,24 +281,24 @@ HWY_API Vec128 operator-(const + template + HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_u8x16_add_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_u8x16_add_sat(a.raw, b.raw)}; + } + template + HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_u16x8_add_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_u16x8_add_sat(a.raw, b.raw)}; + } + + // Signed + template + HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_i8x16_add_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_i8x16_add_sat(a.raw, b.raw)}; + } + template + HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_i16x8_add_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_i16x8_add_sat(a.raw, b.raw)}; + } + + // ------------------------------ Saturating subtraction +@@ -301,24 +309,24 @@ HWY_API Vec128 SaturatedAdd( + template + HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_u8x16_sub_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_u8x16_sub_sat(a.raw, b.raw)}; + } + template + HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_u16x8_sub_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_u16x8_sub_sat(a.raw, b.raw)}; + } + + // Signed + template + HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_i8x16_sub_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_i8x16_sub_sat(a.raw, b.raw)}; + } + template + HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_i16x8_sub_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_i16x8_sub_sat(a.raw, b.raw)}; + } + + // ------------------------------ Average +@@ -352,6 +360,12 @@ template + HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{wasm_i32x4_abs(v.raw)}; + } ++template ++HWY_API Vec128 Abs(const Vec128 v) { ++ // TODO(janwas): use wasm_i64x2_abs when available ++ const Vec128 mask = wasm_i64x2_shr(v.raw, 63); ++ return ((v ^ mask) - mask); ++} + + template + HWY_API Vec128 Abs(const Vec128 v) { +@@ -396,9 +410,38 @@ HWY_API Vec128 ShiftRight(co + return Vec128{wasm_i32x4_shr(v.raw, kBits)}; + } + ++// 8-bit ++template ++HWY_API Vec128 
ShiftLeft(const Vec128 v) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; ++ return kBits == 1 ++ ? (v + v) ++ : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); ++} ++ ++template ++HWY_API Vec128 ShiftRight(const Vec128 v) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftRight(Vec128{v.raw}).raw}; ++ return shifted & Set(d8, 0xFF >> kBits); ++} ++ ++template ++HWY_API Vec128 ShiftRight(const Vec128 v) { ++ const Simd di; ++ const Simd du; ++ const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Shift lanes by same variable #bits + +-// Unsigned (no u8) ++// Unsigned + template + HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { +@@ -420,7 +463,7 @@ HWY_API Vec128 ShiftRightSa + return Vec128{wasm_u32x4_shr(v.raw, bits)}; + } + +-// Signed (no i8) ++// Signed + template + HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { +@@ -442,6 +485,35 @@ HWY_API Vec128 ShiftRightSam + return Vec128{wasm_i32x4_shr(v.raw, bits)}; + } + ++// 8-bit ++template ++HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftLeftSame(Vec128>{v.raw}, bits).raw}; ++ return shifted & Set(d8, (0xFF << bits) & 0xFF); ++} ++ ++template ++HWY_API Vec128 ShiftRightSame(Vec128 v, ++ const int bits) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftRightSame(Vec128{v.raw}, bits).raw}; ++ return shifted & Set(d8, 0xFF >> bits); ++} ++ ++template ++HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { ++ const Simd di; ++ const Simd du; ++ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Minimum + + // Unsigned +@@ -607,29 +679,29 @@ template + HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + // TODO(eustas): replace, when implemented in WASM. +- const auto al = wasm_i32x4_widen_low_u16x8(a.raw); +- const auto ah = wasm_i32x4_widen_high_u16x8(a.raw); +- const auto bl = wasm_i32x4_widen_low_u16x8(b.raw); +- const auto bh = wasm_i32x4_widen_high_u16x8(b.raw); ++ const auto al = wasm_u32x4_extend_low_u16x8(a.raw); ++ const auto ah = wasm_u32x4_extend_high_u16x8(a.raw); ++ const auto bl = wasm_u32x4_extend_low_u16x8(b.raw); ++ const auto bh = wasm_u32x4_extend_high_u16x8(b.raw); + const auto l = wasm_i32x4_mul(al, bl); + const auto h = wasm_i32x4_mul(ah, bh); + // TODO(eustas): shift-right + narrow? + return Vec128{ +- wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; ++ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; + } + template + HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + // TODO(eustas): replace, when implemented in WASM. 
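(Editorial note, not part of the patch.) The 8-bit ShiftRight overloads added above emulate an arithmetic shift using only a logical shift plus a sign-correction step. A minimal scalar sketch of that trick, with a made-up function name:

#include <cstdint>

// Arithmetic >> for int8_t built from a logical shift, mirroring the
// (shifted ^ shifted_sign) - shifted_sign pattern used in the patch above.
int8_t ArithmeticShiftRight8(int8_t v, int bits) {
  const uint8_t shifted = static_cast<uint8_t>(static_cast<uint8_t>(v) >> bits);
  const uint8_t sign = static_cast<uint8_t>(0x80 >> bits);  // shifted sign bit
  return static_cast<int8_t>((shifted ^ sign) - sign);      // re-extend the sign
}

For a negative input the shifted sign bit is set, so the xor clears it and the subtraction propagates ones into the vacated high bits; for a non-negative input both steps cancel out.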
+- const auto al = wasm_i32x4_widen_low_i16x8(a.raw); +- const auto ah = wasm_i32x4_widen_high_i16x8(a.raw); +- const auto bl = wasm_i32x4_widen_low_i16x8(b.raw); +- const auto bh = wasm_i32x4_widen_high_i16x8(b.raw); ++ const auto al = wasm_i32x4_extend_low_i16x8(a.raw); ++ const auto ah = wasm_i32x4_extend_high_i16x8(a.raw); ++ const auto bl = wasm_i32x4_extend_low_i16x8(b.raw); ++ const auto bh = wasm_i32x4_extend_high_i16x8(b.raw); + const auto l = wasm_i32x4_mul(al, bl); + const auto h = wasm_i32x4_mul(ah, bh); + // TODO(eustas): shift-right + narrow? + return Vec128{ +- wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; ++ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; + } + + // Multiplies even lanes (0, 2 ..) and returns the double-width result. +@@ -765,53 +837,76 @@ HWY_API Vec128 ApproximateReci + // Toward nearest integer, ties to even + template + HWY_API Vec128 Round(const Vec128 v) { +- // TODO(eustas): is it f32x4.nearest? (not implemented yet) +- alignas(16) float input[4]; +- alignas(16) float output[4]; +- wasm_v128_store(input, v.raw); +- for (size_t i = 0; i < 4; ++i) { +- output[i] = std::nearbyint(input[i]); +- } +- return Vec128{wasm_v128_load(output)}; ++ // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not ++ // yet have an instruction for that (f32x4.nearest is not implemented). We ++ // rely on rounding after addition with a large value such that no mantissa ++ // bits remain (assuming the current mode is nearest-even). We may need a ++ // compiler flag for precise floating-point to prevent "optimizing" this out. ++ const Simd df; ++ const auto max = Set(df, MantissaEnd()); ++ const auto large = CopySignToAbs(max, v); ++ const auto added = large + v; ++ const auto rounded = added - large; ++ ++ // Keep original if NaN or the magnitude is large (already an int). ++ return IfThenElse(Abs(v) < max, rounded, v); + } + ++namespace detail { ++ ++// Truncating to integer and converting back to float is correct except when the ++// input magnitude is large, in which case the input was already an integer ++// (because mantissa >> exponent is zero). ++template ++HWY_API Mask128 UseInt(const Vec128 v) { ++ return Abs(v) < Set(Simd(), MantissaEnd()); ++} ++ ++} // namespace detail ++ + // Toward zero, aka truncate + template + HWY_API Vec128 Trunc(const Vec128 v) { + // TODO(eustas): is it f32x4.trunc? (not implemented yet) +- alignas(16) float input[4]; +- alignas(16) float output[4]; +- wasm_v128_store(input, v.raw); +- for (size_t i = 0; i < 4; ++i) { +- output[i] = std::trunc(input[i]); +- } +- return Vec128{wasm_v128_load(output)}; ++ const Simd df; ++ const RebindToSigned di; ++ ++ const auto integer = ConvertTo(di, v); // round toward 0 ++ const auto int_f = ConvertTo(df, integer); ++ ++ return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); + } + + // Toward +infinity, aka ceiling + template +-HWY_API Vec128 Ceil(const Vec128 v) { ++HWY_INLINE Vec128 Ceil(const Vec128 v) { + // TODO(eustas): is it f32x4.ceil? (not implemented yet) +- alignas(16) float input[4]; +- alignas(16) float output[4]; +- wasm_v128_store(input, v.raw); +- for (size_t i = 0; i < 4; ++i) { +- output[i] = std::ceil(input[i]); +- } +- return Vec128{wasm_v128_load(output)}; ++ const Simd df; ++ const RebindToSigned di; ++ ++ const auto integer = ConvertTo(di, v); // round toward 0 ++ const auto int_f = ConvertTo(df, integer); ++ ++ // Truncating a positive non-integer ends up smaller; if so, add 1. 
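(Editorial note, not part of the patch.) The rewritten Round() above relies on adding a value so large that no mantissa bits remain, then subtracting it again. A scalar illustration of the same idea, assuming MantissaEnd<float>() is 2^23 and the rounding mode is nearest-even; the volatile guards against the reassociation the patch comment also warns about:

#include <cmath>

float RoundNearestEven(float f) {
  const float kMantissaEnd = 8388608.0f;  // 2^23: beyond this, every float is integral
  if (!(std::fabs(f) < kMantissaEnd)) return f;   // large magnitude or NaN: keep as-is
  const float large = std::copysign(kMantissaEnd, f);
  volatile float sum = large + f;  // FP addition drops the fraction, ties to even
  return sum - large;
}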
++ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); ++ ++ return IfThenElse(detail::UseInt(v), int_f - neg1, v); + } + + // Toward -infinity, aka floor + template +-HWY_API Vec128 Floor(const Vec128 v) { ++HWY_INLINE Vec128 Floor(const Vec128 v) { + // TODO(eustas): is it f32x4.floor? (not implemented yet) +- alignas(16) float input[4]; +- alignas(16) float output[4]; +- wasm_v128_store(input, v.raw); +- for (size_t i = 0; i < 4; ++i) { +- output[i] = std::floor(input[i]); +- } +- return Vec128{wasm_v128_load(output)}; ++ const Simd df; ++ const RebindToSigned di; ++ ++ const auto integer = ConvertTo(di, v); // round toward 0 ++ const auto int_f = ConvertTo(df, integer); ++ ++ // Truncating a negative non-integer ends up larger; if so, subtract 1. ++ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); ++ ++ return IfThenElse(detail::UseInt(v), int_f + neg1, v); + } + + // ================================================== COMPARE +@@ -902,12 +997,12 @@ HWY_API Mask128 operator>(co + + // Otherwise, the lower half decides. + const auto m_eq = a32 == b32; +- const auto lo_in_hi = wasm_v32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0); ++ const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0); + const auto lo_gt = And(m_eq, lo_in_hi); + + const auto gt = Or(lo_gt, m_gt); + // Copy result in upper 32 bits to lower 32 bits. +- return Mask128{wasm_v32x4_shuffle(gt, gt, 3, 3, 1, 1)}; ++ return Mask128{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)}; + } + + template +@@ -935,6 +1030,14 @@ HWY_API Mask128 operator>=(con + return Mask128{wasm_f32x4_ge(a.raw, b.raw)}; + } + ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask128 FirstN(const Simd d, size_t num) { ++ const RebindToSigned di; // Signed comparisons may be cheaper. 
++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); ++} ++ + // ================================================== LOGICAL + + // ------------------------------ Not +@@ -1015,7 +1118,7 @@ HWY_API Vec128 BroadcastSignBit(co + } + template + HWY_API Vec128 BroadcastSignBit(const Vec128 v) { +- return VecFromMask(v < Zero(Simd())); ++ return VecFromMask(Simd(), v < Zero(Simd())); + } + + // ------------------------------ Mask +@@ -1278,26 +1381,73 @@ HWY_API void Stream(Vec128 v, Simd + wasm_v128_store(aligned, v.raw); + } + +-// ------------------------------ Gather ++// ------------------------------ Scatter (Store) ++ ++template ++HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); ++ } ++} ++ ++template ++HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ for (size_t i = 0; i < N; ++i) { ++ base[index_lanes[i]] = lanes[i]; ++ } ++} ++ ++// ------------------------------ Gather (Load/Store) + + template + HWY_API Vec128 GatherOffset(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 offset) { +- static_assert(N == 1, "Wasm does not support full gather"); +- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); +- const uintptr_t address = reinterpret_cast(base) + GetLane(offset); +- T val; +- CopyBytes(reinterpret_cast(address), &val); +- return Set(d, val); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ alignas(16) T lanes[N]; ++ const uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); ++ } ++ return Load(d, lanes); + } + + template + HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, + const Vec128 index) { +- static_assert(N == 1, "Wasm does not support full gather"); +- static_assert(sizeof(T) == sizeof(Index), "T must match Index"); +- return Set(d, base[GetLane(index)]); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ alignas(16) T lanes[N]; ++ for (size_t i = 0; i < N; ++i) { ++ lanes[i] = base[index_lanes[i]]; ++ } ++ return Load(d, lanes); + } + + // ================================================== SWIZZLE +@@ -1346,12 +1496,12 @@ HWY_API Vec128 LowerHalf(Vec12 + template + HWY_API Vec128 UpperHalf(Vec128 v) { + // TODO(eustas): use swizzle? +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; + } + template <> + HWY_INLINE Vec128 UpperHalf(Vec128 v) { + // TODO(eustas): use swizzle? 
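(Editorial note, not part of the patch.) GatherOffset/GatherIndex above are emulated with per-lane scalar loads: offsets are byte offsets from base, indices are lane indices. A scalar reference for the offset flavor, with made-up names:

#include <cstddef>
#include <cstdint>
#include <cstring>

void GatherOffsetScalar(const float* base, const int32_t* byte_offsets,
                        size_t n, float* out) {
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < n; ++i) {
    std::memcpy(&out[i], base_bytes + byte_offsets[i], sizeof(float));  // per-lane load
  }
}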
+- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; + } + + // ------------------------------ Shift vector by constant #bytes +@@ -1366,64 +1516,64 @@ HWY_API Vec128 ShiftLeftBytes(const V + return v; + + case 1: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14)}; + + case 2: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13)}; + + case 3: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12)}; + + case 4: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11)}; + + case 5: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10)}; + + case 6: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; + + case 7: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; + + case 8: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; + + case 9: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; + + case 10: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; + + case 11: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; + + case 12: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; + + case 13: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; + + case 14: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 0, + 1)}; + + case 15: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 0)}; + } +@@ -1447,69 +1597,69 @@ HWY_API Vec128 ShiftRightBytes(const + return v; + + case 1: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16)}; + + case 2: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 2, 
3, 4, 5, 6, 7, 8, 9, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 16)}; + + case 3: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 16, 16)}; + + case 4: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 16, 16, 16)}; + + case 5: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 16, 16, 16, 16)}; + + case 6: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16, 16, 16, 16, 16)}; + + case 7: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 16, 16, 16, 16, 16, 16)}; + + case 8: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 16, 16, 16, 16, 16, 16, 16)}; + + case 9: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, + 15, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 10: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 11: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 12: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 13: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 14: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 15: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + } +@@ -1535,72 +1685,72 @@ HWY_API Vec128 CombineShiftRightBytes + return lo; + + case 1: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16)}; + + case 2: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17)}; + + case 3: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18)}; + + case 4: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, 
hi.raw, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19)}; + + case 5: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20)}; + + case 6: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21)}; + + case 7: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22)}; + + case 8: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23)}; + + case 9: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24)}; + + case 10: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25)}; + + case 11: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26)}; + + case 12: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27)}; + + case 13: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28)}; + + case 14: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29)}; + + case 15: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30)}; + } +@@ -1613,28 +1763,28 @@ HWY_API Vec128 CombineShiftRightBytes + template + HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); +- return Vec128{wasm_v16x8_shuffle( ++ return Vec128{wasm_i16x8_shuffle( + v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; + } + template + HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{ +- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; ++ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; + } + + // Signed + template + HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); +- return Vec128{wasm_v16x8_shuffle( ++ return Vec128{wasm_i16x8_shuffle( + v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; + } + template + HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{ +- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; ++ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; + } + + // Float +@@ -1642,7 +1792,7 @@ template + HWY_API Vec128 
Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{ +- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; ++ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; + } + + // ------------------------------ Shuffle bytes with variable indices +@@ -1652,16 +1802,23 @@ HWY_API Vec128 Broadcast(const + template + HWY_API Vec128 TableLookupBytes(const Vec128 bytes, + const Vec128 from) { +- // TODO(eustas): use swizzle? (shuffle does not work for variable indices) ++// Not yet available in all engines, see ++// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md ++// V8 implementation of this had a bug, fixed on 2021-04-03: ++// https://chromium-review.googlesource.com/c/v8/v8/+/2822951 ++#if 0 ++ return Vec128{wasm_i8x16_swizzle(bytes.raw, from.raw)}; ++#else + alignas(16) uint8_t control[16]; + alignas(16) uint8_t input[16]; + alignas(16) uint8_t output[16]; + wasm_v128_store(control, from.raw); + wasm_v128_store(input, bytes.raw); + for (size_t i = 0; i < 16; ++i) { +- output[i] = input[control[i]]; ++ output[i] = control[i] < 16 ? input[control[i]] : 0; + } + return Vec128{wasm_v128_load(output)}; ++#endif + } + + // ------------------------------ Hard-coded shuffles +@@ -1673,101 +1830,102 @@ HWY_API Vec128 TableLookupBytes(co + + // Swap 32-bit halves in 64-bit halves. + HWY_API Vec128 Shuffle2301(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; + } + HWY_API Vec128 Shuffle2301(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; + } + HWY_API Vec128 Shuffle2301(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; + } + + // Swap 64-bit halves + HWY_API Vec128 Shuffle1032(const Vec128 v) { +- return Vec128{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)}; ++ return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; + } + HWY_API Vec128 Shuffle1032(const Vec128 v) { +- return Vec128{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)}; ++ return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; + } + HWY_API Vec128 Shuffle1032(const Vec128 v) { +- return Vec128{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)}; ++ return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; + } + + // Rotate right 32 bits + HWY_API Vec128 Shuffle0321(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; + } + HWY_API Vec128 Shuffle0321(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; + } + HWY_API Vec128 Shuffle0321(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; + } + // Rotate left 32 bits + HWY_API Vec128 Shuffle2103(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; + } + HWY_API Vec128 Shuffle2103(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; + } + HWY_API Vec128 Shuffle2103(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; 
++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; + } + + // Reverse + HWY_API Vec128 Shuffle0123(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; + } + HWY_API Vec128 Shuffle0123(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; + } + HWY_API Vec128 Shuffle0123(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; + } + + // ------------------------------ TableLookupLanes + + // Returned by SetTableIndices for use by TableLookupLanes. +-template ++template + struct Indices128 { + __v128_u raw; + }; + +-template +-HWY_API Indices128 SetTableIndices(Full128, const int32_t* idx) { ++template ++HWY_API Indices128 SetTableIndices(Simd d, const int32_t* idx) { + #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) +- const size_t N = 16 / sizeof(T); + for (size_t i = 0; i < N; ++i) { + HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); + } + #endif + +- const Full128 d8; +- alignas(16) uint8_t control[16]; // = Lanes() +- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { +- const size_t idx_lane = idx_byte / sizeof(T); +- const size_t mod = idx_byte % sizeof(T); +- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; ++ const Repartition d8; ++ alignas(16) uint8_t control[16] = {0}; ++ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { ++ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { ++ control[idx_lane * sizeof(T) + idx_byte] = ++ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); ++ } + } +- return Indices128{Load(d8, control).raw}; ++ return Indices128{Load(d8, control).raw}; + } + +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +- +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +- +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- const Full128 di; +- const Full128 df; ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ const Simd di; ++ const Simd df; + return BitCast(df, +- TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); ++ TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); + } + + // ------------------------------ Zip lanes +@@ -1778,33 +1936,33 @@ HWY_API Vec128 TableLookupLanes(c + template + HWY_API Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v8x16_shuffle( ++ return Vec128{wasm_i8x16_shuffle( + a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; + } + template + HWY_API Vec128 ZipLower(const Vec128 a, + const Vec128 b) { + return Vec128{ +- wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; ++ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; + } + + template + HWY_API Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v8x16_shuffle( ++ return Vec128{wasm_i8x16_shuffle( + a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; + } + template + HWY_API Vec128 
ZipLower(const Vec128 a, + const Vec128 b) { + return Vec128{ +- wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; ++ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; + } + + template + HWY_API Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, ++ return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, + 10, 26, 11, 27, 12, 28, 13, + 29, 14, 30, 15, 31)}; + } +@@ -1812,13 +1970,13 @@ template + HWY_API Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { + return Vec128{ +- wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; ++ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; + } + + template + HWY_API Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, ++ return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, + 10, 26, 11, 27, 12, 28, 13, + 29, 14, 30, 15, 31)}; + } +@@ -1826,7 +1984,7 @@ template + HWY_API Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { + return Vec128{ +- wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; ++ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; + } + + // ------------------------------ Interleave lanes +@@ -1842,17 +2000,17 @@ HWY_API Vec128 InterleaveLower(const + template <> + HWY_INLINE Vec128 InterleaveLower( + const Vec128 a, const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; + } + template <> + HWY_INLINE Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; + } + template <> + HWY_INLINE Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; + } + + template +@@ -1862,17 +2020,17 @@ HWY_API Vec128 InterleaveUpper(const + template <> + HWY_INLINE Vec128 InterleaveUpper( + const Vec128 a, const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; + } + template <> + HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; + } + template <> + HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; + } + + // ------------------------------ Blocks +@@ -1880,13 +2038,13 @@ HWY_INLINE Vec128 InterleaveUpper + // hiH,hiL loH,loL |-> hiL,loL (= lower halves) + template + HWY_API Vec128 ConcatLowerLower(const Vec128 hi, const Vec128 lo) { +- return Vec128{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 2)}; ++ return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; + } + + // hiH,hiL loH,loL |-> hiH,loH (= upper halves) + template + HWY_API Vec128 ConcatUpperUpper(const Vec128 hi, const Vec128 lo) { +- return Vec128{wasm_v64x2_shuffle(lo.raw, hi.raw, 1, 3)}; ++ return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; + } + + // hiH,hiL loH,loL |-> hiL,loH (= inner halves) +@@ -1898,7 +2056,7 @@ HWY_API Vec128 ConcatLowerUpper(const + // hiH,hiL loH,loL |-> hiH,loL (= outer halves) + template + HWY_API Vec128 ConcatUpperLower(const Vec128 hi, 
const Vec128 lo) { +- return Vec128{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 3)}; ++ return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 3)}; + } + + // ------------------------------ Odd/even lanes +@@ -1917,12 +2075,12 @@ HWY_API Vec128 odd_even_impl(hwy::Siz + template + HWY_API Vec128 odd_even_impl(hwy::SizeTag<2> /* tag */, const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; ++ return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; + } + template + HWY_API Vec128 odd_even_impl(hwy::SizeTag<4> /* tag */, const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; + } + // TODO(eustas): implement + // template +@@ -1939,7 +2097,7 @@ HWY_API Vec128 OddEven(const Vec128 + HWY_INLINE Vec128 OddEven(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; + } + + // ================================================== CONVERT +@@ -1950,52 +2108,52 @@ HWY_INLINE Vec128 OddEven( + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i16x8_widen_low_u8x16(v.raw)}; ++ return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{ +- wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))}; ++ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i16x8_widen_low_u8x16(v.raw)}; ++ return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{ +- wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))}; ++ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i32x4_widen_low_u16x8(v.raw)}; ++ return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i32x4_widen_low_u16x8(v.raw)}; ++ return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; + } + + // Signed: replicate sign bit. 
+ template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i16x8_widen_low_i8x16(v.raw)}; ++ return Vec128{wasm_i16x8_extend_low_i8x16(v.raw)}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{ +- wasm_i32x4_widen_low_i16x8(wasm_i16x8_widen_low_i8x16(v.raw))}; ++ wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i32x4_widen_low_i16x8(v.raw)}; ++ return Vec128{wasm_i32x4_extend_low_i16x8(v.raw)}; + } + + template +@@ -2122,7 +2280,7 @@ HWY_API Vec128 U8FromU32(con + wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; + } + +-// ------------------------------ Convert i32 <=> f32 ++// ------------------------------ Convert i32 <=> f32 (Round) + + template + HWY_API Vec128 ConvertTo(Simd /* tag */, +@@ -2133,33 +2291,16 @@ HWY_API Vec128 ConvertTo(Simd< + template + HWY_API Vec128 ConvertTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i32x4_trunc_saturate_f32x4(v.raw)}; ++ return Vec128{wasm_i32x4_trunc_sat_f32x4(v.raw)}; + } + + template + HWY_API Vec128 NearestInt(const Vec128 v) { +- const __f32x4 c00 = wasm_f32x4_splat(0.0f); +- const __f32x4 corr = wasm_f32x4_convert_i32x4(wasm_f32x4_le(v.raw, c00)); +- const __f32x4 c05 = wasm_f32x4_splat(0.5f); +- // +0.5 for non-negative lane, -0.5 for other. +- const __f32x4 delta = wasm_f32x4_add(c05, corr); +- // Shift input by 0.5 away from 0. +- const __f32x4 fixed = wasm_f32x4_add(v.raw, delta); +- return Vec128{wasm_i32x4_trunc_saturate_f32x4(fixed)}; ++ return ConvertTo(Simd(), Round(v)); + } + + // ================================================== MISC + +-// Returns a vector with lane i=[0, N) set to "first" + i. +-template +-Vec128 Iota(const Simd d, const T2 first) { +- HWY_ALIGN T lanes[16 / sizeof(T)]; +- for (size_t i = 0; i < 16 / sizeof(T); ++i) { +- lanes[i] = static_cast(first + static_cast(i)); +- } +- return Load(d, lanes); +-} +- + // ------------------------------ Mask + + namespace detail { +@@ -2167,20 +2308,13 @@ namespace detail { + template + HWY_API uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { +- const __i8x16 slice = +- wasm_i8x16_make(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8); +- // Each u32 lane has byte[i] = (1 << i) or 0. +- const __i8x16 v8_4_2_1 = wasm_v128_and(mask.raw, slice); +- // OR together 4 bytes of each u32 to get the 4 bits. +- const __i16x8 v2_1_z_z = wasm_i32x4_shl(v8_4_2_1, 16); +- const __i16x8 v82_41_2_1 = wasm_v128_or(v8_4_2_1, v2_1_z_z); +- const __i16x8 v41_2_1_0 = wasm_i32x4_shl(v82_41_2_1, 8); +- const __i16x8 v8421_421_21_10 = wasm_v128_or(v82_41_2_1, v41_2_1_0); +- const __i16x8 nibble_per_u32 = wasm_i32x4_shr(v8421_421_21_10, 24); +- // Assemble four nibbles into 16 bits. 
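(Editorial note, not part of the patch.) The replacement BitsFromMask just below packs eight 0x00/0xFF mask bytes into eight bits with a single multiply: the 0x103070F1F3F80 constant places 2^i in the top result byte for every 0xFF at byte i, and the partial sums in the lower bytes never carry into that top byte. A scalar sketch of the trick:

#include <cstdint>

// bit i of the result is set iff byte i of `bytes` is 0xFF
// (every byte of `bytes` must be 0x00 or 0xFF).
uint32_t MovemaskBytes64(uint64_t bytes) {
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return static_cast<uint32_t>((bytes * kMagic) >> 56);
}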
+- alignas(16) uint32_t lanes[4]; +- wasm_v128_store(lanes, nibble_per_u32); +- return lanes[0] | (lanes[1] << 4) | (lanes[2] << 8) | (lanes[3] << 12); ++ alignas(16) uint64_t lanes[2]; ++ wasm_v128_store(lanes, mask.raw); ++ ++ constexpr uint64_t kMagic = 0x103070F1F3F80ULL; ++ const uint64_t lo = ((lanes[0] * kMagic) >> 56); ++ const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; ++ return (hi + lo); + } + + template +@@ -2241,8 +2375,7 @@ constexpr __i8x16 BytesAbove() { + + template + HWY_API uint64_t BitsFromMask(const Mask128 mask) { +- return OnlyActive( +- BitsFromMask(hwy::SizeTag(), mask)); ++ return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); + } + + template +@@ -2290,7 +2423,15 @@ HWY_API size_t CountTrue(const Mask128 + HWY_API bool AllFalse(const Mask128 m) { +- return !wasm_i8x16_any_true(m.raw); ++#if 0 ++ // Casting followed by wasm_i8x16_any_true results in wasm error: ++ // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128 ++ const auto v8 = BitCast(Full128(), VecFromMask(Full128(), m)); ++ return !wasm_i8x16_any_true(v8.raw); ++#else ++ return (wasm_i64x2_extract_lane(m.raw, 0) | ++ wasm_i64x2_extract_lane(m.raw, 1)) == 0; ++#endif + } + + // Full vector, type-dependent +@@ -2336,6 +2477,139 @@ HWY_API bool AllTrue(const Mask128 + namespace detail { + + template ++HWY_INLINE Vec128 Idx16x8FromBits(const uint64_t mask_bits) { ++ HWY_DASSERT(mask_bits < 256); ++ const Simd d; ++ const Rebind d8; ++ const Simd du; ++ ++ // We need byte indices for TableLookupBytes (one vector's worth for each of ++ // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We ++ // can instead store lane indices and convert to byte indices (2*lane + 0..1), ++ // with the doubling baked into the table. Unpacking nibbles is likely more ++ // costly than the higher cache footprint from storing bytes. 
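(Editorial note, not part of the patch.) The 256 x 8 table below stores doubled lane indices for every 8-bit mask, as the comment above describes; it can be generated mechanically. A sketch of such a generator:

#include <cstdint>
#include <cstdio>

int main() {
  for (uint32_t mask = 0; mask < 256; ++mask) {
    uint8_t row[8] = {0};
    uint32_t pos = 0;
    for (uint32_t i = 0; i < 8; ++i) {
      if (mask & (1u << i)) row[pos++] = static_cast<uint8_t>(2 * i);  // doubled lane index
    }
    for (uint32_t i = 0; i < 8; ++i) {
      std::printf("%u%s", static_cast<unsigned>(row[i]), i == 7 ? ",\n" : ", ");
    }
  }
  return 0;
}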
++ alignas(16) constexpr uint8_t table[256 * 8] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, ++ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, ++ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, ++ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, ++ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, ++ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, ++ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, ++ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, ++ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, ++ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, ++ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, ++ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, ++ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, ++ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, ++ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, ++ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, ++ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, ++ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, ++ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, ++ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, ++ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, ++ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, ++ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, ++ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, ++ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, ++ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, ++ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, ++ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, ++ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, ++ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, ++ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, ++ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, ++ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, ++ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, ++ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, ++ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, ++ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, ++ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, ++ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, ++ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, ++ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, ++ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, ++ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, ++ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, ++ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, ++ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, ++ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, ++ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, ++ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, ++ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, ++ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, ++ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, ++ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, ++ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, ++ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, ++ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, ++ 0, 
0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, ++ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, ++ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, ++ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, ++ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, ++ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, ++ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, ++ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, ++ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, ++ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, ++ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, ++ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, ++ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, ++ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, ++ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, ++ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, ++ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, ++ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, ++ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, ++ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, ++ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, ++ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, ++ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, ++ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, ++ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, ++ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, ++ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, ++ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, ++ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, ++ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, ++ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, ++ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, ++ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, ++ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, ++ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, ++ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, ++ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, ++ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, ++ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, ++ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, ++ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, ++ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, ++ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, ++ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, ++ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, ++ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, ++ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, ++ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, ++ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, ++ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, ++ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, ++ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, ++ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, ++ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, ++ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; ++ ++ const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; ++ const Vec128 pairs = ZipLower(byte_idx, byte_idx); ++ return BitCast(d, pairs + Set(du, 0x0100)); ++} ++ ++template 
+ HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + +@@ -2383,57 +2657,37 @@ HWY_INLINE Vec128 Idx64x2FromBits( + + #endif + +-// Helper function called by both Compress and CompressStore - avoids a ++// Helper functions called by both Compress and CompressStore - avoids a + // redundant BitsFromMask in the latter. + +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); ++template ++HWY_API Vec128 Compress(hwy::SizeTag<2> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ const auto idx = detail::Idx16x8FromBits(mask_bits); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); ++ ++template ++HWY_API Vec128 Compress(hwy::SizeTag<4> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ const auto idx = detail::Idx32x4FromBits(mask_bits); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } + +-#if HWY_CAP_INTEGER64 ++#if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64 + +-template +-HWY_API Vec128 Compress(Vec128 v, ++template ++HWY_API Vec128 Compress(hwy::SizeTag<8> /*tag*/, ++ Vec128 v, + const uint64_t mask_bits) { + const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +- +-#endif +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); +-} +- +-#if HWY_CAP_FLOAT64 +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } + + #endif +@@ -2442,7 +2696,8 @@ HWY_API Vec128 Compress(Vec12 + + template + HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { +- return detail::Compress(v, detail::BitsFromMask(mask)); ++ return detail::Compress(hwy::SizeTag(), v, ++ detail::BitsFromMask(mask)); + } + + // ------------------------------ CompressStore +@@ -2451,63 +2706,284 @@ template + HWY_API size_t CompressStore(Vec128 v, const Mask128 mask, + Simd d, T* HWY_RESTRICT aligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); +- Store(detail::Compress(v, mask_bits), d, aligned); ++ Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); + return PopCount(mask_bits); + } + ++// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, ++// TableLookupBytes) ++ ++// 128 bits ++HWY_API void StoreInterleaved3(const Vec128 a, const Vec128 b, ++ const Vec128 c, Full128 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const auto k5 = Set(d, 5); ++ const auto k6 = Set(d, 6); ++ ++ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. 
++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d, tbl_r0); ++ const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ StoreU(int0, d, unaligned + 0 * 16); ++ ++ // Second vector: g10,r10, bgr[9:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. ++ const auto r1 = TableLookupBytes(a, shuf_r1); ++ const auto g1 = TableLookupBytes(b, shuf_g1); ++ const auto b1 = TableLookupBytes(c, shuf_b1); ++ const auto int1 = r1 | g1 | b1; ++ StoreU(int1, d, unaligned + 1 * 16); ++ ++ // Third vector: bgr[15:11], b10 ++ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. ++ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. ++ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A ++ const auto r2 = TableLookupBytes(a, shuf_r2); ++ const auto g2 = TableLookupBytes(b, shuf_g2); ++ const auto b2 = TableLookupBytes(c, shuf_b2); ++ const auto int2 = r2 | g2 | b2; ++ StoreU(int2, d, unaligned + 2 * 16); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved3(const Vec128 a, ++ const Vec128 b, ++ const Vec128 c, Simd d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors for the shuffles and first result. ++ const Full128 d_full; ++ const auto k5 = Set(d_full, 5); ++ const auto k6 = Set(d_full, 6); ++ ++ const Vec128 full_a{a.raw}; ++ const Vec128 full_b{b.raw}; ++ const Vec128 full_c{c.raw}; ++ ++ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d_full, tbl_r0); ++ const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ StoreU(int0, d_full, unaligned + 0 * 16); ++ ++ // Second (HALF) vector: bgr[7:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // ..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // .7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // 7..6..5. 
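(Editorial note, not part of the patch.) StoreInterleaved3 writes planar a/b/c bytes (e.g. R, G, B planes) as interleaved triples; the shuffle tables above assemble three such output vectors per iteration. A scalar reference for the intended result, with made-up names:

#include <cstddef>
#include <cstdint>

void StoreInterleaved3Scalar(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                             size_t n, uint8_t* out) {
  for (size_t i = 0; i < n; ++i) {
    out[3 * i + 0] = r[i];
    out[3 * i + 1] = g[i];
    out[3 * i + 2] = b[i];
  }
}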
++ const auto r1 = TableLookupBytes(full_a, shuf_r1); ++ const auto g1 = TableLookupBytes(full_b, shuf_g1); ++ const auto b1 = TableLookupBytes(full_c, shuf_b1); ++ const decltype(Zero(d)) int1{(r1 | g1 | b1).raw}; ++ StoreU(int1, d, unaligned + 1 * 16); ++} ++ ++// <= 32 bits ++template ++HWY_API void StoreInterleaved3(const Vec128 a, ++ const Vec128 b, ++ const Vec128 c, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors for the shuffles and result. ++ const Full128 d_full; ++ ++ const Vec128 full_a{a.raw}; ++ const Vec128 full_b{b.raw}; ++ const Vec128 full_c{c.raw}; ++ ++ // Shuffle (a,b,c) vector bytes to bgr[3:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, // ++ 0x80, 0x80, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d_full, tbl_r0); ++ const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0); ++ const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0); ++ const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0 ++ const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0. ++ const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ alignas(16) uint8_t buf[16]; ++ StoreU(int0, d_full, buf); ++ CopyBytes(buf, unaligned); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++// 128 bits ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, Full128 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto ba8 = ZipUpper(v0, v1); ++ const auto dc8 = ZipUpper(v2, v3); ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 ++ const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8 ++ const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC ++ StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16); ++ StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16); ++ StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16); ++ StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved4(const Vec128 in0, ++ const Vec128 in1, ++ const Vec128 in2, ++ const Vec128 in3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors to reduce the number of stores. ++ const Vec128 v0{in0.raw}; ++ const Vec128 v1{in1.raw}; ++ const Vec128 v2{in2.raw}; ++ const Vec128 v3{in3.raw}; ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 ++ const Full128 d_full; ++ StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16); ++ StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16); ++} ++ ++// <= 32 bits ++template ++HWY_API void StoreInterleaved4(const Vec128 in0, ++ const Vec128 in1, ++ const Vec128 in2, ++ const Vec128 in3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors to reduce the number of stores. ++ const Vec128 v0{in0.raw}; ++ const Vec128 v1{in1.raw}; ++ const Vec128 v2{in2.raw}; ++ const Vec128 v3{in3.raw}; ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b3 a3 .. 
b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d3 c3 .. d0 c0 ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ alignas(16) uint8_t buf[16]; ++ const Full128 d_full; ++ StoreU(BitCast(d_full, dcba_0), d_full, buf); ++ CopyBytes<4 * N>(buf, unaligned); ++} ++ + // ------------------------------ Reductions + + namespace detail { + +-// For u32/i32/f32. +-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++// N=1 for any T: no-op ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++ ++// u32/i32/f32: ++ ++// N=2 ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++ ++// N=4 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = v3210 + v1032; + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); + } + +-// For u64/i64/f64. +-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++// u64/i64/f64: ++ ++// N=2 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return v10 + v01; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); + } + + } // namespace detail + +-// Supported for u/i/f 32/64. Returns the sum in each lane. ++// Supported for u/i/f 32/64. Returns the same value in each lane. 
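The reduction hunks above (and the corrected comment) return the result broadcast to every lane rather than a scalar: for four 32-bit lanes, the vector is combined with a half-swapped copy (Shuffle1032) and then with a rotated copy (Shuffle0321), so two combine steps leave the full sum, min or max in all lanes. A minimal scalar sketch of the sum case, assuming the usual lane numbering:

#include <cstdio>

int main() {
  const float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};

  // Shuffle1032: swap the two 64-bit halves, then add (lane i pairs with lane i^2).
  float t[4];
  for (int i = 0; i < 4; ++i) t[i] = v[i] + v[i ^ 2];

  // Shuffle0321: rotate by one lane, then add; every lane now holds the total.
  float r[4];
  for (int i = 0; i < 4; ++i) r[i] = t[i] + t[(i + 1) & 3];

  for (int i = 0; i < 4; ++i) std::printf("%g ", r[i]);  // 10 10 10 10
}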
+ template + HWY_API Vec128 SumOfLanes(const Vec128 v) { + return detail::SumOfLanes(hwy::SizeTag(), v); +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 2021-06-02 10:56:05.240904417 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -154,27 +154,28 @@ HWY_API Vec128 Zero(Simd + HWY_API Vec128 Set(Simd /* tag */, const uint8_t t) { +- return Vec128{_mm_set1_epi8(t)}; ++ return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const uint16_t t) { +- return Vec128{_mm_set1_epi16(t)}; ++ return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const uint32_t t) { +- return Vec128{_mm_set1_epi32(t)}; ++ return Vec128{_mm_set1_epi32(static_cast(t))}; + } + template + HWY_API Vec128 Set(Simd /* tag */, const uint64_t t) { +- return Vec128{_mm_set1_epi64x(t)}; ++ return Vec128{ ++ _mm_set1_epi64x(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const int8_t t) { +- return Vec128{_mm_set1_epi8(t)}; ++ return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const int16_t t) { +- return Vec128{_mm_set1_epi16(t)}; ++ return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const int32_t t) { +@@ -182,7 +183,8 @@ HWY_API Vec128 Set(Simd + HWY_API Vec128 Set(Simd /* tag */, const int64_t t) { +- return Vec128{_mm_set1_epi64x(t)}; ++ return Vec128{ ++ _mm_set1_epi64x(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const float t) { +@@ -510,7 +512,8 @@ HWY_API Mask128 Xor(const Mask128< + template + HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); +- return Mask128{m.raw}; ++ const Simd d; ++ return MaskFromVec(BitCast(Simd(), VecFromMask(d, m))); + } + + // ------------------------------ Equality +@@ -683,6 +686,14 @@ HWY_API Mask128 operator>=(co + return Mask128{_mm_cmpge_pd(a.raw, b.raw)}; + } + ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask128 FirstN(const Simd d, size_t num) { ++ const RebindToSigned di; // Signed comparisons are cheaper. ++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); ++} ++ + // ================================================== ARITHMETIC + + // ------------------------------ Addition +@@ -894,7 +905,7 @@ template + HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{_mm_abs_epi32(v.raw)}; + } +- ++// i64 is implemented after BroadcastSignBit. + template + HWY_API Vec128 Abs(const Vec128 v) { + const Vec128 mask{_mm_set1_epi32(0x7FFFFFFF)}; +@@ -959,7 +970,6 @@ HWY_API Vec128 Mu + + // ------------------------------ ShiftLeft + +-// Unsigned + template + HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{_mm_slli_epi16(v.raw, kBits)}; +@@ -988,6 +998,16 @@ HWY_API Vec128 ShiftLeft(con + return Vec128{_mm_slli_epi64(v.raw, kBits)}; + } + ++template ++HWY_API Vec128 ShiftLeft(const Vec128 v) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. 
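The FirstN op added above builds a "first num lanes" mask without per-lane branching: an index vector (Iota) is compared against the broadcast count, using the signed lane type because signed compares are cheaper on x86. A minimal scalar sketch of the lane predicate it computes:

#include <cstdint>
#include <cstdio>

int main() {
  constexpr int kLanes = 4;   // assumed vector width for illustration
  const size_t num = 3;       // first `num` lanes active
  bool mask[kLanes];
  for (int32_t i = 0; i < kLanes; ++i) {
    // Equivalent of Iota(di, 0) < Set(di, static_cast<int32_t>(num)).
    mask[i] = i < static_cast<int32_t>(num);
  }
  for (int i = 0; i < kLanes; ++i) std::printf("%d ", mask[i] ? 1 : 0);  // 1 1 1 0
}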
++ const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; ++ return kBits == 1 ++ ? (v + v) ++ : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); ++} ++ + // ------------------------------ ShiftRight + + template +@@ -1004,6 +1024,15 @@ HWY_API Vec128 ShiftRight(c + } + + template ++HWY_API Vec128 ShiftRight(const Vec128 v) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftRight(Vec128{v.raw}).raw}; ++ return shifted & Set(d8, 0xFF >> kBits); ++} ++ ++template + HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{_mm_srai_epi16(v.raw, kBits)}; + } +@@ -1012,6 +1041,15 @@ HWY_API Vec128 ShiftRight(co + return Vec128{_mm_srai_epi32(v.raw, kBits)}; + } + ++template ++HWY_API Vec128 ShiftRight(const Vec128 v) { ++ const Simd di; ++ const Simd du; ++ const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // i64 is implemented after BroadcastSignBit. + + // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) +@@ -1039,15 +1077,24 @@ HWY_API Vec128 BroadcastSign + return VecFromMask(v < Zero(Simd())); + #else + // Efficient Gt() requires SSE4.2 but we only have SSE4.1. BLENDVPD requires +- // two constants and domain crossing. 32-bit compare only requires Zero() +- // plus a shuffle to replicate the upper 32 bits. ++ // two constants and domain crossing. 32-bit shift avoids generating a zero. + const Simd d32; +- const auto sign = BitCast(d32, v) < Zero(d32); ++ const auto sign = ShiftRight<31>(BitCast(d32, v)); + return Vec128{ + _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; + #endif + } + ++template ++HWY_API Vec128 Abs(const Vec128 v) { ++#if HWY_TARGET == HWY_AVX3 ++ return Vec128{_mm_abs_epi64(v.raw)}; ++#else ++ const auto zero = Zero(Simd()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++ + template + HWY_API Vec128 ShiftRight(const Vec128 v) { + #if HWY_TARGET == HWY_AVX3 +@@ -1097,6 +1144,15 @@ HWY_API Vec128 ShiftLeftSame + return Vec128{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++template ++HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftLeftSame(Vec128>{v.raw}, bits).raw}; ++ return shifted & Set(d8, (0xFF << bits) & 0xFF); ++} ++ + // ------------------------------ ShiftRightSame (BroadcastSignBit) + + template +@@ -1116,6 +1172,16 @@ HWY_API Vec128 ShiftRightSa + } + + template ++HWY_API Vec128 ShiftRightSame(Vec128 v, ++ const int bits) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. 
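The new i8 ShiftRight above works around the lack of an 8-bit arithmetic shift on x86: it shifts the bytes logically (via the unsigned type) and then sign-extends from bit (7 - kBits) with the classic (x ^ m) - m identity, where m = 0x80 >> kBits. A minimal per-byte scalar sketch, with a hypothetical helper name:

#include <cstdint>
#include <cstdio>

template <int kBits>
int8_t ShiftRightI8(int8_t v) {
  // Logical shift of the byte (done via 16-bit shifts plus masking in the SIMD code).
  const uint8_t shifted = static_cast<uint8_t>(static_cast<uint8_t>(v) >> kBits);
  // The original sign bit now sits at bit (7 - kBits).
  const uint8_t m = static_cast<uint8_t>(0x80 >> kBits);
  // (x ^ m) - m sign-extends from that bit; arithmetic wraps mod 256 like the lanes do.
  return static_cast<int8_t>(static_cast<uint8_t>((shifted ^ m) - m));
}

int main() {
  std::printf("%d %d %d\n", ShiftRightI8<2>(-100), ShiftRightI8<2>(100),
              ShiftRightI8<7>(-1));  // -25 25 -1
}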
++ const Vec128 shifted{ ++ ShiftRightSame(Vec128{v.raw}, bits).raw}; ++ return shifted & Set(d8, 0xFF >> bits); ++} ++ ++template + HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +@@ -1140,6 +1206,15 @@ HWY_API Vec128 ShiftRightSam + #endif + } + ++template ++HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { ++ const Simd di; ++ const Simd du; ++ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Negate + + template +@@ -1729,32 +1804,196 @@ HWY_API void Stream(const Vec128 ++HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128 v, ++ Simd /* tag */, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ if (N == 4) { ++ _mm_i32scatter_epi32(base, offset.raw, v.raw, 1); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1); ++ } ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128 v, ++ Simd /* tag */, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ if (N == 4) { ++ _mm_i32scatter_epi32(base, index.raw, v.raw, 4); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4); ++ } ++} ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128 v, ++ Simd /* tag */, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ if (N == 2) { ++ _mm_i64scatter_epi64(base, offset.raw, v.raw, 1); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1); ++ } ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128 v, ++ Simd /* tag */, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ if (N == 2) { ++ _mm_i64scatter_epi64(base, index.raw, v.raw, 8); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8); ++ } ++} ++ ++} // namespace detail ++ ++template ++HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); ++} ++template ++HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); ++} ++ ++template ++HWY_INLINE void ScatterOffset(Vec128 v, Simd /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec128 offset) { ++ if (N == 4) { ++ _mm_i32scatter_ps(base, offset.raw, v.raw, 1); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1); ++ } ++} ++template ++HWY_INLINE void ScatterIndex(Vec128 v, Simd /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec128 index) { ++ if (N == 4) { ++ _mm_i32scatter_ps(base, index.raw, v.raw, 4); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4); ++ } ++} ++ ++template ++HWY_INLINE void ScatterOffset(Vec128 v, Simd /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec128 offset) { ++ if (N == 2) { ++ _mm_i64scatter_pd(base, offset.raw, v.raw, 1); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i64scatter_pd(base, mask, offset.raw, 
v.raw, 1); ++ } ++} ++template ++HWY_INLINE void ScatterIndex(Vec128 v, Simd /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec128 index) { ++ if (N == 2) { ++ _mm_i64scatter_pd(base, index.raw, v.raw, 8); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8); ++ } ++} ++#else // HWY_TARGET == HWY_AVX3 ++ ++template ++HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); ++ } ++} ++ ++template ++HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ for (size_t i = 0; i < N; ++i) { ++ base[index_lanes[i]] = lanes[i]; ++ } ++} ++ ++#endif ++ ++// ------------------------------ Gather (Load/Store) ++ + #if HWY_TARGET == HWY_SSE4 + + template + HWY_API Vec128 GatherOffset(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 offset) { +- static_assert(N == 1, "SSE4 does not support full gather"); +- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); +- const uintptr_t address = reinterpret_cast(base) + GetLane(offset); +- T val; +- CopyBytes(reinterpret_cast(address), &val); +- return Set(d, val); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ alignas(16) T lanes[N]; ++ const uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); ++ } ++ return Load(d, lanes); + } + + template + HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, + const Vec128 index) { +- static_assert(N == 1, "SSE4 does not support full gather"); +- static_assert(sizeof(T) == sizeof(Index), "T must match Index"); +- return Set(d, base[GetLane(index)]); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ alignas(16) T lanes[N]; ++ for (size_t i = 0; i < N; ++i) { ++ lanes[i] = base[index_lanes[i]]; ++ } ++ return Load(d, lanes); + } + + #else +@@ -1832,6 +2071,8 @@ HWY_API Vec128 GatherIndex(Si + + #endif // HWY_TARGET != HWY_SSE4 + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== SWIZZLE + + // ------------------------------ Extract half +@@ -1859,10 +2100,10 @@ HWY_INLINE Vec128 UpperHalf(V + // ------------------------------ Shift vector by constant #bytes + + // 0x01..0F, kBytes = 1 => 0x02..0F00 +-template +-HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { ++template ++HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); +- return Vec128{_mm_slli_si128(v.raw, kBytes)}; ++ return Vec128{_mm_slli_si128(v.raw, kBytes)}; + } + + template +@@ -1873,10 +2114,10 @@ HWY_API Vec128 ShiftLeftLanes(cons + } + + // 0x01..0F, kBytes = 1 => 0x0001..0E +-template +-HWY_API Vec128 ShiftRightBytes(const Vec128 v) { 
++template ++HWY_API Vec128 ShiftRightBytes(const Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); +- return Vec128{_mm_srli_si128(v.raw, kBytes)}; ++ return Vec128{_mm_srli_si128(v.raw, kBytes)}; + } + + template +@@ -2041,44 +2282,47 @@ HWY_API Vec128 Shuffle0123(const + // ------------------------------ TableLookupLanes + + // Returned by SetTableIndices for use by TableLookupLanes. +-template ++template + struct Indices128 { + __m128i raw; + }; + +-template +-HWY_API Indices128 SetTableIndices(Full128, const int32_t* idx) { ++template ++HWY_API Indices128 SetTableIndices(Simd d, const int32_t* idx) { + #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) +- const size_t N = 16 / sizeof(T); + for (size_t i = 0; i < N; ++i) { + HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); + } + #endif + +- const Full128 d8; +- alignas(16) uint8_t control[16]; +- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { +- const size_t idx_lane = idx_byte / sizeof(T); +- const size_t mod = idx_byte % sizeof(T); +- control[idx_byte] = static_cast(idx[idx_lane] * sizeof(T) + mod); ++ const Repartition d8; ++ alignas(16) uint8_t control[16] = {0}; ++ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { ++ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { ++ control[idx_lane * sizeof(T) + idx_byte] = ++ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); ++ } + } +- return Indices128{Load(d8, control).raw}; ++ return Indices128{Load(d8, control).raw}; + } + +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- const Full128 di; +- const Full128 df; ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ const Simd di; ++ const Simd df; + return BitCast(df, +- TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); ++ TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); + } + + // ------------------------------ Interleave lanes +@@ -2286,47 +2530,47 @@ HWY_INLINE Vec128 ConcatUpperLow + + namespace detail { + +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, +- const Vec128 b) { +- const Full128 d; +- const Full128 d8; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ const Simd d; ++ const Repartition d8; + alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); + } +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; + } +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x33)}; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<4> /* tag 
*/, const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x33)}; + } +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x0F)}; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x0F)}; + } + + } // namespace detail + +-template +-HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { ++template ++HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { + return detail::OddEven(hwy::SizeTag(), a, b); + } +-template <> +-HWY_INLINE Vec128 OddEven(const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; ++template ++HWY_INLINE Vec128 OddEven(const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; + } + +-template <> +-HWY_INLINE Vec128 OddEven(const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_pd(a.raw, b.raw, 1)}; ++template ++HWY_INLINE Vec128 OddEven(const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_pd(a.raw, b.raw, 1)}; + } + + // ------------------------------ Shl (ZipLower, Mul) +@@ -2764,7 +3008,7 @@ HWY_API Vec128 U8FromU32(con + return LowerHalf(LowerHalf(BitCast(d8, quad))); + } + +-// ------------------------------ Convert integer <=> floating point ++// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + + template + HWY_API Vec128 ConvertTo(Simd /* tag */, +@@ -2779,13 +3023,20 @@ HWY_API Vec128 ConvertTo(Simd + (void)dd; + return Vec128{_mm_cvtepi64_pd(v.raw)}; + #else +- alignas(16) int64_t lanes_i[2]; +- Store(v, Simd(), lanes_i); +- alignas(16) double lanes_d[2]; +- for (size_t i = 0; i < N; ++i) { +- lanes_d[i] = static_cast(lanes_i[i]); +- } +- return Load(dd, lanes_d); ++ // Based on wim's approach (https://stackoverflow.com/questions/41144668/) ++ const Repartition d32; ++ const Repartition d64; ++ ++ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 ++ const auto k84_63 = Set(d64, 0x4530000080000000ULL); ++ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); ++ ++ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) ++ const auto k52 = Set(d32, 0x43300000); ++ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); ++ ++ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); ++ return (v_upper - k84_63_52) + v_lower; // order matters! + #endif + } + +@@ -2922,6 +3173,142 @@ HWY_API size_t CountTrue(const Mask128 ++HWY_INLINE Vec128 Idx16x8FromBits(const uint64_t mask_bits) { ++ HWY_DASSERT(mask_bits < 256); ++ const Simd d; ++ const Rebind d8; ++ const Simd du; ++ ++ // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need ++ // byte indices for PSHUFB (one vector's worth for each of 256 combinations of ++ // 8 mask bits). Loading them directly would require 4 KiB. We can instead ++ // store lane indices and convert to byte indices (2*lane + 0..1), with the ++ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane ++ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. ++ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles ++ // is likely more costly than the higher cache footprint from storing bytes. 
++ alignas(16) constexpr uint8_t table[256 * 8] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, ++ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, ++ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, ++ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, ++ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, ++ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, ++ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, ++ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, ++ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, ++ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, ++ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, ++ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, ++ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, ++ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, ++ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, ++ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, ++ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, ++ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, ++ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, ++ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, ++ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, ++ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, ++ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, ++ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, ++ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, ++ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, ++ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, ++ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, ++ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, ++ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, ++ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, ++ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, ++ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, ++ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, ++ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, ++ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, ++ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, ++ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, ++ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, ++ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, ++ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, ++ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, ++ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, ++ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, ++ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, ++ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, ++ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, ++ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, ++ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, ++ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, ++ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, ++ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, ++ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, ++ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, ++ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, ++ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, ++ 0, 
0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, ++ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, ++ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, ++ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, ++ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, ++ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, ++ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, ++ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, ++ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, ++ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, ++ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, ++ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, ++ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, ++ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, ++ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, ++ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, ++ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, ++ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, ++ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, ++ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, ++ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, ++ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, ++ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, ++ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, ++ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, ++ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, ++ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, ++ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, ++ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, ++ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, ++ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, ++ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, ++ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, ++ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, ++ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, ++ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, ++ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, ++ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, ++ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, ++ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, ++ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, ++ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, ++ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, ++ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, ++ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, ++ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, ++ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, ++ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, ++ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, ++ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, ++ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, ++ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, ++ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, ++ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, ++ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; ++ ++ const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; ++ const Vec128 pairs = ZipLower(byte_idx, byte_idx); ++ return BitCast(d, pairs + Set(du, 0x0100)); ++} ++ ++template 
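The table above stores one byte per selected 16-bit lane (already doubled, so entry 6 means "lane 3"), which keeps it at 2 KiB instead of the 4 KiB a full byte-index table would need. The tail of the function widens each entry into a PSHUFB byte pair by zipping the vector with itself and adding 0x0100, turning index d into the pair d, d+1. A minimal scalar sketch of that expansion, with hypothetical buffer names:

#include <cstdint>
#include <cstdio>

int main() {
  // Table row for a hypothetical mask selecting 16-bit lanes 1, 3 and 5:
  // entries are byte offsets of the lane starts (lane * 2), zero-padded.
  const uint8_t lane_idx[8] = {2, 6, 10, 0, 0, 0, 0, 0};

  uint8_t byte_idx[16];
  for (int i = 0; i < 8; ++i) {
    // ZipLower(byte_idx, byte_idx): duplicate each byte into a 16-bit pair.
    const uint16_t pair = static_cast<uint16_t>(lane_idx[i] * 0x0101);
    // + 0x0100: the low byte stays d, the high byte becomes d + 1.
    const uint16_t shuffle = static_cast<uint16_t>(pair + 0x0100);
    byte_idx[2 * i + 0] = static_cast<uint8_t>(shuffle & 0xFF);
    byte_idx[2 * i + 1] = static_cast<uint8_t>(shuffle >> 8);
  }
  for (int i = 0; i < 16; ++i) std::printf("%d ", byte_idx[i]);
  std::printf("\n");  // 2 3 6 7 10 11 0 1 0 1 0 1 0 1 0 1
}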
+ HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + +@@ -2968,71 +3355,42 @@ HWY_INLINE Vec128 Idx64x2FromBits( + // Helper function called by both Compress and CompressStore - avoids a + // redundant BitsFromMask in the latter. + +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_epi32(mask_bits, v.raw)}; +-#else +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-#endif +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_epi32(mask_bits, v.raw)}; +-#else +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-#endif +-} +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_epi64(mask_bits, v.raw)}; +-#else +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-#endif +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_epi64(mask_bits, v.raw)}; +-#else +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-#endif ++template ++HWY_API Vec128 Compress(hwy::SizeTag<2> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ const auto idx = detail::Idx16x8FromBits(mask_bits); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } + +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { ++template ++HWY_API Vec128 Compress(hwy::SizeTag<4> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ using D = Simd; ++ using TI = MakeSigned; ++ const Rebind di; + #if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_ps(mask_bits, v.raw)}; ++ return BitCast(D(), Vec128{_mm_maskz_compress_epi32( ++ mask_bits, BitCast(di, v).raw)}); + #else +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); ++ const auto idx = detail::Idx32x4FromBits(mask_bits); ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + #endif + } + +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { ++template ++HWY_API Vec128 Compress(hwy::SizeTag<8> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ using D = Simd; ++ using TI = MakeSigned; ++ const Rebind di; + #if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_pd(mask_bits, v.raw)}; ++ return BitCast(D(), Vec128{_mm_maskz_compress_epi64( ++ mask_bits, BitCast(di, v).raw)}); + #else +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); ++ const auto idx = detail::Idx64x2FromBits(mask_bits); ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + #endif + } + +@@ -3040,7 +3398,8 @@ HWY_API Vec128 Compress(Vec12 + + template + HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { +- return detail::Compress(v, detail::BitsFromMask(mask)); ++ return detail::Compress(hwy::SizeTag(), v, ++ detail::BitsFromMask(mask)); + } + + // ------------------------------ CompressStore +@@ -3050,63 +3409,285 @@ HWY_API size_t CompressStore(Vec128 d, T* 
HWY_RESTRICT aligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). +- Store(detail::Compress(v, mask_bits), d, aligned); ++ Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); + return PopCount(mask_bits); + } + ++// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, ++// TableLookupBytes) ++ ++// 128 bits ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, Full128 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const auto k5 = Set(d, 5); ++ const auto k6 = Set(d, 6); ++ ++ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d, tbl_r0); ++ const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ StoreU(int0, d, unaligned + 0 * 16); ++ ++ // Second vector: g10,r10, bgr[9:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. ++ const auto r1 = TableLookupBytes(v0, shuf_r1); ++ const auto g1 = TableLookupBytes(v1, shuf_g1); ++ const auto b1 = TableLookupBytes(v2, shuf_b1); ++ const auto int1 = r1 | g1 | b1; ++ StoreU(int1, d, unaligned + 1 * 16); ++ ++ // Third vector: bgr[15:11], b10 ++ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. ++ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. ++ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A ++ const auto r2 = TableLookupBytes(v0, shuf_r2); ++ const auto g2 = TableLookupBytes(v1, shuf_g2); ++ const auto b2 = TableLookupBytes(v2, shuf_b2); ++ const auto int2 = r2 | g2 | b2; ++ StoreU(int2, d, unaligned + 2 * 16); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, Simd d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors for the shuffles and first result. ++ const Full128 d_full; ++ const auto k5 = Set(d_full, 5); ++ const auto k6 = Set(d_full, 6); ++ ++ const Vec128 full_a{v0.raw}; ++ const Vec128 full_b{v1.raw}; ++ const Vec128 full_c{v2.raw}; ++ ++ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. 
++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d_full, tbl_r0); ++ const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ StoreU(int0, d_full, unaligned + 0 * 16); ++ ++ // Second (HALF) vector: bgr[7:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // ..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // .7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // 7..6..5. ++ const auto r1 = TableLookupBytes(full_a, shuf_r1); ++ const auto g1 = TableLookupBytes(full_b, shuf_g1); ++ const auto b1 = TableLookupBytes(full_c, shuf_b1); ++ const decltype(Zero(d)) int1{(r1 | g1 | b1).raw}; ++ StoreU(int1, d, unaligned + 1 * 16); ++} ++ ++// <= 32 bits ++template ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors for the shuffles and result. ++ const Full128 d_full; ++ ++ const Vec128 full_a{v0.raw}; ++ const Vec128 full_b{v1.raw}; ++ const Vec128 full_c{v2.raw}; ++ ++ // Shuffle (v0,v1,v2) vector bytes to bgr[3:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, // ++ 0x80, 0x80, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d_full, tbl_r0); ++ const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0); ++ const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0); ++ const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0 ++ const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0. ++ const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ alignas(16) uint8_t buf[16]; ++ StoreU(int0, d_full, buf); ++ CopyBytes(buf, unaligned); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++// 128 bits ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, Full128 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto ba8 = ZipUpper(v0, v1); ++ const auto dc8 = ZipUpper(v2, v3); ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 ++ const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8 ++ const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC ++ StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16); ++ StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16); ++ StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16); ++ StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved4(const Vec128 in0, ++ const Vec128 in1, ++ const Vec128 in2, ++ const Vec128 in3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors to reduce the number of stores. 
++ const Vec128 v0{in0.raw}; ++ const Vec128 v1{in1.raw}; ++ const Vec128 v2{in2.raw}; ++ const Vec128 v3{in3.raw}; ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 ++ const Full128 d_full; ++ StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16); ++ StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16); ++} ++ ++// <= 32 bits ++template ++HWY_API void StoreInterleaved4(const Vec128 in0, ++ const Vec128 in1, ++ const Vec128 in2, ++ const Vec128 in3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors to reduce the number of stores. ++ const Vec128 v0{in0.raw}; ++ const Vec128 v1{in1.raw}; ++ const Vec128 v2{in2.raw}; ++ const Vec128 v3{in3.raw}; ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b3 a3 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d3 c3 .. d0 c0 ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ alignas(16) uint8_t buf[16]; ++ const Full128 d_full; ++ StoreU(BitCast(d_full, dcba_0), d_full, buf); ++ CopyBytes<4 * N>(buf, unaligned); ++} ++ + // ------------------------------ Reductions + + namespace detail { + +-// For u32/i32/f32. +-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++// N=1 for any T: no-op ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++ ++// u32/i32/f32: ++ ++// N=2 ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++ ++// N=4 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = v3210 + v1032; + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); + } + +-// For u64/i64/f64. 
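The StoreInterleaved4 hunks above need no shuffle tables at all: zipping the bytes of (v0,v1) and of (v2,v3) yields 16-bit "b a" and "d c" pairs, and zipping those pairs yields 32-bit "d c b a" groups, which in little-endian lane order is exactly the a,b,c,d byte interleave written to memory. A minimal scalar sketch for four lanes:

#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t v0[4] = {0xA0, 0xA1, 0xA2, 0xA3};  // "a" lanes
  const uint8_t v1[4] = {0xB0, 0xB1, 0xB2, 0xB3};  // "b" lanes
  const uint8_t v2[4] = {0xC0, 0xC1, 0xC2, 0xC3};  // "c" lanes
  const uint8_t v3[4] = {0xD0, 0xD1, 0xD2, 0xD3};  // "d" lanes

  // ZipLower(v0, v1) and ZipLower(v2, v3): byte pairs with a/c in the low byte.
  uint16_t ba[4], dc[4];
  for (int i = 0; i < 4; ++i) {
    ba[i] = static_cast<uint16_t>(v0[i] | (v1[i] << 8));
    dc[i] = static_cast<uint16_t>(v2[i] | (v3[i] << 8));
  }

  // ZipLower(ba, dc): 32-bit groups whose little-endian bytes are a, b, c, d.
  uint8_t out[16];
  for (int i = 0; i < 4; ++i) {
    out[4 * i + 0] = static_cast<uint8_t>(ba[i]);
    out[4 * i + 1] = static_cast<uint8_t>(ba[i] >> 8);
    out[4 * i + 2] = static_cast<uint8_t>(dc[i]);
    out[4 * i + 3] = static_cast<uint8_t>(dc[i] >> 8);
  }
  for (int i = 0; i < 16; ++i) std::printf("%02X ", out[i]);
  // A0 B0 C0 D0 A1 B1 C1 D1 A2 B2 C2 D2 A3 B3 C3 D3
}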
+-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++// u64/i64/f64: ++ ++// N=2 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return v10 + v01; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); + } + + } // namespace detail + +-// Supported for u/i/f 32/64. Returns the sum in each lane. ++// Supported for u/i/f 32/64. Returns the same value in each lane. + template + HWY_API Vec128 SumOfLanes(const Vec128 v) { + return detail::SumOfLanes(hwy::SizeTag(), v); +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 2021-06-02 10:56:05.234904387 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -20,6 +20,20 @@ + // particular, "Broadcast", pack and zip behavior may be surprising. + + #include // AVX2+ ++ ++#if defined(_MSC_VER) && defined(__clang__) ++// Including should be enough, but Clang's headers helpfully skip ++// including these headers when _MSC_VER is defined, like when using clang-cl. ++// Include these directly here. ++#include ++// avxintrin defines __m256i and must come before avx2intrin. ++#include ++#include // _pext_u64 ++#include ++#include ++#include ++#endif ++ + #include + #include + +@@ -148,23 +162,24 @@ HWY_API Vec256 Set(Full256{_mm256_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const uint32_t t) { +- return Vec256{_mm256_set1_epi32(static_cast(t))}; // NOLINT ++ return Vec256{_mm256_set1_epi32(static_cast(t))}; + } + HWY_API Vec256 Set(Full256 /* tag */, const uint64_t t) { + return Vec256{ + _mm256_set1_epi64x(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const int8_t t) { +- return Vec256{_mm256_set1_epi8(t)}; ++ return Vec256{_mm256_set1_epi8(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const int16_t t) { +- return Vec256{_mm256_set1_epi16(t)}; ++ return Vec256{_mm256_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const int32_t t) { + return Vec256{_mm256_set1_epi32(t)}; + } + HWY_API Vec256 Set(Full256 /* tag */, const int64_t t) { +- return Vec256{_mm256_set1_epi64x(t)}; ++ return Vec256{ ++ _mm256_set1_epi64x(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const float t) { + return Vec256{_mm256_set1_ps(t)}; +@@ -340,6 +355,8 @@ HWY_API Vec256 VecFromMask(Full256 + return Vec256{v.raw}; + } + ++// ------------------------------ IfThenElse ++ + // mask ? yes : no + template + HWY_API Vec256 IfThenElse(const Mask256 mask, const Vec256 yes, +@@ -412,9 +429,9 @@ HWY_API Mask256 Xor(const Mask256 + // Comparisons fill a lane with 1-bits if the condition is true, else 0. 
+ + template +-HWY_API Mask256 RebindMask(Full256 /*tag*/, Mask256 m) { ++HWY_API Mask256 RebindMask(Full256 d_to, Mask256 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); +- return Mask256{m.raw}; ++ return MaskFromVec(BitCast(d_to, VecFromMask(Full256(), m))); + } + + // ------------------------------ Equality +@@ -670,6 +687,14 @@ HWY_API Vec256 Max(const Vec256< + return Vec256{_mm256_max_pd(a.raw, b.raw)}; + } + ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask256 FirstN(const Full256 d, size_t n) { ++ const RebindToSigned di; // Signed comparisons are cheaper. ++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(n))); ++} ++ + // ================================================== ARITHMETIC + + // ------------------------------ Addition +@@ -832,7 +857,13 @@ HWY_API Vec256 AverageRound(co + + // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. + HWY_API Vec256 Abs(const Vec256 v) { ++#if HWY_COMPILER_MSVC ++ // Workaround for incorrect codegen? (wrong result) ++ const auto zero = Zero(Full256()); ++ return Vec256{_mm256_max_epi8(v.raw, (zero - v).raw)}; ++#else + return Vec256{_mm256_abs_epi8(v.raw)}; ++#endif + } + HWY_API Vec256 Abs(const Vec256 v) { + return Vec256{_mm256_abs_epi16(v.raw)}; +@@ -840,6 +871,7 @@ HWY_API Vec256 Abs(const Vec256 + HWY_API Vec256 Abs(const Vec256 v) { + return Vec256{_mm256_abs_epi32(v.raw)}; + } ++// i64 is implemented after BroadcastSignBit. + + HWY_API Vec256 Abs(const Vec256 v) { + const Vec256 mask{_mm256_set1_epi32(0x7FFFFFFF)}; +@@ -925,6 +957,16 @@ HWY_API Vec256 ShiftLeft(const + return Vec256{_mm256_slli_epi64(v.raw, kBits)}; + } + ++template ++HWY_API Vec256 ShiftLeft(const Vec256 v) { ++ const Full256 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); ++ return kBits == 1 ++ ? (v + v) ++ : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); ++} ++ + // ------------------------------ ShiftRight + + template +@@ -943,6 +985,14 @@ HWY_API Vec256 ShiftRight(cons + } + + template ++HWY_API Vec256 ShiftRight(const Vec256 v) { ++ const Full256 d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec256 shifted{ShiftRight(Vec256{v.raw}).raw}; ++ return shifted & Set(d8, 0xFF >> kBits); ++} ++ ++template + HWY_API Vec256 ShiftRight(const Vec256 v) { + return Vec256{_mm256_srai_epi16(v.raw, kBits)}; + } +@@ -952,6 +1002,15 @@ HWY_API Vec256 ShiftRight(const + return Vec256{_mm256_srai_epi32(v.raw, kBits)}; + } + ++template ++HWY_API Vec256 ShiftRight(const Vec256 v) { ++ const Full256 di; ++ const Full256 du; ++ const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // i64 is implemented after BroadcastSignBit. 
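The u8 ShiftLeft/ShiftRight overloads added above compensate for the missing 8-bit shifts on x86: the bytes are shifted as wider 16-bit lanes and the bits that cross over from the neighbouring byte are cleared with a mask ((0xFF << kBits) & 0xFF for left shifts, 0xFF >> kBits for right shifts). A minimal scalar sketch of the left-shift case over one 16-bit pair:

#include <cstdint>
#include <cstdio>

template <int kBits>
void ShiftLeftBytesAs16(uint8_t bytes[2]) {
  // Treat the byte pair as one little-endian 16-bit lane and shift it whole.
  uint16_t lane = static_cast<uint16_t>(bytes[0] | (bytes[1] << 8));
  lane = static_cast<uint16_t>(lane << kBits);
  // Clear the bits that leaked in from the lower byte; the SIMD code applies
  // this mask to every byte of the vector at once.
  const uint8_t mask = static_cast<uint8_t>((0xFF << kBits) & 0xFF);
  bytes[0] = static_cast<uint8_t>(lane & 0xFF) & mask;
  bytes[1] = static_cast<uint8_t>(lane >> 8) & mask;
}

int main() {
  uint8_t b[2] = {0x81, 0x7F};
  ShiftLeftBytesAs16<1>(b);
  std::printf("%02X %02X\n", b[0], b[1]);  // 02 FE, i.e. each byte shifted independently
}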
+ + // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) +@@ -989,6 +1048,15 @@ HWY_API Vec256 ShiftRight(const + #endif + } + ++HWY_API Vec256 Abs(const Vec256 v) { ++#if HWY_TARGET == HWY_AVX3 ++ return Vec256{_mm256_abs_epi64(v.raw)}; ++#else ++ const auto zero = Zero(Full256()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++ + // ------------------------------ ShiftLeftSame + + HWY_API Vec256 ShiftLeftSame(const Vec256 v, +@@ -1016,6 +1084,14 @@ HWY_API Vec256 ShiftLeftSame(co + return Vec256{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++template ++HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { ++ const Full256 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); ++ return shifted & Set(d8, (0xFF << bits) & 0xFF); ++} ++ + // ------------------------------ ShiftRightSame (BroadcastSignBit) + + HWY_API Vec256 ShiftRightSame(const Vec256 v, +@@ -1031,6 +1107,13 @@ HWY_API Vec256 ShiftRightSame( + return Vec256{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { ++ const Full256 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); ++ return shifted & Set(d8, 0xFF >> bits); ++} ++ + HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +@@ -1053,6 +1136,14 @@ HWY_API Vec256 ShiftRightSame(c + #endif + } + ++HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { ++ const Full256 di; ++ const Full256 du; ++ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Negate + + template +@@ -1335,6 +1426,123 @@ HWY_API void Stream(const Vec256 + _mm256_stream_pd(aligned, v.raw); + } + ++// ------------------------------ Scatter ++ ++// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
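Without AVX-512's _mm256_abs_epi64, the i64 Abs added above broadcasts the sign bit into a whole-lane mask and selects 0 - v for negative lanes. A minimal per-lane scalar sketch of that select, with a hypothetical helper name:

#include <cstdint>
#include <cstdio>

int64_t AbsViaSignMask(int64_t v) {
  const uint64_t u = static_cast<uint64_t>(v);
  // BroadcastSignBit: all-ones if v < 0, else all-zeros.
  const uint64_t mask = static_cast<uint64_t>(-static_cast<int64_t>(u >> 63));
  // IfThenElse(mask, 0 - v, v); INT64_MIN wraps to itself, as documented.
  const uint64_t negated = 0u - u;
  return static_cast<int64_t>((mask & negated) | (~mask & u));
}

int main() {
  std::printf("%lld %lld\n", (long long)AbsViaSignMask(-42),
              (long long)AbsViaSignMask(42));  // 42 42
}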
++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") ++ ++#if HWY_TARGET == HWY_AVX3 ++namespace detail { ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256 v, ++ Full256 /* tag */, T* HWY_RESTRICT base, ++ const Vec256 offset) { ++ _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1); ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256 v, ++ Full256 /* tag */, T* HWY_RESTRICT base, ++ const Vec256 index) { ++ _mm256_i32scatter_epi32(base, index.raw, v.raw, 4); ++} ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256 v, ++ Full256 /* tag */, T* HWY_RESTRICT base, ++ const Vec256 offset) { ++ _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1); ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256 v, ++ Full256 /* tag */, T* HWY_RESTRICT base, ++ const Vec256 index) { ++ _mm256_i64scatter_epi64(base, index.raw, v.raw, 8); ++} ++ ++} // namespace detail ++ ++template ++HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, ++ const Vec256 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); ++} ++template ++HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, ++ const Vec256 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); ++} ++ ++template <> ++HWY_INLINE void ScatterOffset(Vec256 v, Full256 /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec256 offset) { ++ _mm256_i32scatter_ps(base, offset.raw, v.raw, 1); ++} ++template <> ++HWY_INLINE void ScatterIndex(Vec256 v, Full256 /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec256 index) { ++ _mm256_i32scatter_ps(base, index.raw, v.raw, 4); ++} ++ ++template <> ++HWY_INLINE void ScatterOffset(Vec256 v, ++ Full256 /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec256 offset) { ++ _mm256_i64scatter_pd(base, offset.raw, v.raw, 1); ++} ++template <> ++HWY_INLINE void ScatterIndex(Vec256 v, ++ Full256 /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec256 index) { ++ _mm256_i64scatter_pd(base, index.raw, v.raw, 8); ++} ++ ++#else ++ ++template ++HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, ++ const Vec256 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ constexpr size_t N = 32 / sizeof(T); ++ alignas(32) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(32) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); ++ } ++} ++ ++template ++HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, ++ const Vec256 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ constexpr size_t N = 32 / sizeof(T); ++ alignas(32) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(32) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ for (size_t i = 0; i < N; ++i) { ++ base[index_lanes[i]] = lanes[i]; ++ } ++} ++ ++#endif ++ + // ------------------------------ Gather + + namespace detail { +@@ -1374,13 +1582,13 @@ HWY_API Vec256 GatherIndex(hwy::SizeT + template + HWY_API Vec256 GatherOffset(Full256 d, const T* HWY_RESTRICT base, + const Vec256 offset) { +- static_assert(sizeof(T) == sizeof(Offset), "SVE requires 
same size base/ofs"); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::GatherOffset(hwy::SizeTag(), d, base, offset); + } + template + HWY_API Vec256 GatherIndex(Full256 d, const T* HWY_RESTRICT base, + const Vec256 index) { +- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx"); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::GatherIndex(hwy::SizeTag(), d, base, index); + } + +@@ -1410,6 +1618,8 @@ HWY_INLINE Vec256 GatherIndex{_mm256_i64gather_pd(base, index.raw, 8)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== SWIZZLE + + template +@@ -1861,38 +2071,26 @@ HWY_API Vec256 ZipUpper(const V + return Vec256{_mm256_unpackhi_epi32(a.raw, b.raw)}; + } + +-// ------------------------------ Blocks ++// ------------------------------ Blocks (LowerHalf, ZeroExtendVector) ++ ++// _mm256_broadcastsi128_si256 has 7 cycle latency. _mm256_permute2x128_si256 is ++// slow on Zen1 (8 uops); we can avoid it for LowerLower and UpperLower, and on ++// UpperUpper at the cost of one extra cycle/instruction. + + // hiH,hiL loH,loL |-> hiL,loL (= lower halves) + template + HWY_API Vec256 ConcatLowerLower(const Vec256 hi, const Vec256 lo) { +- return Vec256{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x20)}; ++ return Vec256{_mm256_inserti128_si256(lo.raw, LowerHalf(hi).raw, 1)}; + } + template <> + HWY_INLINE Vec256 ConcatLowerLower(const Vec256 hi, + const Vec256 lo) { +- return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x20)}; ++ return Vec256{_mm256_insertf128_ps(lo.raw, LowerHalf(hi).raw, 1)}; + } + template <> + HWY_INLINE Vec256 ConcatLowerLower(const Vec256 hi, + const Vec256 lo) { +- return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x20)}; +-} +- +-// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +-template +-HWY_API Vec256 ConcatUpperUpper(const Vec256 hi, const Vec256 lo) { +- return Vec256{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)}; +-} +-template <> +-HWY_INLINE Vec256 ConcatUpperUpper(const Vec256 hi, +- const Vec256 lo) { +- return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)}; +-} +-template <> +-HWY_INLINE Vec256 ConcatUpperUpper(const Vec256 hi, +- const Vec256 lo) { +- return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)}; ++ return Vec256{_mm256_insertf128_pd(lo.raw, LowerHalf(hi).raw, 1)}; + } + + // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) +@@ -1927,6 +2125,12 @@ HWY_INLINE Vec256 ConcatUpperLow + return Vec256{_mm256_blend_pd(hi.raw, lo.raw, 3)}; + } + ++// hiH,hiL loH,loL |-> hiH,loH (= upper halves) ++template ++HWY_API Vec256 ConcatUpperUpper(const Vec256 hi, const Vec256 lo) { ++ return ConcatUpperLower(hi, ZeroExtendVector(UpperHalf(lo))); ++} ++ + // ------------------------------ Odd/even lanes + + namespace detail { +@@ -2211,11 +2415,18 @@ HWY_API Vec128 DemoteTo(Full128< + _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))}; + } + ++ // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'". ++ // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here. 
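For reference (not part of the patch itself): on targets without AVX-512 scatter instructions, the ScatterOffset/ScatterIndex fallbacks above spill the vector and the offsets/indices to stack arrays and finish with a scalar loop. A self-contained scalar sketch of the same idea, with illustrative names rather than the library's API:

#include <cstdint>
#include <cstring>

// Emulated scatter: write lane i of `lanes` to base_bytes + offsets[i]
// (byte offset) or to base[indices[i]] (element index), as in the fallback above.
template <typename T, typename Offset, size_t N>
void ScatterOffsetScalar(const T (&lanes)[N], uint8_t* base_bytes,
                         const Offset (&offsets)[N]) {
  for (size_t i = 0; i < N; ++i) {
    std::memcpy(base_bytes + offsets[i], &lanes[i], sizeof(T));
  }
}

template <typename T, typename Index, size_t N>
void ScatterIndexScalar(const T (&lanes)[N], T* base, const Index (&indices)[N]) {
  for (size_t i = 0; i < N; ++i) {
    base[indices[i]] = lanes[i];
  }
}

int main() {
  float dst[8] = {0};
  const float lanes[4] = {1.f, 2.f, 3.f, 4.f};
  const int32_t idx[4] = {6, 0, 3, 1};
  ScatterIndexScalar(lanes, dst, idx);  // dst = {2, 4, 0, 3, 0, 0, 1, 0}
  return 0;
}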
++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion") ++ + HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{_mm256_cvtpd_ps(v.raw)}; +@@ -2241,7 +2452,7 @@ HWY_API Vec128 U8FromU32(con + return BitCast(Simd(), pair); + } + +-// ------------------------------ Convert integer <=> floating point ++// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + + HWY_API Vec256 ConvertTo(Full256 /* tag */, + const Vec256 v) { +@@ -2253,13 +2464,20 @@ HWY_API Vec256 ConvertTo(Full256 + (void)dd; + return Vec256{_mm256_cvtepi64_pd(v.raw)}; + #else +- alignas(32) int64_t lanes_i[4]; +- Store(v, Full256(), lanes_i); +- alignas(32) double lanes_d[4]; +- for (size_t i = 0; i < 4; ++i) { +- lanes_d[i] = static_cast(lanes_i[i]); +- } +- return Load(dd, lanes_d); ++ // Based on wim's approach (https://stackoverflow.com/questions/41144668/) ++ const Repartition d32; ++ const Repartition d64; ++ ++ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 ++ const auto k84_63 = Set(d64, 0x4530000080000000ULL); ++ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); ++ ++ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) ++ const auto k52 = Set(d32, 0x43300000); ++ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); ++ ++ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); ++ return (v_upper - k84_63_52) + v_lower; // order matters! + #endif + } + +@@ -2334,8 +2552,7 @@ HWY_API uint64_t BitsFromMask(hwy::SizeT + const auto compressed = + _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0)); + return static_cast(_mm256_movemask_epi8(compressed)); +- +-#endif ++#endif // HWY_ARCH_X86_64 + } + + template +@@ -2473,75 +2690,100 @@ HWY_INLINE Vec256 Idx64x4FromB + return Load(d32, packed_array + 8 * mask_bits); + } + +-// Helper function called by both Compress and CompressStore - avoids a ++// Helper functions called by both Compress and CompressStore - avoids a + // redundant BitsFromMask in the latter. 
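For reference (not part of the patch itself): the new non-AVX-512 path of ConvertTo for int64 to double above replaces the store/loop/load fallback with wim's bit trick. The high 32 bits are embedded in a double with exponent 2^84 (re-biased by 2^63) and the low 32 bits in a double with exponent 2^52; subtracting the constant 2^84 + 2^63 + 2^52 then cancels the biases. A standalone scalar sketch of the same arithmetic, illustrative only:

#include <cstdint>
#include <cstdio>
#include <cstring>

// int64 -> double without a native conversion, mirroring the vector code above.
static double Int64ToDouble(int64_t x) {
  const uint64_t bits = static_cast<uint64_t>(x);
  const uint32_t lo = static_cast<uint32_t>(bits);        // low 32 bits
  const uint32_t hi = static_cast<uint32_t>(bits >> 32);  // high 32 bits

  // upper = 2^84 + 2^63 + signed(hi) * 2^32  (hi ^ 0x80000000 re-biases hi)
  uint64_t upper_bits = 0x4530000000000000ULL | (hi ^ 0x80000000u);
  // lower = 2^52 + lo
  uint64_t lower_bits = 0x4330000000000000ULL | lo;
  // k = 2^84 + 2^63 + 2^52, the same bit pattern as the patch's 0x4530000080100000
  uint64_t k_bits = 0x4530000080100000ULL;

  double upper, lower, k;
  std::memcpy(&upper, &upper_bits, sizeof(upper));
  std::memcpy(&lower, &lower_bits, sizeof(lower));
  std::memcpy(&k, &k_bits, sizeof(k));
  return (upper - k) + lower;  // order matters: cancel the big constants first
}

int main() {
  std::printf("%.1f %.1f\n", Int64ToDouble(-987654321987LL), Int64ToDouble(42));
  return 0;
}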
+ +-HWY_API Vec256 Compress(Vec256 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)}; +-#else +- const Vec256 idx = detail::Idx32x8FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; +-#endif +-} +-HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { ++template ++HWY_API Vec256 Compress(hwy::SizeTag<4> /*tag*/, Vec256 v, ++ const uint64_t mask_bits) { ++ const auto vu = BitCast(Full256(), v); + #if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)}; ++ const __m256i ret = ++ _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), vu.raw); + #else + const Vec256 idx = detail::Idx32x8FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; ++ const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw); + #endif ++ return BitCast(Full256(), Vec256{ret}); + } + +-HWY_API Vec256 Compress(Vec256 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)}; +-#else +- const Vec256 idx = detail::Idx64x4FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; +-#endif +-} +-HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { ++template ++HWY_API Vec256 Compress(hwy::SizeTag<8> /*tag*/, Vec256 v, ++ const uint64_t mask_bits) { ++ const auto vu = BitCast(Full256(), v); + #if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)}; ++ const __m256i ret = ++ _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), vu.raw); + #else + const Vec256 idx = detail::Idx64x4FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; ++ const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw); + #endif ++ return BitCast(Full256(), Vec256{ret}); + } + +-HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_ps(static_cast<__mmask8>(mask_bits), v.raw)}; +-#else +- const Vec256 idx = detail::Idx32x8FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_ps(v.raw, idx.raw)}; +-#endif +-} ++// Otherwise, defined in x86_512-inl.h so it can use wider vectors. ++#if HWY_TARGET != HWY_AVX3 + +-HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_pd(static_cast<__mmask8>(mask_bits), v.raw)}; +-#else +- const Vec256 idx = detail::Idx64x4FromBits(mask_bits); +- return Vec256{_mm256_castsi256_pd( +- _mm256_permutevar8x32_epi32(_mm256_castpd_si256(v.raw), idx.raw))}; +-#endif ++// LUTs are infeasible for 2^16 possible masks. Promoting to 32-bit and using ++// the native Compress is probably more efficient than 2 LUTs. 
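For reference (not part of the patch itself): Compress left-packs the lanes whose mask bit is set, and the refactor above routes every lane type through an unsigned BitCast so that one helper per lane size covers signed, unsigned and floating-point inputs. A scalar model of the semantics, illustrative rather than the library API:

#include <cassert>
#include <cstdint>

// Scalar model of Compress/CompressStore: lanes whose mask bit is set are
// packed to the front, preserving order; returns how many were kept.
static size_t CompressScalar(const uint32_t* lanes, uint64_t mask_bits, size_t n,
                             uint32_t* out) {
  size_t count = 0;
  for (size_t i = 0; i < n; ++i) {
    if (mask_bits & (1ULL << i)) out[count++] = lanes[i];
  }
  return count;
}

int main() {
  const uint32_t in[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  uint32_t out[8] = {0};
  const size_t kept = CompressScalar(in, 0b10100110, 8, out);
  assert(kept == 4);
  assert(out[0] == 11 && out[1] == 12 && out[2] == 15 && out[3] == 17);
  return 0;
}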
++template ++HWY_API Vec256 Compress(hwy::SizeTag<2> /*tag*/, Vec256 v, ++ const uint64_t mask_bits) { ++ using D = Full256; ++ const Rebind du; ++ const Repartition dw; ++ const auto vu16 = BitCast(du, v); // (required for float16_t inputs) ++ const auto promoted0 = PromoteTo(dw, LowerHalf(vu16)); ++ const auto promoted1 = PromoteTo(dw, UpperHalf(vu16)); ++ ++ const uint64_t mask_bits0 = mask_bits & 0xFF; ++ const uint64_t mask_bits1 = mask_bits >> 8; ++ const auto compressed0 = Compress(hwy::SizeTag<4>(), promoted0, mask_bits0); ++ const auto compressed1 = Compress(hwy::SizeTag<4>(), promoted1, mask_bits1); ++ ++ const Half dh; ++ const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0)); ++ const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1)); ++ ++ const size_t count0 = PopCount(mask_bits0); ++ // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with ++ // VPERMD for shifting at 4 byte granularity. ++ alignas(32) constexpr int32_t iota4[16] = {0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7}; ++ const auto indices = SetTableIndices(dw, iota4 + 8 - count0 / 2); ++ const auto shift1_multiple4 = ++ BitCast(du, TableLookupLanes(BitCast(dw, demoted1), indices)); ++ ++ // Whole-register unconditional shift by 2 bytes. ++ // TODO(janwas): slow on AMD, use 2 shifts + permq + OR instead? ++ const __m256i lo_zz = _mm256_permute2x128_si256(shift1_multiple4.raw, ++ shift1_multiple4.raw, 0x08); ++ const auto shift1_multiple2 = ++ Vec256{_mm256_alignr_epi8(shift1_multiple4.raw, lo_zz, 14)}; ++ ++ // Make the shift conditional on the lower bit of count0. ++ const auto m_odd = TestBit(Set(du, count0), Set(du, 1)); ++ const auto shifted1 = IfThenElse(m_odd, shift1_multiple2, shift1_multiple4); ++ ++ // Blend the lower and shifted upper parts. ++ constexpr uint16_t on = 0xFFFF; ++ alignas(32) constexpr uint16_t lower_lanes[32] = {HWY_REP4(on), HWY_REP4(on), ++ HWY_REP4(on), HWY_REP4(on)}; ++ const auto m_lower = MaskFromVec(LoadU(du, lower_lanes + 16 - count0)); ++ return BitCast(D(), IfThenElse(m_lower, demoted0, shifted1)); + } + ++#endif // HWY_TARGET != HWY_AVX3 ++ + } // namespace detail + ++// Otherwise, defined in x86_512-inl.h after detail::Compress. ++#if HWY_TARGET != HWY_AVX3 ++ + template + HWY_API Vec256 Compress(Vec256 v, const Mask256 mask) { +- return detail::Compress(v, detail::BitsFromMask(mask)); ++ return detail::Compress(hwy::SizeTag(), v, ++ detail::BitsFromMask(mask)); + } + + // ------------------------------ CompressStore +@@ -2550,10 +2792,101 @@ template + HWY_API size_t CompressStore(Vec256 v, const Mask256 mask, Full256 d, + T* HWY_RESTRICT aligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); +- Store(detail::Compress(v, mask_bits), d, aligned); ++ // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but ++ // using StoreU to concatenate the results would cause page faults if ++ // `aligned` is the last valid vector. Instead rely on in-register splicing. ++ Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); + return PopCount(mask_bits); + } + ++#endif // HWY_TARGET != HWY_AVX3 ++ ++// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, ++// TableLookupBytes, ConcatUpperLower) ++ ++HWY_API void StoreInterleaved3(const Vec256 v0, ++ const Vec256 v1, ++ const Vec256 v2, Full256 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const auto k5 = Set(d, 5); ++ const auto k6 = Set(d, 6); ++ ++ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0]. 
++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = LoadDup128(d, tbl_r0); ++ const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5 ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0.. ++ const auto interleaved_10_00 = r0 | g0 | b0; ++ ++ // Second vector: g10,r10, bgr[9:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. ++ const auto r1 = TableLookupBytes(v0, shuf_r1); ++ const auto g1 = TableLookupBytes(v1, shuf_g1); ++ const auto b1 = TableLookupBytes(v2, shuf_b1); ++ const auto interleaved_15_05 = r1 | g1 | b1; ++ ++ // We want to write the lower halves of the interleaved vectors, then the ++ // upper halves. We could obtain 10_05 and 15_0A via ConcatUpperLower, but ++ // that would require two ununaligned stores. For the lower halves, we can ++ // merge two 128-bit stores for the same swizzling cost: ++ const auto out0 = ConcatLowerLower(interleaved_15_05, interleaved_10_00); ++ StoreU(out0, d, unaligned + 0 * 32); ++ ++ // Third vector: bgr[15:11], b10 ++ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. ++ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. ++ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A ++ const auto r2 = TableLookupBytes(v0, shuf_r2); ++ const auto g2 = TableLookupBytes(v1, shuf_g2); ++ const auto b2 = TableLookupBytes(v2, shuf_b2); ++ const auto interleaved_1A_0A = r2 | g2 | b2; ++ ++ const auto out1 = ConcatUpperLower(interleaved_10_00, interleaved_1A_0A); ++ StoreU(out1, d, unaligned + 1 * 32); ++ ++ const auto out2 = ConcatUpperUpper(interleaved_1A_0A, interleaved_15_05); ++ StoreU(out2, d, unaligned + 2 * 32); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++HWY_API void StoreInterleaved4(const Vec256 v0, ++ const Vec256 v1, ++ const Vec256 v2, ++ const Vec256 v3, Full256 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto ba8 = ZipUpper(v0, v1); ++ const auto dc8 = ZipUpper(v2, v3); ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a13 d..a10 | d..a03 d..a00 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a17 d..a14 | d..a07 d..a04 ++ const auto dcba_8 = ZipLower(ba8, dc8); // d..a1B d..a18 | d..a0B d..a08 ++ const auto dcba_C = ZipUpper(ba8, dc8); // d..a1F d..a1C | d..a0F d..a0C ++ // Write lower halves, then upper. 
vperm2i128 is slow on Zen1 but we can ++ // efficiently combine two lower halves into 256 bits: ++ const auto out0 = BitCast(d, ConcatLowerLower(dcba_4, dcba_0)); ++ const auto out1 = BitCast(d, ConcatLowerLower(dcba_C, dcba_8)); ++ StoreU(out0, d, unaligned + 0 * 32); ++ StoreU(out1, d, unaligned + 1 * 32); ++ const auto out2 = BitCast(d, ConcatUpperUpper(dcba_4, dcba_0)); ++ const auto out3 = BitCast(d, ConcatUpperUpper(dcba_C, dcba_8)); ++ StoreU(out2, d, unaligned + 2 * 32); ++ StoreU(out3, d, unaligned + 3 * 32); ++} ++ + // ------------------------------ Reductions + + namespace detail { +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 2021-06-02 10:56:05.218904306 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -19,6 +19,23 @@ + // particular, "Broadcast", pack and zip behavior may be surprising. + + #include // AVX2+ ++#if defined(_MSC_VER) && defined(__clang__) ++// Including should be enough, but Clang's headers helpfully skip ++// including these headers when _MSC_VER is defined, like when using clang-cl. ++// Include these directly here. ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#endif ++ + #include + #include + +@@ -100,9 +117,8 @@ struct RawMask512<8> { + // Mask register: one bit per lane. + template + class Mask512 { +- using Raw = typename RawMask512::type; +- + public: ++ using Raw = typename RawMask512::type; + Raw raw; + }; + +@@ -167,23 +183,24 @@ HWY_API Vec512 Set(Full512{_mm512_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const uint32_t t) { +- return Vec512{_mm512_set1_epi32(static_cast(t))}; // NOLINT ++ return Vec512{_mm512_set1_epi32(static_cast(t))}; + } + HWY_API Vec512 Set(Full512 /* tag */, const uint64_t t) { + return Vec512{ + _mm512_set1_epi64(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const int8_t t) { +- return Vec512{_mm512_set1_epi8(t)}; ++ return Vec512{_mm512_set1_epi8(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const int16_t t) { +- return Vec512{_mm512_set1_epi16(t)}; ++ return Vec512{_mm512_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const int32_t t) { + return Vec512{_mm512_set1_epi32(t)}; + } + HWY_API Vec512 Set(Full512 /* tag */, const int64_t t) { +- return Vec512{_mm512_set1_epi64(t)}; ++ return Vec512{ ++ _mm512_set1_epi64(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const float t) { + return Vec512{_mm512_set1_ps(t)}; +@@ -329,7 +346,45 @@ HWY_API Vec512 CopySignToAbs(const Ve + return CopySign(abs, sign); + } + +-// ------------------------------ Select/blend ++// ------------------------------ FirstN ++ ++// Possibilities for constructing a bitmask of N ones: ++// - kshift* only consider the lowest byte of the shift count, so they would ++// not correctly handle large n. ++// - Scalar shifts >= 64 are UB. ++// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However, ++// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds. 
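For reference (not part of the patch itself): the FirstN comment above boils down to building a mask with the lowest n bits set while remembering that shifting a 64-bit value by 64 or more is undefined in C++, which is why the 32-bit path guards the shift and the 64-bit path prefers BZHI. A portable scalar sketch of the guarded form the patch uses for the sizeof(T) == 1 case:

#include <cassert>
#include <cstdint>

// Lowest-n-bits mask without relying on BZHI; the explicit n < 64 check avoids
// the undefined behaviour of shifting a 64-bit value by 64.
static uint64_t FirstNBits(size_t n) {
  return n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
}

int main() {
  assert(FirstNBits(0) == 0);
  assert(FirstNBits(3) == 0x7);
  assert(FirstNBits(64) == ~uint64_t(0));
  return 0;
}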
++ ++#if HWY_ARCH_X86_32 ++namespace detail { ++ ++// 32 bit mask is sufficient for lane size >= 2. ++template ++HWY_API Mask512 FirstN(size_t n) { ++ using Bits = typename Mask512::Raw; ++ return Mask512{static_cast(_bzhi_u32(~uint32_t(0), n))}; ++} ++ ++template ++HWY_API Mask512 FirstN(size_t n) { ++ const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0); ++ return Mask512{static_cast<__mmask64>(bits)}; ++} ++ ++} // namespace detail ++#endif // HWY_ARCH_X86_32 ++ ++template ++HWY_API Mask512 FirstN(const Full512 /*tag*/, size_t n) { ++#if HWY_ARCH_X86_64 ++ using Bits = typename Mask512::Raw; ++ return Mask512{static_cast(_bzhi_u64(~uint64_t(0), n))}; ++#else ++ return detail::FirstN(n); ++#endif // HWY_ARCH_X86_64 ++} ++ ++// ------------------------------ IfThenElse + + // Returns mask ? b : a. + +@@ -626,7 +681,13 @@ HWY_API Vec512 AverageRound(co + + // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. + HWY_API Vec512 Abs(const Vec512 v) { ++#if HWY_COMPILER_MSVC ++ // Workaround for incorrect codegen? (untested due to internal compiler error) ++ const auto zero = Zero(Full512()); ++ return Vec512{_mm512_max_epi8(v.raw, (zero - v).raw)}; ++#else + return Vec512{_mm512_abs_epi8(v.raw)}; ++#endif + } + HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_epi16(v.raw)}; +@@ -634,6 +695,9 @@ HWY_API Vec512 Abs(const Vec512 + HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_epi32(v.raw)}; + } ++HWY_API Vec512 Abs(const Vec512 v) { ++ return Vec512{_mm512_abs_epi64(v.raw)}; ++} + + // These aren't native instructions, they also involve AND with constant. + HWY_API Vec512 Abs(const Vec512 v) { +@@ -675,6 +739,16 @@ HWY_API Vec512 ShiftLeft(const + return Vec512{_mm512_slli_epi64(v.raw, kBits)}; + } + ++template ++HWY_API Vec512 ShiftLeft(const Vec512 v) { ++ const Full512 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); ++ return kBits == 1 ++ ? (v + v) ++ : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); ++} ++ + // ------------------------------ ShiftRight + + template +@@ -693,6 +767,14 @@ HWY_API Vec512 ShiftRight(cons + } + + template ++HWY_API Vec512 ShiftRight(const Vec512 v) { ++ const Full512 d8; ++ // Use raw instead of BitCast to support N=1. 
++ const Vec512 shifted{ShiftRight(Vec512{v.raw}).raw}; ++ return shifted & Set(d8, 0xFF >> kBits); ++} ++ ++template + HWY_API Vec512 ShiftRight(const Vec512 v) { + return Vec512{_mm512_srai_epi16(v.raw, kBits)}; + } +@@ -707,6 +789,15 @@ HWY_API Vec512 ShiftRight(const + return Vec512{_mm512_srai_epi64(v.raw, kBits)}; + } + ++template ++HWY_API Vec512 ShiftRight(const Vec512 v) { ++ const Full512 di; ++ const Full512 du; ++ const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ ShiftLeftSame + + HWY_API Vec512 ShiftLeftSame(const Vec512 v, +@@ -734,6 +825,14 @@ HWY_API Vec512 ShiftLeftSame(co + return Vec512{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++template ++HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { ++ const Full512 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); ++ return shifted & Set(d8, (0xFF << bits) & 0xFF); ++} ++ + // ------------------------------ ShiftRightSame + + HWY_API Vec512 ShiftRightSame(const Vec512 v, +@@ -749,6 +848,13 @@ HWY_API Vec512 ShiftRightSame( + return Vec512{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { ++ const Full512 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); ++ return shifted & Set(d8, 0xFF >> bits); ++} ++ + HWY_API Vec512 ShiftRightSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +@@ -763,6 +869,14 @@ HWY_API Vec512 ShiftRightSame(c + return Vec512{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { ++ const Full512 di; ++ const Full512 du; ++ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Shl + + HWY_API Vec512 operator<<(const Vec512 v, +@@ -1046,6 +1160,10 @@ HWY_API Vec512 ApproximateRecipro + + // ------------------------------ Floating-point rounding + ++// Work around warnings in the intrinsic definitions (passing -1 as a mask). ++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") ++ + // Toward nearest integer, tie to even + HWY_API Vec512 Round(const Vec512 v) { + return Vec512{_mm512_roundscale_ps( +@@ -1086,6 +1204,8 @@ HWY_API Vec512 Floor(const Vec51 + _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== COMPARE + + // Comparisons set a mask bit to 1 if the condition is true, else 0. +@@ -1678,6 +1798,83 @@ HWY_API void Stream(const Vec512 + _mm512_stream_pd(aligned, v.raw); + } + ++// ------------------------------ Scatter ++ ++// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") ++ ++namespace detail { ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512 v, ++ Full512 /* tag */, T* HWY_RESTRICT base, ++ const Vec512 offset) { ++ _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1); ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512 v, ++ Full512 /* tag */, T* HWY_RESTRICT base, ++ const Vec512 index) { ++ _mm512_i32scatter_epi32(base, index.raw, v.raw, 4); ++} ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512 v, ++ Full512 /* tag */, T* HWY_RESTRICT base, ++ const Vec512 offset) { ++ _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1); ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512 v, ++ Full512 /* tag */, T* HWY_RESTRICT base, ++ const Vec512 index) { ++ _mm512_i64scatter_epi64(base, index.raw, v.raw, 8); ++} ++ ++} // namespace detail ++ ++template ++HWY_API void ScatterOffset(Vec512 v, Full512 d, T* HWY_RESTRICT base, ++ const Vec512 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); ++} ++template ++HWY_API void ScatterIndex(Vec512 v, Full512 d, T* HWY_RESTRICT base, ++ const Vec512 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); ++} ++ ++template <> ++HWY_INLINE void ScatterOffset(Vec512 v, Full512 /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec512 offset) { ++ _mm512_i32scatter_ps(base, offset.raw, v.raw, 1); ++} ++template <> ++HWY_INLINE void ScatterIndex(Vec512 v, Full512 /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec512 index) { ++ _mm512_i32scatter_ps(base, index.raw, v.raw, 4); ++} ++ ++template <> ++HWY_INLINE void ScatterOffset(Vec512 v, ++ Full512 /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec512 offset) { ++ _mm512_i64scatter_pd(base, offset.raw, v.raw, 1); ++} ++template <> ++HWY_INLINE void ScatterIndex(Vec512 v, ++ Full512 /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec512 index) { ++ _mm512_i64scatter_pd(base, index.raw, v.raw, 8); ++} ++ + // ------------------------------ Gather + + namespace detail { +@@ -1713,13 +1910,13 @@ HWY_API Vec512 GatherIndex(hwy::SizeT + template + HWY_API Vec512 GatherOffset(Full512 d, const T* HWY_RESTRICT base, + const Vec512 offset) { +- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs"); ++static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::GatherOffset(hwy::SizeTag(), d, base, offset); + } + template + HWY_API Vec512 GatherIndex(Full512 d, const T* HWY_RESTRICT base, + const Vec512 index) { +- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx"); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::GatherIndex(hwy::SizeTag(), d, base, index); + } + +@@ -1749,6 +1946,8 @@ HWY_INLINE Vec512 GatherIndex{_mm512_i64gather_pd(index.raw, base, 8)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== SWIZZLE + + template +@@ -2439,7 +2638,11 @@ HWY_API Vec256 DemoteTo(Full256< + + HWY_API Vec256 DemoteTo(Full256 /* tag */, + const Vec512 v) { ++ // Work around warnings in the intrinsic definitions (passing -1 as a mask). 
++ HWY_DIAGNOSTICS(push) ++ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + return Vec256{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; ++ HWY_DIAGNOSTICS(pop) + } + + HWY_API Vec256 DemoteTo(Full256 /* tag */, +@@ -2633,8 +2836,81 @@ HWY_API Vec512 Compress(Vec512{_mm512_maskz_compress_pd(mask.raw, v.raw)}; + } + ++namespace detail { ++ ++// Ignore IDE redefinition error for these two functions: if this header is ++// included, then the functions weren't actually defined in x86_256-inl.h. ++template ++HWY_API Vec256 Compress(hwy::SizeTag<2> /*tag*/, Vec256 v, ++ const uint64_t mask_bits) { ++ using D = Full256; ++ const Rebind du; ++ const Rebind dw; // 512-bit, not 256! ++ const auto vu16 = BitCast(du, v); // (required for float16_t inputs) ++ const Mask512 mask{static_cast<__mmask16>(mask_bits)}; ++ return BitCast(D(), DemoteTo(du, Compress(PromoteTo(dw, vu16), mask))); ++} ++ ++} // namespace detail ++ ++template ++HWY_API Vec256 Compress(Vec256 v, const Mask256 mask) { ++ return detail::Compress(hwy::SizeTag(), v, ++ detail::BitsFromMask(mask)); ++} ++ ++// Expands to 32-bit, compresses, concatenate demoted halves. ++template ++HWY_API Vec512 Compress(Vec512 v, const Mask512 mask) { ++ using D = Full512; ++ const Rebind du; ++ const Repartition dw; ++ const auto vu16 = BitCast(du, v); // (required for float16_t inputs) ++ const auto promoted0 = PromoteTo(dw, LowerHalf(vu16)); ++ const auto promoted1 = PromoteTo(dw, UpperHalf(vu16)); ++ ++ const Mask512 mask0{static_cast<__mmask16>(mask.raw & 0xFFFF)}; ++ const Mask512 mask1{static_cast<__mmask16>(mask.raw >> 16)}; ++ const auto compressed0 = Compress(promoted0, mask0); ++ const auto compressed1 = Compress(promoted1, mask1); ++ ++ const Half dh; ++ const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0)); ++ const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1)); ++ ++ // Concatenate into single vector by shifting upper with writemask. ++ const size_t num0 = CountTrue(mask0); ++ const __mmask32 m_upper = ~((1u << num0) - 1); ++ alignas(64) uint16_t iota[64] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; ++ const auto idx = LoadU(du, iota + 32 - num0); ++ return Vec512{_mm512_mask_permutexvar_epi16(demoted0.raw, m_upper, idx.raw, ++ demoted1.raw)}; ++} ++ + // ------------------------------ CompressStore + ++template ++HWY_API size_t CompressStore(Vec256 v, const Mask256 mask, Full256 d, ++ T* HWY_RESTRICT aligned) { ++ const uint64_t mask_bits = detail::BitsFromMask(mask); ++ Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); ++ return PopCount(mask_bits); ++} ++ ++template ++HWY_API size_t CompressStore(Vec512 v, const Mask512 mask, Full512 d, ++ T* HWY_RESTRICT aligned) { ++ // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but ++ // using StoreU to concatenate the results would cause page faults if ++ // `aligned` is the last valid vector. Instead rely on in-register splicing. 
++ Store(Compress(v, mask), d, aligned); ++ return CountTrue(mask); ++} ++ + HWY_API size_t CompressStore(Vec512 v, const Mask512 mask, + Full512 /* tag */, + uint32_t* HWY_RESTRICT aligned) { +@@ -2675,6 +2951,98 @@ HWY_API size_t CompressStore(Vec512 a, const Vec512 b, ++ const Vec512 c, Full512 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const auto k5 = Set(d, 5); ++ const auto k6 = Set(d, 6); ++ ++ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = LoadDup128(d, tbl_r0); ++ const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5 ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0.. ++ const auto i = (r0 | g0 | b0).raw; // low byte in each 128bit: 30 20 10 00 ++ ++ // Second vector: g10,r10, bgr[9:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. ++ const auto r1 = TableLookupBytes(a, shuf_r1); ++ const auto g1 = TableLookupBytes(b, shuf_g1); ++ const auto b1 = TableLookupBytes(c, shuf_b1); ++ const auto j = (r1 | g1 | b1).raw; // low byte in each 128bit: 35 25 15 05 ++ ++ // Third vector: bgr[15:11], b10 ++ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. ++ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. ++ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A ++ const auto r2 = TableLookupBytes(a, shuf_r2); ++ const auto g2 = TableLookupBytes(b, shuf_g2); ++ const auto b2 = TableLookupBytes(c, shuf_b2); ++ const auto k = (r2 | g2 | b2).raw; // low byte in each 128bit: 3A 2A 1A 0A ++ ++ // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns". ++ const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0)); ++ const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1)); ++ const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2)); ++ ++ // Alternating order, most-significant 128 bits from the second arg. ++ const __mmask8 m = 0xCC; ++ const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1); ++ const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2); ++ const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0); ++ ++ StoreU(Vec512{i1_k0_j0_i0}, d, unaligned + 0 * 64); // 10 0A 05 00 ++ StoreU(Vec512{j2_i2_k1_j1}, d, unaligned + 1 * 64); // 25 20 1A 15 ++ StoreU(Vec512{k3_j3_i3_k2}, d, unaligned + 2 * 64); // 3A 35 30 2A ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++HWY_API void StoreInterleaved4(const Vec512 v0, ++ const Vec512 v1, ++ const Vec512 v2, ++ const Vec512 v3, Full512 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. 
d0 c0 ++ const auto ba8 = ZipUpper(v0, v1); ++ const auto dc8 = ZipUpper(v2, v3); ++ const auto i = ZipLower(ba0, dc0).raw; // 4x128bit: d..a3 d..a0 ++ const auto j = ZipUpper(ba0, dc0).raw; // 4x128bit: d..a7 d..a4 ++ const auto k = ZipLower(ba8, dc8).raw; // 4x128bit: d..aB d..a8 ++ const auto l = ZipUpper(ba8, dc8).raw; // 4x128bit: d..aF d..aC ++ // 128-bit blocks were independent until now; transpose 4x4. ++ const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0)); ++ const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0)); ++ const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2)); ++ const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2)); ++ constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0); ++ constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1); ++ const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20); ++ const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31); ++ const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20); ++ const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31); ++ StoreU(Vec512{l0_k0_j0_i0}, d, unaligned + 0 * 64); ++ StoreU(Vec512{l1_k1_j1_i1}, d, unaligned + 1 * 64); ++ StoreU(Vec512{l2_k2_j2_i2}, d, unaligned + 2 * 64); ++ StoreU(Vec512{l3_k3_j3_i3}, d, unaligned + 3 * 64); ++} ++ + // ------------------------------ Reductions + + // Returns the sum in each lane. +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12 2021-06-02 10:56:05.281904625 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -28,12 +28,12 @@ + + #if HWY_ARCH_X86 + #include +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + #include +-#else ++#else // HWY_COMPILER_MSVC + #include +-#endif +-#endif ++#endif // HWY_COMPILER_MSVC ++#endif // HWY_ARCH_X86 + + namespace hwy { + namespace { +@@ -48,13 +48,13 @@ bool IsBitSet(const uint32_t reg, const + // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). + void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HWY_RESTRICT abcd) { +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + int regs[4]; + __cpuidex(regs, level, count); + for (int i = 0; i < 4; ++i) { + abcd[i] = regs[i]; + } +-#else ++#else // HWY_COMPILER_MSVC + uint32_t a; + uint32_t b; + uint32_t c; +@@ -64,22 +64,22 @@ void Cpuid(const uint32_t level, const u + abcd[1] = b; + abcd[2] = c; + abcd[3] = d; +-#endif ++#endif // HWY_COMPILER_MSVC + } + + // Returns the lower 32 bits of extended control register 0. + // Requires CPU support for "OSXSAVE" (see below). 
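For reference (not part of the patch itself): the targets.cc hunk above only swaps the _MSC_VER checks for HWY_COMPILER_MSVC; the underlying pattern is a thin CPUID wrapper. A minimal standalone sketch of that pattern, assuming an x86 host; the feature bit used here (AVX2 = leaf 7, sub-leaf 0, EBX bit 5) is an illustrative choice, not taken from the patch:

#include <cstdint>
#include <cstdio>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <cpuid.h>
#endif

// Query one CPUID leaf/sub-leaf into {eax, ebx, ecx, edx}.
static void Cpuid(uint32_t level, uint32_t count, uint32_t abcd[4]) {
#if defined(_MSC_VER)
  int regs[4];
  __cpuidex(regs, static_cast<int>(level), static_cast<int>(count));
  for (int i = 0; i < 4; ++i) abcd[i] = static_cast<uint32_t>(regs[i]);
#else
  uint32_t a, b, c, d;
  __cpuid_count(level, count, a, b, c, d);
  abcd[0] = a; abcd[1] = b; abcd[2] = c; abcd[3] = d;
#endif
}

int main() {
  uint32_t abcd[4] = {0, 0, 0, 0};
  Cpuid(7, 0, abcd);
  std::printf("AVX2 supported: %u\n", (abcd[1] >> 5) & 1u);
  return 0;
}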
+ uint32_t ReadXCR0() { +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + return static_cast(_xgetbv(0)); +-#else ++#else // HWY_COMPILER_MSVC + uint32_t xcr0, xcr0_high; + const uint32_t index = 0; + asm volatile(".byte 0x0F, 0x01, 0xD0" + : "=a"(xcr0), "=d"(xcr0_high) + : "c"(index)); + return xcr0; +-#endif ++#endif // HWY_COMPILER_MSVC + } + + #endif // HWY_ARCH_X86 +@@ -126,7 +126,7 @@ constexpr uint32_t kAVX512VL = 1u << 13; + constexpr uint32_t kAVX512DQ = 1u << 14; + constexpr uint32_t kAVX512BW = 1u << 15; + constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW; +-#endif ++#endif // HWY_ARCH_X86 + + } // namespace + +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 2021-06-02 10:56:05.267904554 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h 2021-05-31 10:37:11.000000000 -0400 +@@ -65,7 +65,9 @@ + // HWY_MAX_DYNAMIC_TARGETS in total. + #define HWY_HIGHEST_TARGET_BIT_X86 9 + +-// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium ++#define HWY_SVE2 0x400 ++#define HWY_SVE 0x800 ++// 0x1000 reserved for Helium + #define HWY_NEON 0x2000 + + #define HWY_HIGHEST_TARGET_BIT_ARM 13 +@@ -90,6 +92,9 @@ + // 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved + + #define HWY_SCALAR 0x20000000 ++ ++#define HWY_HIGHEST_TARGET_BIT_SCALAR 29 ++ + // Cannot use higher values, otherwise HWY_TARGETS computation might overflow. + + //------------------------------------------------------------------------------ +@@ -106,25 +111,26 @@ + #ifndef HWY_BROKEN_TARGETS + + // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid +-// SSE4 codegen (msan failure), so disable all those targets. ++// SSE4 codegen (possibly only for msan), so disable all those targets. + #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) +-// TODO: Disable all non-scalar targets for every build target once we have +-// clang-7 enabled in our builders. +-#ifdef MEMORY_SANITIZER + #define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3) +-#else +-#define HWY_BROKEN_TARGETS 0 +-#endif + // This entails a major speed reduction, so warn unless the user explicitly + // opts in to scalar-only. + #if !defined(HWY_COMPILE_ONLY_SCALAR) + #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.") + #endif + +-// MSVC, or 32-bit may fail to compile AVX2/3. +-#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32 ++// 32-bit may fail to compile AVX2/3. ++#elif HWY_ARCH_X86_32 + #define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3) +-#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds") ++ ++// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16 ++#elif HWY_COMPILER_MSVC != 0 ++#define HWY_BROKEN_TARGETS (HWY_AVX3) ++ ++// armv7be has not been tested and is not yet supported. ++#elif HWY_ARCH_ARM_V7 && (defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN)) ++#define HWY_BROKEN_TARGETS (HWY_NEON) + + #else + #define HWY_BROKEN_TARGETS 0 +@@ -145,53 +151,74 @@ + // user to override this without any guarantee of success. + #ifndef HWY_BASELINE_TARGETS + +-#ifdef __wasm_simd128__ ++// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with ++// HWY_TARGET == HWY_SCALAR. 
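For reference (not part of the patch itself): ReadXCR0() at the top of this hunk returns the register that records which vector state the operating system actually saves; runtime dispatch typically requires the SSE and AVX state bits before enabling AVX2, plus the three AVX-512 state bits before enabling AVX3. A small illustrative helper with bit positions per the Intel SDM; the function names are made up for this sketch and are not the patch's logic:

#include <cstdint>

// XCR0 bit 1 = XMM state, bit 2 = YMM state; bits 5..7 = opmask/ZMM_Hi256/Hi16_ZMM.
constexpr bool OsSavesYmm(uint32_t xcr0) { return (xcr0 & 0x06u) == 0x06u; }
constexpr bool OsSavesZmm(uint32_t xcr0) { return (xcr0 & 0xE0u) == 0xE0u; }

static_assert(OsSavesYmm(0x07) && !OsSavesZmm(0x07),
              "SSE+AVX state enabled, AVX-512 state not enabled");
static_assert(OsSavesZmm(0xE7), "x87/SSE/AVX/AVX-512 state all enabled");

int main() { return 0; }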
++ ++#if HWY_ARCH_WASM && defined(__wasm_simd128__) + #define HWY_BASELINE_WASM HWY_WASM + #else + #define HWY_BASELINE_WASM 0 + #endif + +-#ifdef __VSX__ ++// Avoid choosing the PPC target until we have an implementation. ++#if HWY_ARCH_PPC && defined(__VSX__) && 0 + #define HWY_BASELINE_PPC8 HWY_PPC8 + #else + #define HWY_BASELINE_PPC8 0 + #endif + +-// GCC 4.5.4 only defines the former; 5.4 defines both. +-#if defined(__ARM_NEON__) || defined(__ARM_NEON) ++// Avoid choosing the SVE[2] targets the implementation is ready. ++#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) && 0 ++#define HWY_BASELINE_SVE2 HWY_SVE2 ++#else ++#define HWY_BASELINE_SVE2 0 ++#endif ++ ++#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE) && 0 ++#define HWY_BASELINE_SVE HWY_SVE ++#else ++#define HWY_BASELINE_SVE 0 ++#endif ++ ++// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both. ++#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON)) + #define HWY_BASELINE_NEON HWY_NEON + #else + #define HWY_BASELINE_NEON 0 + #endif + +-#ifdef __SSE4_1__ ++// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means ++// we at least get SSE4 on machines supporting AVX but not AVX2. ++// https://stackoverflow.com/questions/18563978/ ++#if HWY_ARCH_X86 && \ ++ (defined(__SSE4_1__) || (HWY_COMPILER_MSVC != 0 && defined(__AVX__))) + #define HWY_BASELINE_SSE4 HWY_SSE4 + #else + #define HWY_BASELINE_SSE4 0 + #endif + +-#ifdef __AVX2__ ++#if HWY_ARCH_X86 && defined(__AVX2__) + #define HWY_BASELINE_AVX2 HWY_AVX2 + #else + #define HWY_BASELINE_AVX2 0 + #endif + +-#ifdef __AVX512F__ ++#if HWY_ARCH_X86 && defined(__AVX512F__) + #define HWY_BASELINE_AVX3 HWY_AVX3 + #else + #define HWY_BASELINE_AVX3 0 + #endif + +-#ifdef __riscv_vector ++#if HWY_ARCH_RVV && defined(__riscv_vector) + #define HWY_BASELINE_RVV HWY_RVV + #else + #define HWY_BASELINE_RVV 0 + #endif + + #define HWY_BASELINE_TARGETS \ +- (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \ +- HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \ +- HWY_BASELINE_RVV) ++ (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \ ++ HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE4 | \ ++ HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | HWY_BASELINE_RVV) + + #endif // HWY_BASELINE_TARGETS + +@@ -242,13 +269,12 @@ + #define HWY_TARGETS HWY_STATIC_TARGET + + // 3) For tests: include all attainable targets (in particular: scalar) +-#elif defined(HWY_COMPILE_ALL_ATTAINABLE) ++#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST) + #define HWY_TARGETS HWY_ATTAINABLE_TARGETS + + // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by + // excluding superseded targets, in particular scalar. + #else +- + #define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1)) + + #endif // target policy +@@ -323,6 +349,10 @@ static inline HWY_MAYBE_UNUSED const cha + #endif + + #if HWY_ARCH_ARM ++ case HWY_SVE2: ++ return "SVE2"; ++ case HWY_SVE: ++ return "SVE"; + case HWY_NEON: + return "Neon"; + #endif +@@ -346,7 +376,7 @@ static inline HWY_MAYBE_UNUSED const cha + return "Scalar"; + + default: +- return "?"; ++ return "Unknown"; // must satisfy gtest IsValidParamName() + } + } + +@@ -405,21 +435,17 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* SSE3 */ \ + nullptr /* SSE2 */ + +-#endif // HWY_ARCH_X86 +- +-#if HWY_ARCH_ARM ++#elif HWY_ARCH_ARM + // See HWY_ARCH_X86 above for details. 
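For reference (not part of the patch itself): the default-policy line earlier in this hunk, HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1), relies on better targets having smaller bit values: doubling the static target's bit and subtracting one yields a mask of that bit plus every better (lower) bit, which is how superseded targets get dropped. A compile-time illustration with made-up bit values:

#include <cstdint>

// Illustrative target bits (better targets use lower bits, as in targets.h).
constexpr uint32_t kFast = 0x1, kStatic = 0x4, kSlow = 0x20000000;

// All targets at least as good as the statically chosen one.
constexpr uint32_t AtLeast(uint32_t static_target) { return 2 * static_target - 1; }

static_assert(AtLeast(kStatic) == 0x7, "static bit plus both better bits");
static_assert((AtLeast(kStatic) & kFast) != 0, "better targets kept");
static_assert((AtLeast(kStatic) & kSlow) == 0, "worse targets dropped");

int main() { return 0; }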
+ #define HWY_MAX_DYNAMIC_TARGETS 4 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM + #define HWY_CHOOSE_TARGET_LIST(func_name) \ +- nullptr, /* reserved */ \ +- nullptr, /* reserved */ \ ++ HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \ ++ HWY_CHOOSE_SVE(func_name), /* SVE */ \ + nullptr, /* reserved */ \ + HWY_CHOOSE_NEON(func_name) /* NEON */ + +-#endif // HWY_ARCH_ARM +- +-#if HWY_ARCH_PPC ++#elif HWY_ARCH_PPC + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 5 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC +@@ -430,9 +456,7 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* VSX */ \ + nullptr /* AltiVec */ + +-#endif // HWY_ARCH_PPC +- +-#if HWY_ARCH_WASM ++#elif HWY_ARCH_WASM + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 4 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM +@@ -442,9 +466,7 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* reserved */ \ + HWY_CHOOSE_WASM(func_name) /* WASM */ + +-#endif // HWY_ARCH_WASM +- +-#if HWY_ARCH_RVV ++#elif HWY_ARCH_RVV + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 4 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV +@@ -454,7 +476,12 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* reserved */ \ + HWY_CHOOSE_RVV(func_name) /* RVV */ + +-#endif // HWY_ARCH_RVV ++#else ++// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though ++// still creating single-entry tables in HWY_EXPORT to ensure portability. ++#define HWY_MAX_DYNAMIC_TARGETS 1 ++#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR ++#endif + + struct ChosenTarget { + public: +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12 2021-06-02 10:56:05.264904539 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -35,19 +35,19 @@ DECLARE_FUNCTION(SCALAR) + HWY_EXPORT(FakeFunction); + + void CheckFakeFunction() { +-#define CHECK_ARRAY_ENTRY(TGT) \ +- if ((HWY_TARGETS & HWY_##TGT) != 0) { \ +- hwy::SetSupportedTargetsForTest(HWY_##TGT); \ +- /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \ +- /* the pointer to the already cached function. */ \ +- hwy::chosen_target.Update(); \ +- EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ +- /* Calling DeInit() will test that the initializer function */ \ +- /* also calls the right function. */ \ +- hwy::chosen_target.DeInit(); \ +- EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ +- /* Second call uses the cached value from the previous call. */ \ +- EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ ++#define CHECK_ARRAY_ENTRY(TGT) \ ++ if ((HWY_TARGETS & HWY_##TGT) != 0) { \ ++ hwy::SetSupportedTargetsForTest(HWY_##TGT); \ ++ /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \ ++ /* the pointer to the already cached function. */ \ ++ hwy::chosen_target.Update(); \ ++ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ ++ /* Calling DeInit() will test that the initializer function */ \ ++ /* also calls the right function. 
*/ \ ++ hwy::chosen_target.DeInit(); \ ++ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ ++ /* Second call uses the cached value from the previous call. */ \ ++ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ + } + CHECK_ARRAY_ENTRY(AVX3) + CHECK_ARRAY_ENTRY(AVX2) +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12 2021-06-02 10:56:05.251904473 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -16,7 +16,6 @@ + #include + + #include +-#include + #include + + #undef HWY_TARGET_INCLUDE +@@ -173,16 +172,8 @@ struct TestFloatAbs { + }; + + HWY_NOINLINE void TestAllAbs() { +- const ForPartialVectors test; +- test(int8_t()); +- test(int16_t()); +- test(int32_t()); +- +- const ForPartialVectors test_float; +- test_float(float()); +-#if HWY_CAP_FLOAT64 +- test_float(double()); +-#endif ++ ForSignedTypes(ForPartialVectors()); ++ ForFloatTypes(ForPartialVectors()); + } + + template +@@ -199,6 +190,45 @@ struct TestLeftShifts { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + ++ const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift ++ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; ++ ++ // 0 ++ HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); ++ HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); ++ ++ // 1 ++ for (size_t i = 0; i < N; ++i) { ++ const T value = kSigned ? T(i) - T(N) : T(i); ++ expected[i] = T(TU(value) << 1); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); ++ ++ // max ++ for (size_t i = 0; i < N; ++i) { ++ const T value = kSigned ? T(i) - T(N) : T(i); ++ expected[i] = T(TU(value) << kMaxShift); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); ++ } ++}; ++ ++template ++struct TestVariableLeftShifts { ++ template ++ HWY_NOINLINE void operator()(T t, D d) { ++ if (kSigned) { ++ // Also test positive values ++ TestVariableLeftShifts()(t, d); ++ } ++ ++ using TI = MakeSigned; ++ using TU = MakeUnsigned; ++ const size_t N = Lanes(d); ++ auto expected = AllocateAligned(N); ++ + const auto v0 = Zero(d); + const auto v1 = Set(d, 1); + const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift +@@ -209,8 +239,6 @@ struct TestLeftShifts { + const auto large_shifts = max_shift - small_shifts; + + // Same: 0 +- HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); +- HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); + HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0)); + + // Same: 1 +@@ -218,8 +246,6 @@ struct TestLeftShifts { + const T value = kSigned ? T(i) - T(N) : T(i); + expected[i] = T(TU(value) << 1); + } +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1)); + + // Same: max +@@ -227,8 +253,6 @@ struct TestLeftShifts { + const T value = kSigned ? 
T(i) - T(N) : T(i); + expected[i] = T(TU(value) << kMaxShift); + } +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft(values)); +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift)); + + // Variable: small +@@ -252,6 +276,37 @@ struct TestUnsignedRightShifts { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + ++ const auto values = Iota(d, 0); ++ ++ const T kMax = LimitsMax(); ++ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; ++ ++ // Shift by 0 ++ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); ++ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); ++ ++ // Shift by 1 ++ for (size_t i = 0; i < N; ++i) { ++ expected[i] = T(T(i & kMax) >> 1); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); ++ ++ // max ++ for (size_t i = 0; i < N; ++i) { ++ expected[i] = T(T(i & kMax) >> kMaxShift); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift)); ++ } ++}; ++ ++struct TestVariableUnsignedRightShifts { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ auto expected = AllocateAligned(N); ++ + const auto v0 = Zero(d); + const auto v1 = Set(d, 1); + const auto values = Iota(d, 0); +@@ -265,21 +320,15 @@ struct TestUnsignedRightShifts { + const auto large_shifts = max_shift - small_shifts; + + // Same: 0 +- HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); +- HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); + HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0)); + + // Same: 1 + for (size_t i = 0; i < N; ++i) { +- expected[i] = T(i >> 1); ++ expected[i] = T(T(i & kMax) >> 1); + } +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1)); + + // Same: max +- HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(values)); +- HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); + HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift)); + + // Variable: small +@@ -296,33 +345,120 @@ struct TestUnsignedRightShifts { + } + }; + +-struct TestSignedRightShifts { ++template ++T RightShiftNegative(T val) { ++ // C++ shifts are implementation-defined for negative numbers, and we have ++ // seen divisions replaced with shifts, so resort to bit operations. ++ using TU = hwy::MakeUnsigned; ++ TU bits; ++ CopyBytes(&val, &bits); ++ ++ const TU shifted = bits >> kAmount; ++ ++ const TU all = ~TU(0); ++ const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount; ++ const TU sign_extended = static_cast((all << num_zero) & LimitsMax()); ++ ++ bits = shifted | sign_extended; ++ CopyBytes(&bits, &val); ++ return val; ++} ++ ++class TestSignedRightShifts { ++ public: + template +- HWY_NOINLINE void operator()(T t, D d) { +- // Also test positive values +- TestUnsignedRightShifts()(t, d); ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ auto expected = AllocateAligned(N); ++ constexpr T kMin = LimitsMin(); ++ constexpr T kMax = LimitsMax(); ++ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; ++ ++ // First test positive values, negative are checked below. 
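For reference (not part of the patch itself): RightShiftNegative above gives the tests a reference result that does not depend on how the C++ >> operator treats negative operands: shift the raw bits as unsigned, then OR the sign-extension bits back in. A standalone 32-bit sketch of the same construction, with an illustrative name:

#include <cassert>
#include <cstdint>
#include <cstring>

// Arithmetic right shift of a negative 32-bit value via an unsigned shift;
// valid for kAmount in [0, 31] and negative inputs only.
template <int kAmount>
static int32_t ShiftRightNegative32(int32_t val) {
  uint32_t bits;
  std::memcpy(&bits, &val, sizeof(bits));
  const uint32_t shifted = bits >> kAmount;
  // Set the top kAmount+1 bits; the lowest of them is the shifted sign bit.
  const uint32_t sign_extended = ~uint32_t(0) << (31 - kAmount);
  bits = shifted | sign_extended;
  std::memcpy(&val, &bits, sizeof(val));
  return val;
}

int main() {
  assert(ShiftRightNegative32<1>(-2) == -1);
  assert(ShiftRightNegative32<4>(-1) == -1);
  assert(ShiftRightNegative32<3>(INT32_MIN) == INT32_MIN / 8);
  return 0;
}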
++ const auto v0 = Zero(d); ++ const auto values = Iota(d, 0) & Set(d, kMax); ++ ++ // Shift by 0 ++ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); ++ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); ++ ++ // Shift by 1 ++ for (size_t i = 0; i < N; ++i) { ++ expected[i] = T(T(i & kMax) >> 1); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); ++ ++ // max ++ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(values)); ++ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); ++ ++ // Even negative value ++ Test<0>(kMin, d, __LINE__); ++ Test<1>(kMin, d, __LINE__); ++ Test<2>(kMin, d, __LINE__); ++ Test(kMin, d, __LINE__); ++ ++ const T odd = static_cast(kMin + 1); ++ Test<0>(odd, d, __LINE__); ++ Test<1>(odd, d, __LINE__); ++ Test<2>(odd, d, __LINE__); ++ Test(odd, d, __LINE__); ++ } ++ ++ private: ++ template ++ void Test(T val, D d, int line) { ++ const auto expected = Set(d, RightShiftNegative(val)); ++ const auto in = Set(d, val); ++ const char* file = __FILE__; ++ AssertVecEqual(d, expected, ShiftRight(in), file, line); ++ AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line); ++ } ++}; + ++struct TestVariableSignedRightShifts { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TU = MakeUnsigned; + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + constexpr T kMin = LimitsMin(); +- const auto values = Iota(d, kMin); ++ constexpr T kMax = LimitsMax(); + + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; ++ ++ // First test positive values, negative are checked below. ++ const auto v0 = Zero(d); ++ const auto positive = Iota(d, 0) & Set(d, kMax); ++ ++ // Shift by 0 ++ HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive)); ++ HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0)); ++ ++ // Shift by 1 ++ for (size_t i = 0; i < N; ++i) { ++ expected[i] = T(T(i & kMax) >> 1); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1)); ++ ++ // max ++ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(positive)); ++ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift)); ++ + const auto max_shift = Set(d, kMaxShift); + const auto small_shifts = And(Iota(d, 0), max_shift); + const auto large_shifts = max_shift - small_shifts; + +- // Test varying values to shift ++ const auto negative = Iota(d, kMin); ++ ++ // Test varying negative to shift + for (size_t i = 0; i < N; ++i) { +- // We want a right-shift here, which is undefined behavior for negative +- // numbers. Since we want (-1)>>1 to be -1, we need to adjust rounding if +- // minT is odd and negative. +- T minT = static_cast(kMin + i); +- expected[i] = T(minT / 2 + (minT < 0 ? 
minT % 2 : 0)); ++ expected[i] = RightShiftNegative<1>(static_cast(kMin + i)); + } +- HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, Set(d, 1))); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1))); + + // Shift MSB right by small amounts + for (size_t i = 0; i < N; ++i) { +@@ -343,6 +479,13 @@ struct TestSignedRightShifts { + }; + + HWY_NOINLINE void TestAllShifts() { ++ ForUnsignedTypes(ForPartialVectors>()); ++ ForSignedTypes(ForPartialVectors>()); ++ ForUnsignedTypes(ForPartialVectors()); ++ ForSignedTypes(ForPartialVectors()); ++} ++ ++HWY_NOINLINE void TestAllVariableShifts() { + const ForPartialVectors> shl_u; + const ForPartialVectors> shl_s; + const ForPartialVectors shr_u; +@@ -821,6 +964,40 @@ HWY_NOINLINE void TestAllRound() { + ForFloatTypes(ForPartialVectors()); + } + ++struct TestNearestInt { ++ template ++ HWY_NOINLINE void operator()(TF tf, const DF df) { ++ using TI = MakeSigned; ++ const RebindToSigned di; ++ ++ size_t padded; ++ auto in = RoundTestCases(tf, df, padded); ++ auto expected = AllocateAligned(padded); ++ ++ constexpr double max = static_cast(LimitsMax()); ++ for (size_t i = 0; i < padded; ++i) { ++ if (std::isnan(in[i])) { ++ // We replace NaN with 0 below (no_nan) ++ expected[i] = 0; ++ } else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) { ++ // Avoid undefined result for lrintf ++ expected[i] = std::signbit(in[i]) ? LimitsMin() : LimitsMax(); ++ } else { ++ expected[i] = lrintf(in[i]); ++ } ++ } ++ for (size_t i = 0; i < padded; i += Lanes(df)) { ++ const auto v = Load(df, &in[i]); ++ const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df)); ++ HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan)); ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllNearestInt() { ++ ForPartialVectors()(float()); ++} ++ + struct TestTrunc { + template + HWY_NOINLINE void operator()(T t, D d) { +@@ -909,8 +1086,7 @@ struct TestSumOfLanes { + }; + + HWY_NOINLINE void TestAllSumOfLanes() { +- // Only full vectors because lanes in partial vectors are undefined. +- const ForFullVectors sum; ++ const ForPartialVectors sum; + + // No u8/u16/i8/i16. + sum(uint32_t()); +@@ -976,9 +1152,8 @@ struct TestMaxOfLanes { + }; + + HWY_NOINLINE void TestAllMinMaxOfLanes() { +- // Only full vectors because lanes in partial vectors are undefined. +- const ForFullVectors min; +- const ForFullVectors max; ++ const ForPartialVectors min; ++ const ForPartialVectors max; + + // No u8/u16/i8/i16. 
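TestNearestInt above builds its expected lanes from a scalar rule: NaN inputs are first replaced by zero (the no_nan select), values whose magnitude reaches the int32 range saturate to LimitsMin/LimitsMax so lrintf is never given an out-of-range argument, and everything else is rounded with lrintf. A standalone sketch of that reference conversion for int32_t lanes (NearestIntScalar is a name invented for this example):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int32_t NearestIntScalar(float f) {
  if (std::isnan(f)) return 0;  // mirrors the no_nan replacement
  const double max = static_cast<double>(std::numeric_limits<int32_t>::max());
  if (std::isinf(f) || static_cast<double>(std::fabs(f)) >= max) {
    return std::signbit(f) ? std::numeric_limits<int32_t>::min()
                           : std::numeric_limits<int32_t>::max();
  }
  return static_cast<int32_t>(std::lrintf(f));  // round to nearest
}

int main() {
  std::printf("%d %d %d\n",
              NearestIntScalar(-3.7f),   // -4
              NearestIntScalar(2.5f),    // 2 under the default ties-to-even mode
              NearestIntScalar(1e30f));  // saturates to INT32_MAX
  return 0;
}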
+ min(uint32_t()); +@@ -1044,10 +1219,12 @@ HWY_NOINLINE void TestAllNeg() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyArithmeticTest); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts); ++HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs); +@@ -1062,10 +1239,11 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRound); ++HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNearestInt); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12 2021-06-02 10:56:05.252904478 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -272,13 +272,14 @@ HWY_NOINLINE void TestAllCombineShiftRig + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyCombineTest); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif + + #else +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12 2021-06-02 10:56:05.249904463 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -206,11 +206,12 @@ HWY_NOINLINE void TestAllWeakFloat() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyCompareTest); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE +diff 
-up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12 2021-06-02 10:56:05.261904523 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -16,8 +16,6 @@ + #include + #include + +-#include +- + #undef HWY_TARGET_INCLUDE + #define HWY_TARGET_INCLUDE "tests/convert_test.cc" + #include "hwy/foreach_target.h" +@@ -547,37 +545,6 @@ HWY_NOINLINE void TestAllI32F64() { + #endif + } + +-struct TestNearestInt { +- template +- HWY_NOINLINE void operator()(TI /*unused*/, const DI di) { +- using TF = MakeFloat; +- const Rebind df; +- const size_t N = Lanes(df); +- +- // Integer positive +- HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 4.0f))); +- +- // Integer negative +- HWY_ASSERT_VEC_EQ(di, Iota(di, -32), NearestInt(Iota(df, -32.0f))); +- +- // Above positive +- HWY_ASSERT_VEC_EQ(di, Iota(di, 2), NearestInt(Iota(df, 2.001f))); +- +- // Below positive +- HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 3.9999f))); +- +- const TF eps = static_cast(0.0001); +- // Above negative +- HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) + eps))); +- +- // Below negative +- HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) - eps))); +- } +-}; +- +-HWY_NOINLINE void TestAllNearestInt() { +- ForPartialVectors()(int32_t()); +-} + + // NOLINTNEXTLINE(google-readability-namespace-comments) + } // namespace HWY_NAMESPACE +@@ -585,6 +552,7 @@ HWY_NOINLINE void TestAllNearestInt() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyConvertTest); + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast); + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo); +@@ -596,6 +564,5 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, Te + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat); + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt); + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64); +-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllNearestInt); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12 2021-06-02 10:56:05.245904442 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -14,6 +14,7 @@ + + #include + #include ++#include // memcmp + + #include "hwy/base.h" + +@@ -159,6 +160,30 @@ HWY_NOINLINE void TestAllCopySign() { + 
ForFloatTypes(ForPartialVectors()); + } + ++struct TestFirstN { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ auto mask_lanes = AllocateAligned(N); ++ ++ // NOTE: reverse polarity (mask is true iff mask_lanes[i] == 0) because we ++ // cannot reliably compare against all bits set (NaN for float types). ++ const T off = 1; ++ ++ for (size_t len = 0; len <= N; ++len) { ++ for (size_t i = 0; i < N; ++i) { ++ mask_lanes[i] = i < len ? T(0) : off; ++ } ++ const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d)); ++ HWY_ASSERT_MASK_EQ(d, mask, FirstN(d, len)); ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllFirstN() { ++ ForAllTypes(ForPartialVectors()); ++} ++ + struct TestIfThenElse { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +@@ -208,15 +233,56 @@ HWY_NOINLINE void TestAllIfThenElse() { + ForAllTypes(ForPartialVectors()); + } + +-// Also tests MaskFromVec/VecFromMask ++struct TestMaskVec { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ RandomState rng; ++ ++ const size_t N = Lanes(d); ++ auto mask_lanes = AllocateAligned(N); ++ ++ // Each lane should have a chance of having mask=true. ++ for (size_t rep = 0; rep < 100; ++rep) { ++ for (size_t i = 0; i < N; ++i) { ++ mask_lanes[i] = static_cast(Random32(&rng) & 1); ++ } ++ ++ const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d))); ++ HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllMaskVec() { ++ const ForPartialVectors test; ++ ++ test(uint16_t()); ++ test(int16_t()); ++ // TODO(janwas): float16_t - cannot compare yet ++ ++ test(uint32_t()); ++ test(int32_t()); ++ test(float()); ++ ++#if HWY_CAP_INTEGER64 ++ test(uint64_t()); ++ test(int64_t()); ++#endif ++#if HWY_CAP_FLOAT64 ++ test(double()); ++#endif ++} ++ + struct TestCompress { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + ++ using TU = MakeUnsigned; ++ const Rebind du; + const size_t N = Lanes(d); + auto in_lanes = AllocateAligned(N); +- auto mask_lanes = AllocateAligned(N); ++ auto mask_lanes = AllocateAligned(N); + auto expected = AllocateAligned(N); + auto actual = AllocateAligned(N); + +@@ -224,35 +290,56 @@ struct TestCompress { + for (size_t rep = 0; rep < 100; ++rep) { + size_t expected_pos = 0; + for (size_t i = 0; i < N; ++i) { +- in_lanes[i] = static_cast(Random32(&rng)); +- mask_lanes[i] = static_cast(Random32(&rng) & 1); ++ const uint64_t bits = Random32(&rng); ++ in_lanes[i] = T(); // cannot initialize float16_t directly. ++ CopyBytes(&bits, &in_lanes[i]); ++ mask_lanes[i] = static_cast(Random32(&rng) & 1); + if (mask_lanes[i] == 0) { // Zero means true (easier to compare) + expected[expected_pos++] = in_lanes[i]; + } + } + + const auto in = Load(d, in_lanes.get()); +- const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d)); ++ const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du))); + +- HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); + Store(Compress(in, mask), d, actual.get()); + // Upper lanes are undefined. + for (size_t i = 0; i < expected_pos; ++i) { +- HWY_ASSERT(actual[i] == expected[i]); ++ HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0); + } + + // Also check CompressStore in the same way. 
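The TestCompress loop above compares only the first expected_pos lanes, bytewise, because Compress packs the lanes whose mask is set toward lane 0 and leaves the rest unspecified, while CompressStore (checked just below) additionally reports how many lanes were written. A scalar reference for those semantics (CompressScalar is a name made up for this sketch):

#include <cstddef>
#include <cstdio>

template <typename T>
size_t CompressScalar(const T* in, const bool* mask, size_t n, T* out) {
  size_t pos = 0;
  for (size_t i = 0; i < n; ++i) {
    if (mask[i]) out[pos++] = in[i];  // packed toward the front
  }
  return pos;  // what CompressStore reports as num_written
}

int main() {
  const int in[4] = {10, 20, 30, 40};
  const bool mask[4] = {true, false, true, false};
  int out[4] = {0, 0, 0, 0};
  const size_t n = CompressScalar(in, mask, 4, out);
  std::printf("%zu kept: %d %d\n", n, out[0], out[1]);  // 2 kept: 10 30
  return 0;
}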
+- std::fill(actual.get(), actual.get() + N, T(0)); ++ memset(actual.get(), 0, N * sizeof(T)); + const size_t num_written = CompressStore(in, mask, d, actual.get()); + HWY_ASSERT_EQ(expected_pos, num_written); + for (size_t i = 0; i < expected_pos; ++i) { +- HWY_ASSERT_EQ(expected[i], actual[i]); ++ HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0); + } + } + } + }; + + #if 0 ++namespace detail { // for code folding ++void PrintCompress16x8Tables() { ++ constexpr size_t N = 8; // 128-bit SIMD ++ for (uint64_t code = 0; code < 1ull << N; ++code) { ++ std::array indices{0}; ++ size_t pos = 0; ++ for (size_t i = 0; i < N; ++i) { ++ if (code & (1ull << i)) { ++ indices[pos++] = i; ++ } ++ } ++ ++ // Doubled (for converting lane to byte indices) ++ for (size_t i = 0; i < N; ++i) { ++ printf("%d,", 2 * indices[i]); ++ } ++ } ++ printf("\n"); ++} ++ + // Compressed to nibbles + void PrintCompress32x8Tables() { + constexpr size_t N = 8; // AVX2 +@@ -340,16 +427,22 @@ void PrintCompress64x2Tables() { + } + printf("\n"); + } +- ++} // namespace detail + #endif + + HWY_NOINLINE void TestAllCompress() { +- // PrintCompress32x8Tables(); +- // PrintCompress64x4Tables(); +- // PrintCompress32x4Tables(); +- // PrintCompress64x2Tables(); ++ // detail::PrintCompress32x8Tables(); ++ // detail::PrintCompress64x4Tables(); ++ // detail::PrintCompress32x4Tables(); ++ // detail::PrintCompress64x2Tables(); ++ // detail::PrintCompress16x8Tables(); + + const ForPartialVectors test; ++ ++ test(uint16_t()); ++ test(int16_t()); ++ test(float16_t()); ++ + test(uint32_t()); + test(int32_t()); + test(float()); +@@ -358,7 +451,6 @@ HWY_NOINLINE void TestAllCompress() { + test(uint64_t()); + test(int64_t()); + #endif +- + #if HWY_CAP_FLOAT64 + test(double()); + #endif +@@ -432,7 +524,7 @@ struct TestTestBit { + }; + + HWY_NOINLINE void TestAllTestBit() { +- ForIntegerTypes(ForFullVectors()); ++ ForIntegerTypes(ForPartialVectors()); + } + + struct TestAllTrueFalse { +@@ -445,6 +537,8 @@ struct TestAllTrueFalse { + auto lanes = AllocateAligned(N); + std::fill(lanes.get(), lanes.get() + N, T(0)); + ++ auto mask_lanes = AllocateAligned(N); ++ + HWY_ASSERT(AllTrue(Eq(v, zero))); + HWY_ASSERT(!AllFalse(Eq(v, zero))); + +@@ -456,7 +550,13 @@ struct TestAllTrueFalse { + for (size_t i = 0; i < N; ++i) { + lanes[i] = T(1); + v = Load(d, lanes.get()); +- HWY_ASSERT(!AllTrue(Eq(v, zero))); ++ ++ // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be. ++ // Assigning to an lvalue is insufficient but storing to memory prevents ++ // the bug; so does Print of VecFromMask(d, Eq(v, zero)). 
++ Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get()); ++ HWY_ASSERT(!AllTrue(MaskFromVec(Load(d, mask_lanes.get())))); ++ + HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero))); + + lanes[i] = T(-1); +@@ -596,7 +696,7 @@ struct TestLogicalMask { + }; + + HWY_NOINLINE void TestAllLogicalMask() { +- ForAllTypes(ForFullVectors()); ++ ForAllTypes(ForPartialVectors()); + } + // NOLINTNEXTLINE(google-readability-namespace-comments) + } // namespace HWY_NAMESPACE +@@ -604,11 +704,14 @@ HWY_NOINLINE void TestAllLogicalMask() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyLogicalTest); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign); ++HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllFirstN); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse); ++HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit); +@@ -617,5 +720,5 @@ HWY_EXPORT_AND_TEST_P(HwyLogicalTest, Te + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12 2021-06-02 10:56:05.247904453 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -12,6 +12,12 @@ + // See the License for the specific language governing permissions and + // limitations under the License. + ++// Ensure incompabilities with Windows macros (e.g. #define StoreFence) are ++// detected. Must come before Highway headers. 
++#if defined(_WIN32) || defined(_WIN64) ++#include ++#endif ++ + #include + #include + +@@ -76,6 +82,119 @@ HWY_NOINLINE void TestAllLoadStore() { + ForAllTypes(ForPartialVectors()); + } + ++struct TestStoreInterleaved3 { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ ++ RandomState rng; ++ ++ // Data to be interleaved ++ auto bytes = AllocateAligned(3 * N); ++ for (size_t i = 0; i < 3 * N; ++i) { ++ bytes[i] = static_cast(Random32(&rng) & 0xFF); ++ } ++ const auto in0 = Load(d, &bytes[0 * N]); ++ const auto in1 = Load(d, &bytes[1 * N]); ++ const auto in2 = Load(d, &bytes[2 * N]); ++ ++ // Interleave here, ensure vector results match scalar ++ auto expected = AllocateAligned(4 * N); ++ auto actual_aligned = AllocateAligned(4 * N + 1); ++ T* actual = actual_aligned.get() + 1; ++ ++ for (size_t rep = 0; rep < 100; ++rep) { ++ for (size_t i = 0; i < N; ++i) { ++ expected[3 * i + 0] = bytes[0 * N + i]; ++ expected[3 * i + 1] = bytes[1 * N + i]; ++ expected[3 * i + 2] = bytes[2 * N + i]; ++ // Ensure we do not write more than 3*N bytes ++ expected[3 * N + i] = actual[3 * N + i] = 0; ++ } ++ StoreInterleaved3(in0, in1, in2, d, actual); ++ size_t pos = 0; ++ if (!BytesEqual(expected.get(), actual, 4 * N, &pos)) { ++ Print(d, "in0", in0, pos / 3); ++ Print(d, "in1", in1, pos / 3); ++ Print(d, "in2", in2, pos / 3); ++ const size_t i = pos - pos % 3; ++ fprintf(stderr, "interleaved %d %d %d %d %d %d\n", actual[i], ++ actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4], ++ actual[i + 5]); ++ HWY_ASSERT(false); ++ } ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllStoreInterleaved3() { ++#if HWY_TARGET == HWY_RVV ++ // Segments are limited to 8 registers, so we can only go up to LMUL=2. ++ const ForExtendableVectors test; ++#else ++ const ForPartialVectors test; ++#endif ++ test(uint8_t()); ++} ++ ++struct TestStoreInterleaved4 { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ ++ RandomState rng; ++ ++ // Data to be interleaved ++ auto bytes = AllocateAligned(4 * N); ++ for (size_t i = 0; i < 4 * N; ++i) { ++ bytes[i] = static_cast(Random32(&rng) & 0xFF); ++ } ++ const auto in0 = Load(d, &bytes[0 * N]); ++ const auto in1 = Load(d, &bytes[1 * N]); ++ const auto in2 = Load(d, &bytes[2 * N]); ++ const auto in3 = Load(d, &bytes[3 * N]); ++ ++ // Interleave here, ensure vector results match scalar ++ auto expected = AllocateAligned(5 * N); ++ auto actual_aligned = AllocateAligned(5 * N + 1); ++ T* actual = actual_aligned.get() + 1; ++ ++ for (size_t rep = 0; rep < 100; ++rep) { ++ for (size_t i = 0; i < N; ++i) { ++ expected[4 * i + 0] = bytes[0 * N + i]; ++ expected[4 * i + 1] = bytes[1 * N + i]; ++ expected[4 * i + 2] = bytes[2 * N + i]; ++ expected[4 * i + 3] = bytes[3 * N + i]; ++ // Ensure we do not write more than 4*N bytes ++ expected[4 * N + i] = actual[4 * N + i] = 0; ++ } ++ StoreInterleaved4(in0, in1, in2, in3, d, actual); ++ size_t pos = 0; ++ if (!BytesEqual(expected.get(), actual, 5 * N, &pos)) { ++ Print(d, "in0", in0, pos / 4); ++ Print(d, "in1", in1, pos / 4); ++ Print(d, "in2", in2, pos / 4); ++ Print(d, "in3", in3, pos / 4); ++ const size_t i = pos; ++ fprintf(stderr, "interleaved %d %d %d %d %d %d %d %d\n", actual[i], ++ actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4], ++ actual[i + 5], actual[i + 6], actual[i + 7]); ++ HWY_ASSERT(false); ++ } ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllStoreInterleaved4() { ++#if HWY_TARGET == HWY_RVV ++ // Segments are limited to 8 
registers, so we can only go up to LMUL=2. ++ const ForExtendableVectors test; ++#else ++ const ForPartialVectors test; ++#endif ++ test(uint8_t()); ++} ++ + struct TestLoadDup128 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +@@ -86,13 +205,14 @@ struct TestLoadDup128 { + for (size_t i = 0; i < N128; ++i) { + lanes[i] = static_cast(1 + i); + } +- const auto v = LoadDup128(d, lanes); ++ + const size_t N = Lanes(d); +- auto out = AllocateAligned(N); +- Store(v, d, out.get()); ++ auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { +- HWY_ASSERT_EQ(T(i % N128 + 1), out[i]); ++ expected[i] = static_cast(i % N128 + 1); + } ++ ++ HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes)); + #else + (void)d; + #endif +@@ -136,6 +256,84 @@ HWY_NOINLINE void TestAllStream() { + ForFloatTypes(test); + } + ++// Assumes little-endian byte order! ++struct TestScatter { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ using Offset = MakeSigned; ++ ++ const size_t N = Lanes(d); ++ const size_t range = 4 * N; // number of items to scatter ++ const size_t max_bytes = range * sizeof(T); // upper bound on offset ++ ++ RandomState rng; ++ ++ // Data to be scattered ++ auto bytes = AllocateAligned(max_bytes); ++ for (size_t i = 0; i < max_bytes; ++i) { ++ bytes[i] = static_cast(Random32(&rng) & 0xFF); ++ } ++ const auto data = Load(d, reinterpret_cast(bytes.get())); ++ ++ // Scatter into these regions, ensure vector results match scalar ++ auto expected = AllocateAligned(range); ++ auto actual = AllocateAligned(range); ++ ++ const Rebind d_offsets; ++ auto offsets = AllocateAligned(N); // or indices ++ ++ for (size_t rep = 0; rep < 100; ++rep) { ++ // Byte offsets ++ std::fill(expected.get(), expected.get() + range, T(0)); ++ std::fill(actual.get(), actual.get() + range, T(0)); ++ for (size_t i = 0; i < N; ++i) { ++ offsets[i] = ++ static_cast(Random32(&rng) % (max_bytes - sizeof(T))); ++ CopyBytes( ++ bytes.get() + i * sizeof(T), ++ reinterpret_cast(expected.get()) + offsets[i]); ++ } ++ const auto voffsets = Load(d_offsets, offsets.get()); ++ ScatterOffset(data, d, actual.get(), voffsets); ++ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { ++ Print(d, "Data", data); ++ Print(d_offsets, "Offsets", voffsets); ++ HWY_ASSERT(false); ++ } ++ ++ // Indices ++ std::fill(expected.get(), expected.get() + range, T(0)); ++ std::fill(actual.get(), actual.get() + range, T(0)); ++ for (size_t i = 0; i < N; ++i) { ++ offsets[i] = static_cast(Random32(&rng) % range); ++ CopyBytes(bytes.get() + i * sizeof(T), ++ &expected[offsets[i]]); ++ } ++ const auto vindices = Load(d_offsets, offsets.get()); ++ ScatterIndex(data, d, actual.get(), vindices); ++ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { ++ Print(d, "Data", data); ++ Print(d_offsets, "Indices", vindices); ++ HWY_ASSERT(false); ++ } ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllScatter() { ++ // No u8,u16,i8,i16. ++ const ForPartialVectors test; ++ test(uint32_t()); ++ test(int32_t()); ++ ++#if HWY_CAP_INTEGER64 ++ test(uint64_t()); ++ test(int64_t()); ++#endif ++ ++ ForFloatTypes(test); ++} ++ + struct TestGather { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +@@ -183,21 +381,15 @@ struct TestGather { + + HWY_NOINLINE void TestAllGather() { + // No u8,u16,i8,i16. 
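TestScatter above emulates the two scatter flavors exactly as a scalar implementation would: ScatterOffset interprets its operand as byte offsets from the destination base (hence the CopyBytes), ScatterIndex interprets it as lane indices. A plain C++ sketch of both (the *Scalar names are invented for this example):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Byte-offset flavor: element i lands at base + offsets[i] bytes.
template <typename T, typename Offset>
void ScatterOffsetScalar(const T* v, size_t n, uint8_t* base,
                         const Offset* offsets) {
  for (size_t i = 0; i < n; ++i) {
    std::memcpy(base + offsets[i], &v[i], sizeof(T));
  }
}

// Index flavor: element i lands at base[indices[i]].
template <typename T, typename Index>
void ScatterIndexScalar(const T* v, size_t n, T* base, const Index* indices) {
  for (size_t i = 0; i < n; ++i) {
    base[indices[i]] = v[i];
  }
}

int main() {
  const int32_t v[2] = {7, 9};
  int32_t dst[4] = {0, 0, 0, 0};
  const int32_t idx[2] = {3, 1};
  ScatterIndexScalar(v, 2, dst, idx);  // dst becomes {0, 9, 0, 7}
  const int32_t off[2] = {0, 8};       // byte offsets of lanes 0 and 2
  ScatterOffsetScalar(v, 2, reinterpret_cast<uint8_t*>(dst), off);
  std::printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  // 7 9 9 7
  return 0;
}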
+- const ForPartialVectors test32; +- test32(uint32_t()); +- test32(int32_t()); ++ const ForPartialVectors test; ++ test(uint32_t()); ++ test(int32_t()); + + #if HWY_CAP_INTEGER64 +- const ForPartialVectors test64; +- test64(uint64_t()); +- test64(int64_t()); +-#endif +- +- ForPartialVectors()(float()); +- +-#if HWY_CAP_FLOAT64 +- ForPartialVectors()(double()); ++ test(uint64_t()); ++ test(int64_t()); + #endif ++ ForFloatTypes(test); + } + + HWY_NOINLINE void TestAllCache() { +@@ -206,6 +398,7 @@ HWY_NOINLINE void TestAllCache() { + int test = 0; + Prefetch(&test); + FlushCacheline(&test); ++ Pause(); + } + + // NOLINTNEXTLINE(google-readability-namespace-comments) +@@ -214,11 +407,15 @@ HWY_NOINLINE void TestAllCache() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyMemoryTest); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore); ++HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3); ++HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream); ++HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12 2021-06-02 10:56:05.259904513 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -223,6 +223,7 @@ struct TestTableLookupBytes { + HWY_NOINLINE void TestAllTableLookupBytes() { + ForIntegerTypes(ForPartialVectors()); + } ++ + struct TestTableLookupLanes { + #if HWY_TARGET == HWY_RVV + using Index = uint32_t; +@@ -242,12 +243,13 @@ struct TestTableLookupLanes { + if (N <= 8) { // Test all permutations + for (size_t i0 = 0; i0 < N; ++i0) { + idx[0] = static_cast(i0); ++ + for (size_t i1 = 0; i1 < N; ++i1) { +- idx[1] = static_cast(i1); ++ if (N >= 2) idx[1] = static_cast(i1); + for (size_t i2 = 0; i2 < N; ++i2) { +- idx[2] = static_cast(i2); ++ if (N >= 4) idx[2] = static_cast(i2); + for (size_t i3 = 0; i3 < N; ++i3) { +- idx[3] = static_cast(i3); ++ if (N >= 4) idx[3] = static_cast(i3); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast(idx[i] + 1); // == v[idx[i]] +@@ -286,7 +288,7 @@ struct TestTableLookupLanes { + }; + + HWY_NOINLINE void TestAllTableLookupLanes() { +- const ForFullVectors test; ++ const ForPartialVectors test; + test(uint32_t()); + test(int32_t()); + test(float()); +@@ -624,6 +626,7 @@ HWY_NOINLINE void TestAllOddEven() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwySwizzleTest); + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftBytes); + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftLanes); +@@ -637,5 +640,5 @@ HWY_EXPORT_AND_TEST_P(HwySwizzleTest, Te + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatLowerUpper); + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatUpperLower); + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up 
chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 2021-06-02 10:56:05.254904488 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -23,7 +23,6 @@ + #include + #include + +-#include // isfinite + #include + #include + #include // std::forward +@@ -73,7 +72,8 @@ class TestWithParamTarget : public testi + + // Function to convert the test parameter of a TestWithParamTarget for + // displaying it in the gtest test name. +-std::string TestParamTargetName(const testing::TestParamInfo& info) { ++static inline std::string TestParamTargetName( ++ const testing::TestParamInfo& info) { + return TargetName(info.param); + } + +@@ -157,31 +157,10 @@ std::string TestParamTargetNameAndT( + static_assert(true, "For requiring trailing semicolon") + + #define HWY_BEFORE_TEST(suite) \ +- namespace hwy { \ + class suite : public hwy::TestWithParamTarget {}; \ + HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \ + static_assert(true, "For requiring trailing semicolon") + +-#define HWY_AFTER_TEST() \ +- } /* namespace hwy */ \ +- static_assert(true, "For requiring trailing semicolon") +- +-// Calls test for each enabled and available target. +-template +-HWY_NOINLINE void RunTest(const Func& func, Args&&... args) { +- SetSupportedTargetsForTest(0); +- auto targets = SupportedAndGeneratedTargets(); +- +- for (uint32_t target : targets) { +- SetSupportedTargetsForTest(target); +- fprintf(stderr, "Testing for target %s.\n", +- TargetName(static_cast(target))); +- func(std::forward(args)...); +- } +- // Disable the mask after the test. +- SetSupportedTargetsForTest(0); +-} +- + // 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937, + // which triggers a compiler bug. + class RandomState { +@@ -223,9 +202,11 @@ static HWY_INLINE uint32_t Random32(Rand + // built-in types. + template + inline void PreventElision(T&& output) { +-#ifndef _MSC_VER ++#if HWY_COMPILER_MSVC ++ (void)output; ++#else // HWY_COMPILER_MSVC + asm volatile("" : "+r"(output) : : "memory"); +-#endif ++#endif // HWY_COMPILER_MSVC + } + + // Returns a name for the vector/part/scalar. The type prefix is u/i/f for +@@ -234,23 +215,34 @@ inline void PreventElision(T&& output) { + // understanding which instantiation of a generic test failed. + template + static inline std::string TypeName(T /*unused*/, size_t N) { +- std::string prefix(IsFloat() ? "f" : (IsSigned() ? "i" : "u")); +- prefix += std::to_string(sizeof(T) * 8); +- +- // Scalars: omit the xN suffix. +- if (N == 1) return prefix; +- +- return prefix + 'x' + std::to_string(N); ++ const char prefix = IsFloat() ? 'f' : (IsSigned() ? 'i' : 'u'); ++ char name[64]; ++ // Omit the xN suffix for scalars. 
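PreventElision in the hunk above keeps computed values observable to the optimizer: an empty GNU-style asm statement with a "+r" constraint and a "memory" clobber makes the compiler treat the value as read and possibly modified, so the work that produced it cannot be dead-code eliminated. A freestanding sketch of the same idiom for scalar types (KeepAlive is an invented name; the non-GNU fallback is an assumption, not the library's code):

#include <cstdio>

template <typename T>
inline void KeepAlive(T& value) {
#if defined(__GNUC__) || defined(__clang__)
  asm volatile("" : "+r"(value) : : "memory");  // value must be materialized in a register
#else
  volatile T sink = value;  // assumed fallback: force a store instead
  (void)sink;
#endif
}

int main() {
  int sum = 0;
  for (int i = 0; i < 1000; ++i) sum += i;
  KeepAlive(sum);  // the loop result cannot be discarded as dead code
  std::printf("%d\n", sum);
  return 0;
}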
++ if (N == 1) { ++ snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8); ++ } else { ++ snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N); ++ } ++ return name; + } + + // String comparison + + template +-inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size) { ++inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size, ++ size_t* pos = nullptr) { + const uint8_t* bytes1 = reinterpret_cast(p1); + const uint8_t* bytes2 = reinterpret_cast(p2); + for (size_t i = 0; i < size; ++i) { +- if (bytes1[i] != bytes2[i]) return false; ++ if (bytes1[i] != bytes2[i]) { ++ fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i, ++ size, bytes1[i], bytes2[i], TypeName(T1(), 1).c_str(), ++ TypeName(T2(), 1).c_str()); ++ if (pos != nullptr) { ++ *pos = i; ++ } ++ return false; ++ } + } + return true; + } +@@ -287,11 +279,11 @@ HWY_NOINLINE void Print(const D d, const + auto lanes = AllocateAligned(N); + Store(v, d, lanes.get()); + const size_t begin = static_cast(std::max(0, lane - 2)); +- const size_t end = std::min(begin + 5, N); ++ const size_t end = std::min(begin + 7, N); + fprintf(stderr, "%s %s [%zu+ ->]:\n ", TypeName(T(), N).c_str(), caption, + begin); + for (size_t i = begin; i < end; ++i) { +- fprintf(stderr, "%s,", std::to_string(lanes[i]).c_str()); ++ fprintf(stderr, "%g,", double(lanes[i])); + } + if (begin >= end) fprintf(stderr, "(out of bounds)"); + fprintf(stderr, "\n"); +@@ -352,10 +344,12 @@ HWY_NOINLINE void AssertEqual(const T ex + const char* filename = "", const int line = -1, + const size_t lane = 0) { + if (!IsEqual(expected, actual)) { +- const std::string expected_str = std::to_string(expected); +- const std::string actual_str = std::to_string(actual); +- NotifyFailure(filename, line, type_name.c_str(), lane, expected_str.c_str(), +- actual_str.c_str()); ++ char expected_str[100]; ++ snprintf(expected_str, sizeof(expected_str), "%g", double(expected)); ++ char actual_str[100]; ++ snprintf(actual_str, sizeof(actual_str), "%g", double(actual)); ++ NotifyFailure(filename, line, type_name.c_str(), lane, expected_str, ++ actual_str); + } + } + +@@ -382,9 +376,15 @@ HWY_NOINLINE void AssertVecEqual(D d, co + fprintf(stderr, "\n\n"); + Print(d, "expect", expected, i); + Print(d, "actual", actual, i); ++ ++ char expected_str[100]; ++ snprintf(expected_str, sizeof(expected_str), "%g", ++ double(expected_lanes[i])); ++ char actual_str[100]; ++ snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i])); ++ + NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i, +- std::to_string(expected_lanes[i]).c_str(), +- std::to_string(actual_lanes[i]).c_str()); ++ expected_str, actual_str); + } + } + } +@@ -458,11 +458,8 @@ struct ForeachSizeR ++// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes]. ++template + struct ForPartialVectors { + template + void operator()(T /*unused*/) const { +@@ -470,8 +467,8 @@ struct ForPartialVectors { + // Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full. + ForeachSizeR::Do(); + #else +- ForeachSizeR::Do(); ++ ForeachSizeR::Do(); + #endif + } + }; +@@ -505,33 +502,19 @@ struct ForGE128Vectors { + } + }; + +-// Calls Test for all powers of two in [128 bits, max bits/2]. +-template ++// Calls Test for all vectors that can be expanded by kFactor. 
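ForPartialVectors and ForExtendableVectors above expand to a compile-time loop: per the comment, Test is invoked for every power of two between a minimum lane count and HWY_LANES(T) divided by some factor. A simplified, self-contained sketch of that kind of recursion, passing a plain lane count where the real code passes a Simd descriptor (ForeachPow2 and PrintLaneCount are invented names):

#include <cstddef>
#include <cstdio>

// Invokes Test for kLanes, kLanes/2, ..., kMinLanes (all powers of two),
// terminating at the partial specialization below.
template <typename T, size_t kLanes, size_t kMinLanes, class Test>
struct ForeachPow2 {
  static void Do() {
    Test()(T(), kLanes);  // the real code passes a Simd<T, kLanes> tag here
    ForeachPow2<T, kLanes / 2, kMinLanes, Test>::Do();
  }
};

template <typename T, size_t kMinLanes, class Test>
struct ForeachPow2<T, kMinLanes, kMinLanes, Test> {
  static void Do() { Test()(T(), kMinLanes); }
};

struct PrintLaneCount {
  template <typename T>
  void operator()(T /*unused*/, size_t lanes) const {
    std::printf("%zu lanes\n", lanes);
  }
};

int main() {
  ForeachPow2<float, 16, 1, PrintLaneCount>::Do();  // 16, 8, 4, 2, 1
  return 0;
}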
++template + struct ForExtendableVectors { + template + void operator()(T /*unused*/) const { + #if HWY_TARGET == HWY_RVV +- ForeachSizeR::Do(); ++ ForeachSizeR::Do(); + #else +- ForeachSizeR::Do(); + #endif + } + }; +- +-// Calls Test for full vectors only. +-template +-struct ForFullVectors { +- template +- void operator()(T t) const { +-#if HWY_TARGET == HWY_RVV +- ForeachSizeR::Do(); +- (void)t; +-#else +- Test()(t, HWY_FULL(T)()); +-#endif +- } +-}; + + // Type lists to shorten call sites: + +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in +diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE +diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in +diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE +diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSE +diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSEE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSEE +diff -up chromium-91.0.4472.77/third_party/highway/src/Makefile.12 chromium-91.0.4472.77/third_party/highway/src/Makefile +diff -up chromium-91.0.4472.77/third_party/highway/src/MakefileE.12 chromium-91.0.4472.77/third_party/highway/src/MakefileE +diff -up chromium-91.0.4472.77/third_party/highway/src/README.md.12 chromium-91.0.4472.77/third_party/highway/src/README.md +--- chromium-91.0.4472.77/third_party/highway/src/README.md.12 2021-06-02 10:56:05.295904696 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/README.md 2021-05-31 10:37:11.000000000 -0400 +@@ -15,15 +15,19 @@ applying the same operation to 'lanes'. + ## Current status + + Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD. +-A port to RVV is in progress. ++Ports to RVV and SVE/SVE2 are in progress. + + Version 0.11 is considered stable enough to use in other projects, and is + expected to remain backwards compatible unless serious issues are discovered + while implementing SVE/RVV targets. After these targets are added, Highway will + reach version 1.0. + +-Continuous integration tests use a recent version of Clang and older version of +-MSVC (VS2015). Also periodically tested on Clang 7-11 and GCC 8, 9 and 10.2.1. ++Continuous integration tests build with a recent version of Clang (running on ++x86 and QEMU for ARM) and MSVC from VS2015 (running on x86). ++ ++Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via ++GCC cross-compile and QEMU. See the ++[testing process](g3doc/release_testing_process.md) for details. + + The `contrib` directory contains SIMD-related utilities: an image class with + aligned rows, and a math library (16 functions already implemented, mostly +@@ -62,9 +66,11 @@ To test on all the attainable targets fo + default configuration skips baseline targets (e.g. scalar) that are superseded + by another baseline target. + ++Bazel is also supported for building, but it is not as widely used/tested. ++ + ## Quick start + +-You can use the `skeleton` examples inside examples/ as a starting point. 
++You can use the `benchmark` inside examples/ as a starting point. + + A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations + and their parameters, and the [instruction_matrix][instmtx] indicates the +diff -up chromium-91.0.4472.77/third_party/highway/src/README.mdE.12 chromium-91.0.4472.77/third_party/highway/src/README.mdE +diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.bat +--- chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 2021-06-02 10:56:05.293904685 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/run_tests.bat 2021-05-31 10:37:11.000000000 -0400 +@@ -2,9 +2,9 @@ + REM Switch directory of this batch file + cd %~dp0 + +-if not exist build mkdir build ++if not exist build_win mkdir build_win + +-cd build ++cd build_win + cmake .. -G Ninja || goto error + ninja || goto error + ctest -j || goto error +diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.batE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.batE +diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.sh.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.sh +diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.shE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.shE +diff -up chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time +diff -up chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1.12 chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1 diff --git a/chromium.spec b/chromium.spec index c606365..4370a44 100644 --- a/chromium.spec +++ b/chromium.spec @@ -7,7 +7,7 @@ # This flag is so I can build things very fast on a giant system. # Do not enable in Koji builds. 
-%global use_all_cpus 0 +%global use_all_cpus 1 %if %{use_all_cpus} %global numjobs %{_smp_build_ncpus} @@ -208,14 +208,14 @@ BuildRequires: libicu-devel >= 5.4 %global chromoting_client_id %nil %endif -%global majorversion 90 +%global majorversion 91 %if %{freeworld} Name: chromium%{chromium_channel}%{nsuffix} %else Name: chromium%{chromium_channel} %endif -Version: %{majorversion}.0.4430.212 +Version: %{majorversion}.0.4472.77 Release: 1%{?dist} %if %{?freeworld} %if %{?shared} @@ -234,7 +234,7 @@ License: BSD and LGPLv2+ and ASL 2.0 and IJG and MIT and GPLv2+ and ISC and Open ### Chromium Fedora Patches ### Patch0: chromium-70.0.3538.67-sandbox-pie.patch # Use /etc/chromium for initial_prefs -Patch1: chromium-89.0.4389.72-initial_prefs-etc-path.patch +Patch1: chromium-91.0.4472.77-initial_prefs-etc-path.patch # Use gn system files Patch2: chromium-67.0.3396.62-gn-system.patch # Do not prefix libpng functions @@ -249,7 +249,7 @@ Patch6: chromium-89.0.4389.72-norar.patch # https://gitweb.gentoo.org/repo/gentoo.git/tree/www-client/chromium/files/chromium-widevine-r3.patch Patch7: chromium-71.0.3578.98-widevine-r3.patch # Disable fontconfig cache magic that breaks remoting -Patch8: chromium-83.0.4103.61-disable-fontconfig-cache-magic.patch +Patch8: chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch # drop rsp clobber, which breaks gcc9 (thanks to Jeff Law) Patch9: chromium-78.0.3904.70-gcc9-drop-rsp-clobber.patch # Try to load widevine from other places @@ -274,16 +274,20 @@ Patch57: chromium-89.0.4389.72-missing-cstring-header.patch # prepare for using system ffmpeg (clean) # http://svnweb.mageia.org/packages/cauldron/chromium-browser-stable/current/SOURCES/chromium-53-ffmpeg-no-deprecation-errors.patch?view=markup Patch58: chromium-53-ffmpeg-no-deprecation-errors.patch -# https://github.com/stha09/chromium-patches/blob/master/chromium-90-angle-constexpr.patch -Patch59: chromium-90-angle-constexpr.patch -# https://github.com/stha09/chromium-patches/blob/master/chromium-90-CrossThreadCopier-qualification.patch -Patch60: chromium-90-CrossThreadCopier-qualification.patch -# https://github.com/stha09/chromium-patches/blob/master/chromium-90-quantization_utils-include.patch -Patch61: chromium-90-quantization_utils-include.patch +# https://github.com/stha09/chromium-patches/blob/master/chromium-91-pcscan-vector-types.patch +Patch59: chromium-91-pcscan-vector-types.patch +# https://github.com/stha09/chromium-patches/blob/master/chromium-91-libyuv-aarch64.patch +Patch60: chromium-91-libyuv-aarch64.patch +# Update third_party/highway to 0.12.2 +# this is needed for sane arm/aarch64 +Patch61: chromium-91.0.4472.77-update-highway-0.12.2.patch # https://github.com/stha09/chromium-patches/blob/master/chromium-90-ruy-include.patch Patch62: chromium-90-ruy-include.patch -# https://github.com/stha09/chromium-patches/blob/master/chromium-90-TokenizedOutput-include.patch -Patch63: chromium-90-TokenizedOutput-include.patch +# Extra CXXFLAGS for aarch64 +Patch63: chromium-91.0.4472.77-aarch64-cxxflags-addition.patch +# Fix issue where closure_compiler thinks java is only allowed in android builds +# https://bugs.chromium.org/p/chromium/issues/detail?id=1192875 +Patch64: chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch # Silence GCC warnings during gn compile Patch65: chromium-84.0.4147.105-gn-gcc-cleanup.patch @@ -300,9 +304,6 @@ Patch75: chromium-90.0.4430.72-fstatfix.patch Patch76: chromium-88.0.4324.182-rawhide-gcc-std-max-fix.patch # Fix symbol visibility with gcc on swiftshader's 
libEGL Patch77: chromium-88.0.4324.182-gcc-fix-swiftshader-libEGL-visibility.patch -# Include support for futex_time64 (64bit time on 32bit platforms) -# https://chromium.googlesource.com/chromium/src/+/955a586c63c4f99fb734e44221db63f5b2ca25a9%5E%21/#F0 -Patch78: chromium-89.0.4389.82-support-futex_time64.patch # Do not download proprietary widevine module in the background (thanks Debian) Patch79: chromium-90.0.4430.72-widevine-no-download.patch # Fix crashes with components/cast_* @@ -409,6 +410,7 @@ BuildRequires: harfbuzz-devel >= 2.4.0 %endif BuildRequires: libatomic BuildRequires: libcap-devel +BuildRequires: libcurl-devel %if 0%{?bundlelibdrm} #nothing %else @@ -924,12 +926,12 @@ udev. %patch56 -p1 -b .missing-cstdint %patch57 -p1 -b .missing-cstring %patch58 -p1 -b .ffmpeg-deprecations -%patch59 -p1 -b .angle-constexpr -%patch60 -p1 -b .CrossThreadCopier-qualification -%patch61 -p1 -b .quantization_utils-include +%patch59 -p1 -b .pcscan-vector-types +%patch60 -p1 -b .libyuv-aarch64 +%patch61 -p1 -b .update-highway-0.12.2 %patch62 -p1 -b .ruy-include -%patch63 -p1 -b .TokenizedOutput - +%patch63 -p1 -b .aarch64-cxxflags-addition +%patch64 -p1 -b .java-only-allowed %patch65 -p1 -b .gn-gcc-cleanup %patch66 -p1 -b .remoting-cstring %patch67 -p1 -b .i686-textrels @@ -939,7 +941,6 @@ udev. %patch76 -p1 -b .sigstkszfix %endif %patch77 -p1 -b .gcc-swiftshader-visibility -%patch78 -p1 -b .futex-time64 %patch79 -p1 -b .widevine-no-download %patch80 -p1 -b .EnumTable-crash @@ -1209,6 +1210,7 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/cros_system_api' \ 'third_party/dav1d' \ 'third_party/dawn' \ + 'third_party/dawn/third_party/khronos' \ 'third_party/depot_tools' \ 'third_party/devscripts' \ 'third_party/devtools-frontend' \ @@ -1247,6 +1249,7 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/googletest' \ 'third_party/grpc' \ 'third_party/harfbuzz-ng' \ + 'third_party/highway' \ 'third_party/hunspell' \ 'third_party/iccjpeg' \ 'third_party/icu' \ @@ -1268,6 +1271,7 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/libgifcodec' \ 'third_party/libjingle' \ 'third_party/libjpeg_turbo' \ + 'third_party/libjxl' \ 'third_party/libphonenumber' \ 'third_party/libpng' \ 'third_party/libsecret' \ @@ -1341,7 +1345,6 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/rnnoise' \ 'third_party/ruy' \ 'third_party/s2cellid' \ - 'third_party/schema_org' \ 'third_party/securemessage' \ 'third_party/shell-encryption' \ 'third_party/simplejson' \ @@ -1378,6 +1381,7 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/wayland' \ 'third_party/web-animations-js' \ 'third_party/webdriver' \ + 'third_party/webgpu-cts' \ 'third_party/webrtc' \ 'third_party/webrtc/common_audio/third_party/ooura' \ 'third_party/webrtc/common_audio/third_party/spl_sqrt_floor' \ @@ -1943,6 +1947,10 @@ getent group chrome-remote-desktop >/dev/null || groupadd -r chrome-remote-deskt %lang(vi) %{chromium_path}/locales/vi.pak* %lang(zh_CN) %{chromium_path}/locales/zh-CN.pak* %lang(zh_TW) %{chromium_path}/locales/zh-TW.pak* +# These are psuedolocales, not real ones. +# So we just include them always. 
+%{chromium_path}/locales/ar-XB.pak* +%{chromium_path}/locales/en-XA.pak* %if %{build_headless} %files headless @@ -1999,6 +2007,9 @@ getent group chrome-remote-desktop >/dev/null || groupadd -r chrome-remote-deskt %changelog +* Tue Jun 1 2021 Tom Callaway - 91.0.4472.77-1 +- update to 91.0.4472.77 + * Tue May 18 2021 Tom Callaway - 90.0.4430.212-1 - update to 90.0.4430.212 diff --git a/clean_ffmpeg.sh b/clean_ffmpeg.sh index cf9c5f2..ac34ca2 100755 --- a/clean_ffmpeg.sh +++ b/clean_ffmpeg.sh @@ -127,6 +127,7 @@ header_files=" libavcodec/x86/inline_asm.h \ libavcodec/pixblockdsp.h \ libavcodec/pixels.h \ libavcodec/png.h \ + libavcodec/pngdsp.h \ libavcodec/put_bits.h \ libavcodec/qpeldsp.h \ libavcodec/ratecontrol.h \ @@ -297,7 +298,6 @@ mp3_files=" libavcodec/aarch64/aacpsdsp_init_aarch64.c \ libavcodec/sbrdsp.c \ libavcodec/sbrdsp_template.c \ libavcodec/sinewin.c \ - libavcodec/sinewin_fixed.c \ libavcodec/x86/dct_init.c \ libavcodec/x86/dct32.asm \ libavcodec/x86/imdct36.asm \ diff --git a/sources b/sources index 5fdb3e3..727f450 100644 --- a/sources +++ b/sources @@ -20,4 +20,4 @@ SHA512 (xcb-proto-1.14.tar.xz) = de66d568163b6da2be9d6c59984f3afa3acd119a7813786 SHA512 (depot_tools.git-master.tar.gz) = dc323888812b66cc92c53a24a8a58ccf9e2961be67aa21852bd091b8b49569071f06ae9104cb58950e6253ac3a29f0db0663e9f35ef2b1ea28696efb38b42708 SHA512 (NotoSansSymbols2-Regular.ttf) = 2644b42c3fdccfe12395f9b61553aced169a0f1dc09f5a0fd7898e9d0a372ee4422b6b1cdab3c86ecc91db437e9ae8a951e64e85edc3ac9e9fca428852dbb2ad SHA512 (NotoSansTibetan-Regular.ttf) = fb5a48fcaea80eebe7d692f6fcf00d59d47658a358d0ec8e046fc559873f88bd595b2da474d2826abd9e9305f3741c69058d867b1e6048f37fe7d71b5d3af36a -SHA512 (chromium-90.0.4430.212-clean.tar.xz) = 53c16fcb899ae5de73599a67c7652801b4779c9642c2dacc2f211e6c6accd455507594138e59dcbabe9f80493d78fd4d0d118a58284d9d62f149e549dbba8ccc +SHA512 (chromium-91.0.4472.77-clean.tar.xz) = 52e4daec5cbaaa91851d33c0699bb0529c2b84bf2d95937cd043914eaf7c75c9e2d512904038acd367888bc465dfe6e4217f2eb1670f2f9ee3cae1f2c2a57d0a