From 0ae19f4f97707e0fc120d17cfc7c0d3979928b36 Mon Sep 17 00:00:00 2001 From: Tom spot Callaway Date: Wed, 2 Jun 2021 16:11:50 -0400 Subject: [PATCH] 91.0.4472.77 --- chromium-91-libyuv-aarch64.patch | 91 + chromium-91-pcscan-vector-types.patch | 47 + ....0.4472.77-aarch64-cxxflags-addition.patch | 14 + ...72.77-disable-fontconfig-cache-magic.patch | 13 + ...-91.0.4472.77-initial_prefs-etc-path.patch | 15 + ...-java-only-allowed-in-android-builds.patch | 13 + ...m-91.0.4472.77-update-highway-0.12.2.patch | 12203 ++++++++++++++++ chromium.spec | 57 +- clean_ffmpeg.sh | 2 +- sources | 2 +- 10 files changed, 12432 insertions(+), 25 deletions(-) create mode 100644 chromium-91-libyuv-aarch64.patch create mode 100644 chromium-91-pcscan-vector-types.patch create mode 100644 chromium-91.0.4472.77-aarch64-cxxflags-addition.patch create mode 100644 chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch create mode 100644 chromium-91.0.4472.77-initial_prefs-etc-path.patch create mode 100644 chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch create mode 100644 chromium-91.0.4472.77-update-highway-0.12.2.patch diff --git a/chromium-91-libyuv-aarch64.patch b/chromium-91-libyuv-aarch64.patch new file mode 100644 index 0000000..77b8f4c --- /dev/null +++ b/chromium-91-libyuv-aarch64.patch @@ -0,0 +1,91 @@ +diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc +index 350c964..2aab413 100644 +--- a/third_party/libyuv/source/row_neon64.cc ++++ b/third_party/libyuv/source/row_neon64.cc +@@ -1835,7 +1835,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 +- : "m"(kShuffleARGBToABGR) // %3 ++ : "Q"(kShuffleARGBToABGR) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + } + +@@ -1859,7 +1859,7 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 +- : "m"(kShuffleAR64ToARGB) // %3 ++ : "Q"(kShuffleAR64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + } + +@@ -1883,7 +1883,7 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 +- : "m"(kShuffleAB64ToARGB) // %3 ++ : "Q"(kShuffleAB64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + } + +diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc +index 8656fec..9f9636e 100644 +--- a/third_party/libyuv/source/scale_neon64.cc ++++ b/third_party/libyuv/source/scale_neon64.cc +@@ -601,8 +601,8 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) + "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) + +- "mov v0.8h, v4.8h \n" +- "mov v1.8h, v5.8h \n" ++ "mov v0.16b, v4.16b \n" ++ "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) +@@ -642,7 +642,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + +- "mov v2.8h, v0.8h \n" ++ "mov v2.16b, v0.16b \n" + "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd) + "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even) + +@@ -679,7 +679,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // 
prefetch 7 lines ahead + +- "mov v0.8h, v2.8h \n" ++ "mov v0.16b, v2.16b \n" + "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd) + "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even) + +@@ -687,12 +687,12 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + +- "mov v0.8h, v4.8h \n" ++ "mov v0.16b, v4.16b \n" + "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd) + "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even) + +- "mov v0.8h, v4.8h \n" +- "mov v1.8h, v5.8h \n" ++ "mov v0.16b, v4.16b \n" ++ "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd) +@@ -887,8 +887,8 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) + "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) + +- "mov v0.8h, v4.8h \n" +- "mov v1.8h, v5.8h \n" ++ "mov v0.16b, v4.16b \n" ++ "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) diff --git a/chromium-91-pcscan-vector-types.patch b/chromium-91-pcscan-vector-types.patch new file mode 100644 index 0000000..33fc89e --- /dev/null +++ b/chromium-91-pcscan-vector-types.patch @@ -0,0 +1,47 @@ +From 429e6f78a88473208e96689afa2f6e91f07a4f8c Mon Sep 17 00:00:00 2001 +From: Stephan Hartmann +Date: Sat, 10 Apr 2021 17:02:49 +0000 +Subject: [PATCH] GCC: fix vector types in pcscan + + * _mm_cmpeq_epi64 result is __m128i + * maybe_ptrs is __m128i already and doesn't require cast + +Bug: 819294 +Change-Id: I3f8c6cc327191827838e80aea1431ac09315fe88 +Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2817544 +Reviewed-by: Anton Bikineev +Commit-Queue: Stephan Hartmann +Cr-Commit-Position: refs/heads/master@{#871265} +--- + +diff --git a/base/allocator/partition_allocator/starscan/pcscan.cc b/base/allocator/partition_allocator/starscan/pcscan.cc +index c7854ff..d5c0aea 100644 +--- a/base/allocator/partition_allocator/starscan/pcscan.cc ++++ b/base/allocator/partition_allocator/starscan/pcscan.cc +@@ -1143,7 +1143,7 @@ + const __m128i maybe_ptrs = + _mm_loadu_si128(reinterpret_cast<__m128i*>(payload)); + const __m128i vand = _mm_and_si128(maybe_ptrs, cage_mask); +- const __m128d vcmp = _mm_cmpeq_epi64(vand, vbase); ++ const __m128i vcmp = _mm_cmpeq_epi64(vand, vbase); + const int mask = _mm_movemask_pd(_mm_castsi128_pd(vcmp)); + if (LIKELY(!mask)) + continue; +@@ -1153,15 +1153,14 @@ + if (mask & 0b01) { + quarantine_size += + pcscan_task_.TryMarkObjectInNormalBuckets( +- _mm_cvtsi128_si64(_mm_castpd_si128(maybe_ptrs))); ++ _mm_cvtsi128_si64(maybe_ptrs)); + } + if (mask & 0b10) { + // Extraction intrinsics for qwords are only supported in SSE4.1, so + // instead we reshuffle dwords with pshufd. The mask is used to move the + // 4th and 3rd dwords into the second and first position. 
+ static constexpr int kSecondWordMask = (3 << 2) | (2 << 0); +- const __m128i shuffled = +- _mm_shuffle_epi32(_mm_castpd_si128(maybe_ptrs), kSecondWordMask); ++ const __m128i shuffled = _mm_shuffle_epi32(maybe_ptrs, kSecondWordMask); + quarantine_size += + pcscan_task_.TryMarkObjectInNormalBuckets( + _mm_cvtsi128_si64(shuffled)); diff --git a/chromium-91.0.4472.77-aarch64-cxxflags-addition.patch b/chromium-91.0.4472.77-aarch64-cxxflags-addition.patch new file mode 100644 index 0000000..249adf4 --- /dev/null +++ b/chromium-91.0.4472.77-aarch64-cxxflags-addition.patch @@ -0,0 +1,14 @@ +diff -up chromium-91.0.4472.77/build/config/compiler/BUILD.gn.aarch-cxxflags chromium-91.0.4472.77/build/config/compiler/BUILD.gn +--- chromium-91.0.4472.77/build/config/compiler/BUILD.gn.aarch-cxxflags 2021-06-02 12:58:21.998750145 -0400 ++++ chromium-91.0.4472.77/build/config/compiler/BUILD.gn 2021-06-02 12:59:29.762092189 -0400 +@@ -1511,6 +1511,10 @@ config("default_warnings") { + cflags += [ "-Wno-psabi" ] + } + ++ if (current_cpu == "arm" && !is_clang) { ++ cflags_cc += [ "-flax-vector-conversions" ] ++ } ++ + if (!is_clang) { + cflags_cc += [ + # See comment for -Wno-c++11-narrowing. diff --git a/chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch b/chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch new file mode 100644 index 0000000..f3f362c --- /dev/null +++ b/chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch @@ -0,0 +1,13 @@ +diff -up chromium-91.0.4472.77/base/test/BUILD.gn.nofontconfigcache chromium-91.0.4472.77/base/test/BUILD.gn +--- chromium-91.0.4472.77/base/test/BUILD.gn.nofontconfigcache 2021-06-01 16:41:40.094756454 -0400 ++++ chromium-91.0.4472.77/base/test/BUILD.gn 2021-06-01 16:42:47.736100516 -0400 +@@ -198,9 +198,6 @@ static_library("test_support") { + sources += [ "test_file_util_linux.cc" ] + public_deps += [ ":fontconfig_util_linux" ] + data_deps += [ "//third_party/test_fonts" ] +- if (current_toolchain == host_toolchain) { +- data_deps += [ ":do_generate_fontconfig_caches" ] +- } + } + + if (is_mac) { diff --git a/chromium-91.0.4472.77-initial_prefs-etc-path.patch b/chromium-91.0.4472.77-initial_prefs-etc-path.patch new file mode 100644 index 0000000..96825e2 --- /dev/null +++ b/chromium-91.0.4472.77-initial_prefs-etc-path.patch @@ -0,0 +1,15 @@ +diff -up chromium-91.0.4472.77/chrome/browser/first_run/first_run_internal_linux.cc.etc chromium-91.0.4472.77/chrome/browser/first_run/first_run_internal_linux.cc +--- chromium-91.0.4472.77/chrome/browser/first_run/first_run_internal_linux.cc.etc 2021-06-01 16:37:39.182531036 -0400 ++++ chromium-91.0.4472.77/chrome/browser/first_run/first_run_internal_linux.cc 2021-06-01 16:39:31.590102809 -0400 +@@ -20,9 +20,9 @@ bool IsOrganicFirstRun() { + + base::FilePath InitialPrefsPath() { + // The standard location of the initial prefs is next to the chrome binary. 
++ // ...but we patch it to use /etc/chromium + base::FilePath initial_prefs; +- if (!base::PathService::Get(base::DIR_EXE, &initial_prefs)) +- return base::FilePath(); ++ initial_prefs = base::FilePath("/etc/chromium"); + + base::FilePath new_path = initial_prefs.AppendASCII(installer::kInitialPrefs); + if (base::PathIsReadable(new_path)) diff --git a/chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch b/chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch new file mode 100644 index 0000000..a4748d9 --- /dev/null +++ b/chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch @@ -0,0 +1,13 @@ +diff -up chromium-91.0.4472.77/third_party/closure_compiler/compiler.py.java-allowed chromium-91.0.4472.77/third_party/closure_compiler/compiler.py +--- chromium-91.0.4472.77/third_party/closure_compiler/compiler.py.java-allowed 2021-06-02 17:14:48.445064647 +0000 ++++ chromium-91.0.4472.77/third_party/closure_compiler/compiler.py 2021-06-02 17:15:12.994836949 +0000 +@@ -13,8 +13,7 @@ import subprocess + + + _CURRENT_DIR = os.path.join(os.path.dirname(__file__)) +-_JAVA_PATH = os.path.join(_CURRENT_DIR, "..", "jdk", "current", "bin", "java") +-assert os.path.isfile(_JAVA_PATH), "java only allowed in android builds" ++_JAVA_PATH = "java" + + class Compiler(object): + """Runs the Closure compiler on given source files to typecheck them diff --git a/chromium-91.0.4472.77-update-highway-0.12.2.patch b/chromium-91.0.4472.77-update-highway-0.12.2.patch new file mode 100644 index 0000000..7b53aac --- /dev/null +++ b/chromium-91.0.4472.77-update-highway-0.12.2.patch @@ -0,0 +1,12203 @@ +diff -up chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time +diff -up chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10.12 chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10 +diff -up chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6.12 chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6 +diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt +--- chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 2021-06-02 10:56:05.305904746 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt 2021-05-31 10:37:11.000000000 -0400 +@@ -19,7 +19,7 @@ if(POLICY CMP0083) + cmake_policy(SET CMP0083 NEW) + endif() + +-project(hwy VERSION 0.1) ++project(hwy VERSION 0.12.2) # Keep in sync with highway.h version + + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_EXTENSIONS OFF) +@@ -40,6 +40,8 @@ if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) + endif() + ++set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?") ++ + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + "int main() { +@@ -51,10 +53,13 @@ check_cxx_source_compiles( + HWY_EMSCRIPTEN + ) + ++set(HWY_CONTRIB_SOURCES ++ hwy/contrib/image/image.cc ++ hwy/contrib/image/image.h ++ hwy/contrib/math/math-inl.h ++) ++ + set(HWY_SOURCES +- contrib/image/image.cc +- contrib/image/image.h +- contrib/math/math-inl.h + hwy/aligned_allocator.cc + hwy/aligned_allocator.h + hwy/base.h +@@ -64,6 +69,7 @@ set(HWY_SOURCES + hwy/nanobenchmark.cc + hwy/nanobenchmark.h + hwy/ops/arm_neon-inl.h ++ 
hwy/ops/arm_sve-inl.h + hwy/ops/scalar-inl.h + hwy/ops/set_macros-inl.h + hwy/ops/shared-inl.h +@@ -146,13 +152,28 @@ else() + -fno-exceptions + ) + endif() +-endif() ++ ++ if (HWY_CMAKE_ARM7) ++ list(APPEND HWY_FLAGS ++ -march=armv7-a ++ -mfpu=neon-vfpv4 ++ -mfloat-abi=hard # must match the toolchain specified as CXX= ++ -mfp16-format=ieee # required for vcvt_f32_f16 ++ ) ++ endif() # HWY_CMAKE_ARM7 ++ ++endif() # !MSVC + + add_library(hwy STATIC ${HWY_SOURCES}) + target_compile_options(hwy PRIVATE ${HWY_FLAGS}) + set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON) + target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR}) + ++add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES}) ++target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS}) ++set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON) ++target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR}) ++ + # -------------------------------------------------------- install library + install(TARGETS hwy + DESTINATION "${CMAKE_INSTALL_LIBDIR}") +@@ -166,9 +187,21 @@ foreach (source ${HWY_SOURCES}) + endif() + endforeach() + +-# Add a pkg-config file for libhwy and the test library. ++install(TARGETS hwy_contrib ++ DESTINATION "${CMAKE_INSTALL_LIBDIR}") ++# Install all the headers keeping the relative path to the current directory ++# when installing them. ++foreach (source ${HWY_CONTRIB_SOURCES}) ++ if ("${source}" MATCHES "\.h$") ++ get_filename_component(dirname "${source}" DIRECTORY) ++ install(FILES "${source}" ++ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}") ++ endif() ++endforeach() ++ ++# Add a pkg-config file for libhwy and the contrib/test libraries. + set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}") +-foreach (pc libhwy.pc libhwy-test.pc) ++foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") +@@ -193,34 +226,13 @@ add_custom_command(TARGET hwy POST_BUILD + # Avoids mismatch between GTest's static CRT and our dynamic. + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +-add_executable(skeleton hwy/examples/skeleton_main.cc) +-target_sources(skeleton PRIVATE +- hwy/examples/skeleton-inl.h +- hwy/examples/skeleton.cc +- hwy/examples/skeleton.h +- hwy/examples/skeleton_shared.h) +-# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to +-# observe the difference in targets printed. +-target_compile_options(skeleton PRIVATE ${HWY_FLAGS}) +-target_link_libraries(skeleton hwy) +-set_target_properties(skeleton +- PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/") +- +-# Similar: shared headers but without the runtime dispatch in skeleton.cc/h +-add_executable(skeleton_static hwy/examples/skeleton_static_main.cc) +-target_sources(skeleton_static PRIVATE +- hwy/examples/skeleton-inl.h +- hwy/examples/skeleton_shared.h) +-target_compile_options(skeleton_static PRIVATE ${HWY_FLAGS}) +-target_link_libraries(skeleton_static hwy) +-set_target_properties(skeleton_static +- PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/") +- + # Programming exercise with integrated benchmark + add_executable(hwy_benchmark hwy/examples/benchmark.cc) + target_sources(hwy_benchmark PRIVATE + hwy/nanobenchmark.cc + hwy/nanobenchmark.h) ++# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to ++# observe the difference in targets printed. 
+ target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS}) + target_link_libraries(hwy_benchmark hwy) + set_target_properties(hwy_benchmark +@@ -272,19 +284,21 @@ endif() + endif() # HWY_SYSTEM_GTEST + + set(HWY_TEST_FILES +- contrib/image/image_test.cc +- # contrib/math/math_test.cc ++ hwy/contrib/image/image_test.cc ++ # hwy/contrib/math/math_test.cc ++ hwy/aligned_allocator_test.cc ++ hwy/base_test.cc ++ hwy/highway_test.cc ++ hwy/targets_test.cc + hwy/examples/skeleton_test.cc + hwy/tests/arithmetic_test.cc + hwy/tests/combine_test.cc + hwy/tests/compare_test.cc + hwy/tests/convert_test.cc +- hwy/tests/hwy_test.cc + hwy/tests/logical_test.cc + hwy/tests/memory_test.cc + hwy/tests/swizzle_test.cc +- hwy/aligned_allocator_test.cc +- hwy/targets_test.cc ++ hwy/tests/test_util_test.cc + ) + + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests) +@@ -293,11 +307,16 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILE + get_filename_component(TESTNAME ${TESTFILE} NAME_WE) + add_executable(${TESTNAME} ${TESTFILE}) + target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS}) ++ # Test all targets, not just the best/baseline. This changes the default ++ # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can ++ # cause compile errors because only one may be set, and other CMakeLists.txt ++ # that include us may set them. ++ target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1) + + if(HWY_SYSTEM_GTEST) +- target_link_libraries(${TESTNAME} hwy GTest::GTest GTest::Main) ++ target_link_libraries(${TESTNAME} hwy hwy_contrib GTest::GTest GTest::Main) + else() +- target_link_libraries(${TESTNAME} hwy gtest gtest_main) ++ target_link_libraries(${TESTNAME} hwy hwy_contrib gtest gtest_main) + endif() + # Output test targets in the test directory. 
+ set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/") +diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE +diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in +diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING +diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelog +--- chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 2021-06-02 10:56:05.151903967 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/debian/changelog 2021-05-31 10:37:11.000000000 -0400 +@@ -1,3 +1,26 @@ ++highway (0.12.2-1) UNRELEASED; urgency=medium ++ ++ * fix scalar-only test and Windows macro conflict with Load/StoreFence ++ * replace deprecated wasm intrinsics ++ ++ -- Jan Wassenberg Mon, 31 May 2021 16:00:00 +0200 ++ ++highway (0.12.1-1) UNRELEASED; urgency=medium ++ ++ * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors ++ * fix warnings, faster ARM div/sqrt, separate hwy_contrib library ++ * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC ++ ++ -- Jan Wassenberg Wed, 19 May 2021 15:00:00 +0200 ++ ++highway (0.12.0-1) UNRELEASED; urgency=medium ++ ++ * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4 ++ * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES ++ * Proper IEEE rounding, reduce libstdc++ usage, inlined math ++ ++ -- Jan Wassenberg Thu, 15 Apr 2021 20:00:00 +0200 ++ + highway (0.11.1-1) UNRELEASED; 
urgency=medium + + * Fix clang7 asan error, finish f16 conversions and add test +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelogE.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelogE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compat.12 chromium-91.0.4472.77/third_party/highway/src/debian/compat +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/compatE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/control.12 chromium-91.0.4472.77/third_party/highway/src/debian/control +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/controlE.12 chromium-91.0.4472.77/third_party/highway/src/debian/controlE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyright.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyright +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rules.12 chromium-91.0.4472.77/third_party/highway/src/debian/rules +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rulesE.12 chromium-91.0.4472.77/third_party/highway/src/debian/rulesE +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/format.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/format +diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf +Binary files chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 and chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf differ +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE +diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md +--- chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12 2021-06-02 10:56:05.117903795 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md 2021-05-31 10:37:11.000000000 -0400 +@@ -33,6 +33,12 @@ The public headers are: + * hwy/cache_control.h: defines stand-alone functions to control caching (e.g. + prefetching) and memory barriers, independent of actual SIMD. + ++* hwy/nanobenchmark.h: library for precisely measuring elapsed time (under ++ varying inputs) for benchmarking small/medium regions of code. ++ ++* hwy/tests/test_util-inl.h: defines macros for invoking tests on all ++ available targets, plus per-target functions useful in tests (e.g. Print). 
++ + SIMD implementations must be preceded and followed by the following: + + ``` +@@ -61,76 +67,76 @@ HWY_AFTER_NAMESPACE(); + + ## Vector and descriptor types + +-Highway vectors consist of one or more 'lanes' of the same built-in type `T = +-uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `T = float##_t` for `## = 16, +-32, 64`. `float16_t` is an IEEE binary16 half-float and only supports load, +-store, and conversion to/from `float32_t`; infinity or NaN have +-implementation-defined results. +- +-Each vector has `N` lanes (a power of two, possibly unknown at compile time). +- +-Platforms such as x86 support multiple vector types, and other platforms require +-that vectors are built-in types. On RVV, vectors are sizeless and thus cannot be +-wrapped inside a class. The Highway API satisfies these constraints because it +-is designed around overloaded functions selected via a zero-sized tag parameter +-`d` of type `D = Simd`. These are typically constructed using aliases: +- +-* `const HWY_FULL(T[, LMUL=1]) d;` chooses an `N` that results in a native +- vector for the current target. For targets (e.g. RVV) that support register +- groups, the optional `LMUL` (1, 2, 4, 8) specifies the number of registers +- in the group. This effectively multiplies the lane count in each operation +- by `LMUL`. For mixed-precision code, `LMUL` must be at least the ratio of +- the sizes of the largest and smallest type. `LMUL > 1` is more efficient on +- single-issue machines, but larger values reduce the effective number of +- registers, which may cause the compiler to spill them to memory. ++Highway vectors consist of one or more 'lanes' of the same built-in type ++`uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `float##_t` for `## = 16, 32, ++64`. ++ ++In Highway, `float16_t` (an IEEE binary16 half-float) only supports load, store, ++and conversion to/from `float32_t`; the behavior of `float16_t` infinity and NaN ++are implementation-defined due to ARMv7. ++ ++On RVV, vectors are sizeless and cannot be wrapped inside a class. The Highway ++API allows using built-in types as vectors because operations are expressed as ++overloaded functions. Instead of constructors, overloaded initialization ++functions such as `Set` take a zero-sized tag argument called `d` of type `D = ++Simd` and return an actual vector of unspecified type. ++ ++`T` is one of the lane types above, and may be retrieved via `TFromD`. ++ ++`N` is target-dependent and not directly user-specified. The actual lane count ++may not be known at compile time, but can be obtained via `Lanes(d)`. Use this ++value, which is potentially different from `N`, to increment loop counters etc. ++It is typically a power of two, but that is not guaranteed e.g. on SVE. ++ ++`d` lvalues (a tag, NOT actual vector) are typically obtained using two aliases: ++ ++* Most common: pass `HWY_FULL(T[, LMUL=1]) d;` as an argument to return a ++ native vector. This is preferred because it fully utilizes vector lanes. ++ ++ For targets (e.g. RVV) that support register groups, the optional `LMUL` (1, ++ 2, 4, 8) specifies the number of registers in the group. This effectively ++ multiplies the lane count in each operation by `LMUL`. For mixed-precision ++ code, `LMUL` must be at least the ratio of the sizes of the largest and ++ smallest type. `LMUL > 1` is more efficient on single-issue machines, but ++ larger values reduce the effective number of registers, which may cause the ++ compiler to spill them to memory. 
++ ++* Less common: pass `HWY_CAPPED(T, N) d;` as an argument to return a vector ++ which may be native width, but no more than `N` lanes have observable ++ effects such as loading/storing to memory. This is less performance-portable ++ because it may not use all available lanes. Note that the resulting lane ++ count may also be less than `N`. ++ ++ For targets (e.g. RVV) that have compile-time-unknown lane counts, such ++ vectors incur additional runtime cost in `Load` etc. ++ ++User-specified lane counts or tuples of vectors could cause spills on targets ++with fewer or smaller vectors. By contrast, Highway encourages vector-length ++agnostic code, which is more performance-portable. ++ ++Given that lane counts are potentially compile-time-unknown, storage for vectors ++should be dynamically allocated, e.g. via `AllocateAligned(Lanes(d))`. For ++applications that require a compile-time estimate, `MaxLanes(d)` returns the `N` ++from `Simd`, which is NOT necessarily the actual lane count. This is ++DISCOURAGED because it is not guaranteed to be an upper bound (RVV vectors may ++be very large) and some compilers are not able to interpret it as constexpr. + +-* `const HWY_CAPPED(T, N) d;` for up to `N` lanes. +- +-For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), descriptors +-for the smaller types must be obtained from those of the larger type (e.g. via ++For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), tags for ++the smaller types must be obtained from those of the larger type (e.g. via + `Rebind`). + +-The type `T` may be accessed as `TFromD`. There are three possibilities for +-the template parameter `N`: +- +-1. Equal to the hardware vector width, e.g. when using `HWY_FULL(T)` on a +- target with compile-time constant vectors. ++## Using unspecified vector types + +-1. Less than the hardware vector width. This is the result of a compile-time +- decision by the user, i.e. using `HWY_CAPPED(T, N)` to limit the number of +- lanes, even when the hardware vector width could be greater. +- +-1. Unrelated to the hardware vector width, e.g. when the hardware vector width +- is not known at compile-time and may be very large. +- +-In all cases, `Lanes(d)` returns the actual number of lanes, i.e. the amount by +-which to advance loop counters. `MaxLanes(d)` returns the `N` from `Simd`, +-which is NOT necessarily the actual vector size (see above) and some compilers +-are not able to interpret it as constexpr. Instead of `MaxLanes`, prefer to use +-alternatives, e.g. `Rebind` or `aligned_allocator.h` for dynamic allocation of +-`Lanes(d)` elements. +- +-Highway is designed to map a vector variable to a (possibly partial) hardware +-register or register group. By discouraging user-specified `N` and tuples of +-vector variables, we improve performance portability (e.g. by reducing spills to +-memory for platforms that have smaller vectors than the developer expected). +- +-To construct vectors, call factory functions (see "Initialization" below) with +-a tag parameter `d`. +- +-Local variables typically use auto for type deduction. For some generic +-functions, a template argument `V` is sufficient: `template V Squared(V +-v) { return v * v; }`. In general, functions have a `D` template argument and +-can return vectors of type `Vec`. +- +-Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined +-functions reside in `project::[nested]::HWY_NAMESPACE`. 
Because all Highway +-functions generally take either a `Simd` or vector argument, which are also +-defined in namespace `hwy`, they will typically be found via Argument-Dependent +-Lookup and namespace qualifiers are not necessary. As an exception, Highway +-functions that are templates (e.g. because they require a compile-time argument +-such as a lane index or shift count) require a using-declaration such as +-`using hwy::HWY_NAMESPACE::ShiftLeft`. ++Because vector types are unspecified, local vector variables are typically ++defined using `auto` for type deduction. A template argument `V` suffices for ++simple generic functions: `template V Squared(V v) { return v * v; }`. ++ ++Many functions will need a `D` template argument in order to initialize any ++constants. They can use a separate `V` template argument for vectors, or use ++`Vec`, or where an lvalue `d` is available, `decltype(Zero(d))`. Using such ++aliases instead of auto may improve readability of mixed-type code. They can ++also be used for member variables, which are discouraged because compilers often ++have difficulty mapping them to registers. + + ## Operations + +@@ -141,6 +147,14 @@ unsigned, signed, and floating-point typ + bits per lane: 8, 16, 32, or 64. Any combination of the specified prefixes and + bits are allowed. Abbreviations of the form `u32 = {u}{32}` may also be used. + ++Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined ++functions reside in `project::[nested]::HWY_NAMESPACE`. Highway functions ++generally take either a `Simd` or vector/mask argument. For targets where ++vectors and masks are defined in namespace `hwy`, the functions will be found ++via Argument-Dependent Lookup. However, this does not work for function ++templates, and RVV and SVE both use builtin vectors. Thus we recommend a `using ++hwy::HWY_NAMESPACE;` directive inside `project::[nested]::HWY_NAMESPACE`. ++ + ### Initialization + + * V **Zero**(D): returns N-lane vector with all bits set to 0. +@@ -162,7 +176,7 @@ bits are allowed. Abbreviations of the f + * `V`: `{i,f}` \ + V **Neg**(V a): returns `-a[i]`. + +-* `V`: `{i}{8,16,32}, {f}` \ ++* `V`: `{i,f}` \ + V **Abs**(V a) returns the absolute value of `a[i]`; for + integers, `LimitsMin()` maps to `LimitsMax() + 1`. + +@@ -252,23 +266,24 @@ Left-shifting signed `T` and right-shift + shifting `MakeUnsigned` and casting to `T`. Right-shifting negative signed + `T` is the same as an unsigned shift, except that 1-bits are shifted in. + +-Compile-time constant shifts, generally the most efficient variant: ++Compile-time constant shifts, generally the most efficient variant (though 8-bit ++shifts are potentially slower than other lane sizes): + +-* `V`: `{u,i}{16,32,64}` \ ++* `V`: `{u,i}` \ + V **ShiftLeft**<int>(V a) returns `a[i] << int`. + +-* `V`: `{u,i}{16,32,64}` \ ++* `V`: `{u,i}` \ + V **ShiftRight**<int>(V a) returns `a[i] >> int`. + + Shift all lanes by the same (not necessarily compile-time constant) amount: + +-* `V`: `{u,i}{16,32,64}` \ ++* `V`: `{u,i}` \ + V **ShiftLeftSame**(V a, int bits) returns `a[i] << bits`. + +-* `V`: `{u,i}{16,32,64}` \ ++* `V`: `{u,i}` \ + V **ShiftRightSame**(V a, int bits) returns `a[i] >> bits`. + +-Per-lane variable shifts (slow if SSE4, or Shr i64 on AVX2): ++Per-lane variable shifts (slow if SSE4, or 16-bit, or Shr i64 on AVX2): + + * `V`: `{u,i}{16,32,64}` \ + V **operator<<**(V a, V b) returns `a[i] << b[i]`. 
+@@ -332,12 +347,17 @@ Special functions for signed types: + slightly more efficient; requires the first argument to be non-negative. + + * `V`: `i32/64` \ +- V **BroadcastSignBit(V a) returns `a[i] < 0 ? -1 : 0`. ++ V **BroadcastSignBit**(V a) returns `a[i] < 0 ? -1 : 0`. + + ### Masks + + Let `M` denote a mask capable of storing true/false for each lane. + ++* M **FirstN**(D, size_t N): returns mask with the first `N` ++ lanes (those with index `< N`) true. `N` larger than `Lanes(D())` result in ++ an all-true mask. Useful for implementing "masked" stores by loading `prev` ++ followed by `IfThenElse(FirstN(d, N), what_to_store, prev)`. ++ + * M1 **RebindMask**(D, M2 m): returns same mask bits as `m`, but + reinterpreted as a mask for lanes of type `TFromD`. `M1` and `M2` must + have the same number of lanes. +@@ -389,17 +409,18 @@ Let `M` denote a mask capable of storing + * size_t **CountTrue**(M m): returns how many of `m[i]` are true + [0, N]. This is typically more expensive than AllTrue/False. + +-* `V`: `{u,i,f}{32,64}` \ ++* `V`: `{u,i,f}{16,32,64}` \ + V **Compress**(V v, M m): returns `r` such that `r[n]` is + `v[i]`, with `i` the n-th lane index (starting from 0) where `m[i]` is true. + Compacts lanes whose mask is set into the lower lanes; upper lanes are +- implementation-defined. ++ implementation-defined. Slow with 16-bit lanes. + +-* `V`: `{u,i,f}{32,64}` \ ++* `V`: `{u,i,f}{16,32,64}` \ + size_t **CompressStore**(V v, M m, D, T* aligned): writes lanes + whose mask is set into `aligned`, starting from lane 0. Returns + `CountTrue(m)`, the number of valid lanes. All subsequent lanes may be +- overwritten! Alignment ensures inactive lanes will not cause faults. ++ overwritten! Alignment ensures inactive lanes will not cause faults. Slow ++ with 16-bit lanes. + + ### Comparisons + +@@ -429,10 +450,16 @@ Memory operands are little-endian, other + lane configuration. Pointers are the addresses of `N` consecutive `T` values, + either naturally-aligned (`aligned`) or possibly unaligned (`p`). + ++**Note**: computations with low arithmetic intensity (FLOP/s per memory traffic ++bytes), e.g. dot product, can be *1.5 times as fast* when the memory operands ++are naturally aligned. An unaligned access may require two load ports. ++ + #### Load + + * Vec<D> **Load**(D, const T* aligned): returns +- `aligned[i]`. ++ `aligned[i]`. May fault if the pointer is not aligned to the vector size. ++ Using this whenever possible improves codegen on SSE4: unlike `LoadU`, ++ `Load` can be fused into a memory operand, which reduces register pressure. + * Vec<D> **LoadU**(D, const T* p): returns `p[i]`. + + * Vec<D> **LoadDup128**(D, const T* p): returns one 128-bit +@@ -440,19 +467,31 @@ either naturally-aligned (`aligned`) or + be faster than broadcasting single values, and is more convenient than + preparing constants for the actual vector length. + +-#### Gather ++#### Scatter/Gather + +-**Note**: Vectors must be `HWY_CAPPED(T, HWY_GATHER_LANES(T))`: ++**Note**: Offsets/indices are of type `VI = Vec>` and need not ++be unique. The results are implementation-defined if any are negative. + +-* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \ +- Vec<D> **GatherOffset**(D, const T* base, VI offsets). +- Returns elements of base selected by possibly repeated *byte* `offsets[i]`. +- Results are implementation-defined if `offsets[i]` is negative. +- +-* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \ +- Vec<D> **GatherIndex**(D, const T* base, VI indices). 
+- Returns vector of `base[indices[i]]`. Indices need not be unique, but +- results are implementation-defined if they are negative. ++**Note**: Where possible, applications should `Load/Store/TableLookup*` entire ++vectors, which is much faster than `Scatter/Gather`. Otherwise, code of the form ++`dst[tbl[i]] = F(src[i])` should when possible be transformed to `dst[i] = ++F(src[tbl[i]])` because `Scatter` is more expensive than `Gather`. ++ ++* `D`: `{u,i,f}{32,64}` \ ++ void **ScatterOffset**(Vec<D> v, D, const T* base, VI ++ offsets): stores `v[i]` to the base address plus *byte* `offsets[i]`. ++ ++* `D`: `{u,i,f}{32,64}` \ ++ void **ScatterIndex**(Vec<D> v, D, const T* base, VI ++ indices): stores `v[i]` to `base[indices[i]]`. ++ ++* `D`: `{u,i,f}{32,64}` \ ++ Vec<D> **GatherOffset**(D, const T* base, VI offsets): ++ returns elements of base selected by *byte* `offsets[i]`. ++ ++* `D`: `{u,i,f}{32,64}` \ ++ Vec<D> **GatherIndex**(D, const T* base, VI indices): ++ returns vector of `base[indices[i]]`. + + #### Store + +@@ -462,6 +501,17 @@ either naturally-aligned (`aligned`) or + * void **StoreU**(Vec<D> a, D, T* p): as Store, but without + the alignment requirement. + ++* `D`: `u8` \ ++ void **StoreInterleaved3**(Vec<D> v0, Vec<D> v1, ++ Vec<D> v2, D, T* p): equivalent to shuffling `v0, v1, v2` ++ followed by three `StoreU()`, such that `p[0] == v0[0], p[1] == v1[0], ++ p[2] == v1[0]`. Useful for RGB samples. ++ ++* `D`: `u8` \ ++ void **StoreInterleaved4**(Vec<D> v0, Vec<D> v1, ++ Vec<D> v2, Vec<D> v3, D, T* p): as above, but for four ++ vectors (e.g. RGBA samples). ++ + ### Cache control + + All functions except Stream are defined in cache_control.h. +@@ -483,6 +533,9 @@ All functions except Stream are defined + * void **Prefetch**(const T* p): begins loading the cache line + containing "p". + ++* void **Pause**(): when called inside a spin-loop, may reduce ++ power consumption. ++ + ### Type conversion + + * Vec<D> **BitCast**(D, V): returns the bits of `V` +@@ -525,7 +578,8 @@ if the input exceeds the destination ran + zero and converts the value to same-sized integer. + + * `V`: `f32`; `Ret`: `i32` \ +- Ret **NearestInt**(V a): returns the integer nearest to `a[i]`. ++ Ret **NearestInt**(V a): returns the integer nearest to `a[i]`; ++ results are undefined for NaN. + + ### Swizzle + +@@ -652,9 +706,9 @@ more expensive on AVX2/AVX-512 than with + + ### Reductions + +-**Note**: the following are only available for full vectors (including scalar). +-These 'reduce' all lanes to a single result. This result is broadcasted to all +-lanes at no extra cost; you can use `GetLane` to obtain the value. ++**Note**: these 'reduce' all lanes to a single result (e.g. sum), which is ++broadcasted to all lanes at no extra cost. To obtain a scalar, you can call ++`GetLane`. + + Being a horizontal operation (across lanes of the same vector), these are slower + than normal SIMD operations and are typically used outside critical loops. +@@ -697,9 +751,6 @@ generate such instructions (implying the + finally reverts to `HWY_STATIC_TARGET`. Can be used in `#if` expressions to + provide an alternative to functions which are not supported by HWY_SCALAR. + +-* `HWY_LANES(T)`: how many lanes of type `T` in a full vector (>= 1). Used by +- HWY_FULL/CAPPED. Note: cannot be used in #if because it uses sizeof. +- + * `HWY_IDE` is 0 except when parsed by IDEs; adding it to conditions such as + `#if HWY_TARGET != HWY_SCALAR || HWY_IDE` avoids code appearing greyed out. 
+ +@@ -707,26 +758,15 @@ The following signal capabilities and ex + + * `HWY_CAP_INTEGER64`: support for 64-bit signed/unsigned integer lanes. + * `HWY_CAP_FLOAT64`: support for double-precision floating-point lanes. ++ ++The following were used to signal the maximum number of lanes for certain ++operations, but this is no longer necessary (nor possible on SVE/RVV), so they ++are DEPRECATED: ++ ++* `HWY_GATHER_LANES(T)`. + * `HWY_CAP_GE256`: the current target supports vectors of >= 256 bits. + * `HWY_CAP_GE512`: the current target supports vectors of >= 512 bits. + +-The following indicate the maximum number of lanes for certain operations. For +-targets that support the feature/operation, the macro evaluates to +-`HWY_LANES(T)`, otherwise 1. Using `HWY_CAPPED(T, HWY_GATHER_LANES(T))` +-generates the best possible code (or scalar fallback) from the same source code. +- +-* `HWY_GATHER_LANES(T)`: supports GatherIndex/Offset. +-* `HWY_VARIABLE_SHIFT_LANES(T)`: supports per-lane shift amounts (v1 << v2). +- DEPRECATED, this always matches HWY_LANES(T) and will be removed. +- +-As above, but the feature implies the type so there is no T parameter, thus +-these can be used in `#if` expressions. +- +-* `HWY_COMPARE64_LANES`: 64-bit signed integer comparisons. DEPRECATED, this +- always matches HWY_LANES(int64_t) and will be removed. +-* `HWY_MINMAX64_LANES`: 64-bit signed/unsigned integer min/max. DEPRECATED, +- this always matches HWY_LANES(int64_t) and will be removed. +- + ## Detecting supported targets + + `SupportedTargets()` returns a cached (initialized on-demand) bitfield of the +@@ -778,8 +818,10 @@ policy for selecting `HWY_TARGETS`: + and permitted by the compiler, independently of autovectorization), which + maximizes coverage in tests. + +-If none are defined, the default is to select all attainable targets except any +-non-best baseline (typically `HWY_SCALAR`), which reduces code size. ++If none are defined, but `HWY_IS_TEST` is defined, the default is ++`HWY_COMPILE_ALL_ATTAINABLE`. Otherwise, the default is to select all attainable ++targets except any non-best baseline (typically `HWY_SCALAR`), which reduces ++code size. + + ## Compiler support + +@@ -787,7 +829,8 @@ Clang and GCC require e.g. -mavx2 flags + However, this enables AVX2 instructions in the entire translation unit, which + may violate the one-definition rule and cause crashes. Instead, we use + target-specific attributes introduced via #pragma. Function using SIMD must +-reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. ++reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. Alternatively, ++individual functions or lambdas may be prefixed with `HWY_ATTR`. + + Immediates (compile-time constants) are specified as template arguments to avoid + constant-propagation issues with Clang on ARM. 
+diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 2021-06-02 10:56:05.278904609 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h 2021-05-31 10:37:11.000000000 -0400 +@@ -111,6 +111,32 @@ AlignedUniquePtr MakeUniqueAligned(Ar + new (ptr) T(std::forward(args)...), AlignedDeleter()); + } + ++// Helpers for array allocators (avoids overflow) ++namespace detail { ++ ++// Returns x such that 1u << x == n (if n is a power of two). ++static inline constexpr size_t ShiftCount(size_t n) { ++ return (n <= 1) ? 0 : 1 + ShiftCount(n / 2); ++} ++ ++template ++T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) { ++ constexpr size_t size = sizeof(T); ++ ++ constexpr bool is_pow2 = (size & (size - 1)) == 0; ++ constexpr size_t bits = ShiftCount(size); ++ static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect"); ++ ++ const size_t bytes = is_pow2 ? items << bits : items * size; ++ const size_t check = is_pow2 ? bytes >> bits : bytes / size; ++ if (check != items) { ++ return nullptr; // overflowed ++ } ++ return static_cast(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr)); ++} ++ ++} // namespace detail ++ + // Aligned memory equivalent of make_unique for array types using the + // custom allocators alloc/free. This function calls the constructor with the + // passed Args... on every created item. The destructor of each element will be +@@ -118,10 +144,11 @@ AlignedUniquePtr MakeUniqueAligned(Ar + template + AlignedUniquePtr MakeUniqueAlignedArrayWithAlloc( + size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... 
args) { +- T* ptr = +- static_cast(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)); +- for (size_t i = 0; i < items; i++) { +- new (ptr + i) T(std::forward(args)...); ++ T* ptr = detail::AllocateAlignedItems(items, alloc, opaque); ++ if (ptr != nullptr) { ++ for (size_t i = 0; i < items; i++) { ++ new (ptr + i) T(std::forward(args)...); ++ } + } + return AlignedUniquePtr(ptr, AlignedDeleter(free, opaque)); + } +@@ -165,7 +192,7 @@ template + AlignedFreeUniquePtr AllocateAligned(const size_t items, AllocPtr alloc, + FreePtr free, void* opaque) { + return AlignedFreeUniquePtr( +- static_cast(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)), ++ detail::AllocateAlignedItems(items, alloc, opaque), + AlignedFreer(free, opaque)); + } + +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 2021-06-02 10:56:05.273904584 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -16,6 +16,7 @@ + + #include + ++#include + #include + #include + #include +@@ -87,13 +88,39 @@ TEST(AlignedAllocatorTest, FreeNullptr) + /*opaque_ptr=*/nullptr); + } + ++TEST(AlignedAllocatorTest, Log2) { ++ EXPECT_EQ(0u, detail::ShiftCount(1)); ++ EXPECT_EQ(1u, detail::ShiftCount(2)); ++ EXPECT_EQ(3u, detail::ShiftCount(8)); ++} ++ ++// Allocator returns null when it detects overflow of items * sizeof(T). ++TEST(AlignedAllocatorTest, Overflow) { ++ constexpr size_t max = ~size_t(0); ++ constexpr size_t msb = (max >> 1) + 1; ++ using Size5 = std::array; ++ using Size10 = std::array; ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(max / 2, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(max / 3, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(max / 4, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(msb, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(msb + 1, nullptr, nullptr)); ++ EXPECT_EQ(nullptr, ++ detail::AllocateAlignedItems(msb / 4, nullptr, nullptr)); ++} ++ + TEST(AlignedAllocatorTest, AllocDefaultPointers) { + const size_t kSize = 7777; + void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr, + /*opaque_ptr=*/nullptr); + ASSERT_NE(nullptr, ptr); + // Make sure the pointer is actually aligned. +- EXPECT_EQ(0, reinterpret_cast(ptr) % kMaxVectorSize); ++ EXPECT_EQ(0U, reinterpret_cast(ptr) % kMaxVectorSize); + char* p = static_cast(ptr); + size_t ret = 0; + for (size_t i = 0; i < kSize; i++) { +@@ -101,7 +128,7 @@ TEST(AlignedAllocatorTest, AllocDefaultP + p[i] = static_cast(i & 0x7F); + if (i) ret += p[i] * p[i - 1]; + } +- EXPECT_NE(0, ret); ++ EXPECT_NE(0U, ret); + FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr); + } + +@@ -123,11 +150,11 @@ TEST(AlignedAllocatorTest, CustomAlloc) + AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc); + ASSERT_NE(nullptr, ptr); + // We should have only requested one alloc from the allocator. +- EXPECT_EQ(1u, fake_alloc.PendingAllocs()); ++ EXPECT_EQ(1U, fake_alloc.PendingAllocs()); + // Make sure the pointer is actually aligned. 
+- EXPECT_EQ(0, reinterpret_cast(ptr) % kMaxVectorSize); ++ EXPECT_EQ(0U, reinterpret_cast(ptr) % kMaxVectorSize); + FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc); +- EXPECT_EQ(0u, fake_alloc.PendingAllocs()); ++ EXPECT_EQ(0U, fake_alloc.PendingAllocs()); + } + + TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) { +@@ -170,7 +197,7 @@ TEST(AlignedAllocatorTest, MakeUniqueAli + TEST(AlignedAllocatorTest, AllocSingleInt) { + auto ptr = AllocateAligned(1); + ASSERT_NE(nullptr, ptr.get()); +- EXPECT_EQ(0, reinterpret_cast(ptr.get()) % kMaxVectorSize); ++ EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % kMaxVectorSize); + // Force delete of the unique_ptr now to check that it doesn't crash. + ptr.reset(nullptr); + EXPECT_EQ(nullptr, ptr.get()); +@@ -180,7 +207,7 @@ TEST(AlignedAllocatorTest, AllocMultiple + const size_t kSize = 7777; + auto ptr = AllocateAligned(kSize); + ASSERT_NE(nullptr, ptr.get()); +- EXPECT_EQ(0, reinterpret_cast(ptr.get()) % kMaxVectorSize); ++ EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % kMaxVectorSize); + // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the + // underlying type chosen by AllocateAligned() for the std::unique_ptr. + EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1])); +@@ -191,7 +218,7 @@ TEST(AlignedAllocatorTest, AllocMultiple + ptr[i] = static_cast(i); + if (i) ret += ptr[i] * ptr[i - 1]; + } +- EXPECT_NE(0, ret); ++ EXPECT_NE(0U, ret); + } + + TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) { +@@ -215,7 +242,8 @@ TEST(AlignedAllocatorTest, MakeUniqueAli + auto arr = MakeUniqueAlignedArrayWithAlloc>( + 7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc, + &counter); +- // An array shold still only call a single allocation. ++ ASSERT_NE(nullptr, arr.get()); ++ // An array should still only call a single allocation. + EXPECT_EQ(1u, fake_alloc.PendingAllocs()); + EXPECT_EQ(7, counter); + for (size_t i = 0; i < 7; i++) { +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 2021-06-02 10:56:05.266904549 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/base.h 2021-05-31 10:37:11.000000000 -0400 +@@ -34,7 +34,10 @@ + //------------------------------------------------------------------------------ + // Detect compiler using predefined macros + +-#ifdef _MSC_VER ++// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like ++// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that ++// purpose. 
++#if defined(_MSC_VER) && !defined(__clang__) + #define HWY_COMPILER_MSVC _MSC_VER + #else + #define HWY_COMPILER_MSVC 0 +@@ -200,6 +203,10 @@ + #define HWY_ARCH_X86_64 0 + #endif + ++#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64 ++#error "Cannot have both x86-32 and x86-64" ++#endif ++ + #if HWY_ARCH_X86_32 || HWY_ARCH_X86_64 + #define HWY_ARCH_X86 1 + #else +@@ -212,14 +219,29 @@ + #define HWY_ARCH_PPC 0 + #endif + +-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) ++#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64) ++#define HWY_ARCH_ARM_A64 1 ++#else ++#define HWY_ARCH_ARM_A64 0 ++#endif ++ ++#if defined(__arm__) || defined(_M_ARM) ++#define HWY_ARCH_ARM_V7 1 ++#else ++#define HWY_ARCH_ARM_V7 0 ++#endif ++ ++#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7 ++#error "Cannot have both A64 and V7" ++#endif ++ ++#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7 + #define HWY_ARCH_ARM 1 + #else + #define HWY_ARCH_ARM 0 + #endif + +-// There isn't yet a standard __wasm or __wasm__. +-#ifdef __EMSCRIPTEN__ ++#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__) + #define HWY_ARCH_WASM 1 + #else + #define HWY_ARCH_WASM 0 +@@ -231,9 +253,11 @@ + #define HWY_ARCH_RVV 0 + #endif + ++// It is an error to detect multiple architectures at the same time, but OK to ++// detect none of the above. + #if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \ +- HWY_ARCH_RVV) != 1 +-#error "Must detect exactly one platform" ++ HWY_ARCH_RVV) > 1 ++#error "Must not detect more than one architecture" + #endif + + //------------------------------------------------------------------------------ +@@ -308,13 +332,26 @@ static constexpr HWY_MAYBE_UNUSED size_t + // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name + // by concatenating base type and bits. + +-// RVV already has a builtin type. +-#if !HWY_ARCH_RVV ++// RVV already has a builtin type and the GCC intrinsics require it. ++#if HWY_ARCH_RVV && HWY_COMPILER_GCC ++#define HWY_NATIVE_FLOAT16 1 ++#else ++#define HWY_NATIVE_FLOAT16 0 ++#endif ++ ++#if HWY_NATIVE_FLOAT16 ++using float16_t = __fp16; ++// Clang does not allow __fp16 arguments, but scalar.h requires LaneType ++// arguments, so use a wrapper. ++// TODO(janwas): replace with _Float16 when that is supported? ++#else ++#pragma pack(push, 1) + struct float16_t { +- // __fp16 cannot be used as a function parameter in clang, so use a wrapper. + uint16_t bits; + }; ++#pragma pack(pop) + #endif ++ + using float32_t = float; + using float64_t = double; + +@@ -506,6 +543,13 @@ struct Relations { + using Narrow = int32_t; + }; + template <> ++struct Relations { ++ using Unsigned = uint16_t; ++ using Signed = int16_t; ++ using Float = float16_t; ++ using Wide = float; ++}; ++template <> + struct Relations { + using Unsigned = uint32_t; + using Signed = int32_t; +@@ -551,13 +595,13 @@ constexpr inline size_t RoundUpTo(size_t + + // Undefined results for x == 0. 
+ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) { +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + unsigned long index; // NOLINT + _BitScanForward(&index, x); + return index; +-#else ++#else // HWY_COMPILER_MSVC + return static_cast(__builtin_ctz(x)); +-#endif ++#endif // HWY_COMPILER_MSVC + } + + HWY_API size_t PopCount(uint64_t x) { +@@ -565,7 +609,7 @@ HWY_API size_t PopCount(uint64_t x) { + return static_cast(__builtin_popcountll(x)); + #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 + return _mm_popcnt_u64(x); +-#elif HWY_COMPILER_MSVC ++#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 + return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32)); + #else + x -= ((x >> 1) & 0x55555555U); +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 2021-06-02 10:56:05.280904620 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h 2021-05-31 10:37:11.000000000 -0400 +@@ -20,7 +20,9 @@ + + #include "hwy/base.h" + +-#ifndef __SSE2__ ++// Requires SSE2; fails to compile on 32-bit Clang 7 (see ++// https://github.com/gperftools/gperftools/issues/946). ++#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32) + #undef HWY_DISABLE_CACHE_CONTROL + #define HWY_DISABLE_CACHE_CONTROL + #endif +@@ -30,6 +32,14 @@ + #include // SSE2 + #endif + ++// Windows.h #defines these, which causes infinite recursion. Temporarily ++// undefine them in this header; these functions are anyway deprecated. ++// TODO(janwas): remove when these functions are removed. ++#pragma push_macro("LoadFence") ++#pragma push_macro("StoreFence") ++#undef LoadFence ++#undef StoreFence ++ + namespace hwy { + + // Even if N*sizeof(T) is smaller, Stream may write a multiple of this size. +@@ -81,6 +91,17 @@ HWY_INLINE HWY_ATTR_CACHE void FlushCach + #endif + } + ++// Reduces power consumption in spin-loops. No effect on non-x86. ++HWY_INLINE HWY_ATTR_CACHE void Pause() { ++#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) ++ _mm_pause(); ++#endif ++} ++ + } // namespace hwy + ++// TODO(janwas): remove when these functions are removed. (See above.) ++#pragma pop_macro("StoreFence") ++#pragma pop_macro("LoadFence") ++ + #endif // HIGHWAY_HWY_CACHE_CONTROL_H_ +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 2021-06-02 10:56:05.195904190 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -19,7 +19,6 @@ + #include + #include + +-#include + #include + #include // iota + +@@ -37,15 +36,15 @@ using hwy::HWY_NAMESPACE::CombineShiftRi + + class TwoArray { + public: +- // Passed to ctor as a value NOT known to the compiler. Must be a multiple of +- // the vector lane count * 8. ++ // Must be a multiple of the vector lane count * 8. 
+ static size_t NumItems() { return 3456; } + +- explicit TwoArray(const size_t num_items) +- : a_(AllocateAligned(num_items * 2)), b_(a_.get() + num_items) { +- const float init = num_items / NumItems(); // 1, but compiler doesn't know +- std::iota(a_.get(), a_.get() + num_items, init); +- std::iota(b_, b_ + num_items, init); ++ TwoArray() ++ : a_(AllocateAligned(NumItems() * 2)), b_(a_.get() + NumItems()) { ++ // = 1, but compiler doesn't know ++ const float init = static_cast(Unpredictable1()); ++ std::iota(a_.get(), a_.get() + NumItems(), init); ++ std::iota(b_, b_ + NumItems(), init); + } + + protected: +@@ -62,7 +61,7 @@ void RunBenchmark(const char* caption) { + const FuncInput inputs[kNumInputs] = {num_items}; + Result results[kNumInputs]; + +- Benchmark benchmark(num_items); ++ Benchmark benchmark; + + Params p; + p.verbose = false; +@@ -101,7 +100,7 @@ void Intro() { + // 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold! + class BenchmarkDot : public TwoArray { + public: +- explicit BenchmarkDot(size_t num_items) : TwoArray(num_items), dot_{-1.0f} {} ++ BenchmarkDot() : dot_{-1.0f} {} + + FuncOutput operator()(const size_t num_items) { + HWY_FULL(float) d; +@@ -132,7 +131,8 @@ class BenchmarkDot : public TwoArray { + sum[i] += sum[i + power]; + } + } +- return dot_ = GetLane(SumOfLanes(sum[0])); ++ dot_ = GetLane(SumOfLanes(sum[0])); ++ return static_cast(dot_); + } + void Verify(size_t num_items) { + if (dot_ == -1.0f) { +@@ -157,8 +157,6 @@ class BenchmarkDot : public TwoArray { + // INTERMEDIATE: delta coding + // 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold! + struct BenchmarkDelta : public TwoArray { +- explicit BenchmarkDelta(size_t num_items) : TwoArray(num_items) {} +- + FuncOutput operator()(const size_t num_items) const { + #if HWY_TARGET == HWY_SCALAR + b_[0] = a_[0]; +@@ -197,7 +195,7 @@ struct BenchmarkDelta : public TwoArray + Store(a - shifted, df, &b_[i]); + } + #endif +- return b_[num_items - 1]; ++ return static_cast(b_[num_items - 1]); + } + + void Verify(size_t num_items) { +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 2021-06-02 10:56:05.189904159 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -22,27 +22,62 @@ + // For runtime dispatch, specify the name of the current file (unfortunately + // __FILE__ is not reliable) so that foreach_target.h can re-include it. + #define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc" +-// Re-include this file once per enabled target to generate code for it. ++// Generates code for each enabled target by re-including this source file. + #include "hwy/foreach_target.h" + +-#include "hwy/examples/skeleton_shared.h" + #include "hwy/highway.h" + +-// Optional: factor out parts of the implementation into *-inl.h +-#include "hwy/examples/skeleton-inl.h" +- + // Optional, can instead add HWY_ATTR to all functions. + HWY_BEFORE_NAMESPACE(); + namespace skeleton { + namespace HWY_NAMESPACE { + +-// Compiled once per target via multiple inclusion. 
+-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2, +- float* HWY_RESTRICT out) { +- printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), +- ExampleGatherStrategy()); ++// Highway ops reside here; ADL does not find templates nor builtins. ++using namespace hwy::HWY_NAMESPACE; ++ ++// Computes log2 by converting to a vector of floats. Compiled once per target. ++template ++HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values, ++ uint8_t* HWY_RESTRICT log2) { ++ // Type tags for converting to other element types (Rebind = same count). ++ const Rebind d32; ++ const Rebind d8; ++ ++ const auto u8 = Load(d8, values); ++ const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8))); ++ const auto exponent = ShiftRight<23>(bits) - Set(d32, 127); ++ Store(DemoteTo(d8, exponent), d8, log2); ++} ++ ++HWY_NOINLINE void CodepathDemo() { ++ // Highway defaults to portability, but per-target codepaths may be selected ++ // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros: ++#if HWY_CAP_INTEGER64 ++ const char* gather = "Has int64"; ++#else ++ const char* gather = "No int64"; ++#endif ++ printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather); ++} + +- ExampleMulAdd(in1, in2, out); ++HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count, ++ uint8_t* HWY_RESTRICT log2) { ++ CodepathDemo(); ++ ++ // Second argument is necessary on RVV until it supports fractional lengths. ++ HWY_FULL(float, 4) df; ++ ++ const size_t N = Lanes(df); ++ size_t i = 0; ++ for (; i + N <= count; i += N) { ++ OneFloorLog2(df, values + i, log2 + i); ++ } ++ // TODO(janwas): implement ++#if HWY_TARGET != HWY_RVV ++ for (; i < count; ++i) { ++ OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i); ++ } ++#endif + } + + // NOLINTNEXTLINE(google-readability-namespace-comments) +@@ -54,22 +89,20 @@ HWY_AFTER_NAMESPACE(); + + namespace skeleton { + +-// This macro declares a static array SkeletonHighwayDispatchTable used for +-// dynamic dispatch. This macro should be placed in the same namespace that +-// defines the Skeleton function above. +-HWY_EXPORT(Skeleton); ++// This macro declares a static array used for dynamic dispatch; it resides in ++// the same outer namespace that contains FloorLog2. ++HWY_EXPORT(FloorLog2); + + // This function is optional and only needed in the case of exposing it in the +-// header file. Otherwise using HWY_DYNAMIC_DISPATCH(Skeleton) multiple times in +-// this module is equivalent to inlining this optional function.. +-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2, +- float* HWY_RESTRICT out) { +- return HWY_DYNAMIC_DISPATCH(Skeleton)(in1, in2, out); ++// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module ++// is equivalent to inlining this function. ++void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count, ++ uint8_t* HWY_RESTRICT out) { ++ return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out); + } + + // Optional: anything to compile only once, e.g. non-SIMD implementations of +-// public functions provided by this module, can go inside #if HWY_ONCE +-// (after end_target-inl.h). ++// public functions provided by this module, can go inside #if HWY_ONCE. 
+ + } // namespace skeleton + #endif // HWY_ONCE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 2021-06-02 10:56:05.213904281 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h 2021-05-31 10:37:11.000000000 -0400 +@@ -18,15 +18,17 @@ + #ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_ + #define HIGHWAY_HWY_EXAMPLES_SKELETON_H_ + +-// Tiny subset of Highway API: essentials for declaring an interface, without +-// any implementation details. ++#include ++ ++// Platform-specific definitions used for declaring an interface, independent of ++// the SIMD instruction set. + #include "hwy/base.h" // HWY_RESTRICT + + namespace skeleton { + +-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256. +-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2, +- float* HWY_RESTRICT out); ++// Computes base-2 logarithm by converting to float. Supports dynamic dispatch. ++void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count, ++ uint8_t* HWY_RESTRICT out); + + } // namespace skeleton + +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 2021-06-02 10:56:05.164904033 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -29,41 +29,31 @@ + // It is fine to #include normal or *-inl headers. + #include + +-#include "hwy/examples/skeleton_shared.h" + #include "hwy/highway.h" + + HWY_BEFORE_NAMESPACE(); + namespace skeleton { + namespace HWY_NAMESPACE { + +-using hwy::HWY_NAMESPACE::MulAdd; ++using namespace hwy::HWY_NAMESPACE; + +-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256. +-HWY_MAYBE_UNUSED void ExampleMulAdd(const float* HWY_RESTRICT in1, +- const float* HWY_RESTRICT in2, +- float* HWY_RESTRICT out) { +- // Descriptor(s) for all vector types used in this function. +- HWY_FULL(float) df; +- +- const auto mul = Set(df, kMultiplier); +- for (size_t i = 0; i < 256; i += Lanes(df)) { +- const auto result = MulAdd(mul, Load(df, in1 + i), Load(df, in2 + i)); +- Store(result, df, out + i); ++// Example of a type-agnostic (caller-specified lane type) and width-agnostic ++// (uses best available instruction set) function in a header. ++// ++// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size. 
++template ++HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array, ++ const T* HWY_RESTRICT add_array, ++ const size_t size, T* HWY_RESTRICT x_array) { ++ for (size_t i = 0; i < size; i += Lanes(d)) { ++ const auto mul = Load(d, mul_array + i); ++ const auto add = Load(d, add_array + i); ++ auto x = Load(d, x_array + i); ++ x = MulAdd(mul, x, add); ++ Store(x, d, x_array + i); + } + } + +-// (This doesn't generate SIMD instructions, so is not required here) +-HWY_MAYBE_UNUSED const char* ExampleGatherStrategy() { +- // Highway functions generate per-target implementations from the same source +- // code via HWY_CAPPED(type, HWY_MIN(any_LANES_constants, ..)). If needed, +- // entirely different codepaths can also be selected like so: +-#if HWY_GATHER_LANES > 1 +- return "Has gather"; +-#else +- return "Gather is limited to one lane"; +-#endif +-} +- + // NOLINTNEXTLINE(google-readability-namespace-comments) + } // namespace HWY_NAMESPACE + } // namespace skeleton +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 2021-06-02 10:56:05.170904063 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -12,30 +12,96 @@ + // See the License for the specific language governing permissions and + // limitations under the License. + +-// Example of unit test for the "skeleton" module. ++// Example of unit test for the "skeleton" library. 
+ +-#include "hwy/examples/skeleton.h" // Skeleton ++#include "hwy/examples/skeleton.h" + + #include + +-#include "hwy/tests/test_util-inl.h" // RunTest ++#undef HWY_TARGET_INCLUDE ++#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc" ++#include "hwy/foreach_target.h" ++#include "hwy/highway.h" ++#include "hwy/tests/test_util-inl.h" + ++// Optional: factor out parts of the implementation into *-inl.h ++#include "hwy/examples/skeleton-inl.h" ++ ++HWY_BEFORE_NAMESPACE(); + namespace skeleton { ++namespace HWY_NAMESPACE { ++ ++using namespace hwy::HWY_NAMESPACE; ++ ++// Calls function defined in skeleton.cc. ++struct TestFloorLog2 { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, DF df) { ++ const size_t count = 5 * Lanes(df); ++ auto in = hwy::AllocateAligned(count); ++ auto expected = hwy::AllocateAligned(count); ++ ++ hwy::RandomState rng; ++ for (size_t i = 0; i < count; ++i) { ++ expected[i] = Random32(&rng) & 7; ++ in[i] = static_cast(1u << expected[i]); ++ } ++ auto out = hwy::AllocateAligned(count); ++ CallFloorLog2(in.get(), count, out.get()); ++ int sum = 0; ++ for (size_t i = 0; i < count; ++i) { ++ // TODO(janwas): implement ++#if HWY_TARGET != HWY_RVV ++ HWY_ASSERT_EQ(expected[i], out[i]); ++#endif ++ sum += out[i]; ++ } ++ hwy::PreventElision(sum); ++ } ++}; ++ ++HWY_NOINLINE void TestAllFloorLog2() { ++ ForPartialVectors()(float()); ++} ++ ++// Calls function defined in skeleton-inl.h. ++struct TestSumMulAdd { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ hwy::RandomState rng; ++ const size_t count = 4096; ++ EXPECT_TRUE(count % Lanes(d) == 0); ++ auto mul = hwy::AllocateAligned(count); ++ auto x = hwy::AllocateAligned(count); ++ auto add = hwy::AllocateAligned(count); ++ for (size_t i = 0; i < count; ++i) { ++ mul[i] = static_cast(Random32(&rng) & 0xF); ++ x[i] = static_cast(Random32(&rng) & 0xFF); ++ add[i] = static_cast(Random32(&rng) & 0xFF); ++ } ++ double expected_sum = 0.0; ++ for (size_t i = 0; i < count; ++i) { ++ expected_sum += mul[i] * x[i] + add[i]; ++ } + +-TEST(SkeletonTest, MainTest) { +- HWY_ALIGN_MAX float in1[256]; +- HWY_ALIGN_MAX float in2[256]; +- HWY_ALIGN_MAX float out[256]; +- for (size_t i = 0; i < 256; ++i) { +- in1[i] = static_cast(i); +- in2[i] = in1[i] + 300; ++ MulAddLoop(d, mul.get(), add.get(), count, x.get()); ++ HWY_ASSERT_EQ(4344240.0, expected_sum); + } ++}; + +- // Tests will run for all compiled targets to ensure all are OK. +- hwy::RunTest([&in1, &in2, &out]() { +- Skeleton(in1, in2, out); +- // Add EXPECT_... calls here. 
+- }); ++HWY_NOINLINE void TestAllSumMulAdd() { ++ ForFloatTypes(ForPartialVectors()); + } + ++// NOLINTNEXTLINE(google-readability-namespace-comments) ++} // namespace HWY_NAMESPACE ++} // namespace skeleton ++HWY_AFTER_NAMESPACE(); ++ ++#if HWY_ONCE ++namespace skeleton { ++HWY_BEFORE_TEST(SkeletonTest); ++HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2); ++HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd); + } // namespace skeleton ++#endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 2021-06-02 10:56:05.269904564 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h 2021-05-31 10:37:11.000000000 -0400 +@@ -25,10 +25,10 @@ + + namespace hwy { + +-// API version (https://semver.org/) ++// API version (https://semver.org/); keep in sync with CMakeLists.txt. + #define HWY_MAJOR 0 +-#define HWY_MINOR 11 +-#define HWY_PATCH 1 ++#define HWY_MINOR 12 ++#define HWY_PATCH 2 + + //------------------------------------------------------------------------------ + // Shorthand for descriptors (defined in shared-inl.h) used to select overloads. +@@ -49,7 +49,7 @@ namespace hwy { + HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, )) + #define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__) + +-// Vector of up to MAX_N lanes. ++// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead. 
+ #define HWY_CAPPED(T, MAX_N) \ + hwy::HWY_NAMESPACE::Simd + +@@ -75,6 +75,10 @@ namespace hwy { + #define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME + #elif HWY_STATIC_TARGET == HWY_NEON + #define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME ++#elif HWY_STATIC_TARGET == HWY_SVE ++#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME ++#elif HWY_STATIC_TARGET == HWY_SVE2 ++#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME + #elif HWY_STATIC_TARGET == HWY_PPC8 + #define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME + #elif HWY_STATIC_TARGET == HWY_SSE4 +@@ -143,6 +147,18 @@ FunctionCache Function + #define HWY_CHOOSE_NEON(FUNC_NAME) nullptr + #endif + ++#if HWY_TARGETS & HWY_SVE ++#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME ++#else ++#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr ++#endif ++ ++#if HWY_TARGETS & HWY_SVE2 ++#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME ++#else ++#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr ++#endif ++ + #if HWY_TARGETS & HWY_PPC8 + #define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME + #else +@@ -261,8 +277,11 @@ FunctionCache Function + #elif HWY_TARGET == HWY_AVX3 + #include "hwy/ops/x86_512-inl.h" + #elif HWY_TARGET == HWY_PPC8 ++#error "PPC is not yet supported" + #elif HWY_TARGET == HWY_NEON + #include "hwy/ops/arm_neon-inl.h" ++#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 ++#include "hwy/ops/arm_sve-inl.h" + #elif HWY_TARGET == HWY_WASM + #include "hwy/ops/wasm_128-inl.h" + #elif HWY_TARGET == HWY_RVV +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 2021-06-02 10:56:05.276904599 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -29,128 +29,43 @@ + #include + #include + ++#if defined(_WIN32) || defined(_WIN64) ++#ifndef NOMINMAX ++#define NOMINMAX ++#endif // NOMINMAX ++#include ++#endif ++ ++#if defined(__MACH__) ++#include ++#include ++#endif ++ ++#if defined(__HAIKU__) ++#include ++#endif ++ + #include "hwy/base.h" + #if HWY_ARCH_PPC + #include // NOLINT __ppc_get_timebase_freq + #elif HWY_ARCH_X86 + +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + #include + #else + #include // NOLINT +-#endif // _MSC_VER ++#endif // HWY_COMPILER_MSVC + + #endif // HWY_ARCH_X86 + + namespace hwy { +-namespace platform { +-namespace { +- +-#if HWY_ARCH_X86 +- +-void Cpuid(const uint32_t level, const uint32_t count, +- uint32_t* HWY_RESTRICT abcd) { +-#if HWY_COMPILER_MSVC +- int regs[4]; +- __cpuidex(regs, level, count); +- for (int i = 0; i < 4; ++i) { +- abcd[i] = regs[i]; +- } +-#else +- uint32_t a; +- uint32_t b; +- uint32_t c; +- uint32_t d; +- __cpuid_count(level, count, a, b, c, d); +- abcd[0] = a; +- abcd[1] = b; +- abcd[2] = c; +- abcd[3] = d; +-#endif +-} +- +-std::string BrandString() { +- char brand_string[49]; +- std::array abcd; +- +- // Check if brand string is supported (it is on all reasonable Intel/AMD) +- Cpuid(0x80000000U, 0, abcd.data()); +- if (abcd[0] < 0x80000004U) { +- return std::string(); +- } +- +- for (size_t i = 0; i < 3; ++i) { +- Cpuid(0x80000002U + i, 0, abcd.data()); +- memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd)); +- } +- brand_string[48] = 0; +- return brand_string; +-} +- +-// Returns the frequency quoted inside 
the brand string. This does not +-// account for throttling nor Turbo Boost. +-double NominalClockRate() { +- const std::string& brand_string = BrandString(); +- // Brand strings include the maximum configured frequency. These prefixes are +- // defined by Intel CPUID documentation. +- const char* prefixes[3] = {"MHz", "GHz", "THz"}; +- const double multipliers[3] = {1E6, 1E9, 1E12}; +- for (size_t i = 0; i < 3; ++i) { +- const size_t pos_prefix = brand_string.find(prefixes[i]); +- if (pos_prefix != std::string::npos) { +- const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); +- if (pos_space != std::string::npos) { +- const std::string digits = +- brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); +- return std::stod(digits) * multipliers[i]; +- } +- } +- } +- +- return 0.0; +-} +- +-#endif // HWY_ARCH_X86 +- +-} // namespace +- +-// Returns tick rate. Invariant means the tick counter frequency is independent +-// of CPU throttling or sleep. May be expensive, caller should cache the result. +-double InvariantTicksPerSecond() { +-#if HWY_ARCH_PPC +- return __ppc_get_timebase_freq(); +-#elif HWY_ARCH_X86 +- // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. +- return NominalClockRate(); +-#else +- // Fall back to clock_gettime nanoseconds. +- return 1E9; +-#endif +-} +- +-} // namespace platform + namespace { +- +-// Prevents the compiler from eliding the computations that led to "output". +-template +-inline void PreventElision(T&& output) { +-#if HWY_COMPILER_MSVC == 0 +- // Works by indicating to the compiler that "output" is being read and +- // modified. The +r constraint avoids unnecessary writes to memory, but only +- // works for built-in types (typically FuncOutput). +- asm volatile("" : "+r"(output) : : "memory"); +-#else +- // MSVC does not support inline assembly anymore (and never supported GCC's +- // RTL constraints). Self-assignment with #pragma optimize("off") might be +- // expected to prevent elision, but it does not with MSVC 2015. Type-punning +- // with volatile pointers generates inefficient code on MSVC 2017. +- static std::atomic dummy(T{}); +- dummy.store(output, std::memory_order_relaxed); +-#endif +-} +- + namespace timer { + ++// Ticks := platform-specific timer values (CPU cycles on x86). Must be ++// unsigned to guarantee wraparound on overflow. ++using Ticks = uint64_t; ++ + // Start/Stop return absolute timestamps and must be placed immediately before + // and after the region to measure. We provide separate Start/Stop functions + // because they use different fences. +@@ -202,8 +117,8 @@ namespace timer { + + // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, + // divide by InvariantTicksPerSecond. +-inline uint64_t Start64() { +- uint64_t t; ++inline Ticks Start() { ++ Ticks t; + #if HWY_ARCH_PPC + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); + #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC +@@ -228,8 +143,15 @@ inline uint64_t Start64() { + : "rdx", "memory", "cc"); + #elif HWY_ARCH_RVV + asm volatile("rdcycle %0" : "=r"(t)); +-#else +- // Fall back to OS - unsure how to reliably query cntvct_el0 frequency. 
++#elif defined(_WIN32) || defined(_WIN64) ++ LARGE_INTEGER counter; ++ (void)QueryPerformanceCounter(&counter); ++ t = counter.QuadPart; ++#elif defined(__MACH__) ++ t = mach_absolute_time(); ++#elif defined(__HAIKU__) ++ t = system_time_nsecs(); // since boot ++#else // POSIX + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + t = ts.tv_sec * 1000000000LL + ts.tv_nsec; +@@ -237,7 +159,7 @@ inline uint64_t Start64() { + return t; + } + +-inline uint64_t Stop64() { ++inline Ticks Stop() { + uint64_t t; + #if HWY_ARCH_PPC + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +@@ -261,61 +183,7 @@ inline uint64_t Stop64() { + // "cc" = flags modified by SHL. + : "rcx", "rdx", "memory", "cc"); + #else +- t = Start64(); +-#endif +- return t; +-} +- +-// Returns a 32-bit timestamp with about 4 cycles less overhead than +-// Start64. Only suitable for measuring very short regions because the +-// timestamp overflows about once a second. +-inline uint32_t Start32() { +- uint32_t t; +-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC +- _ReadWriteBarrier(); +- _mm_lfence(); +- _ReadWriteBarrier(); +- t = static_cast(__rdtsc()); +- _ReadWriteBarrier(); +- _mm_lfence(); +- _ReadWriteBarrier(); +-#elif HWY_ARCH_X86_64 +- asm volatile( +- "lfence\n\t" +- "rdtsc\n\t" +- "lfence" +- : "=a"(t) +- : +- // "memory" avoids reordering. rdx = TSC >> 32. +- : "rdx", "memory"); +-#elif HWY_ARCH_RVV +- asm volatile("rdcycle %0" : "=r"(t)); +-#else +- t = static_cast(Start64()); +-#endif +- return t; +-} +- +-inline uint32_t Stop32() { +- uint32_t t; +-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC +- _ReadWriteBarrier(); +- unsigned aux; +- t = static_cast(__rdtscp(&aux)); +- _ReadWriteBarrier(); +- _mm_lfence(); +- _ReadWriteBarrier(); +-#elif HWY_ARCH_X86_64 +- // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). +- asm volatile( +- "rdtscp\n\t" +- "lfence" +- : "=a"(t) +- : +- // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. +- : "rcx", "rdx", "memory"); +-#else +- t = static_cast(Stop64()); ++ t = Start(); + #endif + return t; + } +@@ -440,21 +308,130 @@ T MedianAbsoluteDeviation(const T* value + } + + } // namespace robust_statistics ++} // namespace ++namespace platform { ++namespace { + +-// Ticks := platform-specific timer values (CPU cycles on x86). Must be +-// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to +-// read than 64 bit. +-using Ticks = uint32_t; ++// Prevents the compiler from eliding the computations that led to "output". ++template ++inline void PreventElision(T&& output) { ++#if HWY_COMPILER_MSVC == 0 ++ // Works by indicating to the compiler that "output" is being read and ++ // modified. The +r constraint avoids unnecessary writes to memory, but only ++ // works for built-in types (typically FuncOutput). ++ asm volatile("" : "+r"(output) : : "memory"); ++#else ++ // MSVC does not support inline assembly anymore (and never supported GCC's ++ // RTL constraints). Self-assignment with #pragma optimize("off") might be ++ // expected to prevent elision, but it does not with MSVC 2015. Type-punning ++ // with volatile pointers generates inefficient code on MSVC 2017. 
++ static std::atomic dummy(T{}); ++ dummy.store(output, std::memory_order_relaxed); ++#endif ++} ++ ++#if HWY_ARCH_X86 ++ ++void Cpuid(const uint32_t level, const uint32_t count, ++ uint32_t* HWY_RESTRICT abcd) { ++#if HWY_COMPILER_MSVC ++ int regs[4]; ++ __cpuidex(regs, level, count); ++ for (int i = 0; i < 4; ++i) { ++ abcd[i] = regs[i]; ++ } ++#else ++ uint32_t a; ++ uint32_t b; ++ uint32_t c; ++ uint32_t d; ++ __cpuid_count(level, count, a, b, c, d); ++ abcd[0] = a; ++ abcd[1] = b; ++ abcd[2] = c; ++ abcd[3] = d; ++#endif ++} ++ ++std::string BrandString() { ++ char brand_string[49]; ++ std::array abcd; ++ ++ // Check if brand string is supported (it is on all reasonable Intel/AMD) ++ Cpuid(0x80000000U, 0, abcd.data()); ++ if (abcd[0] < 0x80000004U) { ++ return std::string(); ++ } ++ ++ for (size_t i = 0; i < 3; ++i) { ++ Cpuid(static_cast(0x80000002U + i), 0, abcd.data()); ++ memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd)); ++ } ++ brand_string[48] = 0; ++ return brand_string; ++} ++ ++// Returns the frequency quoted inside the brand string. This does not ++// account for throttling nor Turbo Boost. ++double NominalClockRate() { ++ const std::string& brand_string = BrandString(); ++ // Brand strings include the maximum configured frequency. These prefixes are ++ // defined by Intel CPUID documentation. ++ const char* prefixes[3] = {"MHz", "GHz", "THz"}; ++ const double multipliers[3] = {1E6, 1E9, 1E12}; ++ for (size_t i = 0; i < 3; ++i) { ++ const size_t pos_prefix = brand_string.find(prefixes[i]); ++ if (pos_prefix != std::string::npos) { ++ const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); ++ if (pos_space != std::string::npos) { ++ const std::string digits = ++ brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); ++ return std::stod(digits) * multipliers[i]; ++ } ++ } ++ } ++ ++ return 0.0; ++} ++ ++#endif // HWY_ARCH_X86 ++ ++} // namespace ++ ++double InvariantTicksPerSecond() { ++#if HWY_ARCH_PPC ++ return __ppc_get_timebase_freq(); ++#elif HWY_ARCH_X86 ++ // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. ++ return NominalClockRate(); ++#elif defined(_WIN32) || defined(_WIN64) ++ LARGE_INTEGER freq; ++ (void)QueryPerformanceFrequency(&freq); ++ return double(freq.QuadPart); ++#elif defined(__MACH__) ++ // https://developer.apple.com/library/mac/qa/qa1398/_index.html ++ mach_timebase_info_data_t timebase; ++ (void)mach_timebase_info(&timebase); ++ return double(timebase.denom) / timebase.numer * 1E9; ++#else ++ // TODO(janwas): ARM? Unclear how to reliably query cntvct_el0 frequency. ++ return 1E9; // Haiku and clock_gettime return nanoseconds. ++#endif ++} + +-// Returns timer overhead / minimum measurable difference. +-Ticks TimerResolution() { ++double Now() { ++ static const double mul = 1.0 / InvariantTicksPerSecond(); ++ return static_cast(timer::Start()) * mul; ++} ++ ++uint64_t TimerResolution() { + // Nested loop avoids exceeding stack/L1 capacity. 
+- Ticks repetitions[Params::kTimerSamples]; ++ timer::Ticks repetitions[Params::kTimerSamples]; + for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) { +- Ticks samples[Params::kTimerSamples]; ++ timer::Ticks samples[Params::kTimerSamples]; + for (size_t i = 0; i < Params::kTimerSamples; ++i) { +- const Ticks t0 = timer::Start32(); +- const Ticks t1 = timer::Stop32(); ++ const timer::Ticks t0 = timer::Start(); ++ const timer::Ticks t1 = timer::Stop(); + samples[i] = t1 - t0; + } + repetitions[rep] = robust_statistics::Mode(samples); +@@ -462,18 +439,21 @@ Ticks TimerResolution() { + return robust_statistics::Mode(repetitions); + } + +-static const Ticks timer_resolution = TimerResolution(); ++} // namespace platform ++namespace { ++ ++static const timer::Ticks timer_resolution = platform::TimerResolution(); + + // Estimates the expected value of "lambda" values with a variable number of + // samples until the variability "rel_mad" is less than "max_rel_mad". + template +-Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, +- const Params& p, const Lambda& lambda) { ++timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, ++ const Params& p, const Lambda& lambda) { + // Choose initial samples_per_eval based on a single estimated duration. +- Ticks t0 = timer::Start32(); ++ timer::Ticks t0 = timer::Start(); + lambda(); +- Ticks t1 = timer::Stop32(); +- Ticks est = t1 - t0; ++ timer::Ticks t1 = timer::Stop(); ++ timer::Ticks est = t1 - t0; + static const double ticks_per_second = platform::InvariantTicksPerSecond(); + const size_t ticks_per_eval = + static_cast(ticks_per_second * p.seconds_per_eval); +@@ -481,21 +461,21 @@ Ticks SampleUntilStable(const double max + est == 0 ? p.min_samples_per_eval : ticks_per_eval / est; + samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval); + +- std::vector samples; ++ std::vector samples; + samples.reserve(1 + samples_per_eval); + samples.push_back(est); + + // Percentage is too strict for tiny differences, so also allow a small + // absolute "median absolute deviation". +- const Ticks max_abs_mad = (timer_resolution + 99) / 100; ++ const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100; + *rel_mad = 0.0; // ensure initialized + + for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) { + samples.reserve(samples.size() + samples_per_eval); + for (size_t i = 0; i < samples_per_eval; ++i) { +- t0 = timer::Start32(); ++ t0 = timer::Start(); + lambda(); +- t1 = timer::Stop32(); ++ t1 = timer::Stop(); + samples.push_back(t1 - t0); + } + +@@ -508,14 +488,14 @@ Ticks SampleUntilStable(const double max + NANOBENCHMARK_CHECK(est != 0); + + // Median absolute deviation (mad) is a robust measure of 'variability'. 
+- const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( ++ const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( + samples.data(), samples.size(), est); +- *rel_mad = static_cast(int(abs_mad)) / est; ++ *rel_mad = static_cast(abs_mad) / static_cast(est); + + if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) { + if (p.verbose) { +- printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n", +- samples.size(), est, abs_mad, *rel_mad * 100.0); ++ printf("%6zu samples => %5zu (abs_mad=%4zu, rel_mad=%4.2f%%)\n", ++ samples.size(), size_t(est), size_t(abs_mad), *rel_mad * 100.0); + } + return est; + } +@@ -539,29 +519,17 @@ InputVec UniqueInputs(const FuncInput* i + return unique; + } + +-// Returns how often we need to call func for sufficient precision, or zero +-// on failure (e.g. the elapsed time is too long for a 32-bit tick count). ++// Returns how often we need to call func for sufficient precision. + size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, + const Params& p) { + // Min elapsed ticks for any input. +- Ticks min_duration = ~0u; ++ timer::Ticks min_duration = ~timer::Ticks(0); + + for (const FuncInput input : unique) { +- // Make sure a 32-bit timer is sufficient. +- const uint64_t t0 = timer::Start64(); +- PreventElision(func(arg, input)); +- const uint64_t t1 = timer::Stop64(); +- const uint64_t elapsed = t1 - t0; +- if (elapsed >= (1ULL << 30)) { +- fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n", +- input); +- return 0; +- } +- + double rel_mad; +- const Ticks total = SampleUntilStable( ++ const timer::Ticks total = SampleUntilStable( + p.target_rel_mad, &rel_mad, p, +- [func, arg, input]() { PreventElision(func(arg, input)); }); ++ [func, arg, input]() { platform::PreventElision(func(arg, input)); }); + min_duration = std::min(min_duration, total - timer_resolution); + } + +@@ -571,8 +539,8 @@ size_t NumSkip(const Func func, const ui + const size_t num_skip = + min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration; + if (p.verbose) { +- printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution, +- max_skip, min_duration, num_skip); ++ printf("res=%zu max_skip=%zu min_dur=%zu num_skip=%zu\n", ++ size_t(timer_resolution), max_skip, size_t(min_duration), num_skip); + } + return num_skip; + } +@@ -637,13 +605,14 @@ void FillSubset(const InputVec& full, co + } + + // Returns total ticks elapsed for all inputs. +-Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs, +- const Params& p, double* max_rel_mad) { ++timer::Ticks TotalDuration(const Func func, const uint8_t* arg, ++ const InputVec* inputs, const Params& p, ++ double* max_rel_mad) { + double rel_mad; +- const Ticks duration = ++ const timer::Ticks duration = + SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() { + for (const FuncInput input : *inputs) { +- PreventElision(func(arg, input)); ++ platform::PreventElision(func(arg, input)); + } + }); + *max_rel_mad = std::max(*max_rel_mad, rel_mad); +@@ -657,19 +626,20 @@ HWY_NOINLINE FuncOutput EmptyFunc(const + + // Returns overhead of accessing inputs[] and calling a function; this will + // be deducted from future TotalDuration return values. +-Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) { ++timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs, ++ const Params& p) { + double rel_mad; + // Zero tolerance because repeatability is crucial and EmptyFunc is fast. 
+ return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() { + for (const FuncInput input : *inputs) { +- PreventElision(EmptyFunc(arg, input)); ++ platform::PreventElision(EmptyFunc(arg, input)); + } + }); + } + + } // namespace + +-int Unpredictable1() { return timer::Start64() != ~0ULL; } ++int Unpredictable1() { return timer::Start() != ~0ULL; } + + size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, + const size_t num_inputs, Result* results, const Params& p) { +@@ -685,32 +655,35 @@ size_t Measure(const Func func, const ui + ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p); + InputVec subset(full.size() - num_skip); + +- const Ticks overhead = Overhead(arg, &full, p); +- const Ticks overhead_skip = Overhead(arg, &subset, p); ++ const timer::Ticks overhead = Overhead(arg, &full, p); ++ const timer::Ticks overhead_skip = Overhead(arg, &subset, p); + if (overhead < overhead_skip) { +- fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead, +- overhead_skip); ++ fprintf(stderr, "Measurement failed: overhead %zu < %zu\n", ++ size_t(overhead), size_t(overhead_skip)); + return 0; + } + + if (p.verbose) { +- printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(), +- overhead, overhead_skip); ++ printf("#inputs=%5zu,%5zu overhead=%5zu,%5zu\n", full.size(), subset.size(), ++ size_t(overhead), size_t(overhead_skip)); + } + + double max_rel_mad = 0.0; +- const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); ++ const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); + + for (size_t i = 0; i < unique.size(); ++i) { + FillSubset(full, unique[i], num_skip, &subset); +- const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad); ++ const timer::Ticks total_skip = ++ TotalDuration(func, arg, &subset, p, &max_rel_mad); + + if (total < total_skip) { +- fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip); ++ fprintf(stderr, "Measurement failed: total %zu < %zu\n", size_t(total), ++ size_t(total_skip)); + return 0; + } + +- const Ticks duration = (total - overhead) - (total_skip - overhead_skip); ++ const timer::Ticks duration = ++ (total - overhead) - (total_skip - overhead_skip); + results[i].input = unique[i]; + results[i].ticks = static_cast(duration) * mul; + results[i].variability = static_cast(max_rel_mad); +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 2021-06-02 10:56:05.272904579 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h 2021-05-31 10:37:11.000000000 -0400 +@@ -44,11 +44,6 @@ + // central tendency of the measurement samples with the "half sample mode", + // which is more robust to outliers and skewed data than the mean or median. + +-// WARNING if included from multiple translation units compiled with distinct +-// flags: this header requires textual inclusion and a predefined NB_NAMESPACE +-// macro that is unique to the current compile flags. We must also avoid +-// standard library headers such as vector and functional that define functions. +- + #include + #include + +@@ -79,6 +74,16 @@ namespace platform { + // This call may be expensive, callers should cache the result. 
+ double InvariantTicksPerSecond(); + ++// Returns current timestamp [in seconds] relative to an unspecified origin. ++// Features: monotonic (no negative elapsed time), steady (unaffected by system ++// time changes), high-resolution (on the order of microseconds). ++double Now(); ++ ++// Returns ticks elapsed in back to back timer calls, i.e. a function of the ++// timer resolution (minimum measurable difference) and overhead. ++// This call is expensive, callers should cache the result. ++uint64_t TimerResolution(); ++ + } // namespace platform + + // Returns 1, but without the compiler knowing what the value is. This prevents +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 2021-06-02 10:56:05.275904594 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -15,11 +15,11 @@ + #include "hwy/nanobenchmark.h" + + #include +-#include // strtol +-#include // sleep + + #include + ++#include "hwy/tests/test_util-inl.h" ++ + namespace hwy { + namespace { + +@@ -31,6 +31,7 @@ FuncOutput Div(const void*, FuncInput in + + template + void MeasureDiv(const FuncInput (&inputs)[N]) { ++ printf("Measuring integer division (output on final two lines)\n"); + Result results[N]; + Params params; + params.max_evals = 4; // avoid test timeout +@@ -66,39 +67,14 @@ void MeasureRandom(const FuncInput (&inp + } + } + +-template +-void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) { +- printf("Expect a 'measurement failed' below:\n"); +- Result results[N]; +- +- const size_t num_results = Measure( +- [](const void*, const FuncInput input) -> FuncOutput { +- // Loop until the sleep succeeds (not interrupted by signal). We assume +- // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit. +- while (sleep(2) != 0) { +- } +- return input; +- }, +- nullptr, inputs, N, results); +- NANOBENCHMARK_CHECK(num_results == 0); +- (void)num_results; +-} +- +-void RunAll(const int argc, char** /*argv*/) { +- // unpredictable == 1 but the compiler doesn't know that. +- const int unpredictable = argc != 999; ++TEST(NanobenchmarkTest, RunAll) { ++ const int unpredictable = Unpredictable1(); // == 1, unknown to compiler. 
+ static const FuncInput inputs[] = {static_cast(unpredictable) + 2, + static_cast(unpredictable + 9)}; + + MeasureDiv(inputs); + MeasureRandom(inputs); +- EnsureLongMeasurementFails(inputs); + } + + } // namespace + } // namespace hwy +- +-int main(int argc, char* argv[]) { +- hwy::RunAll(argc, argv); +- return 0; +-} +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 2021-06-02 10:56:05.239904412 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -26,6 +26,8 @@ HWY_BEFORE_NAMESPACE(); + namespace hwy { + namespace HWY_NAMESPACE { + ++namespace detail { // for code folding and Raw128 ++ + // Macros used to define single and double function calls for multiple types + // for full and half vectors. These macros are undefined at the end of the file. + +@@ -133,7 +135,7 @@ namespace HWY_NAMESPACE { + HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args) + + // float and double +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \ + HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \ + HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \ +@@ -181,7 +183,7 @@ namespace HWY_NAMESPACE { + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) + + // Emulation of some intrinsics on armv7. +-#if !defined(__aarch64__) ++#if HWY_ARCH_ARM_V7 + #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0] + #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0] + #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0] +@@ -294,7 +296,7 @@ struct Raw128 { + using type = float32x4_t; + }; + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + template <> + struct Raw128 { + using type = float64x2_t; +@@ -352,7 +354,7 @@ struct Raw128 { + using type = float32x2_t; + }; + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + template <> + struct Raw128 { + using type = float64x1_t; +@@ -437,12 +439,14 @@ struct Raw128 { + using type = int8x8_t; + }; + ++} // namespace detail ++ + template + using Full128 = Simd; + + template + class Vec128 { +- using Raw = typename Raw128::type; ++ using Raw = typename detail::Raw128::type; + + public: + HWY_INLINE Vec128() {} +@@ -480,7 +484,8 @@ class Vec128 { + // FF..FF or 0, also for floating-point - see README. + template + class Mask128 { +- using Raw = typename Raw128::type; ++ // ARM C Language Extensions return and expect unsigned type. 
++ using Raw = typename detail::Raw128, N>::type; + + public: + HWY_INLINE Mask128() {} +@@ -573,7 +578,7 @@ HWY_INLINE Vec128 BitCastFro + Vec128 v) { + return Vec128(vreinterpret_s64_u8(v.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, + Vec128 v) { + return Vec128(vreinterpret_f64_u8(v.raw)); +@@ -615,7 +620,7 @@ HWY_INLINE Vec128 BitCastFromBy + return Vec128(vreinterpretq_s64_u8(v.raw)); + } + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, + Vec128 v) { + return Vec128(vreinterpretq_f64_u8(v.raw)); +@@ -664,15 +669,25 @@ template + HWY_INLINE Vec128 Undefined(Simd /*d*/) { + HWY_DIAGNOSTICS(push) + HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") +- typename Raw128::type a; ++ typename detail::Raw128::type a; + return Vec128(a); + HWY_DIAGNOSTICS(pop) + } + +-// ------------------------------ Extract lane ++// Returns a vector with lane i=[0, N) set to "first" + i. ++template ++Vec128 Iota(const Simd d, const T2 first) { ++ HWY_ALIGN T lanes[16 / sizeof(T)]; ++ for (size_t i = 0; i < 16 / sizeof(T); ++i) { ++ lanes[i] = static_cast(first + static_cast(i)); ++ } ++ return Load(d, lanes); ++} ++ ++// ------------------------------ GetLane + + HWY_INLINE uint8_t GetLane(const Vec128 v) { +- return vget_lane_u8(vget_low_u8(v.raw), 0); ++ return vgetq_lane_u8(v.raw, 0); + } + template + HWY_INLINE uint8_t GetLane(const Vec128 v) { +@@ -680,7 +695,7 @@ HWY_INLINE uint8_t GetLane(const Vec128< + } + + HWY_INLINE int8_t GetLane(const Vec128 v) { +- return vget_lane_s8(vget_low_s8(v.raw), 0); ++ return vgetq_lane_s8(v.raw, 0); + } + template + HWY_INLINE int8_t GetLane(const Vec128 v) { +@@ -688,7 +703,7 @@ HWY_INLINE int8_t GetLane(const Vec128 v) { +- return vget_lane_u16(vget_low_u16(v.raw), 0); ++ return vgetq_lane_u16(v.raw, 0); + } + template + HWY_INLINE uint16_t GetLane(const Vec128 v) { +@@ -696,7 +711,7 @@ HWY_INLINE uint16_t GetLane(const Vec128 + } + + HWY_INLINE int16_t GetLane(const Vec128 v) { +- return vget_lane_s16(vget_low_s16(v.raw), 0); ++ return vgetq_lane_s16(v.raw, 0); + } + template + HWY_INLINE int16_t GetLane(const Vec128 v) { +@@ -704,7 +719,7 @@ HWY_INLINE int16_t GetLane(const Vec128< + } + + HWY_INLINE uint32_t GetLane(const Vec128 v) { +- return vget_lane_u32(vget_low_u32(v.raw), 0); ++ return vgetq_lane_u32(v.raw, 0); + } + template + HWY_INLINE uint32_t GetLane(const Vec128 v) { +@@ -712,7 +727,7 @@ HWY_INLINE uint32_t GetLane(const Vec128 + } + + HWY_INLINE int32_t GetLane(const Vec128 v) { +- return vget_lane_s32(vget_low_s32(v.raw), 0); ++ return vgetq_lane_s32(v.raw, 0); + } + template + HWY_INLINE int32_t GetLane(const Vec128 v) { +@@ -720,20 +735,20 @@ HWY_INLINE int32_t GetLane(const Vec128< + } + + HWY_INLINE uint64_t GetLane(const Vec128 v) { +- return vget_lane_u64(vget_low_u64(v.raw), 0); ++ return vgetq_lane_u64(v.raw, 0); + } + HWY_INLINE uint64_t GetLane(const Vec128 v) { + return vget_lane_u64(v.raw, 0); + } + HWY_INLINE int64_t GetLane(const Vec128 v) { +- return vget_lane_s64(vget_low_s64(v.raw), 0); ++ return vgetq_lane_s64(v.raw, 0); + } + HWY_INLINE int64_t GetLane(const Vec128 v) { + return vget_lane_s64(v.raw, 0); + } + + HWY_INLINE float GetLane(const Vec128 v) { +- return vget_lane_f32(vget_low_f32(v.raw), 0); ++ return vgetq_lane_f32(v.raw, 0); + } + HWY_INLINE float GetLane(const Vec128 v) { + return vget_lane_f32(v.raw, 0); +@@ -741,9 +756,9 @@ HWY_INLINE float GetLane(const Vec128 v) { + return 
vget_lane_f32(v.raw, 0); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE double GetLane(const Vec128 v) { +- return vget_lane_f64(vget_low_f64(v.raw), 0); ++ return vgetq_lane_f64(v.raw, 0); + } + HWY_INLINE double GetLane(const Vec128 v) { + return vget_lane_f64(v.raw, 0); +@@ -785,8 +800,6 @@ HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSu + // ------------------------------ Average + + // Returns (a + b + 1) / 2 +- +-// Unsigned + HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) + HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) + +@@ -802,6 +815,7 @@ HWY_INLINE Vec128 Abs(const Vec + HWY_INLINE Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_s32(v.raw)); + } ++// i64 is implemented after BroadcastSignBit. + HWY_INLINE Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_f32(v.raw)); + } +@@ -823,7 +837,7 @@ HWY_INLINE Vec128 Abs(const Ve + return Vec128(vabs_f32(v.raw)); + } + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_f64(v.raw)); + } +@@ -839,7 +853,7 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vn + HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below + + HWY_INLINE Vec128 Neg(const Vec128 v) { +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return Vec128(vneg_s64(v.raw)); + #else + return Zero(Simd()) - v; +@@ -847,7 +861,7 @@ HWY_INLINE Vec128 Neg(const + } + + HWY_INLINE Vec128 Neg(const Vec128 v) { +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return Vec128(vnegq_s64(v.raw)); + #else + return Zero(Full128()) - v; +@@ -876,6 +890,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, v + + // ------------------------------ Shl + ++HWY_INLINE Vec128 operator<<(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw))); ++} ++template ++HWY_INLINE Vec128 operator<<(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw))); ++} ++ + HWY_INLINE Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw))); +@@ -905,6 +929,16 @@ HWY_INLINE Vec128 operator< + return Vec128(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); + } + ++HWY_INLINE Vec128 operator<<(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshlq_s8(v.raw, bits.raw)); ++} ++template ++HWY_INLINE Vec128 operator<<(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshl_s8(v.raw, bits.raw)); ++} ++ + HWY_INLINE Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s16(v.raw, bits.raw)); +@@ -936,6 +970,18 @@ HWY_INLINE Vec128 operator<< + + // ------------------------------ Shr (Neg) + ++HWY_INLINE Vec128 operator>>(const Vec128 v, ++ const Vec128 bits) { ++ const int8x16_t neg_bits = Neg(BitCast(Full128(), bits)).raw; ++ return Vec128(vshlq_u8(v.raw, neg_bits)); ++} ++template ++HWY_INLINE Vec128 operator>>(const Vec128 v, ++ const Vec128 bits) { ++ const int8x8_t neg_bits = Neg(BitCast(Simd(), bits)).raw; ++ return Vec128(vshl_u8(v.raw, neg_bits)); ++} ++ + HWY_INLINE Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int16x8_t neg_bits = Neg(BitCast(Full128(), bits)).raw; +@@ -971,6 +1017,16 @@ HWY_INLINE Vec128 operator> + return Vec128(vshl_u64(v.raw, neg_bits)); + } + ++HWY_INLINE Vec128 operator>>(const Vec128 v, ++ const Vec128 bits) { ++ return Vec128(vshlq_s8(v.raw, Neg(bits).raw)); ++} ++template ++HWY_INLINE Vec128 operator>>(const Vec128 v, ++ const Vec128 bits) { ++ return 
Vec128(vshl_s8(v.raw, Neg(bits).raw)); ++} ++ + HWY_INLINE Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s16(v.raw, Neg(bits).raw)); +@@ -1059,7 +1115,7 @@ HWY_INLINE Vec128 operator*( + HWY_INLINE Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + int32x4_t rhi = vmull_high_s16(a.raw, b.raw); + #else + int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); +@@ -1070,7 +1126,7 @@ HWY_INLINE Vec128 MulHigh(const + HWY_INLINE Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); + #else + uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); +@@ -1139,24 +1195,37 @@ HWY_INLINE Vec128 ApproximateR + return Vec128(vrecpe_f32(v.raw)); + } + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) + #else +-// Emulated with approx reciprocal + Newton-Raphson + mul ++// Not defined on armv7: approximate ++namespace detail { ++ ++HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( ++ const Vec128 recip, const Vec128 divisor) { ++ return Vec128(vrecpsq_f32(recip.raw, divisor.raw)); ++} ++template ++HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( ++ const Vec128 recip, Vec128 divisor) { ++ return Vec128(vrecps_f32(recip.raw, divisor.raw)); ++} ++ ++} // namespace detail ++ + template + HWY_INLINE Vec128 operator/(const Vec128 a, + const Vec128 b) { + auto x = ApproximateReciprocal(b); +- // Newton-Raphson on 1/x - b +- const auto two = Set(Simd(), 2); +- x = x * (two - b * x); +- x = x * (two - b * x); +- x = x * (two - b * x); ++ x *= detail::ReciprocalNewtonRaphsonStep(x, b); ++ x *= detail::ReciprocalNewtonRaphsonStep(x, b); ++ x *= detail::ReciprocalNewtonRaphsonStep(x, b); + return a * x; + } + #endif + +-// Absolute value of difference. ++// ------------------------------ Absolute value of difference. ++ + HWY_INLINE Vec128 AbsDiff(const Vec128 a, const Vec128 b) { + return Vec128(vabdq_f32(a.raw, b.raw)); + } +@@ -1169,7 +1238,7 @@ HWY_INLINE Vec128 AbsDiff(cons + // ------------------------------ Floating-point multiply-add variants + + // Returns add + mul * x +-#if defined(__aarch64__) ++#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 + template + HWY_INLINE Vec128 MulAdd(const Vec128 mul, + const Vec128 x, +@@ -1180,6 +1249,17 @@ HWY_INLINE Vec128 MulAdd(const Ve + const Vec128 add) { + return Vec128(vfmaq_f32(add.raw, mul.raw, x.raw)); + } ++#else ++// Emulate FMA for floats. ++template ++HWY_INLINE Vec128 MulAdd(const Vec128 mul, ++ const Vec128 x, ++ const Vec128 add) { ++ return mul * x + add; ++} ++#endif ++ ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 MulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { +@@ -1190,18 +1270,10 @@ HWY_INLINE Vec128 MulAdd(const V + const Vec128 add) { + return Vec128(vfmaq_f64(add.raw, mul.raw, x.raw)); + } +-#else +-// Emulate FMA for floats. 
+-template +-HWY_INLINE Vec128 MulAdd(const Vec128 mul, +- const Vec128 x, +- const Vec128 add) { +- return mul * x + add; +-} + #endif + + // Returns add - mul * x +-#if defined(__aarch64__) ++#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 + template + HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, +@@ -1213,7 +1285,17 @@ HWY_INLINE Vec128 NegMulAdd(const + const Vec128 add) { + return Vec128(vfmsq_f32(add.raw, mul.raw, x.raw)); + } ++#else ++// Emulate FMA for floats. ++template ++HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, ++ const Vec128 x, ++ const Vec128 add) { ++ return add - mul * x; ++} ++#endif + ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { +@@ -1224,14 +1306,6 @@ HWY_INLINE Vec128 NegMulAdd(cons + const Vec128 add) { + return Vec128(vfmsq_f64(add.raw, mul.raw, x.raw)); + } +-#else +-// Emulate FMA for floats. +-template +-HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, +- const Vec128 x, +- const Vec128 add) { +- return add - mul * x; +-} + #endif + + // Returns mul * x - sub +@@ -1241,12 +1315,6 @@ HWY_INLINE Vec128 MulSub(const + const Vec128 sub) { + return MulAdd(mul, x, Neg(sub)); + } +-template +-HWY_INLINE Vec128 MulSub(const Vec128 mul, +- const Vec128 x, +- const Vec128 sub) { +- return MulAdd(mul, x, Neg(sub)); +-} + + // Returns -mul * x - sub + template +@@ -1255,14 +1323,23 @@ HWY_INLINE Vec128 NegMulSub(co + const Vec128 sub) { + return Neg(MulAdd(mul, x, sub)); + } ++ ++#if HWY_ARCH_ARM_A64 ++template ++HWY_INLINE Vec128 MulSub(const Vec128 mul, ++ const Vec128 x, ++ const Vec128 sub) { ++ return MulAdd(mul, x, Neg(sub)); ++} + template + HWY_INLINE Vec128 NegMulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { + return Neg(MulAdd(mul, x, sub)); + } ++#endif + +-// ------------------------------ Floating-point square root ++// ------------------------------ Floating-point square root (IfThenZeroElse) + + // Approximate reciprocal square root + HWY_INLINE Vec128 ApproximateReciprocalSqrt(const Vec128 v) { +@@ -1275,80 +1352,36 @@ HWY_INLINE Vec128 ApproximateR + } + + // Full precision square root +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) + #else +-// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt. +-template +-HWY_INLINE Vec128 Sqrt(const Vec128 v) { +- auto b = v; +- auto Y = ApproximateReciprocalSqrt(v); +- auto x = v * Y; +- const auto half = Set(Simd(), 0.5); +- const auto oneandhalf = Set(Simd(), 1.5); +- for (size_t i = 0; i < 3; i++) { +- b = b * Y * Y; +- Y = oneandhalf - half * b; +- x = x * Y; +- } +- return IfThenZeroElse(v == Zero(Simd()), x); +-} +-#endif +- +-// ================================================== COMPARE +- +-// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
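// Illustration (minimal scalar sketch, not NEON code): the ARMv7 operator/
// fallback above starts from the rough 1/b estimate that vrecpe_f32 provides
// and refines it, each vrecps step computing (2 - b*x), i.e. one
// Newton-Raphson iteration x = x*(2 - b*x) with quadratic convergence.
// The starting estimate and values below are illustrative only.
#include <cstdio>

static float RefineReciprocal(float b, float x, int steps) {
  for (int i = 0; i < steps; ++i) {
    x = x * (2.0f - b * x);  // one Newton-Raphson step for f(x) = 1/x - b
  }
  return x;
}

int main() {
  const float a = 2.0f, b = 3.7f;
  float x = 0.25f;  // stand-in for the low-precision vrecpe estimate
  x = RefineReciprocal(b, x, 3);
  std::printf("a/b ~= %g (exact %g)\n", a * x, a / b);
  return 0;
}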
++namespace detail { + +-template +-HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { +- static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); +- return Mask128{m.raw}; ++HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, ++ const Vec128 recip) { ++ return Vec128(vrsqrtsq_f32(root.raw, recip.raw)); ++} ++template ++HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, ++ Vec128 recip) { ++ return Vec128(vrsqrts_f32(root.raw, recip.raw)); + } + +-#define HWY_NEON_BUILD_TPL_HWY_COMPARE +-#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 +-#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ +- const Vec128 a, const Vec128 b +-#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw +- +-// ------------------------------ Equality +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) +-#if defined(__aarch64__) +-HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) +-#else +-// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. +-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) +-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) +-#endif ++} // namespace detail + +-// ------------------------------ Strict inequality ++// Not defined on armv7: approximate ++template ++HWY_INLINE Vec128 Sqrt(const Vec128 v) { ++ auto recip = ApproximateReciprocalSqrt(v); + +-// Signed/float < (no unsigned) +-#if defined(__aarch64__) +-HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) +-#else +-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) +-#endif +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) ++ recip *= detail::ReciprocalSqrtStep(v * recip, recip); ++ recip *= detail::ReciprocalSqrtStep(v * recip, recip); ++ recip *= detail::ReciprocalSqrtStep(v * recip, recip); + +-// Signed/float > (no unsigned) +-#if defined(__aarch64__) +-HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE) +-#else +-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE) ++ const auto root = v * recip; ++ return IfThenZeroElse(v == Zero(Simd()), root); ++} + #endif +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE) +- +-// ------------------------------ Weak inequality +- +-// Float <= >= +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE) +- +-#undef HWY_NEON_BUILD_TPL_HWY_COMPARE +-#undef HWY_NEON_BUILD_RET_HWY_COMPARE +-#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE +-#undef HWY_NEON_BUILD_ARG_HWY_COMPARE + + // ================================================== LOGICAL + +@@ -1357,13 +1390,16 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operato + // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. 
+ template + HWY_INLINE Vec128 Not(const Vec128 v) { +- const Full128 d8; +- return Vec128(vmvnq_u8(BitCast(d8, v).raw)); ++ const Full128 d; ++ const Repartition d8; ++ return BitCast(d, Vec128(vmvnq_u8(BitCast(d8, v).raw))); + } + template + HWY_INLINE Vec128 Not(const Vec128 v) { +- const Repartition> d8; +- return Vec128(vmvn_u8(BitCast(d8, v).raw)); ++ const Simd d; ++ const Repartition d8; ++ using V8 = decltype(Zero(d8)); ++ return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); + } + + // ------------------------------ And +@@ -1463,33 +1499,38 @@ HWY_API Vec128 BroadcastSignBit(co + return ShiftRight(v); + } + +-// ------------------------------ Make mask ++// ================================================== MASK + +-template +-HWY_INLINE Mask128 TestBit(Vec128 v, Vec128 bit) { +- static_assert(!hwy::IsFloat(), "Only integer vectors supported"); +- return (v & bit) == bit; +-} ++// ------------------------------ To/from vector + +-// Mask and Vec are the same (true = FF..FF). ++// Mask and Vec have the same representation (true = FF..FF). + template + HWY_INLINE Mask128 MaskFromVec(const Vec128 v) { +- return Mask128(v.raw); ++ const Simd, N> du; ++ return Mask128(BitCast(du, v).raw); + } + ++// DEPRECATED + template + HWY_INLINE Vec128 VecFromMask(const Mask128 v) { +- return Vec128(v.raw); ++ return BitCast(Simd(), Vec128, N>(v.raw)); + } + + template +-HWY_INLINE Vec128 VecFromMask(Simd /* tag */, +- const Mask128 v) { +- return Vec128(v.raw); ++HWY_INLINE Vec128 VecFromMask(Simd d, const Mask128 v) { ++ return BitCast(d, Vec128, N>(v.raw)); + } + +-// IfThenElse(mask, yes, no) +-// Returns mask ? b : a. ++// ------------------------------ RebindMask ++ ++template ++HWY_API Mask128 RebindMask(Simd dto, Mask128 m) { ++ static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); ++ return MaskFromVec(BitCast(dto, VecFromMask(Simd(), m))); ++} ++ ++// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a. ++ + #define HWY_NEON_BUILD_TPL_HWY_IF + #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128 + #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ +@@ -1524,7 +1565,6 @@ HWY_INLINE Vec128 ZeroIfNegative(V + return Max(zero, v); + } + +- + // ------------------------------ Mask logical + + template +@@ -1557,30 +1597,183 @@ HWY_API Mask128 Xor(const Mask128< + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); + } + +-// ------------------------------ Min (IfThenElse, BroadcastSignBit) ++// ================================================== COMPARE + +-namespace detail { ++// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
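// Illustration (scalar sketch): with the all-ones / all-zeros mask
// representation noted above, a per-lane select such as IfThenElse can be
// pictured as a plain bitwise blend of the two inputs with the mask.
// Values below are illustrative only.
#include <cstdint>
#include <cstdio>

static uint32_t SelectBits(uint32_t mask_lane, uint32_t yes, uint32_t no) {
  return (yes & mask_lane) | (no & ~mask_lane);  // mask bits pick 'yes' bits
}

int main() {
  std::printf("%u %u\n", SelectBits(0xFFFFFFFFu, 7u, 9u), SelectBits(0u, 7u, 9u));
  return 0;
}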
+ +-#if defined(__aarch64__) ++// ------------------------------ Shuffle2301 (for i64 compares) + +-HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { +- return Vec128(vcgtq_u64(a.raw, b.raw)); ++// Swap 32-bit halves in 64-bits ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64_u32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64_s32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64_f32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64q_u32(v.raw)); + } +-HWY_INLINE Vec128 Gt(Vec128 a, +- Vec128 b) { +- return Vec128(vcgt_u64(a.raw, b.raw)); ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64q_s32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64q_f32(v.raw)); + } + +-HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { +- return Vec128(vcgtq_s64(a.raw, b.raw)); ++#define HWY_NEON_BUILD_TPL_HWY_COMPARE ++#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 ++#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ ++ const Vec128 a, const Vec128 b ++#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw ++ ++// ------------------------------ Equality ++HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) ++#if HWY_ARCH_ARM_A64 ++HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) ++#else ++// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. ++HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) ++HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) ++#endif ++ ++// ------------------------------ Strict inequality (signed, float) ++#if HWY_ARCH_ARM_A64 ++HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) ++#else ++HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) ++#endif ++HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) ++ ++// ------------------------------ Weak inequality (float) ++HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) ++ ++#undef HWY_NEON_BUILD_TPL_HWY_COMPARE ++#undef HWY_NEON_BUILD_RET_HWY_COMPARE ++#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE ++#undef HWY_NEON_BUILD_ARG_HWY_COMPARE ++ ++// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq) ++ ++#if HWY_ARCH_ARM_V7 ++ ++template ++HWY_INLINE Mask128 operator==(const Vec128 a, ++ const Vec128 b) { ++ const Simd d32; ++ const Simd d64; ++ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); ++ const auto cmp64 = cmp32 & Shuffle2301(cmp32); ++ return MaskFromVec(BitCast(d64, cmp64)); ++} ++ ++template ++HWY_INLINE Mask128 operator==(const Vec128 a, ++ const Vec128 b) { ++ const Simd d32; ++ const Simd d64; ++ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); ++ const auto cmp64 = cmp32 & Shuffle2301(cmp32); ++ return MaskFromVec(BitCast(d64, cmp64)); + } +-HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { +- return Vec128(vcgt_s64(a.raw, b.raw)); ++ ++HWY_INLINE Mask128 operator<(const Vec128 a, ++ const Vec128 b) { ++ const int64x2_t sub = vqsubq_s64(a.raw, b.raw); ++ return MaskFromVec(BroadcastSignBit(Vec128(sub))); ++} ++HWY_INLINE Mask128 operator<(const Vec128 a, ++ const Vec128 b) { ++ const int64x1_t sub = vqsub_s64(a.raw, b.raw); ++ return MaskFromVec(BroadcastSignBit(Vec128(sub))); + } + + #endif + +-} // namespace detail ++// ------------------------------ Reversed comparisons ++ ++template ++HWY_API Mask128 operator>(Vec128 a, Vec128 b) { ++ return 
operator<(b, a); ++} ++template ++HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { ++ return operator<=(b, a); ++} ++ ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask128 FirstN(const Simd d, size_t num) { ++ const RebindToSigned di; // Signed comparisons are cheaper. ++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); ++} ++ ++// ------------------------------ TestBit (Eq) ++ ++#define HWY_NEON_BUILD_TPL_HWY_TESTBIT ++#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128 ++#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ ++ Vec128 v, Vec128 bit ++#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw ++ ++#if HWY_ARCH_ARM_A64 ++HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) ++#else ++// No 64-bit versions on armv7 ++HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) ++HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) ++ ++template ++HWY_INLINE Mask128 TestBit(Vec128 v, ++ Vec128 bit) { ++ return (v & bit) == bit; ++} ++template ++HWY_INLINE Mask128 TestBit(Vec128 v, ++ Vec128 bit) { ++ return (v & bit) == bit; ++} ++ ++#endif ++#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT ++#undef HWY_NEON_BUILD_RET_HWY_TESTBIT ++#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT ++#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT ++ ++// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) ++HWY_INLINE Vec128 Abs(const Vec128 v) { ++#if HWY_ARCH_ARM_A64 ++ return Vec128(vabsq_s64(v.raw)); ++#else ++ const auto zero = Zero(Full128()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++HWY_INLINE Vec128 Abs(const Vec128 v) { ++#if HWY_ARCH_ARM_A64 ++ return Vec128(vabs_s64(v.raw)); ++#else ++ const auto zero = Zero(Simd()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++ ++// ------------------------------ Min (IfThenElse, BroadcastSignBit) ++ ++#if HWY_ARCH_ARM_A64 ++ ++HWY_INLINE Mask128 operator<(Vec128 a, Vec128 b) { ++ return Mask128(vcltq_u64(a.raw, b.raw)); ++} ++HWY_INLINE Mask128 operator<(Vec128 a, ++ Vec128 b) { ++ return Mask128(vclt_u64(a.raw, b.raw)); ++} ++ ++#endif + + // Unsigned + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2) +@@ -1588,8 +1781,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, + template + HWY_INLINE Vec128 Min(const Vec128 a, + const Vec128 b) { +-#if defined(__aarch64__) +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); ++#if HWY_ARCH_ARM_A64 ++ return IfThenElse(b < a, b, a); + #else + const Simd du; + const Simd di; +@@ -1603,8 +1796,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, v + template + HWY_INLINE Vec128 Min(const Vec128 a, + const Vec128 b) { +-#if defined(__aarch64__) +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); ++#if HWY_ARCH_ARM_A64 ++ return IfThenElse(b < a, b, a); + #else + const Vec128 sign = detail::SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b); +@@ -1612,7 +1805,7 @@ HWY_INLINE Vec128 Min(const + } + + // Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN. 
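// Illustration (scalar sketch of the ARMv7 64-bit equality emulation above):
// compare the 32-bit halves independently, then AND each half's mask with its
// swapped neighbour (the Shuffle2301 step), so the 64-bit lane is all-ones
// only when both halves matched. Inputs below are illustrative only.
#include <cstdint>
#include <cstdio>

static uint64_t Eq64ViaHalves(uint64_t a, uint64_t b) {
  const uint32_t lo = (uint32_t)a == (uint32_t)b ? 0xFFFFFFFFu : 0u;
  const uint32_t hi = (uint32_t)(a >> 32) == (uint32_t)(b >> 32) ? 0xFFFFFFFFu : 0u;
  const uint32_t both = lo & hi;  // cmp32 & Shuffle2301(cmp32), collapsed
  return ((uint64_t)both << 32) | both;
}

int main() {
  std::printf("%llx %llx\n",
              (unsigned long long)Eq64ViaHalves(5, 5),
              (unsigned long long)Eq64ViaHalves(5, 1ULL << 40));
  return 0;
}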
+-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2) + #else + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) +@@ -1626,8 +1819,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, + template + HWY_INLINE Vec128 Max(const Vec128 a, + const Vec128 b) { +-#if defined(__aarch64__) +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); ++#if HWY_ARCH_ARM_A64 ++ return IfThenElse(b < a, a, b); + #else + const Simd du; + const Simd di; +@@ -1641,8 +1834,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, v + template + HWY_INLINE Vec128 Max(const Vec128 a, + const Vec128 b) { +-#if defined(__aarch64__) +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); ++#if HWY_ARCH_ARM_A64 ++ return IfThenElse(b < a, a, b); + #else + const Vec128 sign = detail::SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a); +@@ -1650,7 +1843,7 @@ HWY_INLINE Vec128 Max(const + } + + // Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN. +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2) + #else + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) +@@ -1696,7 +1889,7 @@ HWY_INLINE Vec128 LoadU(Full128(vld1q_f32(aligned)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 LoadU(Full128 /* tag */, + const double* HWY_RESTRICT aligned) { + return Vec128(vld1q_f64(aligned)); +@@ -1741,7 +1934,7 @@ HWY_INLINE Vec128 LoadU(Simd(vld1_f32(p)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 LoadU(Simd /* tag */, + const double* HWY_RESTRICT p) { + return Vec128(vld1_f64(p)); +@@ -1755,73 +1948,72 @@ HWY_INLINE Vec128 LoadU(Simd< + // we don't actually care what is in it, and we don't want + // to introduce extra overhead by initializing it to something. 
+ +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint8_t* HWY_RESTRICT p) { +- uint32x2_t a = Undefined(d).raw; ++ uint32x2_t a = Undefined(Simd()).raw; + uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_u8_u32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint16_t* HWY_RESTRICT p) { +- uint32x2_t a = Undefined(d).raw; ++ uint32x2_t a = Undefined(Simd()).raw; + uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_u16_u32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint32_t* HWY_RESTRICT p) { +- uint32x2_t a = Undefined(d).raw; ++ uint32x2_t a = Undefined(Simd()).raw; + uint32x2_t b = vld1_lane_u32(p, a, 0); + return Vec128(b); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int8_t* HWY_RESTRICT p) { +- int32x2_t a = Undefined(d).raw; ++ int32x2_t a = Undefined(Simd()).raw; + int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_s8_s32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int16_t* HWY_RESTRICT p) { +- int32x2_t a = Undefined(d).raw; ++ int32x2_t a = Undefined(Simd()).raw; + int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_s16_s32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int32_t* HWY_RESTRICT p) { +- int32x2_t a = Undefined(d).raw; ++ int32x2_t a = Undefined(Simd()).raw; + int32x2_t b = vld1_lane_s32(p, a, 0); + return Vec128(b); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const float* HWY_RESTRICT p) { +- float32x2_t a = Undefined(d).raw; ++ float32x2_t a = Undefined(Simd()).raw; + float32x2_t b = vld1_lane_f32(p, a, 0); + return Vec128(b); + } + + // ------------------------------ Load 16 + +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint8_t* HWY_RESTRICT p) { +- uint16x4_t a = Undefined(d).raw; ++ uint16x4_t a = Undefined(Simd()).raw; + uint16x4_t b = vld1_lane_u16(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_u8_u16(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint16_t* HWY_RESTRICT p) { +- uint16x4_t a = Undefined(d).raw; ++ uint16x4_t a = Undefined(Simd()).raw; + uint16x4_t b = vld1_lane_u16(p, a, 0); + return Vec128(b); + } +- +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int8_t* HWY_RESTRICT p) { +- int16x4_t a = Undefined(d).raw; ++ int16x4_t a = Undefined(Simd()).raw; + int16x4_t b = vld1_lane_s16(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_s8_s16(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int16_t* HWY_RESTRICT p) { +- int16x4_t a = Undefined(d).raw; ++ int16x4_t a = Undefined(Simd()).raw; + int16x4_t b = vld1_lane_s16(p, a, 0); + return Vec128(b); + } +@@ -1902,7 +2094,7 @@ HWY_INLINE void StoreU(const Vec128 v, Full128 /* tag */, + double* HWY_RESTRICT aligned) { + vst1q_f64(aligned, v.raw); +@@ -1947,7 +2139,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd /* tag */, + double* HWY_RESTRICT p) { + vst1_f64(p, v.raw); +@@ -1959,12 +2151,12 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + uint8_t* HWY_RESTRICT p) { + uint32x2_t a = vreinterpret_u32_u8(v.raw); +- vst1_lane_u32(p, a, 0); ++ vst1_lane_u32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void 
StoreU(const Vec128 v, Simd, + uint16_t* HWY_RESTRICT p) { + uint32x2_t a = vreinterpret_u32_u16(v.raw); +- vst1_lane_u32(p, a, 0); ++ vst1_lane_u32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + uint32_t* HWY_RESTRICT p) { +@@ -1973,12 +2165,12 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + int8_t* HWY_RESTRICT p) { + int32x2_t a = vreinterpret_s32_s8(v.raw); +- vst1_lane_s32(p, a, 0); ++ vst1_lane_s32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + int16_t* HWY_RESTRICT p) { + int32x2_t a = vreinterpret_s32_s16(v.raw); +- vst1_lane_s32(p, a, 0); ++ vst1_lane_s32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + int32_t* HWY_RESTRICT p) { +@@ -1994,7 +2186,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + uint8_t* HWY_RESTRICT p) { + uint16x4_t a = vreinterpret_u16_u8(v.raw); +- vst1_lane_u16(p, a, 0); ++ vst1_lane_u16(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + uint16_t* HWY_RESTRICT p) { +@@ -2003,7 +2195,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + int8_t* HWY_RESTRICT p) { + int16x4_t a = vreinterpret_s16_s8(v.raw); +- vst1_lane_s16(p, a, 0); ++ vst1_lane_s16(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + int16_t* HWY_RESTRICT p) { +@@ -2068,18 +2260,18 @@ HWY_INLINE Vec128 PromoteTo(Fu + const Vec128 v) { + return Vec128(vmovl_u32(v.raw)); + } +-HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, ++HWY_INLINE Vec128 PromoteTo(Full128 d, + const Vec128 v) { +- return Vec128(vmovl_u8(v.raw)); ++ return BitCast(d, Vec128(vmovl_u8(v.raw))); + } +-HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, ++HWY_INLINE Vec128 PromoteTo(Full128 d, + const Vec128 v) { + uint16x8_t a = vmovl_u8(v.raw); +- return Vec128(vreinterpretq_s32_u16(vmovl_u16(vget_low_u16(a)))); ++ return BitCast(d, Vec128(vmovl_u16(vget_low_u16(a)))); + } +-HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, ++HWY_INLINE Vec128 PromoteTo(Full128 d, + const Vec128 v) { +- return Vec128(vmovl_u16(v.raw)); ++ return BitCast(d, Vec128(vmovl_u16(v.raw))); + } + + // Unsigned: zero-extend to half vector. 
+@@ -2105,9 +2297,9 @@ HWY_INLINE Vec128 PromoteTo + return Vec128(vget_low_u64(vmovl_u32(v.raw))); + } + template +-HWY_INLINE Vec128 PromoteTo(Simd /* tag */, ++HWY_INLINE Vec128 PromoteTo(Simd d, + const Vec128 v) { +- return Vec128(vget_low_s16(vmovl_u8(v.raw))); ++ return BitCast(d, Vec128(vget_low_u16(vmovl_u8(v.raw)))); + } + template + HWY_INLINE Vec128 PromoteTo(Simd /* tag */, +@@ -2170,12 +2362,14 @@ HWY_INLINE Vec128 PromoteTo( + + HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, + const Vec128 v) { +- return Vec128(vcvt_f32_f16(v.raw)); ++ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); ++ return Vec128(f32); + } + template + HWY_INLINE Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128(vget_low_f32(vcvt_f32_f16(v.raw))); ++ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); ++ return Vec128(vget_low_f32(f32)); + } + + #else +@@ -2204,7 +2398,7 @@ HWY_INLINE Vec128 PromoteTo(Si + + #endif + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + + HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, + const Vec128 v) { +@@ -2298,12 +2492,13 @@ HWY_INLINE Vec128 DemoteTo(Si + + HWY_INLINE Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{vcvt_f16_f32(v.raw)}; ++ return Vec128{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))}; + } + template + HWY_INLINE Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{vcvt_f16_f32(vcombine_f32(v.raw, v.raw))}; ++ const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw)); ++ return Vec128(vreinterpret_u16_f16(f16)); + } + + #else +@@ -2339,7 +2534,7 @@ HWY_INLINE Vec128 DemoteTo + } + + #endif +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + + HWY_INLINE Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { +@@ -2397,7 +2592,7 @@ HWY_INLINE Vec128 DemoteTo(Si + const Vec128 v) { + Vec128 a = DemoteTo(Simd(), v); + Vec128 b; +- uint16x8_t c = vcombine_s16(a.raw, b.raw); ++ int16x8_t c = vcombine_s16(a.raw, b.raw); + return Vec128(vqmovn_s16(c)); + } + +@@ -2426,7 +2621,7 @@ HWY_INLINE Vec128 ConvertTo( + return Vec128(vcvt_s32_f32(v.raw)); + } + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + + HWY_INLINE Vec128 ConvertTo(Full128 /* tag */, + const Vec128 v) { +@@ -2451,7 +2646,7 @@ HWY_INLINE Vec128 ConvertTo( + + // ------------------------------ Round (IfThenElse, mask, logical) + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + // Toward nearest integer + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1) + +@@ -2472,18 +2667,26 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, + // representation, clearing the lowest 23-exp mantissa bits. This requires 9 + // integer operations and 3 constants, which is likely more expensive. + ++namespace detail { ++ ++// The original value is already the desired result if NaN or the magnitude is ++// large (i.e. the value is already an integer). ++template ++HWY_API Mask128 UseInt(const Vec128 v) { ++ return Abs(v) < Set(Simd(), MantissaEnd()); ++} ++ ++} // namespace detail ++ + template + HWY_INLINE Vec128 Trunc(const Vec128 v) { + const Simd df; +- const Simd di; ++ const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + +- // The original value is already the desired result if NaN or the magnitude is +- // large (i.e. the value is already an integer). 
+- const auto max = Set(df, MantissaEnd()); +- return IfThenElse(Abs(v) < max, int_f, v); ++ return IfThenElse(detail::UseInt(v), int_f, v); + } + + template +@@ -2506,7 +2709,7 @@ HWY_INLINE Vec128 Round(const + template + HWY_INLINE Vec128 Ceil(const Vec128 v) { + const Simd df; +- const Simd di; ++ const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); +@@ -2514,9 +2717,7 @@ HWY_INLINE Vec128 Ceil(const V + // Truncating a positive non-integer ends up smaller; if so, add 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); + +- // Keep original if NaN or the magnitude is large (already an int). +- const auto max = Set(df, MantissaEnd()); +- return IfThenElse(Abs(v) < max, int_f - neg1, v); ++ return IfThenElse(detail::UseInt(v), int_f - neg1, v); + } + + template +@@ -2530,16 +2731,14 @@ HWY_INLINE Vec128 Floor(const + // Truncating a negative non-integer ends up larger; if so, subtract 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); + +- // Keep original if NaN or the magnitude is large (already an int). +- const auto max = Set(df, MantissaEnd()); +- return IfThenElse(Abs(v) < max, int_f + neg1, v); ++ return IfThenElse(detail::UseInt(v), int_f + neg1, v); + } + + #endif + + // ------------------------------ NearestInt (Round) + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + + HWY_INLINE Vec128 NearestInt(const Vec128 v) { + return Vec128(vcvtnq_s32_f32(v.raw)); +@@ -2596,7 +2795,7 @@ HWY_INLINE Vec128 LowerHalf( + HWY_INLINE Vec128 LowerHalf(const Vec128 v) { + return Vec128(vget_low_f32(v.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 LowerHalf(const Vec128 v) { + return Vec128(vget_low_f64(v.raw)); + } +@@ -2629,7 +2828,7 @@ HWY_INLINE Vec128 UpperHalf( + HWY_INLINE Vec128 UpperHalf(const Vec128 v) { + return Vec128(vget_high_f32(v.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 UpperHalf(const Vec128 v) { + return Vec128(vget_high_f64(v.raw)); + } +@@ -2714,7 +2913,7 @@ HWY_INLINE Vec128 ShiftRightLanes( + + // ------------------------------ Broadcast/splat any lane + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + // Unsigned + template + HWY_INLINE Vec128 Broadcast(const Vec128 v) { +@@ -2886,7 +3085,7 @@ HWY_API Vec128 TableLookupBytes(const + const Vec128 from) { + const Full128 d; + const Repartition d8; +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return BitCast(d, Vec128(vqtbl1q_u8(BitCast(d8, bytes).raw, + BitCast(d8, from).raw))); + #else +@@ -2911,33 +3110,58 @@ HWY_INLINE Vec128 TableLookupBytes + BitCast(d8, from).raw))); + } + +-// ------------------------------ Hard-coded shuffles ++// ------------------------------ TableLookupLanes + +-// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). +-// Shuffle0321 rotates one lane to the right (the previous least-significant +-// lane is now most-significant). These could also be implemented via +-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. ++// Returned by SetTableIndices for use by TableLookupLanes. 
++template ++struct Indices128 { ++ typename detail::Raw128::type raw; ++}; + +-// Swap 32-bit halves in 64-bits +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64_u32(v.raw)); +-} +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64_s32(v.raw)); +-} +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64_f32(v.raw)); ++template ++HWY_INLINE Indices128 SetTableIndices(Simd d, const int32_t* idx) { ++#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) ++ for (size_t i = 0; i < N; ++i) { ++ HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); ++ } ++#endif ++ ++ const Repartition d8; ++ alignas(16) uint8_t control[16] = {0}; ++ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { ++ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { ++ control[idx_lane * sizeof(T) + idx_byte] = ++ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); ++ } ++ } ++ return Indices128{BitCast(d, Load(d8, control)).raw}; + } +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64q_u32(v.raw)); ++ ++template ++HWY_INLINE Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64q_s32(v.raw)); ++template ++HWY_INLINE Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64q_f32(v.raw)); ++template ++HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ const Simd di; ++ const auto idx_i = BitCast(di, Vec128{idx.raw}); ++ return BitCast(Simd(), TableLookupBytes(BitCast(di, v), idx_i)); + } + ++// ------------------------------ Other shuffles (TableLookupBytes) ++ ++// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). ++// Shuffle0321 rotates one lane to the right (the previous least-significant ++// lane is now most-significant). These could also be implemented via ++// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. ++ + // Swap 64-bit halves + template + HWY_INLINE Vec128 Shuffle1032(const Vec128 v) { +@@ -2975,49 +3199,6 @@ HWY_INLINE Vec128 Shuffle0123(const V + return TableLookupBytes(v, BitCast(d, Load(d8, bytes))); + } + +-// ------------------------------ TableLookupLanes +- +-// Returned by SetTableIndices for use by TableLookupLanes. 
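// Illustration (scalar sketch of the SetTableIndices expansion above): each
// 32-bit lane index contributes sizeof(T) consecutive byte offsets
// idx*sizeof(T) + 0..sizeof(T)-1, which is the form VTBL-based
// TableLookupBytes consumes. The permutation below is a made-up example.
#include <cstdint>
#include <cstdio>

int main() {
  const int32_t lane_idx[4] = {3, 0, 2, 1};  // hypothetical 32-bit lane order
  uint8_t byte_idx[16];
  for (int lane = 0; lane < 4; ++lane) {
    for (int b = 0; b < 4; ++b) {
      byte_idx[lane * 4 + b] = (uint8_t)(lane_idx[lane] * 4 + b);
    }
  }
  for (int i = 0; i < 16; ++i) std::printf("%d ", byte_idx[i]);
  std::printf("\n");
  return 0;
}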
+-template +-struct Indices128 { +- uint8x16_t raw; +-}; +- +-template +-HWY_INLINE Indices128 SetTableIndices(const Full128, const int32_t* idx) { +-#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) +- const size_t N = 16 / sizeof(T); +- for (size_t i = 0; i < N; ++i) { +- HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); +- } +-#endif +- +- const Full128 d8; +- alignas(16) uint8_t control[16]; +- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { +- const size_t idx_lane = idx_byte / sizeof(T); +- const size_t mod = idx_byte % sizeof(T); +- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; +- } +- return Indices128{Load(d8, control).raw}; +-} +- +-HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128(idx.raw)); +-} +-HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128(idx.raw)); +-} +-HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- const Full128 di; +- const Full128 df; +- return BitCast(df, +- TableLookupBytes(BitCast(di, v), Vec128(idx.raw))); +-} +- + // ------------------------------ Interleave lanes + + // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +@@ -3029,7 +3210,7 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Inter + HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2) + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2) + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + // For 64 bit types, we only have the "q" version of the function defined as + // interleaving 64-wide registers with 64-wide types in them makes no sense. + HWY_INLINE Vec128 InterleaveLower(const Vec128 a, +@@ -3079,7 +3260,7 @@ HWY_INLINE Vec128 InterleaveLower + const Vec128 b) { + return Vec128(vzip1q_f32(a.raw, b.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { + return Vec128(vzip1q_f64(a.raw, b.raw)); +@@ -3090,10 +3271,10 @@ HWY_INLINE Vec128 InterleaveUpper + const Vec128 b) { + return Vec128(vzip2q_f32(a.raw, b.raw)); + } +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_s64(a.raw, b.raw)); ++ return Vec128(vzip2q_f64(a.raw, b.raw)); + } + #endif + +@@ -3105,119 +3286,125 @@ HWY_INLINE Vec128 InterleaveUppe + // Full vectors + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_u8(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u16_u8(vzip1q_u8(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_u16(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u32_u16(vzip1q_u16(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_u32(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u64_u32(vzip1q_u32(a.raw, b.raw))); + } + + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_s8(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s16_s8(vzip1q_s8(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_s16(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s32_s16(vzip1q_s16(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1q_s32(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s64_s32(vzip1q_s32(a.raw, b.raw))); + } + + HWY_INLINE Vec128 
ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_u8(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u16_u8(vzip2q_u8(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_u16(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u32_u16(vzip2q_u16(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_u32(a.raw, b.raw)); ++ return Vec128(vreinterpretq_u64_u32(vzip2q_u32(a.raw, b.raw))); + } + + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_s8(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s16_s8(vzip2q_s8(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_s16(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s32_s16(vzip2q_s16(a.raw, b.raw))); + } + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2q_s32(a.raw, b.raw)); ++ return Vec128(vreinterpretq_s64_s32(vzip2q_s32(a.raw, b.raw))); + } + + // Half vectors or less + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_u8(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_u16_u8(vzip1_u8(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_u16(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_u32_u16(vzip1_u16(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_u32(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_u64_u32(vzip1_u32(a.raw, b.raw))); + } + + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_s8(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_s16_s8(vzip1_s8(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_s16(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_s32_s16(vzip1_s16(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip1_s32(a.raw, b.raw)); ++ return Vec128( ++ vreinterpret_s64_s32(vzip1_s32(a.raw, b.raw))); + } + + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_u8(a.raw, b.raw)); ++ return Vec128(vreinterpret_u16_u8(vzip2_u8(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_u16(a.raw, b.raw)); ++ return Vec128(vreinterpret_u32_u16(vzip2_u16(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_u32(a.raw, b.raw)); ++ return Vec128(vreinterpret_u64_u32(vzip2_u32(a.raw, b.raw))); + } + + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_s8(a.raw, b.raw)); ++ return Vec128(vreinterpret_s16_s8(vzip2_s8(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_s16(a.raw, b.raw)); ++ return Vec128(vreinterpret_s32_s16(vzip2_s16(a.raw, b.raw))); + } + template + HWY_INLINE Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128(vzip2_s32(a.raw, b.raw)); ++ return Vec128(vreinterpret_s64_s32(vzip2_s32(a.raw, b.raw))); + } + + // ------------------------------ Blocks +@@ -3274,84 +3461,113 @@ HWY_INLINE Vec128 OddEven(const Vec12 + + // ================================================== MISC + +-// Returns a vector 
with lane i=[0, N) set to "first" + i. +-template +-Vec128 Iota(const Simd d, const T2 first) { +- HWY_ALIGN T lanes[16 / sizeof(T)]; +- for (size_t i = 0; i < 16 / sizeof(T); ++i) { +- lanes[i] = static_cast(first + static_cast(i)); ++// ------------------------------ Scatter (Store) ++ ++template ++HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); ++ } ++} ++ ++template ++HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ for (size_t i = 0; i < N; ++i) { ++ base[index_lanes[i]] = lanes[i]; + } +- return Load(d, lanes); + } + +-// ------------------------------ Gather (requires GetLane) ++// ------------------------------ Gather (Load/Store) + + template + HWY_API Vec128 GatherOffset(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 offset) { +- static_assert(N == 1, "NEON does not support full gather"); +- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); +- const uintptr_t address = reinterpret_cast(base) + GetLane(offset); +- T val; +- CopyBytes(reinterpret_cast(address), &val); +- return Set(d, val); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ alignas(16) T lanes[N]; ++ const uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); ++ } ++ return Load(d, lanes); + } + + template + HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, + const Vec128 index) { +- static_assert(N == 1, "NEON does not support full gather"); +- static_assert(sizeof(T) == sizeof(Index), "T must match Index"); +- return Set(d, base[GetLane(index)]); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ alignas(16) T lanes[N]; ++ for (size_t i = 0; i < N; ++i) { ++ lanes[i] = base[index_lanes[i]]; ++ } ++ return Load(d, lanes); + } + +-// ------------------------------ ARMv7 int64 comparisons (requires Shuffle2301) ++// ------------------------------ Reductions + +-#if !defined(__aarch64__) ++namespace detail { + +-template +-HWY_INLINE Mask128 operator==(const Vec128 a, +- const Vec128 b) { +- const Simd d32; +- const Simd d64; +- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); +- const auto cmp64 = cmp32 & Shuffle2301(cmp32); +- return MaskFromVec(BitCast(d64, cmp64)); ++// N=1 for any T: no-op ++template ++HWY_API Vec128 SumOfLanes(const Vec128 v) { ++ return v; + } +- +-template +-HWY_INLINE Mask128 operator==(const Vec128 a, +- const Vec128 b) { +- const Simd d32; +- const Simd d64; +- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); +- const auto cmp64 = cmp32 & Shuffle2301(cmp32); +- return MaskFromVec(BitCast(d64, cmp64)); ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag 
/* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; + } + +-HWY_INLINE Mask128 operator<(const Vec128 a, +- const Vec128 b) { +- const int64x2_t sub = vqsubq_s64(a.raw, b.raw); +- return MaskFromVec(BroadcastSignBit(Vec128(sub))); ++// u32/i32/f32: N=2 ++template ++HWY_API Vec128 SumOfLanes(const Vec128 v10) { ++ return v10 + Shuffle2301(v10); + } +-HWY_INLINE Mask128 operator<(const Vec128 a, +- const Vec128 b) { +- const int64x1_t sub = vqsub_s64(a.raw, b.raw); +- return MaskFromVec(BroadcastSignBit(Vec128(sub))); ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Min(v10, Shuffle2301(v10)); + } +- +-template +-HWY_INLINE Mask128 operator>(const Vec128 a, +- const Vec128 b) { +- return b < a; ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Max(v10, Shuffle2301(v10)); + } +-#endif +- +-// ------------------------------ Reductions + +-#if defined(__aarch64__) +-// Supported for 32b and 64b vector types. Returns the sum in each lane. ++// full vectors ++#if HWY_ARCH_ARM_A64 + HWY_INLINE Vec128 SumOfLanes(const Vec128 v) { + return Vec128(vdupq_n_u32(vaddvq_u32(v.raw))); + } +@@ -3398,20 +3614,15 @@ HWY_INLINE Vec128 SumOfLanes(co + } + #endif + +-namespace detail { +- +-// For u32/i32/f32. +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); +@@ -3419,15 +3630,13 @@ HWY_API Vec128 MaxOfLanes(hwy::Siz + } + + // For u64/i64[/f64]. +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); + } +@@ -3435,6 +3644,10 @@ HWY_API Vec128 MaxOfLanes(hwy::Siz + } // namespace detail + + template ++HWY_API Vec128 SumOfLanes(const Vec128 v) { ++ return detail::SumOfLanes(v); ++} ++template + HWY_API Vec128 MinOfLanes(const Vec128 v) { + return detail::MinOfLanes(hwy::SizeTag(), v); + } +@@ -3457,18 +3670,18 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Vec128 values = + BitCast(du, VecFromMask(Full128(), mask)) & Load(du, kSliceLanes); + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + // Can't vaddv - we need two separate bytes (16 bits). + const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); + const uint8x8_t x4 = vpadd_u8(x2, x2); + const uint8x8_t x8 = vpadd_u8(x4, x4); +- return vreinterpret_u16_u8(x8)[0]; ++ return vget_lane_u64(vreinterpret_u64_u8(x8), 0); + #else + // Don't have vpaddq, so keep doubling lane size. 
+ const uint16x8_t x2 = vpaddlq_u8(values.raw); + const uint32x4_t x4 = vpaddlq_u16(x2); + const uint64x2_t x8 = vpaddlq_u32(x4); +- return (uint64_t(x8[1]) << 8) | x8[0]; ++ return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); + #endif + } + +@@ -3484,7 +3697,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Vec128 slice(Load(Simd(), kSliceLanes).raw); + const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddv_u8(values.raw); + #else + const uint16x4_t x2 = vpaddl_u8(values.raw); +@@ -3503,7 +3716,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_u16(values.raw); + #else + const uint32x4_t x2 = vpaddlq_u16(values.raw); +@@ -3522,7 +3735,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Simd du; + const Vec128 slice(Load(Simd(), kSliceLanes).raw); + const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddv_u16(values.raw); + #else + const uint32x2_t x2 = vpaddl_u16(values.raw); +@@ -3539,7 +3752,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_u32(values.raw); + #else + const uint64x2_t x2 = vpaddlq_u32(values.raw); +@@ -3557,7 +3770,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Simd du; + const Vec128 slice(Load(Simd(), kSliceLanes).raw); + const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddv_u32(values.raw); + #else + const uint64x1_t x2 = vpaddl_u32(values.raw); +@@ -3572,7 +3785,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const Full128 du; + const Vec128 values = + BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_u64(values.raw); + #else + return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); +@@ -3612,13 +3825,13 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + const int8x16_t ones = + vnegq_s8(BitCast(di, VecFromMask(Full128(), mask)).raw); + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_s8(ones); + #else + const int16x8_t x2 = vpaddlq_s8(ones); + const int32x4_t x4 = vpaddlq_s16(x2); + const int64x2_t x8 = vpaddlq_s32(x4); +- return x8[0] + x8[1]; ++ return vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1); + #endif + } + template +@@ -3627,12 +3840,12 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + const int16x8_t ones = + vnegq_s16(BitCast(di, VecFromMask(Full128(), mask)).raw); + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_s16(ones); + #else + const int32x4_t x2 = vpaddlq_s16(ones); + const int64x2_t x4 = vpaddlq_s32(x2); +- return x4[0] + x4[1]; ++ return vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1); + #endif + } + +@@ -3642,26 +3855,26 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + const int32x4_t ones = + vnegq_s32(BitCast(di, VecFromMask(Full128(), mask)).raw); + +-#if defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + return vaddvq_s32(ones); + #else + const int64x2_t x2 = vpaddlq_s32(ones); +- return x2[0] + x2[1]; ++ return vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1); + #endif + } + + template + HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 mask) { +-#if 
defined(__aarch64__) ++#if HWY_ARCH_ARM_A64 + const Full128 di; + const int64x2_t ones = + vnegq_s64(BitCast(di, VecFromMask(Full128(), mask)).raw); + return vaddvq_s64(ones); + #else +- const Full128 di; +- const int64x2_t ones = +- vshrq_n_u64(BitCast(di, VecFromMask(Full128(), mask)).raw, 63); +- return ones[0] + ones[1]; ++ const Full128 du; ++ const auto mask_u = VecFromMask(du, RebindMask(du, mask)); ++ const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); ++ return vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1); + #endif + } + +@@ -3690,9 +3903,15 @@ HWY_INLINE size_t StoreMaskBits(const Ma + // Full + template + HWY_INLINE bool AllFalse(const Mask128 m) { ++#if HWY_ARCH_ARM_A64 ++ const Full128 d32; ++ const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(Full128(), m))); ++ return (vmaxvq_u32(m32.raw) == 0); ++#else + const auto v64 = BitCast(Full128(), VecFromMask(Full128(), m)); + uint32x2_t a = vqmovn_u64(v64.raw); +- return vreinterpret_u64_u32(a)[0] == 0; ++ return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0; ++#endif + } + + // Partial +@@ -3711,8 +3930,160 @@ HWY_INLINE bool AllTrue(const Mask128 Load8Bytes(Full128 /*d*/, ++ const uint8_t* bytes) { ++ return Vec128(vreinterpretq_u8_u64( ++ vld1q_dup_u64(reinterpret_cast(bytes)))); ++} ++ ++// Load 8 bytes and return half-reg with N <= 8 bytes. ++template ++HWY_INLINE Vec128 Load8Bytes(Simd d, ++ const uint8_t* bytes) { ++ return Load(d, bytes); ++} ++ + template +-HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { ++HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<2> /*tag*/, ++ const uint64_t mask_bits) { ++ HWY_DASSERT(mask_bits < 256); ++ const Simd d; ++ const Repartition d8; ++ const Simd du; ++ ++ // ARM does not provide an equivalent of AVX2 permutevar, so we need byte ++ // indices for VTBL (one vector's worth for each of 256 combinations of ++ // 8 mask bits). Loading them directly would require 4 KiB. We can instead ++ // store lane indices and convert to byte indices (2*lane + 0..1), with the ++ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane ++ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. ++ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles ++ // is likely more costly than the higher cache footprint from storing bytes. 
++ alignas(16) constexpr uint8_t table[256 * 8] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, ++ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, ++ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, ++ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, ++ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, ++ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, ++ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, ++ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, ++ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, ++ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, ++ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, ++ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, ++ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, ++ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, ++ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, ++ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, ++ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, ++ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, ++ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, ++ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, ++ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, ++ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, ++ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, ++ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, ++ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, ++ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, ++ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, ++ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, ++ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, ++ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, ++ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, ++ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, ++ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, ++ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, ++ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, ++ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, ++ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, ++ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, ++ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, ++ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, ++ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, ++ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, ++ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, ++ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, ++ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, ++ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, ++ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, ++ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, ++ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, ++ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, ++ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, ++ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, ++ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, ++ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, ++ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, ++ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, ++ 0, 
0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, ++ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, ++ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, ++ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, ++ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, ++ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, ++ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, ++ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, ++ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, ++ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, ++ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, ++ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, ++ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, ++ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, ++ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, ++ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, ++ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, ++ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, ++ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, ++ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, ++ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, ++ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, ++ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, ++ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, ++ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, ++ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, ++ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, ++ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, ++ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, ++ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, ++ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, ++ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, ++ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, ++ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, ++ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, ++ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, ++ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, ++ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, ++ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, ++ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, ++ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, ++ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, ++ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, ++ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, ++ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, ++ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, ++ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, ++ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, ++ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, ++ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, ++ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, ++ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, ++ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, ++ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, ++ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; ++ ++ const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); ++ const Vec128 pairs = ZipLower(byte_idx, byte_idx); ++ return BitCast(d, pairs + Set(du, 0x0100)); ++} ++ 
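// Illustration (scalar sketch of the 16-bit Compress indexing above): for a
// given 8-bit mask the table stores the doubled indices (2*lane) of the set
// lanes; zipping each byte with itself and adding 0x0100 per 16-bit pair then
// yields the byte pairs (2*lane, 2*lane+1) that TableLookupBytes consumes.
// The example mask below is arbitrary.
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned mask_bits = 0xB2;  // lanes 1, 4, 5, 7 selected
  uint8_t doubled[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  int out = 0;
  for (int lane = 0; lane < 8; ++lane) {
    if (mask_bits & (1u << lane)) doubled[out++] = (uint8_t)(2 * lane);
  }
  for (int i = 0; i < out; ++i) {
    std::printf("(%d,%d) ", doubled[i], doubled[i] + 1);  // ZipLower + 0x0100
  }
  std::printf("\n");
  return 0;
}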
++template ++HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<4> /*tag*/, ++ const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + + // There are only 4 lanes, so we can afford to load the index vector directly. +@@ -3742,7 +4113,8 @@ HWY_INLINE Vec128 Idx32x4FromBits( + #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64 + + template +-HWY_INLINE Vec128 Idx64x2FromBits(const uint64_t mask_bits) { ++HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<8> /*tag*/, ++ const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 4); + + // There are only 2 lanes, so we can afford to load the index vector directly. +@@ -3761,59 +4133,15 @@ HWY_INLINE Vec128 Idx64x2FromBits( + + // Helper function called by both Compress and CompressStore - avoids a + // redundant BitsFromMask in the latter. +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +- +-#if HWY_CAP_INTEGER64 +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +- +-#endif +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); +-} +- +-#if HWY_CAP_FLOAT64 +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); ++template ++HWY_API Vec128 Compress(Vec128 v, const uint64_t mask_bits) { ++ const auto idx = ++ detail::IdxFromBits(hwy::SizeTag(), mask_bits); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } + +-#endif +- + } // namespace detail + + template +@@ -3831,6 +4159,79 @@ HWY_API size_t CompressStore(Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ Full128 /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw}; ++ vst3q_u8(unaligned, triple); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; ++ vst3_u8(unaligned, triple); ++} ++ ++// <= 32 bits: avoid writing more than N bytes by copying to buffer ++template ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ alignas(16) uint8_t buf[24]; ++ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; ++ vst3_u8(buf, triple); ++ CopyBytes(buf, unaligned); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++// 128 bits ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, ++ Full128 /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; ++ vst4q_u8(unaligned, quad); 
++} ++ ++// 64 bits ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; ++ vst4_u8(unaligned, quad); ++} ++ ++// <= 32 bits: avoid writing more than N bytes by copying to buffer ++template ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ alignas(16) uint8_t buf[32]; ++ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; ++ vst4_u8(buf, quad); ++ CopyBytes(buf, unaligned); ++} ++ + // ================================================== Operator wrapper + + // These apply to all x86_*-inl.h because there are no restrictions on V. +@@ -3885,7 +4286,8 @@ HWY_API auto Le(V a, V b) -> decltype(a + return a <= b; + } + +-#if !defined(__aarch64__) ++namespace detail { // for code folding ++#if HWY_ARCH_ARM_V7 + #undef vuzp1_s8 + #undef vuzp1_u8 + #undef vuzp1_s16 +@@ -3972,6 +4374,7 @@ HWY_API auto Le(V a, V b) -> decltype(a + #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 + #undef HWY_NEON_DEF_FUNCTION_UINTS + #undef HWY_NEON_EVAL ++} // namespace detail + + // NOLINTNEXTLINE(google-readability-namespace-comments) + } // namespace HWY_NAMESPACE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 2021-06-02 10:56:05.230904367 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -39,6 +39,11 @@ using TFromV = TFromD>; + hwy::EnableIf>() && !IsFloat>()>* = nullptr + #define HWY_IF_FLOAT_V(V) hwy::EnableIf>()>* = nullptr + ++// kShift = log2 of multiplier: 0 for m1, 1 for m2, -2 for mf4 ++template ++using Full = Simd> (-kShift)) ++ : (HWY_LANES(T) << kShift)>; ++ + // ================================================== MACROS + + // Generate specializations and function definitions using X macros. Although +@@ -58,29 +63,30 @@ namespace detail { // for code folding + + // For given SEW, iterate over all LMUL. Precompute SEW/LMUL => MLEN because the + // preprocessor cannot easily do it. 
+-#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 1, 8, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 2, 4, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 4, 2, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 8, 1, NAME, OP) +- +-#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 1, 16, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 2, 8, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 4, 4, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 8, 2, NAME, OP) +- +-#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 1, 32, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 2, 16, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 4, 8, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 8, 4, NAME, OP) +- +-#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 1, 64, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 2, 32, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 4, 16, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 8, 8, NAME, OP) ++// TODO(janwas): GCC does not yet support fractional LMUL ++#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m1, /*kShift=*/0, /*MLEN=*/8, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m2, /*kShift=*/1, /*MLEN=*/4, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m4, /*kShift=*/2, /*MLEN=*/2, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m8, /*kShift=*/3, /*MLEN=*/1, NAME, OP) ++ ++#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m1, /*kShift=*/0, /*MLEN=*/16, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m2, /*kShift=*/1, /*MLEN=*/8, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m4, /*kShift=*/2, /*MLEN=*/4, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m8, /*kShift=*/3, /*MLEN=*/2, NAME, OP) ++ ++#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m1, /*kShift=*/0, /*MLEN=*/32, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m2, /*kShift=*/1, /*MLEN=*/16, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m4, /*kShift=*/2, /*MLEN=*/8, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m8, /*kShift=*/3, /*MLEN=*/4, NAME, OP) ++ ++#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m1, /*kShift=*/0, /*MLEN=*/64, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m2, /*kShift=*/1, /*MLEN=*/32, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m4, /*kShift=*/2, /*MLEN=*/16, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m8, /*kShift=*/3, /*MLEN=*/8, NAME, OP) + + // SEW for unsigned: + #define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP) \ +@@ -153,63 +159,61 @@ namespace detail { // for code folding + + // Assemble types for use in x-macros + #define HWY_RVV_T(BASE, SEW) BASE##SEW##_t +-#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##m##LMUL +-#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##m##LMUL##_t ++#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##LMUL ++#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t + #define HWY_RVV_M(MLEN) vbool##MLEN##_t + + } // namespace detail + + // TODO(janwas): remove typedefs and only use HWY_RVV_V etc. directly + +-// TODO(janwas): do we want fractional LMUL? (can encode as negative) +-// Mixed-precision code can use LMUL 1..8 and that should be enough unless they +-// need many registers. 
+-#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- using HWY_RVV_D(CHAR, SEW, LMUL) = \ +- Simd; \ +- using V##CHAR##SEW##m##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \ +- template <> \ +- struct DFromV_t { \ +- using Lane = HWY_RVV_T(BASE, SEW); \ +- using type = Simd; \ ++// Until we have full intrinsic support for fractional LMUL, mixed-precision ++// code can use LMUL 1..8 (adequate unless they need many registers). ++#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ using HWY_RVV_D(CHAR, SEW, LMUL) = Full; \ ++ using V##CHAR##SEW##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \ ++ template <> \ ++ struct DFromV_t { \ ++ using Lane = HWY_RVV_T(BASE, SEW); \ ++ using type = Full; \ + }; + using Vf16m1 = vfloat16m1_t; + using Vf16m2 = vfloat16m2_t; + using Vf16m4 = vfloat16m4_t; + using Vf16m8 = vfloat16m8_t; +-using Df16m1 = Simd; +-using Df16m2 = Simd; +-using Df16m4 = Simd; +-using Df16m8 = Simd; ++using Df16m1 = Full; ++using Df16m2 = Full; ++using Df16m4 = Full; ++using Df16m8 = Full; + + HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _) + #undef HWY_SPECIALIZE + + // vector = f(d), e.g. Zero +-#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(CHAR, SEW, LMUL) d) { \ + (void)Lanes(d); \ +- return v##OP##_##CHAR##SEW##m##LMUL(); \ ++ return v##OP##_##CHAR##SEW##LMUL(); \ + } + + // vector = f(vector), e.g. Not +-#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_v_##CHAR##SEW##m##LMUL(v); \ ++ return v##OP##_v_##CHAR##SEW##LMUL(v); \ + } + + // vector = f(vector, scalar), e.g. detail::Add +-#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ +- return v##OP##_##CHAR##SEW##m##LMUL(a, b); \ ++#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ ++ return v##OP##_##CHAR##SEW##LMUL(a, b); \ + } + + // vector = f(vector, vector), e.g. Add +-#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(a, b); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(a, b); \ + } + + // ================================================== INIT +@@ -218,9 +222,9 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _) + + // WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL! + // vlenb is not exposed through intrinsics and vreadvl is not VLMAX. 
+-#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \ +- return v##OP##SEW##m##LMUL(); \ ++#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \ ++ return v##OP##SEW##LMUL(); \ + } + + HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e) +@@ -233,19 +237,31 @@ HWY_RVV_FOREACH(HWY_RVV_RETV_ARGD, Zero, + template + using VFromD = decltype(Zero(D())); + ++// Partial ++template ++HWY_API VFromD> Zero(Simd /*tag*/) { ++ return Zero(Full()); ++} ++ + // ------------------------------ Set + // vector = f(d, scalar), e.g. Set +-#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, HWY_RVV_T(BASE, SEW) arg) { \ + (void)Lanes(d); \ +- return v##OP##_##CHAR##SEW##m##LMUL(arg); \ ++ return v##OP##_##CHAR##SEW##LMUL(arg); \ + } + + HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x) + HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f) + #undef HWY_RVV_SET + ++// Partial vectors ++template ++HWY_API VFromD> Set(Simd /*tag*/, T arg) { ++ return Set(Full(), arg); ++} ++ + // ------------------------------ Undefined + + // RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized +@@ -265,7 +281,7 @@ HWY_API VFromD Undefined(D d) { + namespace detail { + + // u8: no change +-#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v; \ +@@ -276,25 +292,25 @@ namespace detail { + } + + // Other integers +-#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_v_##CHAR##SEW##m##LMUL##_u8m##LMUL(v); \ +- } \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ +- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \ +- return v##OP##_v_u8m##LMUL##_##CHAR##SEW##m##LMUL(v); \ ++#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \ ++ } \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \ ++ return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \ + } + + // Float: first cast to/from unsigned +-#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_v_u##SEW##m##LMUL##_u8m##LMUL( \ +- v##OP##_v_f##SEW##m##LMUL##_u##SEW##m##LMUL(v)); \ +- } \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ +- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \ +- return v##OP##_v_u##SEW##m##LMUL##_f##SEW##m##LMUL( \ +- v##OP##_v_u8m##LMUL##_u##SEW##m##LMUL(v)); \ ++#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \ ++ v##OP##_v_f##SEW##LMUL##_u##SEW##LMUL(v)); \ ++ } \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \ ++ return v##OP##_v_u##SEW##LMUL##_f##SEW##LMUL( \ ++ 
v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \ + } + + HWY_RVV_FOREACH_U08(HWY_RVV_CAST_NOP, _, _) +@@ -315,6 +331,12 @@ HWY_API VFromD BitCast(D d, FromV v) + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); + } + ++// Partial ++template ++HWY_API VFromD> BitCast(Simd /*tag*/, FromV v) { ++ return BitCast(Full(), v); ++} ++ + namespace detail { + + template >> +@@ -336,6 +358,12 @@ HWY_API VFromD Iota0(const D /*d*/) + return BitCastToUnsigned(Iota0(DU())); + } + ++// Partial ++template ++HWY_API VFromD> Iota0(Simd /*tag*/) { ++ return Iota0(Full()); ++} ++ + } // namespace detail + + // ================================================== LOGICAL +@@ -370,11 +398,11 @@ HWY_API V And(const V a, const V b) { + // ------------------------------ Or + + // Scalar argument plus mask. Used by VecFromMask. +-#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_T(BASE, SEW) imm, \ + HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff) { \ +- return v##OP##_##CHAR##SEW##m##LMUL##_m(mask, maskedoff, v, imm); \ ++ return v##OP##_##CHAR##SEW##LMUL##_m(mask, maskedoff, v, imm); \ + } + + namespace detail { +@@ -466,14 +494,14 @@ HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, + // ------------------------------ ShiftLeft[Same] + + // Intrinsics do not define .vi forms, so use .vx instead. +-#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- template \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, kBits); \ +- } \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ +- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, static_cast(bits)); \ ++#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ template \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits); \ ++ } \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ ++ return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast(bits)); \ + } + + HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll) +@@ -486,19 +514,18 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRi + #undef HWY_RVV_SHIFT + + // ------------------------------ Shl +-#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, bits); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, bits); \ + } + + HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll) + +-#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, \ +- detail::BitCastToUnsigned(bits)); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits)); \ + } + + HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll) +@@ -569,11 +596,11 @@ HWY_API V ApproximateReciprocalSqrt(cons + + // ------------------------------ MulAdd + // Note: op is still named vv, not vvv. 
+-#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \ + HWY_RVV_V(BASE, SEW, LMUL) add) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(add, mul, x); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x); \ + } + + HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc) +@@ -596,11 +623,11 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub + // of all bits; SLEN 8 / LMUL 4 = half of all bits. + + // mask = f(vector, vector) +-#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_M(MLEN) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ + (void)Lanes(DFromV()); \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL##_b##MLEN(a, b); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b); \ + } + + // ------------------------------ Eq +@@ -675,11 +702,11 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xo + #undef HWY_RVV_RETM_ARGMM + + // ------------------------------ IfThenElse +-#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ +- HWY_RVV_V(BASE, SEW, LMUL) no) { \ +- return v##OP##_vvm_##CHAR##SEW##m##LMUL(m, no, yes); \ ++#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ ++ HWY_RVV_V(BASE, SEW, LMUL) no) { \ ++ return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes); \ + } + + HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge) +@@ -710,7 +737,7 @@ template + using MFromD = decltype(MaskFromVec(Zero(D()))); + + template +-HWY_API MFromD RebindMask(const D d, const MFrom mask) { ++HWY_API MFromD RebindMask(const D /*d*/, const MFrom mask) { + // No need to check lane size/LMUL are the same: if not, casting MFrom to + // MFromD would fail. 
+ return mask; +@@ -774,17 +801,17 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, + + // ------------------------------ Load + +-#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \ +- const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ +- (void)Lanes(d); \ +- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p); \ ++#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \ ++ const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ ++ (void)Lanes(d); \ ++ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p); \ + } + HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le) + #undef HWY_RVV_LOAD + +-// Partial load ++// Partial + template + HWY_API VFromD> Load(Simd d, const T* HWY_RESTRICT p) { + return Load(d, p); +@@ -800,16 +827,22 @@ HWY_API VFromD LoadU(D d, const TFrom + + // ------------------------------ Store + +-#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ +- HWY_RVV_D(CHAR, SEW, LMUL) d, \ +- HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ +- (void)Lanes(d); \ +- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p, v); \ ++#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ ++ HWY_RVV_D(CHAR, SEW, LMUL) d, \ ++ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ ++ (void)Lanes(d); \ ++ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v); \ + } + HWY_RVV_FOREACH(HWY_RVV_RET_ARGVDP, Store, se) + #undef HWY_RVV_RET_ARGVDP + ++// Partial ++template ++HWY_API void Store(VFromD> v, Simd d, T* HWY_RESTRICT p) { ++ return Store(v, Full(), p); ++} ++ + // ------------------------------ StoreU + + // RVV only requires lane alignment, not natural alignment of the entire vector. 
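Side note on the kShift parameter now threaded through these x-macros: it encodes log2(LMUL) (0 for m1, 1 for m2, ..., negative values reserved for fractional LMUL), and the Full alias introduced earlier scales HWY_LANES(T) accordingly. A small self-contained sketch of that mapping; LanesForShift and the base lane count of 64 are illustrative stand-ins, not Highway names:

#include <cstddef>

constexpr size_t LanesForShift(size_t base_lanes, int kShift) {
  // kShift = log2(LMUL): m1 -> 0, m2 -> 1, m4 -> 2, m8 -> 3; fractional LMUL
  // (mf2, mf4) would use negative shifts once the intrinsics support them.
  return kShift < 0 ? (base_lanes >> -kShift) : (base_lanes << kShift);
}

static_assert(LanesForShift(64, 0) == 64, "m1 keeps the base lane count");
static_assert(LanesForShift(64, 3) == 512, "m8 multiplies lanes by 8");
static_assert(LanesForShift(64, -2) == 16, "mf4 divides lanes by 4");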
+@@ -825,19 +858,62 @@ HWY_API void Stream(const V v, D d, T* H + Store(v, d, aligned); + } + ++// ------------------------------ ScatterOffset ++ ++#define HWY_RVV_SCATTER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ ++ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ ++ HWY_RVV_V(int, SEW, LMUL) offset) { \ ++ return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ ++ base, detail::BitCastToUnsigned(offset), v); \ ++ } ++HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sx) ++#undef HWY_RVV_SCATTER ++ ++// Partial ++template ++HWY_API void ScatterOffset(VFromD> v, Simd d, ++ T* HWY_RESTRICT base, ++ VFromD, N>> offset) { ++ return ScatterOffset(v, Full(), base, offset); ++} ++ ++// ------------------------------ ScatterIndex ++ ++template ++HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, ++ const VFromD> index) { ++ return ScatterOffset(v, d, base, ShiftLeft<2>(index)); ++} ++ ++template ++HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, ++ const VFromD> index) { ++ return ScatterOffset(v, d, base, ShiftLeft<3>(index)); ++} ++ + // ------------------------------ GatherOffset + +-#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ +- const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ +- HWY_RVV_V(int, SEW, LMUL) offset) { \ +- return v##OP##ei##SEW##_v_##CHAR##SEW##m##LMUL( \ +- base, detail::BitCastToUnsigned(offset)); \ ++#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ ++ const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ ++ HWY_RVV_V(int, SEW, LMUL) offset) { \ ++ return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ ++ base, detail::BitCastToUnsigned(offset)); \ + } + HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lx) + #undef HWY_RVV_GATHER + ++// Partial ++template ++HWY_API VFromD> GatherOffset(Simd d, ++ const T* HWY_RESTRICT base, ++ VFromD, N>> offset) { ++ return GatherOffset(Full(), base, offset); ++} ++ + // ------------------------------ GatherIndex + + template +@@ -852,37 +928,101 @@ HWY_API VFromD GatherIndex(D d, const + return GatherOffset(d, base, ShiftLeft<3>(index)); + } + +-// ================================================== CONVERT ++// ------------------------------ StoreInterleaved3 + +-// ------------------------------ PromoteTo U ++#define HWY_RVV_STORE3(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API void NAME( \ ++ HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b, \ ++ HWY_RVV_V(BASE, SEW, LMUL) c, HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ ++ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ ++ const v##BASE##SEW##LMUL##x3_t triple = \ ++ vcreate_##CHAR##SEW##LMUL##x3(a, b, c); \ ++ return v##OP##e8_v_##CHAR##SEW##LMUL##x3(unaligned, triple); \ ++ } ++// Segments are limited to 8 registers, so we can only go up to LMUL=2. 
++HWY_RVV_STORE3(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved3, sseg3) ++HWY_RVV_STORE3(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved3, sseg3) + +-HWY_API Vu16m2 PromoteTo(Du16m2 /* d */, Vu8m1 v) { return vzext_vf2_u16m2(v); } +-HWY_API Vu16m4 PromoteTo(Du16m4 /* d */, Vu8m2 v) { return vzext_vf2_u16m4(v); } +-HWY_API Vu16m8 PromoteTo(Du16m8 /* d */, Vu8m4 v) { return vzext_vf2_u16m8(v); } ++#undef HWY_RVV_STORE3 + +-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, Vu8m1 v) { return vzext_vf4_u32m4(v); } +-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, Vu8m2 v) { return vzext_vf4_u32m8(v); } ++// Partial ++template ++HWY_API void StoreInterleaved3(VFromD> v0, VFromD> v1, ++ VFromD> v2, Simd /*tag*/, ++ T* unaligned) { ++ return StoreInterleaved3(v0, v1, v2, Full(), unaligned); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++#define HWY_RVV_STORE4(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API void NAME( \ ++ HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ ++ HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ ++ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \ ++ const v##BASE##SEW##LMUL##x4_t quad = \ ++ vcreate_##CHAR##SEW##LMUL##x4(v0, v1, v2, v3); \ ++ return v##OP##e8_v_##CHAR##SEW##LMUL##x4(aligned, quad); \ ++ } ++// Segments are limited to 8 registers, so we can only go up to LMUL=2. ++HWY_RVV_STORE4(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved4, sseg4) ++HWY_RVV_STORE4(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved4, sseg4) + +-HWY_API Vu32m2 PromoteTo(Du32m2 /* d */, const Vu16m1 v) { +- return vzext_vf2_u32m2(v); +-} +-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, const Vu16m2 v) { +- return vzext_vf2_u32m4(v); +-} +-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, const Vu16m4 v) { +- return vzext_vf2_u32m8(v); +-} ++#undef HWY_RVV_STORE4 + +-HWY_API Vu64m2 PromoteTo(Du64m2 /* d */, const Vu32m1 v) { +- return vzext_vf2_u64m2(v); +-} +-HWY_API Vu64m4 PromoteTo(Du64m4 /* d */, const Vu32m2 v) { +- return vzext_vf2_u64m4(v); +-} +-HWY_API Vu64m8 PromoteTo(Du64m8 /* d */, const Vu32m4 v) { +- return vzext_vf2_u64m8(v); ++// Partial ++template ++HWY_API void StoreInterleaved4(VFromD> v0, VFromD> v1, ++ VFromD> v2, VFromD> v3, ++ Simd /*tag*/, T* unaligned) { ++ return StoreInterleaved4(v0, v1, v2, v3, Full(), unaligned); + } + ++// ================================================== CONVERT ++ ++#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN) \ ++ HWY_API HWY_RVV_V(BASE, BITS, LMUL) \ ++ PromoteTo(HWY_RVV_D(CHAR, BITS, LMUL) /*d*/, \ ++ HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \ ++ return OP##CHAR##BITS##LMUL(v); \ ++ } ++ ++// TODO(janwas): GCC does not yet support fractional LMUL ++#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ ++ /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2)*/ \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1) \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2) \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4) ++ ++#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ ++ /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4)*/ \ ++ /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2)*/ \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1) \ ++ HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2) ++ ++// ------------------------------ PromoteTo ++ ++HWY_RVV_PROMOTE_X2(vzext_vf2_, 
uint, u, 16, uint, 8) ++HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 32, uint, 16) ++HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 64, uint, 32) ++HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8) ++ ++HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 16, int, 8) ++HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 32, int, 16) ++HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 64, int, 32) ++HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8) ++ ++HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 32, float, 16) ++HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 64, float, 32) ++ ++// i32 to f64 ++HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32) ++ ++#undef HWY_RVV_PROMOTE_X4 ++#undef HWY_RVV_PROMOTE_X2 ++#undef HWY_RVV_PROMOTE ++ + template + HWY_API VFromD> PromoteTo(Simd d, + VFromD> v) { +@@ -901,67 +1041,6 @@ HWY_API VFromD> Promote + return BitCast(d, PromoteTo(Simd(), v)); + } + +-// ------------------------------ PromoteTo I +- +-HWY_API Vi16m2 PromoteTo(Di16m2 /* d */, Vi8m1 v) { return vsext_vf2_i16m2(v); } +-HWY_API Vi16m4 PromoteTo(Di16m4 /* d */, Vi8m2 v) { return vsext_vf2_i16m4(v); } +-HWY_API Vi16m8 PromoteTo(Di16m8 /* d */, Vi8m4 v) { return vsext_vf2_i16m8(v); } +- +-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, Vi8m1 v) { return vsext_vf4_i32m4(v); } +-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, Vi8m2 v) { return vsext_vf4_i32m8(v); } +- +-HWY_API Vi32m2 PromoteTo(Di32m2 /* d */, const Vi16m1 v) { +- return vsext_vf2_i32m2(v); +-} +-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, const Vi16m2 v) { +- return vsext_vf2_i32m4(v); +-} +-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, const Vi16m4 v) { +- return vsext_vf2_i32m8(v); +-} +- +-HWY_API Vi64m2 PromoteTo(Di64m2 /* d */, const Vi32m1 v) { +- return vsext_vf2_i64m2(v); +-} +-HWY_API Vi64m4 PromoteTo(Di64m4 /* d */, const Vi32m2 v) { +- return vsext_vf2_i64m4(v); +-} +-HWY_API Vi64m8 PromoteTo(Di64m8 /* d */, const Vi32m4 v) { +- return vsext_vf2_i64m8(v); +-} +- +-// ------------------------------ PromoteTo F +- +-HWY_API Vf32m2 PromoteTo(Df32m2 /* d */, const Vf16m1 v) { +- return vfwcvt_f_f_v_f32m2(v); +-} +-HWY_API Vf32m4 PromoteTo(Df32m4 /* d */, const Vf16m2 v) { +- return vfwcvt_f_f_v_f32m4(v); +-} +-HWY_API Vf32m8 PromoteTo(Df32m8 /* d */, const Vf16m4 v) { +- return vfwcvt_f_f_v_f32m8(v); +-} +- +-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vf32m1 v) { +- return vfwcvt_f_f_v_f64m2(v); +-} +-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vf32m2 v) { +- return vfwcvt_f_f_v_f64m4(v); +-} +-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vf32m4 v) { +- return vfwcvt_f_f_v_f64m8(v); +-} +- +-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vi32m1 v) { +- return vfwcvt_f_x_v_f64m2(v); +-} +-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vi32m2 v) { +- return vfwcvt_f_x_v_f64m4(v); +-} +-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vi32m4 v) { +- return vfwcvt_f_x_v_f64m8(v); +-} +- + // ------------------------------ DemoteTo U + + // First clamp negative numbers to zero to match x86 packus. +@@ -1062,19 +1141,19 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */, + + // ------------------------------ ConvertTo F + +-#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ + HWY_RVV_D(CHAR, SEW, LMUL) /* d */, HWY_RVV_V(int, SEW, LMUL) v) { \ +- return vfcvt_f_x_v_f##SEW##m##LMUL(v); \ ++ return vfcvt_f_x_v_f##SEW##LMUL(v); \ + } \ + /* Truncates (rounds toward zero). 
*/ \ + HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(i, SEW, LMUL) /* d */, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return vfcvt_rtz_x_f_v_i##SEW##m##LMUL(v); \ ++ return vfcvt_rtz_x_f_v_i##SEW##LMUL(v); \ + } \ + /* Uses default rounding mode. */ \ + HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return vfcvt_x_f_v_i##SEW##m##LMUL(v); \ ++ return vfcvt_x_f_v_i##SEW##LMUL(v); \ + } + + // API only requires f32 but we provide f64 for internal use (otherwise, it +@@ -1082,16 +1161,23 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */, + HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _) + #undef HWY_RVV_CONVERT + ++// Partial ++template ++HWY_API VFromD> ConvertTo(Simd /*tag*/, FromV v) { ++ return ConvertTo(Full(), v); ++} ++ + // ================================================== SWIZZLE + + // ------------------------------ Compress + +-#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ +- return v##OP##_vm_##CHAR##SEW##m##LMUL(mask, v, v); \ ++#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ ++ return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v); \ + } + ++HWY_RVV_FOREACH_UI16(HWY_RVV_COMPRESS, Compress, compress) + HWY_RVV_FOREACH_UI32(HWY_RVV_COMPRESS, Compress, compress) + HWY_RVV_FOREACH_UI64(HWY_RVV_COMPRESS, Compress, compress) + HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress) +@@ -1121,10 +1207,10 @@ HWY_API VFromD SetTableIndices(D d, + + // <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX + // to 2048! We could instead use vrgatherei16. +-#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, idx); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, idx); \ + } + + HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather) +@@ -1216,7 +1302,6 @@ HWY_API V OffsetsOf128BitBlocks(const D + using T = MakeUnsigned>; + return detail::And(iota0, static_cast(~(LanesPerBlock(d) - 1))); + } +- + } // namespace detail + + template +@@ -1244,9 +1329,9 @@ HWY_API V Broadcast(const V v) { + + // ------------------------------ GetLane + +-#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_s_##CHAR##SEW##m##LMUL##_##CHAR##SEW(v); \ ++#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); \ + } + + HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x) +@@ -1255,11 +1340,12 @@ HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetL + + // ------------------------------ ShiftLeftLanes + +-// vector = f(vector, size_t) +-#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t lanes) { \ +- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, v, lanes); \ ++// vector = f(vector, vector, size_t) ++#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) 
src, \ ++ size_t lanes) { \ ++ return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes); \ + } + + namespace detail { +@@ -1270,7 +1356,7 @@ template + HWY_API V ShiftLeftLanes(const V v) { + using D = DFromV; + const RebindToSigned di; +- const auto shifted = detail::SlideUp(v, kLanes); ++ const auto shifted = detail::SlideUp(v, v, kLanes); + // Match x86 semantics by zeroing lower lanes in 128-bit blocks + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di); + const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1); +@@ -1300,7 +1386,7 @@ template + HWY_API V ShiftRightLanes(const V v) { + using D = DFromV; + const RebindToSigned di; +- const auto shifted = detail::SlideDown(v, kLanes); ++ const auto shifted = detail::SlideDown(v, v, kLanes); + // Match x86 semantics by zeroing upper lanes in 128-bit blocks + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di); + const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1); +@@ -1342,7 +1428,7 @@ HWY_API V ConcatUpperLower(const V hi, c + template + HWY_API V ConcatLowerLower(const V hi, const V lo) { + // Move lower half into upper +- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV()) / 2); ++ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV()) / 2); + return ConcatUpperLower(hi_up, lo); + } + +@@ -1351,7 +1437,7 @@ HWY_API V ConcatLowerLower(const V hi, c + template + HWY_API V ConcatUpperUpper(const V hi, const V lo) { + // Move upper half into lower +- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV()) / 2); ++ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV()) / 2); + return ConcatUpperLower(hi, lo_down); + } + +@@ -1360,8 +1446,8 @@ HWY_API V ConcatUpperUpper(const V hi, c + template + HWY_API V ConcatLowerUpper(const V hi, const V lo) { + // Move half of both inputs to the other half +- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV()) / 2); +- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV()) / 2); ++ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV()) / 2); ++ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV()) / 2); + return ConcatUpperLower(hi_up, lo_down); + } + +@@ -1428,61 +1514,55 @@ HWY_API V Combine(const V a, const V b) + // ================================================== REDUCE + + // vector = f(vector, zero_m1) +-#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, 1) v0) { \ +- vsetvlmax_e##SEW##m##LMUL(); \ +- return Set(HWY_RVV_D(CHAR, SEW, LMUL)(), \ +- GetLane(v##OP##_vs_##CHAR##SEW##m##LMUL##_##CHAR##SEW##m1( \ +- v0, v, v0))); \ ++#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \ ++ vsetvlmax_e##SEW##LMUL(); \ ++ return Set( \ ++ HWY_RVV_D(CHAR, SEW, LMUL)(), \ ++ GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(v0, v, v0))); \ + } + + // ------------------------------ SumOfLanes + + namespace detail { +- + HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum) + HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredsum) +- + } // namespace detail + + template + HWY_API V SumOfLanes(const V v) { + using T = TFromV; +- const auto v0 = Zero(Simd()); // always m1 ++ const auto v0 = Zero(Full()); // always m1 + return detail::RedSum(v, v0); + } + + // ------------------------------ MinOfLanes + namespace detail { +- + HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu) + 
HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin) + HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin) +- + } // namespace detail + + template + HWY_API V MinOfLanes(const V v) { + using T = TFromV; +- const Simd d1; // always m1 ++ const Full d1; // always m1 + const auto neutral = Set(d1, HighestValue()); + return detail::RedMin(v, neutral); + } + + // ------------------------------ MaxOfLanes + namespace detail { +- + HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu) + HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax) + HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax) +- + } // namespace detail + + template + HWY_API V MaxOfLanes(const V v) { + using T = TFromV; +- const Simd d1; // always m1 ++ const Full d1; // always m1 + const auto neutral = Set(d1, LowestValue()); + return detail::RedMax(v, neutral); + } +@@ -1507,7 +1587,7 @@ HWY_API VFromD LoadDup128(D d, const + #define HWY_RVV_STORE_MASK_BITS(MLEN, NAME, OP) \ + HWY_API size_t StoreMaskBits(HWY_RVV_M(MLEN) m, uint8_t* p) { \ + /* LMUL=1 is always enough */ \ +- Simd d8; \ ++ Full d8; \ + const size_t num_bytes = (Lanes(d8) + MLEN - 1) / MLEN; \ + /* TODO(janwas): how to convert vbool* to vuint?*/ \ + /*Store(m, d8, p);*/ \ +@@ -1518,6 +1598,22 @@ HWY_API VFromD LoadDup128(D d, const + HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, _, _) + #undef HWY_RVV_STORE_MASK_BITS + ++// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp) ++ ++// Disallow for 8-bit because Iota is likely to overflow. ++template ++HWY_API MFromD FirstN(const D d, const size_t n) { ++ const RebindToSigned di; ++ return RebindMask(d, Lt(BitCast(di, detail::Iota0(d)), Set(di, n))); ++} ++ ++template ++HWY_API MFromD FirstN(const D d, const size_t n) { ++ const auto zero = Zero(d); ++ const auto one = Set(d, 1); ++ return Eq(detail::SlideUp(one, zero, n), one); ++} ++ + // ------------------------------ Neg + + template +@@ -1526,9 +1622,9 @@ HWY_API V Neg(const V v) { + } + + // vector = f(vector), but argument is repeated +-#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, v); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, v); \ + } + + HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn) +@@ -1565,7 +1661,6 @@ template + HWY_API auto UseInt(const V v) -> decltype(MaskFromVec(v)) { + return Lt(Abs(v), Set(DFromV(), MantissaEnd>())); + } +- + } // namespace detail + + template +@@ -1636,10 +1731,8 @@ HWY_API VFromD Iota(const D d, TFromD + // Using vwmul does not work for m8, so use mulh instead. Highway only provides + // MulHigh for 16-bit, so use a private wrapper. 
+ namespace detail { +- + HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu) + HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh) +- + } // namespace detail + + template +@@ -1649,7 +1742,7 @@ HWY_API VFromD> dw; +- return BitCast(dw, OddEven(detail::SlideUp(hi, 1), lo)); ++ return BitCast(dw, OddEven(detail::SlideUp(hi, hi, 1), lo)); + } + + // ================================================== END MACROS +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 2021-06-02 10:56:05.237904402 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -19,7 +19,6 @@ + #include + + #include // std::min +-#include + + #include "hwy/base.h" + #include "hwy/ops/shared-inl.h" +@@ -199,7 +198,7 @@ HWY_API Vec1 BroadcastSignBit(const V + template + HWY_API Mask1 RebindMask(Sisd /*tag*/, Mask1 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); +- return Mask1(m.raw); ++ return Mask1{m.bits}; + } + + // v must be 0 or FF..FF. +@@ -224,6 +223,11 @@ Vec1 VecFromMask(Sisd /* tag */, c + return v; + } + ++template ++HWY_INLINE Mask1 FirstN(Sisd /*tag*/, size_t n) { ++ return Mask1::FromBool(n != 0); ++} ++ + // Returns mask ? yes : no. + template + HWY_INLINE Vec1 IfThenElse(const Mask1 mask, const Vec1 yes, +@@ -357,9 +361,9 @@ HWY_INLINE Vec1 operator>>(const Vec1 + + template + HWY_INLINE Vec1 operator+(Vec1 a, Vec1 b) { +- const uint64_t a64 = static_cast(a.raw); +- const uint64_t b64 = static_cast(b.raw); +- return Vec1((a64 + b64) & ~T(0)); ++ const uint64_t a64 = static_cast(a.raw); ++ const uint64_t b64 = static_cast(b.raw); ++ return Vec1(static_cast((a64 + b64) & static_cast(~T(0)))); + } + HWY_INLINE Vec1 operator+(const Vec1 a, const Vec1 b) { + return Vec1(a.raw + b.raw); +@@ -370,9 +374,9 @@ HWY_INLINE Vec1 operator+(const + + template + HWY_INLINE Vec1 operator-(Vec1 a, Vec1 b) { +- const uint64_t a64 = static_cast(a.raw); +- const uint64_t b64 = static_cast(b.raw); +- return Vec1((a64 - b64) & ~T(0)); ++ const uint64_t a64 = static_cast(a.raw); ++ const uint64_t b64 = static_cast(b.raw); ++ return Vec1(static_cast((a64 - b64) & static_cast(~T(0)))); + } + HWY_INLINE Vec1 operator-(const Vec1 a, const Vec1 b) { + return Vec1(a.raw - b.raw); +@@ -388,21 +392,25 @@ HWY_INLINE Vec1 operator-(const + // Unsigned + HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); + } + HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535))); + } + + // Signed + HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); + } + HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767))); + } + + // ------------------------------ Saturating subtraction +@@ -412,21 
+420,25 @@ HWY_INLINE Vec1 SaturatedAdd(co + // Unsigned + HWY_INLINE Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); + } + HWY_INLINE Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535))); + } + + // Signed + HWY_INLINE Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); + } + HWY_INLINE Vec1 SaturatedSub(const Vec1 a, + const Vec1 b) { +- return Vec1(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)); ++ return Vec1( ++ static_cast(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767))); + } + + // ------------------------------ Average +@@ -435,11 +447,11 @@ HWY_INLINE Vec1 SaturatedSub(co + + HWY_INLINE Vec1 AverageRound(const Vec1 a, + const Vec1 b) { +- return Vec1((a.raw + b.raw + 1) / 2); ++ return Vec1(static_cast((a.raw + b.raw + 1) / 2)); + } + HWY_INLINE Vec1 AverageRound(const Vec1 a, + const Vec1 b) { +- return Vec1((a.raw + b.raw + 1) / 2); ++ return Vec1(static_cast((a.raw + b.raw + 1) / 2)); + } + + // ------------------------------ Absolute value +@@ -514,15 +526,15 @@ HWY_INLINE Vec1 operator/(const Vec1< + + // Returns the upper 16 bits of a * b in each lane. + HWY_INLINE Vec1 MulHigh(const Vec1 a, const Vec1 b) { +- return Vec1((a.raw * b.raw) >> 16); ++ return Vec1(static_cast((a.raw * b.raw) >> 16)); + } + HWY_INLINE Vec1 MulHigh(const Vec1 a, + const Vec1 b) { + // Cast to uint32_t first to prevent overflow. Otherwise the result of + // uint16_t * uint16_t is in "int" which may overflow. In practice the result + // is the same but this way it is also defined. +- return Vec1( +- (static_cast(a.raw) * static_cast(b.raw)) >> 16); ++ return Vec1(static_cast( ++ (static_cast(a.raw) * static_cast(b.raw)) >> 16)); + } + + // Multiplies even lanes (0, 2 ..) and returns the double-wide result. +@@ -617,6 +629,31 @@ HWY_INLINE Vec1 Round(const Vec1 v + return Vec1(static_cast(rounded)); + } + ++// Round-to-nearest even. ++HWY_INLINE Vec1 NearestInt(const Vec1 v) { ++ using T = float; ++ using TI = int32_t; ++ ++ const T abs = Abs(v).raw; ++ const bool signbit = std::signbit(v.raw); ++ ++ if (!(abs < MantissaEnd())) { // Huge or NaN ++ // Check if too large to cast or NaN ++ if (!(abs <= static_cast(LimitsMax()))) { ++ return Vec1(signbit ? LimitsMin() : LimitsMax()); ++ } ++ return Vec1(static_cast(v.raw)); ++ } ++ const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5); ++ const TI rounded = static_cast(v.raw + bias); ++ if (rounded == 0) return Vec1(0); ++ // Round to even ++ if ((rounded & 1) && std::abs(static_cast(rounded) - v.raw) == T(0.5)) { ++ return Vec1(rounded - (signbit ? -1 : 1)); ++ } ++ return Vec1(rounded); ++} ++ + template + HWY_INLINE Vec1 Trunc(const Vec1 v) { + using TI = MakeSigned; +@@ -641,7 +678,8 @@ V Ceiling(const V v) { + Bits bits; + CopyBytes(&v, &bits); + +- const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias; ++ const int exponent = ++ static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) return v; + // |v| <= 1 => 0 or 1. 
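The NearestInt added in the hunk above replaces the earlier truncating approximation with true round-to-nearest-even. A condensed scalar restatement of its tie-breaking step; the overflow/NaN guards present in the patch are omitted here for brevity:

#include <cmath>
#include <cstdint>

int32_t RoundNearestEven(float v) {
  const float bias = v < 0.0f ? -0.5f : 0.5f;
  const int32_t rounded = static_cast<int32_t>(v + bias);
  if (rounded == 0) return 0;
  // A tie (exactly 0.5 away) that landed on an odd value is pulled back to even.
  if ((rounded & 1) && std::fabs(static_cast<float>(rounded) - v) == 0.5f) {
    return rounded - (std::signbit(v) ? -1 : 1);
  }
  return rounded;  // e.g. 2.5f -> 2, 3.5f -> 4, -2.5f -> -2
}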
+@@ -672,7 +710,8 @@ V Floor(const V v) { + Bits bits; + CopyBytes(&v, &bits); + +- const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias; ++ const int exponent = ++ static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); + // Already an integer. + if (exponent >= kMantissaBits) return v; + // |v| <= 1 => -1 or 0. +@@ -772,6 +811,26 @@ HWY_INLINE void StoreU(const Vec1 v, + return Store(v, d, p); + } + ++// ------------------------------ StoreInterleaved3 ++ ++HWY_API void StoreInterleaved3(const Vec1 v0, const Vec1 v1, ++ const Vec1 v2, Sisd d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ StoreU(v0, d, unaligned + 0); ++ StoreU(v1, d, unaligned + 1); ++ StoreU(v2, d, unaligned + 2); ++} ++ ++HWY_API void StoreInterleaved4(const Vec1 v0, const Vec1 v1, ++ const Vec1 v2, const Vec1 v3, ++ Sisd d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ StoreU(v0, d, unaligned + 0); ++ StoreU(v1, d, unaligned + 1); ++ StoreU(v2, d, unaligned + 2); ++ StoreU(v3, d, unaligned + 3); ++} ++ + // ------------------------------ Stream + + template +@@ -779,12 +838,29 @@ HWY_INLINE void Stream(const Vec1 v, + return Store(v, d, aligned); + } + ++// ------------------------------ Scatter ++ ++template ++HWY_INLINE void ScatterOffset(Vec1 v, Sisd d, T* base, ++ const Vec1 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ uint8_t* const base8 = reinterpret_cast(base) + offset.raw; ++ return Store(v, d, reinterpret_cast(base8)); ++} ++ ++template ++HWY_INLINE void ScatterIndex(Vec1 v, Sisd d, T* HWY_RESTRICT base, ++ const Vec1 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ return Store(v, d, base + index.raw); ++} ++ + // ------------------------------ Gather + + template + HWY_INLINE Vec1 GatherOffset(Sisd d, const T* base, + const Vec1 offset) { +- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs"); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + const uintptr_t addr = reinterpret_cast(base) + offset.raw; + return Load(d, reinterpret_cast(addr)); + } +@@ -792,7 +868,7 @@ HWY_INLINE Vec1 GatherOffset(Sisd + template + HWY_INLINE Vec1 GatherIndex(Sisd d, const T* HWY_RESTRICT base, + const Vec1 index) { +- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx"); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return Load(d, base + index.raw); + } + +@@ -833,15 +909,20 @@ HWY_INLINE Vec1 DemoteTo(Sisd + + static HWY_INLINE Vec1 PromoteTo(Sisd /* tag */, + const Vec1 v) { ++#if HWY_NATIVE_FLOAT16 + uint16_t bits16; + CopyBytes<2>(&v.raw, &bits16); ++#else ++ const uint16_t bits16 = v.raw.bits; ++#endif + const uint32_t sign = bits16 >> 15; + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { +- const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024)); ++ const float subnormal = ++ (1.0f / 16384) * (static_cast(mantissa) * (1.0f / 1024)); + return Vec1(sign ? -subnormal : subnormal); + } + +@@ -867,8 +948,12 @@ static HWY_INLINE Vec1 Demote + // Tiny or zero => zero. 
+ Vec1 out; + if (exp < -24) { +- bits32 = 0; +- CopyBytes<2>(&bits32, &out); ++#if HWY_NATIVE_FLOAT16 ++ const uint16_t zero = 0; ++ CopyBytes<2>(&zero, &out.raw); ++#else ++ out.raw.bits = 0; ++#endif + return out; + } + +@@ -890,7 +975,12 @@ static HWY_INLINE Vec1 Demote + HWY_DASSERT(mantissa16 < 1024); + const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; + HWY_DASSERT(bits16 < 0x10000); +- CopyBytes<2>(&bits16, &out); ++#if HWY_NATIVE_FLOAT16 ++ const uint16_t narrowed = static_cast(bits16); // big-endian safe ++ CopyBytes<2>(&narrowed, &out.raw); ++#else ++ out.raw.bits = static_cast(bits16); ++#endif + return out; + } + +@@ -919,18 +1009,6 @@ HWY_INLINE Vec1 U8FromU32(const + return DemoteTo(Sisd(), v); + } + +-// Approximation of round-to-nearest for numbers representable as int32_t. +-HWY_INLINE Vec1 NearestInt(const Vec1 v) { +- const float f = v.raw; +- if (std::isinf(f) || +- std::fabs(f) > static_cast(LimitsMax())) { +- return Vec1(std::signbit(f) ? LimitsMin() +- : LimitsMax()); +- } +- const float bias = f < 0.0f ? -0.5f : 0.5f; +- return Vec1(static_cast(f + bias)); +-} +- + // ================================================== SWIZZLE + + // Unsupported: Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle*, +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 2021-06-02 10:56:05.224904336 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -31,11 +31,6 @@ + #undef HWY_ALIGN + #undef HWY_LANES + +-#undef HWY_GATHER_LANES +-#undef HWY_VARIABLE_SHIFT_LANES +-#undef HWY_COMPARE64_LANES +-#undef HWY_MINMAX64_LANES +- + #undef HWY_CAP_INTEGER64 + #undef HWY_CAP_FLOAT64 + #undef HWY_CAP_GE256 +@@ -53,11 +48,6 @@ + #define HWY_ALIGN alignas(16) + #define HWY_LANES(T) (16 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) 1 +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 2 +-#define HWY_MINMAX64_LANES 1 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 0 +@@ -73,11 +63,6 @@ + #define HWY_ALIGN alignas(32) + #define HWY_LANES(T) (32 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) HWY_LANES(T) +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 4 +-#define HWY_MINMAX64_LANES 1 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 1 +@@ -96,11 +81,6 @@ + #define HWY_ALIGN alignas(64) + #define HWY_LANES(T) (64 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) HWY_LANES(T) +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 8 +-#define HWY_MINMAX64_LANES 8 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 1 +@@ -121,11 +101,6 @@ + #define HWY_ALIGN alignas(16) + #define HWY_LANES(T) (16 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) 1 +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 2 +-#define HWY_MINMAX64_LANES 2 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 0 +@@ -142,19 +117,14 @@ + #define HWY_ALIGN alignas(16) + #define HWY_LANES(T) (16 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) 1 +-#define 
HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_MINMAX64_LANES 2 +-#define HWY_COMPARE64_LANES 2 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_GE256 0 + #define HWY_CAP_GE512 0 + +-#ifdef __arm__ +-#define HWY_CAP_FLOAT64 0 +-#else ++#if HWY_ARCH_ARM_A64 + #define HWY_CAP_FLOAT64 1 ++#else ++#define HWY_CAP_FLOAT64 0 + #endif + + #define HWY_NAMESPACE N_NEON +@@ -162,17 +132,34 @@ + // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. + + //----------------------------------------------------------------------------- ++// SVE[2] ++#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE ++ ++// SVE only requires lane alignment, not natural alignment of the entire vector. ++#define HWY_ALIGN alignas(8) ++// Upper bound, not the actual lane count! ++#define HWY_LANES(T) (256 / sizeof(T)) ++ ++#define HWY_CAP_INTEGER64 1 ++#define HWY_CAP_FLOAT64 1 ++#define HWY_CAP_GE256 0 ++#define HWY_CAP_GE512 0 ++ ++#if HWY_TARGET == HWY_SVE2 ++#define HWY_NAMESPACE N_SVE2 ++#else ++#define HWY_NAMESPACE N_SVE ++#endif ++ ++// HWY_TARGET_STR remains undefined - TODO(janwas): attribute for SVE? ++ ++//----------------------------------------------------------------------------- + // WASM + #elif HWY_TARGET == HWY_WASM + + #define HWY_ALIGN alignas(16) + #define HWY_LANES(T) (16 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) 1 +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-#define HWY_COMPARE64_LANES 2 +-#define HWY_MINMAX64_LANES 2 +- + #define HWY_CAP_INTEGER64 0 + #define HWY_CAP_FLOAT64 0 + #define HWY_CAP_GE256 0 +@@ -194,11 +181,6 @@ + // mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h. + #define HWY_LANES(T) (4096 / sizeof(T)) + +-#define HWY_GATHER_LANES(T) HWY_LANES(T) +-#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) +-// Cannot use HWY_LANES/sizeof here because these are used in an #if. +-#define HWY_COMPARE64_LANES 256 +-#define HWY_MINMAX64_LANES 256 + + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 +@@ -215,13 +197,9 @@ + #elif HWY_TARGET == HWY_SCALAR + + #define HWY_ALIGN ++// For internal use only; use Lanes(d) instead. + #define HWY_LANES(T) 1 + +-#define HWY_GATHER_LANES(T) 1 +-#define HWY_VARIABLE_SHIFT_LANES(T) 1 +-#define HWY_COMPARE64_LANES 1 +-#define HWY_MINMAX64_LANES 1 +- + #define HWY_CAP_INTEGER64 1 + #define HWY_CAP_FLOAT64 1 + #define HWY_CAP_GE256 0 +@@ -265,3 +243,7 @@ + #else + #define HWY_ATTR + #endif ++ ++// DEPRECATED ++#undef HWY_GATHER_LANES ++#define HWY_GATHER_LANES(T) HWY_LANES(T) +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 2021-06-02 10:56:05.235904392 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -14,6 +14,8 @@ + + // Per-target definitions shared by ops/*.h and user code. + ++#include ++ + // Separate header because foreach_target.h re-enables its include guard. + #include "hwy/ops/set_macros-inl.h" + +@@ -106,7 +108,7 @@ HWY_INLINE HWY_MAYBE_UNUSED constexpr si + } + + // Targets with non-constexpr Lanes define this themselves. 
+-#if HWY_TARGET != HWY_RVV ++#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE + + // (Potentially) non-constant actual size of the vector at runtime, subject to + // the limit imposed by the Simd. Useful for advancing loop counters. +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 2021-06-02 10:56:05.242904427 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -19,8 +19,6 @@ + #include + #include + +-#include +- + #include "hwy/base.h" + #include "hwy/ops/shared-inl.h" + +@@ -177,6 +175,16 @@ HWY_API Vec128 Undefined(Simd ++Vec128 Iota(const Simd d, const T2 first) { ++ HWY_ALIGN T lanes[16 / sizeof(T)]; ++ for (size_t i = 0; i < 16 / sizeof(T); ++i) { ++ lanes[i] = static_cast(first + static_cast(i)); ++ } ++ return Load(d, lanes); ++} ++ + // ================================================== ARITHMETIC + + // ------------------------------ Addition +@@ -273,24 +281,24 @@ HWY_API Vec128 operator-(const + template + HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_u8x16_add_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_u8x16_add_sat(a.raw, b.raw)}; + } + template + HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_u16x8_add_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_u16x8_add_sat(a.raw, b.raw)}; + } + + // Signed + template + HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_i8x16_add_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_i8x16_add_sat(a.raw, b.raw)}; + } + template + HWY_API Vec128 SaturatedAdd(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_i16x8_add_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_i16x8_add_sat(a.raw, b.raw)}; + } + + // ------------------------------ Saturating subtraction +@@ -301,24 +309,24 @@ HWY_API Vec128 SaturatedAdd( + template + HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_u8x16_sub_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_u8x16_sub_sat(a.raw, b.raw)}; + } + template + HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_u16x8_sub_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_u16x8_sub_sat(a.raw, b.raw)}; + } + + // Signed + template + HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_i8x16_sub_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_i8x16_sub_sat(a.raw, b.raw)}; + } + template + HWY_API Vec128 SaturatedSub(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_i16x8_sub_saturate(a.raw, b.raw)}; ++ return Vec128{wasm_i16x8_sub_sat(a.raw, b.raw)}; + } + + // ------------------------------ Average +@@ -352,6 +360,12 @@ template + HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{wasm_i32x4_abs(v.raw)}; + } ++template ++HWY_API Vec128 Abs(const Vec128 v) { ++ // TODO(janwas): use wasm_i64x2_abs when available ++ const Vec128 mask = wasm_i64x2_shr(v.raw, 63); ++ return ((v ^ mask) - mask); ++} + + template + HWY_API Vec128 Abs(const Vec128 v) { +@@ -396,9 +410,38 @@ HWY_API Vec128 ShiftRight(co + return Vec128{wasm_i32x4_shr(v.raw, kBits)}; + } + ++// 8-bit ++template ++HWY_API Vec128 
ShiftLeft(const Vec128 v) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; ++ return kBits == 1 ++ ? (v + v) ++ : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); ++} ++ ++template ++HWY_API Vec128 ShiftRight(const Vec128 v) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftRight(Vec128{v.raw}).raw}; ++ return shifted & Set(d8, 0xFF >> kBits); ++} ++ ++template ++HWY_API Vec128 ShiftRight(const Vec128 v) { ++ const Simd di; ++ const Simd du; ++ const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Shift lanes by same variable #bits + +-// Unsigned (no u8) ++// Unsigned + template + HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { +@@ -420,7 +463,7 @@ HWY_API Vec128 ShiftRightSa + return Vec128{wasm_u32x4_shr(v.raw, bits)}; + } + +-// Signed (no i8) ++// Signed + template + HWY_API Vec128 ShiftLeftSame(const Vec128 v, + const int bits) { +@@ -442,6 +485,35 @@ HWY_API Vec128 ShiftRightSam + return Vec128{wasm_i32x4_shr(v.raw, bits)}; + } + ++// 8-bit ++template ++HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftLeftSame(Vec128>{v.raw}, bits).raw}; ++ return shifted & Set(d8, (0xFF << bits) & 0xFF); ++} ++ ++template ++HWY_API Vec128 ShiftRightSame(Vec128 v, ++ const int bits) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftRightSame(Vec128{v.raw}, bits).raw}; ++ return shifted & Set(d8, 0xFF >> bits); ++} ++ ++template ++HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { ++ const Simd di; ++ const Simd du; ++ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Minimum + + // Unsigned +@@ -607,29 +679,29 @@ template + HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + // TODO(eustas): replace, when implemented in WASM. +- const auto al = wasm_i32x4_widen_low_u16x8(a.raw); +- const auto ah = wasm_i32x4_widen_high_u16x8(a.raw); +- const auto bl = wasm_i32x4_widen_low_u16x8(b.raw); +- const auto bh = wasm_i32x4_widen_high_u16x8(b.raw); ++ const auto al = wasm_u32x4_extend_low_u16x8(a.raw); ++ const auto ah = wasm_u32x4_extend_high_u16x8(a.raw); ++ const auto bl = wasm_u32x4_extend_low_u16x8(b.raw); ++ const auto bh = wasm_u32x4_extend_high_u16x8(b.raw); + const auto l = wasm_i32x4_mul(al, bl); + const auto h = wasm_i32x4_mul(ah, bh); + // TODO(eustas): shift-right + narrow? + return Vec128{ +- wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; ++ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; + } + template + HWY_API Vec128 MulHigh(const Vec128 a, + const Vec128 b) { + // TODO(eustas): replace, when implemented in WASM. 
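(Editorial note, not part of the patch.) The 8-bit ShiftRight overloads added above emulate an arithmetic shift using only a logical shift plus a sign-correction step. A minimal scalar sketch of that trick, with a made-up function name:

#include <cstdint>

// Arithmetic >> for int8_t built from a logical shift, mirroring the
// (shifted ^ shifted_sign) - shifted_sign pattern used in the patch above.
int8_t ArithmeticShiftRight8(int8_t v, int bits) {
  const uint8_t shifted = static_cast<uint8_t>(static_cast<uint8_t>(v) >> bits);
  const uint8_t sign = static_cast<uint8_t>(0x80 >> bits);  // shifted sign bit
  return static_cast<int8_t>((shifted ^ sign) - sign);      // re-extend the sign
}

For a negative input the shifted sign bit is set, so the xor clears it and the subtraction propagates ones into the vacated high bits; for a non-negative input both steps cancel out.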
+- const auto al = wasm_i32x4_widen_low_i16x8(a.raw); +- const auto ah = wasm_i32x4_widen_high_i16x8(a.raw); +- const auto bl = wasm_i32x4_widen_low_i16x8(b.raw); +- const auto bh = wasm_i32x4_widen_high_i16x8(b.raw); ++ const auto al = wasm_i32x4_extend_low_i16x8(a.raw); ++ const auto ah = wasm_i32x4_extend_high_i16x8(a.raw); ++ const auto bl = wasm_i32x4_extend_low_i16x8(b.raw); ++ const auto bh = wasm_i32x4_extend_high_i16x8(b.raw); + const auto l = wasm_i32x4_mul(al, bl); + const auto h = wasm_i32x4_mul(ah, bh); + // TODO(eustas): shift-right + narrow? + return Vec128{ +- wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; ++ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; + } + + // Multiplies even lanes (0, 2 ..) and returns the double-width result. +@@ -765,53 +837,76 @@ HWY_API Vec128 ApproximateReci + // Toward nearest integer, ties to even + template + HWY_API Vec128 Round(const Vec128 v) { +- // TODO(eustas): is it f32x4.nearest? (not implemented yet) +- alignas(16) float input[4]; +- alignas(16) float output[4]; +- wasm_v128_store(input, v.raw); +- for (size_t i = 0; i < 4; ++i) { +- output[i] = std::nearbyint(input[i]); +- } +- return Vec128{wasm_v128_load(output)}; ++ // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not ++ // yet have an instruction for that (f32x4.nearest is not implemented). We ++ // rely on rounding after addition with a large value such that no mantissa ++ // bits remain (assuming the current mode is nearest-even). We may need a ++ // compiler flag for precise floating-point to prevent "optimizing" this out. ++ const Simd df; ++ const auto max = Set(df, MantissaEnd()); ++ const auto large = CopySignToAbs(max, v); ++ const auto added = large + v; ++ const auto rounded = added - large; ++ ++ // Keep original if NaN or the magnitude is large (already an int). ++ return IfThenElse(Abs(v) < max, rounded, v); + } + ++namespace detail { ++ ++// Truncating to integer and converting back to float is correct except when the ++// input magnitude is large, in which case the input was already an integer ++// (because mantissa >> exponent is zero). ++template ++HWY_API Mask128 UseInt(const Vec128 v) { ++ return Abs(v) < Set(Simd(), MantissaEnd()); ++} ++ ++} // namespace detail ++ + // Toward zero, aka truncate + template + HWY_API Vec128 Trunc(const Vec128 v) { + // TODO(eustas): is it f32x4.trunc? (not implemented yet) +- alignas(16) float input[4]; +- alignas(16) float output[4]; +- wasm_v128_store(input, v.raw); +- for (size_t i = 0; i < 4; ++i) { +- output[i] = std::trunc(input[i]); +- } +- return Vec128{wasm_v128_load(output)}; ++ const Simd df; ++ const RebindToSigned di; ++ ++ const auto integer = ConvertTo(di, v); // round toward 0 ++ const auto int_f = ConvertTo(df, integer); ++ ++ return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); + } + + // Toward +infinity, aka ceiling + template +-HWY_API Vec128 Ceil(const Vec128 v) { ++HWY_INLINE Vec128 Ceil(const Vec128 v) { + // TODO(eustas): is it f32x4.ceil? (not implemented yet) +- alignas(16) float input[4]; +- alignas(16) float output[4]; +- wasm_v128_store(input, v.raw); +- for (size_t i = 0; i < 4; ++i) { +- output[i] = std::ceil(input[i]); +- } +- return Vec128{wasm_v128_load(output)}; ++ const Simd df; ++ const RebindToSigned di; ++ ++ const auto integer = ConvertTo(di, v); // round toward 0 ++ const auto int_f = ConvertTo(df, integer); ++ ++ // Truncating a positive non-integer ends up smaller; if so, add 1. 
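(Editorial note, not part of the patch.) The rewritten Round() above relies on adding a value so large that no mantissa bits remain, then subtracting it again. A scalar illustration of the same idea, assuming MantissaEnd<float>() is 2^23 and the rounding mode is nearest-even; the volatile guards against the reassociation the patch comment also warns about:

#include <cmath>

float RoundNearestEven(float f) {
  const float kMantissaEnd = 8388608.0f;  // 2^23: beyond this, every float is integral
  if (!(std::fabs(f) < kMantissaEnd)) return f;   // large magnitude or NaN: keep as-is
  const float large = std::copysign(kMantissaEnd, f);
  volatile float sum = large + f;  // FP addition drops the fraction, ties to even
  return sum - large;
}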
++ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); ++ ++ return IfThenElse(detail::UseInt(v), int_f - neg1, v); + } + + // Toward -infinity, aka floor + template +-HWY_API Vec128 Floor(const Vec128 v) { ++HWY_INLINE Vec128 Floor(const Vec128 v) { + // TODO(eustas): is it f32x4.floor? (not implemented yet) +- alignas(16) float input[4]; +- alignas(16) float output[4]; +- wasm_v128_store(input, v.raw); +- for (size_t i = 0; i < 4; ++i) { +- output[i] = std::floor(input[i]); +- } +- return Vec128{wasm_v128_load(output)}; ++ const Simd df; ++ const RebindToSigned di; ++ ++ const auto integer = ConvertTo(di, v); // round toward 0 ++ const auto int_f = ConvertTo(df, integer); ++ ++ // Truncating a negative non-integer ends up larger; if so, subtract 1. ++ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); ++ ++ return IfThenElse(detail::UseInt(v), int_f + neg1, v); + } + + // ================================================== COMPARE +@@ -902,12 +997,12 @@ HWY_API Mask128 operator>(co + + // Otherwise, the lower half decides. + const auto m_eq = a32 == b32; +- const auto lo_in_hi = wasm_v32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0); ++ const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0); + const auto lo_gt = And(m_eq, lo_in_hi); + + const auto gt = Or(lo_gt, m_gt); + // Copy result in upper 32 bits to lower 32 bits. +- return Mask128{wasm_v32x4_shuffle(gt, gt, 3, 3, 1, 1)}; ++ return Mask128{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)}; + } + + template +@@ -935,6 +1030,14 @@ HWY_API Mask128 operator>=(con + return Mask128{wasm_f32x4_ge(a.raw, b.raw)}; + } + ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask128 FirstN(const Simd d, size_t num) { ++ const RebindToSigned di; // Signed comparisons may be cheaper. 
++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); ++} ++ + // ================================================== LOGICAL + + // ------------------------------ Not +@@ -1015,7 +1118,7 @@ HWY_API Vec128 BroadcastSignBit(co + } + template + HWY_API Vec128 BroadcastSignBit(const Vec128 v) { +- return VecFromMask(v < Zero(Simd())); ++ return VecFromMask(Simd(), v < Zero(Simd())); + } + + // ------------------------------ Mask +@@ -1278,26 +1381,73 @@ HWY_API void Stream(Vec128 v, Simd + wasm_v128_store(aligned, v.raw); + } + +-// ------------------------------ Gather ++// ------------------------------ Scatter (Store) ++ ++template ++HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); ++ } ++} ++ ++template ++HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ for (size_t i = 0; i < N; ++i) { ++ base[index_lanes[i]] = lanes[i]; ++ } ++} ++ ++// ------------------------------ Gather (Load/Store) + + template + HWY_API Vec128 GatherOffset(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 offset) { +- static_assert(N == 1, "Wasm does not support full gather"); +- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); +- const uintptr_t address = reinterpret_cast(base) + GetLane(offset); +- T val; +- CopyBytes(reinterpret_cast(address), &val); +- return Set(d, val); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ alignas(16) T lanes[N]; ++ const uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); ++ } ++ return Load(d, lanes); + } + + template + HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, + const Vec128 index) { +- static_assert(N == 1, "Wasm does not support full gather"); +- static_assert(sizeof(T) == sizeof(Index), "T must match Index"); +- return Set(d, base[GetLane(index)]); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ alignas(16) T lanes[N]; ++ for (size_t i = 0; i < N; ++i) { ++ lanes[i] = base[index_lanes[i]]; ++ } ++ return Load(d, lanes); + } + + // ================================================== SWIZZLE +@@ -1346,12 +1496,12 @@ HWY_API Vec128 LowerHalf(Vec12 + template + HWY_API Vec128 UpperHalf(Vec128 v) { + // TODO(eustas): use swizzle? +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; + } + template <> + HWY_INLINE Vec128 UpperHalf(Vec128 v) { + // TODO(eustas): use swizzle? 
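(Editorial note, not part of the patch.) GatherOffset/GatherIndex above are emulated with per-lane scalar loads: offsets are byte offsets from base, indices are lane indices. A scalar reference for the offset flavor, with made-up names:

#include <cstddef>
#include <cstdint>
#include <cstring>

void GatherOffsetScalar(const float* base, const int32_t* byte_offsets,
                        size_t n, float* out) {
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < n; ++i) {
    std::memcpy(&out[i], base_bytes + byte_offsets[i], sizeof(float));  // per-lane load
  }
}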
+- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; + } + + // ------------------------------ Shift vector by constant #bytes +@@ -1366,64 +1516,64 @@ HWY_API Vec128 ShiftLeftBytes(const V + return v; + + case 1: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14)}; + + case 2: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13)}; + + case 3: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12)}; + + case 4: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11)}; + + case 5: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10)}; + + case 6: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; + + case 7: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; + + case 8: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; + + case 9: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; + + case 10: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; + + case 11: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; + + case 12: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; + + case 13: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; + + case 14: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 0, + 1)}; + + case 15: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 0)}; + } +@@ -1447,69 +1597,69 @@ HWY_API Vec128 ShiftRightBytes(const + return v; + + case 1: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16)}; + + case 2: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 2, 
3, 4, 5, 6, 7, 8, 9, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 16)}; + + case 3: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 16, 16)}; + + case 4: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 16, 16, 16)}; + + case 5: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 16, 16, 16, 16)}; + + case 6: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16, 16, 16, 16, 16)}; + + case 7: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 16, 16, 16, 16, 16, 16)}; + + case 8: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 16, 16, 16, 16, 16, 16, 16)}; + + case 9: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, + 15, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 10: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 11: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 12: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 13: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 14: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + + case 15: +- return Vec128{wasm_v8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, ++ return Vec128{wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16)}; + } +@@ -1535,72 +1685,72 @@ HWY_API Vec128 CombineShiftRightBytes + return lo; + + case 1: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16)}; + + case 2: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17)}; + + case 3: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18)}; + + case 4: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, 
hi.raw, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19)}; + + case 5: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20)}; + + case 6: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21)}; + + case 7: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22)}; + + case 8: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23)}; + + case 9: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24)}; + + case 10: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25)}; + + case 11: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26)}; + + case 12: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27)}; + + case 13: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28)}; + + case 14: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29)}; + + case 15: +- return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, ++ return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30)}; + } +@@ -1613,28 +1763,28 @@ HWY_API Vec128 CombineShiftRightBytes + template + HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); +- return Vec128{wasm_v16x8_shuffle( ++ return Vec128{wasm_i16x8_shuffle( + v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; + } + template + HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{ +- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; ++ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; + } + + // Signed + template + HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); +- return Vec128{wasm_v16x8_shuffle( ++ return Vec128{wasm_i16x8_shuffle( + v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; + } + template + HWY_API Vec128 Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{ +- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; ++ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; + } + + // Float +@@ -1642,7 +1792,7 @@ template + HWY_API Vec128 
Broadcast(const Vec128 v) { + static_assert(0 <= kLane && kLane < N, "Invalid lane"); + return Vec128{ +- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; ++ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; + } + + // ------------------------------ Shuffle bytes with variable indices +@@ -1652,16 +1802,23 @@ HWY_API Vec128 Broadcast(const + template + HWY_API Vec128 TableLookupBytes(const Vec128 bytes, + const Vec128 from) { +- // TODO(eustas): use swizzle? (shuffle does not work for variable indices) ++// Not yet available in all engines, see ++// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md ++// V8 implementation of this had a bug, fixed on 2021-04-03: ++// https://chromium-review.googlesource.com/c/v8/v8/+/2822951 ++#if 0 ++ return Vec128{wasm_i8x16_swizzle(bytes.raw, from.raw)}; ++#else + alignas(16) uint8_t control[16]; + alignas(16) uint8_t input[16]; + alignas(16) uint8_t output[16]; + wasm_v128_store(control, from.raw); + wasm_v128_store(input, bytes.raw); + for (size_t i = 0; i < 16; ++i) { +- output[i] = input[control[i]]; ++ output[i] = control[i] < 16 ? input[control[i]] : 0; + } + return Vec128{wasm_v128_load(output)}; ++#endif + } + + // ------------------------------ Hard-coded shuffles +@@ -1673,101 +1830,102 @@ HWY_API Vec128 TableLookupBytes(co + + // Swap 32-bit halves in 64-bit halves. + HWY_API Vec128 Shuffle2301(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; + } + HWY_API Vec128 Shuffle2301(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; + } + HWY_API Vec128 Shuffle2301(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; + } + + // Swap 64-bit halves + HWY_API Vec128 Shuffle1032(const Vec128 v) { +- return Vec128{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)}; ++ return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; + } + HWY_API Vec128 Shuffle1032(const Vec128 v) { +- return Vec128{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)}; ++ return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; + } + HWY_API Vec128 Shuffle1032(const Vec128 v) { +- return Vec128{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)}; ++ return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; + } + + // Rotate right 32 bits + HWY_API Vec128 Shuffle0321(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; + } + HWY_API Vec128 Shuffle0321(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; + } + HWY_API Vec128 Shuffle0321(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; + } + // Rotate left 32 bits + HWY_API Vec128 Shuffle2103(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; + } + HWY_API Vec128 Shuffle2103(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; + } + HWY_API Vec128 Shuffle2103(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; 
++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; + } + + // Reverse + HWY_API Vec128 Shuffle0123(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; + } + HWY_API Vec128 Shuffle0123(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; + } + HWY_API Vec128 Shuffle0123(const Vec128 v) { +- return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; ++ return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; + } + + // ------------------------------ TableLookupLanes + + // Returned by SetTableIndices for use by TableLookupLanes. +-template ++template + struct Indices128 { + __v128_u raw; + }; + +-template +-HWY_API Indices128 SetTableIndices(Full128, const int32_t* idx) { ++template ++HWY_API Indices128 SetTableIndices(Simd d, const int32_t* idx) { + #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) +- const size_t N = 16 / sizeof(T); + for (size_t i = 0; i < N; ++i) { + HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); + } + #endif + +- const Full128 d8; +- alignas(16) uint8_t control[16]; // = Lanes() +- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { +- const size_t idx_lane = idx_byte / sizeof(T); +- const size_t mod = idx_byte % sizeof(T); +- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; ++ const Repartition d8; ++ alignas(16) uint8_t control[16] = {0}; ++ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { ++ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { ++ control[idx_lane * sizeof(T) + idx_byte] = ++ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); ++ } + } +- return Indices128{Load(d8, control).raw}; ++ return Indices128{Load(d8, control).raw}; + } + +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +- +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +- +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- const Full128 di; +- const Full128 df; ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ const Simd di; ++ const Simd df; + return BitCast(df, +- TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); ++ TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); + } + + // ------------------------------ Zip lanes +@@ -1778,33 +1936,33 @@ HWY_API Vec128 TableLookupLanes(c + template + HWY_API Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v8x16_shuffle( ++ return Vec128{wasm_i8x16_shuffle( + a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; + } + template + HWY_API Vec128 ZipLower(const Vec128 a, + const Vec128 b) { + return Vec128{ +- wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; ++ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; + } + + template + HWY_API Vec128 ZipLower(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v8x16_shuffle( ++ return Vec128{wasm_i8x16_shuffle( + a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; + } + template + HWY_API Vec128 
ZipLower(const Vec128 a, + const Vec128 b) { + return Vec128{ +- wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; ++ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; + } + + template + HWY_API Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, ++ return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, + 10, 26, 11, 27, 12, 28, 13, + 29, 14, 30, 15, 31)}; + } +@@ -1812,13 +1970,13 @@ template + HWY_API Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { + return Vec128{ +- wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; ++ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; + } + + template + HWY_API Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, ++ return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, + 10, 26, 11, 27, 12, 28, 13, + 29, 14, 30, 15, 31)}; + } +@@ -1826,7 +1984,7 @@ template + HWY_API Vec128 ZipUpper(const Vec128 a, + const Vec128 b) { + return Vec128{ +- wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; ++ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; + } + + // ------------------------------ Interleave lanes +@@ -1842,17 +2000,17 @@ HWY_API Vec128 InterleaveLower(const + template <> + HWY_INLINE Vec128 InterleaveLower( + const Vec128 a, const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; + } + template <> + HWY_INLINE Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; + } + template <> + HWY_INLINE Vec128 InterleaveLower(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; + } + + template +@@ -1862,17 +2020,17 @@ HWY_API Vec128 InterleaveUpper(const + template <> + HWY_INLINE Vec128 InterleaveUpper( + const Vec128 a, const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; + } + template <> + HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; + } + template <> + HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; + } + + // ------------------------------ Blocks +@@ -1880,13 +2038,13 @@ HWY_INLINE Vec128 InterleaveUpper + // hiH,hiL loH,loL |-> hiL,loL (= lower halves) + template + HWY_API Vec128 ConcatLowerLower(const Vec128 hi, const Vec128 lo) { +- return Vec128{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 2)}; ++ return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; + } + + // hiH,hiL loH,loL |-> hiH,loH (= upper halves) + template + HWY_API Vec128 ConcatUpperUpper(const Vec128 hi, const Vec128 lo) { +- return Vec128{wasm_v64x2_shuffle(lo.raw, hi.raw, 1, 3)}; ++ return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; + } + + // hiH,hiL loH,loL |-> hiL,loH (= inner halves) +@@ -1898,7 +2056,7 @@ HWY_API Vec128 ConcatLowerUpper(const + // hiH,hiL loH,loL |-> hiH,loL (= outer halves) + template + HWY_API Vec128 ConcatUpperLower(const Vec128 hi, 
const Vec128 lo) { +- return Vec128{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 3)}; ++ return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 3)}; + } + + // ------------------------------ Odd/even lanes +@@ -1917,12 +2075,12 @@ HWY_API Vec128 odd_even_impl(hwy::Siz + template + HWY_API Vec128 odd_even_impl(hwy::SizeTag<2> /* tag */, const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; ++ return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; + } + template + HWY_API Vec128 odd_even_impl(hwy::SizeTag<4> /* tag */, const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; + } + // TODO(eustas): implement + // template +@@ -1939,7 +2097,7 @@ HWY_API Vec128 OddEven(const Vec128 + HWY_INLINE Vec128 OddEven(const Vec128 a, + const Vec128 b) { +- return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; ++ return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; + } + + // ================================================== CONVERT +@@ -1950,52 +2108,52 @@ HWY_INLINE Vec128 OddEven( + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i16x8_widen_low_u8x16(v.raw)}; ++ return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{ +- wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))}; ++ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i16x8_widen_low_u8x16(v.raw)}; ++ return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{ +- wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))}; ++ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i32x4_widen_low_u16x8(v.raw)}; ++ return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i32x4_widen_low_u16x8(v.raw)}; ++ return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; + } + + // Signed: replicate sign bit. 
+ template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i16x8_widen_low_i8x16(v.raw)}; ++ return Vec128{wasm_i16x8_extend_low_i8x16(v.raw)}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { + return Vec128{ +- wasm_i32x4_widen_low_i16x8(wasm_i16x8_widen_low_i8x16(v.raw))}; ++ wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; + } + template + HWY_API Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i32x4_widen_low_i16x8(v.raw)}; ++ return Vec128{wasm_i32x4_extend_low_i16x8(v.raw)}; + } + + template +@@ -2122,7 +2280,7 @@ HWY_API Vec128 U8FromU32(con + wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; + } + +-// ------------------------------ Convert i32 <=> f32 ++// ------------------------------ Convert i32 <=> f32 (Round) + + template + HWY_API Vec128 ConvertTo(Simd /* tag */, +@@ -2133,33 +2291,16 @@ HWY_API Vec128 ConvertTo(Simd< + template + HWY_API Vec128 ConvertTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{wasm_i32x4_trunc_saturate_f32x4(v.raw)}; ++ return Vec128{wasm_i32x4_trunc_sat_f32x4(v.raw)}; + } + + template + HWY_API Vec128 NearestInt(const Vec128 v) { +- const __f32x4 c00 = wasm_f32x4_splat(0.0f); +- const __f32x4 corr = wasm_f32x4_convert_i32x4(wasm_f32x4_le(v.raw, c00)); +- const __f32x4 c05 = wasm_f32x4_splat(0.5f); +- // +0.5 for non-negative lane, -0.5 for other. +- const __f32x4 delta = wasm_f32x4_add(c05, corr); +- // Shift input by 0.5 away from 0. +- const __f32x4 fixed = wasm_f32x4_add(v.raw, delta); +- return Vec128{wasm_i32x4_trunc_saturate_f32x4(fixed)}; ++ return ConvertTo(Simd(), Round(v)); + } + + // ================================================== MISC + +-// Returns a vector with lane i=[0, N) set to "first" + i. +-template +-Vec128 Iota(const Simd d, const T2 first) { +- HWY_ALIGN T lanes[16 / sizeof(T)]; +- for (size_t i = 0; i < 16 / sizeof(T); ++i) { +- lanes[i] = static_cast(first + static_cast(i)); +- } +- return Load(d, lanes); +-} +- + // ------------------------------ Mask + + namespace detail { +@@ -2167,20 +2308,13 @@ namespace detail { + template + HWY_API uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { +- const __i8x16 slice = +- wasm_i8x16_make(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8); +- // Each u32 lane has byte[i] = (1 << i) or 0. +- const __i8x16 v8_4_2_1 = wasm_v128_and(mask.raw, slice); +- // OR together 4 bytes of each u32 to get the 4 bits. +- const __i16x8 v2_1_z_z = wasm_i32x4_shl(v8_4_2_1, 16); +- const __i16x8 v82_41_2_1 = wasm_v128_or(v8_4_2_1, v2_1_z_z); +- const __i16x8 v41_2_1_0 = wasm_i32x4_shl(v82_41_2_1, 8); +- const __i16x8 v8421_421_21_10 = wasm_v128_or(v82_41_2_1, v41_2_1_0); +- const __i16x8 nibble_per_u32 = wasm_i32x4_shr(v8421_421_21_10, 24); +- // Assemble four nibbles into 16 bits. 
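(Editorial note, not part of the patch.) The replacement BitsFromMask just below packs eight 0x00/0xFF mask bytes into eight bits with a single multiply: the 0x103070F1F3F80 constant places 2^i in the top result byte for every 0xFF at byte i, and the partial sums in the lower bytes never carry into that top byte. A scalar sketch of the trick:

#include <cstdint>

// bit i of the result is set iff byte i of `bytes` is 0xFF
// (every byte of `bytes` must be 0x00 or 0xFF).
uint32_t MovemaskBytes64(uint64_t bytes) {
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return static_cast<uint32_t>((bytes * kMagic) >> 56);
}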
+- alignas(16) uint32_t lanes[4]; +- wasm_v128_store(lanes, nibble_per_u32); +- return lanes[0] | (lanes[1] << 4) | (lanes[2] << 8) | (lanes[3] << 12); ++ alignas(16) uint64_t lanes[2]; ++ wasm_v128_store(lanes, mask.raw); ++ ++ constexpr uint64_t kMagic = 0x103070F1F3F80ULL; ++ const uint64_t lo = ((lanes[0] * kMagic) >> 56); ++ const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; ++ return (hi + lo); + } + + template +@@ -2241,8 +2375,7 @@ constexpr __i8x16 BytesAbove() { + + template + HWY_API uint64_t BitsFromMask(const Mask128 mask) { +- return OnlyActive( +- BitsFromMask(hwy::SizeTag(), mask)); ++ return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); + } + + template +@@ -2290,7 +2423,15 @@ HWY_API size_t CountTrue(const Mask128 + HWY_API bool AllFalse(const Mask128 m) { +- return !wasm_i8x16_any_true(m.raw); ++#if 0 ++ // Casting followed by wasm_i8x16_any_true results in wasm error: ++ // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128 ++ const auto v8 = BitCast(Full128(), VecFromMask(Full128(), m)); ++ return !wasm_i8x16_any_true(v8.raw); ++#else ++ return (wasm_i64x2_extract_lane(m.raw, 0) | ++ wasm_i64x2_extract_lane(m.raw, 1)) == 0; ++#endif + } + + // Full vector, type-dependent +@@ -2336,6 +2477,139 @@ HWY_API bool AllTrue(const Mask128 + namespace detail { + + template ++HWY_INLINE Vec128 Idx16x8FromBits(const uint64_t mask_bits) { ++ HWY_DASSERT(mask_bits < 256); ++ const Simd d; ++ const Rebind d8; ++ const Simd du; ++ ++ // We need byte indices for TableLookupBytes (one vector's worth for each of ++ // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We ++ // can instead store lane indices and convert to byte indices (2*lane + 0..1), ++ // with the doubling baked into the table. Unpacking nibbles is likely more ++ // costly than the higher cache footprint from storing bytes. 
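(Editorial note, not part of the patch.) The 256 x 8 table below stores doubled lane indices for every 8-bit mask, as the comment above describes; it can be generated mechanically. A sketch of such a generator:

#include <cstdint>
#include <cstdio>

int main() {
  for (uint32_t mask = 0; mask < 256; ++mask) {
    uint8_t row[8] = {0};
    uint32_t pos = 0;
    for (uint32_t i = 0; i < 8; ++i) {
      if (mask & (1u << i)) row[pos++] = static_cast<uint8_t>(2 * i);  // doubled lane index
    }
    for (uint32_t i = 0; i < 8; ++i) {
      std::printf("%u%s", static_cast<unsigned>(row[i]), i == 7 ? ",\n" : ", ");
    }
  }
  return 0;
}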
++ alignas(16) constexpr uint8_t table[256 * 8] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, ++ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, ++ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, ++ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, ++ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, ++ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, ++ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, ++ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, ++ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, ++ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, ++ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, ++ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, ++ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, ++ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, ++ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, ++ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, ++ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, ++ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, ++ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, ++ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, ++ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, ++ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, ++ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, ++ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, ++ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, ++ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, ++ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, ++ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, ++ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, ++ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, ++ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, ++ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, ++ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, ++ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, ++ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, ++ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, ++ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, ++ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, ++ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, ++ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, ++ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, ++ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, ++ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, ++ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, ++ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, ++ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, ++ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, ++ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, ++ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, ++ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, ++ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, ++ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, ++ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, ++ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, ++ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, ++ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, ++ 0, 
0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, ++ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, ++ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, ++ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, ++ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, ++ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, ++ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, ++ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, ++ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, ++ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, ++ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, ++ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, ++ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, ++ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, ++ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, ++ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, ++ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, ++ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, ++ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, ++ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, ++ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, ++ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, ++ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, ++ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, ++ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, ++ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, ++ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, ++ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, ++ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, ++ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, ++ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, ++ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, ++ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, ++ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, ++ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, ++ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, ++ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, ++ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, ++ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, ++ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, ++ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, ++ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, ++ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, ++ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, ++ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, ++ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, ++ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, ++ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, ++ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, ++ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, ++ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, ++ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, ++ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, ++ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, ++ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; ++ ++ const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; ++ const Vec128 pairs = ZipLower(byte_idx, byte_idx); ++ return BitCast(d, pairs + Set(du, 0x0100)); ++} ++ ++template 
+ HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + +@@ -2383,57 +2657,37 @@ HWY_INLINE Vec128 Idx64x2FromBits( + + #endif + +-// Helper function called by both Compress and CompressStore - avoids a ++// Helper functions called by both Compress and CompressStore - avoids a + // redundant BitsFromMask in the latter. + +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); ++template ++HWY_API Vec128 Compress(hwy::SizeTag<2> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ const auto idx = detail::Idx16x8FromBits(mask_bits); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); ++ ++template ++HWY_API Vec128 Compress(hwy::SizeTag<4> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ const auto idx = detail::Idx32x4FromBits(mask_bits); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } + +-#if HWY_CAP_INTEGER64 ++#if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64 + +-template +-HWY_API Vec128 Compress(Vec128 v, ++template ++HWY_API Vec128 Compress(hwy::SizeTag<8> /*tag*/, ++ Vec128 v, + const uint64_t mask_bits) { + const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-} +- +-#endif +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); +-} +- +-#if HWY_CAP_FLOAT64 +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } + + #endif +@@ -2442,7 +2696,8 @@ HWY_API Vec128 Compress(Vec12 + + template + HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { +- return detail::Compress(v, detail::BitsFromMask(mask)); ++ return detail::Compress(hwy::SizeTag(), v, ++ detail::BitsFromMask(mask)); + } + + // ------------------------------ CompressStore +@@ -2451,63 +2706,284 @@ template + HWY_API size_t CompressStore(Vec128 v, const Mask128 mask, + Simd d, T* HWY_RESTRICT aligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); +- Store(detail::Compress(v, mask_bits), d, aligned); ++ Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); + return PopCount(mask_bits); + } + ++// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, ++// TableLookupBytes) ++ ++// 128 bits ++HWY_API void StoreInterleaved3(const Vec128 a, const Vec128 b, ++ const Vec128 c, Full128 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const auto k5 = Set(d, 5); ++ const auto k6 = Set(d, 6); ++ ++ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. 
++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d, tbl_r0); ++ const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ StoreU(int0, d, unaligned + 0 * 16); ++ ++ // Second vector: g10,r10, bgr[9:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. ++ const auto r1 = TableLookupBytes(a, shuf_r1); ++ const auto g1 = TableLookupBytes(b, shuf_g1); ++ const auto b1 = TableLookupBytes(c, shuf_b1); ++ const auto int1 = r1 | g1 | b1; ++ StoreU(int1, d, unaligned + 1 * 16); ++ ++ // Third vector: bgr[15:11], b10 ++ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. ++ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. ++ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A ++ const auto r2 = TableLookupBytes(a, shuf_r2); ++ const auto g2 = TableLookupBytes(b, shuf_g2); ++ const auto b2 = TableLookupBytes(c, shuf_b2); ++ const auto int2 = r2 | g2 | b2; ++ StoreU(int2, d, unaligned + 2 * 16); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved3(const Vec128 a, ++ const Vec128 b, ++ const Vec128 c, Simd d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors for the shuffles and first result. ++ const Full128 d_full; ++ const auto k5 = Set(d_full, 5); ++ const auto k6 = Set(d_full, 6); ++ ++ const Vec128 full_a{a.raw}; ++ const Vec128 full_b{b.raw}; ++ const Vec128 full_c{c.raw}; ++ ++ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d_full, tbl_r0); ++ const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ StoreU(int0, d_full, unaligned + 0 * 16); ++ ++ // Second (HALF) vector: bgr[7:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // ..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // .7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // 7..6..5. 
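(Editorial note, not part of the patch.) StoreInterleaved3 writes planar a/b/c bytes (e.g. R, G, B planes) as interleaved triples; the shuffle tables above assemble three such output vectors per iteration. A scalar reference for the intended result, with made-up names:

#include <cstddef>
#include <cstdint>

void StoreInterleaved3Scalar(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                             size_t n, uint8_t* out) {
  for (size_t i = 0; i < n; ++i) {
    out[3 * i + 0] = r[i];
    out[3 * i + 1] = g[i];
    out[3 * i + 2] = b[i];
  }
}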
++ const auto r1 = TableLookupBytes(full_a, shuf_r1); ++ const auto g1 = TableLookupBytes(full_b, shuf_g1); ++ const auto b1 = TableLookupBytes(full_c, shuf_b1); ++ const decltype(Zero(d)) int1{(r1 | g1 | b1).raw}; ++ StoreU(int1, d, unaligned + 1 * 16); ++} ++ ++// <= 32 bits ++template ++HWY_API void StoreInterleaved3(const Vec128 a, ++ const Vec128 b, ++ const Vec128 c, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors for the shuffles and result. ++ const Full128 d_full; ++ ++ const Vec128 full_a{a.raw}; ++ const Vec128 full_b{b.raw}; ++ const Vec128 full_c{c.raw}; ++ ++ // Shuffle (a,b,c) vector bytes to bgr[3:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, // ++ 0x80, 0x80, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d_full, tbl_r0); ++ const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0); ++ const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0); ++ const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0 ++ const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0. ++ const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ alignas(16) uint8_t buf[16]; ++ StoreU(int0, d_full, buf); ++ CopyBytes(buf, unaligned); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++// 128 bits ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, Full128 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto ba8 = ZipUpper(v0, v1); ++ const auto dc8 = ZipUpper(v2, v3); ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 ++ const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8 ++ const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC ++ StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16); ++ StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16); ++ StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16); ++ StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved4(const Vec128 in0, ++ const Vec128 in1, ++ const Vec128 in2, ++ const Vec128 in3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors to reduce the number of stores. ++ const Vec128 v0{in0.raw}; ++ const Vec128 v1{in1.raw}; ++ const Vec128 v2{in2.raw}; ++ const Vec128 v3{in3.raw}; ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 ++ const Full128 d_full; ++ StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16); ++ StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16); ++} ++ ++// <= 32 bits ++template ++HWY_API void StoreInterleaved4(const Vec128 in0, ++ const Vec128 in1, ++ const Vec128 in2, ++ const Vec128 in3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors to reduce the number of stores. ++ const Vec128 v0{in0.raw}; ++ const Vec128 v1{in1.raw}; ++ const Vec128 v2{in2.raw}; ++ const Vec128 v3{in3.raw}; ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b3 a3 .. 
b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d3 c3 .. d0 c0 ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ alignas(16) uint8_t buf[16]; ++ const Full128 d_full; ++ StoreU(BitCast(d_full, dcba_0), d_full, buf); ++ CopyBytes<4 * N>(buf, unaligned); ++} ++ + // ------------------------------ Reductions + + namespace detail { + +-// For u32/i32/f32. +-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++// N=1 for any T: no-op ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++ ++// u32/i32/f32: ++ ++// N=2 ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++ ++// N=4 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = v3210 + v1032; + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); + } + +-// For u64/i64/f64. +-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++// u64/i64/f64: ++ ++// N=2 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return v10 + v01; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); + } + + } // namespace detail + +-// Supported for u/i/f 32/64. Returns the sum in each lane. ++// Supported for u/i/f 32/64. Returns the same value in each lane. 
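The reduction hunks above (and the corrected comment) return the result broadcast to every lane rather than a scalar: for four 32-bit lanes, the vector is combined with a half-swapped copy (Shuffle1032) and then with a rotated copy (Shuffle0321), so two combine steps leave the full sum, min or max in all lanes. A minimal scalar sketch of the sum case, assuming the usual lane numbering:

#include <cstdio>

int main() {
  const float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};

  // Shuffle1032: swap the two 64-bit halves, then add (lane i pairs with lane i^2).
  float t[4];
  for (int i = 0; i < 4; ++i) t[i] = v[i] + v[i ^ 2];

  // Shuffle0321: rotate by one lane, then add; every lane now holds the total.
  float r[4];
  for (int i = 0; i < 4; ++i) r[i] = t[i] + t[(i + 1) & 3];

  for (int i = 0; i < 4; ++i) std::printf("%g ", r[i]);  // 10 10 10 10
}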
+ template + HWY_API Vec128 SumOfLanes(const Vec128 v) { + return detail::SumOfLanes(hwy::SizeTag(), v); +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 2021-06-02 10:56:05.240904417 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -154,27 +154,28 @@ HWY_API Vec128 Zero(Simd + HWY_API Vec128 Set(Simd /* tag */, const uint8_t t) { +- return Vec128{_mm_set1_epi8(t)}; ++ return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const uint16_t t) { +- return Vec128{_mm_set1_epi16(t)}; ++ return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const uint32_t t) { +- return Vec128{_mm_set1_epi32(t)}; ++ return Vec128{_mm_set1_epi32(static_cast(t))}; + } + template + HWY_API Vec128 Set(Simd /* tag */, const uint64_t t) { +- return Vec128{_mm_set1_epi64x(t)}; ++ return Vec128{ ++ _mm_set1_epi64x(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const int8_t t) { +- return Vec128{_mm_set1_epi8(t)}; ++ return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const int16_t t) { +- return Vec128{_mm_set1_epi16(t)}; ++ return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const int32_t t) { +@@ -182,7 +183,8 @@ HWY_API Vec128 Set(Simd + HWY_API Vec128 Set(Simd /* tag */, const int64_t t) { +- return Vec128{_mm_set1_epi64x(t)}; ++ return Vec128{ ++ _mm_set1_epi64x(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const float t) { +@@ -510,7 +512,8 @@ HWY_API Mask128 Xor(const Mask128< + template + HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); +- return Mask128{m.raw}; ++ const Simd d; ++ return MaskFromVec(BitCast(Simd(), VecFromMask(d, m))); + } + + // ------------------------------ Equality +@@ -683,6 +686,14 @@ HWY_API Mask128 operator>=(co + return Mask128{_mm_cmpge_pd(a.raw, b.raw)}; + } + ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask128 FirstN(const Simd d, size_t num) { ++ const RebindToSigned di; // Signed comparisons are cheaper. ++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); ++} ++ + // ================================================== ARITHMETIC + + // ------------------------------ Addition +@@ -894,7 +905,7 @@ template + HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{_mm_abs_epi32(v.raw)}; + } +- ++// i64 is implemented after BroadcastSignBit. + template + HWY_API Vec128 Abs(const Vec128 v) { + const Vec128 mask{_mm_set1_epi32(0x7FFFFFFF)}; +@@ -959,7 +970,6 @@ HWY_API Vec128 Mu + + // ------------------------------ ShiftLeft + +-// Unsigned + template + HWY_API Vec128 ShiftLeft(const Vec128 v) { + return Vec128{_mm_slli_epi16(v.raw, kBits)}; +@@ -988,6 +998,16 @@ HWY_API Vec128 ShiftLeft(con + return Vec128{_mm_slli_epi64(v.raw, kBits)}; + } + ++template ++HWY_API Vec128 ShiftLeft(const Vec128 v) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. 
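The FirstN op added above builds a "first num lanes" mask without per-lane branching: an index vector (Iota) is compared against the broadcast count, using the signed lane type because signed compares are cheaper on x86. A minimal scalar sketch of the lane predicate it computes:

#include <cstdint>
#include <cstdio>

int main() {
  constexpr int kLanes = 4;   // assumed vector width for illustration
  const size_t num = 3;       // first `num` lanes active
  bool mask[kLanes];
  for (int32_t i = 0; i < kLanes; ++i) {
    // Equivalent of Iota(di, 0) < Set(di, static_cast<int32_t>(num)).
    mask[i] = i < static_cast<int32_t>(num);
  }
  for (int i = 0; i < kLanes; ++i) std::printf("%d ", mask[i] ? 1 : 0);  // 1 1 1 0
}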
++ const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; ++ return kBits == 1 ++ ? (v + v) ++ : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); ++} ++ + // ------------------------------ ShiftRight + + template +@@ -1004,6 +1024,15 @@ HWY_API Vec128 ShiftRight(c + } + + template ++HWY_API Vec128 ShiftRight(const Vec128 v) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftRight(Vec128{v.raw}).raw}; ++ return shifted & Set(d8, 0xFF >> kBits); ++} ++ ++template + HWY_API Vec128 ShiftRight(const Vec128 v) { + return Vec128{_mm_srai_epi16(v.raw, kBits)}; + } +@@ -1012,6 +1041,15 @@ HWY_API Vec128 ShiftRight(co + return Vec128{_mm_srai_epi32(v.raw, kBits)}; + } + ++template ++HWY_API Vec128 ShiftRight(const Vec128 v) { ++ const Simd di; ++ const Simd du; ++ const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // i64 is implemented after BroadcastSignBit. + + // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) +@@ -1039,15 +1077,24 @@ HWY_API Vec128 BroadcastSign + return VecFromMask(v < Zero(Simd())); + #else + // Efficient Gt() requires SSE4.2 but we only have SSE4.1. BLENDVPD requires +- // two constants and domain crossing. 32-bit compare only requires Zero() +- // plus a shuffle to replicate the upper 32 bits. ++ // two constants and domain crossing. 32-bit shift avoids generating a zero. + const Simd d32; +- const auto sign = BitCast(d32, v) < Zero(d32); ++ const auto sign = ShiftRight<31>(BitCast(d32, v)); + return Vec128{ + _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; + #endif + } + ++template ++HWY_API Vec128 Abs(const Vec128 v) { ++#if HWY_TARGET == HWY_AVX3 ++ return Vec128{_mm_abs_epi64(v.raw)}; ++#else ++ const auto zero = Zero(Simd()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++ + template + HWY_API Vec128 ShiftRight(const Vec128 v) { + #if HWY_TARGET == HWY_AVX3 +@@ -1097,6 +1144,15 @@ HWY_API Vec128 ShiftLeftSame + return Vec128{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++template ++HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec128 shifted{ ++ ShiftLeftSame(Vec128>{v.raw}, bits).raw}; ++ return shifted & Set(d8, (0xFF << bits) & 0xFF); ++} ++ + // ------------------------------ ShiftRightSame (BroadcastSignBit) + + template +@@ -1116,6 +1172,16 @@ HWY_API Vec128 ShiftRightSa + } + + template ++HWY_API Vec128 ShiftRightSame(Vec128 v, ++ const int bits) { ++ const Simd d8; ++ // Use raw instead of BitCast to support N=1. 
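The new i8 ShiftRight above works around the lack of an 8-bit arithmetic shift on x86: it shifts the bytes logically (via the unsigned type) and then sign-extends from bit (7 - kBits) with the classic (x ^ m) - m identity, where m = 0x80 >> kBits. A minimal per-byte scalar sketch, with a hypothetical helper name:

#include <cstdint>
#include <cstdio>

template <int kBits>
int8_t ShiftRightI8(int8_t v) {
  // Logical shift of the byte (done via 16-bit shifts plus masking in the SIMD code).
  const uint8_t shifted = static_cast<uint8_t>(static_cast<uint8_t>(v) >> kBits);
  // The original sign bit now sits at bit (7 - kBits).
  const uint8_t m = static_cast<uint8_t>(0x80 >> kBits);
  // (x ^ m) - m sign-extends from that bit; arithmetic wraps mod 256 like the lanes do.
  return static_cast<int8_t>(static_cast<uint8_t>((shifted ^ m) - m));
}

int main() {
  std::printf("%d %d %d\n", ShiftRightI8<2>(-100), ShiftRightI8<2>(100),
              ShiftRightI8<7>(-1));  // -25 25 -1
}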
++ const Vec128 shifted{ ++ ShiftRightSame(Vec128{v.raw}, bits).raw}; ++ return shifted & Set(d8, 0xFF >> bits); ++} ++ ++template + HWY_API Vec128 ShiftRightSame(const Vec128 v, + const int bits) { + return Vec128{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +@@ -1140,6 +1206,15 @@ HWY_API Vec128 ShiftRightSam + #endif + } + ++template ++HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { ++ const Simd di; ++ const Simd du; ++ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Negate + + template +@@ -1729,32 +1804,196 @@ HWY_API void Stream(const Vec128 ++HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128 v, ++ Simd /* tag */, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ if (N == 4) { ++ _mm_i32scatter_epi32(base, offset.raw, v.raw, 1); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1); ++ } ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128 v, ++ Simd /* tag */, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ if (N == 4) { ++ _mm_i32scatter_epi32(base, index.raw, v.raw, 4); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4); ++ } ++} ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128 v, ++ Simd /* tag */, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ if (N == 2) { ++ _mm_i64scatter_epi64(base, offset.raw, v.raw, 1); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1); ++ } ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128 v, ++ Simd /* tag */, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ if (N == 2) { ++ _mm_i64scatter_epi64(base, index.raw, v.raw, 8); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8); ++ } ++} ++ ++} // namespace detail ++ ++template ++HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); ++} ++template ++HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); ++} ++ ++template ++HWY_INLINE void ScatterOffset(Vec128 v, Simd /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec128 offset) { ++ if (N == 4) { ++ _mm_i32scatter_ps(base, offset.raw, v.raw, 1); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1); ++ } ++} ++template ++HWY_INLINE void ScatterIndex(Vec128 v, Simd /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec128 index) { ++ if (N == 4) { ++ _mm_i32scatter_ps(base, index.raw, v.raw, 4); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4); ++ } ++} ++ ++template ++HWY_INLINE void ScatterOffset(Vec128 v, Simd /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec128 offset) { ++ if (N == 2) { ++ _mm_i64scatter_pd(base, offset.raw, v.raw, 1); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i64scatter_pd(base, mask, offset.raw, 
v.raw, 1); ++ } ++} ++template ++HWY_INLINE void ScatterIndex(Vec128 v, Simd /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec128 index) { ++ if (N == 2) { ++ _mm_i64scatter_pd(base, index.raw, v.raw, 8); ++ } else { ++ const __mmask8 mask = (1u << N) - 1; ++ _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8); ++ } ++} ++#else // HWY_TARGET == HWY_AVX3 ++ ++template ++HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); ++ } ++} ++ ++template ++HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, ++ const Vec128 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ for (size_t i = 0; i < N; ++i) { ++ base[index_lanes[i]] = lanes[i]; ++ } ++} ++ ++#endif ++ ++// ------------------------------ Gather (Load/Store) ++ + #if HWY_TARGET == HWY_SSE4 + + template + HWY_API Vec128 GatherOffset(const Simd d, + const T* HWY_RESTRICT base, + const Vec128 offset) { +- static_assert(N == 1, "SSE4 does not support full gather"); +- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); +- const uintptr_t address = reinterpret_cast(base) + GetLane(offset); +- T val; +- CopyBytes(reinterpret_cast(address), &val); +- return Set(d, val); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ alignas(16) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ alignas(16) T lanes[N]; ++ const uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); ++ } ++ return Load(d, lanes); + } + + template + HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, + const Vec128 index) { +- static_assert(N == 1, "SSE4 does not support full gather"); +- static_assert(sizeof(T) == sizeof(Index), "T must match Index"); +- return Set(d, base[GetLane(index)]); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ alignas(16) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ alignas(16) T lanes[N]; ++ for (size_t i = 0; i < N; ++i) { ++ lanes[i] = base[index_lanes[i]]; ++ } ++ return Load(d, lanes); + } + + #else +@@ -1832,6 +2071,8 @@ HWY_API Vec128 GatherIndex(Si + + #endif // HWY_TARGET != HWY_SSE4 + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== SWIZZLE + + // ------------------------------ Extract half +@@ -1859,10 +2100,10 @@ HWY_INLINE Vec128 UpperHalf(V + // ------------------------------ Shift vector by constant #bytes + + // 0x01..0F, kBytes = 1 => 0x02..0F00 +-template +-HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { ++template ++HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); +- return Vec128{_mm_slli_si128(v.raw, kBytes)}; ++ return Vec128{_mm_slli_si128(v.raw, kBytes)}; + } + + template +@@ -1873,10 +2114,10 @@ HWY_API Vec128 ShiftLeftLanes(cons + } + + // 0x01..0F, kBytes = 1 => 0x0001..0E +-template +-HWY_API Vec128 ShiftRightBytes(const Vec128 v) { 
++template ++HWY_API Vec128 ShiftRightBytes(const Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); +- return Vec128{_mm_srli_si128(v.raw, kBytes)}; ++ return Vec128{_mm_srli_si128(v.raw, kBytes)}; + } + + template +@@ -2041,44 +2282,47 @@ HWY_API Vec128 Shuffle0123(const + // ------------------------------ TableLookupLanes + + // Returned by SetTableIndices for use by TableLookupLanes. +-template ++template + struct Indices128 { + __m128i raw; + }; + +-template +-HWY_API Indices128 SetTableIndices(Full128, const int32_t* idx) { ++template ++HWY_API Indices128 SetTableIndices(Simd d, const int32_t* idx) { + #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) +- const size_t N = 16 / sizeof(T); + for (size_t i = 0; i < N; ++i) { + HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); + } + #endif + +- const Full128 d8; +- alignas(16) uint8_t control[16]; +- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { +- const size_t idx_lane = idx_byte / sizeof(T); +- const size_t mod = idx_byte % sizeof(T); +- control[idx_byte] = static_cast(idx[idx_lane] * sizeof(T) + mod); ++ const Repartition d8; ++ alignas(16) uint8_t control[16] = {0}; ++ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { ++ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { ++ control[idx_lane * sizeof(T) + idx_byte] = ++ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); ++ } + } +- return Indices128{Load(d8, control).raw}; ++ return Indices128{Load(d8, control).raw}; + } + +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- const Full128 di; +- const Full128 df; ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ const Simd di; ++ const Simd df; + return BitCast(df, +- TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); ++ TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); + } + + // ------------------------------ Interleave lanes +@@ -2286,47 +2530,47 @@ HWY_INLINE Vec128 ConcatUpperLow + + namespace detail { + +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, +- const Vec128 b) { +- const Full128 d; +- const Full128 d8; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ const Simd d; ++ const Repartition d8; + alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); + } +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; + } +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x33)}; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<4> /* tag 
*/, const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x33)}; + } +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x0F)}; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x0F)}; + } + + } // namespace detail + +-template +-HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { ++template ++HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { + return detail::OddEven(hwy::SizeTag(), a, b); + } +-template <> +-HWY_INLINE Vec128 OddEven(const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; ++template ++HWY_INLINE Vec128 OddEven(const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; + } + +-template <> +-HWY_INLINE Vec128 OddEven(const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_pd(a.raw, b.raw, 1)}; ++template ++HWY_INLINE Vec128 OddEven(const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_pd(a.raw, b.raw, 1)}; + } + + // ------------------------------ Shl (ZipLower, Mul) +@@ -2764,7 +3008,7 @@ HWY_API Vec128 U8FromU32(con + return LowerHalf(LowerHalf(BitCast(d8, quad))); + } + +-// ------------------------------ Convert integer <=> floating point ++// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + + template + HWY_API Vec128 ConvertTo(Simd /* tag */, +@@ -2779,13 +3023,20 @@ HWY_API Vec128 ConvertTo(Simd + (void)dd; + return Vec128{_mm_cvtepi64_pd(v.raw)}; + #else +- alignas(16) int64_t lanes_i[2]; +- Store(v, Simd(), lanes_i); +- alignas(16) double lanes_d[2]; +- for (size_t i = 0; i < N; ++i) { +- lanes_d[i] = static_cast(lanes_i[i]); +- } +- return Load(dd, lanes_d); ++ // Based on wim's approach (https://stackoverflow.com/questions/41144668/) ++ const Repartition d32; ++ const Repartition d64; ++ ++ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 ++ const auto k84_63 = Set(d64, 0x4530000080000000ULL); ++ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); ++ ++ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) ++ const auto k52 = Set(d32, 0x43300000); ++ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); ++ ++ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); ++ return (v_upper - k84_63_52) + v_lower; // order matters! + #endif + } + +@@ -2922,6 +3173,142 @@ HWY_API size_t CountTrue(const Mask128 ++HWY_INLINE Vec128 Idx16x8FromBits(const uint64_t mask_bits) { ++ HWY_DASSERT(mask_bits < 256); ++ const Simd d; ++ const Rebind d8; ++ const Simd du; ++ ++ // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need ++ // byte indices for PSHUFB (one vector's worth for each of 256 combinations of ++ // 8 mask bits). Loading them directly would require 4 KiB. We can instead ++ // store lane indices and convert to byte indices (2*lane + 0..1), with the ++ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane ++ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. ++ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles ++ // is likely more costly than the higher cache footprint from storing bytes. 
++ alignas(16) constexpr uint8_t table[256 * 8] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, ++ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, ++ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, ++ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, ++ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, ++ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, ++ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, ++ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, ++ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, ++ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, ++ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, ++ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, ++ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, ++ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, ++ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, ++ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, ++ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, ++ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, ++ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, ++ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, ++ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, ++ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, ++ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, ++ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, ++ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, ++ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, ++ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, ++ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, ++ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, ++ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, ++ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, ++ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, ++ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, ++ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, ++ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, ++ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, ++ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, ++ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, ++ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, ++ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, ++ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, ++ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, ++ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, ++ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, ++ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, ++ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, ++ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, ++ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, ++ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, ++ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, ++ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, ++ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, ++ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, ++ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, ++ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, ++ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, ++ 0, 
0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, ++ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, ++ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, ++ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, ++ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, ++ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, ++ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, ++ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, ++ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, ++ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, ++ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, ++ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, ++ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, ++ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, ++ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, ++ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, ++ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, ++ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, ++ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, ++ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, ++ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, ++ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, ++ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, ++ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, ++ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, ++ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, ++ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, ++ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, ++ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, ++ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, ++ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, ++ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, ++ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, ++ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, ++ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, ++ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, ++ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, ++ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, ++ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, ++ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, ++ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, ++ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, ++ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, ++ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, ++ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, ++ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, ++ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, ++ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, ++ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, ++ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, ++ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, ++ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, ++ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, ++ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, ++ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; ++ ++ const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; ++ const Vec128 pairs = ZipLower(byte_idx, byte_idx); ++ return BitCast(d, pairs + Set(du, 0x0100)); ++} ++ ++template 
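The table above stores one byte per selected 16-bit lane (already doubled, so entry 6 means "lane 3"), which keeps it at 2 KiB instead of the 4 KiB a full byte-index table would need. The tail of the function widens each entry into a PSHUFB byte pair by zipping the vector with itself and adding 0x0100, turning index d into the pair d, d+1. A minimal scalar sketch of that expansion, with hypothetical buffer names:

#include <cstdint>
#include <cstdio>

int main() {
  // Table row for a hypothetical mask selecting 16-bit lanes 1, 3 and 5:
  // entries are byte offsets of the lane starts (lane * 2), zero-padded.
  const uint8_t lane_idx[8] = {2, 6, 10, 0, 0, 0, 0, 0};

  uint8_t byte_idx[16];
  for (int i = 0; i < 8; ++i) {
    // ZipLower(byte_idx, byte_idx): duplicate each byte into a 16-bit pair.
    const uint16_t pair = static_cast<uint16_t>(lane_idx[i] * 0x0101);
    // + 0x0100: the low byte stays d, the high byte becomes d + 1.
    const uint16_t shuffle = static_cast<uint16_t>(pair + 0x0100);
    byte_idx[2 * i + 0] = static_cast<uint8_t>(shuffle & 0xFF);
    byte_idx[2 * i + 1] = static_cast<uint8_t>(shuffle >> 8);
  }
  for (int i = 0; i < 16; ++i) std::printf("%d ", byte_idx[i]);
  std::printf("\n");  // 2 3 6 7 10 11 0 1 0 1 0 1 0 1 0 1
}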
+ HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 16); + +@@ -2968,71 +3355,42 @@ HWY_INLINE Vec128 Idx64x2FromBits( + // Helper function called by both Compress and CompressStore - avoids a + // redundant BitsFromMask in the latter. + +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_epi32(mask_bits, v.raw)}; +-#else +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-#endif +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_epi32(mask_bits, v.raw)}; +-#else +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-#endif +-} +- +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_epi64(mask_bits, v.raw)}; +-#else +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-#endif +-} +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_epi64(mask_bits, v.raw)}; +-#else +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- return TableLookupBytes(v, idx); +-#endif ++template ++HWY_API Vec128 Compress(hwy::SizeTag<2> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ const auto idx = detail::Idx16x8FromBits(mask_bits); ++ using D = Simd; ++ const RebindToSigned di; ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + } + +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { ++template ++HWY_API Vec128 Compress(hwy::SizeTag<4> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ using D = Simd; ++ using TI = MakeSigned; ++ const Rebind di; + #if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_ps(mask_bits, v.raw)}; ++ return BitCast(D(), Vec128{_mm_maskz_compress_epi32( ++ mask_bits, BitCast(di, v).raw)}); + #else +- const auto idx = detail::Idx32x4FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); ++ const auto idx = detail::Idx32x4FromBits(mask_bits); ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + #endif + } + +-template +-HWY_API Vec128 Compress(Vec128 v, +- const uint64_t mask_bits) { ++template ++HWY_API Vec128 Compress(hwy::SizeTag<8> /*tag*/, Vec128 v, ++ const uint64_t mask_bits) { ++ using D = Simd; ++ using TI = MakeSigned; ++ const Rebind di; + #if HWY_TARGET == HWY_AVX3 +- return Vec128{_mm_maskz_compress_pd(mask_bits, v.raw)}; ++ return BitCast(D(), Vec128{_mm_maskz_compress_epi64( ++ mask_bits, BitCast(di, v).raw)}); + #else +- const auto idx = detail::Idx64x2FromBits(mask_bits); +- const Simd df; +- const Simd di; +- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); ++ const auto idx = detail::Idx64x2FromBits(mask_bits); ++ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); + #endif + } + +@@ -3040,7 +3398,8 @@ HWY_API Vec128 Compress(Vec12 + + template + HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { +- return detail::Compress(v, detail::BitsFromMask(mask)); ++ return detail::Compress(hwy::SizeTag(), v, ++ detail::BitsFromMask(mask)); + } + + // ------------------------------ CompressStore +@@ -3050,63 +3409,285 @@ HWY_API size_t CompressStore(Vec128 d, T* 
HWY_RESTRICT aligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). +- Store(detail::Compress(v, mask_bits), d, aligned); ++ Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); + return PopCount(mask_bits); + } + ++// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, ++// TableLookupBytes) ++ ++// 128 bits ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, Full128 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const auto k5 = Set(d, 5); ++ const auto k6 = Set(d, 6); ++ ++ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d, tbl_r0); ++ const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ StoreU(int0, d, unaligned + 0 * 16); ++ ++ // Second vector: g10,r10, bgr[9:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. ++ const auto r1 = TableLookupBytes(v0, shuf_r1); ++ const auto g1 = TableLookupBytes(v1, shuf_g1); ++ const auto b1 = TableLookupBytes(v2, shuf_b1); ++ const auto int1 = r1 | g1 | b1; ++ StoreU(int1, d, unaligned + 1 * 16); ++ ++ // Third vector: bgr[15:11], b10 ++ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. ++ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. ++ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A ++ const auto r2 = TableLookupBytes(v0, shuf_r2); ++ const auto g2 = TableLookupBytes(v1, shuf_g2); ++ const auto b2 = TableLookupBytes(v2, shuf_b2); ++ const auto int2 = r2 | g2 | b2; ++ StoreU(int2, d, unaligned + 2 * 16); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, Simd d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors for the shuffles and first result. ++ const Full128 d_full; ++ const auto k5 = Set(d_full, 5); ++ const auto k6 = Set(d_full, 6); ++ ++ const Vec128 full_a{v0.raw}; ++ const Vec128 full_b{v1.raw}; ++ const Vec128 full_c{v2.raw}; ++ ++ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. 
++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d_full, tbl_r0); ++ const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ StoreU(int0, d_full, unaligned + 0 * 16); ++ ++ // Second (HALF) vector: bgr[7:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // ..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // .7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // 7..6..5. ++ const auto r1 = TableLookupBytes(full_a, shuf_r1); ++ const auto g1 = TableLookupBytes(full_b, shuf_g1); ++ const auto b1 = TableLookupBytes(full_c, shuf_b1); ++ const decltype(Zero(d)) int1{(r1 | g1 | b1).raw}; ++ StoreU(int1, d, unaligned + 1 * 16); ++} ++ ++// <= 32 bits ++template ++HWY_API void StoreInterleaved3(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors for the shuffles and result. ++ const Full128 d_full; ++ ++ const Vec128 full_a{v0.raw}; ++ const Vec128 full_b{v1.raw}; ++ const Vec128 full_c{v2.raw}; ++ ++ // Shuffle (v0,v1,v2) vector bytes to bgr[3:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, // ++ 0x80, 0x80, 0x80, 0x80}; ++ const auto shuf_r0 = Load(d_full, tbl_r0); ++ const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0); ++ const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0); ++ const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0 ++ const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0. ++ const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0.. ++ const auto int0 = r0 | g0 | b0; ++ alignas(16) uint8_t buf[16]; ++ StoreU(int0, d_full, buf); ++ CopyBytes(buf, unaligned); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++// 128 bits ++HWY_API void StoreInterleaved4(const Vec128 v0, ++ const Vec128 v1, ++ const Vec128 v2, ++ const Vec128 v3, Full128 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto ba8 = ZipUpper(v0, v1); ++ const auto dc8 = ZipUpper(v2, v3); ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 ++ const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8 ++ const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC ++ StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16); ++ StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16); ++ StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16); ++ StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16); ++} ++ ++// 64 bits ++HWY_API void StoreInterleaved4(const Vec128 in0, ++ const Vec128 in1, ++ const Vec128 in2, ++ const Vec128 in3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors to reduce the number of stores. 
++ const Vec128 v0{in0.raw}; ++ const Vec128 v1{in1.raw}; ++ const Vec128 v2{in2.raw}; ++ const Vec128 v3{in3.raw}; ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 ++ const Full128 d_full; ++ StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16); ++ StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16); ++} ++ ++// <= 32 bits ++template ++HWY_API void StoreInterleaved4(const Vec128 in0, ++ const Vec128 in1, ++ const Vec128 in2, ++ const Vec128 in3, ++ Simd /*tag*/, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // Use full vectors to reduce the number of stores. ++ const Vec128 v0{in0.raw}; ++ const Vec128 v1{in1.raw}; ++ const Vec128 v2{in2.raw}; ++ const Vec128 v3{in3.raw}; ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b3 a3 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d3 c3 .. d0 c0 ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 ++ alignas(16) uint8_t buf[16]; ++ const Full128 d_full; ++ StoreU(BitCast(d_full, dcba_0), d_full, buf); ++ CopyBytes<4 * N>(buf, unaligned); ++} ++ + // ------------------------------ Reductions + + namespace detail { + +-// For u32/i32/f32. +-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++// N=1 for any T: no-op ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++ ++// u32/i32/f32: ++ ++// N=2 ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++ ++// N=4 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = v3210 + v1032; + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); + } + +-// For u64/i64/f64. 
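The StoreInterleaved4 hunks above need no shuffle tables at all: zipping the bytes of (v0,v1) and of (v2,v3) yields 16-bit "b a" and "d c" pairs, and zipping those pairs yields 32-bit "d c b a" groups, which in little-endian lane order is exactly the a,b,c,d byte interleave written to memory. A minimal scalar sketch for four lanes:

#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t v0[4] = {0xA0, 0xA1, 0xA2, 0xA3};  // "a" lanes
  const uint8_t v1[4] = {0xB0, 0xB1, 0xB2, 0xB3};  // "b" lanes
  const uint8_t v2[4] = {0xC0, 0xC1, 0xC2, 0xC3};  // "c" lanes
  const uint8_t v3[4] = {0xD0, 0xD1, 0xD2, 0xD3};  // "d" lanes

  // ZipLower(v0, v1) and ZipLower(v2, v3): byte pairs with a/c in the low byte.
  uint16_t ba[4], dc[4];
  for (int i = 0; i < 4; ++i) {
    ba[i] = static_cast<uint16_t>(v0[i] | (v1[i] << 8));
    dc[i] = static_cast<uint16_t>(v2[i] | (v3[i] << 8));
  }

  // ZipLower(ba, dc): 32-bit groups whose little-endian bytes are a, b, c, d.
  uint8_t out[16];
  for (int i = 0; i < 4; ++i) {
    out[4 * i + 0] = static_cast<uint8_t>(ba[i]);
    out[4 * i + 1] = static_cast<uint8_t>(ba[i] >> 8);
    out[4 * i + 2] = static_cast<uint8_t>(dc[i]);
    out[4 * i + 3] = static_cast<uint8_t>(dc[i] >> 8);
  }
  for (int i = 0; i < 16; ++i) std::printf("%02X ", out[i]);
  // A0 B0 C0 D0 A1 B1 C1 D1 A2 B2 C2 D2 A3 B3 C3 D3
}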
+-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++// u64/i64/f64: ++ ++// N=2 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return v10 + v01; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); + } + + } // namespace detail + +-// Supported for u/i/f 32/64. Returns the sum in each lane. ++// Supported for u/i/f 32/64. Returns the same value in each lane. + template + HWY_API Vec128 SumOfLanes(const Vec128 v) { + return detail::SumOfLanes(hwy::SizeTag(), v); +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 2021-06-02 10:56:05.234904387 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -20,6 +20,20 @@ + // particular, "Broadcast", pack and zip behavior may be surprising. + + #include // AVX2+ ++ ++#if defined(_MSC_VER) && defined(__clang__) ++// Including should be enough, but Clang's headers helpfully skip ++// including these headers when _MSC_VER is defined, like when using clang-cl. ++// Include these directly here. ++#include ++// avxintrin defines __m256i and must come before avx2intrin. ++#include ++#include // _pext_u64 ++#include ++#include ++#include ++#endif ++ + #include + #include + +@@ -148,23 +162,24 @@ HWY_API Vec256 Set(Full256{_mm256_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const uint32_t t) { +- return Vec256{_mm256_set1_epi32(static_cast(t))}; // NOLINT ++ return Vec256{_mm256_set1_epi32(static_cast(t))}; + } + HWY_API Vec256 Set(Full256 /* tag */, const uint64_t t) { + return Vec256{ + _mm256_set1_epi64x(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const int8_t t) { +- return Vec256{_mm256_set1_epi8(t)}; ++ return Vec256{_mm256_set1_epi8(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const int16_t t) { +- return Vec256{_mm256_set1_epi16(t)}; ++ return Vec256{_mm256_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const int32_t t) { + return Vec256{_mm256_set1_epi32(t)}; + } + HWY_API Vec256 Set(Full256 /* tag */, const int64_t t) { +- return Vec256{_mm256_set1_epi64x(t)}; ++ return Vec256{ ++ _mm256_set1_epi64x(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const float t) { + return Vec256{_mm256_set1_ps(t)}; +@@ -340,6 +355,8 @@ HWY_API Vec256 VecFromMask(Full256 + return Vec256{v.raw}; + } + ++// ------------------------------ IfThenElse ++ + // mask ? yes : no + template + HWY_API Vec256 IfThenElse(const Mask256 mask, const Vec256 yes, +@@ -412,9 +429,9 @@ HWY_API Mask256 Xor(const Mask256 + // Comparisons fill a lane with 1-bits if the condition is true, else 0. 
+ + template +-HWY_API Mask256 RebindMask(Full256 /*tag*/, Mask256 m) { ++HWY_API Mask256 RebindMask(Full256 d_to, Mask256 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); +- return Mask256{m.raw}; ++ return MaskFromVec(BitCast(d_to, VecFromMask(Full256(), m))); + } + + // ------------------------------ Equality +@@ -670,6 +687,14 @@ HWY_API Vec256 Max(const Vec256< + return Vec256{_mm256_max_pd(a.raw, b.raw)}; + } + ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask256 FirstN(const Full256 d, size_t n) { ++ const RebindToSigned di; // Signed comparisons are cheaper. ++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(n))); ++} ++ + // ================================================== ARITHMETIC + + // ------------------------------ Addition +@@ -832,7 +857,13 @@ HWY_API Vec256 AverageRound(co + + // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. + HWY_API Vec256 Abs(const Vec256 v) { ++#if HWY_COMPILER_MSVC ++ // Workaround for incorrect codegen? (wrong result) ++ const auto zero = Zero(Full256()); ++ return Vec256{_mm256_max_epi8(v.raw, (zero - v).raw)}; ++#else + return Vec256{_mm256_abs_epi8(v.raw)}; ++#endif + } + HWY_API Vec256 Abs(const Vec256 v) { + return Vec256{_mm256_abs_epi16(v.raw)}; +@@ -840,6 +871,7 @@ HWY_API Vec256 Abs(const Vec256 + HWY_API Vec256 Abs(const Vec256 v) { + return Vec256{_mm256_abs_epi32(v.raw)}; + } ++// i64 is implemented after BroadcastSignBit. + + HWY_API Vec256 Abs(const Vec256 v) { + const Vec256 mask{_mm256_set1_epi32(0x7FFFFFFF)}; +@@ -925,6 +957,16 @@ HWY_API Vec256 ShiftLeft(const + return Vec256{_mm256_slli_epi64(v.raw, kBits)}; + } + ++template ++HWY_API Vec256 ShiftLeft(const Vec256 v) { ++ const Full256 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); ++ return kBits == 1 ++ ? (v + v) ++ : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); ++} ++ + // ------------------------------ ShiftRight + + template +@@ -943,6 +985,14 @@ HWY_API Vec256 ShiftRight(cons + } + + template ++HWY_API Vec256 ShiftRight(const Vec256 v) { ++ const Full256 d8; ++ // Use raw instead of BitCast to support N=1. ++ const Vec256 shifted{ShiftRight(Vec256{v.raw}).raw}; ++ return shifted & Set(d8, 0xFF >> kBits); ++} ++ ++template + HWY_API Vec256 ShiftRight(const Vec256 v) { + return Vec256{_mm256_srai_epi16(v.raw, kBits)}; + } +@@ -952,6 +1002,15 @@ HWY_API Vec256 ShiftRight(const + return Vec256{_mm256_srai_epi32(v.raw, kBits)}; + } + ++template ++HWY_API Vec256 ShiftRight(const Vec256 v) { ++ const Full256 di; ++ const Full256 du; ++ const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // i64 is implemented after BroadcastSignBit. 
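The u8 ShiftLeft/ShiftRight overloads added above compensate for the missing 8-bit shifts on x86: the bytes are shifted as wider 16-bit lanes and the bits that cross over from the neighbouring byte are cleared with a mask ((0xFF << kBits) & 0xFF for left shifts, 0xFF >> kBits for right shifts). A minimal scalar sketch of the left-shift case over one 16-bit pair:

#include <cstdint>
#include <cstdio>

template <int kBits>
void ShiftLeftBytesAs16(uint8_t bytes[2]) {
  // Treat the byte pair as one little-endian 16-bit lane and shift it whole.
  uint16_t lane = static_cast<uint16_t>(bytes[0] | (bytes[1] << 8));
  lane = static_cast<uint16_t>(lane << kBits);
  // Clear the bits that leaked in from the lower byte; the SIMD code applies
  // this mask to every byte of the vector at once.
  const uint8_t mask = static_cast<uint8_t>((0xFF << kBits) & 0xFF);
  bytes[0] = static_cast<uint8_t>(lane & 0xFF) & mask;
  bytes[1] = static_cast<uint8_t>(lane >> 8) & mask;
}

int main() {
  uint8_t b[2] = {0x81, 0x7F};
  ShiftLeftBytesAs16<1>(b);
  std::printf("%02X %02X\n", b[0], b[1]);  // 02 FE, i.e. each byte shifted independently
}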
+ + // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) +@@ -989,6 +1048,15 @@ HWY_API Vec256 ShiftRight(const + #endif + } + ++HWY_API Vec256 Abs(const Vec256 v) { ++#if HWY_TARGET == HWY_AVX3 ++ return Vec256{_mm256_abs_epi64(v.raw)}; ++#else ++ const auto zero = Zero(Full256()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++ + // ------------------------------ ShiftLeftSame + + HWY_API Vec256 ShiftLeftSame(const Vec256 v, +@@ -1016,6 +1084,14 @@ HWY_API Vec256 ShiftLeftSame(co + return Vec256{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++template ++HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { ++ const Full256 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); ++ return shifted & Set(d8, (0xFF << bits) & 0xFF); ++} ++ + // ------------------------------ ShiftRightSame (BroadcastSignBit) + + HWY_API Vec256 ShiftRightSame(const Vec256 v, +@@ -1031,6 +1107,13 @@ HWY_API Vec256 ShiftRightSame( + return Vec256{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { ++ const Full256 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); ++ return shifted & Set(d8, 0xFF >> bits); ++} ++ + HWY_API Vec256 ShiftRightSame(const Vec256 v, + const int bits) { + return Vec256{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +@@ -1053,6 +1136,14 @@ HWY_API Vec256 ShiftRightSame(c + #endif + } + ++HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { ++ const Full256 di; ++ const Full256 du; ++ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Negate + + template +@@ -1335,6 +1426,123 @@ HWY_API void Stream(const Vec256 + _mm256_stream_pd(aligned, v.raw); + } + ++// ------------------------------ Scatter ++ ++// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
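Without AVX-512's _mm256_abs_epi64, the i64 Abs added above broadcasts the sign bit into a whole-lane mask and selects 0 - v for negative lanes. A minimal per-lane scalar sketch of that select, with a hypothetical helper name:

#include <cstdint>
#include <cstdio>

int64_t AbsViaSignMask(int64_t v) {
  const uint64_t u = static_cast<uint64_t>(v);
  // BroadcastSignBit: all-ones if v < 0, else all-zeros.
  const uint64_t mask = static_cast<uint64_t>(-static_cast<int64_t>(u >> 63));
  // IfThenElse(mask, 0 - v, v); INT64_MIN wraps to itself, as documented.
  const uint64_t negated = 0u - u;
  return static_cast<int64_t>((mask & negated) | (~mask & u));
}

int main() {
  std::printf("%lld %lld\n", (long long)AbsViaSignMask(-42),
              (long long)AbsViaSignMask(42));  // 42 42
}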
++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") ++ ++#if HWY_TARGET == HWY_AVX3 ++namespace detail { ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256 v, ++ Full256 /* tag */, T* HWY_RESTRICT base, ++ const Vec256 offset) { ++ _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1); ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256 v, ++ Full256 /* tag */, T* HWY_RESTRICT base, ++ const Vec256 index) { ++ _mm256_i32scatter_epi32(base, index.raw, v.raw, 4); ++} ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256 v, ++ Full256 /* tag */, T* HWY_RESTRICT base, ++ const Vec256 offset) { ++ _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1); ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256 v, ++ Full256 /* tag */, T* HWY_RESTRICT base, ++ const Vec256 index) { ++ _mm256_i64scatter_epi64(base, index.raw, v.raw, 8); ++} ++ ++} // namespace detail ++ ++template ++HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, ++ const Vec256 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); ++} ++template ++HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, ++ const Vec256 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); ++} ++ ++template <> ++HWY_INLINE void ScatterOffset(Vec256 v, Full256 /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec256 offset) { ++ _mm256_i32scatter_ps(base, offset.raw, v.raw, 1); ++} ++template <> ++HWY_INLINE void ScatterIndex(Vec256 v, Full256 /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec256 index) { ++ _mm256_i32scatter_ps(base, index.raw, v.raw, 4); ++} ++ ++template <> ++HWY_INLINE void ScatterOffset(Vec256 v, ++ Full256 /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec256 offset) { ++ _mm256_i64scatter_pd(base, offset.raw, v.raw, 1); ++} ++template <> ++HWY_INLINE void ScatterIndex(Vec256 v, ++ Full256 /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec256 index) { ++ _mm256_i64scatter_pd(base, index.raw, v.raw, 8); ++} ++ ++#else ++ ++template ++HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, ++ const Vec256 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ ++ constexpr size_t N = 32 / sizeof(T); ++ alignas(32) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(32) Offset offset_lanes[N]; ++ Store(offset, Simd(), offset_lanes); ++ ++ uint8_t* base_bytes = reinterpret_cast(base); ++ for (size_t i = 0; i < N; ++i) { ++ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); ++ } ++} ++ ++template ++HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, ++ const Vec256 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ ++ constexpr size_t N = 32 / sizeof(T); ++ alignas(32) T lanes[N]; ++ Store(v, d, lanes); ++ ++ alignas(32) Index index_lanes[N]; ++ Store(index, Simd(), index_lanes); ++ ++ for (size_t i = 0; i < N; ++i) { ++ base[index_lanes[i]] = lanes[i]; ++ } ++} ++ ++#endif ++ + // ------------------------------ Gather + + namespace detail { +@@ -1374,13 +1582,13 @@ HWY_API Vec256 GatherIndex(hwy::SizeT + template + HWY_API Vec256 GatherOffset(Full256 d, const T* HWY_RESTRICT base, + const Vec256 offset) { +- static_assert(sizeof(T) == sizeof(Offset), "SVE requires 
same size base/ofs"); ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::GatherOffset(hwy::SizeTag(), d, base, offset); + } + template + HWY_API Vec256 GatherIndex(Full256 d, const T* HWY_RESTRICT base, + const Vec256 index) { +- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx"); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::GatherIndex(hwy::SizeTag(), d, base, index); + } + +@@ -1410,6 +1618,8 @@ HWY_INLINE Vec256 GatherIndex{_mm256_i64gather_pd(base, index.raw, 8)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== SWIZZLE + + template +@@ -1861,38 +2071,26 @@ HWY_API Vec256 ZipUpper(const V + return Vec256{_mm256_unpackhi_epi32(a.raw, b.raw)}; + } + +-// ------------------------------ Blocks ++// ------------------------------ Blocks (LowerHalf, ZeroExtendVector) ++ ++// _mm256_broadcastsi128_si256 has 7 cycle latency. _mm256_permute2x128_si256 is ++// slow on Zen1 (8 uops); we can avoid it for LowerLower and UpperLower, and on ++// UpperUpper at the cost of one extra cycle/instruction. + + // hiH,hiL loH,loL |-> hiL,loL (= lower halves) + template + HWY_API Vec256 ConcatLowerLower(const Vec256 hi, const Vec256 lo) { +- return Vec256{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x20)}; ++ return Vec256{_mm256_inserti128_si256(lo.raw, LowerHalf(hi).raw, 1)}; + } + template <> + HWY_INLINE Vec256 ConcatLowerLower(const Vec256 hi, + const Vec256 lo) { +- return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x20)}; ++ return Vec256{_mm256_insertf128_ps(lo.raw, LowerHalf(hi).raw, 1)}; + } + template <> + HWY_INLINE Vec256 ConcatLowerLower(const Vec256 hi, + const Vec256 lo) { +- return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x20)}; +-} +- +-// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +-template +-HWY_API Vec256 ConcatUpperUpper(const Vec256 hi, const Vec256 lo) { +- return Vec256{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)}; +-} +-template <> +-HWY_INLINE Vec256 ConcatUpperUpper(const Vec256 hi, +- const Vec256 lo) { +- return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)}; +-} +-template <> +-HWY_INLINE Vec256 ConcatUpperUpper(const Vec256 hi, +- const Vec256 lo) { +- return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)}; ++ return Vec256{_mm256_insertf128_pd(lo.raw, LowerHalf(hi).raw, 1)}; + } + + // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) +@@ -1927,6 +2125,12 @@ HWY_INLINE Vec256 ConcatUpperLow + return Vec256{_mm256_blend_pd(hi.raw, lo.raw, 3)}; + } + ++// hiH,hiL loH,loL |-> hiH,loH (= upper halves) ++template ++HWY_API Vec256 ConcatUpperUpper(const Vec256 hi, const Vec256 lo) { ++ return ConcatUpperLower(hi, ZeroExtendVector(UpperHalf(lo))); ++} ++ + // ------------------------------ Odd/even lanes + + namespace detail { +@@ -2211,11 +2415,18 @@ HWY_API Vec128 DemoteTo(Full128< + _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))}; + } + ++ // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'". ++ // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here. 
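For reference (not part of the patch itself): on targets without AVX-512 scatter instructions, the ScatterOffset/ScatterIndex fallbacks above spill the vector and the offsets/indices to stack arrays and finish with a scalar loop. A self-contained scalar sketch of the same idea, with illustrative names rather than the library's API:

#include <cstdint>
#include <cstring>

// Emulated scatter: write lane i of `lanes` to base_bytes + offsets[i]
// (byte offset) or to base[indices[i]] (element index), as in the fallback above.
template <typename T, typename Offset, size_t N>
void ScatterOffsetScalar(const T (&lanes)[N], uint8_t* base_bytes,
                         const Offset (&offsets)[N]) {
  for (size_t i = 0; i < N; ++i) {
    std::memcpy(base_bytes + offsets[i], &lanes[i], sizeof(T));
  }
}

template <typename T, typename Index, size_t N>
void ScatterIndexScalar(const T (&lanes)[N], T* base, const Index (&indices)[N]) {
  for (size_t i = 0; i < N; ++i) {
    base[indices[i]] = lanes[i];
  }
}

int main() {
  float dst[8] = {0};
  const float lanes[4] = {1.f, 2.f, 3.f, 4.f};
  const int32_t idx[4] = {6, 0, 3, 1};
  ScatterIndexScalar(lanes, dst, idx);  // dst = {2, 4, 0, 3, 0, 0, 1, 0}
  return 0;
}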
++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion") ++ + HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{_mm256_cvtpd_ps(v.raw)}; +@@ -2241,7 +2452,7 @@ HWY_API Vec128 U8FromU32(con + return BitCast(Simd(), pair); + } + +-// ------------------------------ Convert integer <=> floating point ++// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + + HWY_API Vec256 ConvertTo(Full256 /* tag */, + const Vec256 v) { +@@ -2253,13 +2464,20 @@ HWY_API Vec256 ConvertTo(Full256 + (void)dd; + return Vec256{_mm256_cvtepi64_pd(v.raw)}; + #else +- alignas(32) int64_t lanes_i[4]; +- Store(v, Full256(), lanes_i); +- alignas(32) double lanes_d[4]; +- for (size_t i = 0; i < 4; ++i) { +- lanes_d[i] = static_cast(lanes_i[i]); +- } +- return Load(dd, lanes_d); ++ // Based on wim's approach (https://stackoverflow.com/questions/41144668/) ++ const Repartition d32; ++ const Repartition d64; ++ ++ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 ++ const auto k84_63 = Set(d64, 0x4530000080000000ULL); ++ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); ++ ++ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) ++ const auto k52 = Set(d32, 0x43300000); ++ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); ++ ++ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); ++ return (v_upper - k84_63_52) + v_lower; // order matters! + #endif + } + +@@ -2334,8 +2552,7 @@ HWY_API uint64_t BitsFromMask(hwy::SizeT + const auto compressed = + _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0)); + return static_cast(_mm256_movemask_epi8(compressed)); +- +-#endif ++#endif // HWY_ARCH_X86_64 + } + + template +@@ -2473,75 +2690,100 @@ HWY_INLINE Vec256 Idx64x4FromB + return Load(d32, packed_array + 8 * mask_bits); + } + +-// Helper function called by both Compress and CompressStore - avoids a ++// Helper functions called by both Compress and CompressStore - avoids a + // redundant BitsFromMask in the latter. 
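For reference (not part of the patch itself): the new non-AVX-512 path of ConvertTo for int64 to double above replaces the store/loop/load fallback with wim's bit trick. The high 32 bits are embedded in a double with exponent 2^84 (re-biased by 2^63) and the low 32 bits in a double with exponent 2^52; subtracting the constant 2^84 + 2^63 + 2^52 then cancels the biases. A standalone scalar sketch of the same arithmetic, illustrative only:

#include <cstdint>
#include <cstdio>
#include <cstring>

// int64 -> double without a native conversion, mirroring the vector code above.
static double Int64ToDouble(int64_t x) {
  const uint64_t bits = static_cast<uint64_t>(x);
  const uint32_t lo = static_cast<uint32_t>(bits);        // low 32 bits
  const uint32_t hi = static_cast<uint32_t>(bits >> 32);  // high 32 bits

  // upper = 2^84 + 2^63 + signed(hi) * 2^32  (hi ^ 0x80000000 re-biases hi)
  uint64_t upper_bits = 0x4530000000000000ULL | (hi ^ 0x80000000u);
  // lower = 2^52 + lo
  uint64_t lower_bits = 0x4330000000000000ULL | lo;
  // k = 2^84 + 2^63 + 2^52, the same bit pattern as the patch's 0x4530000080100000
  uint64_t k_bits = 0x4530000080100000ULL;

  double upper, lower, k;
  std::memcpy(&upper, &upper_bits, sizeof(upper));
  std::memcpy(&lower, &lower_bits, sizeof(lower));
  std::memcpy(&k, &k_bits, sizeof(k));
  return (upper - k) + lower;  // order matters: cancel the big constants first
}

int main() {
  std::printf("%.1f %.1f\n", Int64ToDouble(-987654321987LL), Int64ToDouble(42));
  return 0;
}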
+ +-HWY_API Vec256 Compress(Vec256 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)}; +-#else +- const Vec256 idx = detail::Idx32x8FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; +-#endif +-} +-HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { ++template ++HWY_API Vec256 Compress(hwy::SizeTag<4> /*tag*/, Vec256 v, ++ const uint64_t mask_bits) { ++ const auto vu = BitCast(Full256(), v); + #if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)}; ++ const __m256i ret = ++ _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), vu.raw); + #else + const Vec256 idx = detail::Idx32x8FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; ++ const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw); + #endif ++ return BitCast(Full256(), Vec256{ret}); + } + +-HWY_API Vec256 Compress(Vec256 v, +- const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)}; +-#else +- const Vec256 idx = detail::Idx64x4FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; +-#endif +-} +-HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { ++template ++HWY_API Vec256 Compress(hwy::SizeTag<8> /*tag*/, Vec256 v, ++ const uint64_t mask_bits) { ++ const auto vu = BitCast(Full256(), v); + #if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)}; ++ const __m256i ret = ++ _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), vu.raw); + #else + const Vec256 idx = detail::Idx64x4FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; ++ const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw); + #endif ++ return BitCast(Full256(), Vec256{ret}); + } + +-HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_ps(static_cast<__mmask8>(mask_bits), v.raw)}; +-#else +- const Vec256 idx = detail::Idx32x8FromBits(mask_bits); +- return Vec256{_mm256_permutevar8x32_ps(v.raw, idx.raw)}; +-#endif +-} ++// Otherwise, defined in x86_512-inl.h so it can use wider vectors. ++#if HWY_TARGET != HWY_AVX3 + +-HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { +-#if HWY_TARGET == HWY_AVX3 +- return Vec256{ +- _mm256_maskz_compress_pd(static_cast<__mmask8>(mask_bits), v.raw)}; +-#else +- const Vec256 idx = detail::Idx64x4FromBits(mask_bits); +- return Vec256{_mm256_castsi256_pd( +- _mm256_permutevar8x32_epi32(_mm256_castpd_si256(v.raw), idx.raw))}; +-#endif ++// LUTs are infeasible for 2^16 possible masks. Promoting to 32-bit and using ++// the native Compress is probably more efficient than 2 LUTs. 
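For reference (not part of the patch itself): Compress left-packs the lanes whose mask bit is set, and the refactor above routes every lane type through an unsigned BitCast so that one helper per lane size covers signed, unsigned and floating-point inputs. A scalar model of the semantics, illustrative rather than the library API:

#include <cassert>
#include <cstdint>

// Scalar model of Compress/CompressStore: lanes whose mask bit is set are
// packed to the front, preserving order; returns how many were kept.
static size_t CompressScalar(const uint32_t* lanes, uint64_t mask_bits, size_t n,
                             uint32_t* out) {
  size_t count = 0;
  for (size_t i = 0; i < n; ++i) {
    if (mask_bits & (1ULL << i)) out[count++] = lanes[i];
  }
  return count;
}

int main() {
  const uint32_t in[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  uint32_t out[8] = {0};
  const size_t kept = CompressScalar(in, 0b10100110, 8, out);
  assert(kept == 4);
  assert(out[0] == 11 && out[1] == 12 && out[2] == 15 && out[3] == 17);
  return 0;
}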
++template ++HWY_API Vec256 Compress(hwy::SizeTag<2> /*tag*/, Vec256 v, ++ const uint64_t mask_bits) { ++ using D = Full256; ++ const Rebind du; ++ const Repartition dw; ++ const auto vu16 = BitCast(du, v); // (required for float16_t inputs) ++ const auto promoted0 = PromoteTo(dw, LowerHalf(vu16)); ++ const auto promoted1 = PromoteTo(dw, UpperHalf(vu16)); ++ ++ const uint64_t mask_bits0 = mask_bits & 0xFF; ++ const uint64_t mask_bits1 = mask_bits >> 8; ++ const auto compressed0 = Compress(hwy::SizeTag<4>(), promoted0, mask_bits0); ++ const auto compressed1 = Compress(hwy::SizeTag<4>(), promoted1, mask_bits1); ++ ++ const Half dh; ++ const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0)); ++ const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1)); ++ ++ const size_t count0 = PopCount(mask_bits0); ++ // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with ++ // VPERMD for shifting at 4 byte granularity. ++ alignas(32) constexpr int32_t iota4[16] = {0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7}; ++ const auto indices = SetTableIndices(dw, iota4 + 8 - count0 / 2); ++ const auto shift1_multiple4 = ++ BitCast(du, TableLookupLanes(BitCast(dw, demoted1), indices)); ++ ++ // Whole-register unconditional shift by 2 bytes. ++ // TODO(janwas): slow on AMD, use 2 shifts + permq + OR instead? ++ const __m256i lo_zz = _mm256_permute2x128_si256(shift1_multiple4.raw, ++ shift1_multiple4.raw, 0x08); ++ const auto shift1_multiple2 = ++ Vec256{_mm256_alignr_epi8(shift1_multiple4.raw, lo_zz, 14)}; ++ ++ // Make the shift conditional on the lower bit of count0. ++ const auto m_odd = TestBit(Set(du, count0), Set(du, 1)); ++ const auto shifted1 = IfThenElse(m_odd, shift1_multiple2, shift1_multiple4); ++ ++ // Blend the lower and shifted upper parts. ++ constexpr uint16_t on = 0xFFFF; ++ alignas(32) constexpr uint16_t lower_lanes[32] = {HWY_REP4(on), HWY_REP4(on), ++ HWY_REP4(on), HWY_REP4(on)}; ++ const auto m_lower = MaskFromVec(LoadU(du, lower_lanes + 16 - count0)); ++ return BitCast(D(), IfThenElse(m_lower, demoted0, shifted1)); + } + ++#endif // HWY_TARGET != HWY_AVX3 ++ + } // namespace detail + ++// Otherwise, defined in x86_512-inl.h after detail::Compress. ++#if HWY_TARGET != HWY_AVX3 ++ + template + HWY_API Vec256 Compress(Vec256 v, const Mask256 mask) { +- return detail::Compress(v, detail::BitsFromMask(mask)); ++ return detail::Compress(hwy::SizeTag(), v, ++ detail::BitsFromMask(mask)); + } + + // ------------------------------ CompressStore +@@ -2550,10 +2792,101 @@ template + HWY_API size_t CompressStore(Vec256 v, const Mask256 mask, Full256 d, + T* HWY_RESTRICT aligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); +- Store(detail::Compress(v, mask_bits), d, aligned); ++ // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but ++ // using StoreU to concatenate the results would cause page faults if ++ // `aligned` is the last valid vector. Instead rely on in-register splicing. ++ Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); + return PopCount(mask_bits); + } + ++#endif // HWY_TARGET != HWY_AVX3 ++ ++// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, ++// TableLookupBytes, ConcatUpperLower) ++ ++HWY_API void StoreInterleaved3(const Vec256 v0, ++ const Vec256 v1, ++ const Vec256 v2, Full256 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const auto k5 = Set(d, 5); ++ const auto k6 = Set(d, 6); ++ ++ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0]. 
++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = LoadDup128(d, tbl_r0); ++ const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5 ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0.. ++ const auto interleaved_10_00 = r0 | g0 | b0; ++ ++ // Second vector: g10,r10, bgr[9:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. ++ const auto r1 = TableLookupBytes(v0, shuf_r1); ++ const auto g1 = TableLookupBytes(v1, shuf_g1); ++ const auto b1 = TableLookupBytes(v2, shuf_b1); ++ const auto interleaved_15_05 = r1 | g1 | b1; ++ ++ // We want to write the lower halves of the interleaved vectors, then the ++ // upper halves. We could obtain 10_05 and 15_0A via ConcatUpperLower, but ++ // that would require two ununaligned stores. For the lower halves, we can ++ // merge two 128-bit stores for the same swizzling cost: ++ const auto out0 = ConcatLowerLower(interleaved_15_05, interleaved_10_00); ++ StoreU(out0, d, unaligned + 0 * 32); ++ ++ // Third vector: bgr[15:11], b10 ++ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. ++ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. ++ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A ++ const auto r2 = TableLookupBytes(v0, shuf_r2); ++ const auto g2 = TableLookupBytes(v1, shuf_g2); ++ const auto b2 = TableLookupBytes(v2, shuf_b2); ++ const auto interleaved_1A_0A = r2 | g2 | b2; ++ ++ const auto out1 = ConcatUpperLower(interleaved_10_00, interleaved_1A_0A); ++ StoreU(out1, d, unaligned + 1 * 32); ++ ++ const auto out2 = ConcatUpperUpper(interleaved_1A_0A, interleaved_15_05); ++ StoreU(out2, d, unaligned + 2 * 32); ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++HWY_API void StoreInterleaved4(const Vec256 v0, ++ const Vec256 v1, ++ const Vec256 v2, ++ const Vec256 v3, Full256 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 ++ const auto ba8 = ZipUpper(v0, v1); ++ const auto dc8 = ZipUpper(v2, v3); ++ const auto dcba_0 = ZipLower(ba0, dc0); // d..a13 d..a10 | d..a03 d..a00 ++ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a17 d..a14 | d..a07 d..a04 ++ const auto dcba_8 = ZipLower(ba8, dc8); // d..a1B d..a18 | d..a0B d..a08 ++ const auto dcba_C = ZipUpper(ba8, dc8); // d..a1F d..a1C | d..a0F d..a0C ++ // Write lower halves, then upper. 
vperm2i128 is slow on Zen1 but we can ++ // efficiently combine two lower halves into 256 bits: ++ const auto out0 = BitCast(d, ConcatLowerLower(dcba_4, dcba_0)); ++ const auto out1 = BitCast(d, ConcatLowerLower(dcba_C, dcba_8)); ++ StoreU(out0, d, unaligned + 0 * 32); ++ StoreU(out1, d, unaligned + 1 * 32); ++ const auto out2 = BitCast(d, ConcatUpperUpper(dcba_4, dcba_0)); ++ const auto out3 = BitCast(d, ConcatUpperUpper(dcba_C, dcba_8)); ++ StoreU(out2, d, unaligned + 2 * 32); ++ StoreU(out3, d, unaligned + 3 * 32); ++} ++ + // ------------------------------ Reductions + + namespace detail { +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 2021-06-02 10:56:05.218904306 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -19,6 +19,23 @@ + // particular, "Broadcast", pack and zip behavior may be surprising. + + #include // AVX2+ ++#if defined(_MSC_VER) && defined(__clang__) ++// Including should be enough, but Clang's headers helpfully skip ++// including these headers when _MSC_VER is defined, like when using clang-cl. ++// Include these directly here. ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#endif ++ + #include + #include + +@@ -100,9 +117,8 @@ struct RawMask512<8> { + // Mask register: one bit per lane. + template + class Mask512 { +- using Raw = typename RawMask512::type; +- + public: ++ using Raw = typename RawMask512::type; + Raw raw; + }; + +@@ -167,23 +183,24 @@ HWY_API Vec512 Set(Full512{_mm512_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const uint32_t t) { +- return Vec512{_mm512_set1_epi32(static_cast(t))}; // NOLINT ++ return Vec512{_mm512_set1_epi32(static_cast(t))}; + } + HWY_API Vec512 Set(Full512 /* tag */, const uint64_t t) { + return Vec512{ + _mm512_set1_epi64(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const int8_t t) { +- return Vec512{_mm512_set1_epi8(t)}; ++ return Vec512{_mm512_set1_epi8(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const int16_t t) { +- return Vec512{_mm512_set1_epi16(t)}; ++ return Vec512{_mm512_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const int32_t t) { + return Vec512{_mm512_set1_epi32(t)}; + } + HWY_API Vec512 Set(Full512 /* tag */, const int64_t t) { +- return Vec512{_mm512_set1_epi64(t)}; ++ return Vec512{ ++ _mm512_set1_epi64(static_cast(t))}; // NOLINT + } + HWY_API Vec512 Set(Full512 /* tag */, const float t) { + return Vec512{_mm512_set1_ps(t)}; +@@ -329,7 +346,45 @@ HWY_API Vec512 CopySignToAbs(const Ve + return CopySign(abs, sign); + } + +-// ------------------------------ Select/blend ++// ------------------------------ FirstN ++ ++// Possibilities for constructing a bitmask of N ones: ++// - kshift* only consider the lowest byte of the shift count, so they would ++// not correctly handle large n. ++// - Scalar shifts >= 64 are UB. ++// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However, ++// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds. 
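For reference (not part of the patch itself): the FirstN comment above boils down to building a mask with the lowest n bits set while remembering that shifting a 64-bit value by 64 or more is undefined in C++, which is why the 32-bit path guards the shift and the 64-bit path prefers BZHI. A portable scalar sketch of the guarded form the patch uses for the sizeof(T) == 1 case:

#include <cassert>
#include <cstdint>

// Lowest-n-bits mask without relying on BZHI; the explicit n < 64 check avoids
// the undefined behaviour of shifting a 64-bit value by 64.
static uint64_t FirstNBits(size_t n) {
  return n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
}

int main() {
  assert(FirstNBits(0) == 0);
  assert(FirstNBits(3) == 0x7);
  assert(FirstNBits(64) == ~uint64_t(0));
  return 0;
}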
++ ++#if HWY_ARCH_X86_32 ++namespace detail { ++ ++// 32 bit mask is sufficient for lane size >= 2. ++template ++HWY_API Mask512 FirstN(size_t n) { ++ using Bits = typename Mask512::Raw; ++ return Mask512{static_cast(_bzhi_u32(~uint32_t(0), n))}; ++} ++ ++template ++HWY_API Mask512 FirstN(size_t n) { ++ const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0); ++ return Mask512{static_cast<__mmask64>(bits)}; ++} ++ ++} // namespace detail ++#endif // HWY_ARCH_X86_32 ++ ++template ++HWY_API Mask512 FirstN(const Full512 /*tag*/, size_t n) { ++#if HWY_ARCH_X86_64 ++ using Bits = typename Mask512::Raw; ++ return Mask512{static_cast(_bzhi_u64(~uint64_t(0), n))}; ++#else ++ return detail::FirstN(n); ++#endif // HWY_ARCH_X86_64 ++} ++ ++// ------------------------------ IfThenElse + + // Returns mask ? b : a. + +@@ -626,7 +681,13 @@ HWY_API Vec512 AverageRound(co + + // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. + HWY_API Vec512 Abs(const Vec512 v) { ++#if HWY_COMPILER_MSVC ++ // Workaround for incorrect codegen? (untested due to internal compiler error) ++ const auto zero = Zero(Full512()); ++ return Vec512{_mm512_max_epi8(v.raw, (zero - v).raw)}; ++#else + return Vec512{_mm512_abs_epi8(v.raw)}; ++#endif + } + HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_epi16(v.raw)}; +@@ -634,6 +695,9 @@ HWY_API Vec512 Abs(const Vec512 + HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_epi32(v.raw)}; + } ++HWY_API Vec512 Abs(const Vec512 v) { ++ return Vec512{_mm512_abs_epi64(v.raw)}; ++} + + // These aren't native instructions, they also involve AND with constant. + HWY_API Vec512 Abs(const Vec512 v) { +@@ -675,6 +739,16 @@ HWY_API Vec512 ShiftLeft(const + return Vec512{_mm512_slli_epi64(v.raw, kBits)}; + } + ++template ++HWY_API Vec512 ShiftLeft(const Vec512 v) { ++ const Full512 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); ++ return kBits == 1 ++ ? (v + v) ++ : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); ++} ++ + // ------------------------------ ShiftRight + + template +@@ -693,6 +767,14 @@ HWY_API Vec512 ShiftRight(cons + } + + template ++HWY_API Vec512 ShiftRight(const Vec512 v) { ++ const Full512 d8; ++ // Use raw instead of BitCast to support N=1. 
++ const Vec512 shifted{ShiftRight(Vec512{v.raw}).raw}; ++ return shifted & Set(d8, 0xFF >> kBits); ++} ++ ++template + HWY_API Vec512 ShiftRight(const Vec512 v) { + return Vec512{_mm512_srai_epi16(v.raw, kBits)}; + } +@@ -707,6 +789,15 @@ HWY_API Vec512 ShiftRight(const + return Vec512{_mm512_srai_epi64(v.raw, kBits)}; + } + ++template ++HWY_API Vec512 ShiftRight(const Vec512 v) { ++ const Full512 di; ++ const Full512 du; ++ const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ ShiftLeftSame + + HWY_API Vec512 ShiftLeftSame(const Vec512 v, +@@ -734,6 +825,14 @@ HWY_API Vec512 ShiftLeftSame(co + return Vec512{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++template ++HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { ++ const Full512 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); ++ return shifted & Set(d8, (0xFF << bits) & 0xFF); ++} ++ + // ------------------------------ ShiftRightSame + + HWY_API Vec512 ShiftRightSame(const Vec512 v, +@@ -749,6 +848,13 @@ HWY_API Vec512 ShiftRightSame( + return Vec512{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { ++ const Full512 d8; ++ const RepartitionToWide d16; ++ const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); ++ return shifted & Set(d8, 0xFF >> bits); ++} ++ + HWY_API Vec512 ShiftRightSame(const Vec512 v, + const int bits) { + return Vec512{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; +@@ -763,6 +869,14 @@ HWY_API Vec512 ShiftRightSame(c + return Vec512{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; + } + ++HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { ++ const Full512 di; ++ const Full512 du; ++ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); ++ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); ++ return (shifted ^ shifted_sign) - shifted_sign; ++} ++ + // ------------------------------ Shl + + HWY_API Vec512 operator<<(const Vec512 v, +@@ -1046,6 +1160,10 @@ HWY_API Vec512 ApproximateRecipro + + // ------------------------------ Floating-point rounding + ++// Work around warnings in the intrinsic definitions (passing -1 as a mask). ++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") ++ + // Toward nearest integer, tie to even + HWY_API Vec512 Round(const Vec512 v) { + return Vec512{_mm512_roundscale_ps( +@@ -1086,6 +1204,8 @@ HWY_API Vec512 Floor(const Vec51 + _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== COMPARE + + // Comparisons set a mask bit to 1 if the condition is true, else 0. +@@ -1678,6 +1798,83 @@ HWY_API void Stream(const Vec512 + _mm512_stream_pd(aligned, v.raw); + } + ++// ------------------------------ Scatter ++ ++// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") ++ ++namespace detail { ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512 v, ++ Full512 /* tag */, T* HWY_RESTRICT base, ++ const Vec512 offset) { ++ _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1); ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512 v, ++ Full512 /* tag */, T* HWY_RESTRICT base, ++ const Vec512 index) { ++ _mm512_i32scatter_epi32(base, index.raw, v.raw, 4); ++} ++ ++template ++HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512 v, ++ Full512 /* tag */, T* HWY_RESTRICT base, ++ const Vec512 offset) { ++ _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1); ++} ++template ++HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512 v, ++ Full512 /* tag */, T* HWY_RESTRICT base, ++ const Vec512 index) { ++ _mm512_i64scatter_epi64(base, index.raw, v.raw, 8); ++} ++ ++} // namespace detail ++ ++template ++HWY_API void ScatterOffset(Vec512 v, Full512 d, T* HWY_RESTRICT base, ++ const Vec512 offset) { ++ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); ++ return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); ++} ++template ++HWY_API void ScatterIndex(Vec512 v, Full512 d, T* HWY_RESTRICT base, ++ const Vec512 index) { ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); ++ return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); ++} ++ ++template <> ++HWY_INLINE void ScatterOffset(Vec512 v, Full512 /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec512 offset) { ++ _mm512_i32scatter_ps(base, offset.raw, v.raw, 1); ++} ++template <> ++HWY_INLINE void ScatterIndex(Vec512 v, Full512 /* tag */, ++ float* HWY_RESTRICT base, ++ const Vec512 index) { ++ _mm512_i32scatter_ps(base, index.raw, v.raw, 4); ++} ++ ++template <> ++HWY_INLINE void ScatterOffset(Vec512 v, ++ Full512 /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec512 offset) { ++ _mm512_i64scatter_pd(base, offset.raw, v.raw, 1); ++} ++template <> ++HWY_INLINE void ScatterIndex(Vec512 v, ++ Full512 /* tag */, ++ double* HWY_RESTRICT base, ++ const Vec512 index) { ++ _mm512_i64scatter_pd(base, index.raw, v.raw, 8); ++} ++ + // ------------------------------ Gather + + namespace detail { +@@ -1713,13 +1910,13 @@ HWY_API Vec512 GatherIndex(hwy::SizeT + template + HWY_API Vec512 GatherOffset(Full512 d, const T* HWY_RESTRICT base, + const Vec512 offset) { +- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs"); ++static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::GatherOffset(hwy::SizeTag(), d, base, offset); + } + template + HWY_API Vec512 GatherIndex(Full512 d, const T* HWY_RESTRICT base, + const Vec512 index) { +- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx"); ++ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::GatherIndex(hwy::SizeTag(), d, base, index); + } + +@@ -1749,6 +1946,8 @@ HWY_INLINE Vec512 GatherIndex{_mm512_i64gather_pd(index.raw, base, 8)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== SWIZZLE + + template +@@ -2439,7 +2638,11 @@ HWY_API Vec256 DemoteTo(Full256< + + HWY_API Vec256 DemoteTo(Full256 /* tag */, + const Vec512 v) { ++ // Work around warnings in the intrinsic definitions (passing -1 as a mask). 
++ HWY_DIAGNOSTICS(push) ++ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + return Vec256{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; ++ HWY_DIAGNOSTICS(pop) + } + + HWY_API Vec256 DemoteTo(Full256 /* tag */, +@@ -2633,8 +2836,81 @@ HWY_API Vec512 Compress(Vec512{_mm512_maskz_compress_pd(mask.raw, v.raw)}; + } + ++namespace detail { ++ ++// Ignore IDE redefinition error for these two functions: if this header is ++// included, then the functions weren't actually defined in x86_256-inl.h. ++template ++HWY_API Vec256 Compress(hwy::SizeTag<2> /*tag*/, Vec256 v, ++ const uint64_t mask_bits) { ++ using D = Full256; ++ const Rebind du; ++ const Rebind dw; // 512-bit, not 256! ++ const auto vu16 = BitCast(du, v); // (required for float16_t inputs) ++ const Mask512 mask{static_cast<__mmask16>(mask_bits)}; ++ return BitCast(D(), DemoteTo(du, Compress(PromoteTo(dw, vu16), mask))); ++} ++ ++} // namespace detail ++ ++template ++HWY_API Vec256 Compress(Vec256 v, const Mask256 mask) { ++ return detail::Compress(hwy::SizeTag(), v, ++ detail::BitsFromMask(mask)); ++} ++ ++// Expands to 32-bit, compresses, concatenate demoted halves. ++template ++HWY_API Vec512 Compress(Vec512 v, const Mask512 mask) { ++ using D = Full512; ++ const Rebind du; ++ const Repartition dw; ++ const auto vu16 = BitCast(du, v); // (required for float16_t inputs) ++ const auto promoted0 = PromoteTo(dw, LowerHalf(vu16)); ++ const auto promoted1 = PromoteTo(dw, UpperHalf(vu16)); ++ ++ const Mask512 mask0{static_cast<__mmask16>(mask.raw & 0xFFFF)}; ++ const Mask512 mask1{static_cast<__mmask16>(mask.raw >> 16)}; ++ const auto compressed0 = Compress(promoted0, mask0); ++ const auto compressed1 = Compress(promoted1, mask1); ++ ++ const Half dh; ++ const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0)); ++ const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1)); ++ ++ // Concatenate into single vector by shifting upper with writemask. ++ const size_t num0 = CountTrue(mask0); ++ const __mmask32 m_upper = ~((1u << num0) - 1); ++ alignas(64) uint16_t iota[64] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; ++ const auto idx = LoadU(du, iota + 32 - num0); ++ return Vec512{_mm512_mask_permutexvar_epi16(demoted0.raw, m_upper, idx.raw, ++ demoted1.raw)}; ++} ++ + // ------------------------------ CompressStore + ++template ++HWY_API size_t CompressStore(Vec256 v, const Mask256 mask, Full256 d, ++ T* HWY_RESTRICT aligned) { ++ const uint64_t mask_bits = detail::BitsFromMask(mask); ++ Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); ++ return PopCount(mask_bits); ++} ++ ++template ++HWY_API size_t CompressStore(Vec512 v, const Mask512 mask, Full512 d, ++ T* HWY_RESTRICT aligned) { ++ // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but ++ // using StoreU to concatenate the results would cause page faults if ++ // `aligned` is the last valid vector. Instead rely on in-register splicing. 
++ Store(Compress(v, mask), d, aligned); ++ return CountTrue(mask); ++} ++ + HWY_API size_t CompressStore(Vec512 v, const Mask512 mask, + Full512 /* tag */, + uint32_t* HWY_RESTRICT aligned) { +@@ -2675,6 +2951,98 @@ HWY_API size_t CompressStore(Vec512 a, const Vec512 b, ++ const Vec512 c, Full512 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ const auto k5 = Set(d, 5); ++ const auto k6 = Set(d, 6); ++ ++ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. ++ // 0x80 so lanes to be filled from other vectors are 0 for blending. ++ alignas(16) static constexpr uint8_t tbl_r0[16] = { ++ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // ++ 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; ++ alignas(16) static constexpr uint8_t tbl_g0[16] = { ++ 0x80, 0, 0x80, 0x80, 1, 0x80, // ++ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; ++ const auto shuf_r0 = LoadDup128(d, tbl_r0); ++ const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5 ++ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); ++ const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0 ++ const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0. ++ const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0.. ++ const auto i = (r0 | g0 | b0).raw; // low byte in each 128bit: 30 20 10 00 ++ ++ // Second vector: g10,r10, bgr[9:6], b5,g5 ++ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. ++ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 ++ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. ++ const auto r1 = TableLookupBytes(a, shuf_r1); ++ const auto g1 = TableLookupBytes(b, shuf_g1); ++ const auto b1 = TableLookupBytes(c, shuf_b1); ++ const auto j = (r1 | g1 | b1).raw; // low byte in each 128bit: 35 25 15 05 ++ ++ // Third vector: bgr[15:11], b10 ++ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. ++ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. ++ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A ++ const auto r2 = TableLookupBytes(a, shuf_r2); ++ const auto g2 = TableLookupBytes(b, shuf_g2); ++ const auto b2 = TableLookupBytes(c, shuf_b2); ++ const auto k = (r2 | g2 | b2).raw; // low byte in each 128bit: 3A 2A 1A 0A ++ ++ // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns". ++ const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0)); ++ const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1)); ++ const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2)); ++ ++ // Alternating order, most-significant 128 bits from the second arg. ++ const __mmask8 m = 0xCC; ++ const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1); ++ const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2); ++ const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0); ++ ++ StoreU(Vec512{i1_k0_j0_i0}, d, unaligned + 0 * 64); // 10 0A 05 00 ++ StoreU(Vec512{j2_i2_k1_j1}, d, unaligned + 1 * 64); // 25 20 1A 15 ++ StoreU(Vec512{k3_j3_i3_k2}, d, unaligned + 2 * 64); // 3A 35 30 2A ++} ++ ++// ------------------------------ StoreInterleaved4 ++ ++HWY_API void StoreInterleaved4(const Vec512 v0, ++ const Vec512 v1, ++ const Vec512 v2, ++ const Vec512 v3, Full512 d, ++ uint8_t* HWY_RESTRICT unaligned) { ++ // let a,b,c,d denote v0..3. ++ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 ++ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. 
d0 c0 ++ const auto ba8 = ZipUpper(v0, v1); ++ const auto dc8 = ZipUpper(v2, v3); ++ const auto i = ZipLower(ba0, dc0).raw; // 4x128bit: d..a3 d..a0 ++ const auto j = ZipUpper(ba0, dc0).raw; // 4x128bit: d..a7 d..a4 ++ const auto k = ZipLower(ba8, dc8).raw; // 4x128bit: d..aB d..a8 ++ const auto l = ZipUpper(ba8, dc8).raw; // 4x128bit: d..aF d..aC ++ // 128-bit blocks were independent until now; transpose 4x4. ++ const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0)); ++ const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0)); ++ const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2)); ++ const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2)); ++ constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0); ++ constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1); ++ const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20); ++ const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31); ++ const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20); ++ const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31); ++ StoreU(Vec512{l0_k0_j0_i0}, d, unaligned + 0 * 64); ++ StoreU(Vec512{l1_k1_j1_i1}, d, unaligned + 1 * 64); ++ StoreU(Vec512{l2_k2_j2_i2}, d, unaligned + 2 * 64); ++ StoreU(Vec512{l3_k3_j3_i3}, d, unaligned + 3 * 64); ++} ++ + // ------------------------------ Reductions + + // Returns the sum in each lane. +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12 2021-06-02 10:56:05.281904625 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -28,12 +28,12 @@ + + #if HWY_ARCH_X86 + #include +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + #include +-#else ++#else // HWY_COMPILER_MSVC + #include +-#endif +-#endif ++#endif // HWY_COMPILER_MSVC ++#endif // HWY_ARCH_X86 + + namespace hwy { + namespace { +@@ -48,13 +48,13 @@ bool IsBitSet(const uint32_t reg, const + // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). + void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HWY_RESTRICT abcd) { +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + int regs[4]; + __cpuidex(regs, level, count); + for (int i = 0; i < 4; ++i) { + abcd[i] = regs[i]; + } +-#else ++#else // HWY_COMPILER_MSVC + uint32_t a; + uint32_t b; + uint32_t c; +@@ -64,22 +64,22 @@ void Cpuid(const uint32_t level, const u + abcd[1] = b; + abcd[2] = c; + abcd[3] = d; +-#endif ++#endif // HWY_COMPILER_MSVC + } + + // Returns the lower 32 bits of extended control register 0. + // Requires CPU support for "OSXSAVE" (see below). 
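For reference (not part of the patch itself): the targets.cc hunk above only swaps the _MSC_VER checks for HWY_COMPILER_MSVC; the underlying pattern is a thin CPUID wrapper. A minimal standalone sketch of that pattern, assuming an x86 host; the feature bit used here (AVX2 = leaf 7, sub-leaf 0, EBX bit 5) is an illustrative choice, not taken from the patch:

#include <cstdint>
#include <cstdio>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <cpuid.h>
#endif

// Query one CPUID leaf/sub-leaf into {eax, ebx, ecx, edx}.
static void Cpuid(uint32_t level, uint32_t count, uint32_t abcd[4]) {
#if defined(_MSC_VER)
  int regs[4];
  __cpuidex(regs, static_cast<int>(level), static_cast<int>(count));
  for (int i = 0; i < 4; ++i) abcd[i] = static_cast<uint32_t>(regs[i]);
#else
  uint32_t a, b, c, d;
  __cpuid_count(level, count, a, b, c, d);
  abcd[0] = a; abcd[1] = b; abcd[2] = c; abcd[3] = d;
#endif
}

int main() {
  uint32_t abcd[4] = {0, 0, 0, 0};
  Cpuid(7, 0, abcd);
  std::printf("AVX2 supported: %u\n", (abcd[1] >> 5) & 1u);
  return 0;
}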
+ uint32_t ReadXCR0() { +-#ifdef _MSC_VER ++#if HWY_COMPILER_MSVC + return static_cast(_xgetbv(0)); +-#else ++#else // HWY_COMPILER_MSVC + uint32_t xcr0, xcr0_high; + const uint32_t index = 0; + asm volatile(".byte 0x0F, 0x01, 0xD0" + : "=a"(xcr0), "=d"(xcr0_high) + : "c"(index)); + return xcr0; +-#endif ++#endif // HWY_COMPILER_MSVC + } + + #endif // HWY_ARCH_X86 +@@ -126,7 +126,7 @@ constexpr uint32_t kAVX512VL = 1u << 13; + constexpr uint32_t kAVX512DQ = 1u << 14; + constexpr uint32_t kAVX512BW = 1u << 15; + constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW; +-#endif ++#endif // HWY_ARCH_X86 + + } // namespace + +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 2021-06-02 10:56:05.267904554 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h 2021-05-31 10:37:11.000000000 -0400 +@@ -65,7 +65,9 @@ + // HWY_MAX_DYNAMIC_TARGETS in total. + #define HWY_HIGHEST_TARGET_BIT_X86 9 + +-// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium ++#define HWY_SVE2 0x400 ++#define HWY_SVE 0x800 ++// 0x1000 reserved for Helium + #define HWY_NEON 0x2000 + + #define HWY_HIGHEST_TARGET_BIT_ARM 13 +@@ -90,6 +92,9 @@ + // 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved + + #define HWY_SCALAR 0x20000000 ++ ++#define HWY_HIGHEST_TARGET_BIT_SCALAR 29 ++ + // Cannot use higher values, otherwise HWY_TARGETS computation might overflow. + + //------------------------------------------------------------------------------ +@@ -106,25 +111,26 @@ + #ifndef HWY_BROKEN_TARGETS + + // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid +-// SSE4 codegen (msan failure), so disable all those targets. ++// SSE4 codegen (possibly only for msan), so disable all those targets. + #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) +-// TODO: Disable all non-scalar targets for every build target once we have +-// clang-7 enabled in our builders. +-#ifdef MEMORY_SANITIZER + #define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3) +-#else +-#define HWY_BROKEN_TARGETS 0 +-#endif + // This entails a major speed reduction, so warn unless the user explicitly + // opts in to scalar-only. + #if !defined(HWY_COMPILE_ONLY_SCALAR) + #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.") + #endif + +-// MSVC, or 32-bit may fail to compile AVX2/3. +-#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32 ++// 32-bit may fail to compile AVX2/3. ++#elif HWY_ARCH_X86_32 + #define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3) +-#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds") ++ ++// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16 ++#elif HWY_COMPILER_MSVC != 0 ++#define HWY_BROKEN_TARGETS (HWY_AVX3) ++ ++// armv7be has not been tested and is not yet supported. ++#elif HWY_ARCH_ARM_V7 && (defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN)) ++#define HWY_BROKEN_TARGETS (HWY_NEON) + + #else + #define HWY_BROKEN_TARGETS 0 +@@ -145,53 +151,74 @@ + // user to override this without any guarantee of success. + #ifndef HWY_BASELINE_TARGETS + +-#ifdef __wasm_simd128__ ++// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with ++// HWY_TARGET == HWY_SCALAR. 
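For reference (not part of the patch itself): ReadXCR0() at the top of this hunk returns the register that records which vector state the operating system actually saves; runtime dispatch typically requires the SSE and AVX state bits before enabling AVX2, plus the three AVX-512 state bits before enabling AVX3. A small illustrative helper with bit positions per the Intel SDM; the function names are made up for this sketch and are not the patch's logic:

#include <cstdint>

// XCR0 bit 1 = XMM state, bit 2 = YMM state; bits 5..7 = opmask/ZMM_Hi256/Hi16_ZMM.
constexpr bool OsSavesYmm(uint32_t xcr0) { return (xcr0 & 0x06u) == 0x06u; }
constexpr bool OsSavesZmm(uint32_t xcr0) { return (xcr0 & 0xE0u) == 0xE0u; }

static_assert(OsSavesYmm(0x07) && !OsSavesZmm(0x07),
              "SSE+AVX state enabled, AVX-512 state not enabled");
static_assert(OsSavesZmm(0xE7), "x87/SSE/AVX/AVX-512 state all enabled");

int main() { return 0; }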
++ ++#if HWY_ARCH_WASM && defined(__wasm_simd128__) + #define HWY_BASELINE_WASM HWY_WASM + #else + #define HWY_BASELINE_WASM 0 + #endif + +-#ifdef __VSX__ ++// Avoid choosing the PPC target until we have an implementation. ++#if HWY_ARCH_PPC && defined(__VSX__) && 0 + #define HWY_BASELINE_PPC8 HWY_PPC8 + #else + #define HWY_BASELINE_PPC8 0 + #endif + +-// GCC 4.5.4 only defines the former; 5.4 defines both. +-#if defined(__ARM_NEON__) || defined(__ARM_NEON) ++// Avoid choosing the SVE[2] targets the implementation is ready. ++#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) && 0 ++#define HWY_BASELINE_SVE2 HWY_SVE2 ++#else ++#define HWY_BASELINE_SVE2 0 ++#endif ++ ++#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE) && 0 ++#define HWY_BASELINE_SVE HWY_SVE ++#else ++#define HWY_BASELINE_SVE 0 ++#endif ++ ++// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both. ++#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON)) + #define HWY_BASELINE_NEON HWY_NEON + #else + #define HWY_BASELINE_NEON 0 + #endif + +-#ifdef __SSE4_1__ ++// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means ++// we at least get SSE4 on machines supporting AVX but not AVX2. ++// https://stackoverflow.com/questions/18563978/ ++#if HWY_ARCH_X86 && \ ++ (defined(__SSE4_1__) || (HWY_COMPILER_MSVC != 0 && defined(__AVX__))) + #define HWY_BASELINE_SSE4 HWY_SSE4 + #else + #define HWY_BASELINE_SSE4 0 + #endif + +-#ifdef __AVX2__ ++#if HWY_ARCH_X86 && defined(__AVX2__) + #define HWY_BASELINE_AVX2 HWY_AVX2 + #else + #define HWY_BASELINE_AVX2 0 + #endif + +-#ifdef __AVX512F__ ++#if HWY_ARCH_X86 && defined(__AVX512F__) + #define HWY_BASELINE_AVX3 HWY_AVX3 + #else + #define HWY_BASELINE_AVX3 0 + #endif + +-#ifdef __riscv_vector ++#if HWY_ARCH_RVV && defined(__riscv_vector) + #define HWY_BASELINE_RVV HWY_RVV + #else + #define HWY_BASELINE_RVV 0 + #endif + + #define HWY_BASELINE_TARGETS \ +- (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \ +- HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \ +- HWY_BASELINE_RVV) ++ (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \ ++ HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE4 | \ ++ HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | HWY_BASELINE_RVV) + + #endif // HWY_BASELINE_TARGETS + +@@ -242,13 +269,12 @@ + #define HWY_TARGETS HWY_STATIC_TARGET + + // 3) For tests: include all attainable targets (in particular: scalar) +-#elif defined(HWY_COMPILE_ALL_ATTAINABLE) ++#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST) + #define HWY_TARGETS HWY_ATTAINABLE_TARGETS + + // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by + // excluding superseded targets, in particular scalar. + #else +- + #define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1)) + + #endif // target policy +@@ -323,6 +349,10 @@ static inline HWY_MAYBE_UNUSED const cha + #endif + + #if HWY_ARCH_ARM ++ case HWY_SVE2: ++ return "SVE2"; ++ case HWY_SVE: ++ return "SVE"; + case HWY_NEON: + return "Neon"; + #endif +@@ -346,7 +376,7 @@ static inline HWY_MAYBE_UNUSED const cha + return "Scalar"; + + default: +- return "?"; ++ return "Unknown"; // must satisfy gtest IsValidParamName() + } + } + +@@ -405,21 +435,17 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* SSE3 */ \ + nullptr /* SSE2 */ + +-#endif // HWY_ARCH_X86 +- +-#if HWY_ARCH_ARM ++#elif HWY_ARCH_ARM + // See HWY_ARCH_X86 above for details. 
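For reference (not part of the patch itself): the default-policy line earlier in this hunk, HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1), relies on better targets having smaller bit values: doubling the static target's bit and subtracting one yields a mask of that bit plus every better (lower) bit, which is how superseded targets get dropped. A compile-time illustration with made-up bit values:

#include <cstdint>

// Illustrative target bits (better targets use lower bits, as in targets.h).
constexpr uint32_t kFast = 0x1, kStatic = 0x4, kSlow = 0x20000000;

// All targets at least as good as the statically chosen one.
constexpr uint32_t AtLeast(uint32_t static_target) { return 2 * static_target - 1; }

static_assert(AtLeast(kStatic) == 0x7, "static bit plus both better bits");
static_assert((AtLeast(kStatic) & kFast) != 0, "better targets kept");
static_assert((AtLeast(kStatic) & kSlow) == 0, "worse targets dropped");

int main() { return 0; }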
+ #define HWY_MAX_DYNAMIC_TARGETS 4 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM + #define HWY_CHOOSE_TARGET_LIST(func_name) \ +- nullptr, /* reserved */ \ +- nullptr, /* reserved */ \ ++ HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \ ++ HWY_CHOOSE_SVE(func_name), /* SVE */ \ + nullptr, /* reserved */ \ + HWY_CHOOSE_NEON(func_name) /* NEON */ + +-#endif // HWY_ARCH_ARM +- +-#if HWY_ARCH_PPC ++#elif HWY_ARCH_PPC + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 5 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC +@@ -430,9 +456,7 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* VSX */ \ + nullptr /* AltiVec */ + +-#endif // HWY_ARCH_PPC +- +-#if HWY_ARCH_WASM ++#elif HWY_ARCH_WASM + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 4 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM +@@ -442,9 +466,7 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* reserved */ \ + HWY_CHOOSE_WASM(func_name) /* WASM */ + +-#endif // HWY_ARCH_WASM +- +-#if HWY_ARCH_RVV ++#elif HWY_ARCH_RVV + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 4 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV +@@ -454,7 +476,12 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* reserved */ \ + HWY_CHOOSE_RVV(func_name) /* RVV */ + +-#endif // HWY_ARCH_RVV ++#else ++// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though ++// still creating single-entry tables in HWY_EXPORT to ensure portability. ++#define HWY_MAX_DYNAMIC_TARGETS 1 ++#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR ++#endif + + struct ChosenTarget { + public: +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12 2021-06-02 10:56:05.264904539 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -35,19 +35,19 @@ DECLARE_FUNCTION(SCALAR) + HWY_EXPORT(FakeFunction); + + void CheckFakeFunction() { +-#define CHECK_ARRAY_ENTRY(TGT) \ +- if ((HWY_TARGETS & HWY_##TGT) != 0) { \ +- hwy::SetSupportedTargetsForTest(HWY_##TGT); \ +- /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \ +- /* the pointer to the already cached function. */ \ +- hwy::chosen_target.Update(); \ +- EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ +- /* Calling DeInit() will test that the initializer function */ \ +- /* also calls the right function. */ \ +- hwy::chosen_target.DeInit(); \ +- EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ +- /* Second call uses the cached value from the previous call. */ \ +- EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ ++#define CHECK_ARRAY_ENTRY(TGT) \ ++ if ((HWY_TARGETS & HWY_##TGT) != 0) { \ ++ hwy::SetSupportedTargetsForTest(HWY_##TGT); \ ++ /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \ ++ /* the pointer to the already cached function. */ \ ++ hwy::chosen_target.Update(); \ ++ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ ++ /* Calling DeInit() will test that the initializer function */ \ ++ /* also calls the right function. 
*/ \ ++ hwy::chosen_target.DeInit(); \ ++ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ ++ /* Second call uses the cached value from the previous call. */ \ ++ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ + } + CHECK_ARRAY_ENTRY(AVX3) + CHECK_ARRAY_ENTRY(AVX2) +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12 2021-06-02 10:56:05.251904473 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -16,7 +16,6 @@ + #include + + #include +-#include + #include + + #undef HWY_TARGET_INCLUDE +@@ -173,16 +172,8 @@ struct TestFloatAbs { + }; + + HWY_NOINLINE void TestAllAbs() { +- const ForPartialVectors test; +- test(int8_t()); +- test(int16_t()); +- test(int32_t()); +- +- const ForPartialVectors test_float; +- test_float(float()); +-#if HWY_CAP_FLOAT64 +- test_float(double()); +-#endif ++ ForSignedTypes(ForPartialVectors()); ++ ForFloatTypes(ForPartialVectors()); + } + + template +@@ -199,6 +190,45 @@ struct TestLeftShifts { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + ++ const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift ++ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; ++ ++ // 0 ++ HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); ++ HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); ++ ++ // 1 ++ for (size_t i = 0; i < N; ++i) { ++ const T value = kSigned ? T(i) - T(N) : T(i); ++ expected[i] = T(TU(value) << 1); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); ++ ++ // max ++ for (size_t i = 0; i < N; ++i) { ++ const T value = kSigned ? T(i) - T(N) : T(i); ++ expected[i] = T(TU(value) << kMaxShift); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); ++ } ++}; ++ ++template ++struct TestVariableLeftShifts { ++ template ++ HWY_NOINLINE void operator()(T t, D d) { ++ if (kSigned) { ++ // Also test positive values ++ TestVariableLeftShifts()(t, d); ++ } ++ ++ using TI = MakeSigned; ++ using TU = MakeUnsigned; ++ const size_t N = Lanes(d); ++ auto expected = AllocateAligned(N); ++ + const auto v0 = Zero(d); + const auto v1 = Set(d, 1); + const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift +@@ -209,8 +239,6 @@ struct TestLeftShifts { + const auto large_shifts = max_shift - small_shifts; + + // Same: 0 +- HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); +- HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); + HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0)); + + // Same: 1 +@@ -218,8 +246,6 @@ struct TestLeftShifts { + const T value = kSigned ? T(i) - T(N) : T(i); + expected[i] = T(TU(value) << 1); + } +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1)); + + // Same: max +@@ -227,8 +253,6 @@ struct TestLeftShifts { + const T value = kSigned ? 
T(i) - T(N) : T(i); + expected[i] = T(TU(value) << kMaxShift); + } +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft(values)); +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); + HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift)); + + // Variable: small +@@ -252,6 +276,37 @@ struct TestUnsignedRightShifts { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + ++ const auto values = Iota(d, 0); ++ ++ const T kMax = LimitsMax(); ++ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; ++ ++ // Shift by 0 ++ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); ++ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); ++ ++ // Shift by 1 ++ for (size_t i = 0; i < N; ++i) { ++ expected[i] = T(T(i & kMax) >> 1); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); ++ ++ // max ++ for (size_t i = 0; i < N; ++i) { ++ expected[i] = T(T(i & kMax) >> kMaxShift); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift)); ++ } ++}; ++ ++struct TestVariableUnsignedRightShifts { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ auto expected = AllocateAligned(N); ++ + const auto v0 = Zero(d); + const auto v1 = Set(d, 1); + const auto values = Iota(d, 0); +@@ -265,21 +320,15 @@ struct TestUnsignedRightShifts { + const auto large_shifts = max_shift - small_shifts; + + // Same: 0 +- HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); +- HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); + HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0)); + + // Same: 1 + for (size_t i = 0; i < N; ++i) { +- expected[i] = T(i >> 1); ++ expected[i] = T(T(i & kMax) >> 1); + } +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); +- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1)); + + // Same: max +- HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(values)); +- HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); + HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift)); + + // Variable: small +@@ -296,33 +345,120 @@ struct TestUnsignedRightShifts { + } + }; + +-struct TestSignedRightShifts { ++template ++T RightShiftNegative(T val) { ++ // C++ shifts are implementation-defined for negative numbers, and we have ++ // seen divisions replaced with shifts, so resort to bit operations. ++ using TU = hwy::MakeUnsigned; ++ TU bits; ++ CopyBytes(&val, &bits); ++ ++ const TU shifted = bits >> kAmount; ++ ++ const TU all = ~TU(0); ++ const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount; ++ const TU sign_extended = static_cast((all << num_zero) & LimitsMax()); ++ ++ bits = shifted | sign_extended; ++ CopyBytes(&bits, &val); ++ return val; ++} ++ ++class TestSignedRightShifts { ++ public: + template +- HWY_NOINLINE void operator()(T t, D d) { +- // Also test positive values +- TestUnsignedRightShifts()(t, d); ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ auto expected = AllocateAligned(N); ++ constexpr T kMin = LimitsMin(); ++ constexpr T kMax = LimitsMax(); ++ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; ++ ++ // First test positive values, negative are checked below. 
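For reference (not part of the patch itself): RightShiftNegative above gives the tests a reference result that does not depend on how the C++ >> operator treats negative operands: shift the raw bits as unsigned, then OR the sign-extension bits back in. A standalone 32-bit sketch of the same construction, with an illustrative name:

#include <cassert>
#include <cstdint>
#include <cstring>

// Arithmetic right shift of a negative 32-bit value via an unsigned shift;
// valid for kAmount in [0, 31] and negative inputs only.
template <int kAmount>
static int32_t ShiftRightNegative32(int32_t val) {
  uint32_t bits;
  std::memcpy(&bits, &val, sizeof(bits));
  const uint32_t shifted = bits >> kAmount;
  // Set the top kAmount+1 bits; the lowest of them is the shifted sign bit.
  const uint32_t sign_extended = ~uint32_t(0) << (31 - kAmount);
  bits = shifted | sign_extended;
  std::memcpy(&val, &bits, sizeof(val));
  return val;
}

int main() {
  assert(ShiftRightNegative32<1>(-2) == -1);
  assert(ShiftRightNegative32<4>(-1) == -1);
  assert(ShiftRightNegative32<3>(INT32_MIN) == INT32_MIN / 8);
  return 0;
}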
++ const auto v0 = Zero(d); ++ const auto values = Iota(d, 0) & Set(d, kMax); ++ ++ // Shift by 0 ++ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); ++ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); ++ ++ // Shift by 1 ++ for (size_t i = 0; i < N; ++i) { ++ expected[i] = T(T(i & kMax) >> 1); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); ++ ++ // max ++ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(values)); ++ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); ++ ++ // Even negative value ++ Test<0>(kMin, d, __LINE__); ++ Test<1>(kMin, d, __LINE__); ++ Test<2>(kMin, d, __LINE__); ++ Test(kMin, d, __LINE__); ++ ++ const T odd = static_cast(kMin + 1); ++ Test<0>(odd, d, __LINE__); ++ Test<1>(odd, d, __LINE__); ++ Test<2>(odd, d, __LINE__); ++ Test(odd, d, __LINE__); ++ } ++ ++ private: ++ template ++ void Test(T val, D d, int line) { ++ const auto expected = Set(d, RightShiftNegative(val)); ++ const auto in = Set(d, val); ++ const char* file = __FILE__; ++ AssertVecEqual(d, expected, ShiftRight(in), file, line); ++ AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line); ++ } ++}; + ++struct TestVariableSignedRightShifts { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TU = MakeUnsigned; + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + constexpr T kMin = LimitsMin(); +- const auto values = Iota(d, kMin); ++ constexpr T kMax = LimitsMax(); + + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; ++ ++ // First test positive values, negative are checked below. ++ const auto v0 = Zero(d); ++ const auto positive = Iota(d, 0) & Set(d, kMax); ++ ++ // Shift by 0 ++ HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive)); ++ HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0)); ++ ++ // Shift by 1 ++ for (size_t i = 0; i < N; ++i) { ++ expected[i] = T(T(i & kMax) >> 1); ++ } ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive)); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1)); ++ ++ // max ++ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(positive)); ++ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift)); ++ + const auto max_shift = Set(d, kMaxShift); + const auto small_shifts = And(Iota(d, 0), max_shift); + const auto large_shifts = max_shift - small_shifts; + +- // Test varying values to shift ++ const auto negative = Iota(d, kMin); ++ ++ // Test varying negative to shift + for (size_t i = 0; i < N; ++i) { +- // We want a right-shift here, which is undefined behavior for negative +- // numbers. Since we want (-1)>>1 to be -1, we need to adjust rounding if +- // minT is odd and negative. +- T minT = static_cast(kMin + i); +- expected[i] = T(minT / 2 + (minT < 0 ? 
minT % 2 : 0)); ++ expected[i] = RightShiftNegative<1>(static_cast(kMin + i)); + } +- HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, Set(d, 1))); ++ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1))); + + // Shift MSB right by small amounts + for (size_t i = 0; i < N; ++i) { +@@ -343,6 +479,13 @@ struct TestSignedRightShifts { + }; + + HWY_NOINLINE void TestAllShifts() { ++ ForUnsignedTypes(ForPartialVectors>()); ++ ForSignedTypes(ForPartialVectors>()); ++ ForUnsignedTypes(ForPartialVectors()); ++ ForSignedTypes(ForPartialVectors()); ++} ++ ++HWY_NOINLINE void TestAllVariableShifts() { + const ForPartialVectors> shl_u; + const ForPartialVectors> shl_s; + const ForPartialVectors shr_u; +@@ -821,6 +964,40 @@ HWY_NOINLINE void TestAllRound() { + ForFloatTypes(ForPartialVectors()); + } + ++struct TestNearestInt { ++ template ++ HWY_NOINLINE void operator()(TF tf, const DF df) { ++ using TI = MakeSigned; ++ const RebindToSigned di; ++ ++ size_t padded; ++ auto in = RoundTestCases(tf, df, padded); ++ auto expected = AllocateAligned(padded); ++ ++ constexpr double max = static_cast(LimitsMax()); ++ for (size_t i = 0; i < padded; ++i) { ++ if (std::isnan(in[i])) { ++ // We replace NaN with 0 below (no_nan) ++ expected[i] = 0; ++ } else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) { ++ // Avoid undefined result for lrintf ++ expected[i] = std::signbit(in[i]) ? LimitsMin() : LimitsMax(); ++ } else { ++ expected[i] = lrintf(in[i]); ++ } ++ } ++ for (size_t i = 0; i < padded; i += Lanes(df)) { ++ const auto v = Load(df, &in[i]); ++ const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df)); ++ HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan)); ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllNearestInt() { ++ ForPartialVectors()(float()); ++} ++ + struct TestTrunc { + template + HWY_NOINLINE void operator()(T t, D d) { +@@ -909,8 +1086,7 @@ struct TestSumOfLanes { + }; + + HWY_NOINLINE void TestAllSumOfLanes() { +- // Only full vectors because lanes in partial vectors are undefined. +- const ForFullVectors sum; ++ const ForPartialVectors sum; + + // No u8/u16/i8/i16. + sum(uint32_t()); +@@ -976,9 +1152,8 @@ struct TestMaxOfLanes { + }; + + HWY_NOINLINE void TestAllMinMaxOfLanes() { +- // Only full vectors because lanes in partial vectors are undefined. +- const ForFullVectors min; +- const ForFullVectors max; ++ const ForPartialVectors min; ++ const ForPartialVectors max; + + // No u8/u16/i8/i16. 
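TestNearestInt above builds its expected lanes from a scalar rule: NaN inputs are first replaced by zero (the no_nan select), values whose magnitude reaches the int32 range saturate to LimitsMin/LimitsMax so lrintf is never given an out-of-range argument, and everything else is rounded with lrintf. A standalone sketch of that reference conversion for int32_t lanes (NearestIntScalar is a name invented for this example):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int32_t NearestIntScalar(float f) {
  if (std::isnan(f)) return 0;  // mirrors the no_nan replacement
  const double max = static_cast<double>(std::numeric_limits<int32_t>::max());
  if (std::isinf(f) || static_cast<double>(std::fabs(f)) >= max) {
    return std::signbit(f) ? std::numeric_limits<int32_t>::min()
                           : std::numeric_limits<int32_t>::max();
  }
  return static_cast<int32_t>(std::lrintf(f));  // round to nearest
}

int main() {
  std::printf("%d %d %d\n",
              NearestIntScalar(-3.7f),   // -4
              NearestIntScalar(2.5f),    // 2 under the default ties-to-even mode
              NearestIntScalar(1e30f));  // saturates to INT32_MAX
  return 0;
}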
+ min(uint32_t()); +@@ -1044,10 +1219,12 @@ HWY_NOINLINE void TestAllNeg() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyArithmeticTest); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts); ++HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs); +@@ -1062,10 +1239,11 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRound); ++HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNearestInt); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff); + HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12 2021-06-02 10:56:05.252904478 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -272,13 +272,14 @@ HWY_NOINLINE void TestAllCombineShiftRig + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyCombineTest); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine); + HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif + + #else +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12 2021-06-02 10:56:05.249904463 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -206,11 +206,12 @@ HWY_NOINLINE void TestAllWeakFloat() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyCompareTest); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat); + HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE +diff 
-up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12 2021-06-02 10:56:05.261904523 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -16,8 +16,6 @@ + #include + #include + +-#include +- + #undef HWY_TARGET_INCLUDE + #define HWY_TARGET_INCLUDE "tests/convert_test.cc" + #include "hwy/foreach_target.h" +@@ -547,37 +545,6 @@ HWY_NOINLINE void TestAllI32F64() { + #endif + } + +-struct TestNearestInt { +- template +- HWY_NOINLINE void operator()(TI /*unused*/, const DI di) { +- using TF = MakeFloat; +- const Rebind df; +- const size_t N = Lanes(df); +- +- // Integer positive +- HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 4.0f))); +- +- // Integer negative +- HWY_ASSERT_VEC_EQ(di, Iota(di, -32), NearestInt(Iota(df, -32.0f))); +- +- // Above positive +- HWY_ASSERT_VEC_EQ(di, Iota(di, 2), NearestInt(Iota(df, 2.001f))); +- +- // Below positive +- HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 3.9999f))); +- +- const TF eps = static_cast(0.0001); +- // Above negative +- HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) + eps))); +- +- // Below negative +- HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) - eps))); +- } +-}; +- +-HWY_NOINLINE void TestAllNearestInt() { +- ForPartialVectors()(int32_t()); +-} + + // NOLINTNEXTLINE(google-readability-namespace-comments) + } // namespace HWY_NAMESPACE +@@ -585,6 +552,7 @@ HWY_NOINLINE void TestAllNearestInt() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyConvertTest); + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast); + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo); +@@ -596,6 +564,5 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, Te + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat); + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt); + HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64); +-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllNearestInt); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12 2021-06-02 10:56:05.245904442 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -14,6 +14,7 @@ + + #include + #include ++#include // memcmp + + #include "hwy/base.h" + +@@ -159,6 +160,30 @@ HWY_NOINLINE void TestAllCopySign() { + 
ForFloatTypes(ForPartialVectors()); + } + ++struct TestFirstN { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ auto mask_lanes = AllocateAligned(N); ++ ++ // NOTE: reverse polarity (mask is true iff mask_lanes[i] == 0) because we ++ // cannot reliably compare against all bits set (NaN for float types). ++ const T off = 1; ++ ++ for (size_t len = 0; len <= N; ++len) { ++ for (size_t i = 0; i < N; ++i) { ++ mask_lanes[i] = i < len ? T(0) : off; ++ } ++ const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d)); ++ HWY_ASSERT_MASK_EQ(d, mask, FirstN(d, len)); ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllFirstN() { ++ ForAllTypes(ForPartialVectors()); ++} ++ + struct TestIfThenElse { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +@@ -208,15 +233,56 @@ HWY_NOINLINE void TestAllIfThenElse() { + ForAllTypes(ForPartialVectors()); + } + +-// Also tests MaskFromVec/VecFromMask ++struct TestMaskVec { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ RandomState rng; ++ ++ const size_t N = Lanes(d); ++ auto mask_lanes = AllocateAligned(N); ++ ++ // Each lane should have a chance of having mask=true. ++ for (size_t rep = 0; rep < 100; ++rep) { ++ for (size_t i = 0; i < N; ++i) { ++ mask_lanes[i] = static_cast(Random32(&rng) & 1); ++ } ++ ++ const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d))); ++ HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllMaskVec() { ++ const ForPartialVectors test; ++ ++ test(uint16_t()); ++ test(int16_t()); ++ // TODO(janwas): float16_t - cannot compare yet ++ ++ test(uint32_t()); ++ test(int32_t()); ++ test(float()); ++ ++#if HWY_CAP_INTEGER64 ++ test(uint64_t()); ++ test(int64_t()); ++#endif ++#if HWY_CAP_FLOAT64 ++ test(double()); ++#endif ++} ++ + struct TestCompress { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + ++ using TU = MakeUnsigned; ++ const Rebind du; + const size_t N = Lanes(d); + auto in_lanes = AllocateAligned(N); +- auto mask_lanes = AllocateAligned(N); ++ auto mask_lanes = AllocateAligned(N); + auto expected = AllocateAligned(N); + auto actual = AllocateAligned(N); + +@@ -224,35 +290,56 @@ struct TestCompress { + for (size_t rep = 0; rep < 100; ++rep) { + size_t expected_pos = 0; + for (size_t i = 0; i < N; ++i) { +- in_lanes[i] = static_cast(Random32(&rng)); +- mask_lanes[i] = static_cast(Random32(&rng) & 1); ++ const uint64_t bits = Random32(&rng); ++ in_lanes[i] = T(); // cannot initialize float16_t directly. ++ CopyBytes(&bits, &in_lanes[i]); ++ mask_lanes[i] = static_cast(Random32(&rng) & 1); + if (mask_lanes[i] == 0) { // Zero means true (easier to compare) + expected[expected_pos++] = in_lanes[i]; + } + } + + const auto in = Load(d, in_lanes.get()); +- const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d)); ++ const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du))); + +- HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); + Store(Compress(in, mask), d, actual.get()); + // Upper lanes are undefined. + for (size_t i = 0; i < expected_pos; ++i) { +- HWY_ASSERT(actual[i] == expected[i]); ++ HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0); + } + + // Also check CompressStore in the same way. 
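The TestCompress loop above compares only the first expected_pos lanes, bytewise, because Compress packs the lanes whose mask is set toward lane 0 and leaves the rest unspecified, while CompressStore (checked just below) additionally reports how many lanes were written. A scalar reference for those semantics (CompressScalar is a name made up for this sketch):

#include <cstddef>
#include <cstdio>

template <typename T>
size_t CompressScalar(const T* in, const bool* mask, size_t n, T* out) {
  size_t pos = 0;
  for (size_t i = 0; i < n; ++i) {
    if (mask[i]) out[pos++] = in[i];  // packed toward the front
  }
  return pos;  // what CompressStore reports as num_written
}

int main() {
  const int in[4] = {10, 20, 30, 40};
  const bool mask[4] = {true, false, true, false};
  int out[4] = {0, 0, 0, 0};
  const size_t n = CompressScalar(in, mask, 4, out);
  std::printf("%zu kept: %d %d\n", n, out[0], out[1]);  // 2 kept: 10 30
  return 0;
}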
+- std::fill(actual.get(), actual.get() + N, T(0)); ++ memset(actual.get(), 0, N * sizeof(T)); + const size_t num_written = CompressStore(in, mask, d, actual.get()); + HWY_ASSERT_EQ(expected_pos, num_written); + for (size_t i = 0; i < expected_pos; ++i) { +- HWY_ASSERT_EQ(expected[i], actual[i]); ++ HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0); + } + } + } + }; + + #if 0 ++namespace detail { // for code folding ++void PrintCompress16x8Tables() { ++ constexpr size_t N = 8; // 128-bit SIMD ++ for (uint64_t code = 0; code < 1ull << N; ++code) { ++ std::array indices{0}; ++ size_t pos = 0; ++ for (size_t i = 0; i < N; ++i) { ++ if (code & (1ull << i)) { ++ indices[pos++] = i; ++ } ++ } ++ ++ // Doubled (for converting lane to byte indices) ++ for (size_t i = 0; i < N; ++i) { ++ printf("%d,", 2 * indices[i]); ++ } ++ } ++ printf("\n"); ++} ++ + // Compressed to nibbles + void PrintCompress32x8Tables() { + constexpr size_t N = 8; // AVX2 +@@ -340,16 +427,22 @@ void PrintCompress64x2Tables() { + } + printf("\n"); + } +- ++} // namespace detail + #endif + + HWY_NOINLINE void TestAllCompress() { +- // PrintCompress32x8Tables(); +- // PrintCompress64x4Tables(); +- // PrintCompress32x4Tables(); +- // PrintCompress64x2Tables(); ++ // detail::PrintCompress32x8Tables(); ++ // detail::PrintCompress64x4Tables(); ++ // detail::PrintCompress32x4Tables(); ++ // detail::PrintCompress64x2Tables(); ++ // detail::PrintCompress16x8Tables(); + + const ForPartialVectors test; ++ ++ test(uint16_t()); ++ test(int16_t()); ++ test(float16_t()); ++ + test(uint32_t()); + test(int32_t()); + test(float()); +@@ -358,7 +451,6 @@ HWY_NOINLINE void TestAllCompress() { + test(uint64_t()); + test(int64_t()); + #endif +- + #if HWY_CAP_FLOAT64 + test(double()); + #endif +@@ -432,7 +524,7 @@ struct TestTestBit { + }; + + HWY_NOINLINE void TestAllTestBit() { +- ForIntegerTypes(ForFullVectors()); ++ ForIntegerTypes(ForPartialVectors()); + } + + struct TestAllTrueFalse { +@@ -445,6 +537,8 @@ struct TestAllTrueFalse { + auto lanes = AllocateAligned(N); + std::fill(lanes.get(), lanes.get() + N, T(0)); + ++ auto mask_lanes = AllocateAligned(N); ++ + HWY_ASSERT(AllTrue(Eq(v, zero))); + HWY_ASSERT(!AllFalse(Eq(v, zero))); + +@@ -456,7 +550,13 @@ struct TestAllTrueFalse { + for (size_t i = 0; i < N; ++i) { + lanes[i] = T(1); + v = Load(d, lanes.get()); +- HWY_ASSERT(!AllTrue(Eq(v, zero))); ++ ++ // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be. ++ // Assigning to an lvalue is insufficient but storing to memory prevents ++ // the bug; so does Print of VecFromMask(d, Eq(v, zero)). 
++ Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get()); ++ HWY_ASSERT(!AllTrue(MaskFromVec(Load(d, mask_lanes.get())))); ++ + HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero))); + + lanes[i] = T(-1); +@@ -596,7 +696,7 @@ struct TestLogicalMask { + }; + + HWY_NOINLINE void TestAllLogicalMask() { +- ForAllTypes(ForFullVectors()); ++ ForAllTypes(ForPartialVectors()); + } + // NOLINTNEXTLINE(google-readability-namespace-comments) + } // namespace HWY_NAMESPACE +@@ -604,11 +704,14 @@ HWY_NOINLINE void TestAllLogicalMask() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyLogicalTest); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign); ++HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllFirstN); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse); ++HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit); +@@ -617,5 +720,5 @@ HWY_EXPORT_AND_TEST_P(HwyLogicalTest, Te + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue); + HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12 2021-06-02 10:56:05.247904453 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -12,6 +12,12 @@ + // See the License for the specific language governing permissions and + // limitations under the License. + ++// Ensure incompabilities with Windows macros (e.g. #define StoreFence) are ++// detected. Must come before Highway headers. 
++#if defined(_WIN32) || defined(_WIN64) ++#include ++#endif ++ + #include + #include + +@@ -76,6 +82,119 @@ HWY_NOINLINE void TestAllLoadStore() { + ForAllTypes(ForPartialVectors()); + } + ++struct TestStoreInterleaved3 { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ ++ RandomState rng; ++ ++ // Data to be interleaved ++ auto bytes = AllocateAligned(3 * N); ++ for (size_t i = 0; i < 3 * N; ++i) { ++ bytes[i] = static_cast(Random32(&rng) & 0xFF); ++ } ++ const auto in0 = Load(d, &bytes[0 * N]); ++ const auto in1 = Load(d, &bytes[1 * N]); ++ const auto in2 = Load(d, &bytes[2 * N]); ++ ++ // Interleave here, ensure vector results match scalar ++ auto expected = AllocateAligned(4 * N); ++ auto actual_aligned = AllocateAligned(4 * N + 1); ++ T* actual = actual_aligned.get() + 1; ++ ++ for (size_t rep = 0; rep < 100; ++rep) { ++ for (size_t i = 0; i < N; ++i) { ++ expected[3 * i + 0] = bytes[0 * N + i]; ++ expected[3 * i + 1] = bytes[1 * N + i]; ++ expected[3 * i + 2] = bytes[2 * N + i]; ++ // Ensure we do not write more than 3*N bytes ++ expected[3 * N + i] = actual[3 * N + i] = 0; ++ } ++ StoreInterleaved3(in0, in1, in2, d, actual); ++ size_t pos = 0; ++ if (!BytesEqual(expected.get(), actual, 4 * N, &pos)) { ++ Print(d, "in0", in0, pos / 3); ++ Print(d, "in1", in1, pos / 3); ++ Print(d, "in2", in2, pos / 3); ++ const size_t i = pos - pos % 3; ++ fprintf(stderr, "interleaved %d %d %d %d %d %d\n", actual[i], ++ actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4], ++ actual[i + 5]); ++ HWY_ASSERT(false); ++ } ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllStoreInterleaved3() { ++#if HWY_TARGET == HWY_RVV ++ // Segments are limited to 8 registers, so we can only go up to LMUL=2. ++ const ForExtendableVectors test; ++#else ++ const ForPartialVectors test; ++#endif ++ test(uint8_t()); ++} ++ ++struct TestStoreInterleaved4 { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ const size_t N = Lanes(d); ++ ++ RandomState rng; ++ ++ // Data to be interleaved ++ auto bytes = AllocateAligned(4 * N); ++ for (size_t i = 0; i < 4 * N; ++i) { ++ bytes[i] = static_cast(Random32(&rng) & 0xFF); ++ } ++ const auto in0 = Load(d, &bytes[0 * N]); ++ const auto in1 = Load(d, &bytes[1 * N]); ++ const auto in2 = Load(d, &bytes[2 * N]); ++ const auto in3 = Load(d, &bytes[3 * N]); ++ ++ // Interleave here, ensure vector results match scalar ++ auto expected = AllocateAligned(5 * N); ++ auto actual_aligned = AllocateAligned(5 * N + 1); ++ T* actual = actual_aligned.get() + 1; ++ ++ for (size_t rep = 0; rep < 100; ++rep) { ++ for (size_t i = 0; i < N; ++i) { ++ expected[4 * i + 0] = bytes[0 * N + i]; ++ expected[4 * i + 1] = bytes[1 * N + i]; ++ expected[4 * i + 2] = bytes[2 * N + i]; ++ expected[4 * i + 3] = bytes[3 * N + i]; ++ // Ensure we do not write more than 4*N bytes ++ expected[4 * N + i] = actual[4 * N + i] = 0; ++ } ++ StoreInterleaved4(in0, in1, in2, in3, d, actual); ++ size_t pos = 0; ++ if (!BytesEqual(expected.get(), actual, 5 * N, &pos)) { ++ Print(d, "in0", in0, pos / 4); ++ Print(d, "in1", in1, pos / 4); ++ Print(d, "in2", in2, pos / 4); ++ Print(d, "in3", in3, pos / 4); ++ const size_t i = pos; ++ fprintf(stderr, "interleaved %d %d %d %d %d %d %d %d\n", actual[i], ++ actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4], ++ actual[i + 5], actual[i + 6], actual[i + 7]); ++ HWY_ASSERT(false); ++ } ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllStoreInterleaved4() { ++#if HWY_TARGET == HWY_RVV ++ // Segments are limited to 8 
registers, so we can only go up to LMUL=2. ++ const ForExtendableVectors test; ++#else ++ const ForPartialVectors test; ++#endif ++ test(uint8_t()); ++} ++ + struct TestLoadDup128 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +@@ -86,13 +205,14 @@ struct TestLoadDup128 { + for (size_t i = 0; i < N128; ++i) { + lanes[i] = static_cast(1 + i); + } +- const auto v = LoadDup128(d, lanes); ++ + const size_t N = Lanes(d); +- auto out = AllocateAligned(N); +- Store(v, d, out.get()); ++ auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { +- HWY_ASSERT_EQ(T(i % N128 + 1), out[i]); ++ expected[i] = static_cast(i % N128 + 1); + } ++ ++ HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes)); + #else + (void)d; + #endif +@@ -136,6 +256,84 @@ HWY_NOINLINE void TestAllStream() { + ForFloatTypes(test); + } + ++// Assumes little-endian byte order! ++struct TestScatter { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ using Offset = MakeSigned; ++ ++ const size_t N = Lanes(d); ++ const size_t range = 4 * N; // number of items to scatter ++ const size_t max_bytes = range * sizeof(T); // upper bound on offset ++ ++ RandomState rng; ++ ++ // Data to be scattered ++ auto bytes = AllocateAligned(max_bytes); ++ for (size_t i = 0; i < max_bytes; ++i) { ++ bytes[i] = static_cast(Random32(&rng) & 0xFF); ++ } ++ const auto data = Load(d, reinterpret_cast(bytes.get())); ++ ++ // Scatter into these regions, ensure vector results match scalar ++ auto expected = AllocateAligned(range); ++ auto actual = AllocateAligned(range); ++ ++ const Rebind d_offsets; ++ auto offsets = AllocateAligned(N); // or indices ++ ++ for (size_t rep = 0; rep < 100; ++rep) { ++ // Byte offsets ++ std::fill(expected.get(), expected.get() + range, T(0)); ++ std::fill(actual.get(), actual.get() + range, T(0)); ++ for (size_t i = 0; i < N; ++i) { ++ offsets[i] = ++ static_cast(Random32(&rng) % (max_bytes - sizeof(T))); ++ CopyBytes( ++ bytes.get() + i * sizeof(T), ++ reinterpret_cast(expected.get()) + offsets[i]); ++ } ++ const auto voffsets = Load(d_offsets, offsets.get()); ++ ScatterOffset(data, d, actual.get(), voffsets); ++ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { ++ Print(d, "Data", data); ++ Print(d_offsets, "Offsets", voffsets); ++ HWY_ASSERT(false); ++ } ++ ++ // Indices ++ std::fill(expected.get(), expected.get() + range, T(0)); ++ std::fill(actual.get(), actual.get() + range, T(0)); ++ for (size_t i = 0; i < N; ++i) { ++ offsets[i] = static_cast(Random32(&rng) % range); ++ CopyBytes(bytes.get() + i * sizeof(T), ++ &expected[offsets[i]]); ++ } ++ const auto vindices = Load(d_offsets, offsets.get()); ++ ScatterIndex(data, d, actual.get(), vindices); ++ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { ++ Print(d, "Data", data); ++ Print(d_offsets, "Indices", vindices); ++ HWY_ASSERT(false); ++ } ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllScatter() { ++ // No u8,u16,i8,i16. ++ const ForPartialVectors test; ++ test(uint32_t()); ++ test(int32_t()); ++ ++#if HWY_CAP_INTEGER64 ++ test(uint64_t()); ++ test(int64_t()); ++#endif ++ ++ ForFloatTypes(test); ++} ++ + struct TestGather { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +@@ -183,21 +381,15 @@ struct TestGather { + + HWY_NOINLINE void TestAllGather() { + // No u8,u16,i8,i16. 
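TestScatter above emulates the two scatter flavors exactly as a scalar implementation would: ScatterOffset interprets its operand as byte offsets from the destination base (hence the CopyBytes), ScatterIndex interprets it as lane indices. A plain C++ sketch of both (the *Scalar names are invented for this example):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Byte-offset flavor: element i lands at base + offsets[i] bytes.
template <typename T, typename Offset>
void ScatterOffsetScalar(const T* v, size_t n, uint8_t* base,
                         const Offset* offsets) {
  for (size_t i = 0; i < n; ++i) {
    std::memcpy(base + offsets[i], &v[i], sizeof(T));
  }
}

// Index flavor: element i lands at base[indices[i]].
template <typename T, typename Index>
void ScatterIndexScalar(const T* v, size_t n, T* base, const Index* indices) {
  for (size_t i = 0; i < n; ++i) {
    base[indices[i]] = v[i];
  }
}

int main() {
  const int32_t v[2] = {7, 9};
  int32_t dst[4] = {0, 0, 0, 0};
  const int32_t idx[2] = {3, 1};
  ScatterIndexScalar(v, 2, dst, idx);  // dst becomes {0, 9, 0, 7}
  const int32_t off[2] = {0, 8};       // byte offsets of lanes 0 and 2
  ScatterOffsetScalar(v, 2, reinterpret_cast<uint8_t*>(dst), off);
  std::printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  // 7 9 9 7
  return 0;
}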
+- const ForPartialVectors test32; +- test32(uint32_t()); +- test32(int32_t()); ++ const ForPartialVectors test; ++ test(uint32_t()); ++ test(int32_t()); + + #if HWY_CAP_INTEGER64 +- const ForPartialVectors test64; +- test64(uint64_t()); +- test64(int64_t()); +-#endif +- +- ForPartialVectors()(float()); +- +-#if HWY_CAP_FLOAT64 +- ForPartialVectors()(double()); ++ test(uint64_t()); ++ test(int64_t()); + #endif ++ ForFloatTypes(test); + } + + HWY_NOINLINE void TestAllCache() { +@@ -206,6 +398,7 @@ HWY_NOINLINE void TestAllCache() { + int test = 0; + Prefetch(&test); + FlushCacheline(&test); ++ Pause(); + } + + // NOLINTNEXTLINE(google-readability-namespace-comments) +@@ -214,11 +407,15 @@ HWY_NOINLINE void TestAllCache() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwyMemoryTest); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore); ++HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3); ++HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream); ++HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather); + HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12 2021-06-02 10:56:05.259904513 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc 2021-05-31 10:37:11.000000000 -0400 +@@ -223,6 +223,7 @@ struct TestTableLookupBytes { + HWY_NOINLINE void TestAllTableLookupBytes() { + ForIntegerTypes(ForPartialVectors()); + } ++ + struct TestTableLookupLanes { + #if HWY_TARGET == HWY_RVV + using Index = uint32_t; +@@ -242,12 +243,13 @@ struct TestTableLookupLanes { + if (N <= 8) { // Test all permutations + for (size_t i0 = 0; i0 < N; ++i0) { + idx[0] = static_cast(i0); ++ + for (size_t i1 = 0; i1 < N; ++i1) { +- idx[1] = static_cast(i1); ++ if (N >= 2) idx[1] = static_cast(i1); + for (size_t i2 = 0; i2 < N; ++i2) { +- idx[2] = static_cast(i2); ++ if (N >= 4) idx[2] = static_cast(i2); + for (size_t i3 = 0; i3 < N; ++i3) { +- idx[3] = static_cast(i3); ++ if (N >= 4) idx[3] = static_cast(i3); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast(idx[i] + 1); // == v[idx[i]] +@@ -286,7 +288,7 @@ struct TestTableLookupLanes { + }; + + HWY_NOINLINE void TestAllTableLookupLanes() { +- const ForFullVectors test; ++ const ForPartialVectors test; + test(uint32_t()); + test(int32_t()); + test(float()); +@@ -624,6 +626,7 @@ HWY_NOINLINE void TestAllOddEven() { + HWY_AFTER_NAMESPACE(); + + #if HWY_ONCE ++namespace hwy { + HWY_BEFORE_TEST(HwySwizzleTest); + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftBytes); + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftLanes); +@@ -637,5 +640,5 @@ HWY_EXPORT_AND_TEST_P(HwySwizzleTest, Te + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatLowerUpper); + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatUpperLower); + HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven); +-HWY_AFTER_TEST(); ++} // namespace hwy + #endif +diff -up 
chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h +--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 2021-06-02 10:56:05.254904488 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h 2021-05-31 10:37:11.000000000 -0400 +@@ -23,7 +23,6 @@ + #include + #include + +-#include // isfinite + #include + #include + #include // std::forward +@@ -73,7 +72,8 @@ class TestWithParamTarget : public testi + + // Function to convert the test parameter of a TestWithParamTarget for + // displaying it in the gtest test name. +-std::string TestParamTargetName(const testing::TestParamInfo& info) { ++static inline std::string TestParamTargetName( ++ const testing::TestParamInfo& info) { + return TargetName(info.param); + } + +@@ -157,31 +157,10 @@ std::string TestParamTargetNameAndT( + static_assert(true, "For requiring trailing semicolon") + + #define HWY_BEFORE_TEST(suite) \ +- namespace hwy { \ + class suite : public hwy::TestWithParamTarget {}; \ + HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \ + static_assert(true, "For requiring trailing semicolon") + +-#define HWY_AFTER_TEST() \ +- } /* namespace hwy */ \ +- static_assert(true, "For requiring trailing semicolon") +- +-// Calls test for each enabled and available target. +-template +-HWY_NOINLINE void RunTest(const Func& func, Args&&... args) { +- SetSupportedTargetsForTest(0); +- auto targets = SupportedAndGeneratedTargets(); +- +- for (uint32_t target : targets) { +- SetSupportedTargetsForTest(target); +- fprintf(stderr, "Testing for target %s.\n", +- TargetName(static_cast(target))); +- func(std::forward(args)...); +- } +- // Disable the mask after the test. +- SetSupportedTargetsForTest(0); +-} +- + // 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937, + // which triggers a compiler bug. + class RandomState { +@@ -223,9 +202,11 @@ static HWY_INLINE uint32_t Random32(Rand + // built-in types. + template + inline void PreventElision(T&& output) { +-#ifndef _MSC_VER ++#if HWY_COMPILER_MSVC ++ (void)output; ++#else // HWY_COMPILER_MSVC + asm volatile("" : "+r"(output) : : "memory"); +-#endif ++#endif // HWY_COMPILER_MSVC + } + + // Returns a name for the vector/part/scalar. The type prefix is u/i/f for +@@ -234,23 +215,34 @@ inline void PreventElision(T&& output) { + // understanding which instantiation of a generic test failed. + template + static inline std::string TypeName(T /*unused*/, size_t N) { +- std::string prefix(IsFloat() ? "f" : (IsSigned() ? "i" : "u")); +- prefix += std::to_string(sizeof(T) * 8); +- +- // Scalars: omit the xN suffix. +- if (N == 1) return prefix; +- +- return prefix + 'x' + std::to_string(N); ++ const char prefix = IsFloat() ? 'f' : (IsSigned() ? 'i' : 'u'); ++ char name[64]; ++ // Omit the xN suffix for scalars. 
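PreventElision in the hunk above keeps computed values observable to the optimizer: an empty GNU-style asm statement with a "+r" constraint and a "memory" clobber makes the compiler treat the value as read and possibly modified, so the work that produced it cannot be dead-code eliminated. A freestanding sketch of the same idiom for scalar types (KeepAlive is an invented name; the non-GNU fallback is an assumption, not the library's code):

#include <cstdio>

template <typename T>
inline void KeepAlive(T& value) {
#if defined(__GNUC__) || defined(__clang__)
  asm volatile("" : "+r"(value) : : "memory");  // value must be materialized in a register
#else
  volatile T sink = value;  // assumed fallback: force a store instead
  (void)sink;
#endif
}

int main() {
  int sum = 0;
  for (int i = 0; i < 1000; ++i) sum += i;
  KeepAlive(sum);  // the loop result cannot be discarded as dead code
  std::printf("%d\n", sum);
  return 0;
}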
++ if (N == 1) { ++ snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8); ++ } else { ++ snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N); ++ } ++ return name; + } + + // String comparison + + template +-inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size) { ++inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size, ++ size_t* pos = nullptr) { + const uint8_t* bytes1 = reinterpret_cast(p1); + const uint8_t* bytes2 = reinterpret_cast(p2); + for (size_t i = 0; i < size; ++i) { +- if (bytes1[i] != bytes2[i]) return false; ++ if (bytes1[i] != bytes2[i]) { ++ fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i, ++ size, bytes1[i], bytes2[i], TypeName(T1(), 1).c_str(), ++ TypeName(T2(), 1).c_str()); ++ if (pos != nullptr) { ++ *pos = i; ++ } ++ return false; ++ } + } + return true; + } +@@ -287,11 +279,11 @@ HWY_NOINLINE void Print(const D d, const + auto lanes = AllocateAligned(N); + Store(v, d, lanes.get()); + const size_t begin = static_cast(std::max(0, lane - 2)); +- const size_t end = std::min(begin + 5, N); ++ const size_t end = std::min(begin + 7, N); + fprintf(stderr, "%s %s [%zu+ ->]:\n ", TypeName(T(), N).c_str(), caption, + begin); + for (size_t i = begin; i < end; ++i) { +- fprintf(stderr, "%s,", std::to_string(lanes[i]).c_str()); ++ fprintf(stderr, "%g,", double(lanes[i])); + } + if (begin >= end) fprintf(stderr, "(out of bounds)"); + fprintf(stderr, "\n"); +@@ -352,10 +344,12 @@ HWY_NOINLINE void AssertEqual(const T ex + const char* filename = "", const int line = -1, + const size_t lane = 0) { + if (!IsEqual(expected, actual)) { +- const std::string expected_str = std::to_string(expected); +- const std::string actual_str = std::to_string(actual); +- NotifyFailure(filename, line, type_name.c_str(), lane, expected_str.c_str(), +- actual_str.c_str()); ++ char expected_str[100]; ++ snprintf(expected_str, sizeof(expected_str), "%g", double(expected)); ++ char actual_str[100]; ++ snprintf(actual_str, sizeof(actual_str), "%g", double(actual)); ++ NotifyFailure(filename, line, type_name.c_str(), lane, expected_str, ++ actual_str); + } + } + +@@ -382,9 +376,15 @@ HWY_NOINLINE void AssertVecEqual(D d, co + fprintf(stderr, "\n\n"); + Print(d, "expect", expected, i); + Print(d, "actual", actual, i); ++ ++ char expected_str[100]; ++ snprintf(expected_str, sizeof(expected_str), "%g", ++ double(expected_lanes[i])); ++ char actual_str[100]; ++ snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i])); ++ + NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i, +- std::to_string(expected_lanes[i]).c_str(), +- std::to_string(actual_lanes[i]).c_str()); ++ expected_str, actual_str); + } + } + } +@@ -458,11 +458,8 @@ struct ForeachSizeR ++// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes]. ++template + struct ForPartialVectors { + template + void operator()(T /*unused*/) const { +@@ -470,8 +467,8 @@ struct ForPartialVectors { + // Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full. + ForeachSizeR::Do(); + #else +- ForeachSizeR::Do(); ++ ForeachSizeR::Do(); + #endif + } + }; +@@ -505,33 +502,19 @@ struct ForGE128Vectors { + } + }; + +-// Calls Test for all powers of two in [128 bits, max bits/2]. +-template ++// Calls Test for all vectors that can be expanded by kFactor. 
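ForPartialVectors and ForExtendableVectors above expand to a compile-time loop: per the comment, Test is invoked for every power of two between a minimum lane count and HWY_LANES(T) divided by some factor. A simplified, self-contained sketch of that kind of recursion, passing a plain lane count where the real code passes a Simd descriptor (ForeachPow2 and PrintLaneCount are invented names):

#include <cstddef>
#include <cstdio>

// Invokes Test for kLanes, kLanes/2, ..., kMinLanes (all powers of two),
// terminating at the partial specialization below.
template <typename T, size_t kLanes, size_t kMinLanes, class Test>
struct ForeachPow2 {
  static void Do() {
    Test()(T(), kLanes);  // the real code passes a Simd<T, kLanes> tag here
    ForeachPow2<T, kLanes / 2, kMinLanes, Test>::Do();
  }
};

template <typename T, size_t kMinLanes, class Test>
struct ForeachPow2<T, kMinLanes, kMinLanes, Test> {
  static void Do() { Test()(T(), kMinLanes); }
};

struct PrintLaneCount {
  template <typename T>
  void operator()(T /*unused*/, size_t lanes) const {
    std::printf("%zu lanes\n", lanes);
  }
};

int main() {
  ForeachPow2<float, 16, 1, PrintLaneCount>::Do();  // 16, 8, 4, 2, 1
  return 0;
}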
++template + struct ForExtendableVectors { + template + void operator()(T /*unused*/) const { + #if HWY_TARGET == HWY_RVV +- ForeachSizeR::Do(); ++ ForeachSizeR::Do(); + #else +- ForeachSizeR::Do(); + #endif + } + }; +- +-// Calls Test for full vectors only. +-template +-struct ForFullVectors { +- template +- void operator()(T t) const { +-#if HWY_TARGET == HWY_RVV +- ForeachSizeR::Do(); +- (void)t; +-#else +- Test()(t, HWY_FULL(T)()); +-#endif +- } +-}; + + // Type lists to shorten call sites: + +diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE +diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in +diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE +diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in +diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE +diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSE +diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSEE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSEE +diff -up chromium-91.0.4472.77/third_party/highway/src/Makefile.12 chromium-91.0.4472.77/third_party/highway/src/Makefile +diff -up chromium-91.0.4472.77/third_party/highway/src/MakefileE.12 chromium-91.0.4472.77/third_party/highway/src/MakefileE +diff -up chromium-91.0.4472.77/third_party/highway/src/README.md.12 chromium-91.0.4472.77/third_party/highway/src/README.md +--- chromium-91.0.4472.77/third_party/highway/src/README.md.12 2021-06-02 10:56:05.295904696 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/README.md 2021-05-31 10:37:11.000000000 -0400 +@@ -15,15 +15,19 @@ applying the same operation to 'lanes'. + ## Current status + + Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD. +-A port to RVV is in progress. ++Ports to RVV and SVE/SVE2 are in progress. + + Version 0.11 is considered stable enough to use in other projects, and is + expected to remain backwards compatible unless serious issues are discovered + while implementing SVE/RVV targets. After these targets are added, Highway will + reach version 1.0. + +-Continuous integration tests use a recent version of Clang and older version of +-MSVC (VS2015). Also periodically tested on Clang 7-11 and GCC 8, 9 and 10.2.1. ++Continuous integration tests build with a recent version of Clang (running on ++x86 and QEMU for ARM) and MSVC from VS2015 (running on x86). ++ ++Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via ++GCC cross-compile and QEMU. See the ++[testing process](g3doc/release_testing_process.md) for details. + + The `contrib` directory contains SIMD-related utilities: an image class with + aligned rows, and a math library (16 functions already implemented, mostly +@@ -62,9 +66,11 @@ To test on all the attainable targets fo + default configuration skips baseline targets (e.g. scalar) that are superseded + by another baseline target. + ++Bazel is also supported for building, but it is not as widely used/tested. ++ + ## Quick start + +-You can use the `skeleton` examples inside examples/ as a starting point. 
++You can use the `benchmark` inside examples/ as a starting point. + + A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations + and their parameters, and the [instruction_matrix][instmtx] indicates the +diff -up chromium-91.0.4472.77/third_party/highway/src/README.mdE.12 chromium-91.0.4472.77/third_party/highway/src/README.mdE +diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.bat +--- chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 2021-06-02 10:56:05.293904685 -0400 ++++ chromium-91.0.4472.77/third_party/highway/src/run_tests.bat 2021-05-31 10:37:11.000000000 -0400 +@@ -2,9 +2,9 @@ + REM Switch directory of this batch file + cd %~dp0 + +-if not exist build mkdir build ++if not exist build_win mkdir build_win + +-cd build ++cd build_win + cmake .. -G Ninja || goto error + ninja || goto error + ctest -j || goto error +diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.batE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.batE +diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.sh.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.sh +diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.shE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.shE +diff -up chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time +diff -up chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1.12 chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1 diff --git a/chromium.spec b/chromium.spec index c606365..4370a44 100644 --- a/chromium.spec +++ b/chromium.spec @@ -7,7 +7,7 @@ # This flag is so I can build things very fast on a giant system. # Do not enable in Koji builds. 
-%global use_all_cpus 0 +%global use_all_cpus 1 %if %{use_all_cpus} %global numjobs %{_smp_build_ncpus} @@ -208,14 +208,14 @@ BuildRequires: libicu-devel >= 5.4 %global chromoting_client_id %nil %endif -%global majorversion 90 +%global majorversion 91 %if %{freeworld} Name: chromium%{chromium_channel}%{nsuffix} %else Name: chromium%{chromium_channel} %endif -Version: %{majorversion}.0.4430.212 +Version: %{majorversion}.0.4472.77 Release: 1%{?dist} %if %{?freeworld} %if %{?shared} @@ -234,7 +234,7 @@ License: BSD and LGPLv2+ and ASL 2.0 and IJG and MIT and GPLv2+ and ISC and Open ### Chromium Fedora Patches ### Patch0: chromium-70.0.3538.67-sandbox-pie.patch # Use /etc/chromium for initial_prefs -Patch1: chromium-89.0.4389.72-initial_prefs-etc-path.patch +Patch1: chromium-91.0.4472.77-initial_prefs-etc-path.patch # Use gn system files Patch2: chromium-67.0.3396.62-gn-system.patch # Do not prefix libpng functions @@ -249,7 +249,7 @@ Patch6: chromium-89.0.4389.72-norar.patch # https://gitweb.gentoo.org/repo/gentoo.git/tree/www-client/chromium/files/chromium-widevine-r3.patch Patch7: chromium-71.0.3578.98-widevine-r3.patch # Disable fontconfig cache magic that breaks remoting -Patch8: chromium-83.0.4103.61-disable-fontconfig-cache-magic.patch +Patch8: chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch # drop rsp clobber, which breaks gcc9 (thanks to Jeff Law) Patch9: chromium-78.0.3904.70-gcc9-drop-rsp-clobber.patch # Try to load widevine from other places @@ -274,16 +274,20 @@ Patch57: chromium-89.0.4389.72-missing-cstring-header.patch # prepare for using system ffmpeg (clean) # http://svnweb.mageia.org/packages/cauldron/chromium-browser-stable/current/SOURCES/chromium-53-ffmpeg-no-deprecation-errors.patch?view=markup Patch58: chromium-53-ffmpeg-no-deprecation-errors.patch -# https://github.com/stha09/chromium-patches/blob/master/chromium-90-angle-constexpr.patch -Patch59: chromium-90-angle-constexpr.patch -# https://github.com/stha09/chromium-patches/blob/master/chromium-90-CrossThreadCopier-qualification.patch -Patch60: chromium-90-CrossThreadCopier-qualification.patch -# https://github.com/stha09/chromium-patches/blob/master/chromium-90-quantization_utils-include.patch -Patch61: chromium-90-quantization_utils-include.patch +# https://github.com/stha09/chromium-patches/blob/master/chromium-91-pcscan-vector-types.patch +Patch59: chromium-91-pcscan-vector-types.patch +# https://github.com/stha09/chromium-patches/blob/master/chromium-91-libyuv-aarch64.patch +Patch60: chromium-91-libyuv-aarch64.patch +# Update third_party/highway to 0.12.2 +# this is needed for sane arm/aarch64 +Patch61: chromium-91.0.4472.77-update-highway-0.12.2.patch # https://github.com/stha09/chromium-patches/blob/master/chromium-90-ruy-include.patch Patch62: chromium-90-ruy-include.patch -# https://github.com/stha09/chromium-patches/blob/master/chromium-90-TokenizedOutput-include.patch -Patch63: chromium-90-TokenizedOutput-include.patch +# Extra CXXFLAGS for aarch64 +Patch63: chromium-91.0.4472.77-aarch64-cxxflags-addition.patch +# Fix issue where closure_compiler thinks java is only allowed in android builds +# https://bugs.chromium.org/p/chromium/issues/detail?id=1192875 +Patch64: chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch # Silence GCC warnings during gn compile Patch65: chromium-84.0.4147.105-gn-gcc-cleanup.patch @@ -300,9 +304,6 @@ Patch75: chromium-90.0.4430.72-fstatfix.patch Patch76: chromium-88.0.4324.182-rawhide-gcc-std-max-fix.patch # Fix symbol visibility with gcc on swiftshader's 
libEGL Patch77: chromium-88.0.4324.182-gcc-fix-swiftshader-libEGL-visibility.patch -# Include support for futex_time64 (64bit time on 32bit platforms) -# https://chromium.googlesource.com/chromium/src/+/955a586c63c4f99fb734e44221db63f5b2ca25a9%5E%21/#F0 -Patch78: chromium-89.0.4389.82-support-futex_time64.patch # Do not download proprietary widevine module in the background (thanks Debian) Patch79: chromium-90.0.4430.72-widevine-no-download.patch # Fix crashes with components/cast_* @@ -409,6 +410,7 @@ BuildRequires: harfbuzz-devel >= 2.4.0 %endif BuildRequires: libatomic BuildRequires: libcap-devel +BuildRequires: libcurl-devel %if 0%{?bundlelibdrm} #nothing %else @@ -924,12 +926,12 @@ udev. %patch56 -p1 -b .missing-cstdint %patch57 -p1 -b .missing-cstring %patch58 -p1 -b .ffmpeg-deprecations -%patch59 -p1 -b .angle-constexpr -%patch60 -p1 -b .CrossThreadCopier-qualification -%patch61 -p1 -b .quantization_utils-include +%patch59 -p1 -b .pcscan-vector-types +%patch60 -p1 -b .libyuv-aarch64 +%patch61 -p1 -b .update-highway-0.12.2 %patch62 -p1 -b .ruy-include -%patch63 -p1 -b .TokenizedOutput - +%patch63 -p1 -b .aarch64-cxxflags-addition +%patch64 -p1 -b .java-only-allowed %patch65 -p1 -b .gn-gcc-cleanup %patch66 -p1 -b .remoting-cstring %patch67 -p1 -b .i686-textrels @@ -939,7 +941,6 @@ udev. %patch76 -p1 -b .sigstkszfix %endif %patch77 -p1 -b .gcc-swiftshader-visibility -%patch78 -p1 -b .futex-time64 %patch79 -p1 -b .widevine-no-download %patch80 -p1 -b .EnumTable-crash @@ -1209,6 +1210,7 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/cros_system_api' \ 'third_party/dav1d' \ 'third_party/dawn' \ + 'third_party/dawn/third_party/khronos' \ 'third_party/depot_tools' \ 'third_party/devscripts' \ 'third_party/devtools-frontend' \ @@ -1247,6 +1249,7 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/googletest' \ 'third_party/grpc' \ 'third_party/harfbuzz-ng' \ + 'third_party/highway' \ 'third_party/hunspell' \ 'third_party/iccjpeg' \ 'third_party/icu' \ @@ -1268,6 +1271,7 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/libgifcodec' \ 'third_party/libjingle' \ 'third_party/libjpeg_turbo' \ + 'third_party/libjxl' \ 'third_party/libphonenumber' \ 'third_party/libpng' \ 'third_party/libsecret' \ @@ -1341,7 +1345,6 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/rnnoise' \ 'third_party/ruy' \ 'third_party/s2cellid' \ - 'third_party/schema_org' \ 'third_party/securemessage' \ 'third_party/shell-encryption' \ 'third_party/simplejson' \ @@ -1378,6 +1381,7 @@ build/linux/unbundle/remove_bundled_libraries.py \ 'third_party/wayland' \ 'third_party/web-animations-js' \ 'third_party/webdriver' \ + 'third_party/webgpu-cts' \ 'third_party/webrtc' \ 'third_party/webrtc/common_audio/third_party/ooura' \ 'third_party/webrtc/common_audio/third_party/spl_sqrt_floor' \ @@ -1943,6 +1947,10 @@ getent group chrome-remote-desktop >/dev/null || groupadd -r chrome-remote-deskt %lang(vi) %{chromium_path}/locales/vi.pak* %lang(zh_CN) %{chromium_path}/locales/zh-CN.pak* %lang(zh_TW) %{chromium_path}/locales/zh-TW.pak* +# These are psuedolocales, not real ones. +# So we just include them always. 
+%{chromium_path}/locales/ar-XB.pak* +%{chromium_path}/locales/en-XA.pak* %if %{build_headless} %files headless @@ -1999,6 +2007,9 @@ getent group chrome-remote-desktop >/dev/null || groupadd -r chrome-remote-deskt %changelog +* Tue Jun 1 2021 Tom Callaway - 91.0.4472.77-1 +- update to 91.0.4472.77 + * Tue May 18 2021 Tom Callaway - 90.0.4430.212-1 - update to 90.0.4430.212 diff --git a/clean_ffmpeg.sh b/clean_ffmpeg.sh index cf9c5f2..ac34ca2 100755 --- a/clean_ffmpeg.sh +++ b/clean_ffmpeg.sh @@ -127,6 +127,7 @@ header_files=" libavcodec/x86/inline_asm.h \ libavcodec/pixblockdsp.h \ libavcodec/pixels.h \ libavcodec/png.h \ + libavcodec/pngdsp.h \ libavcodec/put_bits.h \ libavcodec/qpeldsp.h \ libavcodec/ratecontrol.h \ @@ -297,7 +298,6 @@ mp3_files=" libavcodec/aarch64/aacpsdsp_init_aarch64.c \ libavcodec/sbrdsp.c \ libavcodec/sbrdsp_template.c \ libavcodec/sinewin.c \ - libavcodec/sinewin_fixed.c \ libavcodec/x86/dct_init.c \ libavcodec/x86/dct32.asm \ libavcodec/x86/imdct36.asm \ diff --git a/sources b/sources index 5fdb3e3..727f450 100644 --- a/sources +++ b/sources @@ -20,4 +20,4 @@ SHA512 (xcb-proto-1.14.tar.xz) = de66d568163b6da2be9d6c59984f3afa3acd119a7813786 SHA512 (depot_tools.git-master.tar.gz) = dc323888812b66cc92c53a24a8a58ccf9e2961be67aa21852bd091b8b49569071f06ae9104cb58950e6253ac3a29f0db0663e9f35ef2b1ea28696efb38b42708 SHA512 (NotoSansSymbols2-Regular.ttf) = 2644b42c3fdccfe12395f9b61553aced169a0f1dc09f5a0fd7898e9d0a372ee4422b6b1cdab3c86ecc91db437e9ae8a951e64e85edc3ac9e9fca428852dbb2ad SHA512 (NotoSansTibetan-Regular.ttf) = fb5a48fcaea80eebe7d692f6fcf00d59d47658a358d0ec8e046fc559873f88bd595b2da474d2826abd9e9305f3741c69058d867b1e6048f37fe7d71b5d3af36a -SHA512 (chromium-90.0.4430.212-clean.tar.xz) = 53c16fcb899ae5de73599a67c7652801b4779c9642c2dacc2f211e6c6accd455507594138e59dcbabe9f80493d78fd4d0d118a58284d9d62f149e549dbba8ccc +SHA512 (chromium-91.0.4472.77-clean.tar.xz) = 52e4daec5cbaaa91851d33c0699bb0529c2b84bf2d95937cd043914eaf7c75c9e2d512904038acd367888bc465dfe6e4217f2eb1670f2f9ee3cae1f2c2a57d0a