
12204 lines
504 KiB
Raw Permalink Normal View History

2021-06-02 16:11:50 -04:00
diff -up chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time
diff -up chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10.12 chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10
diff -up chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6.12 chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt
--- chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 2021-06-02 10:56:05.305904746 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt 2021-05-31 10:37:11.000000000 -0400
@@ -19,7 +19,7 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
-project(hwy VERSION 0.1)
+project(hwy VERSION 0.12.2) # Keep in sync with highway.h version
@@ -40,6 +40,8 @@ if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE RelWithDebInfo)
+set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?")
"int main() {
@@ -51,10 +53,13 @@ check_cxx_source_compiles(
+ hwy/contrib/image/
+ hwy/contrib/image/image.h
+ hwy/contrib/math/math-inl.h
- contrib/image/
- contrib/image/image.h
- contrib/math/math-inl.h
@@ -64,6 +69,7 @@ set(HWY_SOURCES
+ hwy/ops/arm_sve-inl.h
@@ -146,13 +152,28 @@ else()
+ -march=armv7-a
+ -mfpu=neon-vfpv4
+ -mfloat-abi=hard # must match the toolchain specified as CXX=
+ -mfp16-format=ieee # required for vcvt_f32_f16
+ )
+ endif() # HWY_CMAKE_ARM7
+endif() # !MSVC
add_library(hwy STATIC ${HWY_SOURCES})
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
+add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES})
+target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
+target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
# -------------------------------------------------------- install library
install(TARGETS hwy
@@ -166,9 +187,21 @@ foreach (source ${HWY_SOURCES})
-# Add a pkg-config file for libhwy and the test library.
+install(TARGETS hwy_contrib
+# Install all the headers keeping the relative path to the current directory
+# when installing them.
+foreach (source ${HWY_CONTRIB_SOURCES})
+ if ("${source}" MATCHES "\.h$")
+ get_filename_component(dirname "${source}" DIRECTORY)
+ install(FILES "${source}"
+ endif()
+# Add a pkg-config file for libhwy and the contrib/test libraries.
-foreach (pc libhwy.pc libhwy-test.pc)
+foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
@@ -193,34 +226,13 @@ add_custom_command(TARGET hwy POST_BUILD
# Avoids mismatch between GTest's static CRT and our dynamic.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-add_executable(skeleton hwy/examples/
-target_sources(skeleton PRIVATE
- hwy/examples/skeleton-inl.h
- hwy/examples/
- hwy/examples/skeleton.h
- hwy/examples/skeleton_shared.h)
-# observe the difference in targets printed.
-target_compile_options(skeleton PRIVATE ${HWY_FLAGS})
-target_link_libraries(skeleton hwy)
-# Similar: shared headers but without the runtime dispatch in
-add_executable(skeleton_static hwy/examples/
-target_sources(skeleton_static PRIVATE
- hwy/examples/skeleton-inl.h
- hwy/examples/skeleton_shared.h)
-target_compile_options(skeleton_static PRIVATE ${HWY_FLAGS})
-target_link_libraries(skeleton_static hwy)
# Programming exercise with integrated benchmark
add_executable(hwy_benchmark hwy/examples/
target_sources(hwy_benchmark PRIVATE
+# observe the difference in targets printed.
target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
target_link_libraries(hwy_benchmark hwy)
@@ -272,19 +284,21 @@ endif()
- contrib/image/
- # contrib/math/
+ hwy/contrib/image/
+ # hwy/contrib/math/
+ hwy/
+ hwy/
+ hwy/
+ hwy/
- hwy/tests/
- hwy/
- hwy/
+ hwy/tests/
@@ -293,11 +307,16 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILE
get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
add_executable(${TESTNAME} ${TESTFILE})
target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})
+ # Test all targets, not just the best/baseline. This changes the default
+ # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can
+ # cause compile errors because only one may be set, and other CMakeLists.txt
+ # that include us may set them.
+ target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
- target_link_libraries(${TESTNAME} hwy GTest::GTest GTest::Main)
+ target_link_libraries(${TESTNAME} hwy hwy_contrib GTest::GTest GTest::Main)
- target_link_libraries(${TESTNAME} hwy gtest gtest_main)
+ target_link_libraries(${TESTNAME} hwy hwy_contrib gtest gtest_main)
# Output test targets in the test directory.
set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE
diff -up chromium-91.0.4472.77/third_party/highway/src/ chromium-91.0.4472.77/third_party/highway/src/
diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/ chromium-91.0.4472.77/third_party/highway/src/contrib/image/
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/ chromium-91.0.4472.77/third_party/highway/src/contrib/image/
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/ chromium-91.0.4472.77/third_party/highway/src/contrib/math/
diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING
diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelog
--- chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 2021-06-02 10:56:05.151903967 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/debian/changelog 2021-05-31 10:37:11.000000000 -0400
@@ -1,3 +1,26 @@
+highway (0.12.2-1) UNRELEASED; urgency=medium
+ * fix scalar-only test and Windows macro conflict with Load/StoreFence
+ * replace deprecated wasm intrinsics
+ -- Jan Wassenberg <> Mon, 31 May 2021 16:00:00 +0200
+highway (0.12.1-1) UNRELEASED; urgency=medium
+ * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors
+ * fix warnings, faster ARM div/sqrt, separate hwy_contrib library
+ * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC
+ -- Jan Wassenberg <> Wed, 19 May 2021 15:00:00 +0200
+highway (0.12.0-1) UNRELEASED; urgency=medium
+ * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
+ * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
+ * Proper IEEE rounding, reduce libstdc++ usage, inlined math
+ -- Jan Wassenberg <> Thu, 15 Apr 2021 20:00:00 +0200
highway (0.11.1-1) UNRELEASED; urgency=medium
* Fix clang7 asan error, finish f16 conversions and add test
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelogE.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelogE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compat.12 chromium-91.0.4472.77/third_party/highway/src/debian/compat
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/compatE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/control.12 chromium-91.0.4472.77/third_party/highway/src/debian/control
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/controlE.12 chromium-91.0.4472.77/third_party/highway/src/debian/controlE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyright.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyright
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rules.12 chromium-91.0.4472.77/third_party/highway/src/debian/rules
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rulesE.12 chromium-91.0.4472.77/third_party/highway/src/debian/rulesE
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/format.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/format
diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf
Binary files chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 and chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf differ
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/ chromium-91.0.4472.77/third_party/highway/src/g3doc/
--- chromium-91.0.4472.77/third_party/highway/src/g3doc/ 2021-06-02 10:56:05.117903795 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/g3doc/ 2021-05-31 10:37:11.000000000 -0400
@@ -33,6 +33,12 @@ The public headers are:
* hwy/cache_control.h: defines stand-alone functions to control caching (e.g.
prefetching) and memory barriers, independent of actual SIMD.
+* hwy/nanobenchmark.h: library for precisely measuring elapsed time (under
+ varying inputs) for benchmarking small/medium regions of code.
+* hwy/tests/test_util-inl.h: defines macros for invoking tests on all
+ available targets, plus per-target functions useful in tests (e.g. Print).
SIMD implementations must be preceded and followed by the following:
@@ -61,76 +67,76 @@ HWY_AFTER_NAMESPACE();
## Vector and descriptor types
-Highway vectors consist of one or more 'lanes' of the same built-in type `T =
-uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `T = float##_t` for `## = 16,
-32, 64`. `float16_t` is an IEEE binary16 half-float and only supports load,
-store, and conversion to/from `float32_t`; infinity or NaN have
-implementation-defined results.
-Each vector has `N` lanes (a power of two, possibly unknown at compile time).
-Platforms such as x86 support multiple vector types, and other platforms require
-that vectors are built-in types. On RVV, vectors are sizeless and thus cannot be
-wrapped inside a class. The Highway API satisfies these constraints because it
-is designed around overloaded functions selected via a zero-sized tag parameter
-`d` of type `D = Simd<T, N>`. These are typically constructed using aliases:
-* `const HWY_FULL(T[, LMUL=1]) d;` chooses an `N` that results in a native
- vector for the current target. For targets (e.g. RVV) that support register
- groups, the optional `LMUL` (1, 2, 4, 8) specifies the number of registers
- in the group. This effectively multiplies the lane count in each operation
- by `LMUL`. For mixed-precision code, `LMUL` must be at least the ratio of
- the sizes of the largest and smallest type. `LMUL > 1` is more efficient on
- single-issue machines, but larger values reduce the effective number of
- registers, which may cause the compiler to spill them to memory.
+Highway vectors consist of one or more 'lanes' of the same built-in type
+`uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `float##_t` for `## = 16, 32, 64`.
+In Highway, `float16_t` (an IEEE binary16 half-float) only supports load, store,
+and conversion to/from `float32_t`; the behavior of `float16_t` infinity and NaN
+is implementation-defined due to ARMv7.
+On RVV, vectors are sizeless and cannot be wrapped inside a class. The Highway
+API allows using built-in types as vectors because operations are expressed as
+overloaded functions. Instead of constructors, overloaded initialization
+functions such as `Set` take a zero-sized tag argument called `d` of type `D =
+Simd<T, N>` and return an actual vector of unspecified type.
+`T` is one of the lane types above, and may be retrieved via `TFromD<D>`.
+`N` is target-dependent and not directly user-specified. The actual lane count
+may not be known at compile time, but can be obtained via `Lanes(d)`. Use this
+value, which is potentially different from `N`, to increment loop counters etc.
+It is typically a power of two, but that is not guaranteed e.g. on SVE.
+`d` lvalues (a tag, NOT actual vector) are typically obtained using two aliases:
+* Most common: pass `HWY_FULL(T[, LMUL=1]) d;` as an argument to return a
+ native vector. This is preferred because it fully utilizes vector lanes.
+ For targets (e.g. RVV) that support register groups, the optional `LMUL` (1,
+ 2, 4, 8) specifies the number of registers in the group. This effectively
+ multiplies the lane count in each operation by `LMUL`. For mixed-precision
+ code, `LMUL` must be at least the ratio of the sizes of the largest and
+ smallest type. `LMUL > 1` is more efficient on single-issue machines, but
+ larger values reduce the effective number of registers, which may cause the
+ compiler to spill them to memory.
+* Less common: pass `HWY_CAPPED(T, N) d;` as an argument to return a vector
+ which may be native width, but no more than `N` lanes have observable
+ effects such as loading/storing to memory. This is less performance-portable
+ because it may not use all available lanes. Note that the resulting lane
+ count may also be less than `N`.
+ For targets (e.g. RVV) that have compile-time-unknown lane counts, such
+ vectors incur additional runtime cost in `Load` etc.
+User-specified lane counts or tuples of vectors could cause spills on targets
+with fewer or smaller vectors. By contrast, Highway encourages vector-length
+agnostic code, which is more performance-portable.
+Given that lane counts are potentially compile-time-unknown, storage for vectors
+should be dynamically allocated, e.g. via `AllocateAligned(Lanes(d))`. For
+applications that require a compile-time estimate, `MaxLanes(d)` returns the `N`
+from `Simd<T, N>`, which is NOT necessarily the actual lane count. This is
+DISCOURAGED because it is not guaranteed to be an upper bound (RVV vectors may
+be very large) and some compilers are not able to interpret it as constexpr.
-* `const HWY_CAPPED(T, N) d;` for up to `N` lanes.
-For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), descriptors
-for the smaller types must be obtained from those of the larger type (e.g. via
+For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), tags for
+the smaller types must be obtained from those of the larger type (e.g. via
`Rebind<uint8_t, HWY_FULL(float)>`).
-The type `T` may be accessed as `TFromD<D>`. There are three possibilities for
-the template parameter `N`:
-1. Equal to the hardware vector width, e.g. when using `HWY_FULL(T)` on a
- target with compile-time constant vectors.
+## Using unspecified vector types
-1. Less than the hardware vector width. This is the result of a compile-time
- decision by the user, i.e. using `HWY_CAPPED(T, N)` to limit the number of
- lanes, even when the hardware vector width could be greater.
-1. Unrelated to the hardware vector width, e.g. when the hardware vector width
- is not known at compile-time and may be very large.
-In all cases, `Lanes(d)` returns the actual number of lanes, i.e. the amount by
-which to advance loop counters. `MaxLanes(d)` returns the `N` from `Simd<T, N>`,
-which is NOT necessarily the actual vector size (see above) and some compilers
-are not able to interpret it as constexpr. Instead of `MaxLanes`, prefer to use
-alternatives, e.g. `Rebind` or `aligned_allocator.h` for dynamic allocation of
-`Lanes(d)` elements.
-Highway is designed to map a vector variable to a (possibly partial) hardware
-register or register group. By discouraging user-specified `N` and tuples of
-vector variables, we improve performance portability (e.g. by reducing spills to
-memory for platforms that have smaller vectors than the developer expected).
-To construct vectors, call factory functions (see "Initialization" below) with
-a tag parameter `d`.
-Local variables typically use auto for type deduction. For some generic
-functions, a template argument `V` is sufficient: `template<class V> V Squared(V
-v) { return v * v; }`. In general, functions have a `D` template argument and
-can return vectors of type `Vec<D>`.
-Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined
-functions reside in `project::[nested]::HWY_NAMESPACE`. Because all Highway
-functions generally take either a `Simd` or vector argument, which are also
-defined in namespace `hwy`, they will typically be found via Argument-Dependent
-Lookup and namespace qualifiers are not necessary. As an exception, Highway
-functions that are templates (e.g. because they require a compile-time argument
-such as a lane index or shift count) require a using-declaration such as
-`using hwy::HWY_NAMESPACE::ShiftLeft`.
+Because vector types are unspecified, local vector variables are typically
+defined using `auto` for type deduction. A template argument `V` suffices for
+simple generic functions: `template<class V> V Squared(V v) { return v * v; }`.
+Many functions will need a `D` template argument in order to initialize any
+constants. They can use a separate `V` template argument for vectors, or use
+`Vec<D>`, or where an lvalue `d` is available, `decltype(Zero(d))`. Using such
+aliases instead of auto may improve readability of mixed-type code. They can
+also be used for member variables, which are discouraged because compilers often
+have difficulty mapping them to registers.
## Operations
@@ -141,6 +147,14 @@ unsigned, signed, and floating-point typ
bits per lane: 8, 16, 32, or 64. Any combination of the specified prefixes and
bits are allowed. Abbreviations of the form `u32 = {u}{32}` may also be used.
+Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined
+functions reside in `project::[nested]::HWY_NAMESPACE`. Highway functions
+generally take either a `Simd` or vector/mask argument. For targets where
+vectors and masks are defined in namespace `hwy`, the functions will be found
+via Argument-Dependent Lookup. However, this does not work for function
+templates, and RVV and SVE both use builtin vectors. Thus we recommend a `using
+namespace hwy::HWY_NAMESPACE;` directive inside `project::[nested]::HWY_NAMESPACE`.
### Initialization
* <code>V **Zero**(D)</code>: returns N-lane vector with all bits set to 0.
@@ -162,7 +176,7 @@ bits are allowed. Abbreviations of the f
* `V`: `{i,f}` \
<code>V **Neg**(V a)</code>: returns `-a[i]`.
-* `V`: `{i}{8,16,32}, {f}` \
+* `V`: `{i,f}` \
<code>V **Abs**(V a)</code> returns the absolute value of `a[i]`; for
integers, `LimitsMin()` maps to `LimitsMax() + 1`.
@@ -252,23 +266,24 @@ Left-shifting signed `T` and right-shift
shifting `MakeUnsigned<T>` and casting to `T`. Right-shifting negative signed
`T` is the same as an unsigned shift, except that 1-bits are shifted in.
-Compile-time constant shifts, generally the most efficient variant:
+Compile-time constant shifts, generally the most efficient variant (though 8-bit
+shifts are potentially slower than other lane sizes):
-* `V`: `{u,i}{16,32,64}` \
+* `V`: `{u,i}` \
<code>V **ShiftLeft**&lt;int&gt;(V a)</code> returns `a[i] << int`.
-* `V`: `{u,i}{16,32,64}` \
+* `V`: `{u,i}` \
<code>V **ShiftRight**&lt;int&gt;(V a)</code> returns `a[i] >> int`.
Shift all lanes by the same (not necessarily compile-time constant) amount:
-* `V`: `{u,i}{16,32,64}` \
+* `V`: `{u,i}` \
<code>V **ShiftLeftSame**(V a, int bits)</code> returns `a[i] << bits`.
-* `V`: `{u,i}{16,32,64}` \
+* `V`: `{u,i}` \
<code>V **ShiftRightSame**(V a, int bits)</code> returns `a[i] >> bits`.
-Per-lane variable shifts (slow if SSE4, or Shr i64 on AVX2):
+Per-lane variable shifts (slow if SSE4, or 16-bit, or Shr i64 on AVX2):
* `V`: `{u,i}{16,32,64}` \
<code>V **operator<<**(V a, V b)</code> returns `a[i] << b[i]`.
@@ -332,12 +347,17 @@ Special functions for signed types:
slightly more efficient; requires the first argument to be non-negative.
* `V`: `i32/64` \
- <code>V **BroadcastSignBit(V a)</code> returns `a[i] < 0 ? -1 : 0`.
+ <code>V **BroadcastSignBit**(V a)</code> returns `a[i] < 0 ? -1 : 0`.
### Masks
Let `M` denote a mask capable of storing true/false for each lane.
+* <code>M **FirstN**(D, size_t N)</code>: returns mask with the first `N`
+ lanes (those with index `< N`) true. `N` larger than `Lanes(D())` results in
+ an all-true mask. Useful for implementing "masked" stores by loading `prev`
+ followed by `IfThenElse(FirstN(d, N), what_to_store, prev)`.
* <code>M1 **RebindMask**(D, M2 m)</code>: returns same mask bits as `m`, but
reinterpreted as a mask for lanes of type `TFromD<D>`. `M1` and `M2` must
have the same number of lanes.
@@ -389,17 +409,18 @@ Let `M` denote a mask capable of storing
* <code>size_t **CountTrue**(M m)</code>: returns how many of `m[i]` are true
[0, N]. This is typically more expensive than AllTrue/False.
-* `V`: `{u,i,f}{32,64}` \
+* `V`: `{u,i,f}{16,32,64}` \
<code>V **Compress**(V v, M m)</code>: returns `r` such that `r[n]` is
`v[i]`, with `i` the n-th lane index (starting from 0) where `m[i]` is true.
Compacts lanes whose mask is set into the lower lanes; upper lanes are
- implementation-defined.
+ implementation-defined. Slow with 16-bit lanes.
-* `V`: `{u,i,f}{32,64}` \
+* `V`: `{u,i,f}{16,32,64}` \
<code>size_t **CompressStore**(V v, M m, D, T* aligned)</code>: writes lanes
whose mask is set into `aligned`, starting from lane 0. Returns
`CountTrue(m)`, the number of valid lanes. All subsequent lanes may be
- overwritten! Alignment ensures inactive lanes will not cause faults.
+ overwritten! Alignment ensures inactive lanes will not cause faults. Slow
+ with 16-bit lanes.
### Comparisons
@@ -429,10 +450,16 @@ Memory operands are little-endian, other
lane configuration. Pointers are the addresses of `N` consecutive `T` values,
either naturally-aligned (`aligned`) or possibly unaligned (`p`).
+**Note**: computations with low arithmetic intensity (FLOP/s per memory traffic
+bytes), e.g. dot product, can be *1.5 times as fast* when the memory operands
+are naturally aligned. An unaligned access may require two load ports.
#### Load
* <code>Vec&lt;D&gt; **Load**(D, const T* aligned)</code>: returns
- `aligned[i]`.
+ `aligned[i]`. May fault if the pointer is not aligned to the vector size.
+ Using this whenever possible improves codegen on SSE4: unlike `LoadU`,
+ `Load` can be fused into a memory operand, which reduces register pressure.
* <code>Vec&lt;D&gt; **LoadU**(D, const T* p)</code>: returns `p[i]`.
* <code>Vec&lt;D&gt; **LoadDup128**(D, const T* p)</code>: returns one 128-bit
@@ -440,19 +467,31 @@ either naturally-aligned (`aligned`) or
be faster than broadcasting single values, and is more convenient than
preparing constants for the actual vector length.
-#### Gather
+#### Scatter/Gather
-**Note**: Vectors must be `HWY_CAPPED(T, HWY_GATHER_LANES(T))`:
+**Note**: Offsets/indices are of type `VI = Vec<RebindToSigned<D>>` and need not
+be unique. The results are implementation-defined if any are negative.
-* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \
- <code>Vec&lt;D&gt; **GatherOffset**(D, const T* base, VI offsets)</code>.
- Returns elements of base selected by possibly repeated *byte* `offsets[i]`.
- Results are implementation-defined if `offsets[i]` is negative.
-* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \
- <code>Vec&lt;D&gt; **GatherIndex**(D, const T* base, VI indices)</code>.
- Returns vector of `base[indices[i]]`. Indices need not be unique, but
- results are implementation-defined if they are negative.
+**Note**: Where possible, applications should `Load/Store/TableLookup*` entire
+vectors, which is much faster than `Scatter/Gather`. Otherwise, code of the form
+`dst[tbl[i]] = F(src[i])` should when possible be transformed to `dst[i] =
+F(src[tbl[i]])` because `Scatter` is more expensive than `Gather`.
+* `D`: `{u,i,f}{32,64}` \
+ <code>void **ScatterOffset**(Vec&lt;D&gt; v, D, const T* base, VI
+ offsets)</code>: stores `v[i]` to the base address plus *byte* `offsets[i]`.
+* `D`: `{u,i,f}{32,64}` \
+ <code>void **ScatterIndex**(Vec&lt;D&gt; v, D, const T* base, VI
+ indices)</code>: stores `v[i]` to `base[indices[i]]`.
+* `D`: `{u,i,f}{32,64}` \
+ <code>Vec&lt;D&gt; **GatherOffset**(D, const T* base, VI offsets)</code>:
+ returns elements of base selected by *byte* `offsets[i]`.
+* `D`: `{u,i,f}{32,64}` \
+ <code>Vec&lt;D&gt; **GatherIndex**(D, const T* base, VI indices)</code>:
+ returns vector of `base[indices[i]]`.
#### Store
@@ -462,6 +501,17 @@ either naturally-aligned (`aligned`) or
* <code>void **StoreU**(Vec&lt;D&gt; a, D, T* p)</code>: as Store, but without
the alignment requirement.
+* `D`: `u8` \
+ <code>void **StoreInterleaved3**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1,
+ Vec&lt;D&gt; v2, D, T* p)</code>: equivalent to shuffling `v0, v1, v2`
+ followed by three `StoreU()`, such that `p[0] == v0[0], p[1] == v1[0],
+ p[2] == v2[0]`. Useful for RGB samples.
+* `D`: `u8` \
+ <code>void **StoreInterleaved4**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1,
+ Vec&lt;D&gt; v2, Vec&lt;D&gt; v3, D, T* p)</code>: as above, but for four
+ vectors (e.g. RGBA samples).
### Cache control
All functions except Stream are defined in cache_control.h.
@@ -483,6 +533,9 @@ All functions except Stream are defined
* <code>void **Prefetch**(const T* p)</code>: begins loading the cache line
containing "p".
+* <code>void **Pause**()</code>: when called inside a spin-loop, may reduce
+ power consumption.
### Type conversion
* <code>Vec&lt;D&gt; **BitCast**(D, V)</code>: returns the bits of `V`
@@ -525,7 +578,8 @@ if the input exceeds the destination ran
zero and converts the value to same-sized integer.
* `V`: `f32`; `Ret`: `i32` \
- <code>Ret **NearestInt**(V a)</code>: returns the integer nearest to `a[i]`.
+ <code>Ret **NearestInt**(V a)</code>: returns the integer nearest to `a[i]`;
+ results are undefined for NaN.
### Swizzle
@@ -652,9 +706,9 @@ more expensive on AVX2/AVX-512 than with
### Reductions
-**Note**: the following are only available for full vectors (including scalar).
-These 'reduce' all lanes to a single result. This result is broadcasted to all
-lanes at no extra cost; you can use `GetLane` to obtain the value.
+**Note**: these 'reduce' all lanes to a single result (e.g. sum), which is
+broadcasted to all lanes at no extra cost. To obtain a scalar, you can call `GetLane`.
Being a horizontal operation (across lanes of the same vector), these are slower
than normal SIMD operations and are typically used outside critical loops.
@@ -697,9 +751,6 @@ generate such instructions (implying the
finally reverts to `HWY_STATIC_TARGET`. Can be used in `#if` expressions to
provide an alternative to functions which are not supported by HWY_SCALAR.
-* `HWY_LANES(T)`: how many lanes of type `T` in a full vector (>= 1). Used by
- HWY_FULL/CAPPED. Note: cannot be used in #if because it uses sizeof.
* `HWY_IDE` is 0 except when parsed by IDEs; adding it to conditions such as
`#if HWY_TARGET != HWY_SCALAR || HWY_IDE` avoids code appearing greyed out.
@@ -707,26 +758,15 @@ The following signal capabilities and ex
* `HWY_CAP_INTEGER64`: support for 64-bit signed/unsigned integer lanes.
* `HWY_CAP_FLOAT64`: support for double-precision floating-point lanes.
+The following were used to signal the maximum number of lanes for certain
+operations, but this is no longer necessary (nor possible on SVE/RVV), so they are deprecated:
* `HWY_CAP_GE256`: the current target supports vectors of >= 256 bits.
* `HWY_CAP_GE512`: the current target supports vectors of >= 512 bits.
-The following indicate the maximum number of lanes for certain operations. For
-targets that support the feature/operation, the macro evaluates to
-`HWY_LANES(T)`, otherwise 1. Using `HWY_CAPPED(T, HWY_GATHER_LANES(T))`
-generates the best possible code (or scalar fallback) from the same source code.
-* `HWY_GATHER_LANES(T)`: supports GatherIndex/Offset.
-* `HWY_VARIABLE_SHIFT_LANES(T)`: supports per-lane shift amounts (v1 << v2).
- DEPRECATED, this always matches HWY_LANES(T) and will be removed.
-As above, but the feature implies the type so there is no T parameter, thus
-these can be used in `#if` expressions.
-* `HWY_COMPARE64_LANES`: 64-bit signed integer comparisons. DEPRECATED, this
- always matches HWY_LANES(int64_t) and will be removed.
-* `HWY_MINMAX64_LANES`: 64-bit signed/unsigned integer min/max. DEPRECATED,
- this always matches HWY_LANES(int64_t) and will be removed.
## Detecting supported targets
`SupportedTargets()` returns a cached (initialized on-demand) bitfield of the
@@ -778,8 +818,10 @@ policy for selecting `HWY_TARGETS`:
and permitted by the compiler, independently of autovectorization), which
maximizes coverage in tests.
-If none are defined, the default is to select all attainable targets except any
-non-best baseline (typically `HWY_SCALAR`), which reduces code size.
+If none are defined, but `HWY_IS_TEST` is defined, the default is
+`HWY_COMPILE_ALL_ATTAINABLE`. Otherwise, the default is to select all attainable
+targets except any non-best baseline (typically `HWY_SCALAR`), which reduces
+code size.
## Compiler support
@@ -787,7 +829,8 @@ Clang and GCC require e.g. -mavx2 flags
However, this enables AVX2 instructions in the entire translation unit, which
may violate the one-definition rule and cause crashes. Instead, we use
target-specific attributes introduced via #pragma. Function using SIMD must
+reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. Alternatively,
+individual functions or lambdas may be prefixed with `HWY_ATTR`.
Immediates (compile-time constants) are specified as template arguments to avoid
constant-propagation issues with Clang on ARM.
diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ chromium-91.0.4472.77/third_party/highway/src/hwy/
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 2021-06-02 10:56:05.278904609 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h 2021-05-31 10:37:11.000000000 -0400
@@ -111,6 +111,32 @@ AlignedUniquePtr<T> MakeUniqueAligned(Ar
new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
+// Helpers for array allocators (avoids overflow)
+namespace detail {
+// Returns x such that 1u << x == n (if n is a power of two).
+static inline constexpr size_t ShiftCount(size_t n) {
+ return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
+template <typename T>
+T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
+ constexpr size_t size = sizeof(T);
+ constexpr bool is_pow2 = (size & (size - 1)) == 0;
+ constexpr size_t bits = ShiftCount(size);
+ static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
+ const size_t bytes = is_pow2 ? items << bits : items * size;
+ const size_t check = is_pow2 ? bytes >> bits : bytes / size;
+ if (check != items) {
+ return nullptr; // overflowed
+ }
+ return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
+} // namespace detail
// Aligned memory equivalent of make_unique<T[]> for array types using the
// custom allocators alloc/free. This function calls the constructor with the
// passed Args... on every created item. The destructor of each element will be
@@ -118,10 +144,11 @@ AlignedUniquePtr<T> MakeUniqueAligned(Ar
template <typename T, typename... Args>
AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
- T* ptr =
- static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque));
- for (size_t i = 0; i < items; i++) {
- new (ptr + i) T(std::forward<Args>(args)...);
+ T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
+ if (ptr != nullptr) {
+ for (size_t i = 0; i < items; i++) {
+ new (ptr + i) T(std::forward<Args>(args)...);
+ }
return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
@@ -165,7 +192,7 @@ template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
FreePtr free, void* opaque) {
return AlignedFreeUniquePtr<T[]>(
- static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)),
+ detail::AllocateAlignedItems<T>(items, alloc, opaque),
AlignedFreer(free, opaque));
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 2021-06-02 10:56:05.273904584 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc 2021-05-31 10:37:11.000000000 -0400
@@ -16,6 +16,7 @@
#include <stddef.h>
+#include <array>
#include <new>
#include <random>
#include <vector>
@@ -87,13 +88,39 @@ TEST(AlignedAllocatorTest, FreeNullptr)
+TEST(AlignedAllocatorTest, Log2) {
+ EXPECT_EQ(0u, detail::ShiftCount(1));
+ EXPECT_EQ(1u, detail::ShiftCount(2));
+ EXPECT_EQ(3u, detail::ShiftCount(8));
+// Allocator returns null when it detects overflow of items * sizeof(T).
+TEST(AlignedAllocatorTest, Overflow) {
+ constexpr size_t max = ~size_t(0);
+ constexpr size_t msb = (max >> 1) + 1;
+ using Size5 = std::array<uint8_t, 5>;
+ using Size10 = std::array<uint8_t, 10>;
+ EXPECT_EQ(nullptr,
+ detail::AllocateAlignedItems<uint32_t>(max / 2, nullptr, nullptr));
+ EXPECT_EQ(nullptr,
+ detail::AllocateAlignedItems<uint32_t>(max / 3, nullptr, nullptr));
+ EXPECT_EQ(nullptr,
+ detail::AllocateAlignedItems<Size5>(max / 4, nullptr, nullptr));
+ EXPECT_EQ(nullptr,
+ detail::AllocateAlignedItems<uint16_t>(msb, nullptr, nullptr));
+ EXPECT_EQ(nullptr,
+ detail::AllocateAlignedItems<double>(msb + 1, nullptr, nullptr));
+ EXPECT_EQ(nullptr,
+ detail::AllocateAlignedItems<Size10>(msb / 4, nullptr, nullptr));
TEST(AlignedAllocatorTest, AllocDefaultPointers) {
const size_t kSize = 7777;
void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
ASSERT_NE(nullptr, ptr);
// Make sure the pointer is actually aligned.
- EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
char* p = static_cast<char*>(ptr);
size_t ret = 0;
for (size_t i = 0; i < kSize; i++) {
@@ -101,7 +128,7 @@ TEST(AlignedAllocatorTest, AllocDefaultP
p[i] = static_cast<char>(i & 0x7F);
if (i) ret += p[i] * p[i - 1];
- EXPECT_NE(0, ret);
+ EXPECT_NE(0U, ret);
FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
@@ -123,11 +150,11 @@ TEST(AlignedAllocatorTest, CustomAlloc)
AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
ASSERT_NE(nullptr, ptr);
// We should have only requested one alloc from the allocator.
- EXPECT_EQ(1u, fake_alloc.PendingAllocs());
+ EXPECT_EQ(1U, fake_alloc.PendingAllocs());
// Make sure the pointer is actually aligned.
- EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
- EXPECT_EQ(0u, fake_alloc.PendingAllocs());
+ EXPECT_EQ(0U, fake_alloc.PendingAllocs());
TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
@@ -170,7 +197,7 @@ TEST(AlignedAllocatorTest, MakeUniqueAli
TEST(AlignedAllocatorTest, AllocSingleInt) {
auto ptr = AllocateAligned<uint32_t>(1);
ASSERT_NE(nullptr, ptr.get());
- EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
// Force delete of the unique_ptr now to check that it doesn't crash.
EXPECT_EQ(nullptr, ptr.get());
@@ -180,7 +207,7 @@ TEST(AlignedAllocatorTest, AllocMultiple
const size_t kSize = 7777;
auto ptr = AllocateAligned<uint32_t>(kSize);
ASSERT_NE(nullptr, ptr.get());
- EXPECT_EQ(0, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
// ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
// underlying type chosen by AllocateAligned() for the std::unique_ptr.
EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));
@@ -191,7 +218,7 @@ TEST(AlignedAllocatorTest, AllocMultiple
ptr[i] = static_cast<uint32_t>(i);
if (i) ret += ptr[i] * ptr[i - 1];
- EXPECT_NE(0, ret);
+ EXPECT_NE(0U, ret);
TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
@@ -215,7 +242,8 @@ TEST(AlignedAllocatorTest, MakeUniqueAli
auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
- // An array shold still only call a single allocation.
+ ASSERT_NE(nullptr, arr.get());
+ // An array should still only call a single allocation.
EXPECT_EQ(1u, fake_alloc.PendingAllocs());
EXPECT_EQ(7, counter);
for (size_t i = 0; i < 7; i++) {
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 2021-06-02 10:56:05.266904549 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/base.h 2021-05-31 10:37:11.000000000 -0400
@@ -34,7 +34,10 @@
// Detect compiler using predefined macros
-#ifdef _MSC_VER
+// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
+// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
+// purpose.
+#if defined(_MSC_VER) && !defined(__clang__)
@@ -200,6 +203,10 @@
#define HWY_ARCH_X86_64 0
+#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
+#error "Cannot have both x86-32 and x86-64"
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
@@ -212,14 +219,29 @@
#define HWY_ARCH_PPC 0
-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__)
+#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
+#define HWY_ARCH_ARM_A64 1
+#define HWY_ARCH_ARM_A64 0
+#if defined(__arm__) || defined(_M_ARM)
+#define HWY_ARCH_ARM_V7 1
+#define HWY_ARCH_ARM_V7 0
+#error "Cannot have both A64 and V7"
#define HWY_ARCH_ARM 1
#define HWY_ARCH_ARM 0
-// There isn't yet a standard __wasm or __wasm__.
-#ifdef __EMSCRIPTEN__
+#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#define HWY_ARCH_WASM 0
@@ -231,9 +253,11 @@
#define HWY_ARCH_RVV 0
+// It is an error to detect multiple architectures at the same time, but OK to
+// detect none of the above.
- HWY_ARCH_RVV) != 1
-#error "Must detect exactly one platform"
+#error "Must not detect more than one architecture"
@@ -308,13 +332,26 @@ static constexpr HWY_MAYBE_UNUSED size_t
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.
-// RVV already has a builtin type.
+// RVV already has a builtin type and the GCC intrinsics require it.
+#define HWY_NATIVE_FLOAT16 1
+#define HWY_NATIVE_FLOAT16 0
+using float16_t = __fp16;
+// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
+// arguments, so use a wrapper.
+// TODO(janwas): replace with _Float16 when that is supported?
+#pragma pack(push, 1)
struct float16_t {
- // __fp16 cannot be used as a function parameter in clang, so use a wrapper.
uint16_t bits;
+#pragma pack(pop)
using float32_t = float;
using float64_t = double;
@@ -506,6 +543,13 @@ struct Relations<int64_t> {
using Narrow = int32_t;
template <>
+struct Relations<float16_t> {
+ using Unsigned = uint16_t;
+ using Signed = int16_t;
+ using Float = float16_t;
+ using Wide = float;
+template <>
struct Relations<float> {
using Unsigned = uint32_t;
using Signed = int32_t;
@@ -551,13 +595,13 @@ constexpr inline size_t RoundUpTo(size_t
// Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
-#ifdef _MSC_VER
unsigned long index; // NOLINT
_BitScanForward(&index, x);
return index;
return static_cast<size_t>(__builtin_ctz(x));
HWY_API size_t PopCount(uint64_t x) {
@@ -565,7 +609,7 @@ HWY_API size_t PopCount(uint64_t x) {
return static_cast<size_t>(__builtin_popcountll(x));
return _mm_popcnt_u64(x);
return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
x -= ((x >> 1) & 0x55555555U);
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 2021-06-02 10:56:05.280904620 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h 2021-05-31 10:37:11.000000000 -0400
@@ -20,7 +20,9 @@
#include "hwy/base.h"
-#ifndef __SSE2__
+// Requires SSE2; fails to compile on 32-bit Clang 7 (see
+// https://github.com/gperftools/gperftools/issues/946).
+#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
@@ -30,6 +32,14 @@
#include <emmintrin.h> // SSE2
+// Windows.h #defines these, which causes infinite recursion. Temporarily
+// undefine them in this header; these functions are anyway deprecated.
+// TODO(janwas): remove when these functions are removed.
+#pragma push_macro("LoadFence")
+#pragma push_macro("StoreFence")
+#undef LoadFence
+#undef StoreFence
namespace hwy {
// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
@@ -81,6 +91,17 @@ HWY_INLINE HWY_ATTR_CACHE void FlushCach
+// Reduces power consumption in spin-loops. No effect on non-x86.
+ _mm_pause();
} // namespace hwy
+// TODO(janwas): remove when these functions are removed. (See above.)
+#pragma pop_macro("StoreFence")
+#pragma pop_macro("LoadFence")
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 2021-06-02 10:56:05.195904190 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc 2021-05-31 10:37:11.000000000 -0400
@@ -19,7 +19,6 @@
#include <stddef.h>
#include <stdio.h>
-#include <cmath>
#include <memory>
#include <numeric> // iota
@@ -37,15 +36,15 @@ using hwy::HWY_NAMESPACE::CombineShiftRi
class TwoArray {
- // Passed to ctor as a value NOT known to the compiler. Must be a multiple of
- // the vector lane count * 8.
+ // Must be a multiple of the vector lane count * 8.
static size_t NumItems() { return 3456; }
- explicit TwoArray(const size_t num_items)
- : a_(AllocateAligned<float>(num_items * 2)), b_(a_.get() + num_items) {
- const float init = num_items / NumItems(); // 1, but compiler doesn't know
- std::iota(a_.get(), a_.get() + num_items, init);
- std::iota(b_, b_ + num_items, init);
+ TwoArray()
+ : a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) {
+ // = 1, but compiler doesn't know
+ const float init = static_cast<float>(Unpredictable1());
+ std::iota(a_.get(), a_.get() + NumItems(), init);
+ std::iota(b_, b_ + NumItems(), init);
@@ -62,7 +61,7 @@ void RunBenchmark(const char* caption) {
const FuncInput inputs[kNumInputs] = {num_items};
Result results[kNumInputs];
- Benchmark benchmark(num_items);
+ Benchmark benchmark;
Params p;
p.verbose = false;
@@ -101,7 +100,7 @@ void Intro() {
// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
class BenchmarkDot : public TwoArray {
- explicit BenchmarkDot(size_t num_items) : TwoArray(num_items), dot_{-1.0f} {}
+ BenchmarkDot() : dot_{-1.0f} {}
FuncOutput operator()(const size_t num_items) {
HWY_FULL(float) d;
@@ -132,7 +131,8 @@ class BenchmarkDot : public TwoArray {
sum[i] += sum[i + power];
- return dot_ = GetLane(SumOfLanes(sum[0]));
+ dot_ = GetLane(SumOfLanes(sum[0]));
+ return static_cast<FuncOutput>(dot_);
void Verify(size_t num_items) {
if (dot_ == -1.0f) {
@@ -157,8 +157,6 @@ class BenchmarkDot : public TwoArray {
// INTERMEDIATE: delta coding
// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
struct BenchmarkDelta : public TwoArray {
- explicit BenchmarkDelta(size_t num_items) : TwoArray(num_items) {}
FuncOutput operator()(const size_t num_items) const {
b_[0] = a_[0];
@@ -197,7 +195,7 @@ struct BenchmarkDelta : public TwoArray
Store(a - shifted, df, &b_[i]);
- return b_[num_items - 1];
+ return static_cast<FuncOutput>(b_[num_items - 1]);
void Verify(size_t num_items) {
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 2021-06-02 10:56:05.189904159 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc 2021-05-31 10:37:11.000000000 -0400
@@ -22,27 +22,62 @@
// For runtime dispatch, specify the name of the current file (unfortunately
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
-// Re-include this file once per enabled target to generate code for it.
+// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h"
-#include "hwy/examples/skeleton_shared.h"
#include "hwy/highway.h"
-// Optional: factor out parts of the implementation into *-inl.h
-#include "hwy/examples/skeleton-inl.h"
// Optional, can instead add HWY_ATTR to all functions.
namespace skeleton {
namespace HWY_NAMESPACE {
-// Compiled once per target via multiple inclusion.
-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2,
- float* HWY_RESTRICT out) {
- printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET),
- ExampleGatherStrategy());
+// Highway ops reside here; ADL does not find templates nor builtins.
+using namespace hwy::HWY_NAMESPACE;
+// Computes log2 by converting to a vector of floats. Compiled once per target.
+template <class DF>
+HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
+ uint8_t* HWY_RESTRICT log2) {
+ // Type tags for converting to other element types (Rebind = same count).
+ const Rebind<int32_t, DF> d32;
+ const Rebind<uint8_t, DF> d8;
+ const auto u8 = Load(d8, values);
+ const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
+ const auto exponent = ShiftRight<23>(bits) - Set(d32, 127);
+ Store(DemoteTo(d8, exponent), d8, log2);
+HWY_NOINLINE void CodepathDemo() {
+ // Highway defaults to portability, but per-target codepaths may be selected
+ // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
+ const char* gather = "Has int64";
+ const char* gather = "No int64";
+ printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather);
- ExampleMulAdd(in1, in2, out);
+HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
+ uint8_t* HWY_RESTRICT log2) {
+ CodepathDemo();
+ // Second argument is necessary on RVV until it supports fractional lengths.
+ HWY_FULL(float, 4) df;
+ const size_t N = Lanes(df);
+ size_t i = 0;
+ for (; i + N <= count; i += N) {
+ OneFloorLog2(df, values + i, log2 + i);
+ }
+ // TODO(janwas): implement
+ for (; i < count; ++i) {
+ OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i);
+ }
// NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -54,22 +89,20 @@ HWY_AFTER_NAMESPACE();
namespace skeleton {
-// This macro declares a static array SkeletonHighwayDispatchTable used for
-// dynamic dispatch. This macro should be placed in the same namespace that
-// defines the Skeleton function above.
+// This macro declares a static array used for dynamic dispatch; it resides in
+// the same outer namespace that contains FloorLog2.
// This function is optional and only needed in the case of exposing it in the
-// header file. Otherwise using HWY_DYNAMIC_DISPATCH(Skeleton) multiple times in
-// this module is equivalent to inlining this optional function..
-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2,
- float* HWY_RESTRICT out) {
- return HWY_DYNAMIC_DISPATCH(Skeleton)(in1, in2, out);
+// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
+// is equivalent to inlining this function.
+void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
+ uint8_t* HWY_RESTRICT out) {
+ return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
// Optional: anything to compile only once, e.g. non-SIMD implementations of
-// public functions provided by this module, can go inside #if HWY_ONCE
-// (after end_target-inl.h).
+// public functions provided by this module, can go inside #if HWY_ONCE.
} // namespace skeleton
#endif // HWY_ONCE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 2021-06-02 10:56:05.213904281 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h 2021-05-31 10:37:11.000000000 -0400
@@ -18,15 +18,17 @@
-// Tiny subset of Highway API: essentials for declaring an interface, without
-// any implementation details.
+#include <stddef.h>
+// Platform-specific definitions used for declaring an interface, independent of
+// the SIMD instruction set.
#include "hwy/base.h" // HWY_RESTRICT
namespace skeleton {
-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256.
-void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2,
- float* HWY_RESTRICT out);
+// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
+void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
+ uint8_t* HWY_RESTRICT out);
} // namespace skeleton
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 2021-06-02 10:56:05.164904033 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -29,41 +29,31 @@
// It is fine to #include normal or *-inl headers.
#include <stddef.h>
-#include "hwy/examples/skeleton_shared.h"
#include "hwy/highway.h"
namespace skeleton {
namespace HWY_NAMESPACE {
-using hwy::HWY_NAMESPACE::MulAdd;
+using namespace hwy::HWY_NAMESPACE;
-// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256.
-HWY_MAYBE_UNUSED void ExampleMulAdd(const float* HWY_RESTRICT in1,
- const float* HWY_RESTRICT in2,
- float* HWY_RESTRICT out) {
- // Descriptor(s) for all vector types used in this function.
- HWY_FULL(float) df;
- const auto mul = Set(df, kMultiplier);
- for (size_t i = 0; i < 256; i += Lanes(df)) {
- const auto result = MulAdd(mul, Load(df, in1 + i), Load(df, in2 + i));
- Store(result, df, out + i);
+// Example of a type-agnostic (caller-specified lane type) and width-agnostic
+// (uses best available instruction set) function in a header.
+// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
+template <class D, typename T>
+HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
+ const T* HWY_RESTRICT add_array,
+ const size_t size, T* HWY_RESTRICT x_array) {
+ for (size_t i = 0; i < size; i += Lanes(d)) {
+ const auto mul = Load(d, mul_array + i);
+ const auto add = Load(d, add_array + i);
+ auto x = Load(d, x_array + i);
+ x = MulAdd(mul, x, add);
+ Store(x, d, x_array + i);
-// (This doesn't generate SIMD instructions, so is not required here)
-HWY_MAYBE_UNUSED const char* ExampleGatherStrategy() {
- // Highway functions generate per-target implementations from the same source
- // code via HWY_CAPPED(type, HWY_MIN(any_LANES_constants, ..)). If needed,
- // entirely different codepaths can also be selected like so:
- return "Has gather";
- return "Gather is limited to one lane";
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace skeleton
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 2021-06-02 10:56:05.170904063 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc 2021-05-31 10:37:11.000000000 -0400
@@ -12,30 +12,96 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Example of unit test for the "skeleton" module.
+// Example of unit test for the "skeleton" library.
-#include "hwy/examples/skeleton.h" // Skeleton
+#include "hwy/examples/skeleton.h"
#include <stdio.h>
-#include "hwy/tests/test_util-inl.h" // RunTest
+#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+// Optional: factor out parts of the implementation into *-inl.h
+#include "hwy/examples/skeleton-inl.h"
namespace skeleton {
+namespace HWY_NAMESPACE {
+using namespace hwy::HWY_NAMESPACE;
+// Calls function defined in skeleton.cc.
+struct TestFloorLog2 {
+ template <class T, class DF>
+ HWY_NOINLINE void operator()(T /*unused*/, DF df) {
+ const size_t count = 5 * Lanes(df);
+ auto in = hwy::AllocateAligned<uint8_t>(count);
+ auto expected = hwy::AllocateAligned<uint8_t>(count);
+ hwy::RandomState rng;
+ for (size_t i = 0; i < count; ++i) {
+ expected[i] = Random32(&rng) & 7;
+ in[i] = static_cast<uint8_t>(1u << expected[i]);
+ }
+ auto out = hwy::AllocateAligned<uint8_t>(count);
+ CallFloorLog2(in.get(), count, out.get());
+ int sum = 0;
+ for (size_t i = 0; i < count; ++i) {
+ // TODO(janwas): implement
+ HWY_ASSERT_EQ(expected[i], out[i]);
+ sum += out[i];
+ }
+ hwy::PreventElision(sum);
+ }
+HWY_NOINLINE void TestAllFloorLog2() {
+ ForPartialVectors<TestFloorLog2>()(float());
+// Calls function defined in skeleton-inl.h.
+struct TestSumMulAdd {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ hwy::RandomState rng;
+ const size_t count = 4096;
+ EXPECT_TRUE(count % Lanes(d) == 0);
+ auto mul = hwy::AllocateAligned<T>(count);
+ auto x = hwy::AllocateAligned<T>(count);
+ auto add = hwy::AllocateAligned<T>(count);
+ for (size_t i = 0; i < count; ++i) {
+ mul[i] = static_cast<T>(Random32(&rng) & 0xF);
+ x[i] = static_cast<T>(Random32(&rng) & 0xFF);
+ add[i] = static_cast<T>(Random32(&rng) & 0xFF);
+ }
+ double expected_sum = 0.0;
+ for (size_t i = 0; i < count; ++i) {
+ expected_sum += mul[i] * x[i] + add[i];
+ }
-TEST(SkeletonTest, MainTest) {
- HWY_ALIGN_MAX float in1[256];
- HWY_ALIGN_MAX float in2[256];
- HWY_ALIGN_MAX float out[256];
- for (size_t i = 0; i < 256; ++i) {
- in1[i] = static_cast<float>(i);
- in2[i] = in1[i] + 300;
+ MulAddLoop(d, mul.get(), add.get(), count, x.get());
+ HWY_ASSERT_EQ(4344240.0, expected_sum);
- // Tests will run for all compiled targets to ensure all are OK.
- hwy::RunTest([&in1, &in2, &out]() {
- Skeleton(in1, in2, out);
- // Add EXPECT_... calls here.
- });
+HWY_NOINLINE void TestAllSumMulAdd() {
+ ForFloatTypes(ForPartialVectors<TestSumMulAdd>());
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace skeleton
+namespace skeleton {
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
} // namespace skeleton
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 2021-06-02 10:56:05.269904564 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h 2021-05-31 10:37:11.000000000 -0400
@@ -25,10 +25,10 @@
namespace hwy {
-// API version (https://semver.org/)
+// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 0
-#define HWY_MINOR 11
-#define HWY_PATCH 1
+#define HWY_MINOR 12
+#define HWY_PATCH 2
// Shorthand for descriptors (defined in shared-inl.h) used to select overloads.
@@ -49,7 +49,7 @@ namespace hwy {
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
-// Vector of up to MAX_N lanes.
+// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead.
#define HWY_CAPPED(T, MAX_N) \
@@ -75,6 +75,10 @@ namespace hwy {
@@ -143,6 +147,18 @@ FunctionCache<RetType, Args...> Function
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
+#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
+#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
@@ -261,8 +277,11 @@ FunctionCache<RetType, Args...> Function
#include "hwy/ops/x86_512-inl.h"
+#error "PPC is not yet supported"
#include "hwy/ops/arm_neon-inl.h"
+#include "hwy/ops/arm_sve-inl.h"
#include "hwy/ops/wasm_128-inl.h"
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 2021-06-02 10:56:05.276904599 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc 2021-05-31 10:37:11.000000000 -0400
@@ -29,128 +29,43 @@
#include <string>
#include <vector>
+#if defined(_WIN32) || defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif // NOMINMAX
+#include <windows.h>
+#if defined(__MACH__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#if defined(__HAIKU__)
+#include <OS.h>
#include "hwy/base.h"
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
#elif HWY_ARCH_X86
-#ifdef _MSC_VER
#include <intrin.h>
#include <cpuid.h> // NOLINT
-#endif // _MSC_VER
#endif // HWY_ARCH_X86
namespace hwy {
-namespace platform {
-namespace {
-#if HWY_ARCH_X86
-void Cpuid(const uint32_t level, const uint32_t count,
- uint32_t* HWY_RESTRICT abcd) {
- int regs[4];
- __cpuidex(regs, level, count);
- for (int i = 0; i < 4; ++i) {
- abcd[i] = regs[i];
- }
- uint32_t a;
- uint32_t b;
- uint32_t c;
- uint32_t d;
- __cpuid_count(level, count, a, b, c, d);
- abcd[0] = a;
- abcd[1] = b;
- abcd[2] = c;
- abcd[3] = d;
-std::string BrandString() {
- char brand_string[49];
- std::array<uint32_t, 4> abcd;
- // Check if brand string is supported (it is on all reasonable Intel/AMD)
- Cpuid(0x80000000U, 0, abcd.data());
- if (abcd[0] < 0x80000004U) {
- return std::string();
- }
- for (size_t i = 0; i < 3; ++i) {
- Cpuid(0x80000002U + i, 0, abcd.data());
- memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
- }
- brand_string[48] = 0;
- return brand_string;
-// Returns the frequency quoted inside the brand string. This does not
-// account for throttling nor Turbo Boost.
-double NominalClockRate() {
- const std::string& brand_string = BrandString();
- // Brand strings include the maximum configured frequency. These prefixes are
- // defined by Intel CPUID documentation.
- const char* prefixes[3] = {"MHz", "GHz", "THz"};
- const double multipliers[3] = {1E6, 1E9, 1E12};
- for (size_t i = 0; i < 3; ++i) {
- const size_t pos_prefix = brand_string.find(prefixes[i]);
- if (pos_prefix != std::string::npos) {
- const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
- if (pos_space != std::string::npos) {
- const std::string digits =
- brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
- return std::stod(digits) * multipliers[i];
- }
- }
- }
- return 0.0;
-#endif // HWY_ARCH_X86
-} // namespace
-// Returns tick rate. Invariant means the tick counter frequency is independent
-// of CPU throttling or sleep. May be expensive, caller should cache the result.
-double InvariantTicksPerSecond() {
- return __ppc_get_timebase_freq();
-#elif HWY_ARCH_X86
- // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
- return NominalClockRate();
- // Fall back to clock_gettime nanoseconds.
- return 1E9;
-} // namespace platform
namespace {
-// Prevents the compiler from eliding the computations that led to "output".
-template <class T>
-inline void PreventElision(T&& output) {
- // Works by indicating to the compiler that "output" is being read and
- // modified. The +r constraint avoids unnecessary writes to memory, but only
- // works for built-in types (typically FuncOutput).
- asm volatile("" : "+r"(output) : : "memory");
- // MSVC does not support inline assembly anymore (and never supported GCC's
- // RTL constraints). Self-assignment with #pragma optimize("off") might be
- // expected to prevent elision, but it does not with MSVC 2015. Type-punning
- // with volatile pointers generates inefficient code on MSVC 2017.
- static std::atomic<T> dummy(T{});
- dummy.store(output, std::memory_order_relaxed);
namespace timer {
+// Ticks := platform-specific timer values (CPU cycles on x86). Must be
+// unsigned to guarantee wraparound on overflow.
+using Ticks = uint64_t;
// Start/Stop return absolute timestamps and must be placed immediately before
// and after the region to measure. We provide separate Start/Stop functions
// because they use different fences.
@@ -202,8 +117,8 @@ namespace timer {
// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
// divide by InvariantTicksPerSecond.
-inline uint64_t Start64() {
- uint64_t t;
+inline Ticks Start() {
+ Ticks t;
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
@@ -228,8 +143,15 @@ inline uint64_t Start64() {
: "rdx", "memory", "cc");
asm volatile("rdcycle %0" : "=r"(t));
- // Fall back to OS - unsure how to reliably query cntvct_el0 frequency.
+#elif defined(_WIN32) || defined(_WIN64)
+ LARGE_INTEGER counter;
+ (void)QueryPerformanceCounter(&counter);
+ t = counter.QuadPart;
+#elif defined(__MACH__)
+ t = mach_absolute_time();
+#elif defined(__HAIKU__)
+ t = system_time_nsecs(); // since boot
+#else // POSIX
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
@@ -237,7 +159,7 @@ inline uint64_t Start64() {
return t;
-inline uint64_t Stop64() {
+inline Ticks Stop() {
uint64_t t;
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
@@ -261,61 +183,7 @@ inline uint64_t Stop64() {
// "cc" = flags modified by SHL.
: "rcx", "rdx", "memory", "cc");
- t = Start64();
- return t;
-// Returns a 32-bit timestamp with about 4 cycles less overhead than
-// Start64. Only suitable for measuring very short regions because the
-// timestamp overflows about once a second.
-inline uint32_t Start32() {
- uint32_t t;
- _ReadWriteBarrier();
- _mm_lfence();
- _ReadWriteBarrier();
- t = static_cast<uint32_t>(__rdtsc());
- _ReadWriteBarrier();
- _mm_lfence();
- _ReadWriteBarrier();
-#elif HWY_ARCH_X86_64
- asm volatile(
- "lfence\n\t"
- "rdtsc\n\t"
- "lfence"
- : "=a"(t)
- :
- // "memory" avoids reordering. rdx = TSC >> 32.
- : "rdx", "memory");
- asm volatile("rdcycle %0" : "=r"(t));
- t = static_cast<uint32_t>(Start64());
- return t;
-inline uint32_t Stop32() {
- uint32_t t;
- _ReadWriteBarrier();
- unsigned aux;
- t = static_cast<uint32_t>(__rdtscp(&aux));
- _ReadWriteBarrier();
- _mm_lfence();
- _ReadWriteBarrier();
-#elif HWY_ARCH_X86_64
- // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
- asm volatile(
- "rdtscp\n\t"
- "lfence"
- : "=a"(t)
- :
- // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
- : "rcx", "rdx", "memory");
- t = static_cast<uint32_t>(Stop64());
+ t = Start();
return t;
@@ -440,21 +308,130 @@ T MedianAbsoluteDeviation(const T* value
} // namespace robust_statistics
+} // namespace
+namespace platform {
+namespace {
-// Ticks := platform-specific timer values (CPU cycles on x86). Must be
-// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to
-// read than 64 bit.
-using Ticks = uint32_t;
+// Prevents the compiler from eliding the computations that led to "output".
+template <class T>
+inline void PreventElision(T&& output) {
+ // Works by indicating to the compiler that "output" is being read and
+ // modified. The +r constraint avoids unnecessary writes to memory, but only
+ // works for built-in types (typically FuncOutput).
+ asm volatile("" : "+r"(output) : : "memory");
+ // MSVC does not support inline assembly anymore (and never supported GCC's
+ // RTL constraints). Self-assignment with #pragma optimize("off") might be
+ // expected to prevent elision, but it does not with MSVC 2015. Type-punning
+ // with volatile pointers generates inefficient code on MSVC 2017.
+ static std::atomic<T> dummy(T{});
+ dummy.store(output, std::memory_order_relaxed);
+#if HWY_ARCH_X86
+void Cpuid(const uint32_t level, const uint32_t count,
+ uint32_t* HWY_RESTRICT abcd) {
+ int regs[4];
+ __cpuidex(regs, level, count);
+ for (int i = 0; i < 4; ++i) {
+ abcd[i] = regs[i];
+ }
+ uint32_t a;
+ uint32_t b;
+ uint32_t c;
+ uint32_t d;
+ __cpuid_count(level, count, a, b, c, d);
+ abcd[0] = a;
+ abcd[1] = b;
+ abcd[2] = c;
+ abcd[3] = d;
+std::string BrandString() {
+ char brand_string[49];
+ std::array<uint32_t, 4> abcd;
+ // Check if brand string is supported (it is on all reasonable Intel/AMD)
+ Cpuid(0x80000000U, 0, abcd.data());
+ if (abcd[0] < 0x80000004U) {
+ return std::string();
+ }
+ for (size_t i = 0; i < 3; ++i) {
+ Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd.data());
+ memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
+ }
+ brand_string[48] = 0;
+ return brand_string;
+// Returns the frequency quoted inside the brand string. This does not
+// account for throttling nor Turbo Boost.
+double NominalClockRate() {
+ const std::string& brand_string = BrandString();
+ // Brand strings include the maximum configured frequency. These prefixes are
+ // defined by Intel CPUID documentation.
+ const char* prefixes[3] = {"MHz", "GHz", "THz"};
+ const double multipliers[3] = {1E6, 1E9, 1E12};
+ for (size_t i = 0; i < 3; ++i) {
+ const size_t pos_prefix = brand_string.find(prefixes[i]);
+ if (pos_prefix != std::string::npos) {
+ const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
+ if (pos_space != std::string::npos) {
+ const std::string digits =
+ brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
+ return std::stod(digits) * multipliers[i];
+ }
+ }
+ }
+ return 0.0;
+#endif // HWY_ARCH_X86
+} // namespace
+double InvariantTicksPerSecond() {
+ return __ppc_get_timebase_freq();
+#elif HWY_ARCH_X86
+ // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
+ return NominalClockRate();
+#elif defined(_WIN32) || defined(_WIN64)
+ (void)QueryPerformanceFrequency(&freq);
+ return double(freq.QuadPart);
+#elif defined(__MACH__)
+ //
+ mach_timebase_info_data_t timebase;
+ (void)mach_timebase_info(&timebase);
+ return double(timebase.denom) / timebase.numer * 1E9;
+ // TODO(janwas): ARM? Unclear how to reliably query cntvct_el0 frequency.
+ return 1E9; // Haiku and clock_gettime return nanoseconds.
-// Returns timer overhead / minimum measurable difference.
-Ticks TimerResolution() {
+double Now() {
+ static const double mul = 1.0 / InvariantTicksPerSecond();
+ return static_cast<double>(timer::Start()) * mul;
+uint64_t TimerResolution() {
// Nested loop avoids exceeding stack/L1 capacity.
- Ticks repetitions[Params::kTimerSamples];
+ timer::Ticks repetitions[Params::kTimerSamples];
for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
- Ticks samples[Params::kTimerSamples];
+ timer::Ticks samples[Params::kTimerSamples];
for (size_t i = 0; i < Params::kTimerSamples; ++i) {
- const Ticks t0 = timer::Start32();
- const Ticks t1 = timer::Stop32();
+ const timer::Ticks t0 = timer::Start();
+ const timer::Ticks t1 = timer::Stop();
samples[i] = t1 - t0;
repetitions[rep] = robust_statistics::Mode(samples);
@@ -462,18 +439,21 @@ Ticks TimerResolution() {
return robust_statistics::Mode(repetitions);
-static const Ticks timer_resolution = TimerResolution();
+} // namespace platform
+namespace {
+static const timer::Ticks timer_resolution = platform::TimerResolution();
// Estimates the expected value of "lambda" values with a variable number of
// samples until the variability "rel_mad" is less than "max_rel_mad".
template <class Lambda>
-Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
- const Params& p, const Lambda& lambda) {
+timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
+ const Params& p, const Lambda& lambda) {
// Choose initial samples_per_eval based on a single estimated duration.
- Ticks t0 = timer::Start32();
+ timer::Ticks t0 = timer::Start();
- Ticks t1 = timer::Stop32();
- Ticks est = t1 - t0;
+ timer::Ticks t1 = timer::Stop();
+ timer::Ticks est = t1 - t0;
static const double ticks_per_second = platform::InvariantTicksPerSecond();
const size_t ticks_per_eval =
static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
@@ -481,21 +461,21 @@ Ticks SampleUntilStable(const double max
est == 0 ? p.min_samples_per_eval : ticks_per_eval / est;
samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval);
- std::vector<Ticks> samples;
+ std::vector<timer::Ticks> samples;
samples.reserve(1 + samples_per_eval);
// Percentage is too strict for tiny differences, so also allow a small
// absolute "median absolute deviation".
- const Ticks max_abs_mad = (timer_resolution + 99) / 100;
+ const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100;
*rel_mad = 0.0; // ensure initialized
for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
samples.reserve(samples.size() + samples_per_eval);
for (size_t i = 0; i < samples_per_eval; ++i) {
- t0 = timer::Start32();
+ t0 = timer::Start();
- t1 = timer::Stop32();
+ t1 = timer::Stop();
samples.push_back(t1 - t0);
@@ -508,14 +488,14 @@ Ticks SampleUntilStable(const double max
// Median absolute deviation (mad) is a robust measure of 'variability'.
- const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
+ const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
samples.data(), samples.size(), est);
- *rel_mad = static_cast<double>(int(abs_mad)) / est;
+ *rel_mad = static_cast<double>(abs_mad) / static_cast<double>(est);
if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
if (p.verbose) {
- printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
- samples.size(), est, abs_mad, *rel_mad * 100.0);
+ printf("%6zu samples => %5zu (abs_mad=%4zu, rel_mad=%4.2f%%)\n",
+ samples.size(), size_t(est), size_t(abs_mad), *rel_mad * 100.0);
return est;
@@ -539,29 +519,17 @@ InputVec UniqueInputs(const FuncInput* i
return unique;
-// Returns how often we need to call func for sufficient precision, or zero
-// on failure (e.g. the elapsed time is too long for a 32-bit tick count).
+// Returns how often we need to call func for sufficient precision.
size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
const Params& p) {
// Min elapsed ticks for any input.
- Ticks min_duration = ~0u;
+ timer::Ticks min_duration = ~timer::Ticks(0);
for (const FuncInput input : unique) {
- // Make sure a 32-bit timer is sufficient.
- const uint64_t t0 = timer::Start64();
- PreventElision(func(arg, input));
- const uint64_t t1 = timer::Stop64();
- const uint64_t elapsed = t1 - t0;
- if (elapsed >= (1ULL << 30)) {
- fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n",
- input);
- return 0;
- }
double rel_mad;
- const Ticks total = SampleUntilStable(
+ const timer::Ticks total = SampleUntilStable(
p.target_rel_mad, &rel_mad, p,
- [func, arg, input]() { PreventElision(func(arg, input)); });
+ [func, arg, input]() { platform::PreventElision(func(arg, input)); });
min_duration = std::min(min_duration, total - timer_resolution);
@@ -571,8 +539,8 @@ size_t NumSkip(const Func func, const ui
const size_t num_skip =
min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
if (p.verbose) {
- printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution,
- max_skip, min_duration, num_skip);
+ printf("res=%zu max_skip=%zu min_dur=%zu num_skip=%zu\n",
+ size_t(timer_resolution), max_skip, size_t(min_duration), num_skip);
return num_skip;
@@ -637,13 +605,14 @@ void FillSubset(const InputVec& full, co
// Returns total ticks elapsed for all inputs.
-Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs,
- const Params& p, double* max_rel_mad) {
+timer::Ticks TotalDuration(const Func func, const uint8_t* arg,
+ const InputVec* inputs, const Params& p,
+ double* max_rel_mad) {
double rel_mad;
- const Ticks duration =
+ const timer::Ticks duration =
SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
for (const FuncInput input : *inputs) {
- PreventElision(func(arg, input));
+ platform::PreventElision(func(arg, input));
*max_rel_mad = std::max(*max_rel_mad, rel_mad);
@@ -657,19 +626,20 @@ HWY_NOINLINE FuncOutput EmptyFunc(const
// Returns overhead of accessing inputs[] and calling a function; this will
// be deducted from future TotalDuration return values.
-Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) {
+timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,
+ const Params& p) {
double rel_mad;
// Zero tolerance because repeatability is crucial and EmptyFunc is fast.
return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
for (const FuncInput input : *inputs) {
- PreventElision(EmptyFunc(arg, input));
+ platform::PreventElision(EmptyFunc(arg, input));
} // namespace
-int Unpredictable1() { return timer::Start64() != ~0ULL; }
+int Unpredictable1() { return timer::Start() != ~0ULL; }
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
const size_t num_inputs, Result* results, const Params& p) {
@@ -685,32 +655,35 @@ size_t Measure(const Func func, const ui
ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
InputVec subset(full.size() - num_skip);
- const Ticks overhead = Overhead(arg, &full, p);
- const Ticks overhead_skip = Overhead(arg, &subset, p);
+ const timer::Ticks overhead = Overhead(arg, &full, p);
+ const timer::Ticks overhead_skip = Overhead(arg, &subset, p);
if (overhead < overhead_skip) {
- fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead,
- overhead_skip);
+ fprintf(stderr, "Measurement failed: overhead %zu < %zu\n",
+ size_t(overhead), size_t(overhead_skip));
return 0;
if (p.verbose) {
- printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(),
- overhead, overhead_skip);
+ printf("#inputs=%5zu,%5zu overhead=%5zu,%5zu\n", full.size(), subset.size(),
+ size_t(overhead), size_t(overhead_skip));
double max_rel_mad = 0.0;
- const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
+ const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
for (size_t i = 0; i < unique.size(); ++i) {
FillSubset(full, unique[i], num_skip, &subset);
- const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad);
+ const timer::Ticks total_skip =
+ TotalDuration(func, arg, &subset, p, &max_rel_mad);
if (total < total_skip) {
- fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip);
+ fprintf(stderr, "Measurement failed: total %zu < %zu\n", size_t(total),
+ size_t(total_skip));
return 0;
- const Ticks duration = (total - overhead) - (total_skip - overhead_skip);
+ const timer::Ticks duration =
+ (total - overhead) - (total_skip - overhead_skip);
results[i].input = unique[i];
results[i].ticks = static_cast<float>(duration) * mul;
results[i].variability = static_cast<float>(max_rel_mad);
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 2021-06-02 10:56:05.272904579 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h 2021-05-31 10:37:11.000000000 -0400
@@ -44,11 +44,6 @@
// central tendency of the measurement samples with the "half sample mode",
// which is more robust to outliers and skewed data than the mean or median.
-// WARNING if included from multiple translation units compiled with distinct
-// flags: this header requires textual inclusion and a predefined NB_NAMESPACE
-// macro that is unique to the current compile flags. We must also avoid
-// standard library headers such as vector and functional that define functions.
#include <stddef.h>
#include <stdint.h>
@@ -79,6 +74,16 @@ namespace platform {
// This call may be expensive, callers should cache the result.
double InvariantTicksPerSecond();
+// Returns current timestamp [in seconds] relative to an unspecified origin.
+// Features: monotonic (no negative elapsed time), steady (unaffected by system
+// time changes), high-resolution (on the order of microseconds).
+double Now();
+// Returns ticks elapsed in back to back timer calls, i.e. a function of the
+// timer resolution (minimum measurable difference) and overhead.
+// This call is expensive, callers should cache the result.
+uint64_t TimerResolution();
} // namespace platform
// Returns 1, but without the compiler knowing what the value is. This prevents
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc
--- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 2021-06-02 10:56:05.275904594 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc 2021-05-31 10:37:11.000000000 -0400
@@ -15,11 +15,11 @@
#include "hwy/nanobenchmark.h"
#include <stdio.h>
-#include <stdlib.h> // strtol
-#include <unistd.h> // sleep
#include <random>
+#include "hwy/tests/test_util-inl.h"
namespace hwy {
namespace {
@@ -31,6 +31,7 @@ FuncOutput Div(const void*, FuncInput in
template <size_t N>
void MeasureDiv(const FuncInput (&inputs)[N]) {
+ printf("Measuring integer division (output on final two lines)\n");
Result results[N];
Params params;
params.max_evals = 4; // avoid test timeout
@@ -66,39 +67,14 @@ void MeasureRandom(const FuncInput (&inp
-template <size_t N>
-void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) {
- printf("Expect a 'measurement failed' below:\n");
- Result results[N];
- const size_t num_results = Measure(
- [](const void*, const FuncInput input) -> FuncOutput {
- // Loop until the sleep succeeds (not interrupted by signal). We assume
- // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit.
- while (sleep(2) != 0) {
- }
- return input;
- },
- nullptr, inputs, N, results);
- NANOBENCHMARK_CHECK(num_results == 0);
- (void)num_results;
-void RunAll(const int argc, char** /*argv*/) {
- // unpredictable == 1 but the compiler doesn't know that.
- const int unpredictable = argc != 999;
+TEST(NanobenchmarkTest, RunAll) {
+ const int unpredictable = Unpredictable1(); // == 1, unknown to compiler.
static const FuncInput inputs[] = {static_cast<FuncInput>(unpredictable) + 2,
static_cast<FuncInput>(unpredictable + 9)};
- EnsureLongMeasurementFails(inputs);
} // namespace
} // namespace hwy
-int main(int argc, char* argv[]) {
- hwy::RunAll(argc, argv);
- return 0;
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 2021-06-02 10:56:05.239904412 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -26,6 +26,8 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
+namespace detail { // for code folding and Raw128
// Macros used to define single and double function calls for multiple types
// for full and half vectors. These macros are undefined at the end of the file.
@@ -133,7 +135,7 @@ namespace HWY_NAMESPACE {
HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args)
// float and double
-#if defined(__aarch64__)
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \
HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \
@@ -181,7 +183,7 @@ namespace HWY_NAMESPACE {
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
// Emulation of some intrinsics on armv7.
-#if !defined(__aarch64__)
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
@@ -294,7 +296,7 @@ struct Raw128<float, 4> {
using type = float32x4_t;
-#if defined(__aarch64__)
template <>
struct Raw128<double, 2> {
using type = float64x2_t;
@@ -352,7 +354,7 @@ struct Raw128<float, 2> {
using type = float32x2_t;
-#if defined(__aarch64__)
template <>
struct Raw128<double, 1> {
using type = float64x1_t;
@@ -437,12 +439,14 @@ struct Raw128<int8_t, 1> {
using type = int8x8_t;
+} // namespace detail
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T)>;
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
- using Raw = typename Raw128<T, N>::type;
+ using Raw = typename detail::Raw128<T, N>::type;
HWY_INLINE Vec128() {}
@@ -480,7 +484,8 @@ class Vec128 {
// FF..FF or 0, also for floating-point - see README.
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
- using Raw = typename Raw128<T, N>::type;
+ // ARM C Language Extensions return and expect unsigned type.
+ using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;
HWY_INLINE Mask128() {}
@@ -573,7 +578,7 @@ HWY_INLINE Vec128<int64_t, 1> BitCastFro
Vec128<uint8_t, 1 * 8> v) {
return Vec128<int64_t, 1>(vreinterpret_s64_u8(v.raw));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double, 1> BitCastFromByte(Simd<double, 1> /* tag */,
Vec128<uint8_t, 1 * 8> v) {
return Vec128<double, 1>(vreinterpret_f64_u8(v.raw));
@@ -615,7 +620,7 @@ HWY_INLINE Vec128<int64_t> BitCastFromBy
return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double> BitCastFromByte(Full128<double> /* tag */,
Vec128<uint8_t> v) {
return Vec128<double>(vreinterpretq_f64_u8(v.raw));
@@ -664,15 +669,25 @@ template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Undefined(Simd<T, N> /*d*/) {
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
- typename Raw128<T, N>::type a;
+ typename detail::Raw128<T, N>::type a;
return Vec128<T, N>(a);
-// ------------------------------ Extract lane
+// Returns a vector with lane i=[0, N) set to "first" + i.
+template <typename T, size_t N, typename T2>
+Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
+ HWY_ALIGN T lanes[16 / sizeof(T)];
+ for (size_t i = 0; i < 16 / sizeof(T); ++i) {
+ lanes[i] = static_cast<T>(first + static_cast<T2>(i));
+ }
+ return Load(d, lanes);
+// ------------------------------ GetLane
HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, 16> v) {
- return vget_lane_u8(vget_low_u8(v.raw), 0);
+ return vgetq_lane_u8(v.raw, 0);
template <size_t N>
HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, N> v) {
@@ -680,7 +695,7 @@ HWY_INLINE uint8_t GetLane(const Vec128<
HWY_INLINE int8_t GetLane(const Vec128<int8_t, 16> v) {
- return vget_lane_s8(vget_low_s8(v.raw), 0);
+ return vgetq_lane_s8(v.raw, 0);
template <size_t N>
HWY_INLINE int8_t GetLane(const Vec128<int8_t, N> v) {
@@ -688,7 +703,7 @@ HWY_INLINE int8_t GetLane(const Vec128<i
HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, 8> v) {
- return vget_lane_u16(vget_low_u16(v.raw), 0);
+ return vgetq_lane_u16(v.raw, 0);
template <size_t N>
HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, N> v) {
@@ -696,7 +711,7 @@ HWY_INLINE uint16_t GetLane(const Vec128
HWY_INLINE int16_t GetLane(const Vec128<int16_t, 8> v) {
- return vget_lane_s16(vget_low_s16(v.raw), 0);
+ return vgetq_lane_s16(v.raw, 0);
template <size_t N>
HWY_INLINE int16_t GetLane(const Vec128<int16_t, N> v) {
@@ -704,7 +719,7 @@ HWY_INLINE int16_t GetLane(const Vec128<
HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, 4> v) {
- return vget_lane_u32(vget_low_u32(v.raw), 0);
+ return vgetq_lane_u32(v.raw, 0);
template <size_t N>
HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, N> v) {
@@ -712,7 +727,7 @@ HWY_INLINE uint32_t GetLane(const Vec128
HWY_INLINE int32_t GetLane(const Vec128<int32_t, 4> v) {
- return vget_lane_s32(vget_low_s32(v.raw), 0);
+ return vgetq_lane_s32(v.raw, 0);
template <size_t N>
HWY_INLINE int32_t GetLane(const Vec128<int32_t, N> v) {
@@ -720,20 +735,20 @@ HWY_INLINE int32_t GetLane(const Vec128<
HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 2> v) {
- return vget_lane_u64(vget_low_u64(v.raw), 0);
+ return vgetq_lane_u64(v.raw, 0);
HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 1> v) {
return vget_lane_u64(v.raw, 0);
HWY_INLINE int64_t GetLane(const Vec128<int64_t, 2> v) {
- return vget_lane_s64(vget_low_s64(v.raw), 0);
+ return vgetq_lane_s64(v.raw, 0);
HWY_INLINE int64_t GetLane(const Vec128<int64_t, 1> v) {
return vget_lane_s64(v.raw, 0);
HWY_INLINE float GetLane(const Vec128<float, 4> v) {
- return vget_lane_f32(vget_low_f32(v.raw), 0);
+ return vgetq_lane_f32(v.raw, 0);
HWY_INLINE float GetLane(const Vec128<float, 2> v) {
return vget_lane_f32(v.raw, 0);
@@ -741,9 +756,9 @@ HWY_INLINE float GetLane(const Vec128<fl
HWY_INLINE float GetLane(const Vec128<float, 1> v) {
return vget_lane_f32(v.raw, 0);
-#if defined(__aarch64__)
HWY_INLINE double GetLane(const Vec128<double, 2> v) {
- return vget_lane_f64(vget_low_f64(v.raw), 0);
+ return vgetq_lane_f64(v.raw, 0);
HWY_INLINE double GetLane(const Vec128<double, 1> v) {
return vget_lane_f64(v.raw, 0);
@@ -785,8 +800,6 @@ HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSu
// ------------------------------ Average
// Returns (a + b + 1) / 2
-// Unsigned
HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)
@@ -802,6 +815,7 @@ HWY_INLINE Vec128<int16_t> Abs(const Vec
HWY_INLINE Vec128<int32_t> Abs(const Vec128<int32_t> v) {
return Vec128<int32_t>(vabsq_s32(v.raw));
+// i64 is implemented after BroadcastSignBit.
HWY_INLINE Vec128<float> Abs(const Vec128<float> v) {
return Vec128<float>(vabsq_f32(v.raw));
@@ -823,7 +837,7 @@ HWY_INLINE Vec128<float, N> Abs(const Ve
return Vec128<float, N>(vabs_f32(v.raw));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double> Abs(const Vec128<double> v) {
return Vec128<double>(vabsq_f64(v.raw));
@@ -839,7 +853,7 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vn
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below
HWY_INLINE Vec128<int64_t, 1> Neg(const Vec128<int64_t, 1> v) {
-#if defined(__aarch64__)
return Vec128<int64_t, 1>(vneg_s64(v.raw));
return Zero(Simd<int64_t, 1>()) - v;
@@ -847,7 +861,7 @@ HWY_INLINE Vec128<int64_t, 1> Neg(const
HWY_INLINE Vec128<int64_t> Neg(const Vec128<int64_t> v) {
-#if defined(__aarch64__)
return Vec128<int64_t>(vnegq_s64(v.raw));
return Zero(Full128<int64_t>()) - v;
@@ -876,6 +890,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, v
// ------------------------------ Shl
+HWY_INLINE Vec128<uint8_t> operator<<(const Vec128<uint8_t> v,
+ const Vec128<uint8_t> bits) {
+ return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
+template <size_t N, HWY_IF_LE64(uint8_t, N)>
+HWY_INLINE Vec128<uint8_t, N> operator<<(const Vec128<uint8_t, N> v,
+ const Vec128<uint8_t, N> bits) {
+ return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
HWY_INLINE Vec128<uint16_t> operator<<(const Vec128<uint16_t> v,
const Vec128<uint16_t> bits) {
return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
@@ -905,6 +929,16 @@ HWY_INLINE Vec128<uint64_t, 1> operator<
return Vec128<uint64_t, 1>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
+HWY_INLINE Vec128<int8_t> operator<<(const Vec128<int8_t> v,
+ const Vec128<int8_t> bits) {
+ return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
+template <size_t N, HWY_IF_LE64(int8_t, N)>
+HWY_INLINE Vec128<int8_t, N> operator<<(const Vec128<int8_t, N> v,
+ const Vec128<int8_t, N> bits) {
+ return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
HWY_INLINE Vec128<int16_t> operator<<(const Vec128<int16_t> v,
const Vec128<int16_t> bits) {
return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
@@ -936,6 +970,18 @@ HWY_INLINE Vec128<int64_t, 1> operator<<
// ------------------------------ Shr (Neg)
+HWY_INLINE Vec128<uint8_t> operator>>(const Vec128<uint8_t> v,
+ const Vec128<uint8_t> bits) {
+ const int8x16_t neg_bits = Neg(BitCast(Full128<int8_t>(), bits)).raw;
+ return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
+template <size_t N, HWY_IF_LE64(uint8_t, N)>
+HWY_INLINE Vec128<uint8_t, N> operator>>(const Vec128<uint8_t, N> v,
+ const Vec128<uint8_t, N> bits) {
+ const int8x8_t neg_bits = Neg(BitCast(Simd<int8_t, N>(), bits)).raw;
+ return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
HWY_INLINE Vec128<uint16_t> operator>>(const Vec128<uint16_t> v,
const Vec128<uint16_t> bits) {
const int16x8_t neg_bits = Neg(BitCast(Full128<int16_t>(), bits)).raw;
@@ -971,6 +1017,16 @@ HWY_INLINE Vec128<uint64_t, 1> operator>
return Vec128<uint64_t, 1>(vshl_u64(v.raw, neg_bits));
+HWY_INLINE Vec128<int8_t> operator>>(const Vec128<int8_t> v,
+ const Vec128<int8_t> bits) {
+ return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
+template <size_t N, HWY_IF_LE64(int8_t, N)>
+HWY_INLINE Vec128<int8_t, N> operator>>(const Vec128<int8_t, N> v,
+ const Vec128<int8_t, N> bits) {
+ return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
HWY_INLINE Vec128<int16_t> operator>>(const Vec128<int16_t> v,
const Vec128<int16_t> bits) {
return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
@@ -1059,7 +1115,7 @@ HWY_INLINE Vec128<int32_t, N> operator*(
HWY_INLINE Vec128<int16_t> MulHigh(const Vec128<int16_t> a,
const Vec128<int16_t> b) {
int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
-#if defined(__aarch64__)
int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
@@ -1070,7 +1126,7 @@ HWY_INLINE Vec128<int16_t> MulHigh(const
HWY_INLINE Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a,
const Vec128<uint16_t> b) {
uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
-#if defined(__aarch64__)
uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
@@ -1139,24 +1195,37 @@ HWY_INLINE Vec128<float, N> ApproximateR
return Vec128<float, N>(vrecpe_f32(v.raw));
-#if defined(__aarch64__)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
-// Emulated with approx reciprocal + Newton-Raphson + mul
+// Not defined on armv7: approximate
+namespace detail {
+HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep(
+ const Vec128<float> recip, const Vec128<float> divisor) {
+ return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
+template <size_t N>
+HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep(
+ const Vec128<float, N> recip, Vec128<float, N> divisor) {
+ return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
+} // namespace detail
template <size_t N>
HWY_INLINE Vec128<float, N> operator/(const Vec128<float, N> a,
const Vec128<float, N> b) {
auto x = ApproximateReciprocal(b);
- // Newton-Raphson on 1/x - b
- const auto two = Set(Simd<float, N>(), 2);
- x = x * (two - b * x);
- x = x * (two - b * x);
- x = x * (two - b * x);
+ x *= detail::ReciprocalNewtonRaphsonStep(x, b);
+ x *= detail::ReciprocalNewtonRaphsonStep(x, b);
+ x *= detail::ReciprocalNewtonRaphsonStep(x, b);
return a * x;
-// Absolute value of difference.
+// ------------------------------ Absolute value of difference.
HWY_INLINE Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) {
return Vec128<float>(vabdq_f32(a.raw, b.raw));
@@ -1169,7 +1238,7 @@ HWY_INLINE Vec128<float, N> AbsDiff(cons
// ------------------------------ Floating-point multiply-add variants
// Returns add + mul * x
-#if defined(__aarch64__)
+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
@@ -1180,6 +1249,17 @@ HWY_INLINE Vec128<float> MulAdd(const Ve
const Vec128<float> add) {
return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
+// Emulate FMA for floats.
+template <size_t N>
+HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
+ const Vec128<float, N> x,
+ const Vec128<float, N> add) {
+ return mul * x + add;
HWY_INLINE Vec128<double, 1> MulAdd(const Vec128<double, 1> mul,
const Vec128<double, 1> x,
const Vec128<double, 1> add) {
@@ -1190,18 +1270,10 @@ HWY_INLINE Vec128<double> MulAdd(const V
const Vec128<double> add) {
return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
-// Emulate FMA for floats.
-template <size_t N>
-HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
- const Vec128<float, N> x,
- const Vec128<float, N> add) {
- return mul * x + add;
// Returns add - mul * x
-#if defined(__aarch64__)
+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
@@ -1213,7 +1285,17 @@ HWY_INLINE Vec128<float> NegMulAdd(const
const Vec128<float> add) {
return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
+// Emulate FMA for floats.
+template <size_t N>
+HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
+ const Vec128<float, N> x,
+ const Vec128<float, N> add) {
+ return add - mul * x;
HWY_INLINE Vec128<double, 1> NegMulAdd(const Vec128<double, 1> mul,
const Vec128<double, 1> x,
const Vec128<double, 1> add) {
@@ -1224,14 +1306,6 @@ HWY_INLINE Vec128<double> NegMulAdd(cons
const Vec128<double> add) {
return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
-// Emulate FMA for floats.
-template <size_t N>
-HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
- const Vec128<float, N> x,
- const Vec128<float, N> add) {
- return add - mul * x;
// Returns mul * x - sub
@@ -1241,12 +1315,6 @@ HWY_INLINE Vec128<float, N> MulSub(const
const Vec128<float, N> sub) {
return MulAdd(mul, x, Neg(sub));
-template <size_t N>
-HWY_INLINE Vec128<double, N> MulSub(const Vec128<double, N> mul,
- const Vec128<double, N> x,
- const Vec128<double, N> sub) {
- return MulAdd(mul, x, Neg(sub));
// Returns -mul * x - sub
template <size_t N>
@@ -1255,14 +1323,23 @@ HWY_INLINE Vec128<float, N> NegMulSub(co
const Vec128<float, N> sub) {
return Neg(MulAdd(mul, x, sub));
+template <size_t N>
+HWY_INLINE Vec128<double, N> MulSub(const Vec128<double, N> mul,
+ const Vec128<double, N> x,
+ const Vec128<double, N> sub) {
+ return MulAdd(mul, x, Neg(sub));
template <size_t N>
HWY_INLINE Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
const Vec128<double, N> x,
const Vec128<double, N> sub) {
return Neg(MulAdd(mul, x, sub));
-// ------------------------------ Floating-point square root
+// ------------------------------ Floating-point square root (IfThenZeroElse)
// Approximate reciprocal square root
HWY_INLINE Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) {
@@ -1275,80 +1352,36 @@ HWY_INLINE Vec128<float, N> ApproximateR
// Full precision square root
-#if defined(__aarch64__)
-// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt.
-template <size_t N>
-HWY_INLINE Vec128<float, N> Sqrt(const Vec128<float, N> v) {
- auto b = v;
- auto Y = ApproximateReciprocalSqrt(v);
- auto x = v * Y;
- const auto half = Set(Simd<float, N>(), 0.5);
- const auto oneandhalf = Set(Simd<float, N>(), 1.5);
- for (size_t i = 0; i < 3; i++) {
- b = b * Y * Y;
- Y = oneandhalf - half * b;
- x = x * Y;
- }
- return IfThenZeroElse(v == Zero(Simd<float, N>()), x);
-// ================================================== COMPARE
-// Comparisons fill a lane with 1-bits if the condition is true, else 0.
+namespace detail {
-template <typename TFrom, typename TTo, size_t N>
-HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
- static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
- return Mask128<TTo, N>{m.raw};
+HWY_INLINE Vec128<float> ReciprocalSqrtStep(const Vec128<float> root,
+ const Vec128<float> recip) {
+ return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
+template <size_t N>
+HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(const Vec128<float, N> root,
+ Vec128<float, N> recip) {
+ return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
-#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size>
-#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
- const Vec128<type, size> a, const Vec128<type, size> b
-#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
-// ------------------------------ Equality
-#if defined(__aarch64__)
-// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
+} // namespace detail
-// ------------------------------ Strict inequality
+// Not defined on armv7: approximate
+template <size_t N>
+HWY_INLINE Vec128<float, N> Sqrt(const Vec128<float, N> v) {
+ auto recip = ApproximateReciprocalSqrt(v);
-// Signed/float < (no unsigned)
-#if defined(__aarch64__)
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
+ recip *= detail::ReciprocalSqrtStep(v * recip, recip);
+ recip *= detail::ReciprocalSqrtStep(v * recip, recip);
+ recip *= detail::ReciprocalSqrtStep(v * recip, recip);
-// Signed/float > (no unsigned)
-#if defined(__aarch64__)
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE)
+ const auto root = v * recip;
+ return IfThenZeroElse(v == Zero(Simd<float, N>()), root);
-// ------------------------------ Weak inequality
-// Float <= >=
// ================================================== LOGICAL
@@ -1357,13 +1390,16 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operato
// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
template <typename T>
HWY_INLINE Vec128<T> Not(const Vec128<T> v) {
- const Full128<uint8_t> d8;
- return Vec128<T>(vmvnq_u8(BitCast(d8, v).raw));
+ const Full128<T> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_INLINE Vec128<T, N> Not(const Vec128<T, N> v) {
- const Repartition<uint8_t, Simd<T, N>> d8;
- return Vec128<T, N>(vmvn_u8(BitCast(d8, v).raw));
+ const Simd<T, N> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ using V8 = decltype(Zero(d8));
+ return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
// ------------------------------ And
@@ -1463,33 +1499,38 @@ HWY_API Vec128<T, N> BroadcastSignBit(co
return ShiftRight<sizeof(T) * 8 - 1>(v);
-// ------------------------------ Make mask
+// ================================================== MASK
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
- static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
- return (v & bit) == bit;
+// ------------------------------ To/from vector
-// Mask and Vec are the same (true = FF..FF).
+// Mask and Vec have the same representation (true = FF..FF).
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
- return Mask128<T, N>(v.raw);
+ const Simd<MakeUnsigned<T>, N> du;
+ return Mask128<T, N>(BitCast(du, v).raw);
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
- return Vec128<T, N>(v.raw);
+ return BitCast(Simd<T, N>(), Vec128<MakeUnsigned<T>, N>(v.raw));
template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> VecFromMask(Simd<T, N> /* tag */,
- const Mask128<T, N> v) {
- return Vec128<T, N>(v.raw);
+HWY_INLINE Vec128<T, N> VecFromMask(Simd<T, N> d, const Mask128<T, N> v) {
+ return BitCast(d, Vec128<MakeUnsigned<T>, N>(v.raw));
-// IfThenElse(mask, yes, no)
-// Returns mask ? b : a.
+// ------------------------------ RebindMask
+template <typename TFrom, typename TTo, size_t N>
+HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> dto, Mask128<TFrom, N> m) {
+ static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
+ return MaskFromVec(BitCast(dto, VecFromMask(Simd<TFrom, N>(), m)));
+// ------------------------------ IfThenElse(mask, yes, no) = mask ? yes : no.
#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
@@ -1524,7 +1565,6 @@ HWY_INLINE Vec128<T, N> ZeroIfNegative(V
return Max(zero, v);
// ------------------------------ Mask logical
template <typename T, size_t N>
@@ -1557,30 +1597,183 @@ HWY_API Mask128<T, N> Xor(const Mask128<
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
-// ------------------------------ Min (IfThenElse, BroadcastSignBit)
+// ================================================== COMPARE
-namespace detail {
+// Comparisons fill a lane with 1-bits if the condition is true, else 0.
-#if defined(__aarch64__)
+// ------------------------------ Shuffle2301 (for i64 compares)
-HWY_INLINE Vec128<uint64_t> Gt(Vec128<uint64_t> a, Vec128<uint64_t> b) {
- return Vec128<uint64_t>(vcgtq_u64(a.raw, b.raw));
+// Swap the 32-bit halves within each 64-bit block
+HWY_INLINE Vec128<uint32_t, 2> Shuffle2301(const Vec128<uint32_t, 2> v) {
+ return Vec128<uint32_t, 2>(vrev64_u32(v.raw));
+HWY_INLINE Vec128<int32_t, 2> Shuffle2301(const Vec128<int32_t, 2> v) {
+ return Vec128<int32_t, 2>(vrev64_s32(v.raw));
+HWY_INLINE Vec128<float, 2> Shuffle2301(const Vec128<float, 2> v) {
+ return Vec128<float, 2>(vrev64_f32(v.raw));
+HWY_INLINE Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
+ return Vec128<uint32_t>(vrev64q_u32(v.raw));
-HWY_INLINE Vec128<uint64_t, 1> Gt(Vec128<uint64_t, 1> a,
- Vec128<uint64_t, 1> b) {
- return Vec128<uint64_t, 1>(vcgt_u64(a.raw, b.raw));
+HWY_INLINE Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
+ return Vec128<int32_t>(vrev64q_s32(v.raw));
+HWY_INLINE Vec128<float> Shuffle2301(const Vec128<float> v) {
+ return Vec128<float>(vrev64q_f32(v.raw));
-HWY_INLINE Vec128<int64_t> Gt(Vec128<int64_t> a, Vec128<int64_t> b) {
- return Vec128<int64_t>(vcgtq_s64(a.raw, b.raw));
+#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size>
+#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
+ const Vec128<type, size> a, const Vec128<type, size> b
+#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
+// ------------------------------ Equality
+// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
+HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
+// ------------------------------ Strict inequality (signed, float)
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
+// ------------------------------ Weak inequality (float)
+// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq)
+template <size_t N>
+HWY_INLINE Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
+ const Vec128<int64_t, N> b) {
+ const Simd<int32_t, N * 2> d32;
+ const Simd<int64_t, N> d64;
+ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
+ const auto cmp64 = cmp32 & Shuffle2301(cmp32);
+ return MaskFromVec(BitCast(d64, cmp64));
+template <size_t N>
+HWY_INLINE Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
+ const Vec128<uint64_t, N> b) {
+ const Simd<uint32_t, N * 2> d32;
+ const Simd<uint64_t, N> d64;
+ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
+ const auto cmp64 = cmp32 & Shuffle2301(cmp32);
+ return MaskFromVec(BitCast(d64, cmp64));
-HWY_INLINE Vec128<int64_t, 1> Gt(Vec128<int64_t, 1> a, Vec128<int64_t, 1> b) {
- return Vec128<int64_t, 1>(vcgt_s64(a.raw, b.raw));
+HWY_INLINE Mask128<int64_t> operator<(const Vec128<int64_t> a,
+ const Vec128<int64_t> b) {
+ const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
+ return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
+HWY_INLINE Mask128<int64_t, 1> operator<(const Vec128<int64_t, 1> a,
+ const Vec128<int64_t, 1> b) {
+ const int64x1_t sub = vqsub_s64(a.raw, b.raw);
+ return MaskFromVec(BroadcastSignBit(Vec128<int64_t, 1>(sub)));
-} // namespace detail
+// ------------------------------ Reversed comparisons
+template <typename T, size_t N>
+HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
+ return operator<(b, a);
+template <typename T, size_t N>
+HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
+ return operator<=(b, a);
+// ------------------------------ FirstN (Iota, Lt)
+template <typename T, size_t N>
+HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
+ const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
+// ------------------------------ TestBit (Eq)
+#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type, size>
+#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
+ Vec128<type, size> v, Vec128<type, size> bit
+#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
+// No 64-bit versions on armv7
+template <size_t N>
+HWY_INLINE Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
+ Vec128<uint64_t, N> bit) {
+ return (v & bit) == bit;
+template <size_t N>
+HWY_INLINE Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
+ Vec128<int64_t, N> bit) {
+ return (v & bit) == bit;
+// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
+HWY_INLINE Vec128<int64_t> Abs(const Vec128<int64_t> v) {
+ return Vec128<int64_t>(vabsq_s64(v.raw));
+ const auto zero = Zero(Full128<int64_t>());
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
+HWY_INLINE Vec128<int64_t, 1> Abs(const Vec128<int64_t, 1> v) {
+ return Vec128<int64_t, 1>(vabs_s64(v.raw));
+ const auto zero = Zero(Simd<int64_t, 1>());
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
+// ------------------------------ Min (IfThenElse, BroadcastSignBit)
+HWY_INLINE Mask128<uint64_t> operator<(Vec128<uint64_t> a, Vec128<uint64_t> b) {
+ return Mask128<uint64_t>(vcltq_u64(a.raw, b.raw));
+HWY_INLINE Mask128<uint64_t, 1> operator<(Vec128<uint64_t, 1> a,
+ Vec128<uint64_t, 1> b) {
+ return Mask128<uint64_t, 1>(vclt_u64(a.raw, b.raw));
// Unsigned
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
@@ -1588,8 +1781,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min,
template <size_t N>
HWY_INLINE Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
-#if defined(__aarch64__)
- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a);
+ return IfThenElse(b < a, b, a);
const Simd<uint64_t, N> du;
const Simd<int64_t, N> di;
@@ -1603,8 +1796,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, v
template <size_t N>
HWY_INLINE Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
-#if defined(__aarch64__)
- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a);
+ return IfThenElse(b < a, b, a);
const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
@@ -1612,7 +1805,7 @@ HWY_INLINE Vec128<int64_t, N> Min(const
// Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN.
-#if defined(__aarch64__)
@@ -1626,8 +1819,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max,
template <size_t N>
HWY_INLINE Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
-#if defined(__aarch64__)
- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b);
+ return IfThenElse(b < a, a, b);
const Simd<uint64_t, N> du;
const Simd<int64_t, N> di;
@@ -1641,8 +1834,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, v
template <size_t N>
HWY_INLINE Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
-#if defined(__aarch64__)
- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b);
+ return IfThenElse(b < a, a, b);
const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
@@ -1650,7 +1843,7 @@ HWY_INLINE Vec128<int64_t, N> Max(const
// Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN.
-#if defined(__aarch64__)
@@ -1696,7 +1889,7 @@ HWY_INLINE Vec128<float> LoadU(Full128<f
const float* HWY_RESTRICT aligned) {
return Vec128<float>(vld1q_f32(aligned));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double> LoadU(Full128<double> /* tag */,
const double* HWY_RESTRICT aligned) {
return Vec128<double>(vld1q_f64(aligned));
@@ -1741,7 +1934,7 @@ HWY_INLINE Vec128<float, 2> LoadU(Simd<f
const float* HWY_RESTRICT p) {
return Vec128<float, 2>(vld1_f32(p));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double, 1> LoadU(Simd<double, 1> /* tag */,
const double* HWY_RESTRICT p) {
return Vec128<double, 1>(vld1_f64(p));
@@ -1755,73 +1948,72 @@ HWY_INLINE Vec128<double, 1> LoadU(Simd<
// we don't actually care what is in it, and we don't want
// to introduce extra overhead by initializing it to something.
-HWY_INLINE Vec128<uint8_t, 4> LoadU(Simd<uint8_t, 4> d,
+HWY_INLINE Vec128<uint8_t, 4> LoadU(Simd<uint8_t, 4> /*tag*/,
const uint8_t* HWY_RESTRICT p) {
- uint32x2_t a = Undefined(d).raw;
+ uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
return Vec128<uint8_t, 4>(vreinterpret_u8_u32(b));
-HWY_INLINE Vec128<uint16_t, 2> LoadU(Simd<uint16_t, 2> d,
+HWY_INLINE Vec128<uint16_t, 2> LoadU(Simd<uint16_t, 2> /*tag*/,
const uint16_t* HWY_RESTRICT p) {
- uint32x2_t a = Undefined(d).raw;
+ uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
return Vec128<uint16_t, 2>(vreinterpret_u16_u32(b));
-HWY_INLINE Vec128<uint32_t, 1> LoadU(Simd<uint32_t, 1> d,
+HWY_INLINE Vec128<uint32_t, 1> LoadU(Simd<uint32_t, 1> /*tag*/,
const uint32_t* HWY_RESTRICT p) {
- uint32x2_t a = Undefined(d).raw;
+ uint32x2_t a = Undefined(Simd<uint32_t, 2>()).raw;
uint32x2_t b = vld1_lane_u32(p, a, 0);
return Vec128<uint32_t, 1>(b);
-HWY_INLINE Vec128<int8_t, 4> LoadU(Simd<int8_t, 4> d,
+HWY_INLINE Vec128<int8_t, 4> LoadU(Simd<int8_t, 4> /*tag*/,
const int8_t* HWY_RESTRICT p) {
- int32x2_t a = Undefined(d).raw;
+ int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
return Vec128<int8_t, 4>(vreinterpret_s8_s32(b));
-HWY_INLINE Vec128<int16_t, 2> LoadU(Simd<int16_t, 2> d,
+HWY_INLINE Vec128<int16_t, 2> LoadU(Simd<int16_t, 2> /*tag*/,
const int16_t* HWY_RESTRICT p) {
- int32x2_t a = Undefined(d).raw;
+ int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
return Vec128<int16_t, 2>(vreinterpret_s16_s32(b));
-HWY_INLINE Vec128<int32_t, 1> LoadU(Simd<int32_t, 1> d,
+HWY_INLINE Vec128<int32_t, 1> LoadU(Simd<int32_t, 1> /*tag*/,
const int32_t* HWY_RESTRICT p) {
- int32x2_t a = Undefined(d).raw;
+ int32x2_t a = Undefined(Simd<int32_t, 2>()).raw;
int32x2_t b = vld1_lane_s32(p, a, 0);
return Vec128<int32_t, 1>(b);
-HWY_INLINE Vec128<float, 1> LoadU(Simd<float, 1> d,
+HWY_INLINE Vec128<float, 1> LoadU(Simd<float, 1> /*tag*/,
const float* HWY_RESTRICT p) {
- float32x2_t a = Undefined(d).raw;
+ float32x2_t a = Undefined(Simd<float, 2>()).raw;
float32x2_t b = vld1_lane_f32(p, a, 0);
return Vec128<float, 1>(b);
// ------------------------------ Load 16
-HWY_INLINE Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2> d,
+HWY_INLINE Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2> /*tag*/,
const uint8_t* HWY_RESTRICT p) {
- uint16x4_t a = Undefined(d).raw;
+ uint16x4_t a = Undefined(Simd<uint16_t, 4>()).raw;
uint16x4_t b = vld1_lane_u16(reinterpret_cast<const uint16_t*>(p), a, 0);
return Vec128<uint8_t, 2>(vreinterpret_u8_u16(b));
-HWY_INLINE Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1> d,
+HWY_INLINE Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1> /*tag*/,
const uint16_t* HWY_RESTRICT p) {
- uint16x4_t a = Undefined(d).raw;
+ uint16x4_t a = Undefined(Simd<uint16_t, 4>()).raw;
uint16x4_t b = vld1_lane_u16(p, a, 0);
return Vec128<uint16_t, 1>(b);
-HWY_INLINE Vec128<int8_t, 2> LoadU(Simd<int8_t, 2> d,
+HWY_INLINE Vec128<int8_t, 2> LoadU(Simd<int8_t, 2> /*tag*/,
const int8_t* HWY_RESTRICT p) {
- int16x4_t a = Undefined(d).raw;
+ int16x4_t a = Undefined(Simd<int16_t, 4>()).raw;
int16x4_t b = vld1_lane_s16(reinterpret_cast<const int16_t*>(p), a, 0);
return Vec128<int8_t, 2>(vreinterpret_s8_s16(b));
-HWY_INLINE Vec128<int16_t, 1> LoadU(Simd<int16_t, 1> d,
+HWY_INLINE Vec128<int16_t, 1> LoadU(Simd<int16_t, 1> /*tag*/,
const int16_t* HWY_RESTRICT p) {
- int16x4_t a = Undefined(d).raw;
+ int16x4_t a = Undefined(Simd<int16_t, 4>()).raw;
int16x4_t b = vld1_lane_s16(p, a, 0);
return Vec128<int16_t, 1>(b);
@@ -1902,7 +2094,7 @@ HWY_INLINE void StoreU(const Vec128<floa
float* HWY_RESTRICT aligned) {
vst1q_f32(aligned, v.raw);
-#if defined(__aarch64__)
HWY_INLINE void StoreU(const Vec128<double> v, Full128<double> /* tag */,
double* HWY_RESTRICT aligned) {
vst1q_f64(aligned, v.raw);
@@ -1947,7 +2139,7 @@ HWY_INLINE void StoreU(const Vec128<floa
float* HWY_RESTRICT p) {
vst1_f32(p, v.raw);
-#if defined(__aarch64__)
HWY_INLINE void StoreU(const Vec128<double, 1> v, Simd<double, 1> /* tag */,
double* HWY_RESTRICT p) {
vst1_f64(p, v.raw);
@@ -1959,12 +2151,12 @@ HWY_INLINE void StoreU(const Vec128<doub
HWY_INLINE void StoreU(const Vec128<uint8_t, 4> v, Simd<uint8_t, 4>,
uint8_t* HWY_RESTRICT p) {
uint32x2_t a = vreinterpret_u32_u8(v.raw);
- vst1_lane_u32(p, a, 0);
+ vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
HWY_INLINE void StoreU(const Vec128<uint16_t, 2> v, Simd<uint16_t, 2>,
uint16_t* HWY_RESTRICT p) {
uint32x2_t a = vreinterpret_u32_u16(v.raw);
- vst1_lane_u32(p, a, 0);
+ vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
HWY_INLINE void StoreU(const Vec128<uint32_t, 1> v, Simd<uint32_t, 1>,
uint32_t* HWY_RESTRICT p) {
@@ -1973,12 +2165,12 @@ HWY_INLINE void StoreU(const Vec128<uint
HWY_INLINE void StoreU(const Vec128<int8_t, 4> v, Simd<int8_t, 4>,
int8_t* HWY_RESTRICT p) {
int32x2_t a = vreinterpret_s32_s8(v.raw);
- vst1_lane_s32(p, a, 0);
+ vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
HWY_INLINE void StoreU(const Vec128<int16_t, 2> v, Simd<int16_t, 2>,
int16_t* HWY_RESTRICT p) {
int32x2_t a = vreinterpret_s32_s16(v.raw);
- vst1_lane_s32(p, a, 0);
+ vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
HWY_INLINE void StoreU(const Vec128<int32_t, 1> v, Simd<int32_t, 1>,
int32_t* HWY_RESTRICT p) {
@@ -1994,7 +2186,7 @@ HWY_INLINE void StoreU(const Vec128<floa
HWY_INLINE void StoreU(const Vec128<uint8_t, 2> v, Simd<uint8_t, 2>,
uint8_t* HWY_RESTRICT p) {
uint16x4_t a = vreinterpret_u16_u8(v.raw);
- vst1_lane_u16(p, a, 0);
+ vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);
HWY_INLINE void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1>,
uint16_t* HWY_RESTRICT p) {
@@ -2003,7 +2195,7 @@ HWY_INLINE void StoreU(const Vec128<uint
HWY_INLINE void StoreU(const Vec128<int8_t, 2> v, Simd<int8_t, 2>,
int8_t* HWY_RESTRICT p) {
int16x4_t a = vreinterpret_s16_s8(v.raw);
- vst1_lane_s16(p, a, 0);
+ vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);
HWY_INLINE void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1>,
int16_t* HWY_RESTRICT p) {
@@ -2068,18 +2260,18 @@ HWY_INLINE Vec128<uint64_t> PromoteTo(Fu
const Vec128<uint32_t, 2> v) {
return Vec128<uint64_t>(vmovl_u32(v.raw));
-HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */,
+HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> d,
const Vec128<uint8_t, 8> v) {
- return Vec128<int16_t>(vmovl_u8(v.raw));
+ return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
-HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
+HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> d,
const Vec128<uint8_t, 4> v) {
uint16x8_t a = vmovl_u8(v.raw);
- return Vec128<int32_t>(vreinterpretq_s32_u16(vmovl_u16(vget_low_u16(a))));
+ return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
-HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
+HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> d,
const Vec128<uint16_t, 4> v) {
- return Vec128<int32_t>(vmovl_u16(v.raw));
+ return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
// Unsigned: zero-extend to half vector.
@@ -2105,9 +2297,9 @@ HWY_INLINE Vec128<uint64_t, N> PromoteTo
return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
+HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> d,
const Vec128<uint8_t, N> v) {
- return Vec128<int16_t, N>(vget_low_s16(vmovl_u8(v.raw)));
+ return BitCast(d, Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))));
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
@@ -2170,12 +2362,14 @@ HWY_INLINE Vec128<int64_t, N> PromoteTo(
HWY_INLINE Vec128<float> PromoteTo(Full128<float> /* tag */,
const Vec128<float16_t, 4> v) {
- return Vec128<float>(vcvt_f32_f16(v.raw));
+ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
+ return Vec128<float>(f32);
template <size_t N>
HWY_INLINE Vec128<float, N> PromoteTo(Simd<float, N> /* tag */,
const Vec128<float16_t, N> v) {
- return Vec128<float, N>(vget_low_f32(vcvt_f32_f16(v.raw)));
+ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
+ return Vec128<float, N>(vget_low_f32(f32));
@@ -2204,7 +2398,7 @@ HWY_INLINE Vec128<float, N> PromoteTo(Si
-#if defined(__aarch64__)
HWY_INLINE Vec128<double> PromoteTo(Full128<double> /* tag */,
const Vec128<float, 2> v) {
@@ -2298,12 +2492,13 @@ HWY_INLINE Vec128<int8_t, N> DemoteTo(Si
HWY_INLINE Vec128<float16_t, 4> DemoteTo(Simd<float16_t, 4> /* tag */,
const Vec128<float> v) {
- return Vec128<float16_t, 4>{vcvt_f16_f32(v.raw)};
+ return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
template <size_t N>
HWY_INLINE Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> /* tag */,
const Vec128<float, N> v) {
- return Vec128<float16_t, N>{vcvt_f16_f32(vcombine_f32(v.raw, v.raw))};
+ const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
+ return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
@@ -2339,7 +2534,7 @@ HWY_INLINE Vec128<float16_t, N> DemoteTo
-#if defined(__aarch64__)
HWY_INLINE Vec128<float, 2> DemoteTo(Simd<float, 2> /* tag */,
const Vec128<double> v) {
@@ -2397,7 +2592,7 @@ HWY_INLINE Vec128<int8_t, N> DemoteTo(Si
const Vec128<int32_t> v) {
Vec128<int16_t, N> a = DemoteTo(Simd<int16_t, N>(), v);
Vec128<int16_t, N> b;
- uint16x8_t c = vcombine_s16(a.raw, b.raw);
+ int16x8_t c = vcombine_s16(a.raw, b.raw);
return Vec128<int8_t, N>(vqmovn_s16(c));
@@ -2426,7 +2621,7 @@ HWY_INLINE Vec128<int32_t, N> ConvertTo(
return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double> ConvertTo(Full128<double> /* tag */,
const Vec128<int64_t> v) {
@@ -2451,7 +2646,7 @@ HWY_INLINE Vec128<int64_t, 1> ConvertTo(
// ------------------------------ Round (IfThenElse, mask, logical)
-#if defined(__aarch64__)
// Toward nearest integer
@@ -2472,18 +2667,26 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor,
// representation, clearing the lowest 23-exp mantissa bits. This requires 9
// integer operations and 3 constants, which is likely more expensive.
+namespace detail {
+// The original value is already the desired result if NaN or the magnitude is
+// large (i.e. the value is already an integer).
+template <size_t N>
+HWY_API Mask128<float, N> UseInt(const Vec128<float, N> v) {
+ return Abs(v) < Set(Simd<float, N>(), MantissaEnd<float>());
+} // namespace detail
template <size_t N>
HWY_INLINE Vec128<float, N> Trunc(const Vec128<float, N> v) {
const Simd<float, N> df;
- const Simd<int32_t, N> di;
+ const RebindToSigned<decltype(df)> di;
const auto integer = ConvertTo(di, v); // round toward 0
const auto int_f = ConvertTo(df, integer);
- // The original value is already the desired result if NaN or the magnitude is
- // large (i.e. the value is already an integer).
- const auto max = Set(df, MantissaEnd<float>());
- return IfThenElse(Abs(v) < max, int_f, v);
+ return IfThenElse(detail::UseInt(v), int_f, v);
template <size_t N>
@@ -2506,7 +2709,7 @@ HWY_INLINE Vec128<float, N> Round(const
template <size_t N>
HWY_INLINE Vec128<float, N> Ceil(const Vec128<float, N> v) {
const Simd<float, N> df;
- const Simd<int32_t, N> di;
+ const RebindToSigned<decltype(df)> di;
const auto integer = ConvertTo(di, v); // round toward 0
const auto int_f = ConvertTo(df, integer);
@@ -2514,9 +2717,7 @@ HWY_INLINE Vec128<float, N> Ceil(const V
// Truncating a positive non-integer ends up smaller; if so, add 1.
const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
- // Keep original if NaN or the magnitude is large (already an int).
- const auto max = Set(df, MantissaEnd<float>());
- return IfThenElse(Abs(v) < max, int_f - neg1, v);
+ return IfThenElse(detail::UseInt(v), int_f - neg1, v);
template <size_t N>
@@ -2530,16 +2731,14 @@ HWY_INLINE Vec128<float, N> Floor(const
// Truncating a negative non-integer ends up larger; if so, subtract 1.
const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
- // Keep original if NaN or the magnitude is large (already an int).
- const auto max = Set(df, MantissaEnd<float>());
- return IfThenElse(Abs(v) < max, int_f + neg1, v);
+ return IfThenElse(detail::UseInt(v), int_f + neg1, v);
// ------------------------------ NearestInt (Round)
-#if defined(__aarch64__)
HWY_INLINE Vec128<int32_t> NearestInt(const Vec128<float> v) {
return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
@@ -2596,7 +2795,7 @@ HWY_INLINE Vec128<int64_t, 1> LowerHalf(
HWY_INLINE Vec128<float, 2> LowerHalf(const Vec128<float> v) {
return Vec128<float, 2>(vget_low_f32(v.raw));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double, 1> LowerHalf(const Vec128<double> v) {
return Vec128<double, 1>(vget_low_f64(v.raw));
@@ -2629,7 +2828,7 @@ HWY_INLINE Vec128<int64_t, 1> UpperHalf(
HWY_INLINE Vec128<float, 2> UpperHalf(const Vec128<float> v) {
return Vec128<float, 2>(vget_high_f32(v.raw));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double, 1> UpperHalf(const Vec128<double> v) {
return Vec128<double, 1>(vget_high_f64(v.raw));
@@ -2714,7 +2913,7 @@ HWY_INLINE Vec128<T, N> ShiftRightLanes(
// ------------------------------ Broadcast/splat any lane
-#if defined(__aarch64__)
// Unsigned
template <int kLane>
HWY_INLINE Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
@@ -2886,7 +3085,7 @@ HWY_API Vec128<T> TableLookupBytes(const
const Vec128<T> from) {
const Full128<T> d;
const Repartition<uint8_t, decltype(d)> d8;
-#if defined(__aarch64__)
return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
BitCast(d8, from).raw)));
@@ -2911,33 +3110,58 @@ HWY_INLINE Vec128<T, N> TableLookupBytes
BitCast(d8, from).raw)));
-// ------------------------------ Hard-coded shuffles
+// ------------------------------ TableLookupLanes
-// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
-// Shuffle0321 rotates one lane to the right (the previous least-significant
-// lane is now most-significant). These could also be implemented via
-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
+// Returned by SetTableIndices for use by TableLookupLanes.
+template <typename T, size_t N>
+struct Indices128 {
+ typename detail::Raw128<T, N>::type raw;
-// Swap 32-bit halves in 64-bits
-HWY_INLINE Vec128<uint32_t, 2> Shuffle2301(const Vec128<uint32_t, 2> v) {
- return Vec128<uint32_t, 2>(vrev64_u32(v.raw));
-HWY_INLINE Vec128<int32_t, 2> Shuffle2301(const Vec128<int32_t, 2> v) {
- return Vec128<int32_t, 2>(vrev64_s32(v.raw));
-HWY_INLINE Vec128<float, 2> Shuffle2301(const Vec128<float, 2> v) {
- return Vec128<float, 2>(vrev64_f32(v.raw));
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
+#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
+ for (size_t i = 0; i < N; ++i) {
+ HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
+ }
+ const Repartition<uint8_t, decltype(d)> d8;
+ alignas(16) uint8_t control[16] = {0};
+ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ control[idx_lane * sizeof(T) + idx_byte] =
+ static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
+ }
+ }
+ return Indices128<T, N>{BitCast(d, Load(d8, control)).raw};
-HWY_INLINE Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
- return Vec128<uint32_t>(vrev64q_u32(v.raw));
+template <size_t N>
+HWY_INLINE Vec128<uint32_t, N> TableLookupLanes(
+ const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
+ return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
-HWY_INLINE Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
- return Vec128<int32_t>(vrev64q_s32(v.raw));
+template <size_t N>
+HWY_INLINE Vec128<int32_t, N> TableLookupLanes(
+ const Vec128<int32_t, N> v, const Indices128<int32_t, N> idx) {
+ return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
-HWY_INLINE Vec128<float> Shuffle2301(const Vec128<float> v) {
- return Vec128<float>(vrev64q_f32(v.raw));
+template <size_t N>
+HWY_INLINE Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
+ const Indices128<float, N> idx) {
+ const Simd<int32_t, N> di;
+ const auto idx_i = BitCast(di, Vec128<float, N>{idx.raw});
+ return BitCast(Simd<float, N>(), TableLookupBytes(BitCast(di, v), idx_i));
+// ------------------------------ Other shuffles (TableLookupBytes)
+// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
+// Shuffle0321 rotates one lane to the right (the previous least-significant
+// lane is now most-significant). These could also be implemented via
+// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
// Swap 64-bit halves
template <typename T>
HWY_INLINE Vec128<T> Shuffle1032(const Vec128<T> v) {
@@ -2975,49 +3199,6 @@ HWY_INLINE Vec128<T> Shuffle0123(const V
return TableLookupBytes(v, BitCast(d, Load(d8, bytes)));
-// ------------------------------ TableLookupLanes
-// Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T>
-struct Indices128 {
- uint8x16_t raw;
-template <typename T>
-HWY_INLINE Indices128<T> SetTableIndices(const Full128<T>, const int32_t* idx) {
-#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
- const size_t N = 16 / sizeof(T);
- for (size_t i = 0; i < N; ++i) {
- HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
- }
- const Full128<uint8_t> d8;
- alignas(16) uint8_t control[16];
- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
- const size_t idx_lane = idx_byte / sizeof(T);
- const size_t mod = idx_byte % sizeof(T);
- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod;
- }
- return Indices128<T>{Load(d8, control).raw};
-HWY_INLINE Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
- const Indices128<uint32_t> idx) {
- return TableLookupBytes(v, Vec128<uint32_t>(idx.raw));
-HWY_INLINE Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
- const Indices128<int32_t> idx) {
- return TableLookupBytes(v, Vec128<int32_t>(idx.raw));
-HWY_INLINE Vec128<float> TableLookupLanes(const Vec128<float> v,
- const Indices128<float> idx) {
- const Full128<int32_t> di;
- const Full128<float> df;
- return BitCast(df,
- TableLookupBytes(BitCast(di, v), Vec128<int32_t>(idx.raw)));
// ------------------------------ Interleave lanes
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
@@ -3029,7 +3210,7 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Inter
HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)
-#if defined(__aarch64__)
// For 64 bit types, we only have the "q" version of the function defined as
// interleaving 64-wide registers with 64-wide types in them makes no sense.
HWY_INLINE Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
@@ -3079,7 +3260,7 @@ HWY_INLINE Vec128<float> InterleaveLower
const Vec128<float> b) {
return Vec128<float>(vzip1q_f32(a.raw, b.raw));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double> InterleaveLower(const Vec128<double> a,
const Vec128<double> b) {
return Vec128<double>(vzip1q_f64(a.raw, b.raw));
@@ -3090,10 +3271,10 @@ HWY_INLINE Vec128<float> InterleaveUpper
const Vec128<float> b) {
return Vec128<float>(vzip2q_f32(a.raw, b.raw));
-#if defined(__aarch64__)
HWY_INLINE Vec128<double> InterleaveUpper(const Vec128<double> a,
const Vec128<double> b) {
- return Vec128<double>(vzip2q_s64(a.raw, b.raw));
+ return Vec128<double>(vzip2q_f64(a.raw, b.raw));
@@ -3105,119 +3286,125 @@ HWY_INLINE Vec128<double> InterleaveUppe
// Full vectors
HWY_INLINE Vec128<uint16_t> ZipLower(const Vec128<uint8_t> a,
const Vec128<uint8_t> b) {
- return Vec128<uint16_t>(vzip1q_u8(a.raw, b.raw));
+ return Vec128<uint16_t>(vreinterpretq_u16_u8(vzip1q_u8(a.raw, b.raw)));
HWY_INLINE Vec128<uint32_t> ZipLower(const Vec128<uint16_t> a,
const Vec128<uint16_t> b) {
- return Vec128<uint32_t>(vzip1q_u16(a.raw, b.raw));
+ return Vec128<uint32_t>(vreinterpretq_u32_u16(vzip1q_u16(a.raw, b.raw)));
HWY_INLINE Vec128<uint64_t> ZipLower(const Vec128<uint32_t> a,
const Vec128<uint32_t> b) {
- return Vec128<uint64_t>(vzip1q_u32(a.raw, b.raw));
+ return Vec128<uint64_t>(vreinterpretq_u64_u32(vzip1q_u32(a.raw, b.raw)));
HWY_INLINE Vec128<int16_t> ZipLower(const Vec128<int8_t> a,
const Vec128<int8_t> b) {
- return Vec128<int16_t>(vzip1q_s8(a.raw, b.raw));
+ return Vec128<int16_t>(vreinterpretq_s16_s8(vzip1q_s8(a.raw, b.raw)));
HWY_INLINE Vec128<int32_t> ZipLower(const Vec128<int16_t> a,
const Vec128<int16_t> b) {
- return Vec128<int32_t>(vzip1q_s16(a.raw, b.raw));
+ return Vec128<int32_t>(vreinterpretq_s32_s16(vzip1q_s16(a.raw, b.raw)));
HWY_INLINE Vec128<int64_t> ZipLower(const Vec128<int32_t> a,
const Vec128<int32_t> b) {
- return Vec128<int64_t>(vzip1q_s32(a.raw, b.raw));
+ return Vec128<int64_t>(vreinterpretq_s64_s32(vzip1q_s32(a.raw, b.raw)));
HWY_INLINE Vec128<uint16_t> ZipUpper(const Vec128<uint8_t> a,
const Vec128<uint8_t> b) {
- return Vec128<uint16_t>(vzip2q_u8(a.raw, b.raw));
+ return Vec128<uint16_t>(vreinterpretq_u16_u8(vzip2q_u8(a.raw, b.raw)));
HWY_INLINE Vec128<uint32_t> ZipUpper(const Vec128<uint16_t> a,
const Vec128<uint16_t> b) {
- return Vec128<uint32_t>(vzip2q_u16(a.raw, b.raw));
+ return Vec128<uint32_t>(vreinterpretq_u32_u16(vzip2q_u16(a.raw, b.raw)));
HWY_INLINE Vec128<uint64_t> ZipUpper(const Vec128<uint32_t> a,
const Vec128<uint32_t> b) {
- return Vec128<uint64_t>(vzip2q_u32(a.raw, b.raw));
+ return Vec128<uint64_t>(vreinterpretq_u64_u32(vzip2q_u32(a.raw, b.raw)));
HWY_INLINE Vec128<int16_t> ZipUpper(const Vec128<int8_t> a,
const Vec128<int8_t> b) {
- return Vec128<int16_t>(vzip2q_s8(a.raw, b.raw));
+ return Vec128<int16_t>(vreinterpretq_s16_s8(vzip2q_s8(a.raw, b.raw)));
HWY_INLINE Vec128<int32_t> ZipUpper(const Vec128<int16_t> a,
const Vec128<int16_t> b) {
- return Vec128<int32_t>(vzip2q_s16(a.raw, b.raw));
+ return Vec128<int32_t>(vreinterpretq_s32_s16(vzip2q_s16(a.raw, b.raw)));
HWY_INLINE Vec128<int64_t> ZipUpper(const Vec128<int32_t> a,
const Vec128<int32_t> b) {
- return Vec128<int64_t>(vzip2q_s32(a.raw, b.raw));
+ return Vec128<int64_t>(vreinterpretq_s64_s32(vzip2q_s32(a.raw, b.raw)));
// Half vectors or less
template <size_t N, HWY_IF_LE64(uint8_t, N)>
HWY_INLINE Vec128<uint16_t, (N + 1) / 2> ZipLower(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
- return Vec128<uint16_t, (N + 1) / 2>(vzip1_u8(a.raw, b.raw));
+ return Vec128<uint16_t, (N + 1) / 2>(
+ vreinterpret_u16_u8(vzip1_u8(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint32_t, (N + 1) / 2> ZipLower(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
- return Vec128<uint32_t, (N + 1) / 2>(vzip1_u16(a.raw, b.raw));
+ return Vec128<uint32_t, (N + 1) / 2>(
+ vreinterpret_u32_u16(vzip1_u16(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint64_t, (N + 1) / 2> ZipLower(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
- return Vec128<uint64_t, (N + 1) / 2>(vzip1_u32(a.raw, b.raw));
+ return Vec128<uint64_t, (N + 1) / 2>(
+ vreinterpret_u64_u32(vzip1_u32(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_INLINE Vec128<int16_t, (N + 1) / 2> ZipLower(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
- return Vec128<int16_t, (N + 1) / 2>(vzip1_s8(a.raw, b.raw));
+ return Vec128<int16_t, (N + 1) / 2>(
+ vreinterpret_s16_s8(vzip1_s8(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int32_t, (N + 1) / 2> ZipLower(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
- return Vec128<int32_t, (N + 1) / 2>(vzip1_s16(a.raw, b.raw));
+ return Vec128<int32_t, (N + 1) / 2>(
+ vreinterpret_s32_s16(vzip1_s16(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int64_t, (N + 1) / 2> ZipLower(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
- return Vec128<int64_t, (N + 1) / 2>(vzip1_s32(a.raw, b.raw));
+ return Vec128<int64_t, (N + 1) / 2>(
+ vreinterpret_s64_s32(vzip1_s32(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(uint8_t, N)>
HWY_INLINE Vec128<uint16_t, N / 2> ZipUpper(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
- return Vec128<uint16_t, N / 2>(vzip2_u8(a.raw, b.raw));
+ return Vec128<uint16_t, N / 2>(vreinterpret_u16_u8(vzip2_u8(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint32_t, N / 2> ZipUpper(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
- return Vec128<uint32_t, N / 2>(vzip2_u16(a.raw, b.raw));
+ return Vec128<uint32_t, N / 2>(vreinterpret_u32_u16(vzip2_u16(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint64_t, N / 2> ZipUpper(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
- return Vec128<uint64_t, N / 2>(vzip2_u32(a.raw, b.raw));
+ return Vec128<uint64_t, N / 2>(vreinterpret_u64_u32(vzip2_u32(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_INLINE Vec128<int16_t, N / 2> ZipUpper(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
- return Vec128<int16_t, N / 2>(vzip2_s8(a.raw, b.raw));
+ return Vec128<int16_t, N / 2>(vreinterpret_s16_s8(vzip2_s8(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int32_t, N / 2> ZipUpper(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
- return Vec128<int32_t, N / 2>(vzip2_s16(a.raw, b.raw));
+ return Vec128<int32_t, N / 2>(vreinterpret_s32_s16(vzip2_s16(a.raw, b.raw)));
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int64_t, N / 2> ZipUpper(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
- return Vec128<int64_t, N / 2>(vzip2_s32(a.raw, b.raw));
+ return Vec128<int64_t, N / 2>(vreinterpret_s64_s32(vzip2_s32(a.raw, b.raw)));
// ------------------------------ Blocks
@@ -3274,84 +3461,113 @@ HWY_INLINE Vec128<T> OddEven(const Vec12
// ================================================== MISC
-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, size_t N, typename T2>
-Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
- HWY_ALIGN T lanes[16 / sizeof(T)];
- for (size_t i = 0; i < 16 / sizeof(T); ++i) {
- lanes[i] = static_cast<T>(first + static_cast<T2>(i));
+// ------------------------------ Scatter (Store)
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+ const Vec128<Offset, N> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+ alignas(16) T lanes[N];
+ Store(v, d, lanes);
+ alignas(16) Offset offset_lanes[N];
+ Store(offset, Simd<Offset, N>(), offset_lanes);
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
+ for (size_t i = 0; i < N; ++i) {
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
+ }
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+ const Vec128<Index, N> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+ alignas(16) T lanes[N];
+ Store(v, d, lanes);
+ alignas(16) Index index_lanes[N];
+ Store(index, Simd<Index, N>(), index_lanes);
+ for (size_t i = 0; i < N; ++i) {
+ base[index_lanes[i]] = lanes[i];
- return Load(d, lanes);
-// ------------------------------ Gather (requires GetLane)
+// ------------------------------ Gather (Load/Store)
template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
const T* HWY_RESTRICT base,
const Vec128<Offset, N> offset) {
- static_assert(N == 1, "NEON does not support full gather");
- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
- const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
- T val;
- CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
- return Set(d, val);
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+ alignas(16) Offset offset_lanes[N];
+ Store(offset, Simd<Offset, N>(), offset_lanes);
+ alignas(16) T lanes[N];
+ const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
+ for (size_t i = 0; i < N; ++i) {
+ CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
+ }
+ return Load(d, lanes);
template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
const Vec128<Index, N> index) {
- static_assert(N == 1, "NEON does not support full gather");
- static_assert(sizeof(T) == sizeof(Index), "T must match Index");
- return Set(d, base[GetLane(index)]);
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+ alignas(16) Index index_lanes[N];
+ Store(index, Simd<Index, N>(), index_lanes);
+ alignas(16) T lanes[N];
+ for (size_t i = 0; i < N; ++i) {
+ lanes[i] = base[index_lanes[i]];
+ }
+ return Load(d, lanes);
-// ------------------------------ ARMv7 int64 comparisons (requires Shuffle2301)
+// ------------------------------ Reductions
-#if !defined(__aarch64__)
+namespace detail {
-template <size_t N>
-HWY_INLINE Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
- const Vec128<int64_t, N> b) {
- const Simd<int32_t, N * 2> d32;
- const Simd<int64_t, N> d64;
- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b));
- const auto cmp64 = cmp32 & Shuffle2301(cmp32);
- return MaskFromVec(BitCast(d64, cmp64));
+// N=1 for any T: no-op
+template <typename T>
+HWY_API Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
+ return v;
-template <size_t N>
-HWY_INLINE Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
- const Vec128<uint64_t, N> b) {
- const Simd<uint32_t, N * 2> d32;
- const Simd<uint64_t, N> d64;
- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b));
- const auto cmp64 = cmp32 & Shuffle2301(cmp32);
- return MaskFromVec(BitCast(d64, cmp64));
+template <typename T>
+HWY_API Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+template <typename T>
+HWY_API Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
-HWY_INLINE Mask128<int64_t> operator<(const Vec128<int64_t> a,
- const Vec128<int64_t> b) {
- const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
- return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
+// u32/i32/f32: N=2
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_API Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
+ return v10 + Shuffle2301(v10);
-HWY_INLINE Mask128<int64_t, 1> operator<(const Vec128<int64_t, 1> a,
- const Vec128<int64_t, 1> b) {
- const int64x1_t sub = vqsub_s64(a.raw, b.raw);
- return MaskFromVec(BroadcastSignBit(Vec128<int64_t, 1>(sub)));
+template <typename T>
+HWY_API Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return Min(v10, Shuffle2301(v10));
-template <size_t N>
-HWY_INLINE Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
- const Vec128<int64_t, N> b) {
- return b < a;
+template <typename T>
+HWY_API Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return Max(v10, Shuffle2301(v10));
-// ------------------------------ Reductions
-#if defined(__aarch64__)
-// Supported for 32b and 64b vector types. Returns the sum in each lane.
+// full vectors
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
@@ -3398,20 +3614,15 @@ HWY_INLINE Vec128<int64_t> SumOfLanes(co
-namespace detail {
-// For u32/i32/f32.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
- const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Min(v20_31_20_31, v31_20_31_20);
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
- const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
@@ -3419,15 +3630,13 @@ HWY_API Vec128<T, N> MaxOfLanes(hwy::Siz
// For u64/i64[/f64].
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
- const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Min(v10, v01);
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
- const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Max(v10, v01);
@@ -3435,6 +3644,10 @@ HWY_API Vec128<T, N> MaxOfLanes(hwy::Siz
} // namespace detail
template <typename T, size_t N>
+HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
+ return detail::SumOfLanes(v);
+template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
@@ -3457,18 +3670,18 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
const Vec128<uint8_t> values =
BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
-#if defined(__aarch64__)
// Can't vaddv - we need two separate bytes (16 bits).
const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
const uint8x8_t x4 = vpadd_u8(x2, x2);
const uint8x8_t x8 = vpadd_u8(x4, x4);
- return vreinterpret_u16_u8(x8)[0];
+ return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
// Don't have vpaddq, so keep doubling lane size.
const uint16x8_t x2 = vpaddlq_u8(values.raw);
const uint32x4_t x4 = vpaddlq_u16(x2);
const uint64x2_t x8 = vpaddlq_u32(x4);
- return (uint64_t(x8[1]) << 8) | x8[0];
+ return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
@@ -3484,7 +3697,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
const Vec128<uint8_t, N> slice(Load(Simd<uint8_t, 8>(), kSliceLanes).raw);
const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
-#if defined(__aarch64__)
return vaddv_u8(values.raw);
const uint16x4_t x2 = vpaddl_u8(values.raw);
@@ -3503,7 +3716,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
const Full128<uint16_t> du;
const Vec128<uint16_t> values =
BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
-#if defined(__aarch64__)
return vaddvq_u16(values.raw);
const uint32x4_t x2 = vpaddlq_u16(values.raw);
@@ -3522,7 +3735,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
const Simd<uint16_t, N> du;
const Vec128<uint16_t, N> slice(Load(Simd<uint16_t, 4>(), kSliceLanes).raw);
const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
-#if defined(__aarch64__)
return vaddv_u16(values.raw);
const uint32x2_t x2 = vpaddl_u16(values.raw);
@@ -3539,7 +3752,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
const Full128<uint32_t> du;
const Vec128<uint32_t> values =
BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
-#if defined(__aarch64__)
return vaddvq_u32(values.raw);
const uint64x2_t x2 = vpaddlq_u32(values.raw);
@@ -3557,7 +3770,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
const Simd<uint32_t, N> du;
const Vec128<uint32_t, N> slice(Load(Simd<uint32_t, 2>(), kSliceLanes).raw);
const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
-#if defined(__aarch64__)
return vaddv_u32(values.raw);
const uint64x1_t x2 = vpaddl_u32(values.raw);
@@ -3572,7 +3785,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si
const Full128<uint64_t> du;
const Vec128<uint64_t> values =
BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
-#if defined(__aarch64__)
return vaddvq_u64(values.raw);
return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
@@ -3612,13 +3825,13 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag
const int8x16_t ones =
vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
-#if defined(__aarch64__)
return vaddvq_s8(ones);
const int16x8_t x2 = vpaddlq_s8(ones);
const int32x4_t x4 = vpaddlq_s16(x2);
const int64x2_t x8 = vpaddlq_s32(x4);
- return x8[0] + x8[1];
+ return vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1);
template <typename T>
@@ -3627,12 +3840,12 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag
const int16x8_t ones =
vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
-#if defined(__aarch64__)
return vaddvq_s16(ones);
const int32x4_t x2 = vpaddlq_s16(ones);
const int64x2_t x4 = vpaddlq_s32(x2);
- return x4[0] + x4[1];
+ return vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1);
@@ -3642,26 +3855,26 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag
const int32x4_t ones =
vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
-#if defined(__aarch64__)
return vaddvq_s32(ones);
const int64x2_t x2 = vpaddlq_s32(ones);
- return x2[0] + x2[1];
+ return vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1);
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
-#if defined(__aarch64__)
const Full128<int64_t> di;
const int64x2_t ones =
vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
return vaddvq_s64(ones);
- const Full128<int64_t> di;
- const int64x2_t ones =
- vshrq_n_u64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw, 63);
- return ones[0] + ones[1];
+ const Full128<uint64_t> du;
+ const auto mask_u = VecFromMask(du, RebindMask(du, mask));
+ const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
+ return vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1);
@@ -3690,9 +3903,15 @@ HWY_INLINE size_t StoreMaskBits(const Ma
// Full
template <typename T>
HWY_INLINE bool AllFalse(const Mask128<T> m) {
+ const Full128<uint32_t> d32;
+ const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(Full128<T>(), m)));
+ return (vmaxvq_u32(m32.raw) == 0);
const auto v64 = BitCast(Full128<uint64_t>(), VecFromMask(Full128<T>(), m));
uint32x2_t a = vqmovn_u64(v64.raw);
- return vreinterpret_u64_u32(a)[0] == 0;
+ return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0;
// Partial
@@ -3711,8 +3930,160 @@ HWY_INLINE bool AllTrue(const Mask128<T,
namespace detail {
+// Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
+HWY_INLINE Vec128<uint8_t> Load8Bytes(Full128<uint8_t> /*d*/,
+ const uint8_t* bytes) {
+ return Vec128<uint8_t>(vreinterpretq_u8_u64(
+ vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
+// Load 8 bytes and return half-reg with N <= 8 bytes.
+template <size_t N, HWY_IF_LE64(uint8_t, N)>
+HWY_INLINE Vec128<uint8_t, N> Load8Bytes(Simd<uint8_t, N> d,
+ const uint8_t* bytes) {
+ return Load(d, bytes);
template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
+HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
+ const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 256);
+ const Simd<T, N> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ const Simd<uint16_t, N> du;
+ // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
+ // indices for VTBL (one vector's worth for each of 256 combinations of
+ // 8 mask bits). Loading them directly would require 4 KiB. We can instead
+ // store lane indices and convert to byte indices (2*lane + 0..1), with the
+ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
+ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
+ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
+ // is likely more costly than the higher cache footprint from storing bytes.
+ alignas(16) constexpr uint8_t table[256 * 8] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
+ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
+ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
+ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
+ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
+ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
+ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
+ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
+ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
+ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
+ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
+ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
+ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
+ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
+ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
+ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
+ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
+ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
+ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
+ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
+ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
+ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
+ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
+ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
+ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
+ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
+ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
+ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
+ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
+ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
+ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
+ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
+ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
+ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
+ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
+ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
+ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
+ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
+ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
+ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
+ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
+ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
+ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
+ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
+ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
+ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
+ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
+ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
+ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
+ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
+ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
+ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
+ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
+ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
+ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
+ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
+ 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
+ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
+ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
+ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
+ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
+ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
+ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
+ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
+ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
+ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
+ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
+ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
+ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
+ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
+ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
+ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
+ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
+ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
+ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
+ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
+ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
+ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
+ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
+ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
+ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
+ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
+ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
+ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
+ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
+ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
+ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
+ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
+ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
+ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
+ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
+ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
+ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
+ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
+ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
+ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
+ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
+ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
+ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
+ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
+ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
+ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
+ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
+ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
+ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
+ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
+ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
+ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
+ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
+ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
+ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
+ const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+ return BitCast(d, pairs + Set(du, 0x0100));
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
+ const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 16);
// There are only 4 lanes, so we can afford to load the index vector directly.
@@ -3742,7 +4113,8 @@ HWY_INLINE Vec128<T, N> Idx32x4FromBits(
template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
+HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
+ const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 4);
// There are only 2 lanes, so we can afford to load the index vector directly.
@@ -3761,59 +4133,15 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(
// Helper function called by both Compress and CompressStore - avoids a
// redundant BitsFromMask in the latter.
-template <size_t N>
-HWY_API Vec128<uint32_t, N> Compress(Vec128<uint32_t, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx32x4FromBits<uint32_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
-template <size_t N>
-HWY_API Vec128<int32_t, N> Compress(Vec128<int32_t, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Compress(Vec128<uint64_t, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
-template <size_t N>
-HWY_API Vec128<int64_t, N> Compress(Vec128<int64_t, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
-template <size_t N>
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
- const Simd<float, N> df;
- const Simd<int32_t, N> di;
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
-template <size_t N>
-HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
- const Simd<double, N> df;
- const Simd<int64_t, N> di;
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
+ const auto idx =
+ detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
+ using D = Simd<T, N>;
+ const RebindToSigned<D> di;
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
} // namespace detail
template <typename T, size_t N>
@@ -3831,6 +4159,79 @@ HWY_API size_t CompressStore(Vec128<T, N
return PopCount(mask_bits);
+// ------------------------------ StoreInterleaved3
+// 128 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
+ const Vec128<uint8_t> v1,
+ const Vec128<uint8_t> v2,
+ Full128<uint8_t> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw};
+ vst3q_u8(unaligned, triple);
+// 64 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
+ const Vec128<uint8_t, 8> v1,
+ const Vec128<uint8_t, 8> v2,
+ Simd<uint8_t, 8> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
+ vst3_u8(unaligned, triple);
+// <= 32 bits: avoid writing more than N bytes by copying to buffer
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
+ const Vec128<uint8_t, N> v1,
+ const Vec128<uint8_t, N> v2,
+ Simd<uint8_t, N> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ alignas(16) uint8_t buf[24];
+ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw};
+ vst3_u8(buf, triple);
+ CopyBytes<N * 3>(buf, unaligned);
+// ------------------------------ StoreInterleaved4
+// 128 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
+ const Vec128<uint8_t> v1,
+ const Vec128<uint8_t> v2,
+ const Vec128<uint8_t> v3,
+ Full128<uint8_t> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
+ vst4q_u8(unaligned, quad);
+// 64 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> v0,
+ const Vec128<uint8_t, 8> v1,
+ const Vec128<uint8_t, 8> v2,
+ const Vec128<uint8_t, 8> v3,
+ Simd<uint8_t, 8> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
+ vst4_u8(unaligned, quad);
+// <= 32 bits: avoid writing more than N bytes by copying to buffer
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> v0,
+ const Vec128<uint8_t, N> v1,
+ const Vec128<uint8_t, N> v2,
+ const Vec128<uint8_t, N> v3,
+ Simd<uint8_t, N> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ alignas(16) uint8_t buf[32];
+ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw};
+ vst4_u8(buf, quad);
+ CopyBytes<N * 4>(buf, unaligned);
// ================================================== Operator wrapper
// These apply to all x86_*-inl.h because there are no restrictions on V.
@@ -3885,7 +4286,8 @@ HWY_API auto Le(V a, V b) -> decltype(a
return a <= b;
-#if !defined(__aarch64__)
+namespace detail { // for code folding
#undef vuzp1_s8
#undef vuzp1_u8
#undef vuzp1_s16
@@ -3972,6 +4374,7 @@ HWY_API auto Le(V a, V b) -> decltype(a
+} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 2021-06-02 10:56:05.230904367 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -39,6 +39,11 @@ using TFromV = TFromD<DFromV<V>>;
hwy::EnableIf<IsSigned<TFromV<V>>() && !IsFloat<TFromV<V>>()>* = nullptr
#define HWY_IF_FLOAT_V(V) hwy::EnableIf<IsFloat<TFromV<V>>()>* = nullptr
+// kShift = log2 of multiplier: 0 for m1, 1 for m2, -2 for mf4
+template <typename T, int kShift = 0>
+using Full = Simd<T, (kShift < 0) ? (HWY_LANES(T) >> (-kShift))
+ : (HWY_LANES(T) << kShift)>;
// ================================================== MACROS
// Generate specializations and function definitions using X macros. Although
@@ -58,29 +63,30 @@ namespace detail { // for code folding
// For given SEW, iterate over all LMUL. Precompute SEW/LMUL => MLEN because the
// preprocessor cannot easily do it.
- X_MACRO(BASE, CHAR, 8, 1, 8, NAME, OP) \
- X_MACRO(BASE, CHAR, 8, 2, 4, NAME, OP) \
- X_MACRO(BASE, CHAR, 8, 4, 2, NAME, OP) \
- X_MACRO(BASE, CHAR, 8, 8, 1, NAME, OP)
- X_MACRO(BASE, CHAR, 16, 1, 16, NAME, OP) \
- X_MACRO(BASE, CHAR, 16, 2, 8, NAME, OP) \
- X_MACRO(BASE, CHAR, 16, 4, 4, NAME, OP) \
- X_MACRO(BASE, CHAR, 16, 8, 2, NAME, OP)
- X_MACRO(BASE, CHAR, 32, 1, 32, NAME, OP) \
- X_MACRO(BASE, CHAR, 32, 2, 16, NAME, OP) \
- X_MACRO(BASE, CHAR, 32, 4, 8, NAME, OP) \
- X_MACRO(BASE, CHAR, 32, 8, 4, NAME, OP)
- X_MACRO(BASE, CHAR, 64, 1, 64, NAME, OP) \
- X_MACRO(BASE, CHAR, 64, 2, 32, NAME, OP) \
- X_MACRO(BASE, CHAR, 64, 4, 16, NAME, OP) \
- X_MACRO(BASE, CHAR, 64, 8, 8, NAME, OP)
+// TODO(janwas): GCC does not yet support fractional LMUL
+ X_MACRO(BASE, CHAR, 8, m1, /*kShift=*/0, /*MLEN=*/8, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, m2, /*kShift=*/1, /*MLEN=*/4, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, m4, /*kShift=*/2, /*MLEN=*/2, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, m8, /*kShift=*/3, /*MLEN=*/1, NAME, OP)
+ X_MACRO(BASE, CHAR, 16, m1, /*kShift=*/0, /*MLEN=*/16, NAME, OP) \
+ X_MACRO(BASE, CHAR, 16, m2, /*kShift=*/1, /*MLEN=*/8, NAME, OP) \
+ X_MACRO(BASE, CHAR, 16, m4, /*kShift=*/2, /*MLEN=*/4, NAME, OP) \
+ X_MACRO(BASE, CHAR, 16, m8, /*kShift=*/3, /*MLEN=*/2, NAME, OP)
+ X_MACRO(BASE, CHAR, 32, m1, /*kShift=*/0, /*MLEN=*/32, NAME, OP) \
+ X_MACRO(BASE, CHAR, 32, m2, /*kShift=*/1, /*MLEN=*/16, NAME, OP) \
+ X_MACRO(BASE, CHAR, 32, m4, /*kShift=*/2, /*MLEN=*/8, NAME, OP) \
+ X_MACRO(BASE, CHAR, 32, m8, /*kShift=*/3, /*MLEN=*/4, NAME, OP)
+ X_MACRO(BASE, CHAR, 64, m1, /*kShift=*/0, /*MLEN=*/64, NAME, OP) \
+ X_MACRO(BASE, CHAR, 64, m2, /*kShift=*/1, /*MLEN=*/32, NAME, OP) \
+ X_MACRO(BASE, CHAR, 64, m4, /*kShift=*/2, /*MLEN=*/16, NAME, OP) \
+ X_MACRO(BASE, CHAR, 64, m8, /*kShift=*/3, /*MLEN=*/8, NAME, OP)
// SEW for unsigned:
@@ -153,63 +159,61 @@ namespace detail { // for code folding
// Assemble types for use in x-macros
#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
-#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##m##LMUL##_t
+#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
#define HWY_RVV_M(MLEN) vbool##MLEN##_t
} // namespace detail
// TODO(janwas): remove typedefs and only use HWY_RVV_V etc. directly
-// TODO(janwas): do we want fractional LMUL? (can encode as negative)
-// Mixed-precision code can use LMUL 1..8 and that should be enough unless they
-// need many registers.
- using HWY_RVV_D(CHAR, SEW, LMUL) = \
- using V##CHAR##SEW##m##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \
- template <> \
- struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
- using Lane = HWY_RVV_T(BASE, SEW); \
- using type = Simd<Lane, HWY_LANES(Lane) * LMUL>; \
+// Until we have full intrinsic support for fractional LMUL, mixed-precision
+// code can use LMUL 1..8 (adequate unless they need many registers).
+ template <> \
+ struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
+ using Lane = HWY_RVV_T(BASE, SEW); \
+ using type = Full<Lane, SHIFT>; \
using Vf16m1 = vfloat16m1_t;
using Vf16m2 = vfloat16m2_t;
using Vf16m4 = vfloat16m4_t;
using Vf16m8 = vfloat16m8_t;
-using Df16m1 = Simd<float16_t, HWY_LANES(uint16_t) * 1>;
-using Df16m2 = Simd<float16_t, HWY_LANES(uint16_t) * 2>;
-using Df16m4 = Simd<float16_t, HWY_LANES(uint16_t) * 4>;
-using Df16m8 = Simd<float16_t, HWY_LANES(uint16_t) * 8>;
+using Df16m1 = Full<float16_t, 0>;
+using Df16m2 = Full<float16_t, 1>;
+using Df16m4 = Full<float16_t, 2>;
+using Df16m8 = Full<float16_t, 3>;
// vector = f(d), e.g. Zero
(void)Lanes(d); \
- return v##OP##_##CHAR##SEW##m##LMUL(); \
+ return v##OP##_##CHAR##SEW##LMUL(); \
// vector = f(vector), e.g. Not
- return v##OP##_v_##CHAR##SEW##m##LMUL(v); \
+ return v##OP##_v_##CHAR##SEW##LMUL(v); \
// vector = f(vector, scalar), e.g. detail::Add
- return v##OP##_##CHAR##SEW##m##LMUL(a, b); \
+ return v##OP##_##CHAR##SEW##LMUL(a, b); \
// vector = f(vector, vector), e.g. Add
- return v##OP##_vv_##CHAR##SEW##m##LMUL(a, b); \
+ return v##OP##_vv_##CHAR##SEW##LMUL(a, b); \
// ================================================== INIT
@@ -218,9 +222,9 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _)
// WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL!
// vlenb is not exposed through intrinsics and vreadvl is not VLMAX.
- HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \
- return v##OP##SEW##m##LMUL(); \
+ HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \
+ return v##OP##SEW##LMUL(); \
@@ -233,19 +237,31 @@ HWY_RVV_FOREACH(HWY_RVV_RETV_ARGD, Zero,
template <class D>
using VFromD = decltype(Zero(D()));
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Full<T>> Zero(Simd<T, N> /*tag*/) {
+ return Zero(Full<T>());
// ------------------------------ Set
// vector = f(d, scalar), e.g. Set
(void)Lanes(d); \
- return v##OP##_##CHAR##SEW##m##LMUL(arg); \
+ return v##OP##_##CHAR##SEW##LMUL(arg); \
#undef HWY_RVV_SET
+// Partial vectors
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> Set(Simd<T, N> /*tag*/, T arg) {
+ return Set(Full<T>(), arg);
// ------------------------------ Undefined
// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
@@ -265,7 +281,7 @@ HWY_API VFromD<D> Undefined(D d) {
namespace detail {
// u8: no change
BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
return v; \
@@ -276,25 +292,25 @@ namespace detail {
// Other integers
- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
- return v##OP##_v_##CHAR##SEW##m##LMUL##_u8m##LMUL(v); \
- } \
- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \
- return v##OP##_v_u8m##LMUL##_##CHAR##SEW##m##LMUL(v); \
+ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
+ } \
+ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \
+ return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
// Float: first cast to/from unsigned
- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
- return v##OP##_v_u##SEW##m##LMUL##_u8m##LMUL( \
- v##OP##_v_f##SEW##m##LMUL##_u##SEW##m##LMUL(v)); \
- } \
- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \
- return v##OP##_v_u##SEW##m##LMUL##_f##SEW##m##LMUL( \
- v##OP##_v_u8m##LMUL##_u##SEW##m##LMUL(v)); \
+ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
+ v##OP##_v_f##SEW##LMUL##_u##SEW##LMUL(v)); \
+ } \
+ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \
+ return v##OP##_v_u##SEW##LMUL##_f##SEW##LMUL( \
+ v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
@@ -315,6 +331,12 @@ HWY_API VFromD<D> BitCast(D d, FromV v)
return detail::BitCastFromByte(d, detail::BitCastToByte(v));
+// Partial
+template <typename T, size_t N, class FromV, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> BitCast(Simd<T, N> /*tag*/, FromV v) {
+ return BitCast(Full<T>(), v);
namespace detail {
template <class V, class DU = RebindToUnsigned<DFromV<V>>>
@@ -336,6 +358,12 @@ HWY_API VFromD<DU> Iota0(const D /*d*/)
return BitCastToUnsigned(Iota0(DU()));
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> Iota0(Simd<T, N> /*tag*/) {
+ return Iota0(Full<T>());
} // namespace detail
// ================================================== LOGICAL
@@ -370,11 +398,11 @@ HWY_API V And(const V a, const V b) {
// ------------------------------ Or
// Scalar argument plus mask. Used by VecFromMask.
HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff) { \
- return v##OP##_##CHAR##SEW##m##LMUL##_m(mask, maskedoff, v, imm); \
+ return v##OP##_##CHAR##SEW##LMUL##_m(mask, maskedoff, v, imm); \
namespace detail {
@@ -466,14 +494,14 @@ HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV,
// ------------------------------ ShiftLeft[Same]
// Intrinsics do not define .vi forms, so use .vx instead.
- template <int kBits> \
- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, kBits); \
- } \
- NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, static_cast<uint8_t>(bits)); \
+ template <int kBits> \
+ return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits); \
+ } \
+ NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
+ return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits)); \
@@ -486,19 +514,18 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRi
// ------------------------------ Shl
- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, bits); \
+ return v##OP##_vv_##CHAR##SEW##LMUL(v, bits); \
- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, \
- detail::BitCastToUnsigned(bits)); \
+ return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits)); \
@@ -569,11 +596,11 @@ HWY_API V ApproximateReciprocalSqrt(cons
// ------------------------------ MulAdd
// Note: op is still named vv, not vvv.
- return v##OP##_vv_##CHAR##SEW##m##LMUL(add, mul, x); \
+ return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x); \
@@ -596,11 +623,11 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub
// of all bits; SLEN 8 / LMUL 4 = half of all bits.
// mask = f(vector, vector)
(void)Lanes(DFromV<decltype(a)>()); \
- return v##OP##_vv_##CHAR##SEW##m##LMUL##_b##MLEN(a, b); \
+ return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b); \
// ------------------------------ Eq
@@ -675,11 +702,11 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xo
// ------------------------------ IfThenElse
- HWY_RVV_V(BASE, SEW, LMUL) no) { \
- return v##OP##_vvm_##CHAR##SEW##m##LMUL(m, no, yes); \
+ HWY_RVV_V(BASE, SEW, LMUL) no) { \
+ return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes); \
@@ -710,7 +737,7 @@ template <class D>
using MFromD = decltype(MaskFromVec(Zero(D())));
template <class D, typename MFrom>
-HWY_API MFromD<D> RebindMask(const D d, const MFrom mask) {
+HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
// No need to check lane size/LMUL are the same: if not, casting MFrom to
// MFromD<D> would fail.
return mask;
@@ -774,17 +801,17 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _,
// ------------------------------ Load
- const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
- (void)Lanes(d); \
- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p); \
+ const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
+ (void)Lanes(d); \
+ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p); \
-// Partial load
+// Partial
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API VFromD<Simd<T, N>> Load(Simd<T, N> d, const T* HWY_RESTRICT p) {
return Load(d, p);
@@ -800,16 +827,22 @@ HWY_API VFromD<D> LoadU(D d, const TFrom
// ------------------------------ Store
- (void)Lanes(d); \
- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p, v); \
+ (void)Lanes(d); \
+ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v); \
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API void Store(VFromD<Simd<T, N>> v, Simd<T, N> d, T* HWY_RESTRICT p) {
+ return Store(v, Full<T>(), p);
// ------------------------------ StoreU
// RVV only requires lane alignment, not natural alignment of the entire vector.
@@ -825,19 +858,62 @@ HWY_API void Stream(const V v, D d, T* H
Store(v, d, aligned);
+// ------------------------------ ScatterOffset
+ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
+ HWY_RVV_V(int, SEW, LMUL) offset) { \
+ return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
+ base, detail::BitCastToUnsigned(offset), v); \
+ }
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API void ScatterOffset(VFromD<Simd<T, N>> v, Simd<T, N> d,
+ VFromD<Simd<MakeSigned<T>, N>> offset) {
+ return ScatterOffset(v, Full<T>(), base, offset);
+// ------------------------------ ScatterIndex
+template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
+HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
+ const VFromD<RebindToSigned<D>> index) {
+ return ScatterOffset(v, d, base, ShiftLeft<2>(index));
+template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
+ const VFromD<RebindToSigned<D>> index) {
+ return ScatterOffset(v, d, base, ShiftLeft<3>(index));
// ------------------------------ GatherOffset
- NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
- const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
- HWY_RVV_V(int, SEW, LMUL) offset) { \
- return v##OP##ei##SEW##_v_##CHAR##SEW##m##LMUL( \
- base, detail::BitCastToUnsigned(offset)); \
+ NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
+ const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
+ HWY_RVV_V(int, SEW, LMUL) offset) { \
+ return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
+ base, detail::BitCastToUnsigned(offset)); \
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> GatherOffset(Simd<T, N> d,
+ const T* HWY_RESTRICT base,
+ VFromD<Simd<MakeSigned<T>, N>> offset) {
+ return GatherOffset(Full<T>(), base, offset);
// ------------------------------ GatherIndex
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
@@ -852,37 +928,101 @@ HWY_API VFromD<D> GatherIndex(D d, const
return GatherOffset(d, base, ShiftLeft<3>(index));
-// ================================================== CONVERT
+// ------------------------------ StoreInterleaved3
-// ------------------------------ PromoteTo U
+ HWY_API void NAME( \
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
+ const v##BASE##SEW##LMUL##x3_t triple = \
+ vcreate_##CHAR##SEW##LMUL##x3(a, b, c); \
+ return v##OP##e8_v_##CHAR##SEW##LMUL##x3(unaligned, triple); \
+ }
+// Segments are limited to 8 registers, so we can only go up to LMUL=2.
+HWY_RVV_STORE3(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved3, sseg3)
+HWY_RVV_STORE3(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved3, sseg3)
-HWY_API Vu16m2 PromoteTo(Du16m2 /* d */, Vu8m1 v) { return vzext_vf2_u16m2(v); }
-HWY_API Vu16m4 PromoteTo(Du16m4 /* d */, Vu8m2 v) { return vzext_vf2_u16m4(v); }
-HWY_API Vu16m8 PromoteTo(Du16m8 /* d */, Vu8m4 v) { return vzext_vf2_u16m8(v); }
+#undef HWY_RVV_STORE3
-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, Vu8m1 v) { return vzext_vf4_u32m4(v); }
-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, Vu8m2 v) { return vzext_vf4_u32m8(v); }
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API void StoreInterleaved3(VFromD<Simd<T, N>> v0, VFromD<Simd<T, N>> v1,
+ VFromD<Simd<T, N>> v2, Simd<T, N> /*tag*/,
+ T* unaligned) {
+ return StoreInterleaved3(v0, v1, v2, Full<T>(), unaligned);
+// ------------------------------ StoreInterleaved4
+ HWY_API void NAME( \
+ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \
+ const v##BASE##SEW##LMUL##x4_t quad = \
+ vcreate_##CHAR##SEW##LMUL##x4(v0, v1, v2, v3); \
+ return v##OP##e8_v_##CHAR##SEW##LMUL##x4(aligned, quad); \
+ }
+// Segments are limited to 8 registers, so we can only go up to LMUL=2.
+HWY_RVV_STORE4(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved4, sseg4)
+HWY_RVV_STORE4(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved4, sseg4)
-HWY_API Vu32m2 PromoteTo(Du32m2 /* d */, const Vu16m1 v) {
- return vzext_vf2_u32m2(v);
-HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, const Vu16m2 v) {
- return vzext_vf2_u32m4(v);
-HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, const Vu16m4 v) {
- return vzext_vf2_u32m8(v);
+#undef HWY_RVV_STORE4
-HWY_API Vu64m2 PromoteTo(Du64m2 /* d */, const Vu32m1 v) {
- return vzext_vf2_u64m2(v);
-HWY_API Vu64m4 PromoteTo(Du64m4 /* d */, const Vu32m2 v) {
- return vzext_vf2_u64m4(v);
-HWY_API Vu64m8 PromoteTo(Du64m8 /* d */, const Vu32m4 v) {
- return vzext_vf2_u64m8(v);
+// Partial
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API void StoreInterleaved4(VFromD<Simd<T, N>> v0, VFromD<Simd<T, N>> v1,
+ VFromD<Simd<T, N>> v2, VFromD<Simd<T, N>> v3,
+ Simd<T, N> /*tag*/, T* unaligned) {
+ return StoreInterleaved4(v0, v1, v2, v3, Full<T>(), unaligned);
+// ================================================== CONVERT
+ PromoteTo(HWY_RVV_D(CHAR, BITS, LMUL) /*d*/, \
+ return OP##CHAR##BITS##LMUL(v); \
+ }
+// TODO(janwas): GCC does not yet support fractional LMUL
+// ------------------------------ PromoteTo
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 16, uint, 8)
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 32, uint, 16)
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 64, uint, 32)
+HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8)
+HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 16, int, 8)
+HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 32, int, 16)
+HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 64, int, 32)
+HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8)
+HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 32, float, 16)
+HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 64, float, 32)
+// i32 to f64
+HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32)
template <size_t N>
HWY_API VFromD<Simd<int16_t, N>> PromoteTo(Simd<int16_t, N> d,
VFromD<Simd<uint8_t, N>> v) {
@@ -901,67 +1041,6 @@ HWY_API VFromD<Simd<int32_t, N>> Promote
return BitCast(d, PromoteTo(Simd<uint32_t, N>(), v));
-// ------------------------------ PromoteTo I
-HWY_API Vi16m2 PromoteTo(Di16m2 /* d */, Vi8m1 v) { return vsext_vf2_i16m2(v); }
-HWY_API Vi16m4 PromoteTo(Di16m4 /* d */, Vi8m2 v) { return vsext_vf2_i16m4(v); }
-HWY_API Vi16m8 PromoteTo(Di16m8 /* d */, Vi8m4 v) { return vsext_vf2_i16m8(v); }
-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, Vi8m1 v) { return vsext_vf4_i32m4(v); }
-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, Vi8m2 v) { return vsext_vf4_i32m8(v); }
-HWY_API Vi32m2 PromoteTo(Di32m2 /* d */, const Vi16m1 v) {
- return vsext_vf2_i32m2(v);
-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, const Vi16m2 v) {
- return vsext_vf2_i32m4(v);
-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, const Vi16m4 v) {
- return vsext_vf2_i32m8(v);
-HWY_API Vi64m2 PromoteTo(Di64m2 /* d */, const Vi32m1 v) {
- return vsext_vf2_i64m2(v);
-HWY_API Vi64m4 PromoteTo(Di64m4 /* d */, const Vi32m2 v) {
- return vsext_vf2_i64m4(v);
-HWY_API Vi64m8 PromoteTo(Di64m8 /* d */, const Vi32m4 v) {
- return vsext_vf2_i64m8(v);
-// ------------------------------ PromoteTo F
-HWY_API Vf32m2 PromoteTo(Df32m2 /* d */, const Vf16m1 v) {
- return vfwcvt_f_f_v_f32m2(v);
-HWY_API Vf32m4 PromoteTo(Df32m4 /* d */, const Vf16m2 v) {
- return vfwcvt_f_f_v_f32m4(v);
-HWY_API Vf32m8 PromoteTo(Df32m8 /* d */, const Vf16m4 v) {
- return vfwcvt_f_f_v_f32m8(v);
-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vf32m1 v) {
- return vfwcvt_f_f_v_f64m2(v);
-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vf32m2 v) {
- return vfwcvt_f_f_v_f64m4(v);
-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vf32m4 v) {
- return vfwcvt_f_f_v_f64m8(v);
-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vi32m1 v) {
- return vfwcvt_f_x_v_f64m2(v);
-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vi32m2 v) {
- return vfwcvt_f_x_v_f64m4(v);
-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vi32m4 v) {
- return vfwcvt_f_x_v_f64m8(v);
// ------------------------------ DemoteTo U
// First clamp negative numbers to zero to match x86 packus.
@@ -1062,19 +1141,19 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */,
// ------------------------------ ConvertTo F
HWY_RVV_D(CHAR, SEW, LMUL) /* d */, HWY_RVV_V(int, SEW, LMUL) v) { \
- return vfcvt_f_x_v_f##SEW##m##LMUL(v); \
+ return vfcvt_f_x_v_f##SEW##LMUL(v); \
} \
/* Truncates (rounds toward zero). */ \
HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(i, SEW, LMUL) /* d */, \
- return vfcvt_rtz_x_f_v_i##SEW##m##LMUL(v); \
+ return vfcvt_rtz_x_f_v_i##SEW##LMUL(v); \
} \
/* Uses default rounding mode. */ \
- return vfcvt_x_f_v_i##SEW##m##LMUL(v); \
+ return vfcvt_x_f_v_i##SEW##LMUL(v); \
// API only requires f32 but we provide f64 for internal use (otherwise, it
@@ -1082,16 +1161,23 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */,
+// Partial
+template <typename T, size_t N, class FromV, HWY_IF_LE128(T, N)>
+HWY_API VFromD<Simd<T, N>> ConvertTo(Simd<T, N> /*tag*/, FromV v) {
+ return ConvertTo(Full<T>(), v);
// ================================================== SWIZZLE
// ------------------------------ Compress
- return v##OP##_vm_##CHAR##SEW##m##LMUL(mask, v, v); \
+ return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v); \
@@ -1121,10 +1207,10 @@ HWY_API VFromD<DU> SetTableIndices(D d,
// <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX
// to 2048! We could instead use vrgatherei16.
- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, idx); \
+ return v##OP##_vv_##CHAR##SEW##LMUL(v, idx); \
HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather)
@@ -1216,7 +1302,6 @@ HWY_API V OffsetsOf128BitBlocks(const D
using T = MakeUnsigned<TFromD<D>>;
return detail::And(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
} // namespace detail
template <class V>
@@ -1244,9 +1329,9 @@ HWY_API V Broadcast(const V v) {
// ------------------------------ GetLane
- return v##OP##_s_##CHAR##SEW##m##LMUL##_##CHAR##SEW(v); \
+ return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); \
@@ -1255,11 +1340,12 @@ HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetL
// ------------------------------ ShiftLeftLanes
-// vector = f(vector, size_t)
- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t lanes) { \
- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, v, lanes); \
+// vector = f(vector, vector, size_t)
+ size_t lanes) { \
+ return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes); \
namespace detail {
@@ -1270,7 +1356,7 @@ template <size_t kLanes, class V>
HWY_API V ShiftLeftLanes(const V v) {
using D = DFromV<V>;
const RebindToSigned<D> di;
- const auto shifted = detail::SlideUp(v, kLanes);
+ const auto shifted = detail::SlideUp(v, v, kLanes);
// Match x86 semantics by zeroing lower lanes in 128-bit blocks
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1);
@@ -1300,7 +1386,7 @@ template <size_t kLanes, class V>
HWY_API V ShiftRightLanes(const V v) {
using D = DFromV<V>;
const RebindToSigned<D> di;
- const auto shifted = detail::SlideDown(v, kLanes);
+ const auto shifted = detail::SlideDown(v, v, kLanes);
// Match x86 semantics by zeroing upper lanes in 128-bit blocks
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1);
@@ -1342,7 +1428,7 @@ HWY_API V ConcatUpperLower(const V hi, c
template <class V>
HWY_API V ConcatLowerLower(const V hi, const V lo) {
// Move lower half into upper
- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV<V>()) / 2);
+ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV<V>()) / 2);
return ConcatUpperLower(hi_up, lo);
@@ -1351,7 +1437,7 @@ HWY_API V ConcatLowerLower(const V hi, c
template <class V>
HWY_API V ConcatUpperUpper(const V hi, const V lo) {
// Move upper half into lower
- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV<V>()) / 2);
+ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV<V>()) / 2);
return ConcatUpperLower(hi, lo_down);
@@ -1360,8 +1446,8 @@ HWY_API V ConcatUpperUpper(const V hi, c
template <class V>
HWY_API V ConcatLowerUpper(const V hi, const V lo) {
// Move half of both inputs to the other half
- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV<V>()) / 2);
- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV<V>()) / 2);
+ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV<V>()) / 2);
+ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV<V>()) / 2);
return ConcatUpperLower(hi_up, lo_down);
@@ -1428,61 +1514,55 @@ HWY_API V Combine(const V a, const V b)
// ================================================== REDUCE
// vector = f(vector, zero_m1)
- vsetvlmax_e##SEW##m##LMUL(); \
- return Set(HWY_RVV_D(CHAR, SEW, LMUL)(), \
- GetLane(v##OP##_vs_##CHAR##SEW##m##LMUL##_##CHAR##SEW##m1( \
- v0, v, v0))); \
+ vsetvlmax_e##SEW##LMUL(); \
+ return Set( \
+ GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(v0, v, v0))); \
// ------------------------------ SumOfLanes
namespace detail {
} // namespace detail
template <class V>
HWY_API V SumOfLanes(const V v) {
using T = TFromV<V>;
- const auto v0 = Zero(Simd<T, HWY_LANES(T)>()); // always m1
+ const auto v0 = Zero(Full<T>()); // always m1
return detail::RedSum(v, v0);
// ------------------------------ MinOfLanes
namespace detail {
} // namespace detail
template <class V>
HWY_API V MinOfLanes(const V v) {
using T = TFromV<V>;
- const Simd<T, HWY_LANES(T)> d1; // always m1
+ const Full<T> d1; // always m1
const auto neutral = Set(d1, HighestValue<T>());
return detail::RedMin(v, neutral);
// ------------------------------ MaxOfLanes
namespace detail {
} // namespace detail
template <class V>
HWY_API V MaxOfLanes(const V v) {
using T = TFromV<V>;
- const Simd<T, HWY_LANES(T)> d1; // always m1
+ const Full<T> d1; // always m1
const auto neutral = Set(d1, LowestValue<T>());
return detail::RedMax(v, neutral);
@@ -1507,7 +1587,7 @@ HWY_API VFromD<D> LoadDup128(D d, const
HWY_API size_t StoreMaskBits(HWY_RVV_M(MLEN) m, uint8_t* p) { \
/* LMUL=1 is always enough */ \
- Simd<uint8_t, HWY_LANES(uint8_t)> d8; \
+ Full<uint8_t> d8; \
const size_t num_bytes = (Lanes(d8) + MLEN - 1) / MLEN; \
/* TODO(janwas): how to convert vbool* to vuint?*/ \
/*Store(m, d8, p);*/ \
@@ -1518,6 +1598,22 @@ HWY_API VFromD<D> LoadDup128(D d, const
+// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
+// Disallow for 8-bit because Iota is likely to overflow.
+template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
+HWY_API MFromD<D> FirstN(const D d, const size_t n) {
+ const RebindToSigned<D> di;
+ return RebindMask(d, Lt(BitCast(di, detail::Iota0(d)), Set(di, n)));
+template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
+HWY_API MFromD<D> FirstN(const D d, const size_t n) {
+ const auto zero = Zero(d);
+ const auto one = Set(d, 1);
+ return Eq(detail::SlideUp(one, zero, n), one);
// ------------------------------ Neg
template <class V, HWY_IF_SIGNED_V(V)>
@@ -1526,9 +1622,9 @@ HWY_API V Neg(const V v) {
// vector = f(vector), but argument is repeated
- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, v); \
+ return v##OP##_vv_##CHAR##SEW##LMUL(v, v); \
@@ -1565,7 +1661,6 @@ template <class V>
HWY_API auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
return Lt(Abs(v), Set(DFromV<V>(), MantissaEnd<TFromV<V>>()));
} // namespace detail
template <class V>
@@ -1636,10 +1731,8 @@ HWY_API VFromD<D> Iota(const D d, TFromD
// Using vwmul does not work for m8, so use mulh instead. Highway only provides
// MulHigh for 16-bit, so use a private wrapper.
namespace detail {
} // namespace detail
template <class V>
@@ -1649,7 +1742,7 @@ HWY_API VFromD<RepartitionToWide<DFromV<
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
const RepartitionToWide<DFromV<V>> dw;
- return BitCast(dw, OddEven(detail::SlideUp(hi, 1), lo));
+ return BitCast(dw, OddEven(detail::SlideUp(hi, hi, 1), lo));
// ================================================== END MACROS
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 2021-06-02 10:56:05.237904402 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -19,7 +19,6 @@
#include <stdint.h>
#include <algorithm> // std::min
-#include <cmath>
#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
@@ -199,7 +198,7 @@ HWY_API Vec1<T> BroadcastSignBit(const V
template <typename TFrom, typename TTo>
HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
- return Mask1<TTo>(m.raw);
+ return Mask1<TTo>{m.bits};
// v must be 0 or FF..FF.
@@ -224,6 +223,11 @@ Vec1<T> VecFromMask(Sisd<T> /* tag */, c
return v;
+template <typename T>
+HWY_INLINE Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
+ return Mask1<T>::FromBool(n != 0);
// Returns mask ? yes : no.
template <typename T>
HWY_INLINE Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
@@ -357,9 +361,9 @@ HWY_INLINE Vec1<T> operator>>(const Vec1
template <typename T>
HWY_INLINE Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
- const uint64_t a64 = static_cast<int64_t>(a.raw);
- const uint64_t b64 = static_cast<int64_t>(b.raw);
- return Vec1<T>((a64 + b64) & ~T(0));
+ const uint64_t a64 = static_cast<uint64_t>(a.raw);
+ const uint64_t b64 = static_cast<uint64_t>(b.raw);
+ return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
HWY_INLINE Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
return Vec1<float>(a.raw + b.raw);
@@ -370,9 +374,9 @@ HWY_INLINE Vec1<double> operator+(const
template <typename T>
HWY_INLINE Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
- const uint64_t a64 = static_cast<int64_t>(a.raw);
- const uint64_t b64 = static_cast<int64_t>(b.raw);
- return Vec1<T>((a64 - b64) & ~T(0));
+ const uint64_t a64 = static_cast<uint64_t>(a.raw);
+ const uint64_t b64 = static_cast<uint64_t>(b.raw);
+ return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
HWY_INLINE Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
return Vec1<float>(a.raw - b.raw);
@@ -388,21 +392,25 @@ HWY_INLINE Vec1<double> operator-(const
// Unsigned
HWY_INLINE Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
const Vec1<uint8_t> b) {
- return Vec1<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255));
+ return Vec1<uint8_t>(
+ static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
HWY_INLINE Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
const Vec1<uint16_t> b) {
- return Vec1<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535));
+ return Vec1<uint16_t>(
+ static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
// Signed
HWY_INLINE Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a,
const Vec1<int8_t> b) {
- return Vec1<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127));
+ return Vec1<int8_t>(
+ static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
HWY_INLINE Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
const Vec1<int16_t> b) {
- return Vec1<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767));
+ return Vec1<int16_t>(
+ static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
// ------------------------------ Saturating subtraction
@@ -412,21 +420,25 @@ HWY_INLINE Vec1<int16_t> SaturatedAdd(co
// Unsigned
HWY_INLINE Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
const Vec1<uint8_t> b) {
- return Vec1<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255));
+ return Vec1<uint8_t>(
+ static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
HWY_INLINE Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
const Vec1<uint16_t> b) {
- return Vec1<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535));
+ return Vec1<uint16_t>(
+ static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
// Signed
HWY_INLINE Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a,
const Vec1<int8_t> b) {
- return Vec1<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127));
+ return Vec1<int8_t>(
+ static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
HWY_INLINE Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
const Vec1<int16_t> b) {
- return Vec1<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767));
+ return Vec1<int16_t>(
+ static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
// ------------------------------ Average
@@ -435,11 +447,11 @@ HWY_INLINE Vec1<int16_t> SaturatedSub(co
HWY_INLINE Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
const Vec1<uint8_t> b) {
- return Vec1<uint8_t>((a.raw + b.raw + 1) / 2);
+ return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
HWY_INLINE Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
const Vec1<uint16_t> b) {
- return Vec1<uint16_t>((a.raw + b.raw + 1) / 2);
+ return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
// ------------------------------ Absolute value
@@ -514,15 +526,15 @@ HWY_INLINE Vec1<T> operator/(const Vec1<
// Returns the upper 16 bits of a * b in each lane.
HWY_INLINE Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
- return Vec1<int16_t>((a.raw * b.raw) >> 16);
+ return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
HWY_INLINE Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a,
const Vec1<uint16_t> b) {
// Cast to uint32_t first to prevent overflow. Otherwise the result of
// uint16_t * uint16_t is in "int" which may overflow. In practice the result
// is the same but this way it is also defined.
- return Vec1<uint16_t>(
- (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16);
+ return Vec1<uint16_t>(static_cast<uint16_t>(
+ (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
@@ -617,6 +629,31 @@ HWY_INLINE Vec1<T> Round(const Vec1<T> v
return Vec1<T>(static_cast<T>(rounded));
+// Round-to-nearest even.
+HWY_INLINE Vec1<int32_t> NearestInt(const Vec1<float> v) {
+ using T = float;
+ using TI = int32_t;
+ const T abs = Abs(v).raw;
+ const bool signbit = std::signbit(v.raw);
+ if (!(abs < MantissaEnd<T>())) { // Huge or NaN
+ // Check if too large to cast or NaN
+ if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
+ return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
+ }
+ return Vec1<int32_t>(static_cast<TI>(v.raw));
+ }
+ const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
+ const TI rounded = static_cast<TI>(v.raw + bias);
+ if (rounded == 0) return Vec1<int32_t>(0);
+ // Round to even
+ if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
+ return Vec1<TI>(rounded - (signbit ? -1 : 1));
+ }
+ return Vec1<TI>(rounded);
template <typename T>
HWY_INLINE Vec1<T> Trunc(const Vec1<T> v) {
using TI = MakeSigned<T>;
@@ -641,7 +678,8 @@ V Ceiling(const V v) {
Bits bits;
CopyBytes<sizeof(Bits)>(&v, &bits);
- const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias;
+ const int exponent =
+ static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
// Already an integer.
if (exponent >= kMantissaBits) return v;
// |v| <= 1 => 0 or 1.
@@ -672,7 +710,8 @@ V Floor(const V v) {
Bits bits;
CopyBytes<sizeof(Bits)>(&v, &bits);
- const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias;
+ const int exponent =
+ static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
// Already an integer.
if (exponent >= kMantissaBits) return v;
// |v| <= 1 => -1 or 0.
@@ -772,6 +811,26 @@ HWY_INLINE void StoreU(const Vec1<T> v,
return Store(v, d, p);
+// ------------------------------ StoreInterleaved3
+HWY_API void StoreInterleaved3(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
+ const Vec1<uint8_t> v2, Sisd<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ StoreU(v0, d, unaligned + 0);
+ StoreU(v1, d, unaligned + 1);
+ StoreU(v2, d, unaligned + 2);
+HWY_API void StoreInterleaved4(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
+ const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
+ Sisd<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ StoreU(v0, d, unaligned + 0);
+ StoreU(v1, d, unaligned + 1);
+ StoreU(v2, d, unaligned + 2);
+ StoreU(v3, d, unaligned + 3);
// ------------------------------ Stream
template <typename T>
@@ -779,12 +838,29 @@ HWY_INLINE void Stream(const Vec1<T> v,
return Store(v, d, aligned);
+// ------------------------------ Scatter
+template <typename T, typename Offset>
+HWY_INLINE void ScatterOffset(Vec1<T> v, Sisd<T> d, T* base,
+ const Vec1<Offset> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+ uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
+ return Store(v, d, reinterpret_cast<T*>(base8));
+template <typename T, typename Index>
+HWY_INLINE void ScatterIndex(Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT base,
+ const Vec1<Index> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+ return Store(v, d, base + index.raw);
// ------------------------------ Gather
template <typename T, typename Offset>
HWY_INLINE Vec1<T> GatherOffset(Sisd<T> d, const T* base,
const Vec1<Offset> offset) {
- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
return Load(d, reinterpret_cast<const T*>(addr));
@@ -792,7 +868,7 @@ HWY_INLINE Vec1<T> GatherOffset(Sisd<T>
template <typename T, typename Index>
HWY_INLINE Vec1<T> GatherIndex(Sisd<T> d, const T* HWY_RESTRICT base,
const Vec1<Index> index) {
- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
return Load(d, base + index.raw);
@@ -833,15 +909,20 @@ HWY_INLINE Vec1<ToT> DemoteTo(Sisd<ToT>
static HWY_INLINE Vec1<float> PromoteTo(Sisd<float> /* tag */,
const Vec1<float16_t> v) {
uint16_t bits16;
CopyBytes<2>(&v.raw, &bits16);
+ const uint16_t bits16 = v.raw.bits;
const uint32_t sign = bits16 >> 15;
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
const uint32_t mantissa = bits16 & 0x3FF;
// Subnormal or zero
if (biased_exp == 0) {
- const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
+ const float subnormal =
+ (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
return Vec1<float>(sign ? -subnormal : subnormal);
@@ -867,8 +948,12 @@ static HWY_INLINE Vec1<float16_t> Demote
// Tiny or zero => zero.
Vec1<float16_t> out;
if (exp < -24) {
- bits32 = 0;
- CopyBytes<2>(&bits32, &out);
+ const uint16_t zero = 0;
+ CopyBytes<2>(&zero, &out.raw);
+ out.raw.bits = 0;
return out;
@@ -890,7 +975,12 @@ static HWY_INLINE Vec1<float16_t> Demote
HWY_DASSERT(mantissa16 < 1024);
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
HWY_DASSERT(bits16 < 0x10000);
- CopyBytes<2>(&bits16, &out);
+ const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
+ CopyBytes<2>(&narrowed, &out.raw);
+ out.raw.bits = static_cast<uint16_t>(bits16);
return out;
@@ -919,18 +1009,6 @@ HWY_INLINE Vec1<uint8_t> U8FromU32(const
return DemoteTo(Sisd<uint8_t>(), v);
-// Approximation of round-to-nearest for numbers representable as int32_t.
-HWY_INLINE Vec1<int32_t> NearestInt(const Vec1<float> v) {
- const float f = v.raw;
- if (std::isinf(f) ||
- std::fabs(f) > static_cast<float>(LimitsMax<int32_t>())) {
- return Vec1<int32_t>(std::signbit(f) ? LimitsMin<int32_t>()
- : LimitsMax<int32_t>());
- }
- const float bias = f < 0.0f ? -0.5f : 0.5f;
- return Vec1<int32_t>(static_cast<int>(f + bias));
// ================================================== SWIZZLE
// Unsupported: Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle*,
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 2021-06-02 10:56:05.224904336 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -31,11 +31,6 @@
#undef HWY_ALIGN
#undef HWY_LANES
#undef HWY_CAP_FLOAT64
#undef HWY_CAP_GE256
@@ -53,11 +48,6 @@
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
-#define HWY_GATHER_LANES(T) 1
-#define HWY_COMPARE64_LANES 2
-#define HWY_MINMAX64_LANES 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
@@ -73,11 +63,6 @@
#define HWY_ALIGN alignas(32)
#define HWY_LANES(T) (32 / sizeof(T))
-#define HWY_COMPARE64_LANES 4
-#define HWY_MINMAX64_LANES 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
@@ -96,11 +81,6 @@
#define HWY_ALIGN alignas(64)
#define HWY_LANES(T) (64 / sizeof(T))
-#define HWY_COMPARE64_LANES 8
-#define HWY_MINMAX64_LANES 8
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
@@ -121,11 +101,6 @@
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
-#define HWY_GATHER_LANES(T) 1
-#define HWY_COMPARE64_LANES 2
-#define HWY_MINMAX64_LANES 2
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
@@ -142,19 +117,14 @@
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
-#define HWY_GATHER_LANES(T) 1
-#define HWY_MINMAX64_LANES 2
-#define HWY_COMPARE64_LANES 2
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
-#ifdef __arm__
-#define HWY_CAP_FLOAT64 0
#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_FLOAT64 0
@@ -162,17 +132,34 @@
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+// SVE[2]
+// SVE only requires lane alignment, not natural alignment of the entire vector.
+#define HWY_ALIGN alignas(8)
+// Upper bound, not the actual lane count!
+#define HWY_LANES(T) (256 / sizeof(T))
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+// HWY_TARGET_STR remains undefined - TODO(janwas): attribute for SVE?
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
-#define HWY_GATHER_LANES(T) 1
-#define HWY_COMPARE64_LANES 2
-#define HWY_MINMAX64_LANES 2
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
@@ -194,11 +181,6 @@
// mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h.
#define HWY_LANES(T) (4096 / sizeof(T))
-// Cannot use HWY_LANES/sizeof here because these are used in an #if.
-#define HWY_COMPARE64_LANES 256
-#define HWY_MINMAX64_LANES 256
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
@@ -215,13 +197,9 @@
#define HWY_ALIGN
+// For internal use only; use Lanes(d) instead.
#define HWY_LANES(T) 1
-#define HWY_GATHER_LANES(T) 1
-#define HWY_COMPARE64_LANES 1
-#define HWY_MINMAX64_LANES 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
@@ -265,3 +243,7 @@
#define HWY_ATTR
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 2021-06-02 10:56:05.235904392 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -14,6 +14,8 @@
// Per-target definitions shared by ops/*.h and user code.
+#include <cmath>
// Separate header because foreach_target.h re-enables its include guard.
#include "hwy/ops/set_macros-inl.h"
@@ -106,7 +108,7 @@ HWY_INLINE HWY_MAYBE_UNUSED constexpr si
// Targets with non-constexpr Lanes define this themselves.
// (Potentially) non-constant actual size of the vector at runtime, subject to
// the limit imposed by the Simd. Useful for advancing loop counters.
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 2021-06-02 10:56:05.242904427 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -19,8 +19,6 @@
#include <stdint.h>
#include <wasm_simd128.h>
-#include <cmath>
#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
@@ -177,6 +175,16 @@ HWY_API Vec128<T, N> Undefined(Simd<T, N
+// Returns a vector with lane i=[0, N) set to "first" + i.
+template <typename T, size_t N, typename T2>
+Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
+ HWY_ALIGN T lanes[16 / sizeof(T)];
+ for (size_t i = 0; i < 16 / sizeof(T); ++i) {
+ lanes[i] = static_cast<T>(first + static_cast<T2>(i));
+ }
+ return Load(d, lanes);
// ================================================== ARITHMETIC
// ------------------------------ Addition
@@ -273,24 +281,24 @@ HWY_API Vec128<float, N> operator-(const
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
- return Vec128<uint8_t, N>{wasm_u8x16_add_saturate(a.raw, b.raw)};
+ return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
- return Vec128<uint16_t, N>{wasm_u16x8_add_saturate(a.raw, b.raw)};
+ return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
- return Vec128<int8_t, N>{wasm_i8x16_add_saturate(a.raw, b.raw)};
+ return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
- return Vec128<int16_t, N>{wasm_i16x8_add_saturate(a.raw, b.raw)};
+ return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
// ------------------------------ Saturating subtraction
@@ -301,24 +309,24 @@ HWY_API Vec128<int16_t, N> SaturatedAdd(
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
- return Vec128<uint8_t, N>{wasm_u8x16_sub_saturate(a.raw, b.raw)};
+ return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
- return Vec128<uint16_t, N>{wasm_u16x8_sub_saturate(a.raw, b.raw)};
+ return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
- return Vec128<int8_t, N>{wasm_i8x16_sub_saturate(a.raw, b.raw)};
+ return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
- return Vec128<int16_t, N>{wasm_i16x8_sub_saturate(a.raw, b.raw)};
+ return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
// ------------------------------ Average
@@ -352,6 +360,12 @@ template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
+template <size_t N>
+HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
+ // TODO(janwas): use wasm_i64x2_abs when available
+ const Vec128<int64_t, N> mask = wasm_i64x2_shr(v.raw, 63);
+ return ((v ^ mask) - mask);
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
@@ -396,9 +410,38 @@ HWY_API Vec128<int32_t, N> ShiftRight(co
return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
+// 8-bit
+template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
+ const Simd<T, N> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
+ return kBits == 1
+ ? (v + v)
+ : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
+template <int kBits, size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
+ const Simd<uint8_t, N> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<uint8_t, N> shifted{
+ ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
+ return shifted & Set(d8, 0xFF >> kBits);
+template <int kBits, size_t N>
+HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
+ const Simd<int8_t, N> di;
+ const Simd<uint8_t, N> du;
+ const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
+ return (shifted ^ shifted_sign) - shifted_sign;
// ------------------------------ Shift lanes by same variable #bits
-// Unsigned (no u8)
+// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
const int bits) {
@@ -420,7 +463,7 @@ HWY_API Vec128<uint32_t, N> ShiftRightSa
return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
-// Signed (no i8)
+// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
const int bits) {
@@ -442,6 +485,35 @@ HWY_API Vec128<int32_t, N> ShiftRightSam
return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
+// 8-bit
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
+ const Simd<T, N> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<T, N> shifted{
+ ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
+ return shifted & Set(d8, (0xFF << bits) & 0xFF);
+template <size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
+ const int bits) {
+ const Simd<uint8_t, N> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<uint8_t, N> shifted{
+ ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
+ return shifted & Set(d8, 0xFF >> bits);
+template <size_t N>
+HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
+ const Simd<int8_t, N> di;
+ const Simd<uint8_t, N> du;
+ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+ return (shifted ^ shifted_sign) - shifted_sign;
// ------------------------------ Minimum
// Unsigned
@@ -607,29 +679,29 @@ template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
- const auto al = wasm_i32x4_widen_low_u16x8(a.raw);
- const auto ah = wasm_i32x4_widen_high_u16x8(a.raw);
- const auto bl = wasm_i32x4_widen_low_u16x8(b.raw);
- const auto bh = wasm_i32x4_widen_high_u16x8(b.raw);
+ const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
+ const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
+ const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
+ const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
const auto l = wasm_i32x4_mul(al, bl);
const auto h = wasm_i32x4_mul(ah, bh);
// TODO(eustas): shift-right + narrow?
return Vec128<uint16_t, N>{
- wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
+ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
- const auto al = wasm_i32x4_widen_low_i16x8(a.raw);
- const auto ah = wasm_i32x4_widen_high_i16x8(a.raw);
- const auto bl = wasm_i32x4_widen_low_i16x8(b.raw);
- const auto bh = wasm_i32x4_widen_high_i16x8(b.raw);
+ const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
+ const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
+ const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
+ const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
const auto l = wasm_i32x4_mul(al, bl);
const auto h = wasm_i32x4_mul(ah, bh);
// TODO(eustas): shift-right + narrow?
return Vec128<int16_t, N>{
- wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
+ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
@@ -765,53 +837,76 @@ HWY_API Vec128<float, N> ApproximateReci
// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
- // TODO(eustas): is it f32x4.nearest? (not implemented yet)
- alignas(16) float input[4];
- alignas(16) float output[4];
- wasm_v128_store(input, v.raw);
- for (size_t i = 0; i < 4; ++i) {
- output[i] = std::nearbyint(input[i]);
- }
- return Vec128<float, N>{wasm_v128_load(output)};
+ // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not
+ // yet have an instruction for that (f32x4.nearest is not implemented). We
+ // rely on rounding after addition with a large value such that no mantissa
+ // bits remain (assuming the current mode is nearest-even). We may need a
+ // compiler flag for precise floating-point to prevent "optimizing" this out.
+ const Simd<float, N> df;
+ const auto max = Set(df, MantissaEnd<float>());
+ const auto large = CopySignToAbs(max, v);
+ const auto added = large + v;
+ const auto rounded = added - large;
+ // Keep original if NaN or the magnitude is large (already an int).
+ return IfThenElse(Abs(v) < max, rounded, v);
+namespace detail {
+// Truncating to integer and converting back to float is correct except when the
+// input magnitude is large, in which case the input was already an integer
+// (because mantissa >> exponent is zero).
+template <size_t N>
+HWY_API Mask128<float, N> UseInt(const Vec128<float, N> v) {
+ return Abs(v) < Set(Simd<float, N>(), MantissaEnd<float>());
+} // namespace detail
// Toward zero, aka truncate
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
// TODO(eustas): is it f32x4.trunc? (not implemented yet)
- alignas(16) float input[4];
- alignas(16) float output[4];
- wasm_v128_store(input, v.raw);
- for (size_t i = 0; i < 4; ++i) {
- output[i] = std::trunc(input[i]);
- }
- return Vec128<float, N>{wasm_v128_load(output)};
+ const Simd<float, N> df;
+ const RebindToSigned<decltype(df)> di;
+ const auto integer = ConvertTo(di, v); // round toward 0
+ const auto int_f = ConvertTo(df, integer);
+ return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
// Toward +infinity, aka ceiling
template <size_t N>
-HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
+HWY_INLINE Vec128<float, N> Ceil(const Vec128<float, N> v) {
// TODO(eustas): is it f32x4.ceil? (not implemented yet)
- alignas(16) float input[4];
- alignas(16) float output[4];
- wasm_v128_store(input, v.raw);
- for (size_t i = 0; i < 4; ++i) {
- output[i] = std::ceil(input[i]);
- }
- return Vec128<float, N>{wasm_v128_load(output)};
+ const Simd<float, N> df;
+ const RebindToSigned<decltype(df)> di;
+ const auto integer = ConvertTo(di, v); // round toward 0
+ const auto int_f = ConvertTo(df, integer);
+ // Truncating a positive non-integer ends up smaller; if so, add 1.
+ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
+ return IfThenElse(detail::UseInt(v), int_f - neg1, v);
// Toward -infinity, aka floor
template <size_t N>
-HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
+HWY_INLINE Vec128<float, N> Floor(const Vec128<float, N> v) {
// TODO(eustas): is it f32x4.floor? (not implemented yet)
- alignas(16) float input[4];
- alignas(16) float output[4];
- wasm_v128_store(input, v.raw);
- for (size_t i = 0; i < 4; ++i) {
- output[i] = std::floor(input[i]);
- }
- return Vec128<float, N>{wasm_v128_load(output)};
+ const Simd<float, N> df;
+ const RebindToSigned<decltype(df)> di;
+ const auto integer = ConvertTo(di, v); // round toward 0
+ const auto int_f = ConvertTo(df, integer);
+ // Truncating a negative non-integer ends up larger; if so, subtract 1.
+ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
+ return IfThenElse(detail::UseInt(v), int_f + neg1, v);
// ================================================== COMPARE
@@ -902,12 +997,12 @@ HWY_API Mask128<int64_t, N> operator>(co
// Otherwise, the lower half decides.
const auto m_eq = a32 == b32;
- const auto lo_in_hi = wasm_v32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
+ const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
const auto lo_gt = And(m_eq, lo_in_hi);
const auto gt = Or(lo_gt, m_gt);
// Copy result in upper 32 bits to lower 32 bits.
- return Mask128<int64_t, N>{wasm_v32x4_shuffle(gt, gt, 3, 3, 1, 1)};
+ return Mask128<int64_t, N>{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)};
template <size_t N>
@@ -935,6 +1030,14 @@ HWY_API Mask128<float, N> operator>=(con
return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
+// ------------------------------ FirstN (Iota, Lt)
+template <typename T, size_t N>
+HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
+ const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
// ================================================== LOGICAL
// ------------------------------ Not
@@ -1015,7 +1118,7 @@ HWY_API Vec128<T, N> BroadcastSignBit(co
template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
- return VecFromMask(v < Zero(Simd<int8_t, N>()));
+ return VecFromMask(Simd<int8_t, N>(), v < Zero(Simd<int8_t, N>()));
// ------------------------------ Mask
@@ -1278,26 +1381,73 @@ HWY_API void Stream(Vec128<T, N> v, Simd
wasm_v128_store(aligned, v.raw);
-// ------------------------------ Gather
+// ------------------------------ Scatter (Store)
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+ const Vec128<Offset, N> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+ alignas(16) T lanes[N];
+ Store(v, d, lanes);
+ alignas(16) Offset offset_lanes[N];
+ Store(offset, Simd<Offset, N>(), offset_lanes);
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
+ for (size_t i = 0; i < N; ++i) {
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
+ }
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+ const Vec128<Index, N> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+ alignas(16) T lanes[N];
+ Store(v, d, lanes);
+ alignas(16) Index index_lanes[N];
+ Store(index, Simd<Index, N>(), index_lanes);
+ for (size_t i = 0; i < N; ++i) {
+ base[index_lanes[i]] = lanes[i];
+ }
+// ------------------------------ Gather (Load/Store)
template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
const T* HWY_RESTRICT base,
const Vec128<Offset, N> offset) {
- static_assert(N == 1, "Wasm does not support full gather");
- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
- const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
- T val;
- CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
- return Set(d, val);
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+ alignas(16) Offset offset_lanes[N];
+ Store(offset, Simd<Offset, N>(), offset_lanes);
+ alignas(16) T lanes[N];
+ const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
+ for (size_t i = 0; i < N; ++i) {
+ CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
+ }
+ return Load(d, lanes);
template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
const Vec128<Index, N> index) {
- static_assert(N == 1, "Wasm does not support full gather");
- static_assert(sizeof(T) == sizeof(Index), "T must match Index");
- return Set(d, base[GetLane(index)]);
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+ alignas(16) Index index_lanes[N];
+ Store(index, Simd<Index, N>(), index_lanes);
+ alignas(16) T lanes[N];
+ for (size_t i = 0; i < N; ++i) {
+ lanes[i] = base[index_lanes[i]];
+ }
+ return Load(d, lanes);
// ================================================== SWIZZLE
@@ -1346,12 +1496,12 @@ HWY_API Vec128<T, N / 2> LowerHalf(Vec12
template <typename T>
HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Vec128<T> v) {
// TODO(eustas): use swizzle?
- return Vec128<T, 8 / sizeof(T)>{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
+ return Vec128<T, 8 / sizeof(T)>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
template <>
HWY_INLINE Vec128<float, 2> UpperHalf(Vec128<float> v) {
// TODO(eustas): use swizzle?
- return Vec128<float, 2>{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
+ return Vec128<float, 2>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
// ------------------------------ Shift vector by constant #bytes
@@ -1366,64 +1516,64 @@ HWY_API Vec128<T> ShiftLeftBytes(const V
return v;
case 1:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14)};
case 2:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13)};
case 3:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12)};
case 4:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11)};
case 5:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10)};
case 6:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
case 7:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
case 8:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
case 9:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};
case 10:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};
case 11:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};
case 12:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};
case 13:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};
case 14:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 0,
case 15:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
@@ -1447,69 +1597,69 @@ HWY_API Vec128<T> ShiftRightBytes(const
return v;
case 1:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16)};
case 2:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 16)};
case 3:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 16, 16)};
case 4:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 16, 16, 16)};
case 5:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 16, 16, 16, 16)};
case 6:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 16, 16, 16, 16, 16)};
case 7:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 16, 16, 16, 16, 16, 16)};
case 8:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14,
15, 16, 16, 16, 16, 16, 16, 16, 16)};
case 9:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14,
15, 16, 16, 16, 16, 16, 16, 16, 16,
case 10:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15,
16, 16, 16, 16, 16, 16, 16, 16, 16,
case 11:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
case 12:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
case 13:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
case 14:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
case 15:
- return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
@@ -1535,72 +1685,72 @@ HWY_API Vec128<T> CombineShiftRightBytes
return lo;
case 1:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16)};
case 2:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17)};
case 3:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18)};
case 4:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19)};
case 5:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20)};
case 6:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20,
case 7:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21,
case 8:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22,
case 9:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23,
case 10:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25)};
case 11:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 26)};
case 12:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25,
26, 27)};
case 13:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28)};
case 14:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29)};
case 15:
- return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
+ return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30)};
@@ -1613,28 +1763,28 @@ HWY_API Vec128<T> CombineShiftRightBytes
template <int kLane, size_t N>
HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
- return Vec128<uint16_t, N>{wasm_v16x8_shuffle(
+ return Vec128<uint16_t, N>{wasm_i16x8_shuffle(
v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
template <int kLane, size_t N>
HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<uint32_t, N>{
- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
+ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
// Signed
template <int kLane, size_t N>
HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
- return Vec128<int16_t, N>{wasm_v16x8_shuffle(
+ return Vec128<int16_t, N>{wasm_i16x8_shuffle(
v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
template <int kLane, size_t N>
HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<int32_t, N>{
- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
+ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
// Float
@@ -1642,7 +1792,7 @@ template <int kLane, size_t N>
HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<float, N>{
- wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
+ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
// ------------------------------ Shuffle bytes with variable indices
@@ -1652,16 +1802,23 @@ HWY_API Vec128<float, N> Broadcast(const
template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupBytes(const Vec128<T, N> bytes,
const Vec128<T, N> from) {
- // TODO(eustas): use swizzle? (shuffle does not work for variable indices)
+// Not yet available in all engines, see
+// V8 implementation of this had a bug, fixed on 2021-04-03:
+#if 0
+ return Vec128<T, N>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
alignas(16) uint8_t control[16];
alignas(16) uint8_t input[16];
alignas(16) uint8_t output[16];
wasm_v128_store(control, from.raw);
wasm_v128_store(input, bytes.raw);
for (size_t i = 0; i < 16; ++i) {
- output[i] = input[control[i]];
+ output[i] = control[i] < 16 ? input[control[i]] : 0;
return Vec128<T, N>{wasm_v128_load(output)};
// ------------------------------ Hard-coded shuffles
@@ -1673,101 +1830,102 @@ HWY_API Vec128<T, N> TableLookupBytes(co
// Swap 32-bit halves in 64-bit halves.
HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
- return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
- return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
+ return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
- return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
+ return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
// Swap 64-bit halves
HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
- return Vec128<uint32_t>{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)};
+ return Vec128<uint32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
- return Vec128<int32_t>{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)};
+ return Vec128<int32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
- return Vec128<float>{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)};
+ return Vec128<float>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
// Rotate right 32 bits
HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
- return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
- return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
+ return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
- return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
+ return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
// Rotate left 32 bits
HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
- return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
- return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
+ return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
- return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
+ return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
// Reverse
HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
- return Vec128<uint32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
- return Vec128<int32_t>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
+ return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
- return Vec128<float>{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
+ return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
// ------------------------------ TableLookupLanes
// Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T>
+template <typename T, size_t N>
struct Indices128 {
__v128_u raw;
-template <typename T>
-HWY_API Indices128<T> SetTableIndices(Full128<T>, const int32_t* idx) {
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
- const size_t N = 16 / sizeof(T);
for (size_t i = 0; i < N; ++i) {
HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
- const Full128<uint8_t> d8;
- alignas(16) uint8_t control[16]; // = Lanes()
- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
- const size_t idx_lane = idx_byte / sizeof(T);
- const size_t mod = idx_byte % sizeof(T);
- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod;
+ const Repartition<uint8_t, decltype(d)> d8;
+ alignas(16) uint8_t control[16] = {0};
+ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ control[idx_lane * sizeof(T) + idx_byte] =
+ static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);
+ }
- return Indices128<T>{Load(d8, control).raw};
+ return Indices128<T, N>{Load(d8, control).raw};
-HWY_API Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
- const Indices128<uint32_t> idx) {
- return TableLookupBytes(v, Vec128<uint32_t>{idx.raw});
+template <size_t N>
+HWY_API Vec128<uint32_t, N> TableLookupLanes(
+ const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
+ return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});
-HWY_API Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
- const Indices128<int32_t> idx) {
- return TableLookupBytes(v, Vec128<int32_t>{idx.raw});
+template <size_t N>
+HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
+ const Indices128<int32_t, N> idx) {
+ return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});
-HWY_API Vec128<float> TableLookupLanes(const Vec128<float> v,
- const Indices128<float> idx) {
- const Full128<int32_t> di;
- const Full128<float> df;
+template <size_t N>
+HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
+ const Indices128<float, N> idx) {
+ const Simd<int32_t, N> di;
+ const Simd<float, N> df;
return BitCast(df,
- TableLookupBytes(BitCast(di, v), Vec128<int32_t>{idx.raw}));
+ TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
// ------------------------------ Zip lanes
@@ -1778,33 +1936,33 @@ HWY_API Vec128<float> TableLookupLanes(c
template <size_t N>
HWY_API Vec128<uint16_t, (N + 1) / 2> ZipLower(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
- return Vec128<uint16_t, (N + 1) / 2>{wasm_v8x16_shuffle(
+ return Vec128<uint16_t, (N + 1) / 2>{wasm_i8x16_shuffle(
a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
template <size_t N>
HWY_API Vec128<uint32_t, (N + 1) / 2> ZipLower(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint32_t, (N + 1) / 2>{
- wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
+ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
template <size_t N>
HWY_API Vec128<int16_t, (N + 1) / 2> ZipLower(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
- return Vec128<int16_t, (N + 1) / 2>{wasm_v8x16_shuffle(
+ return Vec128<int16_t, (N + 1) / 2>{wasm_i8x16_shuffle(
a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
template <size_t N>
HWY_API Vec128<int32_t, (N + 1) / 2> ZipLower(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int32_t, (N + 1) / 2>{
- wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
+ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
template <size_t N>
HWY_API Vec128<uint16_t, N / 2> ZipUpper(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
- return Vec128<uint16_t, N / 2>{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
+ return Vec128<uint16_t, N / 2>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
10, 26, 11, 27, 12, 28, 13,
29, 14, 30, 15, 31)};
@@ -1812,13 +1970,13 @@ template <size_t N>
HWY_API Vec128<uint32_t, N / 2> ZipUpper(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint32_t, N / 2>{
- wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
+ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
template <size_t N>
HWY_API Vec128<int16_t, N / 2> ZipUpper(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
- return Vec128<int16_t, N / 2>{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
+ return Vec128<int16_t, N / 2>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25,
10, 26, 11, 27, 12, 28, 13,
29, 14, 30, 15, 31)};
@@ -1826,7 +1984,7 @@ template <size_t N>
HWY_API Vec128<int32_t, N / 2> ZipUpper(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int32_t, N / 2>{
- wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
+ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
// ------------------------------ Interleave lanes
@@ -1842,17 +2000,17 @@ HWY_API Vec128<T> InterleaveLower(const
template <>
HWY_INLINE Vec128<uint32_t> InterleaveLower<uint32_t>(
const Vec128<uint32_t> a, const Vec128<uint32_t> b) {
- return Vec128<uint32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
template <>
HWY_INLINE Vec128<int32_t> InterleaveLower<int32_t>(const Vec128<int32_t> a,
const Vec128<int32_t> b) {
- return Vec128<int32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
+ return Vec128<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
template <>
HWY_INLINE Vec128<float> InterleaveLower<float>(const Vec128<float> a,
const Vec128<float> b) {
- return Vec128<float>{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
+ return Vec128<float>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
template <typename T>
@@ -1862,17 +2020,17 @@ HWY_API Vec128<T> InterleaveUpper(const
template <>
HWY_INLINE Vec128<uint32_t> InterleaveUpper<uint32_t>(
const Vec128<uint32_t> a, const Vec128<uint32_t> b) {
- return Vec128<uint32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
+ return Vec128<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
template <>
HWY_INLINE Vec128<int32_t> InterleaveUpper<int32_t>(const Vec128<int32_t> a,
const Vec128<int32_t> b) {
- return Vec128<int32_t>{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
+ return Vec128<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
template <>
HWY_INLINE Vec128<float> InterleaveUpper<float>(const Vec128<float> a,
const Vec128<float> b) {
- return Vec128<float>{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
+ return Vec128<float>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
// ------------------------------ Blocks
@@ -1880,13 +2038,13 @@ HWY_INLINE Vec128<float> InterleaveUpper
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <typename T>
HWY_API Vec128<T> ConcatLowerLower(const Vec128<T> hi, const Vec128<T> lo) {
- return Vec128<T>{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 2)};
+ return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
template <typename T>
HWY_API Vec128<T> ConcatUpperUpper(const Vec128<T> hi, const Vec128<T> lo) {
- return Vec128<T>{wasm_v64x2_shuffle(lo.raw, hi.raw, 1, 3)};
+ return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
@@ -1898,7 +2056,7 @@ HWY_API Vec128<T> ConcatLowerUpper(const
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
template <typename T>
HWY_API Vec128<T> ConcatUpperLower(const Vec128<T> hi, const Vec128<T> lo) {
- return Vec128<T>{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 3)};
+ return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 3)};
// ------------------------------ Odd/even lanes
@@ -1917,12 +2075,12 @@ HWY_API Vec128<T> odd_even_impl(hwy::Siz
template <typename T>
HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<2> /* tag */, const Vec128<T> a,
const Vec128<T> b) {
- return Vec128<T>{wasm_v16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
+ return Vec128<T>{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
template <typename T>
HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<4> /* tag */, const Vec128<T> a,
const Vec128<T> b) {
- return Vec128<T>{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
+ return Vec128<T>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
// TODO(eustas): implement
// template <typename T>
@@ -1939,7 +2097,7 @@ HWY_API Vec128<T> OddEven(const Vec128<T
template <>
HWY_INLINE Vec128<float> OddEven<float>(const Vec128<float> a,
const Vec128<float> b) {
- return Vec128<float>{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
+ return Vec128<float>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
// ================================================== CONVERT
@@ -1950,52 +2108,52 @@ HWY_INLINE Vec128<float> OddEven<float>(
template <size_t N>
HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
const Vec128<uint8_t, N> v) {
- return Vec128<uint16_t, N>{wasm_i16x8_widen_low_u8x16(v.raw)};
+ return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint32_t, N>{
- wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))};
+ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
const Vec128<uint8_t, N> v) {
- return Vec128<int16_t, N>{wasm_i16x8_widen_low_u8x16(v.raw)};
+ return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int32_t, N>{
- wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))};
+ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
const Vec128<uint16_t, N> v) {
- return Vec128<uint32_t, N>{wasm_i32x4_widen_low_u16x8(v.raw)};
+ return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
const Vec128<uint16_t, N> v) {
- return Vec128<int32_t, N>{wasm_i32x4_widen_low_u16x8(v.raw)};
+ return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
// Signed: replicate sign bit.
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
const Vec128<int8_t, N> v) {
- return Vec128<int16_t, N>{wasm_i16x8_widen_low_i8x16(v.raw)};
+ return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int32_t, N>{
- wasm_i32x4_widen_low_i16x8(wasm_i16x8_widen_low_i8x16(v.raw))};
+ wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
const Vec128<int16_t, N> v) {
- return Vec128<int32_t, N>{wasm_i32x4_widen_low_i16x8(v.raw)};
+ return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
template <size_t N>
@@ -2122,7 +2280,7 @@ HWY_API Vec128<uint8_t, N> U8FromU32(con
wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
-// ------------------------------ Convert i32 <=> f32
+// ------------------------------ Convert i32 <=> f32 (Round)
template <size_t N>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
@@ -2133,33 +2291,16 @@ HWY_API Vec128<float, N> ConvertTo(Simd<
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N> /* tag */,
const Vec128<float, N> v) {
- return Vec128<int32_t, N>{wasm_i32x4_trunc_saturate_f32x4(v.raw)};
+ return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
- const __f32x4 c00 = wasm_f32x4_splat(0.0f);
- const __f32x4 corr = wasm_f32x4_convert_i32x4(wasm_f32x4_le(v.raw, c00));
- const __f32x4 c05 = wasm_f32x4_splat(0.5f);
- // +0.5 for non-negative lane, -0.5 for other.
- const __f32x4 delta = wasm_f32x4_add(c05, corr);
- // Shift input by 0.5 away from 0.
- const __f32x4 fixed = wasm_f32x4_add(v.raw, delta);
- return Vec128<int32_t, N>{wasm_i32x4_trunc_saturate_f32x4(fixed)};
+ return ConvertTo(Simd<int32_t, N>(), Round(v));
// ================================================== MISC
-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, size_t N, typename T2>
-Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
- HWY_ALIGN T lanes[16 / sizeof(T)];
- for (size_t i = 0; i < 16 / sizeof(T); ++i) {
- lanes[i] = static_cast<T>(first + static_cast<T2>(i));
- }
- return Load(d, lanes);
// ------------------------------ Mask
namespace detail {
@@ -2167,20 +2308,13 @@ namespace detail {
template <typename T, size_t N>
HWY_API uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
const Mask128<T, N> mask) {
- const __i8x16 slice =
- wasm_i8x16_make(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8);
- // Each u32 lane has byte[i] = (1 << i) or 0.
- const __i8x16 v8_4_2_1 = wasm_v128_and(mask.raw, slice);
- // OR together 4 bytes of each u32 to get the 4 bits.
- const __i16x8 v2_1_z_z = wasm_i32x4_shl(v8_4_2_1, 16);
- const __i16x8 v82_41_2_1 = wasm_v128_or(v8_4_2_1, v2_1_z_z);
- const __i16x8 v41_2_1_0 = wasm_i32x4_shl(v82_41_2_1, 8);
- const __i16x8 v8421_421_21_10 = wasm_v128_or(v82_41_2_1, v41_2_1_0);
- const __i16x8 nibble_per_u32 = wasm_i32x4_shr(v8421_421_21_10, 24);
- // Assemble four nibbles into 16 bits.
- alignas(16) uint32_t lanes[4];
- wasm_v128_store(lanes, nibble_per_u32);
- return lanes[0] | (lanes[1] << 4) | (lanes[2] << 8) | (lanes[3] << 12);
+ alignas(16) uint64_t lanes[2];
+ wasm_v128_store(lanes, mask.raw);
+ constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
+ const uint64_t lo = ((lanes[0] * kMagic) >> 56);
+ const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
+ return (hi + lo);
template <typename T, size_t N>
@@ -2241,8 +2375,7 @@ constexpr __i8x16 BytesAbove() {
template <typename T, size_t N>
HWY_API uint64_t BitsFromMask(const Mask128<T, N> mask) {
- return OnlyActive<T, N>(
- BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
+ return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
template <typename T>
@@ -2290,7 +2423,15 @@ HWY_API size_t CountTrue(const Mask128<T
// Full vector, type-independent
template <typename T>
HWY_API bool AllFalse(const Mask128<T> m) {
- return !wasm_i8x16_any_true(m.raw);
+#if 0
+ // Casting followed by wasm_i8x16_any_true results in wasm error:
+ // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
+ const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(Full128<T>(), m));
+ return !wasm_i8x16_any_true(v8.raw);
+ return (wasm_i64x2_extract_lane(m.raw, 0) |
+ wasm_i64x2_extract_lane(m.raw, 1)) == 0;
// Full vector, type-dependent
@@ -2336,6 +2477,139 @@ HWY_API bool AllTrue(const Mask128<T, N>
namespace detail {
template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 256);
+ const Simd<T, N> d;
+ const Rebind<uint8_t, decltype(d)> d8;
+ const Simd<uint16_t, N> du;
+ // We need byte indices for TableLookupBytes (one vector's worth for each of
+ // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
+ // can instead store lane indices and convert to byte indices (2*lane + 0..1),
+ // with the doubling baked into the table. Unpacking nibbles is likely more
+ // costly than the higher cache footprint from storing bytes.
+ alignas(16) constexpr uint8_t table[256 * 8] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
+ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
+ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
+ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
+ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
+ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
+ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
+ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
+ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
+ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
+ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
+ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
+ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
+ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
+ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
+ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
+ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
+ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
+ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
+ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
+ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
+ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
+ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
+ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
+ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
+ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
+ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
+ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
+ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
+ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
+ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
+ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
+ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
+ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
+ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
+ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
+ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
+ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
+ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
+ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
+ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
+ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
+ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
+ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
+ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
+ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
+ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
+ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
+ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
+ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
+ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
+ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
+ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
+ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
+ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
+ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
+ 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
+ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
+ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
+ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
+ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
+ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
+ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
+ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
+ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
+ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
+ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
+ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
+ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
+ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
+ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
+ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
+ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
+ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
+ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
+ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
+ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
+ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
+ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
+ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
+ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
+ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
+ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
+ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
+ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
+ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
+ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
+ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
+ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
+ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
+ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
+ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
+ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
+ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
+ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
+ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
+ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
+ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
+ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
+ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
+ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
+ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
+ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
+ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
+ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
+ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
+ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
+ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
+ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
+ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
+ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
+ const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+ return BitCast(d, pairs + Set(du, 0x0100));
+template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 16);
@@ -2383,57 +2657,37 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(
-// Helper function called by both Compress and CompressStore - avoids a
+// Helper functions called by both Compress and CompressStore - avoids a
// redundant BitsFromMask in the latter.
-template <size_t N>
-HWY_API Vec128<uint32_t, N> Compress(Vec128<uint32_t, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx32x4FromBits<uint32_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
+ const uint64_t mask_bits) {
+ const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
+ using D = Simd<T, N>;
+ const RebindToSigned<D> di;
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
-template <size_t N>
-HWY_API Vec128<int32_t, N> Compress(Vec128<int32_t, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
+ const uint64_t mask_bits) {
+ const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
+ using D = Simd<T, N>;
+ const RebindToSigned<D> di;
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Compress(Vec128<uint64_t, N> v,
+template <typename T, size_t N>
+HWY_API Vec128<uint64_t, N> Compress(hwy::SizeTag<8> /*tag*/,
+ Vec128<uint64_t, N> v,
const uint64_t mask_bits) {
const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
-template <size_t N>
-HWY_API Vec128<int64_t, N> Compress(Vec128<int64_t, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
-template <size_t N>
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
- const Simd<float, N> df;
- const Simd<int32_t, N> di;
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
-template <size_t N>
-HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
- const Simd<double, N> df;
- const Simd<int64_t, N> di;
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
+ using D = Simd<T, N>;
+ const RebindToSigned<D> di;
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
@@ -2442,7 +2696,8 @@ HWY_API Vec128<double, N> Compress(Vec12
template <typename T, size_t N>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
- return detail::Compress(v, detail::BitsFromMask(mask));
+ return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
+ detail::BitsFromMask(mask));
// ------------------------------ CompressStore
@@ -2451,63 +2706,284 @@ template <typename T, size_t N>
HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
Simd<T, N> d, T* HWY_RESTRICT aligned) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
- Store(detail::Compress(v, mask_bits), d, aligned);
+ Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
return PopCount(mask_bits);
+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
+// TableLookupBytes)
+// 128 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t> a, const Vec128<uint8_t> b,
+ const Vec128<uint8_t> c, Full128<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ const auto k5 = Set(d, 5);
+ const auto k6 = Set(d, 6);
+ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+ const auto shuf_r0 = Load(d, tbl_r0);
+ const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+ const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
+ const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
+ const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
+ const auto int0 = r0 | g0 | b0;
+ StoreU(int0, d, unaligned + 0 * 16);
+ // Second vector: g10,r10, bgr[9:6], b5,g5
+ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
+ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
+ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
+ const auto r1 = TableLookupBytes(a, shuf_r1);
+ const auto g1 = TableLookupBytes(b, shuf_g1);
+ const auto b1 = TableLookupBytes(c, shuf_b1);
+ const auto int1 = r1 | g1 | b1;
+ StoreU(int1, d, unaligned + 1 * 16);
+ // Third vector: bgr[15:11], b10
+ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
+ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
+ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
+ const auto r2 = TableLookupBytes(a, shuf_r2);
+ const auto g2 = TableLookupBytes(b, shuf_g2);
+ const auto b2 = TableLookupBytes(c, shuf_b2);
+ const auto int2 = r2 | g2 | b2;
+ StoreU(int2, d, unaligned + 2 * 16);
+// 64 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> a,
+ const Vec128<uint8_t, 8> b,
+ const Vec128<uint8_t, 8> c, Simd<uint8_t, 8> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // Use full vectors for the shuffles and first result.
+ const Full128<uint8_t> d_full;
+ const auto k5 = Set(d_full, 5);
+ const auto k6 = Set(d_full, 6);
+ const Vec128<uint8_t> full_a{a.raw};
+ const Vec128<uint8_t> full_b{b.raw};
+ const Vec128<uint8_t> full_c{c.raw};
+ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+ const auto shuf_r0 = Load(d_full, tbl_r0);
+ const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+ const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
+ const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
+ const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
+ const auto int0 = r0 | g0 | b0;
+ StoreU(int0, d_full, unaligned + 0 * 16);
+ // Second (HALF) vector: bgr[7:6], b5,g5
+ const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
+ const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
+ const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
+ const auto r1 = TableLookupBytes(full_a, shuf_r1);
+ const auto g1 = TableLookupBytes(full_b, shuf_g1);
+ const auto b1 = TableLookupBytes(full_c, shuf_b1);
+ const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
+ StoreU(int1, d, unaligned + 1 * 16);
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> a,
+ const Vec128<uint8_t, N> b,
+ const Vec128<uint8_t, N> c,
+ Simd<uint8_t, N> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // Use full vectors for the shuffles and result.
+ const Full128<uint8_t> d_full;
+ const Vec128<uint8_t> full_a{a.raw};
+ const Vec128<uint8_t> full_b{b.raw};
+ const Vec128<uint8_t> full_c{c.raw};
+ // Shuffle (a,b,c) vector bytes to bgr[3:0].
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
+ 0x80, 0x80, 0x80, 0x80};
+ const auto shuf_r0 = Load(d_full, tbl_r0);
+ const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0);
+ const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0);
+ const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
+ const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
+ const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
+ const auto int0 = r0 | g0 | b0;
+ alignas(16) uint8_t buf[16];
+ StoreU(int0, d_full, buf);
+ CopyBytes<N * 3>(buf, unaligned);
+// ------------------------------ StoreInterleaved4
+// 128 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
+ const Vec128<uint8_t> v1,
+ const Vec128<uint8_t> v2,
+ const Vec128<uint8_t> v3, Full128<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // let a,b,c,d denote v0..3.
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
+ const auto ba8 = ZipUpper(v0, v1);
+ const auto dc8 = ZipUpper(v2, v3);
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4
+ const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8
+ const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC
+ StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16);
+ StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16);
+ StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16);
+ StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16);
+// 64 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
+ const Vec128<uint8_t, 8> in1,
+ const Vec128<uint8_t, 8> in2,
+ const Vec128<uint8_t, 8> in3,
+ Simd<uint8_t, 8> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // Use full vectors to reduce the number of stores.
+ const Vec128<uint8_t> v0{in0.raw};
+ const Vec128<uint8_t> v1{in1.raw};
+ const Vec128<uint8_t> v2{in2.raw};
+ const Vec128<uint8_t> v3{in3.raw};
+ // let a,b,c,d denote v0..3.
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4
+ const Full128<uint8_t> d_full;
+ StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16);
+ StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16);
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
+ const Vec128<uint8_t, N> in1,
+ const Vec128<uint8_t, N> in2,
+ const Vec128<uint8_t, N> in3,
+ Simd<uint8_t, N> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // Use full vectors to reduce the number of stores.
+ const Vec128<uint8_t> v0{in0.raw};
+ const Vec128<uint8_t> v1{in1.raw};
+ const Vec128<uint8_t> v2{in2.raw};
+ const Vec128<uint8_t> v3{in3.raw};
+ // let a,b,c,d denote v0..3.
+ const auto ba0 = ZipLower(v0, v1); // b3 a3 .. b0 a0
+ const auto dc0 = ZipLower(v2, v3); // d3 c3 .. d0 c0
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
+ alignas(16) uint8_t buf[16];
+ const Full128<uint8_t> d_full;
+ StoreU(BitCast(d_full, dcba_0), d_full, buf);
+ CopyBytes<4 * N>(buf, unaligned);
// ------------------------------ Reductions
namespace detail {
-// For u32/i32/f32.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<4> /* tag */,
- const Vec128<T, N> v3210) {
+// N=1 for any T: no-op
+template <typename T>
+HWY_API Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+template <typename T>
+HWY_API Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+template <typename T>
+HWY_API Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+// u32/i32/f32:
+// N=2
+template <typename T>
+HWY_API Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
+template <typename T>
+HWY_API Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+template <typename T>
+HWY_API Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+// N=4 (full)
+template <typename T>
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = v3210 + v1032;
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return v20_31_20_31 + v31_20_31_20;
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
- const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Min(v20_31_20_31, v31_20_31_20);
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
- const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Max(v20_31_20_31, v31_20_31_20);
-// For u64/i64/f64.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<8> /* tag */,
- const Vec128<T, N> v10) {
+// u64/i64/f64:
+// N=2 (full)
+template <typename T>
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return v10 + v01;
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
- const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Min(v10, v01);
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
- const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Max(v10, v01);
} // namespace detail
-// Supported for u/i/f 32/64. Returns the sum in each lane.
+// Supported for u/i/f 32/64. Returns the same value in each lane.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 2021-06-02 10:56:05.240904417 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -154,27 +154,28 @@ HWY_API Vec128<double, N> Zero(Simd<doub
// Returns a vector/part with all lanes set to "t".
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N> /* tag */, const uint8_t t) {
- return Vec128<uint8_t, N>{_mm_set1_epi8(t)};
+ return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N> /* tag */, const uint16_t t) {
- return Vec128<uint16_t, N>{_mm_set1_epi16(t)};
+ return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N> /* tag */, const uint32_t t) {
- return Vec128<uint32_t, N>{_mm_set1_epi32(t)};
+ return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N> /* tag */, const uint64_t t) {
- return Vec128<uint64_t, N>{_mm_set1_epi64x(t)};
+ return Vec128<uint64_t, N>{
+ _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N> /* tag */, const int8_t t) {
- return Vec128<int8_t, N>{_mm_set1_epi8(t)};
+ return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N> /* tag */, const int16_t t) {
- return Vec128<int16_t, N>{_mm_set1_epi16(t)};
+ return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N> /* tag */, const int32_t t) {
@@ -182,7 +183,8 @@ HWY_API Vec128<int32_t, N> Set(Simd<int3
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N> /* tag */, const int64_t t) {
- return Vec128<int64_t, N>{_mm_set1_epi64x(t)};
+ return Vec128<int64_t, N>{
+ _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N> /* tag */, const float t) {
@@ -510,7 +512,8 @@ HWY_API Mask128<T, N> Xor(const Mask128<
template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
- return Mask128<TTo, N>{m.raw};
+ const Simd<TFrom, N> d;
+ return MaskFromVec(BitCast(Simd<TTo, N>(), VecFromMask(d, m)));
// ------------------------------ Equality
@@ -683,6 +686,14 @@ HWY_API Mask128<double, N> operator>=(co
return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
+// ------------------------------ FirstN (Iota, Lt)
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
+ const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
// ================================================== ARITHMETIC
// ------------------------------ Addition
@@ -894,7 +905,7 @@ template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
+// i64 is implemented after BroadcastSignBit.
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
@@ -959,7 +970,6 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> Mu
// ------------------------------ ShiftLeft
-// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
@@ -988,6 +998,16 @@ HWY_API Vec128<int64_t, N> ShiftLeft(con
return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
+template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
+ const Simd<T, N> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
+ return kBits == 1
+ ? (v + v)
+ : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
// ------------------------------ ShiftRight
template <int kBits, size_t N>
@@ -1004,6 +1024,15 @@ HWY_API Vec128<uint64_t, N> ShiftRight(c
template <int kBits, size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
+ const Simd<uint8_t, N> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<uint8_t, N> shifted{
+ ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
+ return shifted & Set(d8, 0xFF >> kBits);
+template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
@@ -1012,6 +1041,15 @@ HWY_API Vec128<int32_t, N> ShiftRight(co
return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
+template <int kBits, size_t N>
+HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
+ const Simd<int8_t, N> di;
+ const Simd<uint8_t, N> du;
+ const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
+ return (shifted ^ shifted_sign) - shifted_sign;
// i64 is implemented after BroadcastSignBit.
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
@@ -1039,15 +1077,24 @@ HWY_API Vec128<int64_t, N> BroadcastSign
return VecFromMask(v < Zero(Simd<int64_t, N>()));
// Efficient Gt() requires SSE4.2 but we only have SSE4.1. BLENDVPD requires
- // two constants and domain crossing. 32-bit compare only requires Zero()
- // plus a shuffle to replicate the upper 32 bits.
+ // two constants and domain crossing. 32-bit shift avoids generating a zero.
const Simd<int32_t, N * 2> d32;
- const auto sign = BitCast(d32, v) < Zero(d32);
+ const auto sign = ShiftRight<31>(BitCast(d32, v));
return Vec128<int64_t, N>{
_mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
+template <size_t N>
+HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
+ return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
+ const auto zero = Zero(Simd<int64_t,N>());
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
@@ -1097,6 +1144,15 @@ HWY_API Vec128<int64_t, N> ShiftLeftSame
return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
+ const Simd<T, N> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<T, N> shifted{
+ ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
+ return shifted & Set(d8, (0xFF << bits) & 0xFF);
// ------------------------------ ShiftRightSame (BroadcastSignBit)
template <size_t N>
@@ -1116,6 +1172,16 @@ HWY_API Vec128<uint64_t, N> ShiftRightSa
template <size_t N>
+HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
+ const int bits) {
+ const Simd<uint8_t, N> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec128<uint8_t, N> shifted{
+ ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
+ return shifted & Set(d8, 0xFF >> bits);
+template <size_t N>
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
const int bits) {
return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
@@ -1140,6 +1206,15 @@ HWY_API Vec128<int64_t, N> ShiftRightSam
+template <size_t N>
+HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
+ const Simd<int8_t, N> di;
+ const Simd<uint8_t, N> du;
+ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+ return (shifted ^ shifted_sign) - shifted_sign;
// ------------------------------ Negate
template <typename T, size_t N, HWY_IF_FLOAT(T)>
@@ -1729,32 +1804,196 @@ HWY_API void Stream(const Vec128<double,
_mm_stream_pd(aligned, v.raw);
-// ------------------------------ Gather
+// ------------------------------ Scatter
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
// Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
using GatherIndex64 = long long int; // NOLINT(google-runtime-int)
static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
+namespace detail {
+template <typename T, size_t N>  // SizeTag<4> overload: 32-bit integer scatter with scale argument 1.
+HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
+ Simd<T, N> /* tag */, T* HWY_RESTRICT base,
+ const Vec128<int32_t, N> offset) {
+ if (N == 4) {
+ _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);  // All four lanes: unmasked scatter.
+ } else {
+ const __mmask8 mask = (1u << N) - 1;  // One mask bit for each of the lower N lanes.
+ _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
+ }
+template <typename T, size_t N>  // SizeTag<4> overload: 32-bit integer scatter with scale argument 4.
+HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
+ Simd<T, N> /* tag */, T* HWY_RESTRICT base,
+ const Vec128<int32_t, N> index) {
+ if (N == 4) {
+ _mm_i32scatter_epi32(base, index.raw, v.raw, 4);  // All four lanes: unmasked scatter.
+ } else {
+ const __mmask8 mask = (1u << N) - 1;  // One mask bit for each of the lower N lanes.
+ _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
+ }
+template <typename T, size_t N>  // SizeTag<8> overload: 64-bit integer scatter with scale argument 1.
+HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
+ Simd<T, N> /* tag */, T* HWY_RESTRICT base,
+ const Vec128<int64_t, N> offset) {
+ if (N == 2) {
+ _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);  // Both lanes: unmasked scatter.
+ } else {
+ const __mmask8 mask = (1u << N) - 1;  // One mask bit for each of the lower N lanes.
+ _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
+ }
+template <typename T, size_t N>  // SizeTag<8> overload: 64-bit integer scatter with scale argument 8.
+HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
+ Simd<T, N> /* tag */, T* HWY_RESTRICT base,
+ const Vec128<int64_t, N> index) {
+ if (N == 2) {
+ _mm_i64scatter_epi64(base, index.raw, v.raw, 8);  // Both lanes: unmasked scatter.
+ } else {
+ const __mmask8 mask = (1u << N) - 1;  // One mask bit for each of the lower N lanes.
+ _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
+ }
+} // namespace detail
+template <typename T, size_t N, typename Offset>  // Dispatches on sizeof(T) to the matching detail::ScatterOffset overload.
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+ const Vec128<Offset, N> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");  // Offset lanes must be as wide as data lanes.
+ return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
+template <typename T, size_t N, typename Index>  // Dispatches on sizeof(T) to the matching detail::ScatterIndex overload.
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+ const Vec128<Index, N> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");  // Index lanes must be as wide as data lanes.
+ return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
+template <size_t N>  // float overload: uses the _ps scatter intrinsics with scale argument 1.
+HWY_INLINE void ScatterOffset(Vec128<float, N> v, Simd<float, N> /* tag */,
+ float* HWY_RESTRICT base,
+ const Vec128<int32_t, N> offset) {
+ if (N == 4) {
+ _mm_i32scatter_ps(base, offset.raw, v.raw, 1);  // All four lanes: unmasked scatter.
+ } else {
+ const __mmask8 mask = (1u << N) - 1;  // One mask bit for each of the lower N lanes.
+ _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
+ }
+template <size_t N>  // float overload: uses the _ps scatter intrinsics with scale argument 4.
+HWY_INLINE void ScatterIndex(Vec128<float, N> v, Simd<float, N> /* tag */,
+ float* HWY_RESTRICT base,
+ const Vec128<int32_t, N> index) {
+ if (N == 4) {
+ _mm_i32scatter_ps(base, index.raw, v.raw, 4);  // All four lanes: unmasked scatter.
+ } else {
+ const __mmask8 mask = (1u << N) - 1;  // One mask bit for each of the lower N lanes.
+ _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
+ }
+template <size_t N>  // double overload: uses the _pd scatter intrinsics with scale argument 1.
+HWY_INLINE void ScatterOffset(Vec128<double, N> v, Simd<double, N> /* tag */,
+ double* HWY_RESTRICT base,
+ const Vec128<int64_t, N> offset) {
+ if (N == 2) {
+ _mm_i64scatter_pd(base, offset.raw, v.raw, 1);  // Both lanes: unmasked scatter.
+ } else {
+ const __mmask8 mask = (1u << N) - 1;  // One mask bit for each of the lower N lanes.
+ _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
+ }
+template <size_t N>  // double overload: uses the _pd scatter intrinsics with scale argument 8.
+HWY_INLINE void ScatterIndex(Vec128<double, N> v, Simd<double, N> /* tag */,
+ double* HWY_RESTRICT base,
+ const Vec128<int64_t, N> index) {
+ if (N == 2) {
+ _mm_i64scatter_pd(base, index.raw, v.raw, 8);  // Both lanes: unmasked scatter.
+ } else {
+ const __mmask8 mask = (1u << N) - 1;  // One mask bit for each of the lower N lanes.
+ _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
+ }
+#else // HWY_TARGET == HWY_AVX3
+template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>  // Fallback without scatter intrinsics: spill to arrays and copy per lane.
+HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+ const Vec128<Offset, N> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+ alignas(16) T lanes[N];
+ Store(v, d, lanes);
+ alignas(16) Offset offset_lanes[N];
+ Store(offset, Simd<Offset, N>(), offset_lanes);
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);  // Offsets are applied in bytes, hence the byte pointer.
+ for (size_t i = 0; i < N; ++i) {
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);  // Byte copy, so the destination need not be T-aligned.
+ }
+template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>  // Fallback without scatter intrinsics: spill to arrays and store per lane.
+HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
+ const Vec128<Index, N> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+ alignas(16) T lanes[N];
+ Store(v, d, lanes);
+ alignas(16) Index index_lanes[N];
+ Store(index, Simd<Index, N>(), index_lanes);
+ for (size_t i = 0; i < N; ++i) {
+ base[index_lanes[i]] = lanes[i];  // Indices are in units of T (array subscript).
+ }
+// ------------------------------ Gather (Load/Store)
template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
const T* HWY_RESTRICT base,
const Vec128<Offset, N> offset) {
- static_assert(N == 1, "SSE4 does not support full gather");
- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
- const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
- T val;
- CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
- return Set(d, val);
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");  // The N == 1 restriction is dropped: any N is handled by the loop below.
+ alignas(16) Offset offset_lanes[N];
+ Store(offset, Simd<Offset, N>(), offset_lanes);
+ alignas(16) T lanes[N];
+ const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);  // Offsets are applied in bytes, hence the byte pointer.
+ for (size_t i = 0; i < N; ++i) {
+ CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);  // Byte copy, so the source need not be T-aligned.
+ }
+ return Load(d, lanes);
template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
const Vec128<Index, N> index) {
- static_assert(N == 1, "SSE4 does not support full gather");
- static_assert(sizeof(T) == sizeof(Index), "T must match Index");
- return Set(d, base[GetLane(index)]);
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");  // The N == 1 restriction is dropped: any N is handled by the loop below.
+ alignas(16) Index index_lanes[N];
+ Store(index, Simd<Index, N>(), index_lanes);
+ alignas(16) T lanes[N];
+ for (size_t i = 0; i < N; ++i) {
+ lanes[i] = base[index_lanes[i]];  // Indices are in units of T (array subscript).
+ }
+ return Load(d, lanes);
@@ -1832,6 +2071,8 @@ HWY_API Vec128<double, N> GatherIndex(Si
#endif // HWY_TARGET != HWY_SSE4
// ================================================== SWIZZLE
// ------------------------------ Extract half
@@ -1859,10 +2100,10 @@ HWY_INLINE Vec128<double, 1> UpperHalf(V
// ------------------------------ Shift vector by constant #bytes
// 0x01..0F, kBytes = 1 => 0x02..0F00
-template <int kBytes, typename T>
-HWY_API Vec128<T> ShiftLeftBytes(const Vec128<T> v) {
+template <int kBytes, typename T, size_t N>  // Signature generalized from Vec128<T> to Vec128<T, N>.
+HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
- return Vec128<T>{_mm_slli_si128(v.raw, kBytes)};
+ return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};  // Byte shift of the whole 128-bit register, whatever N is.
template <int kLanes, typename T, size_t N>
@@ -1873,10 +2114,10 @@ HWY_API Vec128<T, N> ShiftLeftLanes(cons
// 0x01..0F, kBytes = 1 => 0x0001..0E
-template <int kBytes, typename T>
-HWY_API Vec128<T> ShiftRightBytes(const Vec128<T> v) {
+template <int kBytes, typename T, size_t N>  // Signature generalized from Vec128<T> to Vec128<T, N>.
+HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
- return Vec128<T>{_mm_srli_si128(v.raw, kBytes)};
+ return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};  // Byte shift of the whole 128-bit register, whatever N is.
template <int kLanes, typename T, size_t N>
@@ -2041,44 +2282,47 @@ HWY_API Vec128<float> Shuffle0123(const
// ------------------------------ TableLookupLanes
// Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T>
+template <typename T, size_t N>  // Now also parameterized on the lane count N, matching Vec128<T, N>.
struct Indices128 {
__m128i raw;
-template <typename T>
-HWY_API Indices128<T> SetTableIndices(Full128<T>, const int32_t* idx) {
+template <typename T, size_t N, HWY_IF_LE128(T, N)>  // Converts N lane indices into per-byte shuffle indices.
+HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const int32_t* idx) {
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
- const size_t N = 16 / sizeof(T);
for (size_t i = 0; i < N; ++i) {
HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
- const Full128<uint8_t> d8;
- alignas(16) uint8_t control[16];
- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
- const size_t idx_lane = idx_byte / sizeof(T);
- const size_t mod = idx_byte % sizeof(T);
- control[idx_byte] = static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + mod);
+ const Repartition<uint8_t, decltype(d)> d8;
+ alignas(16) uint8_t control[16] = {0};  // Zero-filled so bytes beyond the N lanes hold a defined value.
+ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ control[idx_lane * sizeof(T) + idx_byte] =
+ static_cast<uint8_t>(idx[idx_lane] * sizeof(T) + idx_byte);  // First byte of the source lane plus the byte position within the lane.
+ }
- return Indices128<T>{Load(d8, control).raw};
+ return Indices128<T, N>{Load(d8, control).raw};
-HWY_API Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
- const Indices128<uint32_t> idx) {
- return TableLookupBytes(v, Vec128<uint32_t>{idx.raw});
+template <size_t N>  // u32 lanes: lane permutation implemented as a byte lookup.
+HWY_API Vec128<uint32_t, N> TableLookupLanes(
+ const Vec128<uint32_t, N> v, const Indices128<uint32_t, N> idx) {
+ return TableLookupBytes(v, Vec128<uint32_t, N>{idx.raw});  // idx.raw holds the byte indices built by SetTableIndices.
-HWY_API Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
- const Indices128<int32_t> idx) {
- return TableLookupBytes(v, Vec128<int32_t>{idx.raw});
+template <size_t N>  // i32 lanes: lane permutation implemented as a byte lookup.
+HWY_API Vec128<int32_t, N> TableLookupLanes(const Vec128<int32_t, N> v,
+ const Indices128<int32_t, N> idx) {
+ return TableLookupBytes(v, Vec128<int32_t, N>{idx.raw});  // idx.raw holds the byte indices built by SetTableIndices.
-HWY_API Vec128<float> TableLookupLanes(const Vec128<float> v,
- const Indices128<float> idx) {
- const Full128<int32_t> di;
- const Full128<float> df;
+template <size_t N>  // float lanes: bit-cast to i32, do the byte lookup, then cast back.
+HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
+ const Indices128<float, N> idx) {
+ const Simd<int32_t, N> di;
+ const Simd<float, N> df;
return BitCast(df,
- TableLookupBytes(BitCast(di, v), Vec128<int32_t>{idx.raw}));
+ TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));  // The lookup itself runs on the integer view of v.
// ------------------------------ Interleave lanes
@@ -2286,47 +2530,47 @@ HWY_INLINE Vec128<double> ConcatUpperLow
namespace detail {
-template <typename T>
-HWY_API Vec128<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T> a,
- const Vec128<T> b) {
- const Full128<T> d;
- const Full128<uint8_t> d8;
+template <typename T, size_t N>  // SizeTag<1> overload: selects between a and b using a constant byte mask.
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
+ const Vec128<T, N> b) {
+ const Simd<T, N> d;
+ const Repartition<uint8_t, decltype(d)> d8;  // Byte view of d, used to load the mask table.
alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
-template <typename T>
-HWY_API Vec128<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T> a,
- const Vec128<T> b) {
- return Vec128<T>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
+template <typename T, size_t N>  // SizeTag<2> overload: 16-bit blend.
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
+ const Vec128<T, N> b) {
+ return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};  // 0x55 sets bits 0,2,4,6: even 16-bit lanes come from b, odd ones from a.
-template <typename T>
-HWY_API Vec128<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T> a,
- const Vec128<T> b) {
- return Vec128<T>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
+template <typename T, size_t N>  // SizeTag<4> overload: 16-bit blend applied in pairs.
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
+ const Vec128<T, N> b) {
+ return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x33)};  // 0x33 sets bits 0,1,4,5: 32-bit lanes 0 and 2 come from b.
-template <typename T>
-HWY_API Vec128<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T> a,
- const Vec128<T> b) {
- return Vec128<T>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
+template <typename T, size_t N>  // SizeTag<8> overload: 16-bit blend applied in groups of four.
+HWY_API Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
+ const Vec128<T, N> b) {
+ return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};  // 0x0F sets bits 0..3: the lower 64-bit lane comes from b.
} // namespace detail
-template <typename T>
-HWY_API Vec128<T> OddEven(const Vec128<T> a, const Vec128<T> b) {
+template <typename T, size_t N>  // Dispatches on sizeof(T) to the matching detail::OddEven overload.
+HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
-template <>
-HWY_INLINE Vec128<float> OddEven<float>(const Vec128<float> a,
- const Vec128<float> b) {
- return Vec128<float>{_mm_blend_ps(a.raw, b.raw, 5)};
+template <size_t N>  // float: an overload on N replaces the former explicit specialization.
+HWY_INLINE Vec128<float, N> OddEven(const Vec128<float, N> a,
+ const Vec128<float, N> b) {
+ return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};  // 5 = 0b0101: lanes 0 and 2 come from b.
-template <>
-HWY_INLINE Vec128<double> OddEven<double>(const Vec128<double> a,
- const Vec128<double> b) {
- return Vec128<double>{_mm_blend_pd(a.raw, b.raw, 1)};
+template <size_t N>  // double: an overload on N replaces the former explicit specialization.
+HWY_INLINE Vec128<double, N> OddEven(const Vec128<double, N> a,
+ const Vec128<double, N> b) {
+ return Vec128<double, N>{_mm_blend_pd(a.raw, b.raw, 1)};  // 1 = 0b01: lane 0 comes from b.
// ------------------------------ Shl (ZipLower, Mul)
@@ -2764,7 +3008,7 @@ HWY_API Vec128<uint8_t, N> U8FromU32(con
return LowerHalf(LowerHalf(BitCast(d8, quad)));
-// ------------------------------ Convert integer <=> floating point
+// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
template <size_t N>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
@@ -2779,13 +3023,20 @@ HWY_API Vec128<double, N> ConvertTo(Simd
return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
- alignas(16) int64_t lanes_i[2];
- Store(v, Simd<int64_t, N>(), lanes_i);
- alignas(16) double lanes_d[2];
- for (size_t i = 0; i < N; ++i) {
- lanes_d[i] = static_cast<double>(lanes_i[i]);
- }
- return Load(dd, lanes_d);
+ // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
+ const Repartition<uint32_t, decltype(dd)> d32;
+ const Repartition<uint64_t, decltype(dd)> d64;
+ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
+ const auto k84_63 = Set(d64, 0x4530000080000000ULL);
+ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
+ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
+ const auto k52 = Set(d32, 0x43300000);
+ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
+ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
+ return (v_upper - k84_63_52) + v_lower; // order matters!
@@ -2922,6 +3173,142 @@ HWY_API size_t CountTrue(const Mask128<T
namespace detail {
template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {  // Builds byte-shuffle indices for 16-bit lanes from an 8-bit mask.
+ HWY_DASSERT(mask_bits < 256);
+ const Simd<T, N> d;
+ const Rebind<uint8_t, decltype(d)> d8;
+ const Simd<uint16_t, N> du;
+ // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
+ // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
+ // 8 mask bits). Loading them directly would require 4 KiB. We can instead
+ // store lane indices and convert to byte indices (2*lane + 0..1), with the
+ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
+ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
+ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
+ // is likely more costly than the higher cache footprint from storing bytes.
+ alignas(16) constexpr uint8_t table[256 * 8] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
+ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
+ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
+ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
+ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
+ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
+ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
+ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
+ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
+ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
+ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
+ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
+ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
+ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
+ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
+ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
+ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
+ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
+ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
+ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
+ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
+ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
+ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
+ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
+ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
+ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
+ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
+ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
+ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
+ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
+ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
+ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
+ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
+ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
+ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
+ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
+ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
+ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
+ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
+ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
+ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
+ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
+ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
+ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
+ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
+ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
+ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
+ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
+ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
+ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
+ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
+ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
+ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
+ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
+ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
+ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
+ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
+ 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
+ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
+ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
+ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
+ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
+ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
+ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
+ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
+ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
+ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
+ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
+ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
+ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
+ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
+ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
+ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
+ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
+ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
+ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
+ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
+ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
+ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
+ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
+ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
+ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
+ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
+ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
+ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
+ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
+ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
+ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
+ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
+ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
+ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
+ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
+ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
+ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
+ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
+ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
+ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
+ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
+ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
+ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
+ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
+ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
+ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
+ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
+ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
+ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
+ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
+ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
+ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
+ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
+ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
+ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
+ const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};  // Each mask value owns one 8-byte table row.
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);  // Duplicate each table byte into both bytes of a 16-bit lane.
+ return BitCast(d, pairs + Set(du, 0x0100));  // +1 in the upper byte gives the index pair (2*lane, 2*lane + 1).
+template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 16);
@@ -2968,71 +3355,42 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(
// Helper function called by both Compress and CompressStore - avoids a
// redundant BitsFromMask in the latter.
-template <size_t N>
-HWY_API Vec128<uint32_t, N> Compress(Vec128<uint32_t, N> v,
- const uint64_t mask_bits) {
- return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask_bits, v.raw)};
- const auto idx = detail::Idx32x4FromBits<uint32_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
-template <size_t N>
-HWY_API Vec128<int32_t, N> Compress(Vec128<int32_t, N> v,
- const uint64_t mask_bits) {
- return Vec128<int32_t, N>{_mm_maskz_compress_epi32(mask_bits, v.raw)};
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Compress(Vec128<uint64_t, N> v,
- const uint64_t mask_bits) {
- return Vec128<uint64_t, N>{_mm_maskz_compress_epi64(mask_bits, v.raw)};
- const auto idx = detail::Idx64x2FromBits<uint64_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
-template <size_t N>
-HWY_API Vec128<int64_t, N> Compress(Vec128<int64_t, N> v,
- const uint64_t mask_bits) {
- return Vec128<int64_t, N>{_mm_maskz_compress_epi64(mask_bits, v.raw)};
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
- return TableLookupBytes(v, idx);
+template <typename T, size_t N>  // SizeTag<2> overload: compress via byte lookup with indices from Idx16x8FromBits.
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
+ const uint64_t mask_bits) {
+ const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
+ using D = Simd<T, N>;
+ const RebindToSigned<D> di;
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));  // Lookup runs on the signed-integer view; result is cast back to T.
-template <size_t N>
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v,
- const uint64_t mask_bits) {
+template <typename T, size_t N>  // SizeTag<4> overload: one template replaces the per-type 32-bit overloads.
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
+ const uint64_t mask_bits) {
+ using D = Simd<T, N>;
+ using TI = MakeSigned<T>;
+ const Rebind<TI, D> di;
- return Vec128<float, N>{_mm_maskz_compress_ps(mask_bits, v.raw)};
+ return BitCast(D(), Vec128<TI, N>{_mm_maskz_compress_epi32(  // NOTE(review): two returns follow each other in this hunk; presumably separated by preprocessor lines not shown - confirm.
+ mask_bits, BitCast(di, v).raw)});
- const auto idx = detail::Idx32x4FromBits<int32_t, N>(mask_bits);
- const Simd<float, N> df;
- const Simd<int32_t, N> di;
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
+ const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));  // Table-based path: byte lookup on the integer view.
-template <size_t N>
-HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
- const uint64_t mask_bits) {
+template <typename T, size_t N>  // SizeTag<8> overload: one template replaces the per-type 64-bit overloads.
+HWY_API Vec128<T, N> Compress(hwy::SizeTag<8> /*tag*/, Vec128<T, N> v,
+ const uint64_t mask_bits) {
+ using D = Simd<T, N>;
+ using TI = MakeSigned<T>;
+ const Rebind<TI, D> di;
- return Vec128<double, N>{_mm_maskz_compress_pd(mask_bits, v.raw)};
+ return BitCast(D(), Vec128<TI, N>{_mm_maskz_compress_epi64(  // NOTE(review): two returns follow each other in this hunk; presumably separated by preprocessor lines not shown - confirm.
+ mask_bits, BitCast(di, v).raw)});
- const auto idx = detail::Idx64x2FromBits<int64_t, N>(mask_bits);
- const Simd<double, N> df;
- const Simd<int64_t, N> di;
- return BitCast(df, TableLookupBytes(BitCast(di, v), idx));
+ const auto idx = detail::Idx64x2FromBits<T, N>(mask_bits);
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));  // Table-based path: byte lookup on the integer view.
@@ -3040,7 +3398,8 @@ HWY_API Vec128<double, N> Compress(Vec12
template <typename T, size_t N>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
- return detail::Compress(v, detail::BitsFromMask(mask));
+ return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,  // The size tag selects the detail overload for this lane width.
+ detail::BitsFromMask(mask));
// ------------------------------ CompressStore
@@ -3050,63 +3409,285 @@ HWY_API size_t CompressStore(Vec128<T, N
Simd<T, N> d, T* HWY_RESTRICT aligned) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
// Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
- Store(detail::Compress(v, mask_bits), d, aligned);
+ Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
return PopCount(mask_bits);
+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
+// TableLookupBytes)
+// 128 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,  // Full vectors: writes 3 * 16 bytes, interleaving bytes of v0, v1, v2.
+ const Vec128<uint8_t> v1,
+ const Vec128<uint8_t> v2, Full128<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ const auto k5 = Set(d, 5);
+ const auto k6 = Set(d, 6);
+ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+ const auto shuf_r0 = Load(d, tbl_r0);
+ const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);  // Derived from the g0 table instead of loading a third table.
+ const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0
+ const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0.
+ const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0..
+ const auto int0 = r0 | g0 | b0;
+ StoreU(int0, d, unaligned + 0 * 16);
+ // Second vector: g10,r10, bgr[9:6], b5,g5
+ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
+ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
+ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
+ const auto r1 = TableLookupBytes(v0, shuf_r1);
+ const auto g1 = TableLookupBytes(v1, shuf_g1);
+ const auto b1 = TableLookupBytes(v2, shuf_b1);
+ const auto int1 = r1 | g1 | b1;
+ StoreU(int1, d, unaligned + 1 * 16);
+ // Third vector: bgr[15:11], b10
+ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
+ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
+ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
+ const auto r2 = TableLookupBytes(v0, shuf_r2);
+ const auto g2 = TableLookupBytes(v1, shuf_g2);
+ const auto b2 = TableLookupBytes(v2, shuf_b2);
+ const auto int2 = r2 | g2 | b2;
+ StoreU(int2, d, unaligned + 2 * 16);
+// 64 bits
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,  // 8-lane vectors: one full 16-byte store followed by one store through d.
+ const Vec128<uint8_t, 8> v1,
+ const Vec128<uint8_t, 8> v2, Simd<uint8_t, 8> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // Use full vectors for the shuffles and first result.
+ const Full128<uint8_t> d_full;
+ const auto k5 = Set(d_full, 5);
+ const auto k6 = Set(d_full, 6);
+ const Vec128<uint8_t> full_a{v0.raw};
+ const Vec128<uint8_t> full_b{v1.raw};
+ const Vec128<uint8_t> full_c{v2.raw};
+ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+ const auto shuf_r0 = Load(d_full, tbl_r0);
+ const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);  // Derived from the g0 table instead of loading a third table.
+ const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
+ const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
+ const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
+ const auto int0 = r0 | g0 | b0;
+ StoreU(int0, d_full, unaligned + 0 * 16);
+ // Second (HALF) vector: bgr[7:6], b5,g5
+ const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
+ const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
+ const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
+ const auto r1 = TableLookupBytes(full_a, shuf_r1);
+ const auto g1 = TableLookupBytes(full_b, shuf_g1);
+ const auto b1 = TableLookupBytes(full_c, shuf_b1);
+ const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};  // Re-wrap the raw register as an 8-lane vector for the second store.
+ StoreU(int1, d, unaligned + 1 * 16);
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>  // Small vectors: interleave in a full register, then copy only 3 * N bytes out.
+HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
+ const Vec128<uint8_t, N> v1,
+ const Vec128<uint8_t, N> v2,
+ Simd<uint8_t, N> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // Use full vectors for the shuffles and result.
+ const Full128<uint8_t> d_full;
+ const Vec128<uint8_t> full_a{v0.raw};
+ const Vec128<uint8_t> full_b{v1.raw};
+ const Vec128<uint8_t> full_c{v2.raw};
+ // Shuffle (v0,v1,v2) vector bytes to bgr[3:0].
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
+ 0x80, 0x80, 0x80, 0x80};
+ const auto shuf_r0 = Load(d_full, tbl_r0);
+ const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0);  // g and b tables are derived from the r table.
+ const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0);
+ const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
+ const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
+ const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
+ const auto int0 = r0 | g0 | b0;
+ alignas(16) uint8_t buf[16];
+ StoreU(int0, d_full, buf);  // Full store goes to a local buffer, not to the caller's memory.
+ CopyBytes<N * 3>(buf, unaligned);  // Only the 3 * N bytes are written to the destination.
+// ------------------------------ StoreInterleaved4
+// 128 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,  // Full vectors: two rounds of zips, then 4 * 16 bytes are written.
+ const Vec128<uint8_t> v1,
+ const Vec128<uint8_t> v2,
+ const Vec128<uint8_t> v3, Full128<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // let a,b,c,d denote v0..3.
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
+ const auto ba8 = ZipUpper(v0, v1);
+ const auto dc8 = ZipUpper(v2, v3);
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4
+ const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8
+ const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC
+ StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16);
+ StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16);
+ StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16);
+ StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16);
+// 64 bits
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,  // 8-lane vectors: two full 16-byte stores (32 bytes in total).
+ const Vec128<uint8_t, 8> in1,
+ const Vec128<uint8_t, 8> in2,
+ const Vec128<uint8_t, 8> in3,
+ Simd<uint8_t, 8> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // Use full vectors to reduce the number of stores.
+ const Vec128<uint8_t> v0{in0.raw};
+ const Vec128<uint8_t> v1{in1.raw};
+ const Vec128<uint8_t> v2{in2.raw};
+ const Vec128<uint8_t> v3{in3.raw};
+ // let a,b,c,d denote v0..3.
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4
+ const Full128<uint8_t> d_full;
+ StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16);
+ StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16);
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>  // Small vectors: interleave in a full register, then copy only 4 * N bytes out.
+HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
+ const Vec128<uint8_t, N> in1,
+ const Vec128<uint8_t, N> in2,
+ const Vec128<uint8_t, N> in3,
+ Simd<uint8_t, N> /*tag*/,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // Use full vectors to reduce the number of stores.
+ const Vec128<uint8_t> v0{in0.raw};
+ const Vec128<uint8_t> v1{in1.raw};
+ const Vec128<uint8_t> v2{in2.raw};
+ const Vec128<uint8_t> v3{in3.raw};
+ // let a,b,c,d denote v0..3.
+ const auto ba0 = ZipLower(v0, v1); // b3 a3 .. b0 a0
+ const auto dc0 = ZipLower(v2, v3); // d3 c3 .. d0 c0
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0
+ alignas(16) uint8_t buf[16];
+ const Full128<uint8_t> d_full;
+ StoreU(BitCast(d_full, dcba_0), d_full, buf);  // Full store goes to a local buffer, not to the caller's memory.
+ CopyBytes<4 * N>(buf, unaligned);  // Only the 4 * N bytes are written to the destination.
// ------------------------------ Reductions
namespace detail {
-// For u32/i32/f32.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<4> /* tag */,
- const Vec128<T, N> v3210) {
+// N=1 for any T: no-op
+template <typename T>
+HWY_API Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+template <typename T>
+HWY_API Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+template <typename T>
+HWY_API Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
+ return v;
+// u32/i32/f32:
+// N=2
+template <typename T>
+HWY_API Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
+template <typename T>
+HWY_API Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+template <typename T>
+HWY_API Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
+ return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
+// N=4 (full)
+template <typename T>
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = v3210 + v1032;
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return v20_31_20_31 + v31_20_31_20;
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
- const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Min(v20_31_20_31, v31_20_31_20);
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
- const Vec128<T, N> v3210) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Max(v20_31_20_31, v31_20_31_20);
-// For u64/i64/f64.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<8> /* tag */,
- const Vec128<T, N> v10) {
+// u64/i64/f64:
+// N=2 (full)
+template <typename T>
+HWY_API Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return v10 + v01;
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
- const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Min(v10, v01);
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
- const Vec128<T, N> v10) {
+template <typename T>
+HWY_API Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Max(v10, v01);
} // namespace detail
-// Supported for u/i/f 32/64. Returns the sum in each lane.
+// Supported for u/i/f 32/64. Returns the same value in each lane.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 2021-06-02 10:56:05.234904387 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -20,6 +20,20 @@
// particular, "Broadcast", pack and zip behavior may be surprising.
#include <immintrin.h> // AVX2+
+#if defined(_MSC_VER) && defined(__clang__)
+// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
+// including these headers when _MSC_VER is defined, like when using clang-cl.
+// Include these directly here.
+#include <avxintrin.h>
+// avxintrin defines __m256i and must come before avx2intrin.
+#include <avx2intrin.h>
+#include <bmi2intrin.h> // _pext_u64
+#include <f16cintrin.h>
+#include <fmaintrin.h>
+#include <smmintrin.h>
#include <stddef.h>
#include <stdint.h>
@@ -148,23 +162,24 @@ HWY_API Vec256<uint16_t> Set(Full256<uin
return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
- return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))}; // NOLINT
+ return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
return Vec256<uint64_t>{
_mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
- return Vec256<int8_t>{_mm256_set1_epi8(t)};
+ return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))}; // NOLINT
HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
- return Vec256<int16_t>{_mm256_set1_epi16(t)};
+ return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))}; // NOLINT
HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
return Vec256<int32_t>{_mm256_set1_epi32(t)};
HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
- return Vec256<int64_t>{_mm256_set1_epi64x(t)};
+ return Vec256<int64_t>{
+ _mm256_set1_epi64x(static_cast<long long>(t))}; // NOLINT
HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
return Vec256<float>{_mm256_set1_ps(t)};
@@ -340,6 +355,8 @@ HWY_API Vec256<T> VecFromMask(Full256<T>
return Vec256<T>{v.raw};
+// ------------------------------ IfThenElse
// mask ? yes : no
template <typename T>
HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
@@ -412,9 +429,9 @@ HWY_API Mask256<T> Xor(const Mask256<T>
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
template <typename TFrom, typename TTo>
-HWY_API Mask256<TTo> RebindMask(Full256<TTo> /*tag*/, Mask256<TFrom> m) {
+HWY_API Mask256<TTo> RebindMask(Full256<TTo> d_to, Mask256<TFrom> m) {
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
- return Mask256<TTo>{m.raw};
+ return MaskFromVec(BitCast(d_to, VecFromMask(Full256<TFrom>(), m)));
// ------------------------------ Equality
@@ -670,6 +687,14 @@ HWY_API Vec256<double> Max(const Vec256<
return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
+// ------------------------------ FirstN (Iota, Lt)
+template <typename T>
+HWY_API Mask256<T> FirstN(const Full256<T> d, size_t n) {
+ const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(n)));
// ================================================== ARITHMETIC
// ------------------------------ Addition
@@ -832,7 +857,13 @@ HWY_API Vec256<uint16_t> AverageRound(co
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
+ // Workaround for incorrect codegen? (wrong result)
+ const auto zero = Zero(Full256<int8_t>());
+ return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
@@ -840,6 +871,7 @@ HWY_API Vec256<int16_t> Abs(const Vec256
HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
+// i64 is implemented after BroadcastSignBit.
HWY_API Vec256<float> Abs(const Vec256<float> v) {
const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
@@ -925,6 +957,16 @@ HWY_API Vec256<int64_t> ShiftLeft(const
return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
+template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
+ const Full256<T> d8;
+ const RepartitionToWide<decltype(d8)> d16;
+ const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
+ return kBits == 1
+ ? (v + v)
+ : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
// ------------------------------ ShiftRight
template <int kBits>
@@ -943,6 +985,14 @@ HWY_API Vec256<uint64_t> ShiftRight(cons
template <int kBits>
+HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
+ const Full256<uint8_t> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
+ return shifted & Set(d8, 0xFF >> kBits);
+template <int kBits>
HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
@@ -952,6 +1002,15 @@ HWY_API Vec256<int32_t> ShiftRight(const
return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
+template <int kBits>
+HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
+ const Full256<int8_t> di;
+ const Full256<uint8_t> du;
+ const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
+ return (shifted ^ shifted_sign) - shifted_sign;
// i64 is implemented after BroadcastSignBit.
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
@@ -989,6 +1048,15 @@ HWY_API Vec256<int64_t> ShiftRight(const
+HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
+ return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
+ const auto zero = Zero(Full256<int64_t>());
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
// ------------------------------ ShiftLeftSame
HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
@@ -1016,6 +1084,14 @@ HWY_API Vec256<int64_t> ShiftLeftSame(co
return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
+ const Full256<T> d8;
+ const RepartitionToWide<decltype(d8)> d16;
+ const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
+ return shifted & Set(d8, (0xFF << bits) & 0xFF);
// ------------------------------ ShiftRightSame (BroadcastSignBit)
HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
@@ -1031,6 +1107,13 @@ HWY_API Vec256<uint64_t> ShiftRightSame(
return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
+HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
+ const Full256<uint8_t> d8;
+ const RepartitionToWide<decltype(d8)> d16;
+ const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
+ return shifted & Set(d8, 0xFF >> bits);
HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
const int bits) {
return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
@@ -1053,6 +1136,14 @@ HWY_API Vec256<int64_t> ShiftRightSame(c
+HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
+ const Full256<int8_t> di;
+ const Full256<uint8_t> du;
+ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+ return (shifted ^ shifted_sign) - shifted_sign;
// ------------------------------ Negate
template <typename T, HWY_IF_FLOAT(T)>
@@ -1335,6 +1426,123 @@ HWY_API void Stream(const Vec256<double>
_mm256_stream_pd(aligned, v.raw);
+// ------------------------------ Scatter
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+namespace detail {
+template <typename T>
+HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256<T> v,
+ Full256<T> /* tag */, T* HWY_RESTRICT base,
+ const Vec256<int32_t> offset) {
+ _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
+template <typename T>
+HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256<T> v,
+ Full256<T> /* tag */, T* HWY_RESTRICT base,
+ const Vec256<int32_t> index) {
+ _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
+template <typename T>
+HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256<T> v,
+ Full256<T> /* tag */, T* HWY_RESTRICT base,
+ const Vec256<int64_t> offset) {
+ _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
+template <typename T>
+HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256<T> v,
+ Full256<T> /* tag */, T* HWY_RESTRICT base,
+ const Vec256<int64_t> index) {
+ _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
+} // namespace detail
+template <typename T, typename Offset>
+HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
+ const Vec256<Offset> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+ return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
+template <typename T, typename Index>
+HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
+ const Vec256<Index> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+ return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
+template <>
+HWY_INLINE void ScatterOffset<float>(Vec256<float> v, Full256<float> /* tag */,
+ float* HWY_RESTRICT base,
+ const Vec256<int32_t> offset) {
+ _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
+template <>
+HWY_INLINE void ScatterIndex<float>(Vec256<float> v, Full256<float> /* tag */,
+ float* HWY_RESTRICT base,
+ const Vec256<int32_t> index) {
+ _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
+template <>
+HWY_INLINE void ScatterOffset<double>(Vec256<double> v,
+ Full256<double> /* tag */,
+ double* HWY_RESTRICT base,
+ const Vec256<int64_t> offset) {
+ _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
+template <>
+HWY_INLINE void ScatterIndex<double>(Vec256<double> v,
+ Full256<double> /* tag */,
+ double* HWY_RESTRICT base,
+ const Vec256<int64_t> index) {
+ _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
+template <typename T, typename Offset>
+HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
+ const Vec256<Offset> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+ constexpr size_t N = 32 / sizeof(T);
+ alignas(32) T lanes[N];
+ Store(v, d, lanes);
+ alignas(32) Offset offset_lanes[N];
+ Store(offset, Simd<Offset, N>(), offset_lanes);
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
+ for (size_t i = 0; i < N; ++i) {
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
+ }
+template <typename T, typename Index>
+HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
+ const Vec256<Index> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+ constexpr size_t N = 32 / sizeof(T);
+ alignas(32) T lanes[N];
+ Store(v, d, lanes);
+ alignas(32) Index index_lanes[N];
+ Store(index, Simd<Index, N>(), index_lanes);
+ for (size_t i = 0; i < N; ++i) {
+ base[index_lanes[i]] = lanes[i];
+ }
// ------------------------------ Gather
namespace detail {
@@ -1374,13 +1582,13 @@ HWY_API Vec256<T> GatherIndex(hwy::SizeT
template <typename T, typename Offset>
HWY_API Vec256<T> GatherOffset(Full256<T> d, const T* HWY_RESTRICT base,
const Vec256<Offset> offset) {
- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
template <typename T, typename Index>
HWY_API Vec256<T> GatherIndex(Full256<T> d, const T* HWY_RESTRICT base,
const Vec256<Index> index) {
- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
@@ -1410,6 +1618,8 @@ HWY_INLINE Vec256<double> GatherIndex<do
return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
// ================================================== SWIZZLE
template <typename T>
@@ -1861,38 +2071,26 @@ HWY_API Vec256<int64_t> ZipUpper(const V
return Vec256<int64_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
-// ------------------------------ Blocks
+// ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
+// _mm256_broadcastsi128_si256 has 7 cycle latency. _mm256_permute2x128_si256 is
+// slow on Zen1 (8 uops); we can avoid it for LowerLower and UpperLower, and on
+// UpperUpper at the cost of one extra cycle/instruction.
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <typename T>
HWY_API Vec256<T> ConcatLowerLower(const Vec256<T> hi, const Vec256<T> lo) {
- return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x20)};
+ return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(hi).raw, 1)};
template <>
HWY_INLINE Vec256<float> ConcatLowerLower(const Vec256<float> hi,
const Vec256<float> lo) {
- return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x20)};
+ return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(hi).raw, 1)};
template <>
HWY_INLINE Vec256<double> ConcatLowerLower(const Vec256<double> hi,
const Vec256<double> lo) {
- return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x20)};
-// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
-template <typename T>
-HWY_API Vec256<T> ConcatUpperUpper(const Vec256<T> hi, const Vec256<T> lo) {
- return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
-template <>
-HWY_INLINE Vec256<float> ConcatUpperUpper(const Vec256<float> hi,
- const Vec256<float> lo) {
- return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
-template <>
-HWY_INLINE Vec256<double> ConcatUpperUpper(const Vec256<double> hi,
- const Vec256<double> lo) {
- return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
+ return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(hi).raw, 1)};
// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
@@ -1927,6 +2125,12 @@ HWY_INLINE Vec256<double> ConcatUpperLow
return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
+// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
+template <typename T>
+HWY_API Vec256<T> ConcatUpperUpper(const Vec256<T> hi, const Vec256<T> lo) {
+ return ConcatUpperLower(hi, ZeroExtendVector(UpperHalf(lo)));
// ------------------------------ Odd/even lanes
namespace detail {
@@ -2211,11 +2415,18 @@ HWY_API Vec128<int8_t> DemoteTo(Full128<
_mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
+ // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
+ // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
+HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> /* tag */,
const Vec256<float> v) {
return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
const Vec256<double> v) {
return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
@@ -2241,7 +2452,7 @@ HWY_API Vec128<uint8_t, 8> U8FromU32(con
return BitCast(Simd<uint8_t, 8>(), pair);
-// ------------------------------ Convert integer <=> floating point
+// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
const Vec256<int32_t> v) {
@@ -2253,13 +2464,20 @@ HWY_API Vec256<double> ConvertTo(Full256
return Vec256<double>{_mm256_cvtepi64_pd(v.raw)};
- alignas(32) int64_t lanes_i[4];
- Store(v, Full256<int64_t>(), lanes_i);
- alignas(32) double lanes_d[4];
- for (size_t i = 0; i < 4; ++i) {
- lanes_d[i] = static_cast<double>(lanes_i[i]);
- }
- return Load(dd, lanes_d);
+ // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
+ const Repartition<uint32_t, decltype(dd)> d32;
+ const Repartition<uint64_t, decltype(dd)> d64;
+ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
+ const auto k84_63 = Set(d64, 0x4530000080000000ULL);
+ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
+ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
+ const auto k52 = Set(d32, 0x43300000);
+ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
+ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
+ return (v_upper - k84_63_52) + v_lower; // order matters!
@@ -2334,8 +2552,7 @@ HWY_API uint64_t BitsFromMask(hwy::SizeT
const auto compressed =
_mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
+#endif // HWY_ARCH_X86_64
template <typename T>
@@ -2473,75 +2690,100 @@ HWY_INLINE Vec256<uint32_t> Idx64x4FromB
return Load(d32, packed_array + 8 * mask_bits);
-// Helper function called by both Compress and CompressStore - avoids a
+// Helper functions called by both Compress and CompressStore - avoids a
// redundant BitsFromMask in the latter.
-HWY_API Vec256<uint32_t> Compress(Vec256<uint32_t> v,
- const uint64_t mask_bits) {
- return Vec256<uint32_t>{
- _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)};
- const Vec256<uint32_t> idx = detail::Idx32x8FromBits(mask_bits);
- return Vec256<uint32_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
-HWY_API Vec256<int32_t> Compress(Vec256<int32_t> v, const uint64_t mask_bits) {
+template <typename T>
+HWY_API Vec256<T> Compress(hwy::SizeTag<4> /*tag*/, Vec256<T> v,
+ const uint64_t mask_bits) {
+ const auto vu = BitCast(Full256<uint32_t>(), v);
- return Vec256<int32_t>{
- _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)};
+ const __m256i ret =
+ _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), vu.raw);
const Vec256<uint32_t> idx = detail::Idx32x8FromBits(mask_bits);
- return Vec256<int32_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
+ const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw);
+ return BitCast(Full256<T>(), Vec256<uint32_t>{ret});
-HWY_API Vec256<uint64_t> Compress(Vec256<uint64_t> v,
- const uint64_t mask_bits) {
- return Vec256<uint64_t>{
- _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)};
- const Vec256<uint32_t> idx = detail::Idx64x4FromBits(mask_bits);
- return Vec256<uint64_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
-HWY_API Vec256<int64_t> Compress(Vec256<int64_t> v, const uint64_t mask_bits) {
+template <typename T>
+HWY_API Vec256<T> Compress(hwy::SizeTag<8> /*tag*/, Vec256<T> v,
+ const uint64_t mask_bits) {
+ const auto vu = BitCast(Full256<uint64_t>(), v);
- return Vec256<int64_t>{
- _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)};
+ const __m256i ret =
+ _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), vu.raw);
const Vec256<uint32_t> idx = detail::Idx64x4FromBits(mask_bits);
- return Vec256<int64_t>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
+ const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw);
+ return BitCast(Full256<T>(), Vec256<uint64_t>{ret});
-HWY_API Vec256<float> Compress(Vec256<float> v, const uint64_t mask_bits) {
- return Vec256<float>{
- _mm256_maskz_compress_ps(static_cast<__mmask8>(mask_bits), v.raw)};
- const Vec256<uint32_t> idx = detail::Idx32x8FromBits(mask_bits);
- return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
+// Otherwise, defined in x86_512-inl.h so it can use wider vectors.
-HWY_API Vec256<double> Compress(Vec256<double> v, const uint64_t mask_bits) {
- return Vec256<double>{
- _mm256_maskz_compress_pd(static_cast<__mmask8>(mask_bits), v.raw)};
- const Vec256<uint32_t> idx = detail::Idx64x4FromBits(mask_bits);
- return Vec256<double>{_mm256_castsi256_pd(
- _mm256_permutevar8x32_epi32(_mm256_castpd_si256(v.raw), idx.raw))};
+// LUTs are infeasible for 2^16 possible masks. Promoting to 32-bit and using
+// the native Compress is probably more efficient than 2 LUTs.
+template <typename T>
+HWY_API Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
+ const uint64_t mask_bits) {
+ using D = Full256<T>;
+ const Rebind<uint16_t, D> du;
+ const Repartition<int32_t, D> dw;
+ const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
+ const auto promoted0 = PromoteTo(dw, LowerHalf(vu16));
+ const auto promoted1 = PromoteTo(dw, UpperHalf(vu16));
+ const uint64_t mask_bits0 = mask_bits & 0xFF;
+ const uint64_t mask_bits1 = mask_bits >> 8;
+ const auto compressed0 = Compress(hwy::SizeTag<4>(), promoted0, mask_bits0);
+ const auto compressed1 = Compress(hwy::SizeTag<4>(), promoted1, mask_bits1);
+ const Half<decltype(du)> dh;
+ const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
+ const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));
+ const size_t count0 = PopCount(mask_bits0);
+ // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with
+ // VPERMD for shifting at 4 byte granularity.
+ alignas(32) constexpr int32_t iota4[16] = {0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7};
+ const auto indices = SetTableIndices(dw, iota4 + 8 - count0 / 2);
+ const auto shift1_multiple4 =
+ BitCast(du, TableLookupLanes(BitCast(dw, demoted1), indices));
+ // Whole-register unconditional shift by 2 bytes.
+ // TODO(janwas): slow on AMD, use 2 shifts + permq + OR instead?
+ const __m256i lo_zz = _mm256_permute2x128_si256(shift1_multiple4.raw,
+ shift1_multiple4.raw, 0x08);
+ const auto shift1_multiple2 =
+ Vec256<uint16_t>{_mm256_alignr_epi8(shift1_multiple4.raw, lo_zz, 14)};
+ // Make the shift conditional on the lower bit of count0.
+ const auto m_odd = TestBit(Set(du, count0), Set(du, 1));
+ const auto shifted1 = IfThenElse(m_odd, shift1_multiple2, shift1_multiple4);
+ // Blend the lower and shifted upper parts.
+ constexpr uint16_t on = 0xFFFF;
+ alignas(32) constexpr uint16_t lower_lanes[32] = {HWY_REP4(on), HWY_REP4(on),
+ HWY_REP4(on), HWY_REP4(on)};
+ const auto m_lower = MaskFromVec(LoadU(du, lower_lanes + 16 - count0));
+ return BitCast(D(), IfThenElse(m_lower, demoted0, shifted1));
+#endif // HWY_TARGET != HWY_AVX3
} // namespace detail
+// Otherwise, defined in x86_512-inl.h after detail::Compress.
template <typename T>
HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
- return detail::Compress(v, detail::BitsFromMask(mask));
+ return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
+ detail::BitsFromMask(mask));
// ------------------------------ CompressStore
@@ -2550,10 +2792,101 @@ template <typename T>
HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
T* HWY_RESTRICT aligned) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
- Store(detail::Compress(v, mask_bits), d, aligned);
+ // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but
+ // using StoreU to concatenate the results would cause page faults if
+ // `aligned` is the last valid vector. Instead rely on in-register splicing.
+ Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
return PopCount(mask_bits);
+#endif // HWY_TARGET != HWY_AVX3
+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
+// TableLookupBytes, ConcatUpperLower)
+HWY_API void StoreInterleaved3(const Vec256<uint8_t> v0,
+ const Vec256<uint8_t> v1,
+ const Vec256<uint8_t> v2, Full256<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ const auto k5 = Set(d, 5);
+ const auto k6 = Set(d, 6);
+ // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+ const auto shuf_r0 = LoadDup128(d, tbl_r0);
+ const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+ const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0
+ const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0.
+ const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0..
+ const auto interleaved_10_00 = r0 | g0 | b0;
+ // Second vector: g10,r10, bgr[9:6], b5,g5
+ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
+ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
+ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
+ const auto r1 = TableLookupBytes(v0, shuf_r1);
+ const auto g1 = TableLookupBytes(v1, shuf_g1);
+ const auto b1 = TableLookupBytes(v2, shuf_b1);
+ const auto interleaved_15_05 = r1 | g1 | b1;
+ // We want to write the lower halves of the interleaved vectors, then the
+ // upper halves. We could obtain 10_05 and 15_0A via ConcatUpperLower, but
+ // that would require two unaligned stores. For the lower halves, we can
+ // merge two 128-bit stores for the same swizzling cost:
+ const auto out0 = ConcatLowerLower(interleaved_15_05, interleaved_10_00);
+ StoreU(out0, d, unaligned + 0 * 32);
+ // Third vector: bgr[15:11], b10
+ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
+ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
+ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
+ const auto r2 = TableLookupBytes(v0, shuf_r2);
+ const auto g2 = TableLookupBytes(v1, shuf_g2);
+ const auto b2 = TableLookupBytes(v2, shuf_b2);
+ const auto interleaved_1A_0A = r2 | g2 | b2;
+ const auto out1 = ConcatUpperLower(interleaved_10_00, interleaved_1A_0A);
+ StoreU(out1, d, unaligned + 1 * 32);
+ const auto out2 = ConcatUpperUpper(interleaved_1A_0A, interleaved_15_05);
+ StoreU(out2, d, unaligned + 2 * 32);
+// ------------------------------ StoreInterleaved4
+HWY_API void StoreInterleaved4(const Vec256<uint8_t> v0,
+ const Vec256<uint8_t> v1,
+ const Vec256<uint8_t> v2,
+ const Vec256<uint8_t> v3, Full256<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // let a,b,c,d denote v0..3.
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
+ const auto ba8 = ZipUpper(v0, v1);
+ const auto dc8 = ZipUpper(v2, v3);
+ const auto dcba_0 = ZipLower(ba0, dc0); // d..a13 d..a10 | d..a03 d..a00
+ const auto dcba_4 = ZipUpper(ba0, dc0); // d..a17 d..a14 | d..a07 d..a04
+ const auto dcba_8 = ZipLower(ba8, dc8); // d..a1B d..a18 | d..a0B d..a08
+ const auto dcba_C = ZipUpper(ba8, dc8); // d..a1F d..a1C | d..a0F d..a0C
+ // Write lower halves, then upper. vperm2i128 is slow on Zen1 but we can
+ // efficiently combine two lower halves into 256 bits:
+ const auto out0 = BitCast(d, ConcatLowerLower(dcba_4, dcba_0));
+ const auto out1 = BitCast(d, ConcatLowerLower(dcba_C, dcba_8));
+ StoreU(out0, d, unaligned + 0 * 32);
+ StoreU(out1, d, unaligned + 1 * 32);
+ const auto out2 = BitCast(d, ConcatUpperUpper(dcba_4, dcba_0));
+ const auto out3 = BitCast(d, ConcatUpperUpper(dcba_C, dcba_8));
+ StoreU(out2, d, unaligned + 2 * 32);
+ StoreU(out3, d, unaligned + 3 * 32);
// ------------------------------ Reductions
namespace detail {
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 2021-06-02 10:56:05.218904306 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -19,6 +19,23 @@
// particular, "Broadcast", pack and zip behavior may be surprising.
#include <immintrin.h> // AVX2+
+#if defined(_MSC_VER) && defined(__clang__)
+// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
+// including these headers when _MSC_VER is defined, like when using clang-cl.
+// Include these directly here.
+#include <smmintrin.h>
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#include <f16cintrin.h>
+#include <fmaintrin.h>
+#include <avx512fintrin.h>
+#include <avx512vlintrin.h>
+#include <avx512bwintrin.h>
+#include <avx512dqintrin.h>
+#include <avx512vlbwintrin.h>
+#include <avx512vldqintrin.h>
#include <stddef.h>
#include <stdint.h>
@@ -100,9 +117,8 @@ struct RawMask512<8> {
// Mask register: one bit per lane.
template <typename T>
class Mask512 {
- using Raw = typename RawMask512<sizeof(T)>::type;
+ using Raw = typename RawMask512<sizeof(T)>::type;
Raw raw;
@@ -167,23 +183,24 @@ HWY_API Vec512<uint16_t> Set(Full512<uin
return Vec512<uint16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
HWY_API Vec512<uint32_t> Set(Full512<uint32_t> /* tag */, const uint32_t t) {
- return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))}; // NOLINT
+ return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};
HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
return Vec512<uint64_t>{
_mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
HWY_API Vec512<int8_t> Set(Full512<int8_t> /* tag */, const int8_t t) {
- return Vec512<int8_t>{_mm512_set1_epi8(t)};
+ return Vec512<int8_t>{_mm512_set1_epi8(static_cast<char>(t))}; // NOLINT
HWY_API Vec512<int16_t> Set(Full512<int16_t> /* tag */, const int16_t t) {
- return Vec512<int16_t>{_mm512_set1_epi16(t)};
+ return Vec512<int16_t>{_mm512_set1_epi16(static_cast<short>(t))}; // NOLINT
HWY_API Vec512<int32_t> Set(Full512<int32_t> /* tag */, const int32_t t) {
return Vec512<int32_t>{_mm512_set1_epi32(t)};
HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
- return Vec512<int64_t>{_mm512_set1_epi64(t)};
+ return Vec512<int64_t>{
+ _mm512_set1_epi64(static_cast<long long>(t))}; // NOLINT
HWY_API Vec512<float> Set(Full512<float> /* tag */, const float t) {
return Vec512<float>{_mm512_set1_ps(t)};
@@ -329,7 +346,45 @@ HWY_API Vec512<T> CopySignToAbs(const Ve
return CopySign(abs, sign);
-// ------------------------------ Select/blend
+// ------------------------------ FirstN
+// Possibilities for constructing a bitmask of N ones:
+// - kshift* only consider the lowest byte of the shift count, so they would
+// not correctly handle large n.
+// - Scalar shifts >= 64 are UB.
+// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
+// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
+#if HWY_ARCH_X86_32
+namespace detail {
+// 32 bit mask is sufficient for lane size >= 2.
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
+HWY_API Mask512<T> FirstN(size_t n) {
+ using Bits = typename Mask512<T>::Raw;
+ return Mask512<T>{static_cast<Bits>(_bzhi_u32(~uint32_t(0), n))};
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Mask512<T> FirstN(size_t n) {
+ const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
+ return Mask512<T>{static_cast<__mmask64>(bits)};
+} // namespace detail
+#endif // HWY_ARCH_X86_32
+template <typename T>
+HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
+#if HWY_ARCH_X86_64
+ using Bits = typename Mask512<T>::Raw;
+ return Mask512<T>{static_cast<Bits>(_bzhi_u64(~uint64_t(0), n))};
+ return detail::FirstN<T>(n);
+#endif // HWY_ARCH_X86_64
+// ------------------------------ IfThenElse
// Returns mask ? b : a.
@@ -626,7 +681,13 @@ HWY_API Vec512<uint16_t> AverageRound(co
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
HWY_API Vec512<int8_t> Abs(const Vec512<int8_t> v) {
+ // Workaround for incorrect codegen? (untested due to internal compiler error)
+ const auto zero = Zero(Full512<int8_t>());
+ return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
HWY_API Vec512<int16_t> Abs(const Vec512<int16_t> v) {
return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
@@ -634,6 +695,9 @@ HWY_API Vec512<int16_t> Abs(const Vec512
HWY_API Vec512<int32_t> Abs(const Vec512<int32_t> v) {
return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
+HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) {
+ return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
// These aren't native instructions, they also involve AND with constant.
HWY_API Vec512<float> Abs(const Vec512<float> v) {
@@ -675,6 +739,16 @@ HWY_API Vec512<int64_t> ShiftLeft(const
return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
+template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec512<T> ShiftLeft(const Vec512<T> v) {
+ const Full512<T> d8;
+ const RepartitionToWide<decltype(d8)> d16;
+ const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
+ return kBits == 1
+ ? (v + v)
+ : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
// ------------------------------ ShiftRight
template <int kBits>
@@ -693,6 +767,14 @@ HWY_API Vec512<uint64_t> ShiftRight(cons
template <int kBits>
+HWY_API Vec512<uint8_t> ShiftRight(const Vec512<uint8_t> v) {
+ const Full512<uint8_t> d8;
+ // Use raw instead of BitCast to support N=1.
+ const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
+ return shifted & Set(d8, 0xFF >> kBits);
+template <int kBits>
HWY_API Vec512<int16_t> ShiftRight(const Vec512<int16_t> v) {
return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
@@ -707,6 +789,15 @@ HWY_API Vec512<int64_t> ShiftRight(const
return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
+template <int kBits>
+HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
+ const Full512<int8_t> di;
+ const Full512<uint8_t> du;
+ const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
+ return (shifted ^ shifted_sign) - shifted_sign;
// ------------------------------ ShiftLeftSame
HWY_API Vec512<uint16_t> ShiftLeftSame(const Vec512<uint16_t> v,
@@ -734,6 +825,14 @@ HWY_API Vec512<int64_t> ShiftLeftSame(co
return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
+ const Full512<T> d8;
+ const RepartitionToWide<decltype(d8)> d16;
+ const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
+ return shifted & Set(d8, (0xFF << bits) & 0xFF);
// ------------------------------ ShiftRightSame
HWY_API Vec512<uint16_t> ShiftRightSame(const Vec512<uint16_t> v,
@@ -749,6 +848,13 @@ HWY_API Vec512<uint64_t> ShiftRightSame(
return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
+HWY_API Vec512<uint8_t> ShiftRightSame(Vec512<uint8_t> v, const int bits) {
+ const Full512<uint8_t> d8;
+ const RepartitionToWide<decltype(d8)> d16;
+ const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
+ return shifted & Set(d8, 0xFF >> bits);
HWY_API Vec512<int16_t> ShiftRightSame(const Vec512<int16_t> v,
const int bits) {
return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
@@ -763,6 +869,14 @@ HWY_API Vec512<int64_t> ShiftRightSame(c
return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
+HWY_API Vec512<int8_t> ShiftRightSame(Vec512<int8_t> v, const int bits) {
+ const Full512<int8_t> di;
+ const Full512<uint8_t> du;
+ const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
+ const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
+ return (shifted ^ shifted_sign) - shifted_sign;
// ------------------------------ Shl
HWY_API Vec512<uint16_t> operator<<(const Vec512<uint16_t> v,
@@ -1046,6 +1160,10 @@ HWY_API Vec512<float> ApproximateRecipro
// ------------------------------ Floating-point rounding
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
// Toward nearest integer, tie to even
HWY_API Vec512<float> Round(const Vec512<float> v) {
return Vec512<float>{_mm512_roundscale_ps(
@@ -1086,6 +1204,8 @@ HWY_API Vec512<double> Floor(const Vec51
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
// ================================================== COMPARE
// Comparisons set a mask bit to 1 if the condition is true, else 0.
@@ -1678,6 +1798,83 @@ HWY_API void Stream(const Vec512<double>
_mm512_stream_pd(aligned, v.raw);
+// ------------------------------ Scatter
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+namespace detail {
+template <typename T>
+HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512<T> v,
+ Full512<T> /* tag */, T* HWY_RESTRICT base,
+ const Vec512<int32_t> offset) {
+ _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
+template <typename T>
+HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512<T> v,
+ Full512<T> /* tag */, T* HWY_RESTRICT base,
+ const Vec512<int32_t> index) {
+ _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
+template <typename T>
+HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512<T> v,
+ Full512<T> /* tag */, T* HWY_RESTRICT base,
+ const Vec512<int64_t> offset) {
+ _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
+template <typename T>
+HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512<T> v,
+ Full512<T> /* tag */, T* HWY_RESTRICT base,
+ const Vec512<int64_t> index) {
+ _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
+} // namespace detail
+template <typename T, typename Offset>
+HWY_API void ScatterOffset(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
+ const Vec512<Offset> offset) {
+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
+ return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
+template <typename T, typename Index>
+HWY_API void ScatterIndex(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
+ const Vec512<Index> index) {
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
+ return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
+template <>
+HWY_INLINE void ScatterOffset<float>(Vec512<float> v, Full512<float> /* tag */,
+ float* HWY_RESTRICT base,
+ const Vec512<int32_t> offset) {
+ _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
+template <>
+HWY_INLINE void ScatterIndex<float>(Vec512<float> v, Full512<float> /* tag */,
+ float* HWY_RESTRICT base,
+ const Vec512<int32_t> index) {
+ _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
+template <>
+HWY_INLINE void ScatterOffset<double>(Vec512<double> v,
+ Full512<double> /* tag */,
+ double* HWY_RESTRICT base,
+ const Vec512<int64_t> offset) {
+ _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
+template <>
+HWY_INLINE void ScatterIndex<double>(Vec512<double> v,
+ Full512<double> /* tag */,
+ double* HWY_RESTRICT base,
+ const Vec512<int64_t> index) {
+ _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
// ------------------------------ Gather
namespace detail {
@@ -1713,13 +1910,13 @@ HWY_API Vec512<T> GatherIndex(hwy::SizeT
template <typename T, typename Offset>
HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base,
const Vec512<Offset> offset) {
- static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
+static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
template <typename T, typename Index>
HWY_API Vec512<T> GatherIndex(Full512<T> d, const T* HWY_RESTRICT base,
const Vec512<Index> index) {
- static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
@@ -1749,6 +1946,8 @@ HWY_INLINE Vec512<double> GatherIndex<do
return Vec512<double>{_mm512_i64gather_pd(index.raw, base, 8)};
// ================================================== SWIZZLE
template <typename T>
@@ -2439,7 +2638,11 @@ HWY_API Vec256<int8_t> DemoteTo(Full256<
HWY_API Vec256<float16_t> DemoteTo(Full256<float16_t> /* tag */,
const Vec512<float> v) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
HWY_API Vec256<float> DemoteTo(Full256<float> /* tag */,
@@ -2633,8 +2836,81 @@ HWY_API Vec512<double> Compress(Vec512<d
return Vec512<double>{_mm512_maskz_compress_pd(mask.raw, v.raw)};
+namespace detail {
+// Ignore IDE redefinition error for these two functions: if this header is
+// included, then the functions weren't actually defined in x86_256-inl.h.
+template <typename T>
+HWY_API Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
+ const uint64_t mask_bits) {
+ using D = Full256<T>;
+ const Rebind<uint16_t, D> du;
+ const Rebind<int32_t, D> dw; // 512-bit, not 256!
+ const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
+ const Mask512<int32_t> mask{static_cast<__mmask16>(mask_bits)};
+ return BitCast(D(), DemoteTo(du, Compress(PromoteTo(dw, vu16), mask)));
+} // namespace detail
+template <typename T>
+HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
+ return detail::Compress(hwy::SizeTag<sizeof(T)>(), v,
+ detail::BitsFromMask(mask));
+// Expands to 32-bit, compresses, then concatenates the demoted halves.
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
+ using D = Full512<T>;
+ const Rebind<uint16_t, D> du;
+ const Repartition<int32_t, D> dw;
+ const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
+ const auto promoted0 = PromoteTo(dw, LowerHalf(vu16));
+ const auto promoted1 = PromoteTo(dw, UpperHalf(vu16));
+ const Mask512<int32_t> mask0{static_cast<__mmask16>(mask.raw & 0xFFFF)};
+ const Mask512<int32_t> mask1{static_cast<__mmask16>(mask.raw >> 16)};
+ const auto compressed0 = Compress(promoted0, mask0);
+ const auto compressed1 = Compress(promoted1, mask1);
+ const Half<decltype(du)> dh;
+ const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
+ const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));
+ // Concatenate into single vector by shifting upper with writemask.
+ const size_t num0 = CountTrue(mask0);
+ const __mmask32 m_upper = ~((1u << num0) - 1);
+ alignas(64) uint16_t iota[64] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+ const auto idx = LoadU(du, iota + 32 - num0);
+ return Vec512<T>{_mm512_mask_permutexvar_epi16(demoted0.raw, m_upper, idx.raw,
+ demoted1.raw)};
// ------------------------------ CompressStore
+template <typename T>
+HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
+ T* HWY_RESTRICT aligned) {
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
+ Store(detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits), d, aligned);
+ return PopCount(mask_bits);
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API size_t CompressStore(Vec512<T> v, const Mask512<T> mask, Full512<T> d,
+ T* HWY_RESTRICT aligned) {
+ // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but
+ // using StoreU to concatenate the results would cause page faults if
+ // `aligned` is the last valid vector. Instead rely on in-register splicing.
+ Store(Compress(v, mask), d, aligned);
+ return CountTrue(mask);
HWY_API size_t CompressStore(Vec512<uint32_t> v, const Mask512<uint32_t> mask,
Full512<uint32_t> /* tag */,
uint32_t* HWY_RESTRICT aligned) {
@@ -2675,6 +2951,98 @@ HWY_API size_t CompressStore(Vec512<doub
return CountTrue(mask);
+// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
+// TableLookupBytes)
+HWY_API void StoreInterleaved3(const Vec512<uint8_t> a, const Vec512<uint8_t> b,
+ const Vec512<uint8_t> c, Full512<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ const auto k5 = Set(d, 5);
+ const auto k6 = Set(d, 6);
+ // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
+ // 0x80 so lanes to be filled from other vectors are 0 for blending.
+ alignas(16) static constexpr uint8_t tbl_r0[16] = {
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
+ alignas(16) static constexpr uint8_t tbl_g0[16] = {
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
+ const auto shuf_r0 = LoadDup128(d, tbl_r0);
+ const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5
+ const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0);
+ const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
+ const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
+ const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
+ const auto i = (r0 | g0 | b0).raw; // low byte in each 128bit: 30 20 10 00
+ // Second vector: g10,r10, bgr[9:6], b5,g5
+ const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
+ const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
+ const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
+ const auto r1 = TableLookupBytes(a, shuf_r1);
+ const auto g1 = TableLookupBytes(b, shuf_g1);
+ const auto b1 = TableLookupBytes(c, shuf_b1);
+ const auto j = (r1 | g1 | b1).raw; // low byte in each 128bit: 35 25 15 05
+ // Third vector: bgr[15:11], b10
+ const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
+ const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
+ const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
+ const auto r2 = TableLookupBytes(a, shuf_r2);
+ const auto g2 = TableLookupBytes(b, shuf_g2);
+ const auto b2 = TableLookupBytes(c, shuf_b2);
+ const auto k = (r2 | g2 | b2).raw; // low byte in each 128bit: 3A 2A 1A 0A
+ // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns".
+ const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0));
+ const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1));
+ const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2));
+ // Alternating order, most-significant 128 bits from the second arg.
+ const __mmask8 m = 0xCC;
+ const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1);
+ const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2);
+ const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0);
+ StoreU(Vec512<uint8_t>{i1_k0_j0_i0}, d, unaligned + 0 * 64); // 10 0A 05 00
+ StoreU(Vec512<uint8_t>{j2_i2_k1_j1}, d, unaligned + 1 * 64); // 25 20 1A 15
+ StoreU(Vec512<uint8_t>{k3_j3_i3_k2}, d, unaligned + 2 * 64); // 3A 35 30 2A
+// ------------------------------ StoreInterleaved4
+HWY_API void StoreInterleaved4(const Vec512<uint8_t> v0,
+ const Vec512<uint8_t> v1,
+ const Vec512<uint8_t> v2,
+ const Vec512<uint8_t> v3, Full512<uint8_t> d,
+ uint8_t* HWY_RESTRICT unaligned) {
+ // let a,b,c,d denote v0..3.
+ const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0
+ const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0
+ const auto ba8 = ZipUpper(v0, v1);
+ const auto dc8 = ZipUpper(v2, v3);
+ const auto i = ZipLower(ba0, dc0).raw; // 4x128bit: d..a3 d..a0
+ const auto j = ZipUpper(ba0, dc0).raw; // 4x128bit: d..a7 d..a4
+ const auto k = ZipLower(ba8, dc8).raw; // 4x128bit: d..aB d..a8
+ const auto l = ZipUpper(ba8, dc8).raw; // 4x128bit: d..aF d..aC
+ // 128-bit blocks were independent until now; transpose 4x4.
+ const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0));
+ const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0));
+ const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2));
+ const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2));
+ constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0);
+ constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1);
+ const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
+ const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
+ const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);
+ const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31);
+ StoreU(Vec512<uint8_t>{l0_k0_j0_i0}, d, unaligned + 0 * 64);
+ StoreU(Vec512<uint8_t>{l1_k1_j1_i1}, d, unaligned + 1 * 64);
+ StoreU(Vec512<uint8_t>{l2_k2_j2_i2}, d, unaligned + 2 * 64);
+ StoreU(Vec512<uint8_t>{l3_k3_j3_i3}, d, unaligned + 3 * 64);
// ------------------------------ Reductions
// Returns the sum in each lane.
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ chromium-91.0.4472.77/third_party/highway/src/hwy/
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ 2021-06-02 10:56:05.281904625 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ 2021-05-31 10:37:11.000000000 -0400
@@ -28,12 +28,12 @@
#if HWY_ARCH_X86
#include <xmmintrin.h>
-#ifdef _MSC_VER
#include <intrin.h>
#include <cpuid.h>
+#endif // HWY_ARCH_X86
namespace hwy {
namespace {
@@ -48,13 +48,13 @@ bool IsBitSet(const uint32_t reg, const
// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
void Cpuid(const uint32_t level, const uint32_t count,
uint32_t* HWY_RESTRICT abcd) {
-#ifdef _MSC_VER
int regs[4];
__cpuidex(regs, level, count);
for (int i = 0; i < 4; ++i) {
abcd[i] = regs[i];
uint32_t a;
uint32_t b;
uint32_t c;
@@ -64,22 +64,22 @@ void Cpuid(const uint32_t level, const u
abcd[1] = b;
abcd[2] = c;
abcd[3] = d;
// Returns the lower 32 bits of extended control register 0.
// Requires CPU support for "OSXSAVE" (see below).
uint32_t ReadXCR0() {
-#ifdef _MSC_VER
return static_cast<uint32_t>(_xgetbv(0));
uint32_t xcr0, xcr0_high;
const uint32_t index = 0;
asm volatile(".byte 0x0F, 0x01, 0xD0"
: "=a"(xcr0), "=d"(xcr0_high)
: "c"(index));
return xcr0;
#endif // HWY_ARCH_X86
@@ -126,7 +126,7 @@ constexpr uint32_t kAVX512VL = 1u << 13;
constexpr uint32_t kAVX512DQ = 1u << 14;
constexpr uint32_t kAVX512BW = 1u << 15;
constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW;
+#endif // HWY_ARCH_X86
} // namespace
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 2021-06-02 10:56:05.267904554 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h 2021-05-31 10:37:11.000000000 -0400
@@ -65,7 +65,9 @@
-// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium
+#define HWY_SVE2 0x400
+#define HWY_SVE 0x800
+// 0x1000 reserved for Helium
#define HWY_NEON 0x2000
@@ -90,6 +92,9 @@
// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
#define HWY_SCALAR 0x20000000
// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
@@ -106,25 +111,26 @@
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
-// SSE4 codegen (msan failure), so disable all those targets.
+// SSE4 codegen (possibly only for msan), so disable all those targets.
-// TODO: Disable all non-scalar targets for every build target once we have
-// clang-7 enabled in our builders.
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
-// MSVC, or 32-bit may fail to compile AVX2/3.
-#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32
+// 32-bit may fail to compile AVX2/3.
+#elif HWY_ARCH_X86_32
-#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds")
+// MSVC AVX3 support is buggy:
+// armv7be has not been tested and is not yet supported.
+#elif HWY_ARCH_ARM_V7 && (defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN))
@@ -145,53 +151,74 @@
// user to override this without any guarantee of success.
-#ifdef __wasm_simd128__
+// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
+#if HWY_ARCH_WASM && defined(__wasm_simd128__)
-#ifdef __VSX__
+// Avoid choosing the PPC target until we have an implementation.
+#if HWY_ARCH_PPC && defined(__VSX__) && 0
-// GCC 4.5.4 only defines the former; 5.4 defines both.
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+// Avoid choosing the SVE[2] targets until the implementation is ready.
+#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) && 0
+#define HWY_BASELINE_SVE2 0
+#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE) && 0
+// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
+#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
-#ifdef __SSE4_1__
+// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means
+// we at least get SSE4 on machines supporting AVX but not AVX2.
+#if HWY_ARCH_X86 && \
+ (defined(__SSE4_1__) || (HWY_COMPILER_MSVC != 0 && defined(__AVX__)))
-#ifdef __AVX2__
+#if HWY_ARCH_X86 && defined(__AVX2__)
-#ifdef __AVX512F__
+#if HWY_ARCH_X86 && defined(__AVX512F__)
-#ifdef __riscv_vector
+#if HWY_ARCH_RVV && defined(__riscv_vector)
@@ -242,13 +269,12 @@
// 3) For tests: include all attainable targets (in particular: scalar)
+#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar.
#endif // target policy
@@ -323,6 +349,10 @@ static inline HWY_MAYBE_UNUSED const cha
+ case HWY_SVE2:
+ return "SVE2";
+ case HWY_SVE:
+ return "SVE";
case HWY_NEON:
return "Neon";
@@ -346,7 +376,7 @@ static inline HWY_MAYBE_UNUSED const cha
return "Scalar";
- return "?";
+ return "Unknown"; // must satisfy gtest IsValidParamName()
@@ -405,21 +435,17 @@ static inline HWY_MAYBE_UNUSED const cha
nullptr, /* SSE3 */ \
nullptr /* SSE2 */
-#endif // HWY_ARCH_X86
// See HWY_ARCH_X86 above for details.
#define HWY_CHOOSE_TARGET_LIST(func_name) \
- nullptr, /* reserved */ \
- nullptr, /* reserved */ \
+ HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
+ HWY_CHOOSE_SVE(func_name), /* SVE */ \
nullptr, /* reserved */ \
HWY_CHOOSE_NEON(func_name) /* NEON */
-#endif // HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details.
@@ -430,9 +456,7 @@ static inline HWY_MAYBE_UNUSED const cha
nullptr, /* VSX */ \
nullptr /* AltiVec */
-#endif // HWY_ARCH_PPC
// See HWY_ARCH_X86 above for details.
@@ -442,9 +466,7 @@ static inline HWY_MAYBE_UNUSED const cha
nullptr, /* reserved */ \
HWY_CHOOSE_WASM(func_name) /* WASM */
-#endif // HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details.
@@ -454,7 +476,12 @@ static inline HWY_MAYBE_UNUSED const cha
nullptr, /* reserved */ \
HWY_CHOOSE_RVV(func_name) /* RVV */
-#endif // HWY_ARCH_RVV
+// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
+// still creating single-entry tables in HWY_EXPORT to ensure portability.
struct ChosenTarget {
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ chromium-91.0.4472.77/third_party/highway/src/hwy/
--- chromium-91.0.4472.77/third_party/highway/src/hwy/ 2021-06-02 10:56:05.264904539 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/ 2021-05-31 10:37:11.000000000 -0400
@@ -35,19 +35,19 @@ DECLARE_FUNCTION(SCALAR)
void CheckFakeFunction() {
- if ((HWY_TARGETS & HWY_##TGT) != 0) { \
- hwy::SetSupportedTargetsForTest(HWY_##TGT); \
- /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \
- /* the pointer to the already cached function. */ \
- hwy::chosen_target.Update(); \
- /* Calling DeInit() will test that the initializer function */ \
- /* also calls the right function. */ \
- hwy::chosen_target.DeInit(); \
- /* Second call uses the cached value from the previous call. */ \
+ if ((HWY_TARGETS & HWY_##TGT) != 0) { \
+ hwy::SetSupportedTargetsForTest(HWY_##TGT); \
+ /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \
+ /* the pointer to the already cached function. */ \
+ hwy::chosen_target.Update(); \
+ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
+ /* Calling DeInit() will test that the initializer function */ \
+ /* also calls the right function. */ \
+ hwy::chosen_target.DeInit(); \
+ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
+ /* Second call uses the cached value from the previous call. */ \
+ EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-06-02 10:56:05.251904473 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-05-31 10:37:11.000000000 -0400
@@ -16,7 +16,6 @@
#include <stdint.h>
#include <algorithm>
-#include <cmath>
#include <limits>
@@ -173,16 +172,8 @@ struct TestFloatAbs {
HWY_NOINLINE void TestAllAbs() {
- const ForPartialVectors<TestAbs> test;
- test(int8_t());
- test(int16_t());
- test(int32_t());
- const ForPartialVectors<TestFloatAbs> test_float;
- test_float(float());
- test_float(double());
+ ForSignedTypes(ForPartialVectors<TestAbs>());
+ ForFloatTypes(ForPartialVectors<TestFloatAbs>());
template <bool kSigned>
@@ -199,6 +190,45 @@ struct TestLeftShifts {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
+ const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+ // 0
+ HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
+ HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
+ // 1
+ for (size_t i = 0; i < N; ++i) {
+ const T value = kSigned ? T(i) - T(N) : T(i);
+ expected[i] = T(TU(value) << 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
+ // max
+ for (size_t i = 0; i < N; ++i) {
+ const T value = kSigned ? T(i) - T(N) : T(i);
+ expected[i] = T(TU(value) << kMaxShift);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
+ }
+template <bool kSigned>
+struct TestVariableLeftShifts {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T t, D d) {
+ if (kSigned) {
+ // Also test positive values
+ TestVariableLeftShifts</*kSigned=*/false>()(t, d);
+ }
+ using TI = MakeSigned<T>;
+ using TU = MakeUnsigned<T>;
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
const auto v0 = Zero(d);
const auto v1 = Set(d, 1);
const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
@@ -209,8 +239,6 @@ struct TestLeftShifts {
const auto large_shifts = max_shift - small_shifts;
// Same: 0
- HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
- HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));
// Same: 1
@@ -218,8 +246,6 @@ struct TestLeftShifts {
const T value = kSigned ? T(i) - T(N) : T(i);
expected[i] = T(TU(value) << 1);
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));
// Same: max
@@ -227,8 +253,6 @@ struct TestLeftShifts {
const T value = kSigned ? T(i) - T(N) : T(i);
expected[i] = T(TU(value) << kMaxShift);
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));
// Variable: small
@@ -252,6 +276,37 @@ struct TestUnsignedRightShifts {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
+ const auto values = Iota(d, 0);
+ const T kMax = LimitsMax<T>();
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+ // Shift by 0
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
+ // Shift by 1
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(T(i & kMax) >> 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
+ // max
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(T(i & kMax) >> kMaxShift);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
+ }
+struct TestVariableUnsignedRightShifts {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
const auto v0 = Zero(d);
const auto v1 = Set(d, 1);
const auto values = Iota(d, 0);
@@ -265,21 +320,15 @@ struct TestUnsignedRightShifts {
const auto large_shifts = max_shift - small_shifts;
// Same: 0
- HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
- HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));
// Same: 1
for (size_t i = 0; i < N; ++i) {
- expected[i] = T(i >> 1);
+ expected[i] = T(T(i & kMax) >> 1);
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
- HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));
// Same: max
- HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
- HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));
// Variable: small
@@ -296,33 +345,120 @@ struct TestUnsignedRightShifts {
-struct TestSignedRightShifts {
+template <int kAmount, typename T>
+T RightShiftNegative(T val) {
+ // C++ shifts are implementation-defined for negative numbers, and we have
+ // seen divisions replaced with shifts, so resort to bit operations.
+ using TU = hwy::MakeUnsigned<T>;
+ TU bits;
+ CopyBytes<sizeof(T)>(&val, &bits);
+ const TU shifted = bits >> kAmount;
+ const TU all = ~TU(0);
+ const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
+ const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
+ bits = shifted | sign_extended;
+ CopyBytes<sizeof(T)>(&bits, &val);
+ return val;
+class TestSignedRightShifts {
+ public:
template <typename T, class D>
- HWY_NOINLINE void operator()(T t, D d) {
- // Also test positive values
- TestUnsignedRightShifts()(t, d);
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto expected = AllocateAligned<T>(N);
+ constexpr T kMin = LimitsMin<T>();
+ constexpr T kMax = LimitsMax<T>();
+ constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+ // First test positive values, negative are checked below.
+ const auto v0 = Zero(d);
+ const auto values = Iota(d, 0) & Set(d, kMax);
+ // Shift by 0
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
+ HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
+ // Shift by 1
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(T(i & kMax) >> 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
+ // max
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
+ // Even negative value
+ Test<0>(kMin, d, __LINE__);
+ Test<1>(kMin, d, __LINE__);
+ Test<2>(kMin, d, __LINE__);
+ Test<kMaxShift>(kMin, d, __LINE__);
+ const T odd = static_cast<T>(kMin + 1);
+ Test<0>(odd, d, __LINE__);
+ Test<1>(odd, d, __LINE__);
+ Test<2>(odd, d, __LINE__);
+ Test<kMaxShift>(odd, d, __LINE__);
+ }
+ private:
+ template <int kAmount, typename T, class D>
+ void Test(T val, D d, int line) {
+ const auto expected = Set(d, RightShiftNegative<kAmount>(val));
+ const auto in = Set(d, val);
+ const char* file = __FILE__;
+ AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
+ AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
+ }
+struct TestVariableSignedRightShifts {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TU = MakeUnsigned<T>;
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
constexpr T kMin = LimitsMin<T>();
- const auto values = Iota(d, kMin);
+ constexpr T kMax = LimitsMax<T>();
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
+ // First test positive values, negative are checked below.
+ const auto v0 = Zero(d);
+ const auto positive = Iota(d, 0) & Set(d, kMax);
+ // Shift by 0
+ HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
+ HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));
+ // Shift by 1
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = T(T(i & kMax) >> 1);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));
+ // max
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
+ HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));
const auto max_shift = Set(d, kMaxShift);
const auto small_shifts = And(Iota(d, 0), max_shift);
const auto large_shifts = max_shift - small_shifts;
- // Test varying values to shift
+ const auto negative = Iota(d, kMin);
+ // Test varying negative values to shift
for (size_t i = 0; i < N; ++i) {
- // We want a right-shift here, which is undefined behavior for negative
- // numbers. Since we want (-1)>>1 to be -1, we need to adjust rounding if
- // minT is odd and negative.
- T minT = static_cast<T>(kMin + i);
- expected[i] = T(minT / 2 + (minT < 0 ? minT % 2 : 0));
+ expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
- HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, Set(d, 1)));
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));
// Shift MSB right by small amounts
for (size_t i = 0; i < N; ++i) {
@@ -343,6 +479,13 @@ struct TestSignedRightShifts {
HWY_NOINLINE void TestAllShifts() {
+ ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
+ ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
+ ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
+ ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
+HWY_NOINLINE void TestAllVariableShifts() {
const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
const ForPartialVectors<TestUnsignedRightShifts> shr_u;
@@ -821,6 +964,40 @@ HWY_NOINLINE void TestAllRound() {
+struct TestNearestInt {
+ template <typename TF, class DF>
+ HWY_NOINLINE void operator()(TF tf, const DF df) {
+ using TI = MakeSigned<TF>;
+ const RebindToSigned<DF> di;
+ size_t padded;
+ auto in = RoundTestCases(tf, df, padded);
+ auto expected = AllocateAligned<TI>(padded);
+ constexpr double max = static_cast<double>(LimitsMax<TI>());
+ for (size_t i = 0; i < padded; ++i) {
+ if (std::isnan(in[i])) {
+ // We replace NaN with 0 below (no_nan)
+ expected[i] = 0;
+ } else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) {
+ // Avoid undefined result for lrintf
+ expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
+ } else {
+ expected[i] = lrintf(in[i]);
+ }
+ }
+ for (size_t i = 0; i < padded; i += Lanes(df)) {
+ const auto v = Load(df, &in[i]);
+ const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df));
+ HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan));
+ }
+ }
+HWY_NOINLINE void TestAllNearestInt() {
+ ForPartialVectors<TestNearestInt>()(float());
struct TestTrunc {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
@@ -909,8 +1086,7 @@ struct TestSumOfLanes {
HWY_NOINLINE void TestAllSumOfLanes() {
- // Only full vectors because lanes in partial vectors are undefined.
- const ForFullVectors<TestSumOfLanes> sum;
+ const ForPartialVectors<TestSumOfLanes> sum;
// No u8/u16/i8/i16.
@@ -976,9 +1152,8 @@ struct TestMaxOfLanes {
HWY_NOINLINE void TestAllMinMaxOfLanes() {
- // Only full vectors because lanes in partial vectors are undefined.
- const ForFullVectors<TestMinOfLanes> min;
- const ForFullVectors<TestMaxOfLanes> max;
+ const ForPartialVectors<TestMinOfLanes> min;
+ const ForPartialVectors<TestMaxOfLanes> max;
// No u8/u16/i8/i16.
@@ -1044,10 +1219,12 @@ HWY_NOINLINE void TestAllNeg() {
+namespace hwy {
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
@@ -1062,10 +1239,11 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest,
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRound);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNearestInt);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
+} // namespace hwy
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-06-02 10:56:05.252904478 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-05-31 10:37:11.000000000 -0400
@@ -272,13 +272,14 @@ HWY_NOINLINE void TestAllCombineShiftRig
+namespace hwy {
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight);
+} // namespace hwy
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-06-02 10:56:05.249904463 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-05-31 10:37:11.000000000 -0400
@@ -206,11 +206,12 @@ HWY_NOINLINE void TestAllWeakFloat() {
+namespace hwy {
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
+} // namespace hwy
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-06-02 10:56:05.261904523 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-05-31 10:37:11.000000000 -0400
@@ -16,8 +16,6 @@
#include <stdint.h>
#include <string.h>
-#include <cmath>
#define HWY_TARGET_INCLUDE "tests/"
#include "hwy/foreach_target.h"
@@ -547,37 +545,6 @@ HWY_NOINLINE void TestAllI32F64() {
-struct TestNearestInt {
- template <typename TI, class DI>
- HWY_NOINLINE void operator()(TI /*unused*/, const DI di) {
- using TF = MakeFloat<TI>;
- const Rebind<TF, DI> df;
- const size_t N = Lanes(df);
- // Integer positive
- HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 4.0f)));
- // Integer negative
- HWY_ASSERT_VEC_EQ(di, Iota(di, -32), NearestInt(Iota(df, -32.0f)));
- // Above positive
- HWY_ASSERT_VEC_EQ(di, Iota(di, 2), NearestInt(Iota(df, 2.001f)));
- // Below positive
- HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 3.9999f)));
- const TF eps = static_cast<TF>(0.0001);
- // Above negative
- HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) + eps)));
- // Below negative
- HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) - eps)));
- }
-HWY_NOINLINE void TestAllNearestInt() {
- ForPartialVectors<TestNearestInt>()(int32_t());
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@@ -585,6 +552,7 @@ HWY_NOINLINE void TestAllNearestInt() {
+namespace hwy {
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
@@ -596,6 +564,5 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, Te
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllNearestInt);
+} // namespace hwy
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-06-02 10:56:05.245904442 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-05-31 10:37:11.000000000 -0400
@@ -14,6 +14,7 @@
#include <stddef.h>
#include <stdint.h>
+#include <string.h> // memcmp
#include "hwy/base.h"
@@ -159,6 +160,30 @@ HWY_NOINLINE void TestAllCopySign() {
+struct TestFirstN {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto mask_lanes = AllocateAligned<T>(N);
+ // NOTE: reverse polarity (mask is true iff mask_lanes[i] == 0) because we
+ // cannot reliably compare against all bits set (NaN for float types).
+ const T off = 1;
+ for (size_t len = 0; len <= N; ++len) {
+ for (size_t i = 0; i < N; ++i) {
+ mask_lanes[i] = i < len ? T(0) : off;
+ }
+ const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
+ HWY_ASSERT_MASK_EQ(d, mask, FirstN(d, len));
+ }
+ }
+HWY_NOINLINE void TestAllFirstN() {
+ ForAllTypes(ForPartialVectors<TestFirstN>());
struct TestIfThenElse {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -208,15 +233,56 @@ HWY_NOINLINE void TestAllIfThenElse() {
-// Also tests MaskFromVec/VecFromMask
+struct TestMaskVec {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+ const size_t N = Lanes(d);
+ auto mask_lanes = AllocateAligned<T>(N);
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
+ }
+ const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d)));
+ HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
+ }
+ }
+HWY_NOINLINE void TestAllMaskVec() {
+ const ForPartialVectors<TestMaskVec> test;
+ test(uint16_t());
+ test(int16_t());
+ // TODO(janwas): float16_t - cannot compare yet
+ test(uint32_t());
+ test(int32_t());
+ test(float());
+ test(uint64_t());
+ test(int64_t());
+ test(double());
struct TestCompress {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
+ using TU = MakeUnsigned<T>;
+ const Rebind<TU, D> du;
const size_t N = Lanes(d);
auto in_lanes = AllocateAligned<T>(N);
- auto mask_lanes = AllocateAligned<T>(N);
+ auto mask_lanes = AllocateAligned<TU>(N);
auto expected = AllocateAligned<T>(N);
auto actual = AllocateAligned<T>(N);
@@ -224,35 +290,56 @@ struct TestCompress {
for (size_t rep = 0; rep < 100; ++rep) {
size_t expected_pos = 0;
for (size_t i = 0; i < N; ++i) {
- in_lanes[i] = static_cast<T>(Random32(&rng));
- mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
+ const uint64_t bits = Random32(&rng);
+ in_lanes[i] = T(); // cannot initialize float16_t directly.
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
+ mask_lanes[i] = static_cast<TU>(Random32(&rng) & 1);
if (mask_lanes[i] == 0) { // Zero means true (easier to compare)
expected[expected_pos++] = in_lanes[i];
const auto in = Load(d, in_lanes.get());
- const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
+ const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du)));
- HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
Store(Compress(in, mask), d, actual.get());
// Upper lanes are undefined.
for (size_t i = 0; i < expected_pos; ++i) {
- HWY_ASSERT(actual[i] == expected[i]);
+ HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
// Also check CompressStore in the same way.
- std::fill(actual.get(), actual.get() + N, T(0));
+ memset(actual.get(), 0, N * sizeof(T));
const size_t num_written = CompressStore(in, mask, d, actual.get());
HWY_ASSERT_EQ(expected_pos, num_written);
for (size_t i = 0; i < expected_pos; ++i) {
- HWY_ASSERT_EQ(expected[i], actual[i]);
+ HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
#if 0
+namespace detail { // for code folding
+void PrintCompress16x8Tables() {
+ constexpr size_t N = 8; // 128-bit SIMD
+ for (uint64_t code = 0; code < 1ull << N; ++code) {
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // Doubled (for converting lane to byte indices)
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", 2 * indices[i]);
+ }
+ }
+ printf("\n");
// Compressed to nibbles
void PrintCompress32x8Tables() {
constexpr size_t N = 8; // AVX2
@@ -340,16 +427,22 @@ void PrintCompress64x2Tables() {
+} // namespace detail
HWY_NOINLINE void TestAllCompress() {
- // PrintCompress32x8Tables();
- // PrintCompress64x4Tables();
- // PrintCompress32x4Tables();
- // PrintCompress64x2Tables();
+ // detail::PrintCompress32x8Tables();
+ // detail::PrintCompress64x4Tables();
+ // detail::PrintCompress32x4Tables();
+ // detail::PrintCompress64x2Tables();
+ // detail::PrintCompress16x8Tables();
const ForPartialVectors<TestCompress> test;
+ test(uint16_t());
+ test(int16_t());
+ test(float16_t());
@@ -358,7 +451,6 @@ HWY_NOINLINE void TestAllCompress() {
@@ -432,7 +524,7 @@ struct TestTestBit {
HWY_NOINLINE void TestAllTestBit() {
- ForIntegerTypes(ForFullVectors<TestTestBit>());
+ ForIntegerTypes(ForPartialVectors<TestTestBit>());
struct TestAllTrueFalse {
@@ -445,6 +537,8 @@ struct TestAllTrueFalse {
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(0));
+ auto mask_lanes = AllocateAligned<T>(N);
HWY_ASSERT(AllTrue(Eq(v, zero)));
HWY_ASSERT(!AllFalse(Eq(v, zero)));
@@ -456,7 +550,13 @@ struct TestAllTrueFalse {
for (size_t i = 0; i < N; ++i) {
lanes[i] = T(1);
v = Load(d, lanes.get());
- HWY_ASSERT(!AllTrue(Eq(v, zero)));
+ // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
+ // Assigning to an lvalue is insufficient but storing to memory prevents
+ // the bug; so does Print of VecFromMask(d, Eq(v, zero)).
+ Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
+ HWY_ASSERT(!AllTrue(MaskFromVec(Load(d, mask_lanes.get()))));
HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
lanes[i] = T(-1);
@@ -596,7 +696,7 @@ struct TestLogicalMask {
HWY_NOINLINE void TestAllLogicalMask() {
- ForAllTypes(ForFullVectors<TestLogicalMask>());
+ ForAllTypes(ForPartialVectors<TestLogicalMask>());
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@@ -604,11 +704,14 @@ HWY_NOINLINE void TestAllLogicalMask() {
+namespace hwy {
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllFirstN);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
@@ -617,5 +720,5 @@ HWY_EXPORT_AND_TEST_P(HwyLogicalTest, Te
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask);
+} // namespace hwy
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-06-02 10:56:05.247904453 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-05-31 10:37:11.000000000 -0400
@@ -12,6 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// Ensure incompatibilities with Windows macros (e.g. #define StoreFence) are
+// detected. Must come before Highway headers.
+#if defined(_WIN32) || defined(_WIN64)
+#include <Windows.h>
#include <stddef.h>
#include <stdint.h>
@@ -76,6 +82,119 @@ HWY_NOINLINE void TestAllLoadStore() {
+struct TestStoreInterleaved3 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ RandomState rng;
+ // Data to be interleaved
+ auto bytes = AllocateAligned<uint8_t>(3 * N);
+ for (size_t i = 0; i < 3 * N; ++i) {
+ bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+ }
+ const auto in0 = Load(d, &bytes[0 * N]);
+ const auto in1 = Load(d, &bytes[1 * N]);
+ const auto in2 = Load(d, &bytes[2 * N]);
+ // Interleave here, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(4 * N);
+ auto actual_aligned = AllocateAligned<T>(4 * N + 1);
+ T* actual = actual_aligned.get() + 1;
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ expected[3 * i + 0] = bytes[0 * N + i];
+ expected[3 * i + 1] = bytes[1 * N + i];
+ expected[3 * i + 2] = bytes[2 * N + i];
+ // Ensure we do not write more than 3*N bytes
+ expected[3 * N + i] = actual[3 * N + i] = 0;
+ }
+ StoreInterleaved3(in0, in1, in2, d, actual);
+ size_t pos = 0;
+ if (!BytesEqual(expected.get(), actual, 4 * N, &pos)) {
+ Print(d, "in0", in0, pos / 3);
+ Print(d, "in1", in1, pos / 3);
+ Print(d, "in2", in2, pos / 3);
+ const size_t i = pos - pos % 3;
+ fprintf(stderr, "interleaved %d %d %d %d %d %d\n", actual[i],
+ actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
+ actual[i + 5]);
+ HWY_ASSERT(false);
+ }
+ }
+ }
+HWY_NOINLINE void TestAllStoreInterleaved3() {
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+ const ForExtendableVectors<TestStoreInterleaved3, 4> test;
+ const ForPartialVectors<TestStoreInterleaved3> test;
+ test(uint8_t());
+struct TestStoreInterleaved4 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ RandomState rng;
+ // Data to be interleaved
+ auto bytes = AllocateAligned<uint8_t>(4 * N);
+ for (size_t i = 0; i < 4 * N; ++i) {
+ bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+ }
+ const auto in0 = Load(d, &bytes[0 * N]);
+ const auto in1 = Load(d, &bytes[1 * N]);
+ const auto in2 = Load(d, &bytes[2 * N]);
+ const auto in3 = Load(d, &bytes[3 * N]);
+ // Interleave here, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(5 * N);
+ auto actual_aligned = AllocateAligned<T>(5 * N + 1);
+ T* actual = actual_aligned.get() + 1;
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ expected[4 * i + 0] = bytes[0 * N + i];
+ expected[4 * i + 1] = bytes[1 * N + i];
+ expected[4 * i + 2] = bytes[2 * N + i];
+ expected[4 * i + 3] = bytes[3 * N + i];
+ // Ensure we do not write more than 4*N bytes
+ expected[4 * N + i] = actual[4 * N + i] = 0;
+ }
+ StoreInterleaved4(in0, in1, in2, in3, d, actual);
+ size_t pos = 0;
+ if (!BytesEqual(expected.get(), actual, 5 * N, &pos)) {
+ Print(d, "in0", in0, pos / 4);
+ Print(d, "in1", in1, pos / 4);
+ Print(d, "in2", in2, pos / 4);
+ Print(d, "in3", in3, pos / 4);
+ const size_t i = pos;
+ fprintf(stderr, "interleaved %d %d %d %d %d %d %d %d\n", actual[i],
+ actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
+ actual[i + 5], actual[i + 6], actual[i + 7]);
+ HWY_ASSERT(false);
+ }
+ }
+ }
+HWY_NOINLINE void TestAllStoreInterleaved4() {
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+ const ForExtendableVectors<TestStoreInterleaved4, 4> test;
+ const ForPartialVectors<TestStoreInterleaved4> test;
+ test(uint8_t());
struct TestLoadDup128 {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -86,13 +205,14 @@ struct TestLoadDup128 {
for (size_t i = 0; i < N128; ++i) {
lanes[i] = static_cast<T>(1 + i);
- const auto v = LoadDup128(d, lanes);
const size_t N = Lanes(d);
- auto out = AllocateAligned<T>(N);
- Store(v, d, out.get());
+ auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
- HWY_ASSERT_EQ(T(i % N128 + 1), out[i]);
+ expected[i] = static_cast<T>(i % N128 + 1);
+ HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes));
@@ -136,6 +256,84 @@ HWY_NOINLINE void TestAllStream() {
+// Assumes little-endian byte order!
+struct TestScatter {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using Offset = MakeSigned<T>;
+ const size_t N = Lanes(d);
+ const size_t range = 4 * N; // number of items to scatter
+ const size_t max_bytes = range * sizeof(T); // upper bound on offset
+ RandomState rng;
+ // Data to be scattered
+ auto bytes = AllocateAligned<uint8_t>(max_bytes);
+ for (size_t i = 0; i < max_bytes; ++i) {
+ bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+ }
+ const auto data = Load(d, reinterpret_cast<const T*>(bytes.get()));
+ // Scatter into these regions, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(range);
+ auto actual = AllocateAligned<T>(range);
+ const Rebind<Offset, D> d_offsets;
+ auto offsets = AllocateAligned<Offset>(N); // or indices
+ for (size_t rep = 0; rep < 100; ++rep) {
+ // Byte offsets
+ std::fill(expected.get(), expected.get() + range, T(0));
+ std::fill(actual.get(), actual.get() + range, T(0));
+ for (size_t i = 0; i < N; ++i) {
+ offsets[i] =
+ static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
+ CopyBytes<sizeof(T)>(
+ bytes.get() + i * sizeof(T),
+ reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
+ }
+ const auto voffsets = Load(d_offsets, offsets.get());
+ ScatterOffset(data, d, actual.get(), voffsets);
+ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+ Print(d, "Data", data);
+ Print(d_offsets, "Offsets", voffsets);
+ HWY_ASSERT(false);
+ }
+ // Indices
+ std::fill(expected.get(), expected.get() + range, T(0));
+ std::fill(actual.get(), actual.get() + range, T(0));
+ for (size_t i = 0; i < N; ++i) {
+ offsets[i] = static_cast<Offset>(Random32(&rng) % range);
+ CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
+ &expected[offsets[i]]);
+ }
+ const auto vindices = Load(d_offsets, offsets.get());
+ ScatterIndex(data, d, actual.get(), vindices);
+ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+ Print(d, "Data", data);
+ Print(d_offsets, "Indices", vindices);
+ HWY_ASSERT(false);
+ }
+ }
+ }
+HWY_NOINLINE void TestAllScatter() {
+ // No u8,u16,i8,i16.
+ const ForPartialVectors<TestScatter> test;
+ test(uint32_t());
+ test(int32_t());
+ test(uint64_t());
+ test(int64_t());
+ ForFloatTypes(test);
struct TestGather {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -183,21 +381,15 @@ struct TestGather {
HWY_NOINLINE void TestAllGather() {
// No u8,u16,i8,i16.
- const ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(uint32_t)> test32;
- test32(uint32_t());
- test32(int32_t());
+ const ForPartialVectors<TestGather> test;
+ test(uint32_t());
+ test(int32_t());
- const ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(uint64_t)> test64;
- test64(uint64_t());
- test64(int64_t());
- ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(float)>()(float());
- ForPartialVectors<TestGather, 1, 1, HWY_GATHER_LANES(double)>()(double());
+ test(uint64_t());
+ test(int64_t());
+ ForFloatTypes(test);
HWY_NOINLINE void TestAllCache() {
@@ -206,6 +398,7 @@ HWY_NOINLINE void TestAllCache() {
int test = 0;
+ Pause();
// NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -214,11 +407,15 @@ HWY_NOINLINE void TestAllCache() {
+namespace hwy {
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
+} // namespace hwy
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-06-02 10:56:05.259904513 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/ 2021-05-31 10:37:11.000000000 -0400
@@ -223,6 +223,7 @@ struct TestTableLookupBytes {
HWY_NOINLINE void TestAllTableLookupBytes() {
struct TestTableLookupLanes {
using Index = uint32_t;
@@ -242,12 +243,13 @@ struct TestTableLookupLanes {
if (N <= 8) { // Test all permutations
for (size_t i0 = 0; i0 < N; ++i0) {
idx[0] = static_cast<Index>(i0);
for (size_t i1 = 0; i1 < N; ++i1) {
- idx[1] = static_cast<Index>(i1);
+ if (N >= 2) idx[1] = static_cast<Index>(i1);
for (size_t i2 = 0; i2 < N; ++i2) {
- idx[2] = static_cast<Index>(i2);
+ if (N >= 4) idx[2] = static_cast<Index>(i2);
for (size_t i3 = 0; i3 < N; ++i3) {
- idx[3] = static_cast<Index>(i3);
+ if (N >= 4) idx[3] = static_cast<Index>(i3);
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>(idx[i] + 1); // == v[idx[i]]
@@ -286,7 +288,7 @@ struct TestTableLookupLanes {
HWY_NOINLINE void TestAllTableLookupLanes() {
- const ForFullVectors<TestTableLookupLanes> test;
+ const ForPartialVectors<TestTableLookupLanes> test;
@@ -624,6 +626,7 @@ HWY_NOINLINE void TestAllOddEven() {
+namespace hwy {
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftBytes);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftLanes);
@@ -637,5 +640,5 @@ HWY_EXPORT_AND_TEST_P(HwySwizzleTest, Te
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatLowerUpper);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatUpperLower);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
+} // namespace hwy
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h
--- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 2021-06-02 10:56:05.254904488 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h 2021-05-31 10:37:11.000000000 -0400
@@ -23,7 +23,6 @@
#include <stdio.h>
#include <string.h>
-#include <cmath> // isfinite
#include <cstddef>
#include <string>
#include <utility> // std::forward
@@ -73,7 +72,8 @@ class TestWithParamTarget : public testi
// Function to convert the test parameter of a TestWithParamTarget for
// displaying it in the gtest test name.
-std::string TestParamTargetName(const testing::TestParamInfo<uint32_t>& info) {
+static inline std::string TestParamTargetName(
+ const testing::TestParamInfo<uint32_t>& info) {
return TargetName(info.param);
@@ -157,31 +157,10 @@ std::string TestParamTargetNameAndT(
static_assert(true, "For requiring trailing semicolon")
#define HWY_BEFORE_TEST(suite) \
- namespace hwy { \
class suite : public hwy::TestWithParamTarget {}; \
static_assert(true, "For requiring trailing semicolon")
-#define HWY_AFTER_TEST() \
- } /* namespace hwy */ \
- static_assert(true, "For requiring trailing semicolon")
-// Calls test for each enabled and available target.
-template <class Func, typename... Args>
-HWY_NOINLINE void RunTest(const Func& func, Args&&... args) {
- SetSupportedTargetsForTest(0);
- auto targets = SupportedAndGeneratedTargets();
- for (uint32_t target : targets) {
- SetSupportedTargetsForTest(target);
- fprintf(stderr, "Testing for target %s.\n",
- TargetName(static_cast<int>(target)));
- func(std::forward<Args>(args)...);
- }
- // Disable the mask after the test.
- SetSupportedTargetsForTest(0);
// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
// which triggers a compiler bug.
class RandomState {
@@ -223,9 +202,11 @@ static HWY_INLINE uint32_t Random32(Rand
// built-in types.
template <class T>
inline void PreventElision(T&& output) {
-#ifndef _MSC_VER
+ (void)output;
asm volatile("" : "+r"(output) : : "memory");
// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
@@ -234,23 +215,34 @@ inline void PreventElision(T&& output) {
// understanding which instantiation of a generic test failed.
template <typename T>
static inline std::string TypeName(T /*unused*/, size_t N) {
- std::string prefix(IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u"));
- prefix += std::to_string(sizeof(T) * 8);
- // Scalars: omit the xN suffix.
- if (N == 1) return prefix;
- return prefix + 'x' + std::to_string(N);
+ const char prefix = IsFloat<T>() ? 'f' : (IsSigned<T>() ? 'i' : 'u');
+ char name[64];
+ // Omit the xN suffix for scalars.
+ if (N == 1) {
+ snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8);
+ } else {
+ snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N);
+ }
+ return name;
// String comparison
template <typename T1, typename T2>
-inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size) {
+inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size,
+ size_t* pos = nullptr) {
const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
for (size_t i = 0; i < size; ++i) {
- if (bytes1[i] != bytes2[i]) return false;
+ if (bytes1[i] != bytes2[i]) {
+ fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i,
+ size, bytes1[i], bytes2[i], TypeName(T1(), 1).c_str(),
+ TypeName(T2(), 1).c_str());
+ if (pos != nullptr) {
+ *pos = i;
+ }
+ return false;
+ }
return true;
@@ -287,11 +279,11 @@ HWY_NOINLINE void Print(const D d, const
auto lanes = AllocateAligned<T>(N);
Store(v, d, lanes.get());
const size_t begin = static_cast<size_t>(std::max<intptr_t>(0, lane - 2));
- const size_t end = std::min(begin + 5, N);
+ const size_t end = std::min(begin + 7, N);
fprintf(stderr, "%s %s [%zu+ ->]:\n ", TypeName(T(), N).c_str(), caption,
for (size_t i = begin; i < end; ++i) {
- fprintf(stderr, "%s,", std::to_string(lanes[i]).c_str());
+ fprintf(stderr, "%g,", double(lanes[i]));
if (begin >= end) fprintf(stderr, "(out of bounds)");
fprintf(stderr, "\n");
@@ -352,10 +344,12 @@ HWY_NOINLINE void AssertEqual(const T ex
const char* filename = "", const int line = -1,
const size_t lane = 0) {
if (!IsEqual(expected, actual)) {
- const std::string expected_str = std::to_string(expected);
- const std::string actual_str = std::to_string(actual);
- NotifyFailure(filename, line, type_name.c_str(), lane, expected_str.c_str(),
- actual_str.c_str());
+ char expected_str[100];
+ snprintf(expected_str, sizeof(expected_str), "%g", double(expected));
+ char actual_str[100];
+ snprintf(actual_str, sizeof(actual_str), "%g", double(actual));
+ NotifyFailure(filename, line, type_name.c_str(), lane, expected_str,
+ actual_str);
@@ -382,9 +376,15 @@ HWY_NOINLINE void AssertVecEqual(D d, co
fprintf(stderr, "\n\n");
Print(d, "expect", expected, i);
Print(d, "actual", actual, i);
+ char expected_str[100];
+ snprintf(expected_str, sizeof(expected_str), "%g",
+ double(expected_lanes[i]));
+ char actual_str[100];
+ snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i]));
NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i,
- std::to_string(expected_lanes[i]).c_str(),
- std::to_string(actual_lanes[i]).c_str());
+ expected_str, actual_str);
@@ -458,11 +458,8 @@ struct ForeachSizeR<T, 0, kMinLanes, Tes
// These adapters may be called directly, or via For*Types:
-// Calls Test for all powers of two in [kMinLanes, kMaxLanes / kDivLanes].
-// kMaxLanes is used for HWY_GATHER_LANES etc; use a large default because we
-// don't have access to T in the template argument list.
-template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1,
- size_t kMaxLanes = 1ul << 30>
+// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes].
+template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1>
struct ForPartialVectors {
template <typename T>
void operator()(T /*unused*/) const {
@@ -470,8 +467,8 @@ struct ForPartialVectors {
// Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full.
ForeachSizeR<T, 8 / kDivLanes, HWY_LANES(T), Test>::Do();
- ForeachSizeR<T, HWY_MIN(kMaxLanes, HWY_LANES(T)) / kDivLanes / kMinLanes,
- kMinLanes, Test>::Do();
+ ForeachSizeR<T, HWY_LANES(T) / kDivLanes / kMinLanes, kMinLanes,
+ Test>::Do();
@@ -505,33 +502,19 @@ struct ForGE128Vectors {
-// Calls Test for all powers of two in [128 bits, max bits/2].
-template <class Test>
+// Calls Test for all vectors that can be expanded by kFactor.
+template <class Test, size_t kFactor = 2>
struct ForExtendableVectors {
template <typename T>
void operator()(T /*unused*/) const {
- ForeachSizeR<T, 4, HWY_LANES(T), Test>::Do();
+ ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test>::Do();
- ForeachSizeR<T, HWY_LANES(T) / 2 / (16 / sizeof(T)), (16 / sizeof(T)),
+ ForeachSizeR<T, HWY_LANES(T) / kFactor / (16 / sizeof(T)), (16 / sizeof(T)),
-// Calls Test for full vectors only.
-template <class Test>
-struct ForFullVectors {
- template <typename T>
- void operator()(T t) const {
- ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
- (void)t;
- Test()(t, HWY_FULL(T)());
- }
// Type lists to shorten call sites:
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE
diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSE
diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSEE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSEE
diff -up chromium-91.0.4472.77/third_party/highway/src/Makefile.12 chromium-91.0.4472.77/third_party/highway/src/Makefile
diff -up chromium-91.0.4472.77/third_party/highway/src/MakefileE.12 chromium-91.0.4472.77/third_party/highway/src/MakefileE
diff -up chromium-91.0.4472.77/third_party/highway/src/README.md.12 chromium-91.0.4472.77/third_party/highway/src/README.md
--- chromium-91.0.4472.77/third_party/highway/src/README.md.12 2021-06-02 10:56:05.295904696 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/README.md 2021-05-31 10:37:11.000000000 -0400
@@ -15,15 +15,19 @@ applying the same operation to 'lanes'.
## Current status
Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD.
-A port to RVV is in progress.
+Ports to RVV and SVE/SVE2 are in progress.
Version 0.11 is considered stable enough to use in other projects, and is
expected to remain backwards compatible unless serious issues are discovered
while implementing SVE/RVV targets. After these targets are added, Highway will
reach version 1.0.
-Continuous integration tests use a recent version of Clang and older version of
-MSVC (VS2015). Also periodically tested on Clang 7-11 and GCC 8, 9 and 10.2.1.
+Continuous integration tests build with a recent version of Clang (running on
+x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).
+Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
+GCC cross-compile and QEMU. See the
+[testing process](g3doc/release_testing_process.md) for details.
The `contrib` directory contains SIMD-related utilities: an image class with
aligned rows, and a math library (16 functions already implemented, mostly
@@ -62,9 +66,11 @@ To test on all the attainable targets fo
default configuration skips baseline targets (e.g. scalar) that are superseded
by another baseline target.
+Bazel is also supported for building, but it is not as widely used/tested.
## Quick start
-You can use the `skeleton` examples inside examples/ as a starting point.
+You can use the `benchmark` inside examples/ as a starting point.
A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
and their parameters, and the [instruction_matrix][instmtx] indicates the
diff -up chromium-91.0.4472.77/third_party/highway/src/README.mdE.12 chromium-91.0.4472.77/third_party/highway/src/README.mdE
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.bat
--- chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 2021-06-02 10:56:05.293904685 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/run_tests.bat 2021-05-31 10:37:11.000000000 -0400
@@ -2,9 +2,9 @@
REM Switch directory of this batch file
cd %~dp0
-if not exist build mkdir build
+if not exist build_win mkdir build_win
-cd build
+cd build_win
cmake .. -G Ninja || goto error
ninja || goto error
ctest -j || goto error
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.batE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.batE
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.sh.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.sh
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.shE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.shE
diff -up chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time
diff -up chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1.12 chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1