diff -up chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/buildtools/third_party/libc++/trunk/test/std/utilities/time/time.hms/time diff -up chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10.12 chromium-91.0.4472.77/third_party/blink/web_tests/platform/mac-mac10 diff -up chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6.12 chromium-91.0.4472.77/third_party/catapult/telemetry/third_party/modulegraph/modulegraph_tests/testdata/nspkg/distribute-0.6 diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt --- chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.12 2021-06-02 10:56:05.305904746 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt 2021-05-31 10:37:11.000000000 -0400 @@ -19,7 +19,7 @@ if(POLICY CMP0083) cmake_policy(SET CMP0083 NEW) endif() -project(hwy VERSION 0.1) +project(hwy VERSION 0.12.2) # Keep in sync with highway.h version set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_EXTENSIONS OFF) @@ -40,6 +40,8 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE RelWithDebInfo) endif() +set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?") + include(CheckCXXSourceCompiles) check_cxx_source_compiles( "int main() { @@ -51,10 +53,13 @@ check_cxx_source_compiles( HWY_EMSCRIPTEN ) +set(HWY_CONTRIB_SOURCES + hwy/contrib/image/image.cc + hwy/contrib/image/image.h + hwy/contrib/math/math-inl.h +) + set(HWY_SOURCES - contrib/image/image.cc - contrib/image/image.h - contrib/math/math-inl.h hwy/aligned_allocator.cc hwy/aligned_allocator.h hwy/base.h @@ -64,6 +69,7 @@ set(HWY_SOURCES hwy/nanobenchmark.cc hwy/nanobenchmark.h hwy/ops/arm_neon-inl.h + hwy/ops/arm_sve-inl.h hwy/ops/scalar-inl.h hwy/ops/set_macros-inl.h hwy/ops/shared-inl.h @@ -146,13 +152,28 @@ else() -fno-exceptions ) endif() -endif() + + if (HWY_CMAKE_ARM7) + list(APPEND HWY_FLAGS + -march=armv7-a + -mfpu=neon-vfpv4 + -mfloat-abi=hard # must match the toolchain specified as CXX= + -mfp16-format=ieee # required for vcvt_f32_f16 + ) + endif() # HWY_CMAKE_ARM7 + +endif() # !MSVC add_library(hwy STATIC ${HWY_SOURCES}) target_compile_options(hwy PRIVATE ${HWY_FLAGS}) set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON) target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR}) +add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES}) +target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS}) +set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON) +target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR}) + # -------------------------------------------------------- install library install(TARGETS hwy DESTINATION "${CMAKE_INSTALL_LIBDIR}") @@ -166,9 +187,21 @@ foreach (source ${HWY_SOURCES}) endif() endforeach() -# Add a pkg-config file for libhwy and the test library. +install(TARGETS hwy_contrib + DESTINATION "${CMAKE_INSTALL_LIBDIR}") +# Install all the headers keeping the relative path to the current directory +# when installing them. +foreach (source ${HWY_CONTRIB_SOURCES}) + if ("${source}" MATCHES "\.h$") + get_filename_component(dirname "${source}" DIRECTORY) + install(FILES "${source}" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}") + endif() +endforeach() + +# Add a pkg-config file for libhwy and the contrib/test libraries. 
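The hunk above splits the contrib sources into a separate `hwy_contrib` static library with its own `libhwy-contrib.pc`, and installs the contrib headers under `${CMAKE_INSTALL_INCLUDEDIR}/hwy/contrib/...`, so a consumer now links both libraries (as the test targets later in this hunk do). A minimal sketch of such a consumer — the `hwy::ImageF` type and its `xsize()`/`ysize()`/`Row()` accessors are assumptions about the contrib image API, not shown in this patch:

```
// Hypothetical consumer of the split libraries; link against both libhwy and
// libhwy_contrib (e.g. pkg-config: libhwy libhwy-contrib).
#include <cstddef>

#include "hwy/contrib/image/image.h"  // installed by the contrib foreach() above

int main() {
  // Assumed API: ImageF(xsize, ysize) allocates an aligned float image and
  // Row(y) returns a pointer to the first pixel of row y.
  hwy::ImageF img(640, 480);
  for (size_t y = 0; y < img.ysize(); ++y) {
    float* row = img.Row(y);
    for (size_t x = 0; x < img.xsize(); ++x) {
      row[x] = static_cast<float>(x + y);
    }
  }
  return static_cast<int>(img.Row(0)[0]);  // use the data so it is not elided
}
```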
set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}") -foreach (pc libhwy.pc libhwy-test.pc) +foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") @@ -193,34 +226,13 @@ add_custom_command(TARGET hwy POST_BUILD # Avoids mismatch between GTest's static CRT and our dynamic. set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) -add_executable(skeleton hwy/examples/skeleton_main.cc) -target_sources(skeleton PRIVATE - hwy/examples/skeleton-inl.h - hwy/examples/skeleton.cc - hwy/examples/skeleton.h - hwy/examples/skeleton_shared.h) -# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to -# observe the difference in targets printed. -target_compile_options(skeleton PRIVATE ${HWY_FLAGS}) -target_link_libraries(skeleton hwy) -set_target_properties(skeleton - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/") - -# Similar: shared headers but without the runtime dispatch in skeleton.cc/h -add_executable(skeleton_static hwy/examples/skeleton_static_main.cc) -target_sources(skeleton_static PRIVATE - hwy/examples/skeleton-inl.h - hwy/examples/skeleton_shared.h) -target_compile_options(skeleton_static PRIVATE ${HWY_FLAGS}) -target_link_libraries(skeleton_static hwy) -set_target_properties(skeleton_static - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/") - # Programming exercise with integrated benchmark add_executable(hwy_benchmark hwy/examples/benchmark.cc) target_sources(hwy_benchmark PRIVATE hwy/nanobenchmark.cc hwy/nanobenchmark.h) +# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to +# observe the difference in targets printed. target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS}) target_link_libraries(hwy_benchmark hwy) set_target_properties(hwy_benchmark @@ -272,19 +284,21 @@ endif() endif() # HWY_SYSTEM_GTEST set(HWY_TEST_FILES - contrib/image/image_test.cc - # contrib/math/math_test.cc + hwy/contrib/image/image_test.cc + # hwy/contrib/math/math_test.cc + hwy/aligned_allocator_test.cc + hwy/base_test.cc + hwy/highway_test.cc + hwy/targets_test.cc hwy/examples/skeleton_test.cc hwy/tests/arithmetic_test.cc hwy/tests/combine_test.cc hwy/tests/compare_test.cc hwy/tests/convert_test.cc - hwy/tests/hwy_test.cc hwy/tests/logical_test.cc hwy/tests/memory_test.cc hwy/tests/swizzle_test.cc - hwy/aligned_allocator_test.cc - hwy/targets_test.cc + hwy/tests/test_util_test.cc ) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests) @@ -293,11 +307,16 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILE get_filename_component(TESTNAME ${TESTFILE} NAME_WE) add_executable(${TESTNAME} ${TESTFILE}) target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS}) + # Test all targets, not just the best/baseline. This changes the default + # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can + # cause compile errors because only one may be set, and other CMakeLists.txt + # that include us may set them. + target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1) if(HWY_SYSTEM_GTEST) - target_link_libraries(${TESTNAME} hwy GTest::GTest GTest::Main) + target_link_libraries(${TESTNAME} hwy hwy_contrib GTest::GTest GTest::Main) else() - target_link_libraries(${TESTNAME} hwy gtest gtest_main) + target_link_libraries(${TESTNAME} hwy hwy_contrib gtest gtest_main) endif() # Output test targets in the test directory. 
set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/") diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txtE diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.in diff -up chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE.12 chromium-91.0.4472.77/third_party/highway/src/CMakeLists.txt.inE diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.cc diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.h diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image.hE diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.cc diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/image/image_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.h diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.cc diff -up chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/contrib/math/math_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTING diff -up chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE.12 chromium-91.0.4472.77/third_party/highway/src/CONTRIBUTINGE diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelog --- chromium-91.0.4472.77/third_party/highway/src/debian/changelog.12 2021-06-02 10:56:05.151903967 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/debian/changelog 2021-05-31 10:37:11.000000000 -0400 @@ -1,3 +1,26 @@ +highway (0.12.2-1) UNRELEASED; urgency=medium + + * fix scalar-only test and Windows macro conflict with Load/StoreFence + * replace deprecated wasm intrinsics + + -- Jan Wassenberg Mon, 31 May 2021 16:00:00 +0200 + +highway (0.12.1-1) UNRELEASED; urgency=medium + + * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors + * fix warnings, faster ARM div/sqrt, separate hwy_contrib library + * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC + + -- Jan Wassenberg Wed, 19 May 2021 15:00:00 +0200 + +highway (0.12.0-1) UNRELEASED; urgency=medium + + * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4 + * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES + * Proper IEEE rounding, reduce libstdc++ usage, inlined math + + -- Jan Wassenberg Thu, 15 Apr 2021 20:00:00 +0200 + highway (0.11.1-1) UNRELEASED; urgency=medium * Fix clang7 asan error, finish f16 
conversions and add test diff -up chromium-91.0.4472.77/third_party/highway/src/debian/changelogE.12 chromium-91.0.4472.77/third_party/highway/src/debian/changelogE diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compat.12 chromium-91.0.4472.77/third_party/highway/src/debian/compat diff -up chromium-91.0.4472.77/third_party/highway/src/debian/compatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/compatE diff -up chromium-91.0.4472.77/third_party/highway/src/debian/control.12 chromium-91.0.4472.77/third_party/highway/src/debian/control diff -up chromium-91.0.4472.77/third_party/highway/src/debian/controlE.12 chromium-91.0.4472.77/third_party/highway/src/debian/controlE diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyright.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyright diff -up chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE.12 chromium-91.0.4472.77/third_party/highway/src/debian/copyrightE diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rules.12 chromium-91.0.4472.77/third_party/highway/src/debian/rules diff -up chromium-91.0.4472.77/third_party/highway/src/debian/rulesE.12 chromium-91.0.4472.77/third_party/highway/src/debian/rulesE diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/format.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/format diff -up chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE.12 chromium-91.0.4472.77/third_party/highway/src/debian/source/formatE diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf Binary files chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf.12 and chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdf differ diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/highway_intro.pdfE diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdf diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/instruction_matrix.pdfE diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md --- chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md.12 2021-06-02 10:56:05.117903795 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.md 2021-05-31 10:37:11.000000000 -0400 @@ -33,6 +33,12 @@ The public headers are: * hwy/cache_control.h: defines stand-alone functions to control caching (e.g. prefetching) and memory barriers, independent of actual SIMD. +* hwy/nanobenchmark.h: library for precisely measuring elapsed time (under + varying inputs) for benchmarking small/medium regions of code. + +* hwy/tests/test_util-inl.h: defines macros for invoking tests on all + available targets, plus per-target functions useful in tests (e.g. Print). + SIMD implementations must be preceded and followed by the following: ``` @@ -61,76 +67,76 @@ HWY_AFTER_NAMESPACE(); ## Vector and descriptor types -Highway vectors consist of one or more 'lanes' of the same built-in type `T = -uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `T = float##_t` for `## = 16, -32, 64`. 
`float16_t` is an IEEE binary16 half-float and only supports load, -store, and conversion to/from `float32_t`; infinity or NaN have -implementation-defined results. - -Each vector has `N` lanes (a power of two, possibly unknown at compile time). - -Platforms such as x86 support multiple vector types, and other platforms require -that vectors are built-in types. On RVV, vectors are sizeless and thus cannot be -wrapped inside a class. The Highway API satisfies these constraints because it -is designed around overloaded functions selected via a zero-sized tag parameter -`d` of type `D = Simd`. These are typically constructed using aliases: - -* `const HWY_FULL(T[, LMUL=1]) d;` chooses an `N` that results in a native - vector for the current target. For targets (e.g. RVV) that support register - groups, the optional `LMUL` (1, 2, 4, 8) specifies the number of registers - in the group. This effectively multiplies the lane count in each operation - by `LMUL`. For mixed-precision code, `LMUL` must be at least the ratio of - the sizes of the largest and smallest type. `LMUL > 1` is more efficient on - single-issue machines, but larger values reduce the effective number of - registers, which may cause the compiler to spill them to memory. +Highway vectors consist of one or more 'lanes' of the same built-in type +`uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `float##_t` for `## = 16, 32, +64`. + +In Highway, `float16_t` (an IEEE binary16 half-float) only supports load, store, +and conversion to/from `float32_t`; the behavior of `float16_t` infinity and NaN +are implementation-defined due to ARMv7. + +On RVV, vectors are sizeless and cannot be wrapped inside a class. The Highway +API allows using built-in types as vectors because operations are expressed as +overloaded functions. Instead of constructors, overloaded initialization +functions such as `Set` take a zero-sized tag argument called `d` of type `D = +Simd` and return an actual vector of unspecified type. + +`T` is one of the lane types above, and may be retrieved via `TFromD`. + +`N` is target-dependent and not directly user-specified. The actual lane count +may not be known at compile time, but can be obtained via `Lanes(d)`. Use this +value, which is potentially different from `N`, to increment loop counters etc. +It is typically a power of two, but that is not guaranteed e.g. on SVE. + +`d` lvalues (a tag, NOT actual vector) are typically obtained using two aliases: + +* Most common: pass `HWY_FULL(T[, LMUL=1]) d;` as an argument to return a + native vector. This is preferred because it fully utilizes vector lanes. + + For targets (e.g. RVV) that support register groups, the optional `LMUL` (1, + 2, 4, 8) specifies the number of registers in the group. This effectively + multiplies the lane count in each operation by `LMUL`. For mixed-precision + code, `LMUL` must be at least the ratio of the sizes of the largest and + smallest type. `LMUL > 1` is more efficient on single-issue machines, but + larger values reduce the effective number of registers, which may cause the + compiler to spill them to memory. + +* Less common: pass `HWY_CAPPED(T, N) d;` as an argument to return a vector + which may be native width, but no more than `N` lanes have observable + effects such as loading/storing to memory. This is less performance-portable + because it may not use all available lanes. Note that the resulting lane + count may also be less than `N`. + + For targets (e.g. 
RVV) that have compile-time-unknown lane counts, such + vectors incur additional runtime cost in `Load` etc. + +User-specified lane counts or tuples of vectors could cause spills on targets +with fewer or smaller vectors. By contrast, Highway encourages vector-length +agnostic code, which is more performance-portable. + +Given that lane counts are potentially compile-time-unknown, storage for vectors +should be dynamically allocated, e.g. via `AllocateAligned(Lanes(d))`. For +applications that require a compile-time estimate, `MaxLanes(d)` returns the `N` +from `Simd`, which is NOT necessarily the actual lane count. This is +DISCOURAGED because it is not guaranteed to be an upper bound (RVV vectors may +be very large) and some compilers are not able to interpret it as constexpr. -* `const HWY_CAPPED(T, N) d;` for up to `N` lanes. - -For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), descriptors -for the smaller types must be obtained from those of the larger type (e.g. via +For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), tags for +the smaller types must be obtained from those of the larger type (e.g. via `Rebind`). -The type `T` may be accessed as `TFromD`. There are three possibilities for -the template parameter `N`: - -1. Equal to the hardware vector width, e.g. when using `HWY_FULL(T)` on a - target with compile-time constant vectors. +## Using unspecified vector types -1. Less than the hardware vector width. This is the result of a compile-time - decision by the user, i.e. using `HWY_CAPPED(T, N)` to limit the number of - lanes, even when the hardware vector width could be greater. - -1. Unrelated to the hardware vector width, e.g. when the hardware vector width - is not known at compile-time and may be very large. - -In all cases, `Lanes(d)` returns the actual number of lanes, i.e. the amount by -which to advance loop counters. `MaxLanes(d)` returns the `N` from `Simd`, -which is NOT necessarily the actual vector size (see above) and some compilers -are not able to interpret it as constexpr. Instead of `MaxLanes`, prefer to use -alternatives, e.g. `Rebind` or `aligned_allocator.h` for dynamic allocation of -`Lanes(d)` elements. - -Highway is designed to map a vector variable to a (possibly partial) hardware -register or register group. By discouraging user-specified `N` and tuples of -vector variables, we improve performance portability (e.g. by reducing spills to -memory for platforms that have smaller vectors than the developer expected). - -To construct vectors, call factory functions (see "Initialization" below) with -a tag parameter `d`. - -Local variables typically use auto for type deduction. For some generic -functions, a template argument `V` is sufficient: `template V Squared(V -v) { return v * v; }`. In general, functions have a `D` template argument and -can return vectors of type `Vec`. - -Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined -functions reside in `project::[nested]::HWY_NAMESPACE`. Because all Highway -functions generally take either a `Simd` or vector argument, which are also -defined in namespace `hwy`, they will typically be found via Argument-Dependent -Lookup and namespace qualifiers are not necessary. As an exception, Highway -functions that are templates (e.g. because they require a compile-time argument -such as a lane index or shift count) require a using-declaration such as -`using hwy::HWY_NAMESPACE::ShiftLeft`. 
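To make the tag/`Lanes` pattern described above concrete, here is a minimal width-agnostic function in the style of the `MulAddLoop` example this patch adds to `hwy/examples/skeleton-inl.h`; the `project` namespace is a placeholder, and `size` is assumed to be a multiple of `Lanes(d)`:

```
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {

// Highway ops live in hwy::HWY_NAMESPACE; ADL does not find templates.
using namespace hwy::HWY_NAMESPACE;

// Computes x[i] = mul[i] * x[i] + add[i]. The zero-sized tag `d` selects the
// overloads, Lanes(d) is the runtime lane count, and local vectors use auto
// because their type is unspecified.
template <class D, typename T>
void MulAddLoop(const D d, const T* HWY_RESTRICT mul, const T* HWY_RESTRICT add,
                const size_t size, T* HWY_RESTRICT x) {
  for (size_t i = 0; i < size; i += Lanes(d)) {
    const auto mul_v = Load(d, mul + i);
    auto x_v = Load(d, x + i);
    x_v = MulAdd(mul_v, x_v, Load(d, add + i));
    Store(x_v, d, x + i);
  }
}

// Caller: const HWY_FULL(float) d;  MulAddLoop(d, mul, add, size, x);

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
```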
+Because vector types are unspecified, local vector variables are typically +defined using `auto` for type deduction. A template argument `V` suffices for +simple generic functions: `template V Squared(V v) { return v * v; }`. + +Many functions will need a `D` template argument in order to initialize any +constants. They can use a separate `V` template argument for vectors, or use +`Vec`, or where an lvalue `d` is available, `decltype(Zero(d))`. Using such +aliases instead of auto may improve readability of mixed-type code. They can +also be used for member variables, which are discouraged because compilers often +have difficulty mapping them to registers. ## Operations @@ -141,6 +147,14 @@ unsigned, signed, and floating-point typ bits per lane: 8, 16, 32, or 64. Any combination of the specified prefixes and bits are allowed. Abbreviations of the form `u32 = {u}{32}` may also be used. +Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined +functions reside in `project::[nested]::HWY_NAMESPACE`. Highway functions +generally take either a `Simd` or vector/mask argument. For targets where +vectors and masks are defined in namespace `hwy`, the functions will be found +via Argument-Dependent Lookup. However, this does not work for function +templates, and RVV and SVE both use builtin vectors. Thus we recommend a `using +hwy::HWY_NAMESPACE;` directive inside `project::[nested]::HWY_NAMESPACE`. + ### Initialization * V **Zero**(D): returns N-lane vector with all bits set to 0. @@ -162,7 +176,7 @@ bits are allowed. Abbreviations of the f * `V`: `{i,f}` \ V **Neg**(V a): returns `-a[i]`. -* `V`: `{i}{8,16,32}, {f}` \ +* `V`: `{i,f}` \ V **Abs**(V a) returns the absolute value of `a[i]`; for integers, `LimitsMin()` maps to `LimitsMax() + 1`. @@ -252,23 +266,24 @@ Left-shifting signed `T` and right-shift shifting `MakeUnsigned` and casting to `T`. Right-shifting negative signed `T` is the same as an unsigned shift, except that 1-bits are shifted in. -Compile-time constant shifts, generally the most efficient variant: +Compile-time constant shifts, generally the most efficient variant (though 8-bit +shifts are potentially slower than other lane sizes): -* `V`: `{u,i}{16,32,64}` \ +* `V`: `{u,i}` \ V **ShiftLeft**<int>(V a) returns `a[i] << int`. -* `V`: `{u,i}{16,32,64}` \ +* `V`: `{u,i}` \ V **ShiftRight**<int>(V a) returns `a[i] >> int`. Shift all lanes by the same (not necessarily compile-time constant) amount: -* `V`: `{u,i}{16,32,64}` \ +* `V`: `{u,i}` \ V **ShiftLeftSame**(V a, int bits) returns `a[i] << bits`. -* `V`: `{u,i}{16,32,64}` \ +* `V`: `{u,i}` \ V **ShiftRightSame**(V a, int bits) returns `a[i] >> bits`. -Per-lane variable shifts (slow if SSE4, or Shr i64 on AVX2): +Per-lane variable shifts (slow if SSE4, or 16-bit, or Shr i64 on AVX2): * `V`: `{u,i}{16,32,64}` \ V **operator<<**(V a, V b) returns `a[i] << b[i]`. @@ -332,12 +347,17 @@ Special functions for signed types: slightly more efficient; requires the first argument to be non-negative. * `V`: `i32/64` \ - V **BroadcastSignBit(V a) returns `a[i] < 0 ? -1 : 0`. + V **BroadcastSignBit**(V a) returns `a[i] < 0 ? -1 : 0`. ### Masks Let `M` denote a mask capable of storing true/false for each lane. +* M **FirstN**(D, size_t N): returns mask with the first `N` + lanes (those with index `< N`) true. `N` larger than `Lanes(D())` result in + an all-true mask. Useful for implementing "masked" stores by loading `prev` + followed by `IfThenElse(FirstN(d, N), what_to_store, prev)`. 
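As an illustration (a sketch, not part of the patch), the `FirstN` bullet above translates into the following tail-handling helper. The name `StoreUpToN` is hypothetical; note that `Load`/`Store` still touch a full vector's worth of memory, so `p` must point at `Lanes(d)` accessible elements:

```
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;

// Stores only the first `remaining` (<= Lanes(d)) lanes of `v` to `p`; the
// remaining lanes of that region are rewritten with their previous contents.
template <class D, class V>
void StoreUpToN(const D d, V v, size_t remaining, TFromD<D>* HWY_RESTRICT p) {
  const auto prev = Load(d, p);            // existing contents
  const auto mask = FirstN(d, remaining);  // true for lanes with index < remaining
  Store(IfThenElse(mask, v, prev), d, p);  // blend, then full-vector store
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
```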
+ * M1 **RebindMask**(D, M2 m): returns same mask bits as `m`, but reinterpreted as a mask for lanes of type `TFromD`. `M1` and `M2` must have the same number of lanes. @@ -389,17 +409,18 @@ Let `M` denote a mask capable of storing * size_t **CountTrue**(M m): returns how many of `m[i]` are true [0, N]. This is typically more expensive than AllTrue/False. -* `V`: `{u,i,f}{32,64}` \ +* `V`: `{u,i,f}{16,32,64}` \ V **Compress**(V v, M m): returns `r` such that `r[n]` is `v[i]`, with `i` the n-th lane index (starting from 0) where `m[i]` is true. Compacts lanes whose mask is set into the lower lanes; upper lanes are - implementation-defined. + implementation-defined. Slow with 16-bit lanes. -* `V`: `{u,i,f}{32,64}` \ +* `V`: `{u,i,f}{16,32,64}` \ size_t **CompressStore**(V v, M m, D, T* aligned): writes lanes whose mask is set into `aligned`, starting from lane 0. Returns `CountTrue(m)`, the number of valid lanes. All subsequent lanes may be - overwritten! Alignment ensures inactive lanes will not cause faults. + overwritten! Alignment ensures inactive lanes will not cause faults. Slow + with 16-bit lanes. ### Comparisons @@ -429,10 +450,16 @@ Memory operands are little-endian, other lane configuration. Pointers are the addresses of `N` consecutive `T` values, either naturally-aligned (`aligned`) or possibly unaligned (`p`). +**Note**: computations with low arithmetic intensity (FLOP/s per memory traffic +bytes), e.g. dot product, can be *1.5 times as fast* when the memory operands +are naturally aligned. An unaligned access may require two load ports. + #### Load * Vec<D> **Load**(D, const T* aligned): returns - `aligned[i]`. + `aligned[i]`. May fault if the pointer is not aligned to the vector size. + Using this whenever possible improves codegen on SSE4: unlike `LoadU`, + `Load` can be fused into a memory operand, which reduces register pressure. * Vec<D> **LoadU**(D, const T* p): returns `p[i]`. * Vec<D> **LoadDup128**(D, const T* p): returns one 128-bit @@ -440,19 +467,31 @@ either naturally-aligned (`aligned`) or be faster than broadcasting single values, and is more convenient than preparing constants for the actual vector length. -#### Gather +#### Scatter/Gather -**Note**: Vectors must be `HWY_CAPPED(T, HWY_GATHER_LANES(T))`: +**Note**: Offsets/indices are of type `VI = Vec>` and need not +be unique. The results are implementation-defined if any are negative. -* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \ - Vec<D> **GatherOffset**(D, const T* base, VI offsets). - Returns elements of base selected by possibly repeated *byte* `offsets[i]`. - Results are implementation-defined if `offsets[i]` is negative. - -* `V`,`VI`: (`{u,i,f}{32},i32`), (`{u,i,f}{64},i64`) \ - Vec<D> **GatherIndex**(D, const T* base, VI indices). - Returns vector of `base[indices[i]]`. Indices need not be unique, but - results are implementation-defined if they are negative. +**Note**: Where possible, applications should `Load/Store/TableLookup*` entire +vectors, which is much faster than `Scatter/Gather`. Otherwise, code of the form +`dst[tbl[i]] = F(src[i])` should when possible be transformed to `dst[i] = +F(src[tbl[i]])` because `Scatter` is more expensive than `Gather`. + +* `D`: `{u,i,f}{32,64}` \ + void **ScatterOffset**(Vec<D> v, D, const T* base, VI + offsets): stores `v[i]` to the base address plus *byte* `offsets[i]`. + +* `D`: `{u,i,f}{32,64}` \ + void **ScatterIndex**(Vec<D> v, D, const T* base, VI + indices): stores `v[i]` to `base[indices[i]]`. 
+ +* `D`: `{u,i,f}{32,64}` \ + Vec<D> **GatherOffset**(D, const T* base, VI offsets): + returns elements of base selected by *byte* `offsets[i]`. + +* `D`: `{u,i,f}{32,64}` \ + Vec<D> **GatherIndex**(D, const T* base, VI indices): + returns vector of `base[indices[i]]`. #### Store @@ -462,6 +501,17 @@ either naturally-aligned (`aligned`) or * void **StoreU**(Vec<D> a, D, T* p): as Store, but without the alignment requirement. +* `D`: `u8` \ + void **StoreInterleaved3**(Vec<D> v0, Vec<D> v1, + Vec<D> v2, D, T* p): equivalent to shuffling `v0, v1, v2` + followed by three `StoreU()`, such that `p[0] == v0[0], p[1] == v1[0], + p[2] == v1[0]`. Useful for RGB samples. + +* `D`: `u8` \ + void **StoreInterleaved4**(Vec<D> v0, Vec<D> v1, + Vec<D> v2, Vec<D> v3, D, T* p): as above, but for four + vectors (e.g. RGBA samples). + ### Cache control All functions except Stream are defined in cache_control.h. @@ -483,6 +533,9 @@ All functions except Stream are defined * void **Prefetch**(const T* p): begins loading the cache line containing "p". +* void **Pause**(): when called inside a spin-loop, may reduce + power consumption. + ### Type conversion * Vec<D> **BitCast**(D, V): returns the bits of `V` @@ -525,7 +578,8 @@ if the input exceeds the destination ran zero and converts the value to same-sized integer. * `V`: `f32`; `Ret`: `i32` \ - Ret **NearestInt**(V a): returns the integer nearest to `a[i]`. + Ret **NearestInt**(V a): returns the integer nearest to `a[i]`; + results are undefined for NaN. ### Swizzle @@ -652,9 +706,9 @@ more expensive on AVX2/AVX-512 than with ### Reductions -**Note**: the following are only available for full vectors (including scalar). -These 'reduce' all lanes to a single result. This result is broadcasted to all -lanes at no extra cost; you can use `GetLane` to obtain the value. +**Note**: these 'reduce' all lanes to a single result (e.g. sum), which is +broadcasted to all lanes at no extra cost. To obtain a scalar, you can call +`GetLane`. Being a horizontal operation (across lanes of the same vector), these are slower than normal SIMD operations and are typically used outside critical loops. @@ -697,9 +751,6 @@ generate such instructions (implying the finally reverts to `HWY_STATIC_TARGET`. Can be used in `#if` expressions to provide an alternative to functions which are not supported by HWY_SCALAR. -* `HWY_LANES(T)`: how many lanes of type `T` in a full vector (>= 1). Used by - HWY_FULL/CAPPED. Note: cannot be used in #if because it uses sizeof. - * `HWY_IDE` is 0 except when parsed by IDEs; adding it to conditions such as `#if HWY_TARGET != HWY_SCALAR || HWY_IDE` avoids code appearing greyed out. @@ -707,26 +758,15 @@ The following signal capabilities and ex * `HWY_CAP_INTEGER64`: support for 64-bit signed/unsigned integer lanes. * `HWY_CAP_FLOAT64`: support for double-precision floating-point lanes. + +The following were used to signal the maximum number of lanes for certain +operations, but this is no longer necessary (nor possible on SVE/RVV), so they +are DEPRECATED: + +* `HWY_GATHER_LANES(T)`. * `HWY_CAP_GE256`: the current target supports vectors of >= 256 bits. * `HWY_CAP_GE512`: the current target supports vectors of >= 512 bits. -The following indicate the maximum number of lanes for certain operations. For -targets that support the feature/operation, the macro evaluates to -`HWY_LANES(T)`, otherwise 1. Using `HWY_CAPPED(T, HWY_GATHER_LANES(T))` -generates the best possible code (or scalar fallback) from the same source code. 
- -* `HWY_GATHER_LANES(T)`: supports GatherIndex/Offset. -* `HWY_VARIABLE_SHIFT_LANES(T)`: supports per-lane shift amounts (v1 << v2). - DEPRECATED, this always matches HWY_LANES(T) and will be removed. - -As above, but the feature implies the type so there is no T parameter, thus -these can be used in `#if` expressions. - -* `HWY_COMPARE64_LANES`: 64-bit signed integer comparisons. DEPRECATED, this - always matches HWY_LANES(int64_t) and will be removed. -* `HWY_MINMAX64_LANES`: 64-bit signed/unsigned integer min/max. DEPRECATED, - this always matches HWY_LANES(int64_t) and will be removed. - ## Detecting supported targets `SupportedTargets()` returns a cached (initialized on-demand) bitfield of the @@ -778,8 +818,10 @@ policy for selecting `HWY_TARGETS`: and permitted by the compiler, independently of autovectorization), which maximizes coverage in tests. -If none are defined, the default is to select all attainable targets except any -non-best baseline (typically `HWY_SCALAR`), which reduces code size. +If none are defined, but `HWY_IS_TEST` is defined, the default is +`HWY_COMPILE_ALL_ATTAINABLE`. Otherwise, the default is to select all attainable +targets except any non-best baseline (typically `HWY_SCALAR`), which reduces +code size. ## Compiler support @@ -787,7 +829,8 @@ Clang and GCC require e.g. -mavx2 flags However, this enables AVX2 instructions in the entire translation unit, which may violate the one-definition rule and cause crashes. Instead, we use target-specific attributes introduced via #pragma. Function using SIMD must -reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. +reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. Alternatively, +individual functions or lambdas may be prefixed with `HWY_ATTR`. Immediates (compile-time constants) are specified as template arguments to avoid constant-propagation issues with Clang on ARM. diff -up chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE.12 chromium-91.0.4472.77/third_party/highway/src/g3doc/quick_reference.mdE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.cc diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h.12 2021-06-02 10:56:05.278904609 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.h 2021-05-31 10:37:11.000000000 -0400 @@ -111,6 +111,32 @@ AlignedUniquePtr MakeUniqueAligned(Ar new (ptr) T(std::forward(args)...), AlignedDeleter()); } +// Helpers for array allocators (avoids overflow) +namespace detail { + +// Returns x such that 1u << x == n (if n is a power of two). +static inline constexpr size_t ShiftCount(size_t n) { + return (n <= 1) ? 0 : 1 + ShiftCount(n / 2); +} + +template +T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) { + constexpr size_t size = sizeof(T); + + constexpr bool is_pow2 = (size & (size - 1)) == 0; + constexpr size_t bits = ShiftCount(size); + static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect"); + + const size_t bytes = is_pow2 ? items << bits : items * size; + const size_t check = is_pow2 ? 
bytes >> bits : bytes / size; + if (check != items) { + return nullptr; // overflowed + } + return static_cast(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr)); +} + +} // namespace detail + // Aligned memory equivalent of make_unique for array types using the // custom allocators alloc/free. This function calls the constructor with the // passed Args... on every created item. The destructor of each element will be @@ -118,10 +144,11 @@ AlignedUniquePtr MakeUniqueAligned(Ar template AlignedUniquePtr MakeUniqueAlignedArrayWithAlloc( size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) { - T* ptr = - static_cast(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)); - for (size_t i = 0; i < items; i++) { - new (ptr + i) T(std::forward(args)...); + T* ptr = detail::AllocateAlignedItems(items, alloc, opaque); + if (ptr != nullptr) { + for (size_t i = 0; i < items; i++) { + new (ptr + i) T(std::forward(args)...); + } } return AlignedUniquePtr(ptr, AlignedDeleter(free, opaque)); } @@ -165,7 +192,7 @@ template AlignedFreeUniquePtr AllocateAligned(const size_t items, AllocPtr alloc, FreePtr free, void* opaque) { return AlignedFreeUniquePtr( - static_cast(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)), + detail::AllocateAlignedItems(items, alloc, opaque), AlignedFreer(free, opaque)); } diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc.12 2021-06-02 10:56:05.273904584 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -16,6 +16,7 @@ #include +#include #include #include #include @@ -87,13 +88,39 @@ TEST(AlignedAllocatorTest, FreeNullptr) /*opaque_ptr=*/nullptr); } +TEST(AlignedAllocatorTest, Log2) { + EXPECT_EQ(0u, detail::ShiftCount(1)); + EXPECT_EQ(1u, detail::ShiftCount(2)); + EXPECT_EQ(3u, detail::ShiftCount(8)); +} + +// Allocator returns null when it detects overflow of items * sizeof(T). +TEST(AlignedAllocatorTest, Overflow) { + constexpr size_t max = ~size_t(0); + constexpr size_t msb = (max >> 1) + 1; + using Size5 = std::array; + using Size10 = std::array; + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(max / 2, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(max / 3, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(max / 4, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(msb, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(msb + 1, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(msb / 4, nullptr, nullptr)); +} + TEST(AlignedAllocatorTest, AllocDefaultPointers) { const size_t kSize = 7777; void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr); ASSERT_NE(nullptr, ptr); // Make sure the pointer is actually aligned. 
- EXPECT_EQ(0, reinterpret_cast(ptr) % kMaxVectorSize); + EXPECT_EQ(0U, reinterpret_cast(ptr) % kMaxVectorSize); char* p = static_cast(ptr); size_t ret = 0; for (size_t i = 0; i < kSize; i++) { @@ -101,7 +128,7 @@ TEST(AlignedAllocatorTest, AllocDefaultP p[i] = static_cast(i & 0x7F); if (i) ret += p[i] * p[i - 1]; } - EXPECT_NE(0, ret); + EXPECT_NE(0U, ret); FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr); } @@ -123,11 +150,11 @@ TEST(AlignedAllocatorTest, CustomAlloc) AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc); ASSERT_NE(nullptr, ptr); // We should have only requested one alloc from the allocator. - EXPECT_EQ(1u, fake_alloc.PendingAllocs()); + EXPECT_EQ(1U, fake_alloc.PendingAllocs()); // Make sure the pointer is actually aligned. - EXPECT_EQ(0, reinterpret_cast(ptr) % kMaxVectorSize); + EXPECT_EQ(0U, reinterpret_cast(ptr) % kMaxVectorSize); FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc); - EXPECT_EQ(0u, fake_alloc.PendingAllocs()); + EXPECT_EQ(0U, fake_alloc.PendingAllocs()); } TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) { @@ -170,7 +197,7 @@ TEST(AlignedAllocatorTest, MakeUniqueAli TEST(AlignedAllocatorTest, AllocSingleInt) { auto ptr = AllocateAligned(1); ASSERT_NE(nullptr, ptr.get()); - EXPECT_EQ(0, reinterpret_cast(ptr.get()) % kMaxVectorSize); + EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % kMaxVectorSize); // Force delete of the unique_ptr now to check that it doesn't crash. ptr.reset(nullptr); EXPECT_EQ(nullptr, ptr.get()); @@ -180,7 +207,7 @@ TEST(AlignedAllocatorTest, AllocMultiple const size_t kSize = 7777; auto ptr = AllocateAligned(kSize); ASSERT_NE(nullptr, ptr.get()); - EXPECT_EQ(0, reinterpret_cast(ptr.get()) % kMaxVectorSize); + EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % kMaxVectorSize); // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the // underlying type chosen by AllocateAligned() for the std::unique_ptr. EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1])); @@ -191,7 +218,7 @@ TEST(AlignedAllocatorTest, AllocMultiple ptr[i] = static_cast(i); if (i) ret += ptr[i] * ptr[i - 1]; } - EXPECT_NE(0, ret); + EXPECT_NE(0U, ret); } TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) { @@ -215,7 +242,8 @@ TEST(AlignedAllocatorTest, MakeUniqueAli auto arr = MakeUniqueAlignedArrayWithAlloc>( 7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc, &counter); - // An array shold still only call a single allocation. + ASSERT_NE(nullptr, arr.get()); + // An array should still only call a single allocation. EXPECT_EQ(1u, fake_alloc.PendingAllocs()); EXPECT_EQ(7, counter); for (size_t i = 0; i < 7; i++) { diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/aligned_allocator_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/base.h.12 2021-06-02 10:56:05.266904549 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/base.h 2021-05-31 10:37:11.000000000 -0400 @@ -34,7 +34,10 @@ //------------------------------------------------------------------------------ // Detect compiler using predefined macros -#ifdef _MSC_VER +// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like +// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that +// purpose. 
+#if defined(_MSC_VER) && !defined(__clang__) #define HWY_COMPILER_MSVC _MSC_VER #else #define HWY_COMPILER_MSVC 0 @@ -200,6 +203,10 @@ #define HWY_ARCH_X86_64 0 #endif +#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64 +#error "Cannot have both x86-32 and x86-64" +#endif + #if HWY_ARCH_X86_32 || HWY_ARCH_X86_64 #define HWY_ARCH_X86 1 #else @@ -212,14 +219,29 @@ #define HWY_ARCH_PPC 0 #endif -#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) +#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64) +#define HWY_ARCH_ARM_A64 1 +#else +#define HWY_ARCH_ARM_A64 0 +#endif + +#if defined(__arm__) || defined(_M_ARM) +#define HWY_ARCH_ARM_V7 1 +#else +#define HWY_ARCH_ARM_V7 0 +#endif + +#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7 +#error "Cannot have both A64 and V7" +#endif + +#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7 #define HWY_ARCH_ARM 1 #else #define HWY_ARCH_ARM 0 #endif -// There isn't yet a standard __wasm or __wasm__. -#ifdef __EMSCRIPTEN__ +#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__) #define HWY_ARCH_WASM 1 #else #define HWY_ARCH_WASM 0 @@ -231,9 +253,11 @@ #define HWY_ARCH_RVV 0 #endif +// It is an error to detect multiple architectures at the same time, but OK to +// detect none of the above. #if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \ - HWY_ARCH_RVV) != 1 -#error "Must detect exactly one platform" + HWY_ARCH_RVV) > 1 +#error "Must not detect more than one architecture" #endif //------------------------------------------------------------------------------ @@ -308,13 +332,26 @@ static constexpr HWY_MAYBE_UNUSED size_t // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name // by concatenating base type and bits. -// RVV already has a builtin type. -#if !HWY_ARCH_RVV +// RVV already has a builtin type and the GCC intrinsics require it. +#if HWY_ARCH_RVV && HWY_COMPILER_GCC +#define HWY_NATIVE_FLOAT16 1 +#else +#define HWY_NATIVE_FLOAT16 0 +#endif + +#if HWY_NATIVE_FLOAT16 +using float16_t = __fp16; +// Clang does not allow __fp16 arguments, but scalar.h requires LaneType +// arguments, so use a wrapper. +// TODO(janwas): replace with _Float16 when that is supported? +#else +#pragma pack(push, 1) struct float16_t { - // __fp16 cannot be used as a function parameter in clang, so use a wrapper. uint16_t bits; }; +#pragma pack(pop) #endif + using float32_t = float; using float64_t = double; @@ -506,6 +543,13 @@ struct Relations { using Narrow = int32_t; }; template <> +struct Relations { + using Unsigned = uint16_t; + using Signed = int16_t; + using Float = float16_t; + using Wide = float; +}; +template <> struct Relations { using Unsigned = uint32_t; using Signed = int32_t; @@ -551,13 +595,13 @@ constexpr inline size_t RoundUpTo(size_t // Undefined results for x == 0. 
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) { -#ifdef _MSC_VER +#if HWY_COMPILER_MSVC unsigned long index; // NOLINT _BitScanForward(&index, x); return index; -#else +#else // HWY_COMPILER_MSVC return static_cast(__builtin_ctz(x)); -#endif +#endif // HWY_COMPILER_MSVC } HWY_API size_t PopCount(uint64_t x) { @@ -565,7 +609,7 @@ HWY_API size_t PopCount(uint64_t x) { return static_cast(__builtin_popcountll(x)); #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 return _mm_popcnt_u64(x); -#elif HWY_COMPILER_MSVC +#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32)); #else x -= ((x >> 1) & 0x55555555U); diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/base.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h.12 2021-06-02 10:56:05.280904620 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.h 2021-05-31 10:37:11.000000000 -0400 @@ -20,7 +20,9 @@ #include "hwy/base.h" -#ifndef __SSE2__ +// Requires SSE2; fails to compile on 32-bit Clang 7 (see +// https://github.com/gperftools/gperftools/issues/946). +#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32) #undef HWY_DISABLE_CACHE_CONTROL #define HWY_DISABLE_CACHE_CONTROL #endif @@ -30,6 +32,14 @@ #include // SSE2 #endif +// Windows.h #defines these, which causes infinite recursion. Temporarily +// undefine them in this header; these functions are anyway deprecated. +// TODO(janwas): remove when these functions are removed. +#pragma push_macro("LoadFence") +#pragma push_macro("StoreFence") +#undef LoadFence +#undef StoreFence + namespace hwy { // Even if N*sizeof(T) is smaller, Stream may write a multiple of this size. @@ -81,6 +91,17 @@ HWY_INLINE HWY_ATTR_CACHE void FlushCach #endif } +// Reduces power consumption in spin-loops. No effect on non-x86. +HWY_INLINE HWY_ATTR_CACHE void Pause() { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_pause(); +#endif +} + } // namespace hwy +// TODO(janwas): remove when these functions are removed. (See above.) +#pragma pop_macro("StoreFence") +#pragma pop_macro("LoadFence") + #endif // HIGHWAY_HWY_CACHE_CONTROL_H_ diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/cache_control.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc.12 2021-06-02 10:56:05.195904190 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.cc 2021-05-31 10:37:11.000000000 -0400 @@ -19,7 +19,6 @@ #include #include -#include #include #include // iota @@ -37,15 +36,15 @@ using hwy::HWY_NAMESPACE::CombineShiftRi class TwoArray { public: - // Passed to ctor as a value NOT known to the compiler. Must be a multiple of - // the vector lane count * 8. + // Must be a multiple of the vector lane count * 8. 
static size_t NumItems() { return 3456; } - explicit TwoArray(const size_t num_items) - : a_(AllocateAligned(num_items * 2)), b_(a_.get() + num_items) { - const float init = num_items / NumItems(); // 1, but compiler doesn't know - std::iota(a_.get(), a_.get() + num_items, init); - std::iota(b_, b_ + num_items, init); + TwoArray() + : a_(AllocateAligned(NumItems() * 2)), b_(a_.get() + NumItems()) { + // = 1, but compiler doesn't know + const float init = static_cast(Unpredictable1()); + std::iota(a_.get(), a_.get() + NumItems(), init); + std::iota(b_, b_ + NumItems(), init); } protected: @@ -62,7 +61,7 @@ void RunBenchmark(const char* caption) { const FuncInput inputs[kNumInputs] = {num_items}; Result results[kNumInputs]; - Benchmark benchmark(num_items); + Benchmark benchmark; Params p; p.verbose = false; @@ -101,7 +100,7 @@ void Intro() { // 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold! class BenchmarkDot : public TwoArray { public: - explicit BenchmarkDot(size_t num_items) : TwoArray(num_items), dot_{-1.0f} {} + BenchmarkDot() : dot_{-1.0f} {} FuncOutput operator()(const size_t num_items) { HWY_FULL(float) d; @@ -132,7 +131,8 @@ class BenchmarkDot : public TwoArray { sum[i] += sum[i + power]; } } - return dot_ = GetLane(SumOfLanes(sum[0])); + dot_ = GetLane(SumOfLanes(sum[0])); + return static_cast(dot_); } void Verify(size_t num_items) { if (dot_ == -1.0f) { @@ -157,8 +157,6 @@ class BenchmarkDot : public TwoArray { // INTERMEDIATE: delta coding // 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold! struct BenchmarkDelta : public TwoArray { - explicit BenchmarkDelta(size_t num_items) : TwoArray(num_items) {} - FuncOutput operator()(const size_t num_items) const { #if HWY_TARGET == HWY_SCALAR b_[0] = a_[0]; @@ -197,7 +195,7 @@ struct BenchmarkDelta : public TwoArray Store(a - shifted, df, &b_[i]); } #endif - return b_[num_items - 1]; + return static_cast(b_[num_items - 1]); } void Verify(size_t num_items) { diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/benchmark.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc.12 2021-06-02 10:56:05.189904159 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.cc 2021-05-31 10:37:11.000000000 -0400 @@ -22,27 +22,62 @@ // For runtime dispatch, specify the name of the current file (unfortunately // __FILE__ is not reliable) so that foreach_target.h can re-include it. #define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc" -// Re-include this file once per enabled target to generate code for it. +// Generates code for each enabled target by re-including this source file. #include "hwy/foreach_target.h" -#include "hwy/examples/skeleton_shared.h" #include "hwy/highway.h" -// Optional: factor out parts of the implementation into *-inl.h -#include "hwy/examples/skeleton-inl.h" - // Optional, can instead add HWY_ATTR to all functions. HWY_BEFORE_NAMESPACE(); namespace skeleton { namespace HWY_NAMESPACE { -// Compiled once per target via multiple inclusion. -void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2, - float* HWY_RESTRICT out) { - printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), - ExampleGatherStrategy()); +// Highway ops reside here; ADL does not find templates nor builtins. 
+using namespace hwy::HWY_NAMESPACE; + +// Computes log2 by converting to a vector of floats. Compiled once per target. +template +HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values, + uint8_t* HWY_RESTRICT log2) { + // Type tags for converting to other element types (Rebind = same count). + const Rebind d32; + const Rebind d8; + + const auto u8 = Load(d8, values); + const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8))); + const auto exponent = ShiftRight<23>(bits) - Set(d32, 127); + Store(DemoteTo(d8, exponent), d8, log2); +} + +HWY_NOINLINE void CodepathDemo() { + // Highway defaults to portability, but per-target codepaths may be selected + // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros: +#if HWY_CAP_INTEGER64 + const char* gather = "Has int64"; +#else + const char* gather = "No int64"; +#endif + printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather); +} - ExampleMulAdd(in1, in2, out); +HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count, + uint8_t* HWY_RESTRICT log2) { + CodepathDemo(); + + // Second argument is necessary on RVV until it supports fractional lengths. + HWY_FULL(float, 4) df; + + const size_t N = Lanes(df); + size_t i = 0; + for (; i + N <= count; i += N) { + OneFloorLog2(df, values + i, log2 + i); + } + // TODO(janwas): implement +#if HWY_TARGET != HWY_RVV + for (; i < count; ++i) { + OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i); + } +#endif } // NOLINTNEXTLINE(google-readability-namespace-comments) @@ -54,22 +89,20 @@ HWY_AFTER_NAMESPACE(); namespace skeleton { -// This macro declares a static array SkeletonHighwayDispatchTable used for -// dynamic dispatch. This macro should be placed in the same namespace that -// defines the Skeleton function above. -HWY_EXPORT(Skeleton); +// This macro declares a static array used for dynamic dispatch; it resides in +// the same outer namespace that contains FloorLog2. +HWY_EXPORT(FloorLog2); // This function is optional and only needed in the case of exposing it in the -// header file. Otherwise using HWY_DYNAMIC_DISPATCH(Skeleton) multiple times in -// this module is equivalent to inlining this optional function.. -void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2, - float* HWY_RESTRICT out) { - return HWY_DYNAMIC_DISPATCH(Skeleton)(in1, in2, out); +// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module +// is equivalent to inlining this function. +void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count, + uint8_t* HWY_RESTRICT out) { + return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out); } // Optional: anything to compile only once, e.g. non-SIMD implementations of -// public functions provided by this module, can go inside #if HWY_ONCE -// (after end_target-inl.h). +// public functions provided by this module, can go inside #if HWY_ONCE. 
} // namespace skeleton #endif // HWY_ONCE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h.12 2021-06-02 10:56:05.213904281 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.h 2021-05-31 10:37:11.000000000 -0400 @@ -18,15 +18,17 @@ #ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_ #define HIGHWAY_HWY_EXAMPLES_SKELETON_H_ -// Tiny subset of Highway API: essentials for declaring an interface, without -// any implementation details. +#include + +// Platform-specific definitions used for declaring an interface, independent of +// the SIMD instruction set. #include "hwy/base.h" // HWY_RESTRICT namespace skeleton { -// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256. -void Skeleton(const float* HWY_RESTRICT in1, const float* HWY_RESTRICT in2, - float* HWY_RESTRICT out); +// Computes base-2 logarithm by converting to float. Supports dynamic dispatch. +void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count, + uint8_t* HWY_RESTRICT out); } // namespace skeleton diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h.12 2021-06-02 10:56:05.164904033 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -29,41 +29,31 @@ // It is fine to #include normal or *-inl headers. #include -#include "hwy/examples/skeleton_shared.h" #include "hwy/highway.h" HWY_BEFORE_NAMESPACE(); namespace skeleton { namespace HWY_NAMESPACE { -using hwy::HWY_NAMESPACE::MulAdd; +using namespace hwy::HWY_NAMESPACE; -// Computes out[i] = in1[i] * kMultiplier + in2[i] for i < 256. -HWY_MAYBE_UNUSED void ExampleMulAdd(const float* HWY_RESTRICT in1, - const float* HWY_RESTRICT in2, - float* HWY_RESTRICT out) { - // Descriptor(s) for all vector types used in this function. - HWY_FULL(float) df; - - const auto mul = Set(df, kMultiplier); - for (size_t i = 0; i < 256; i += Lanes(df)) { - const auto result = MulAdd(mul, Load(df, in1 + i), Load(df, in2 + i)); - Store(result, df, out + i); +// Example of a type-agnostic (caller-specified lane type) and width-agnostic +// (uses best available instruction set) function in a header. +// +// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size. +template +HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array, + const T* HWY_RESTRICT add_array, + const size_t size, T* HWY_RESTRICT x_array) { + for (size_t i = 0; i < size; i += Lanes(d)) { + const auto mul = Load(d, mul_array + i); + const auto add = Load(d, add_array + i); + auto x = Load(d, x_array + i); + x = MulAdd(mul, x, add); + Store(x, d, x_array + i); } } -// (This doesn't generate SIMD instructions, so is not required here) -HWY_MAYBE_UNUSED const char* ExampleGatherStrategy() { - // Highway functions generate per-target implementations from the same source - // code via HWY_CAPPED(type, HWY_MIN(any_LANES_constants, ..)). 
If needed, - // entirely different codepaths can also be selected like so: -#if HWY_GATHER_LANES > 1 - return "Has gather"; -#else - return "Gather is limited to one lane"; -#endif -} - // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace skeleton diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.cc diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_main.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.h diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_shared.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.cc diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.cc diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_static_main.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc.12 2021-06-02 10:56:05.170904063 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -12,30 +12,96 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Example of unit test for the "skeleton" module. +// Example of unit test for the "skeleton" library. -#include "hwy/examples/skeleton.h" // Skeleton +#include "hwy/examples/skeleton.h" #include -#include "hwy/tests/test_util-inl.h" // RunTest +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc" +#include "hwy/foreach_target.h" +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" +// Optional: factor out parts of the implementation into *-inl.h +#include "hwy/examples/skeleton-inl.h" + +HWY_BEFORE_NAMESPACE(); namespace skeleton { +namespace HWY_NAMESPACE { + +using namespace hwy::HWY_NAMESPACE; + +// Calls function defined in skeleton.cc. 
+struct TestFloorLog2 { + template + HWY_NOINLINE void operator()(T /*unused*/, DF df) { + const size_t count = 5 * Lanes(df); + auto in = hwy::AllocateAligned(count); + auto expected = hwy::AllocateAligned(count); + + hwy::RandomState rng; + for (size_t i = 0; i < count; ++i) { + expected[i] = Random32(&rng) & 7; + in[i] = static_cast(1u << expected[i]); + } + auto out = hwy::AllocateAligned(count); + CallFloorLog2(in.get(), count, out.get()); + int sum = 0; + for (size_t i = 0; i < count; ++i) { + // TODO(janwas): implement +#if HWY_TARGET != HWY_RVV + HWY_ASSERT_EQ(expected[i], out[i]); +#endif + sum += out[i]; + } + hwy::PreventElision(sum); + } +}; + +HWY_NOINLINE void TestAllFloorLog2() { + ForPartialVectors()(float()); +} + +// Calls function defined in skeleton-inl.h. +struct TestSumMulAdd { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + hwy::RandomState rng; + const size_t count = 4096; + EXPECT_TRUE(count % Lanes(d) == 0); + auto mul = hwy::AllocateAligned(count); + auto x = hwy::AllocateAligned(count); + auto add = hwy::AllocateAligned(count); + for (size_t i = 0; i < count; ++i) { + mul[i] = static_cast(Random32(&rng) & 0xF); + x[i] = static_cast(Random32(&rng) & 0xFF); + add[i] = static_cast(Random32(&rng) & 0xFF); + } + double expected_sum = 0.0; + for (size_t i = 0; i < count; ++i) { + expected_sum += mul[i] * x[i] + add[i]; + } -TEST(SkeletonTest, MainTest) { - HWY_ALIGN_MAX float in1[256]; - HWY_ALIGN_MAX float in2[256]; - HWY_ALIGN_MAX float out[256]; - for (size_t i = 0; i < 256; ++i) { - in1[i] = static_cast(i); - in2[i] = in1[i] + 300; + MulAddLoop(d, mul.get(), add.get(), count, x.get()); + HWY_ASSERT_EQ(4344240.0, expected_sum); } +}; - // Tests will run for all compiled targets to ensure all are OK. - hwy::RunTest([&in1, &in2, &out]() { - Skeleton(in1, in2, out); - // Add EXPECT_... calls here. - }); +HWY_NOINLINE void TestAllSumMulAdd() { + ForFloatTypes(ForPartialVectors()); } +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace skeleton +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace skeleton { +HWY_BEFORE_TEST(SkeletonTest); +HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2); +HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd); } // namespace skeleton +#endif diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/examples/skeleton_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.h diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/foreach_target.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h.12 2021-06-02 10:56:05.269904564 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/highway.h 2021-05-31 10:37:11.000000000 -0400 @@ -25,10 +25,10 @@ namespace hwy { -// API version (https://semver.org/) +// API version (https://semver.org/); keep in sync with CMakeLists.txt. #define HWY_MAJOR 0 -#define HWY_MINOR 11 -#define HWY_PATCH 1 +#define HWY_MINOR 12 +#define HWY_PATCH 2 //------------------------------------------------------------------------------ // Shorthand for descriptors (defined in shared-inl.h) used to select overloads. 
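// [Editor's sketch, not part of the patch] Because the version macros above
// follow semver (and are now kept in sync with CMakeLists.txt), client code
// that relies on 0.12 features such as the SVE dispatch added in the hunks
// below can fail fast when built against an older copy of the headers:
//
//   #include "hwy/highway.h"
//   #if (HWY_MAJOR == 0) && (HWY_MINOR < 12)
//   #error "This translation unit requires Highway >= 0.12"
//   #endif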
@@ -49,7 +49,7 @@ namespace hwy { HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, )) #define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__) -// Vector of up to MAX_N lanes. +// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead. #define HWY_CAPPED(T, MAX_N) \ hwy::HWY_NAMESPACE::Simd @@ -75,6 +75,10 @@ namespace hwy { #define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_NEON #define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_SVE +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_SVE2 +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_PPC8 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_SSE4 @@ -143,6 +147,18 @@ FunctionCache Function #define HWY_CHOOSE_NEON(FUNC_NAME) nullptr #endif +#if HWY_TARGETS & HWY_SVE +#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME +#else +#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr +#endif + +#if HWY_TARGETS & HWY_SVE2 +#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME +#else +#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr +#endif + #if HWY_TARGETS & HWY_PPC8 #define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME #else @@ -261,8 +277,11 @@ FunctionCache Function #elif HWY_TARGET == HWY_AVX3 #include "hwy/ops/x86_512-inl.h" #elif HWY_TARGET == HWY_PPC8 +#error "PPC is not yet supported" #elif HWY_TARGET == HWY_NEON #include "hwy/ops/arm_neon-inl.h" +#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 +#include "hwy/ops/arm_sve-inl.h" #elif HWY_TARGET == HWY_WASM #include "hwy/ops/wasm_128-inl.h" #elif HWY_TARGET == HWY_RVV diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/highway.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc.12 2021-06-02 10:56:05.276904599 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.cc 2021-05-31 10:37:11.000000000 -0400 @@ -29,128 +29,43 @@ #include #include +#if defined(_WIN32) || defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif // NOMINMAX +#include +#endif + +#if defined(__MACH__) +#include +#include +#endif + +#if defined(__HAIKU__) +#include +#endif + #include "hwy/base.h" #if HWY_ARCH_PPC #include // NOLINT __ppc_get_timebase_freq #elif HWY_ARCH_X86 -#ifdef _MSC_VER +#if HWY_COMPILER_MSVC #include #else #include // NOLINT -#endif // _MSC_VER +#endif // HWY_COMPILER_MSVC #endif // HWY_ARCH_X86 namespace hwy { -namespace platform { -namespace { - -#if HWY_ARCH_X86 - -void Cpuid(const uint32_t level, const uint32_t count, - uint32_t* HWY_RESTRICT abcd) { -#if HWY_COMPILER_MSVC - int regs[4]; - __cpuidex(regs, level, count); - for (int i = 0; i < 4; ++i) { - abcd[i] = regs[i]; - } -#else - uint32_t a; - uint32_t b; - uint32_t c; - uint32_t d; - __cpuid_count(level, count, a, b, c, d); - abcd[0] = a; - abcd[1] = b; - abcd[2] = c; - abcd[3] = d; -#endif -} - -std::string BrandString() { - char brand_string[49]; - std::array abcd; - - // Check if brand string is supported (it is on all reasonable Intel/AMD) - Cpuid(0x80000000U, 0, abcd.data()); - if (abcd[0] < 0x80000004U) { - return std::string(); - } - - for (size_t i = 0; i < 3; ++i) { - Cpuid(0x80000002U + i, 0, abcd.data()); - memcpy(brand_string + i * 16, abcd.data(), 
sizeof(abcd)); - } - brand_string[48] = 0; - return brand_string; -} - -// Returns the frequency quoted inside the brand string. This does not -// account for throttling nor Turbo Boost. -double NominalClockRate() { - const std::string& brand_string = BrandString(); - // Brand strings include the maximum configured frequency. These prefixes are - // defined by Intel CPUID documentation. - const char* prefixes[3] = {"MHz", "GHz", "THz"}; - const double multipliers[3] = {1E6, 1E9, 1E12}; - for (size_t i = 0; i < 3; ++i) { - const size_t pos_prefix = brand_string.find(prefixes[i]); - if (pos_prefix != std::string::npos) { - const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); - if (pos_space != std::string::npos) { - const std::string digits = - brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); - return std::stod(digits) * multipliers[i]; - } - } - } - - return 0.0; -} - -#endif // HWY_ARCH_X86 - -} // namespace - -// Returns tick rate. Invariant means the tick counter frequency is independent -// of CPU throttling or sleep. May be expensive, caller should cache the result. -double InvariantTicksPerSecond() { -#if HWY_ARCH_PPC - return __ppc_get_timebase_freq(); -#elif HWY_ARCH_X86 - // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. - return NominalClockRate(); -#else - // Fall back to clock_gettime nanoseconds. - return 1E9; -#endif -} - -} // namespace platform namespace { - -// Prevents the compiler from eliding the computations that led to "output". -template -inline void PreventElision(T&& output) { -#if HWY_COMPILER_MSVC == 0 - // Works by indicating to the compiler that "output" is being read and - // modified. The +r constraint avoids unnecessary writes to memory, but only - // works for built-in types (typically FuncOutput). - asm volatile("" : "+r"(output) : : "memory"); -#else - // MSVC does not support inline assembly anymore (and never supported GCC's - // RTL constraints). Self-assignment with #pragma optimize("off") might be - // expected to prevent elision, but it does not with MSVC 2015. Type-punning - // with volatile pointers generates inefficient code on MSVC 2017. - static std::atomic dummy(T{}); - dummy.store(output, std::memory_order_relaxed); -#endif -} - namespace timer { +// Ticks := platform-specific timer values (CPU cycles on x86). Must be +// unsigned to guarantee wraparound on overflow. +using Ticks = uint64_t; + // Start/Stop return absolute timestamps and must be placed immediately before // and after the region to measure. We provide separate Start/Stop functions // because they use different fences. @@ -202,8 +117,8 @@ namespace timer { // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, // divide by InvariantTicksPerSecond. -inline uint64_t Start64() { - uint64_t t; +inline Ticks Start() { + Ticks t; #if HWY_ARCH_PPC asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC @@ -228,8 +143,15 @@ inline uint64_t Start64() { : "rdx", "memory", "cc"); #elif HWY_ARCH_RVV asm volatile("rdcycle %0" : "=r"(t)); -#else - // Fall back to OS - unsure how to reliably query cntvct_el0 frequency. 
+#elif defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER counter; + (void)QueryPerformanceCounter(&counter); + t = counter.QuadPart; +#elif defined(__MACH__) + t = mach_absolute_time(); +#elif defined(__HAIKU__) + t = system_time_nsecs(); // since boot +#else // POSIX timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); t = ts.tv_sec * 1000000000LL + ts.tv_nsec; @@ -237,7 +159,7 @@ inline uint64_t Start64() { return t; } -inline uint64_t Stop64() { +inline Ticks Stop() { uint64_t t; #if HWY_ARCH_PPC asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); @@ -261,61 +183,7 @@ inline uint64_t Stop64() { // "cc" = flags modified by SHL. : "rcx", "rdx", "memory", "cc"); #else - t = Start64(); -#endif - return t; -} - -// Returns a 32-bit timestamp with about 4 cycles less overhead than -// Start64. Only suitable for measuring very short regions because the -// timestamp overflows about once a second. -inline uint32_t Start32() { - uint32_t t; -#if HWY_ARCH_X86 && HWY_COMPILER_MSVC - _ReadWriteBarrier(); - _mm_lfence(); - _ReadWriteBarrier(); - t = static_cast(__rdtsc()); - _ReadWriteBarrier(); - _mm_lfence(); - _ReadWriteBarrier(); -#elif HWY_ARCH_X86_64 - asm volatile( - "lfence\n\t" - "rdtsc\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rdx = TSC >> 32. - : "rdx", "memory"); -#elif HWY_ARCH_RVV - asm volatile("rdcycle %0" : "=r"(t)); -#else - t = static_cast(Start64()); -#endif - return t; -} - -inline uint32_t Stop32() { - uint32_t t; -#if HWY_ARCH_X86 && HWY_COMPILER_MSVC - _ReadWriteBarrier(); - unsigned aux; - t = static_cast(__rdtscp(&aux)); - _ReadWriteBarrier(); - _mm_lfence(); - _ReadWriteBarrier(); -#elif HWY_ARCH_X86_64 - // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). - asm volatile( - "rdtscp\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. - : "rcx", "rdx", "memory"); -#else - t = static_cast(Stop64()); + t = Start(); #endif return t; } @@ -440,21 +308,130 @@ T MedianAbsoluteDeviation(const T* value } } // namespace robust_statistics +} // namespace +namespace platform { +namespace { -// Ticks := platform-specific timer values (CPU cycles on x86). Must be -// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to -// read than 64 bit. -using Ticks = uint32_t; +// Prevents the compiler from eliding the computations that led to "output". +template +inline void PreventElision(T&& output) { +#if HWY_COMPILER_MSVC == 0 + // Works by indicating to the compiler that "output" is being read and + // modified. The +r constraint avoids unnecessary writes to memory, but only + // works for built-in types (typically FuncOutput). + asm volatile("" : "+r"(output) : : "memory"); +#else + // MSVC does not support inline assembly anymore (and never supported GCC's + // RTL constraints). Self-assignment with #pragma optimize("off") might be + // expected to prevent elision, but it does not with MSVC 2015. Type-punning + // with volatile pointers generates inefficient code on MSVC 2017. 
+ static std::atomic dummy(T{}); + dummy.store(output, std::memory_order_relaxed); +#endif +} + +#if HWY_ARCH_X86 + +void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HWY_RESTRICT abcd) { +#if HWY_COMPILER_MSVC + int regs[4]; + __cpuidex(regs, level, count); + for (int i = 0; i < 4; ++i) { + abcd[i] = regs[i]; + } +#else + uint32_t a; + uint32_t b; + uint32_t c; + uint32_t d; + __cpuid_count(level, count, a, b, c, d); + abcd[0] = a; + abcd[1] = b; + abcd[2] = c; + abcd[3] = d; +#endif +} + +std::string BrandString() { + char brand_string[49]; + std::array abcd; + + // Check if brand string is supported (it is on all reasonable Intel/AMD) + Cpuid(0x80000000U, 0, abcd.data()); + if (abcd[0] < 0x80000004U) { + return std::string(); + } + + for (size_t i = 0; i < 3; ++i) { + Cpuid(static_cast(0x80000002U + i), 0, abcd.data()); + memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd)); + } + brand_string[48] = 0; + return brand_string; +} + +// Returns the frequency quoted inside the brand string. This does not +// account for throttling nor Turbo Boost. +double NominalClockRate() { + const std::string& brand_string = BrandString(); + // Brand strings include the maximum configured frequency. These prefixes are + // defined by Intel CPUID documentation. + const char* prefixes[3] = {"MHz", "GHz", "THz"}; + const double multipliers[3] = {1E6, 1E9, 1E12}; + for (size_t i = 0; i < 3; ++i) { + const size_t pos_prefix = brand_string.find(prefixes[i]); + if (pos_prefix != std::string::npos) { + const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); + if (pos_space != std::string::npos) { + const std::string digits = + brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); + return std::stod(digits) * multipliers[i]; + } + } + } + + return 0.0; +} + +#endif // HWY_ARCH_X86 + +} // namespace + +double InvariantTicksPerSecond() { +#if HWY_ARCH_PPC + return __ppc_get_timebase_freq(); +#elif HWY_ARCH_X86 + // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. + return NominalClockRate(); +#elif defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER freq; + (void)QueryPerformanceFrequency(&freq); + return double(freq.QuadPart); +#elif defined(__MACH__) + // https://developer.apple.com/library/mac/qa/qa1398/_index.html + mach_timebase_info_data_t timebase; + (void)mach_timebase_info(&timebase); + return double(timebase.denom) / timebase.numer * 1E9; +#else + // TODO(janwas): ARM? Unclear how to reliably query cntvct_el0 frequency. + return 1E9; // Haiku and clock_gettime return nanoseconds. +#endif +} -// Returns timer overhead / minimum measurable difference. -Ticks TimerResolution() { +double Now() { + static const double mul = 1.0 / InvariantTicksPerSecond(); + return static_cast(timer::Start()) * mul; +} + +uint64_t TimerResolution() { // Nested loop avoids exceeding stack/L1 capacity. 
- Ticks repetitions[Params::kTimerSamples]; + timer::Ticks repetitions[Params::kTimerSamples]; for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) { - Ticks samples[Params::kTimerSamples]; + timer::Ticks samples[Params::kTimerSamples]; for (size_t i = 0; i < Params::kTimerSamples; ++i) { - const Ticks t0 = timer::Start32(); - const Ticks t1 = timer::Stop32(); + const timer::Ticks t0 = timer::Start(); + const timer::Ticks t1 = timer::Stop(); samples[i] = t1 - t0; } repetitions[rep] = robust_statistics::Mode(samples); @@ -462,18 +439,21 @@ Ticks TimerResolution() { return robust_statistics::Mode(repetitions); } -static const Ticks timer_resolution = TimerResolution(); +} // namespace platform +namespace { + +static const timer::Ticks timer_resolution = platform::TimerResolution(); // Estimates the expected value of "lambda" values with a variable number of // samples until the variability "rel_mad" is less than "max_rel_mad". template -Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, - const Params& p, const Lambda& lambda) { +timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, + const Params& p, const Lambda& lambda) { // Choose initial samples_per_eval based on a single estimated duration. - Ticks t0 = timer::Start32(); + timer::Ticks t0 = timer::Start(); lambda(); - Ticks t1 = timer::Stop32(); - Ticks est = t1 - t0; + timer::Ticks t1 = timer::Stop(); + timer::Ticks est = t1 - t0; static const double ticks_per_second = platform::InvariantTicksPerSecond(); const size_t ticks_per_eval = static_cast(ticks_per_second * p.seconds_per_eval); @@ -481,21 +461,21 @@ Ticks SampleUntilStable(const double max est == 0 ? p.min_samples_per_eval : ticks_per_eval / est; samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval); - std::vector samples; + std::vector samples; samples.reserve(1 + samples_per_eval); samples.push_back(est); // Percentage is too strict for tiny differences, so also allow a small // absolute "median absolute deviation". - const Ticks max_abs_mad = (timer_resolution + 99) / 100; + const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100; *rel_mad = 0.0; // ensure initialized for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) { samples.reserve(samples.size() + samples_per_eval); for (size_t i = 0; i < samples_per_eval; ++i) { - t0 = timer::Start32(); + t0 = timer::Start(); lambda(); - t1 = timer::Stop32(); + t1 = timer::Stop(); samples.push_back(t1 - t0); } @@ -508,14 +488,14 @@ Ticks SampleUntilStable(const double max NANOBENCHMARK_CHECK(est != 0); // Median absolute deviation (mad) is a robust measure of 'variability'. - const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( + const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( samples.data(), samples.size(), est); - *rel_mad = static_cast(int(abs_mad)) / est; + *rel_mad = static_cast(abs_mad) / static_cast(est); if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) { if (p.verbose) { - printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n", - samples.size(), est, abs_mad, *rel_mad * 100.0); + printf("%6zu samples => %5zu (abs_mad=%4zu, rel_mad=%4.2f%%)\n", + samples.size(), size_t(est), size_t(abs_mad), *rel_mad * 100.0); } return est; } @@ -539,29 +519,17 @@ InputVec UniqueInputs(const FuncInput* i return unique; } -// Returns how often we need to call func for sufficient precision, or zero -// on failure (e.g. the elapsed time is too long for a 32-bit tick count). 
+// Returns how often we need to call func for sufficient precision. size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, const Params& p) { // Min elapsed ticks for any input. - Ticks min_duration = ~0u; + timer::Ticks min_duration = ~timer::Ticks(0); for (const FuncInput input : unique) { - // Make sure a 32-bit timer is sufficient. - const uint64_t t0 = timer::Start64(); - PreventElision(func(arg, input)); - const uint64_t t1 = timer::Stop64(); - const uint64_t elapsed = t1 - t0; - if (elapsed >= (1ULL << 30)) { - fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n", - input); - return 0; - } - double rel_mad; - const Ticks total = SampleUntilStable( + const timer::Ticks total = SampleUntilStable( p.target_rel_mad, &rel_mad, p, - [func, arg, input]() { PreventElision(func(arg, input)); }); + [func, arg, input]() { platform::PreventElision(func(arg, input)); }); min_duration = std::min(min_duration, total - timer_resolution); } @@ -571,8 +539,8 @@ size_t NumSkip(const Func func, const ui const size_t num_skip = min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration; if (p.verbose) { - printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution, - max_skip, min_duration, num_skip); + printf("res=%zu max_skip=%zu min_dur=%zu num_skip=%zu\n", + size_t(timer_resolution), max_skip, size_t(min_duration), num_skip); } return num_skip; } @@ -637,13 +605,14 @@ void FillSubset(const InputVec& full, co } // Returns total ticks elapsed for all inputs. -Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs, - const Params& p, double* max_rel_mad) { +timer::Ticks TotalDuration(const Func func, const uint8_t* arg, + const InputVec* inputs, const Params& p, + double* max_rel_mad) { double rel_mad; - const Ticks duration = + const timer::Ticks duration = SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() { for (const FuncInput input : *inputs) { - PreventElision(func(arg, input)); + platform::PreventElision(func(arg, input)); } }); *max_rel_mad = std::max(*max_rel_mad, rel_mad); @@ -657,19 +626,20 @@ HWY_NOINLINE FuncOutput EmptyFunc(const // Returns overhead of accessing inputs[] and calling a function; this will // be deducted from future TotalDuration return values. -Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) { +timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs, + const Params& p) { double rel_mad; // Zero tolerance because repeatability is crucial and EmptyFunc is fast. 
return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() { for (const FuncInput input : *inputs) { - PreventElision(EmptyFunc(arg, input)); + platform::PreventElision(EmptyFunc(arg, input)); } }); } } // namespace -int Unpredictable1() { return timer::Start64() != ~0ULL; } +int Unpredictable1() { return timer::Start() != ~0ULL; } size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, const size_t num_inputs, Result* results, const Params& p) { @@ -685,32 +655,35 @@ size_t Measure(const Func func, const ui ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p); InputVec subset(full.size() - num_skip); - const Ticks overhead = Overhead(arg, &full, p); - const Ticks overhead_skip = Overhead(arg, &subset, p); + const timer::Ticks overhead = Overhead(arg, &full, p); + const timer::Ticks overhead_skip = Overhead(arg, &subset, p); if (overhead < overhead_skip) { - fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead, - overhead_skip); + fprintf(stderr, "Measurement failed: overhead %zu < %zu\n", + size_t(overhead), size_t(overhead_skip)); return 0; } if (p.verbose) { - printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(), - overhead, overhead_skip); + printf("#inputs=%5zu,%5zu overhead=%5zu,%5zu\n", full.size(), subset.size(), + size_t(overhead), size_t(overhead_skip)); } double max_rel_mad = 0.0; - const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); + const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); for (size_t i = 0; i < unique.size(); ++i) { FillSubset(full, unique[i], num_skip, &subset); - const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad); + const timer::Ticks total_skip = + TotalDuration(func, arg, &subset, p, &max_rel_mad); if (total < total_skip) { - fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip); + fprintf(stderr, "Measurement failed: total %zu < %zu\n", size_t(total), + size_t(total_skip)); return 0; } - const Ticks duration = (total - overhead) - (total_skip - overhead_skip); + const timer::Ticks duration = + (total - overhead) - (total_skip - overhead_skip); results[i].input = unique[i]; results[i].ticks = static_cast(duration) * mul; results[i].variability = static_cast(max_rel_mad); diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h.12 2021-06-02 10:56:05.272904579 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.h 2021-05-31 10:37:11.000000000 -0400 @@ -44,11 +44,6 @@ // central tendency of the measurement samples with the "half sample mode", // which is more robust to outliers and skewed data than the mean or median. -// WARNING if included from multiple translation units compiled with distinct -// flags: this header requires textual inclusion and a predefined NB_NAMESPACE -// macro that is unique to the current compile flags. We must also avoid -// standard library headers such as vector and functional that define functions. - #include #include @@ -79,6 +74,16 @@ namespace platform { // This call may be expensive, callers should cache the result. double InvariantTicksPerSecond(); +// Returns current timestamp [in seconds] relative to an unspecified origin. 
+// Features: monotonic (no negative elapsed time), steady (unaffected by system +// time changes), high-resolution (on the order of microseconds). +double Now(); + +// Returns ticks elapsed in back to back timer calls, i.e. a function of the +// timer resolution (minimum measurable difference) and overhead. +// This call is expensive, callers should cache the result. +uint64_t TimerResolution(); + } // namespace platform // Returns 1, but without the compiler knowing what the value is. This prevents diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc.12 2021-06-02 10:56:05.275904594 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -15,11 +15,11 @@ #include "hwy/nanobenchmark.h" #include -#include // strtol -#include // sleep #include +#include "hwy/tests/test_util-inl.h" + namespace hwy { namespace { @@ -31,6 +31,7 @@ FuncOutput Div(const void*, FuncInput in template void MeasureDiv(const FuncInput (&inputs)[N]) { + printf("Measuring integer division (output on final two lines)\n"); Result results[N]; Params params; params.max_evals = 4; // avoid test timeout @@ -66,39 +67,14 @@ void MeasureRandom(const FuncInput (&inp } } -template -void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) { - printf("Expect a 'measurement failed' below:\n"); - Result results[N]; - - const size_t num_results = Measure( - [](const void*, const FuncInput input) -> FuncOutput { - // Loop until the sleep succeeds (not interrupted by signal). We assume - // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit. - while (sleep(2) != 0) { - } - return input; - }, - nullptr, inputs, N, results); - NANOBENCHMARK_CHECK(num_results == 0); - (void)num_results; -} - -void RunAll(const int argc, char** /*argv*/) { - // unpredictable == 1 but the compiler doesn't know that. - const int unpredictable = argc != 999; +TEST(NanobenchmarkTest, RunAll) { + const int unpredictable = Unpredictable1(); // == 1, unknown to compiler. static const FuncInput inputs[] = {static_cast(unpredictable) + 2, static_cast(unpredictable + 9)}; MeasureDiv(inputs); MeasureRandom(inputs); - EnsureLongMeasurementFails(inputs); } } // namespace } // namespace hwy - -int main(int argc, char* argv[]) { - hwy::RunAll(argc, argv); - return 0; -} diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/nanobenchmark_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h.12 2021-06-02 10:56:05.239904412 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -26,6 +26,8 @@ HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { +namespace detail { // for code folding and Raw128 + // Macros used to define single and double function calls for multiple types // for full and half vectors. These macros are undefined at the end of the file. 
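// [Editor's sketch, not part of the patch] The platform::Now() and
// TimerResolution() declarations added to nanobenchmark.h above suffice for
// simple wall-clock measurements without the full Measure() machinery; the
// summation loop below is only a placeholder workload.
//
//   #include <cstdio>
//   #include "hwy/nanobenchmark.h"
//
//   int main() {
//     const double t0 = hwy::platform::Now();
//     double sum = 0.0;
//     for (int i = 0; i < 1000000; ++i) sum += i * 0.5;  // placeholder workload
//     const double t1 = hwy::platform::Now();
//     // TimerResolution() returns ticks; divide by the tick rate for seconds.
//     const double resolution_s = hwy::platform::TimerResolution() /
//                                 hwy::platform::InvariantTicksPerSecond();
//     printf("sum=%.0f elapsed=%.6f s resolution~%.9f s\n", sum, t1 - t0,
//            resolution_s);
//     return 0;
//   }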
@@ -133,7 +135,7 @@ namespace HWY_NAMESPACE { HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args) // float and double -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \ HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \ HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \ @@ -181,7 +183,7 @@ namespace HWY_NAMESPACE { HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) // Emulation of some intrinsics on armv7. -#if !defined(__aarch64__) +#if HWY_ARCH_ARM_V7 #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0] #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0] #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0] @@ -294,7 +296,7 @@ struct Raw128 { using type = float32x4_t; }; -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 template <> struct Raw128 { using type = float64x2_t; @@ -352,7 +354,7 @@ struct Raw128 { using type = float32x2_t; }; -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 template <> struct Raw128 { using type = float64x1_t; @@ -437,12 +439,14 @@ struct Raw128 { using type = int8x8_t; }; +} // namespace detail + template using Full128 = Simd; template class Vec128 { - using Raw = typename Raw128::type; + using Raw = typename detail::Raw128::type; public: HWY_INLINE Vec128() {} @@ -480,7 +484,8 @@ class Vec128 { // FF..FF or 0, also for floating-point - see README. template class Mask128 { - using Raw = typename Raw128::type; + // ARM C Language Extensions return and expect unsigned type. + using Raw = typename detail::Raw128, N>::type; public: HWY_INLINE Mask128() {} @@ -573,7 +578,7 @@ HWY_INLINE Vec128 BitCastFro Vec128 v) { return Vec128(vreinterpret_s64_u8(v.raw)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, Vec128 v) { return Vec128(vreinterpret_f64_u8(v.raw)); @@ -615,7 +620,7 @@ HWY_INLINE Vec128 BitCastFromBy return Vec128(vreinterpretq_s64_u8(v.raw)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, Vec128 v) { return Vec128(vreinterpretq_f64_u8(v.raw)); @@ -664,15 +669,25 @@ template HWY_INLINE Vec128 Undefined(Simd /*d*/) { HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") - typename Raw128::type a; + typename detail::Raw128::type a; return Vec128(a); HWY_DIAGNOSTICS(pop) } -// ------------------------------ Extract lane +// Returns a vector with lane i=[0, N) set to "first" + i. 
+template +Vec128 Iota(const Simd d, const T2 first) { + HWY_ALIGN T lanes[16 / sizeof(T)]; + for (size_t i = 0; i < 16 / sizeof(T); ++i) { + lanes[i] = static_cast(first + static_cast(i)); + } + return Load(d, lanes); +} + +// ------------------------------ GetLane HWY_INLINE uint8_t GetLane(const Vec128 v) { - return vget_lane_u8(vget_low_u8(v.raw), 0); + return vgetq_lane_u8(v.raw, 0); } template HWY_INLINE uint8_t GetLane(const Vec128 v) { @@ -680,7 +695,7 @@ HWY_INLINE uint8_t GetLane(const Vec128< } HWY_INLINE int8_t GetLane(const Vec128 v) { - return vget_lane_s8(vget_low_s8(v.raw), 0); + return vgetq_lane_s8(v.raw, 0); } template HWY_INLINE int8_t GetLane(const Vec128 v) { @@ -688,7 +703,7 @@ HWY_INLINE int8_t GetLane(const Vec128 v) { - return vget_lane_u16(vget_low_u16(v.raw), 0); + return vgetq_lane_u16(v.raw, 0); } template HWY_INLINE uint16_t GetLane(const Vec128 v) { @@ -696,7 +711,7 @@ HWY_INLINE uint16_t GetLane(const Vec128 } HWY_INLINE int16_t GetLane(const Vec128 v) { - return vget_lane_s16(vget_low_s16(v.raw), 0); + return vgetq_lane_s16(v.raw, 0); } template HWY_INLINE int16_t GetLane(const Vec128 v) { @@ -704,7 +719,7 @@ HWY_INLINE int16_t GetLane(const Vec128< } HWY_INLINE uint32_t GetLane(const Vec128 v) { - return vget_lane_u32(vget_low_u32(v.raw), 0); + return vgetq_lane_u32(v.raw, 0); } template HWY_INLINE uint32_t GetLane(const Vec128 v) { @@ -712,7 +727,7 @@ HWY_INLINE uint32_t GetLane(const Vec128 } HWY_INLINE int32_t GetLane(const Vec128 v) { - return vget_lane_s32(vget_low_s32(v.raw), 0); + return vgetq_lane_s32(v.raw, 0); } template HWY_INLINE int32_t GetLane(const Vec128 v) { @@ -720,20 +735,20 @@ HWY_INLINE int32_t GetLane(const Vec128< } HWY_INLINE uint64_t GetLane(const Vec128 v) { - return vget_lane_u64(vget_low_u64(v.raw), 0); + return vgetq_lane_u64(v.raw, 0); } HWY_INLINE uint64_t GetLane(const Vec128 v) { return vget_lane_u64(v.raw, 0); } HWY_INLINE int64_t GetLane(const Vec128 v) { - return vget_lane_s64(vget_low_s64(v.raw), 0); + return vgetq_lane_s64(v.raw, 0); } HWY_INLINE int64_t GetLane(const Vec128 v) { return vget_lane_s64(v.raw, 0); } HWY_INLINE float GetLane(const Vec128 v) { - return vget_lane_f32(vget_low_f32(v.raw), 0); + return vgetq_lane_f32(v.raw, 0); } HWY_INLINE float GetLane(const Vec128 v) { return vget_lane_f32(v.raw, 0); @@ -741,9 +756,9 @@ HWY_INLINE float GetLane(const Vec128 v) { return vget_lane_f32(v.raw, 0); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE double GetLane(const Vec128 v) { - return vget_lane_f64(vget_low_f64(v.raw), 0); + return vgetq_lane_f64(v.raw, 0); } HWY_INLINE double GetLane(const Vec128 v) { return vget_lane_f64(v.raw, 0); @@ -785,8 +800,6 @@ HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSu // ------------------------------ Average // Returns (a + b + 1) / 2 - -// Unsigned HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) @@ -802,6 +815,7 @@ HWY_INLINE Vec128 Abs(const Vec HWY_INLINE Vec128 Abs(const Vec128 v) { return Vec128(vabsq_s32(v.raw)); } +// i64 is implemented after BroadcastSignBit. 
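// [Editor's sketch, not part of the patch] The Iota() helper added above
// fills lanes with first, first+1, ..., and GetLane() (changed above to read
// lane 0 directly from the full 128-bit register rather than from its lower
// half) extracts the first lane. Inside a function compiled per target
// (i.e. within HWY_NAMESPACE), usage might look like:
//
//   const Full128<int32_t> d;
//   const auto v = Iota(d, 5);         // lanes hold 5, 6, 7, 8
//   const int32_t first = GetLane(v);  // == 5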
HWY_INLINE Vec128 Abs(const Vec128 v) { return Vec128(vabsq_f32(v.raw)); } @@ -823,7 +837,7 @@ HWY_INLINE Vec128 Abs(const Ve return Vec128(vabs_f32(v.raw)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 Abs(const Vec128 v) { return Vec128(vabsq_f64(v.raw)); } @@ -839,7 +853,7 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vn HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below HWY_INLINE Vec128 Neg(const Vec128 v) { -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return Vec128(vneg_s64(v.raw)); #else return Zero(Simd()) - v; @@ -847,7 +861,7 @@ HWY_INLINE Vec128 Neg(const } HWY_INLINE Vec128 Neg(const Vec128 v) { -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return Vec128(vnegq_s64(v.raw)); #else return Zero(Full128()) - v; @@ -876,6 +890,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, v // ------------------------------ Shl +HWY_INLINE Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw))); +} +template +HWY_INLINE Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw))); +} + HWY_INLINE Vec128 operator<<(const Vec128 v, const Vec128 bits) { return Vec128(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw))); @@ -905,6 +929,16 @@ HWY_INLINE Vec128 operator< return Vec128(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); } +HWY_INLINE Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s8(v.raw, bits.raw)); +} +template +HWY_INLINE Vec128 operator<<(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_s8(v.raw, bits.raw)); +} + HWY_INLINE Vec128 operator<<(const Vec128 v, const Vec128 bits) { return Vec128(vshlq_s16(v.raw, bits.raw)); @@ -936,6 +970,18 @@ HWY_INLINE Vec128 operator<< // ------------------------------ Shr (Neg) +HWY_INLINE Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int8x16_t neg_bits = Neg(BitCast(Full128(), bits)).raw; + return Vec128(vshlq_u8(v.raw, neg_bits)); +} +template +HWY_INLINE Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + const int8x8_t neg_bits = Neg(BitCast(Simd(), bits)).raw; + return Vec128(vshl_u8(v.raw, neg_bits)); +} + HWY_INLINE Vec128 operator>>(const Vec128 v, const Vec128 bits) { const int16x8_t neg_bits = Neg(BitCast(Full128(), bits)).raw; @@ -971,6 +1017,16 @@ HWY_INLINE Vec128 operator> return Vec128(vshl_u64(v.raw, neg_bits)); } +HWY_INLINE Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshlq_s8(v.raw, Neg(bits).raw)); +} +template +HWY_INLINE Vec128 operator>>(const Vec128 v, + const Vec128 bits) { + return Vec128(vshl_s8(v.raw, Neg(bits).raw)); +} + HWY_INLINE Vec128 operator>>(const Vec128 v, const Vec128 bits) { return Vec128(vshlq_s16(v.raw, Neg(bits).raw)); @@ -1059,7 +1115,7 @@ HWY_INLINE Vec128 operator*( HWY_INLINE Vec128 MulHigh(const Vec128 a, const Vec128 b) { int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 int32x4_t rhi = vmull_high_s16(a.raw, b.raw); #else int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); @@ -1070,7 +1126,7 @@ HWY_INLINE Vec128 MulHigh(const HWY_INLINE Vec128 MulHigh(const Vec128 a, const Vec128 b) { uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); #else uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); @@ -1139,24 +1195,37 @@ HWY_INLINE 
Vec128 ApproximateR return Vec128(vrecpe_f32(v.raw)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) #else -// Emulated with approx reciprocal + Newton-Raphson + mul +// Not defined on armv7: approximate +namespace detail { + +HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( + const Vec128 recip, const Vec128 divisor) { + return Vec128(vrecpsq_f32(recip.raw, divisor.raw)); +} +template +HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( + const Vec128 recip, Vec128 divisor) { + return Vec128(vrecps_f32(recip.raw, divisor.raw)); +} + +} // namespace detail + template HWY_INLINE Vec128 operator/(const Vec128 a, const Vec128 b) { auto x = ApproximateReciprocal(b); - // Newton-Raphson on 1/x - b - const auto two = Set(Simd(), 2); - x = x * (two - b * x); - x = x * (two - b * x); - x = x * (two - b * x); + x *= detail::ReciprocalNewtonRaphsonStep(x, b); + x *= detail::ReciprocalNewtonRaphsonStep(x, b); + x *= detail::ReciprocalNewtonRaphsonStep(x, b); return a * x; } #endif -// Absolute value of difference. +// ------------------------------ Absolute value of difference. + HWY_INLINE Vec128 AbsDiff(const Vec128 a, const Vec128 b) { return Vec128(vabdq_f32(a.raw, b.raw)); } @@ -1169,7 +1238,7 @@ HWY_INLINE Vec128 AbsDiff(cons // ------------------------------ Floating-point multiply-add variants // Returns add + mul * x -#if defined(__aarch64__) +#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 template HWY_INLINE Vec128 MulAdd(const Vec128 mul, const Vec128 x, @@ -1180,6 +1249,17 @@ HWY_INLINE Vec128 MulAdd(const Ve const Vec128 add) { return Vec128(vfmaq_f32(add.raw, mul.raw, x.raw)); } +#else +// Emulate FMA for floats. +template +HWY_INLINE Vec128 MulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { + return mul * x + add; +} +#endif + +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 MulAdd(const Vec128 mul, const Vec128 x, const Vec128 add) { @@ -1190,18 +1270,10 @@ HWY_INLINE Vec128 MulAdd(const V const Vec128 add) { return Vec128(vfmaq_f64(add.raw, mul.raw, x.raw)); } -#else -// Emulate FMA for floats. -template -HWY_INLINE Vec128 MulAdd(const Vec128 mul, - const Vec128 x, - const Vec128 add) { - return mul * x + add; -} #endif // Returns add - mul * x -#if defined(__aarch64__) +#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 template HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, const Vec128 x, @@ -1213,7 +1285,17 @@ HWY_INLINE Vec128 NegMulAdd(const const Vec128 add) { return Vec128(vfmsq_f32(add.raw, mul.raw, x.raw)); } +#else +// Emulate FMA for floats. +template +HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, + const Vec128 x, + const Vec128 add) { + return add - mul * x; +} +#endif +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, const Vec128 x, const Vec128 add) { @@ -1224,14 +1306,6 @@ HWY_INLINE Vec128 NegMulAdd(cons const Vec128 add) { return Vec128(vfmsq_f64(add.raw, mul.raw, x.raw)); } -#else -// Emulate FMA for floats. 
-template -HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, - const Vec128 x, - const Vec128 add) { - return add - mul * x; -} #endif // Returns mul * x - sub @@ -1241,12 +1315,6 @@ HWY_INLINE Vec128 MulSub(const const Vec128 sub) { return MulAdd(mul, x, Neg(sub)); } -template -HWY_INLINE Vec128 MulSub(const Vec128 mul, - const Vec128 x, - const Vec128 sub) { - return MulAdd(mul, x, Neg(sub)); -} // Returns -mul * x - sub template @@ -1255,14 +1323,23 @@ HWY_INLINE Vec128 NegMulSub(co const Vec128 sub) { return Neg(MulAdd(mul, x, sub)); } + +#if HWY_ARCH_ARM_A64 +template +HWY_INLINE Vec128 MulSub(const Vec128 mul, + const Vec128 x, + const Vec128 sub) { + return MulAdd(mul, x, Neg(sub)); +} template HWY_INLINE Vec128 NegMulSub(const Vec128 mul, const Vec128 x, const Vec128 sub) { return Neg(MulAdd(mul, x, sub)); } +#endif -// ------------------------------ Floating-point square root +// ------------------------------ Floating-point square root (IfThenZeroElse) // Approximate reciprocal square root HWY_INLINE Vec128 ApproximateReciprocalSqrt(const Vec128 v) { @@ -1275,80 +1352,36 @@ HWY_INLINE Vec128 ApproximateR } // Full precision square root -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) #else -// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt. -template -HWY_INLINE Vec128 Sqrt(const Vec128 v) { - auto b = v; - auto Y = ApproximateReciprocalSqrt(v); - auto x = v * Y; - const auto half = Set(Simd(), 0.5); - const auto oneandhalf = Set(Simd(), 1.5); - for (size_t i = 0; i < 3; i++) { - b = b * Y * Y; - Y = oneandhalf - half * b; - x = x * Y; - } - return IfThenZeroElse(v == Zero(Simd()), x); -} -#endif - -// ================================================== COMPARE - -// Comparisons fill a lane with 1-bits if the condition is true, else 0. +namespace detail { -template -HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { - static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); - return Mask128{m.raw}; +HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, + const Vec128 recip) { + return Vec128(vrsqrtsq_f32(root.raw, recip.raw)); +} +template +HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, + Vec128 recip) { + return Vec128(vrsqrts_f32(root.raw, recip.raw)); } -#define HWY_NEON_BUILD_TPL_HWY_COMPARE -#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 -#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ - const Vec128 a, const Vec128 b -#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw - -// ------------------------------ Equality -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) -#if defined(__aarch64__) -HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) -#else -// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. 
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) -#endif +} // namespace detail -// ------------------------------ Strict inequality +// Not defined on armv7: approximate +template +HWY_INLINE Vec128 Sqrt(const Vec128 v) { + auto recip = ApproximateReciprocalSqrt(v); -// Signed/float < (no unsigned) -#if defined(__aarch64__) -HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) -#else -HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) -#endif -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) + recip *= detail::ReciprocalSqrtStep(v * recip, recip); + recip *= detail::ReciprocalSqrtStep(v * recip, recip); + recip *= detail::ReciprocalSqrtStep(v * recip, recip); -// Signed/float > (no unsigned) -#if defined(__aarch64__) -HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE) -#else -HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE) + const auto root = v * recip; + return IfThenZeroElse(v == Zero(Simd()), root); +} #endif -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE) - -// ------------------------------ Weak inequality - -// Float <= >= -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE) - -#undef HWY_NEON_BUILD_TPL_HWY_COMPARE -#undef HWY_NEON_BUILD_RET_HWY_COMPARE -#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE -#undef HWY_NEON_BUILD_ARG_HWY_COMPARE // ================================================== LOGICAL @@ -1357,13 +1390,16 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operato // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. template HWY_INLINE Vec128 Not(const Vec128 v) { - const Full128 d8; - return Vec128(vmvnq_u8(BitCast(d8, v).raw)); + const Full128 d; + const Repartition d8; + return BitCast(d, Vec128(vmvnq_u8(BitCast(d8, v).raw))); } template HWY_INLINE Vec128 Not(const Vec128 v) { - const Repartition> d8; - return Vec128(vmvn_u8(BitCast(d8, v).raw)); + const Simd d; + const Repartition d8; + using V8 = decltype(Zero(d8)); + return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); } // ------------------------------ And @@ -1463,33 +1499,38 @@ HWY_API Vec128 BroadcastSignBit(co return ShiftRight(v); } -// ------------------------------ Make mask +// ================================================== MASK -template -HWY_INLINE Mask128 TestBit(Vec128 v, Vec128 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return (v & bit) == bit; -} +// ------------------------------ To/from vector -// Mask and Vec are the same (true = FF..FF). +// Mask and Vec have the same representation (true = FF..FF). template HWY_INLINE Mask128 MaskFromVec(const Vec128 v) { - return Mask128(v.raw); + const Simd, N> du; + return Mask128(BitCast(du, v).raw); } +// DEPRECATED template HWY_INLINE Vec128 VecFromMask(const Mask128 v) { - return Vec128(v.raw); + return BitCast(Simd(), Vec128, N>(v.raw)); } template -HWY_INLINE Vec128 VecFromMask(Simd /* tag */, - const Mask128 v) { - return Vec128(v.raw); +HWY_INLINE Vec128 VecFromMask(Simd d, const Mask128 v) { + return BitCast(d, Vec128, N>(v.raw)); } -// IfThenElse(mask, yes, no) -// Returns mask ? b : a. 
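// [Editor's sketch, not part of the patch] MaskFromVec/VecFromMask above
// convert between vectors and masks (lanes of all-one or all-zero bits);
// IfThenElse(mask, yes, no), defined just below, then selects per lane.
// A typical pattern, clamping negative lanes to zero (equivalent to the
// ZeroIfNegative helper retained further down):
//
//   const Full128<float> d;
//   const auto v = Load(d, ptr);  // ptr: aligned float data (hypothetical)
//   const auto clamped = IfThenElse(v < Zero(d), Zero(d), v);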
+// ------------------------------ RebindMask + +template +HWY_API Mask128 RebindMask(Simd dto, Mask128 m) { + static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); + return MaskFromVec(BitCast(dto, VecFromMask(Simd(), m))); +} + +// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a. + #define HWY_NEON_BUILD_TPL_HWY_IF #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128 #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ @@ -1524,7 +1565,6 @@ HWY_INLINE Vec128 ZeroIfNegative(V return Max(zero, v); } - // ------------------------------ Mask logical template @@ -1557,30 +1597,183 @@ HWY_API Mask128 Xor(const Mask128< return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } -// ------------------------------ Min (IfThenElse, BroadcastSignBit) +// ================================================== COMPARE -namespace detail { +// Comparisons fill a lane with 1-bits if the condition is true, else 0. -#if defined(__aarch64__) +// ------------------------------ Shuffle2301 (for i64 compares) -HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { - return Vec128(vcgtq_u64(a.raw, b.raw)); +// Swap 32-bit halves in 64-bits +HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { + return Vec128(vrev64_u32(v.raw)); +} +HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { + return Vec128(vrev64_s32(v.raw)); +} +HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { + return Vec128(vrev64_f32(v.raw)); +} +HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { + return Vec128(vrev64q_u32(v.raw)); } -HWY_INLINE Vec128 Gt(Vec128 a, - Vec128 b) { - return Vec128(vcgt_u64(a.raw, b.raw)); +HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { + return Vec128(vrev64q_s32(v.raw)); +} +HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { + return Vec128(vrev64q_f32(v.raw)); } -HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { - return Vec128(vcgtq_s64(a.raw, b.raw)); +#define HWY_NEON_BUILD_TPL_HWY_COMPARE +#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 +#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ + const Vec128 a, const Vec128 b +#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw + +// ------------------------------ Equality +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) +#else +// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. 
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) +#endif + +// ------------------------------ Strict inequality (signed, float) +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) +#else +HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) +#endif +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) + +// ------------------------------ Weak inequality (float) +HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) + +#undef HWY_NEON_BUILD_TPL_HWY_COMPARE +#undef HWY_NEON_BUILD_RET_HWY_COMPARE +#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE +#undef HWY_NEON_BUILD_ARG_HWY_COMPARE + +// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq) + +#if HWY_ARCH_ARM_V7 + +template +HWY_INLINE Mask128 operator==(const Vec128 a, + const Vec128 b) { + const Simd d32; + const Simd d64; + const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); + const auto cmp64 = cmp32 & Shuffle2301(cmp32); + return MaskFromVec(BitCast(d64, cmp64)); +} + +template +HWY_INLINE Mask128 operator==(const Vec128 a, + const Vec128 b) { + const Simd d32; + const Simd d64; + const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); + const auto cmp64 = cmp32 & Shuffle2301(cmp32); + return MaskFromVec(BitCast(d64, cmp64)); } -HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { - return Vec128(vcgt_s64(a.raw, b.raw)); + +HWY_INLINE Mask128 operator<(const Vec128 a, + const Vec128 b) { + const int64x2_t sub = vqsubq_s64(a.raw, b.raw); + return MaskFromVec(BroadcastSignBit(Vec128(sub))); +} +HWY_INLINE Mask128 operator<(const Vec128 a, + const Vec128 b) { + const int64x1_t sub = vqsub_s64(a.raw, b.raw); + return MaskFromVec(BroadcastSignBit(Vec128(sub))); } #endif -} // namespace detail +// ------------------------------ Reversed comparisons + +template +HWY_API Mask128 operator>(Vec128 a, Vec128 b) { + return operator<(b, a); +} +template +HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { + return operator<=(b, a); +} + +// ------------------------------ FirstN (Iota, Lt) + +template +HWY_API Mask128 FirstN(const Simd d, size_t num) { + const RebindToSigned di; // Signed comparisons are cheaper. 
+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); +} + +// ------------------------------ TestBit (Eq) + +#define HWY_NEON_BUILD_TPL_HWY_TESTBIT +#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128 +#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ + Vec128 v, Vec128 bit +#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw + +#if HWY_ARCH_ARM_A64 +HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) +#else +// No 64-bit versions on armv7 +HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) +HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) + +template +HWY_INLINE Mask128 TestBit(Vec128 v, + Vec128 bit) { + return (v & bit) == bit; +} +template +HWY_INLINE Mask128 TestBit(Vec128 v, + Vec128 bit) { + return (v & bit) == bit; +} + +#endif +#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT +#undef HWY_NEON_BUILD_RET_HWY_TESTBIT +#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT +#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT + +// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) +HWY_INLINE Vec128 Abs(const Vec128 v) { +#if HWY_ARCH_ARM_A64 + return Vec128(vabsq_s64(v.raw)); +#else + const auto zero = Zero(Full128()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} +HWY_INLINE Vec128 Abs(const Vec128 v) { +#if HWY_ARCH_ARM_A64 + return Vec128(vabs_s64(v.raw)); +#else + const auto zero = Zero(Simd()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + +// ------------------------------ Min (IfThenElse, BroadcastSignBit) + +#if HWY_ARCH_ARM_A64 + +HWY_INLINE Mask128 operator<(Vec128 a, Vec128 b) { + return Mask128(vcltq_u64(a.raw, b.raw)); +} +HWY_INLINE Mask128 operator<(Vec128 a, + Vec128 b) { + return Mask128(vclt_u64(a.raw, b.raw)); +} + +#endif // Unsigned HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2) @@ -1588,8 +1781,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, template HWY_INLINE Vec128 Min(const Vec128 a, const Vec128 b) { -#if defined(__aarch64__) - return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, b, a); #else const Simd du; const Simd di; @@ -1603,8 +1796,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, v template HWY_INLINE Vec128 Min(const Vec128 a, const Vec128 b) { -#if defined(__aarch64__) - return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, b, a); #else const Vec128 sign = detail::SaturatedSub(a, b); return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b); @@ -1612,7 +1805,7 @@ HWY_INLINE Vec128 Min(const } // Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN. 
-#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2) #else HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) @@ -1626,8 +1819,8 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, template HWY_INLINE Vec128 Max(const Vec128 a, const Vec128 b) { -#if defined(__aarch64__) - return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, a, b); #else const Simd du; const Simd di; @@ -1641,8 +1834,8 @@ HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, v template HWY_INLINE Vec128 Max(const Vec128 a, const Vec128 b) { -#if defined(__aarch64__) - return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); +#if HWY_ARCH_ARM_A64 + return IfThenElse(b < a, a, b); #else const Vec128 sign = detail::SaturatedSub(a, b); return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a); @@ -1650,7 +1843,7 @@ HWY_INLINE Vec128 Max(const } // Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN. -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2) #else HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) @@ -1696,7 +1889,7 @@ HWY_INLINE Vec128 LoadU(Full128(vld1q_f32(aligned)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 LoadU(Full128 /* tag */, const double* HWY_RESTRICT aligned) { return Vec128(vld1q_f64(aligned)); @@ -1741,7 +1934,7 @@ HWY_INLINE Vec128 LoadU(Simd(vld1_f32(p)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 LoadU(Simd /* tag */, const double* HWY_RESTRICT p) { return Vec128(vld1_f64(p)); @@ -1755,73 +1948,72 @@ HWY_INLINE Vec128 LoadU(Simd< // we don't actually care what is in it, and we don't want // to introduce extra overhead by initializing it to something. -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const uint8_t* HWY_RESTRICT p) { - uint32x2_t a = Undefined(d).raw; + uint32x2_t a = Undefined(Simd()).raw; uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); return Vec128(vreinterpret_u8_u32(b)); } -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const uint16_t* HWY_RESTRICT p) { - uint32x2_t a = Undefined(d).raw; + uint32x2_t a = Undefined(Simd()).raw; uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); return Vec128(vreinterpret_u16_u32(b)); } -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const uint32_t* HWY_RESTRICT p) { - uint32x2_t a = Undefined(d).raw; + uint32x2_t a = Undefined(Simd()).raw; uint32x2_t b = vld1_lane_u32(p, a, 0); return Vec128(b); } -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const int8_t* HWY_RESTRICT p) { - int32x2_t a = Undefined(d).raw; + int32x2_t a = Undefined(Simd()).raw; int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); return Vec128(vreinterpret_s8_s32(b)); } -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const int16_t* HWY_RESTRICT p) { - int32x2_t a = Undefined(d).raw; + int32x2_t a = Undefined(Simd()).raw; int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); return Vec128(vreinterpret_s16_s32(b)); } -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const int32_t* HWY_RESTRICT p) { - int32x2_t a = Undefined(d).raw; + int32x2_t a = Undefined(Simd()).raw; int32x2_t b = vld1_lane_s32(p, a, 0); return Vec128(b); } -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const float* HWY_RESTRICT p) { - float32x2_t a = Undefined(d).raw; + float32x2_t a = Undefined(Simd()).raw; float32x2_t b = vld1_lane_f32(p, a, 0); 
return Vec128(b); } // ------------------------------ Load 16 -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const uint8_t* HWY_RESTRICT p) { - uint16x4_t a = Undefined(d).raw; + uint16x4_t a = Undefined(Simd()).raw; uint16x4_t b = vld1_lane_u16(reinterpret_cast(p), a, 0); return Vec128(vreinterpret_u8_u16(b)); } -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const uint16_t* HWY_RESTRICT p) { - uint16x4_t a = Undefined(d).raw; + uint16x4_t a = Undefined(Simd()).raw; uint16x4_t b = vld1_lane_u16(p, a, 0); return Vec128(b); } - -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const int8_t* HWY_RESTRICT p) { - int16x4_t a = Undefined(d).raw; + int16x4_t a = Undefined(Simd()).raw; int16x4_t b = vld1_lane_s16(reinterpret_cast(p), a, 0); return Vec128(vreinterpret_s8_s16(b)); } -HWY_INLINE Vec128 LoadU(Simd d, +HWY_INLINE Vec128 LoadU(Simd /*tag*/, const int16_t* HWY_RESTRICT p) { - int16x4_t a = Undefined(d).raw; + int16x4_t a = Undefined(Simd()).raw; int16x4_t b = vld1_lane_s16(p, a, 0); return Vec128(b); } @@ -1902,7 +2094,7 @@ HWY_INLINE void StoreU(const Vec128 v, Full128 /* tag */, double* HWY_RESTRICT aligned) { vst1q_f64(aligned, v.raw); @@ -1947,7 +2139,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd /* tag */, double* HWY_RESTRICT p) { vst1_f64(p, v.raw); @@ -1959,12 +2151,12 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, uint8_t* HWY_RESTRICT p) { uint32x2_t a = vreinterpret_u32_u8(v.raw); - vst1_lane_u32(p, a, 0); + vst1_lane_u32(reinterpret_cast(p), a, 0); } HWY_INLINE void StoreU(const Vec128 v, Simd, uint16_t* HWY_RESTRICT p) { uint32x2_t a = vreinterpret_u32_u16(v.raw); - vst1_lane_u32(p, a, 0); + vst1_lane_u32(reinterpret_cast(p), a, 0); } HWY_INLINE void StoreU(const Vec128 v, Simd, uint32_t* HWY_RESTRICT p) { @@ -1973,12 +2165,12 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, int8_t* HWY_RESTRICT p) { int32x2_t a = vreinterpret_s32_s8(v.raw); - vst1_lane_s32(p, a, 0); + vst1_lane_s32(reinterpret_cast(p), a, 0); } HWY_INLINE void StoreU(const Vec128 v, Simd, int16_t* HWY_RESTRICT p) { int32x2_t a = vreinterpret_s32_s16(v.raw); - vst1_lane_s32(p, a, 0); + vst1_lane_s32(reinterpret_cast(p), a, 0); } HWY_INLINE void StoreU(const Vec128 v, Simd, int32_t* HWY_RESTRICT p) { @@ -1994,7 +2186,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, uint8_t* HWY_RESTRICT p) { uint16x4_t a = vreinterpret_u16_u8(v.raw); - vst1_lane_u16(p, a, 0); + vst1_lane_u16(reinterpret_cast(p), a, 0); } HWY_INLINE void StoreU(const Vec128 v, Simd, uint16_t* HWY_RESTRICT p) { @@ -2003,7 +2195,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, int8_t* HWY_RESTRICT p) { int16x4_t a = vreinterpret_s16_s8(v.raw); - vst1_lane_s16(p, a, 0); + vst1_lane_s16(reinterpret_cast(p), a, 0); } HWY_INLINE void StoreU(const Vec128 v, Simd, int16_t* HWY_RESTRICT p) { @@ -2068,18 +2260,18 @@ HWY_INLINE Vec128 PromoteTo(Fu const Vec128 v) { return Vec128(vmovl_u32(v.raw)); } -HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, +HWY_INLINE Vec128 PromoteTo(Full128 d, const Vec128 v) { - return Vec128(vmovl_u8(v.raw)); + return BitCast(d, Vec128(vmovl_u8(v.raw))); } -HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, +HWY_INLINE Vec128 PromoteTo(Full128 d, const Vec128 v) { uint16x8_t a = vmovl_u8(v.raw); - return Vec128(vreinterpretq_s32_u16(vmovl_u16(vget_low_u16(a)))); + return BitCast(d, Vec128(vmovl_u16(vget_low_u16(a)))); } -HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, +HWY_INLINE Vec128 PromoteTo(Full128 d, const Vec128 v) { - return 
Vec128(vmovl_u16(v.raw)); + return BitCast(d, Vec128(vmovl_u16(v.raw))); } // Unsigned: zero-extend to half vector. @@ -2105,9 +2297,9 @@ HWY_INLINE Vec128 PromoteTo return Vec128(vget_low_u64(vmovl_u32(v.raw))); } template -HWY_INLINE Vec128 PromoteTo(Simd /* tag */, +HWY_INLINE Vec128 PromoteTo(Simd d, const Vec128 v) { - return Vec128(vget_low_s16(vmovl_u8(v.raw))); + return BitCast(d, Vec128(vget_low_u16(vmovl_u8(v.raw)))); } template HWY_INLINE Vec128 PromoteTo(Simd /* tag */, @@ -2170,12 +2362,14 @@ HWY_INLINE Vec128 PromoteTo( HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, const Vec128 v) { - return Vec128(vcvt_f32_f16(v.raw)); + const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); + return Vec128(f32); } template HWY_INLINE Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { - return Vec128(vget_low_f32(vcvt_f32_f16(v.raw))); + const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); + return Vec128(vget_low_f32(f32)); } #else @@ -2204,7 +2398,7 @@ HWY_INLINE Vec128 PromoteTo(Si #endif -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, const Vec128 v) { @@ -2298,12 +2492,13 @@ HWY_INLINE Vec128 DemoteTo(Si HWY_INLINE Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { - return Vec128{vcvt_f16_f32(v.raw)}; + return Vec128{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))}; } template HWY_INLINE Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { - return Vec128{vcvt_f16_f32(vcombine_f32(v.raw, v.raw))}; + const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw)); + return Vec128(vreinterpret_u16_f16(f16)); } #else @@ -2339,7 +2534,7 @@ HWY_INLINE Vec128 DemoteTo } #endif -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { @@ -2397,7 +2592,7 @@ HWY_INLINE Vec128 DemoteTo(Si const Vec128 v) { Vec128 a = DemoteTo(Simd(), v); Vec128 b; - uint16x8_t c = vcombine_s16(a.raw, b.raw); + int16x8_t c = vcombine_s16(a.raw, b.raw); return Vec128(vqmovn_s16(c)); } @@ -2426,7 +2621,7 @@ HWY_INLINE Vec128 ConvertTo( return Vec128(vcvt_s32_f32(v.raw)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 ConvertTo(Full128 /* tag */, const Vec128 v) { @@ -2451,7 +2646,7 @@ HWY_INLINE Vec128 ConvertTo( // ------------------------------ Round (IfThenElse, mask, logical) -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 // Toward nearest integer HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1) @@ -2472,18 +2667,26 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, // representation, clearing the lowest 23-exp mantissa bits. This requires 9 // integer operations and 3 constants, which is likely more expensive. +namespace detail { + +// The original value is already the desired result if NaN or the magnitude is +// large (i.e. the value is already an integer). +template +HWY_API Mask128 UseInt(const Vec128 v) { + return Abs(v) < Set(Simd(), MantissaEnd()); +} + +} // namespace detail + template HWY_INLINE Vec128 Trunc(const Vec128 v) { const Simd df; - const Simd di; + const RebindToSigned di; const auto integer = ConvertTo(di, v); // round toward 0 const auto int_f = ConvertTo(df, integer); - // The original value is already the desired result if NaN or the magnitude is - // large (i.e. the value is already an integer). 
- const auto max = Set(df, MantissaEnd()); - return IfThenElse(Abs(v) < max, int_f, v); + return IfThenElse(detail::UseInt(v), int_f, v); } template @@ -2506,7 +2709,7 @@ HWY_INLINE Vec128 Round(const template HWY_INLINE Vec128 Ceil(const Vec128 v) { const Simd df; - const Simd di; + const RebindToSigned di; const auto integer = ConvertTo(di, v); // round toward 0 const auto int_f = ConvertTo(df, integer); @@ -2514,9 +2717,7 @@ HWY_INLINE Vec128 Ceil(const V // Truncating a positive non-integer ends up smaller; if so, add 1. const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); - // Keep original if NaN or the magnitude is large (already an int). - const auto max = Set(df, MantissaEnd()); - return IfThenElse(Abs(v) < max, int_f - neg1, v); + return IfThenElse(detail::UseInt(v), int_f - neg1, v); } template @@ -2530,16 +2731,14 @@ HWY_INLINE Vec128 Floor(const // Truncating a negative non-integer ends up larger; if so, subtract 1. const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); - // Keep original if NaN or the magnitude is large (already an int). - const auto max = Set(df, MantissaEnd()); - return IfThenElse(Abs(v) < max, int_f + neg1, v); + return IfThenElse(detail::UseInt(v), int_f + neg1, v); } #endif // ------------------------------ NearestInt (Round) -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 NearestInt(const Vec128 v) { return Vec128(vcvtnq_s32_f32(v.raw)); @@ -2596,7 +2795,7 @@ HWY_INLINE Vec128 LowerHalf( HWY_INLINE Vec128 LowerHalf(const Vec128 v) { return Vec128(vget_low_f32(v.raw)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 LowerHalf(const Vec128 v) { return Vec128(vget_low_f64(v.raw)); } @@ -2629,7 +2828,7 @@ HWY_INLINE Vec128 UpperHalf( HWY_INLINE Vec128 UpperHalf(const Vec128 v) { return Vec128(vget_high_f32(v.raw)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 UpperHalf(const Vec128 v) { return Vec128(vget_high_f64(v.raw)); } @@ -2714,7 +2913,7 @@ HWY_INLINE Vec128 ShiftRightLanes( // ------------------------------ Broadcast/splat any lane -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 // Unsigned template HWY_INLINE Vec128 Broadcast(const Vec128 v) { @@ -2886,7 +3085,7 @@ HWY_API Vec128 TableLookupBytes(const const Vec128 from) { const Full128 d; const Repartition d8; -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return BitCast(d, Vec128(vqtbl1q_u8(BitCast(d8, bytes).raw, BitCast(d8, from).raw))); #else @@ -2911,33 +3110,58 @@ HWY_INLINE Vec128 TableLookupBytes BitCast(d8, from).raw))); } -// ------------------------------ Hard-coded shuffles +// ------------------------------ TableLookupLanes -// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). -// Shuffle0321 rotates one lane to the right (the previous least-significant -// lane is now most-significant). These could also be implemented via -// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. +// Returned by SetTableIndices for use by TableLookupLanes. 
+template +struct Indices128 { + typename detail::Raw128::type raw; +}; -// Swap 32-bit halves in 64-bits -HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { - return Vec128(vrev64_u32(v.raw)); -} -HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { - return Vec128(vrev64_s32(v.raw)); -} -HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { - return Vec128(vrev64_f32(v.raw)); +template +HWY_INLINE Indices128 SetTableIndices(Simd d, const int32_t* idx) { +#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) + for (size_t i = 0; i < N; ++i) { + HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); + } +#endif + + const Repartition d8; + alignas(16) uint8_t control[16] = {0}; + for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + control[idx_lane * sizeof(T) + idx_byte] = + static_cast(idx[idx_lane] * sizeof(T) + idx_byte); + } + } + return Indices128{BitCast(d, Load(d8, control)).raw}; } -HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { - return Vec128(vrev64q_u32(v.raw)); + +template +HWY_INLINE Vec128 TableLookupLanes( + const Vec128 v, const Indices128 idx) { + return TableLookupBytes(v, Vec128{idx.raw}); } -HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { - return Vec128(vrev64q_s32(v.raw)); +template +HWY_INLINE Vec128 TableLookupLanes( + const Vec128 v, const Indices128 idx) { + return TableLookupBytes(v, Vec128{idx.raw}); } -HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { - return Vec128(vrev64q_f32(v.raw)); +template +HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, + const Indices128 idx) { + const Simd di; + const auto idx_i = BitCast(di, Vec128{idx.raw}); + return BitCast(Simd(), TableLookupBytes(BitCast(di, v), idx_i)); } +// ------------------------------ Other shuffles (TableLookupBytes) + +// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). +// Shuffle0321 rotates one lane to the right (the previous least-significant +// lane is now most-significant). These could also be implemented via +// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. + // Swap 64-bit halves template HWY_INLINE Vec128 Shuffle1032(const Vec128 v) { @@ -2975,49 +3199,6 @@ HWY_INLINE Vec128 Shuffle0123(const V return TableLookupBytes(v, BitCast(d, Load(d8, bytes))); } -// ------------------------------ TableLookupLanes - -// Returned by SetTableIndices for use by TableLookupLanes. 
-template -struct Indices128 { - uint8x16_t raw; -}; - -template -HWY_INLINE Indices128 SetTableIndices(const Full128, const int32_t* idx) { -#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) - const size_t N = 16 / sizeof(T); - for (size_t i = 0; i < N; ++i) { - HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); - } -#endif - - const Full128 d8; - alignas(16) uint8_t control[16]; - for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { - const size_t idx_lane = idx_byte / sizeof(T); - const size_t mod = idx_byte % sizeof(T); - control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; - } - return Indices128{Load(d8, control).raw}; -} - -HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, - const Indices128 idx) { - return TableLookupBytes(v, Vec128(idx.raw)); -} -HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, - const Indices128 idx) { - return TableLookupBytes(v, Vec128(idx.raw)); -} -HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, - const Indices128 idx) { - const Full128 di; - const Full128 df; - return BitCast(df, - TableLookupBytes(BitCast(di, v), Vec128(idx.raw))); -} - // ------------------------------ Interleave lanes // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides @@ -3029,7 +3210,7 @@ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Inter HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2) HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2) -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 // For 64 bit types, we only have the "q" version of the function defined as // interleaving 64-wide registers with 64-wide types in them makes no sense. HWY_INLINE Vec128 InterleaveLower(const Vec128 a, @@ -3079,7 +3260,7 @@ HWY_INLINE Vec128 InterleaveLower const Vec128 b) { return Vec128(vzip1q_f32(a.raw, b.raw)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 InterleaveLower(const Vec128 a, const Vec128 b) { return Vec128(vzip1q_f64(a.raw, b.raw)); @@ -3090,10 +3271,10 @@ HWY_INLINE Vec128 InterleaveUpper const Vec128 b) { return Vec128(vzip2q_f32(a.raw, b.raw)); } -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2q_s64(a.raw, b.raw)); + return Vec128(vzip2q_f64(a.raw, b.raw)); } #endif @@ -3105,119 +3286,125 @@ HWY_INLINE Vec128 InterleaveUppe // Full vectors HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1q_u8(a.raw, b.raw)); + return Vec128(vreinterpretq_u16_u8(vzip1q_u8(a.raw, b.raw))); } HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1q_u16(a.raw, b.raw)); + return Vec128(vreinterpretq_u32_u16(vzip1q_u16(a.raw, b.raw))); } HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1q_u32(a.raw, b.raw)); + return Vec128(vreinterpretq_u64_u32(vzip1q_u32(a.raw, b.raw))); } HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1q_s8(a.raw, b.raw)); + return Vec128(vreinterpretq_s16_s8(vzip1q_s8(a.raw, b.raw))); } HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1q_s16(a.raw, b.raw)); + return Vec128(vreinterpretq_s32_s16(vzip1q_s16(a.raw, b.raw))); } HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1q_s32(a.raw, b.raw)); + return Vec128(vreinterpretq_s64_s32(vzip1q_s32(a.raw, b.raw))); } HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2q_u8(a.raw, b.raw)); + return Vec128(vreinterpretq_u16_u8(vzip2q_u8(a.raw, b.raw))); } HWY_INLINE 
Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2q_u16(a.raw, b.raw)); + return Vec128(vreinterpretq_u32_u16(vzip2q_u16(a.raw, b.raw))); } HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2q_u32(a.raw, b.raw)); + return Vec128(vreinterpretq_u64_u32(vzip2q_u32(a.raw, b.raw))); } HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2q_s8(a.raw, b.raw)); + return Vec128(vreinterpretq_s16_s8(vzip2q_s8(a.raw, b.raw))); } HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2q_s16(a.raw, b.raw)); + return Vec128(vreinterpretq_s32_s16(vzip2q_s16(a.raw, b.raw))); } HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2q_s32(a.raw, b.raw)); + return Vec128(vreinterpretq_s64_s32(vzip2q_s32(a.raw, b.raw))); } // Half vectors or less template HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1_u8(a.raw, b.raw)); + return Vec128( + vreinterpret_u16_u8(vzip1_u8(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1_u16(a.raw, b.raw)); + return Vec128( + vreinterpret_u32_u16(vzip1_u16(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1_u32(a.raw, b.raw)); + return Vec128( + vreinterpret_u64_u32(vzip1_u32(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1_s8(a.raw, b.raw)); + return Vec128( + vreinterpret_s16_s8(vzip1_s8(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1_s16(a.raw, b.raw)); + return Vec128( + vreinterpret_s32_s16(vzip1_s16(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128(vzip1_s32(a.raw, b.raw)); + return Vec128( + vreinterpret_s64_s32(vzip1_s32(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2_u8(a.raw, b.raw)); + return Vec128(vreinterpret_u16_u8(vzip2_u8(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2_u16(a.raw, b.raw)); + return Vec128(vreinterpret_u32_u16(vzip2_u16(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2_u32(a.raw, b.raw)); + return Vec128(vreinterpret_u64_u32(vzip2_u32(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2_s8(a.raw, b.raw)); + return Vec128(vreinterpret_s16_s8(vzip2_s8(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2_s16(a.raw, b.raw)); + return Vec128(vreinterpret_s32_s16(vzip2_s16(a.raw, b.raw))); } template HWY_INLINE Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128(vzip2_s32(a.raw, b.raw)); + return Vec128(vreinterpret_s64_s32(vzip2_s32(a.raw, b.raw))); } // ------------------------------ Blocks @@ -3274,84 +3461,113 @@ HWY_INLINE Vec128 OddEven(const Vec12 // ================================================== MISC -// Returns a vector with lane i=[0, N) set to "first" + i. 
-template -Vec128 Iota(const Simd d, const T2 first) { - HWY_ALIGN T lanes[16 / sizeof(T)]; - for (size_t i = 0; i < 16 / sizeof(T); ++i) { - lanes[i] = static_cast(first + static_cast(i)); +// ------------------------------ Scatter (Store) + +template +HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Simd(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template +HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Index index_lanes[N]; + Store(index, Simd(), index_lanes); + + for (size_t i = 0; i < N; ++i) { + base[index_lanes[i]] = lanes[i]; } - return Load(d, lanes); } -// ------------------------------ Gather (requires GetLane) +// ------------------------------ Gather (Load/Store) template HWY_API Vec128 GatherOffset(const Simd d, const T* HWY_RESTRICT base, const Vec128 offset) { - static_assert(N == 1, "NEON does not support full gather"); - static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); - const uintptr_t address = reinterpret_cast(base) + GetLane(offset); - T val; - CopyBytes(reinterpret_cast(address), &val); - return Set(d, val); + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Simd(), offset_lanes); + + alignas(16) T lanes[N]; + const uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); } template HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, const Vec128 index) { - static_assert(N == 1, "NEON does not support full gather"); - static_assert(sizeof(T) == sizeof(Index), "T must match Index"); - return Set(d, base[GetLane(index)]); + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) Index index_lanes[N]; + Store(index, Simd(), index_lanes); + + alignas(16) T lanes[N]; + for (size_t i = 0; i < N; ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); } -// ------------------------------ ARMv7 int64 comparisons (requires Shuffle2301) +// ------------------------------ Reductions -#if !defined(__aarch64__) +namespace detail { -template -HWY_INLINE Mask128 operator==(const Vec128 a, - const Vec128 b) { - const Simd d32; - const Simd d64; - const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); - const auto cmp64 = cmp32 & Shuffle2301(cmp32); - return MaskFromVec(BitCast(d64, cmp64)); +// N=1 for any T: no-op +template +HWY_API Vec128 SumOfLanes(const Vec128 v) { + return v; } - -template -HWY_INLINE Mask128 operator==(const Vec128 a, - const Vec128 b) { - const Simd d32; - const Simd d64; - const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); - const auto cmp64 = cmp32 & Shuffle2301(cmp32); - return MaskFromVec(BitCast(d64, cmp64)); +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; } -HWY_INLINE Mask128 
operator<(const Vec128 a, - const Vec128 b) { - const int64x2_t sub = vqsubq_s64(a.raw, b.raw); - return MaskFromVec(BroadcastSignBit(Vec128(sub))); +// u32/i32/f32: N=2 +template +HWY_API Vec128 SumOfLanes(const Vec128 v10) { + return v10 + Shuffle2301(v10); } -HWY_INLINE Mask128 operator<(const Vec128 a, - const Vec128 b) { - const int64x1_t sub = vqsub_s64(a.raw, b.raw); - return MaskFromVec(BroadcastSignBit(Vec128(sub))); +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Min(v10, Shuffle2301(v10)); } - -template -HWY_INLINE Mask128 operator>(const Vec128 a, - const Vec128 b) { - return b < a; +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Max(v10, Shuffle2301(v10)); } -#endif - -// ------------------------------ Reductions -#if defined(__aarch64__) -// Supported for 32b and 64b vector types. Returns the sum in each lane. +// full vectors +#if HWY_ARCH_ARM_A64 HWY_INLINE Vec128 SumOfLanes(const Vec128 v) { return Vec128(vdupq_n_u32(vaddvq_u32(v.raw))); } @@ -3398,20 +3614,15 @@ HWY_INLINE Vec128 SumOfLanes(co } #endif -namespace detail { - -// For u32/i32/f32. -template -HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, - const Vec128 v3210) { +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Min(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Min(v20_31_20_31, v31_20_31_20); } -template -HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, - const Vec128 v3210) { +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Max(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); @@ -3419,15 +3630,13 @@ HWY_API Vec128 MaxOfLanes(hwy::Siz } // For u64/i64[/f64]. -template -HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, - const Vec128 v10) { +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Min(v10, v01); } -template -HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, - const Vec128 v10) { +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Max(v10, v01); } @@ -3435,6 +3644,10 @@ HWY_API Vec128 MaxOfLanes(hwy::Siz } // namespace detail template +HWY_API Vec128 SumOfLanes(const Vec128 v) { + return detail::SumOfLanes(v); +} +template HWY_API Vec128 MinOfLanes(const Vec128 v) { return detail::MinOfLanes(hwy::SizeTag(), v); } @@ -3457,18 +3670,18 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si const Vec128 values = BitCast(du, VecFromMask(Full128(), mask)) & Load(du, kSliceLanes); -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 // Can't vaddv - we need two separate bytes (16 bits). const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); const uint8x8_t x4 = vpadd_u8(x2, x2); const uint8x8_t x8 = vpadd_u8(x4, x4); - return vreinterpret_u16_u8(x8)[0]; + return vget_lane_u64(vreinterpret_u64_u8(x8), 0); #else // Don't have vpaddq, so keep doubling lane size. 
const uint16x8_t x2 = vpaddlq_u8(values.raw); const uint32x4_t x4 = vpaddlq_u16(x2); const uint64x2_t x8 = vpaddlq_u32(x4); - return (uint64_t(x8[1]) << 8) | x8[0]; + return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); #endif } @@ -3484,7 +3697,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si const Vec128 slice(Load(Simd(), kSliceLanes).raw); const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return vaddv_u8(values.raw); #else const uint16x4_t x2 = vpaddl_u8(values.raw); @@ -3503,7 +3716,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si const Full128 du; const Vec128 values = BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return vaddvq_u16(values.raw); #else const uint32x4_t x2 = vpaddlq_u16(values.raw); @@ -3522,7 +3735,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si const Simd du; const Vec128 slice(Load(Simd(), kSliceLanes).raw); const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return vaddv_u16(values.raw); #else const uint32x2_t x2 = vpaddl_u16(values.raw); @@ -3539,7 +3752,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si const Full128 du; const Vec128 values = BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return vaddvq_u32(values.raw); #else const uint64x2_t x2 = vpaddlq_u32(values.raw); @@ -3557,7 +3770,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si const Simd du; const Vec128 slice(Load(Simd(), kSliceLanes).raw); const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return vaddv_u32(values.raw); #else const uint64x1_t x2 = vpaddl_u32(values.raw); @@ -3572,7 +3785,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si const Full128 du; const Vec128 values = BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return vaddvq_u64(values.raw); #else return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); @@ -3612,13 +3825,13 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag const int8x16_t ones = vnegq_s8(BitCast(di, VecFromMask(Full128(), mask)).raw); -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return vaddvq_s8(ones); #else const int16x8_t x2 = vpaddlq_s8(ones); const int32x4_t x4 = vpaddlq_s16(x2); const int64x2_t x8 = vpaddlq_s32(x4); - return x8[0] + x8[1]; + return vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1); #endif } template @@ -3627,12 +3840,12 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag const int16x8_t ones = vnegq_s16(BitCast(di, VecFromMask(Full128(), mask)).raw); -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return vaddvq_s16(ones); #else const int32x4_t x2 = vpaddlq_s16(ones); const int64x2_t x4 = vpaddlq_s32(x2); - return x4[0] + x4[1]; + return vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1); #endif } @@ -3642,26 +3855,26 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag const int32x4_t ones = vnegq_s32(BitCast(di, VecFromMask(Full128(), mask)).raw); -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 return vaddvq_s32(ones); #else const int64x2_t x2 = vpaddlq_s32(ones); - return x2[0] + x2[1]; + return vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1); #endif } template HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 mask) { -#if defined(__aarch64__) +#if HWY_ARCH_ARM_A64 const Full128 di; const int64x2_t ones = vnegq_s64(BitCast(di, VecFromMask(Full128(), mask)).raw); return vaddvq_s64(ones); #else - const Full128 
di; - const int64x2_t ones = - vshrq_n_u64(BitCast(di, VecFromMask(Full128(), mask)).raw, 63); - return ones[0] + ones[1]; + const Full128 du; + const auto mask_u = VecFromMask(du, RebindMask(du, mask)); + const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); + return vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1); #endif } @@ -3690,9 +3903,15 @@ HWY_INLINE size_t StoreMaskBits(const Ma // Full template HWY_INLINE bool AllFalse(const Mask128 m) { +#if HWY_ARCH_ARM_A64 + const Full128 d32; + const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(Full128(), m))); + return (vmaxvq_u32(m32.raw) == 0); +#else const auto v64 = BitCast(Full128(), VecFromMask(Full128(), m)); uint32x2_t a = vqmovn_u64(v64.raw); - return vreinterpret_u64_u32(a)[0] == 0; + return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0; +#endif } // Partial @@ -3711,8 +3930,160 @@ HWY_INLINE bool AllTrue(const Mask128 Load8Bytes(Full128 /*d*/, + const uint8_t* bytes) { + return Vec128(vreinterpretq_u8_u64( + vld1q_dup_u64(reinterpret_cast(bytes)))); +} + +// Load 8 bytes and return half-reg with N <= 8 bytes. +template +HWY_INLINE Vec128 Load8Bytes(Simd d, + const uint8_t* bytes) { + return Load(d, bytes); +} + template -HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { +HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<2> /*tag*/, + const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd d; + const Repartition d8; + const Simd du; + + // ARM does not provide an equivalent of AVX2 permutevar, so we need byte + // indices for VTBL (one vector's worth for each of 256 combinations of + // 8 mask bits). Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. 
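// Illustrative sketch (not part of the patch): a scalar model of the 16-bit
// Compress index scheme described in the comment above. Each 8-byte table row
// holds the byte offsets (2*lane) of the selected lanes in compacted order,
// zero-padded; duplicating each offset and adding 1 to the second copy yields
// the per-byte shuffle indices that TableLookupBytes consumes (the patch does
// this with ZipLower(byte_idx, byte_idx) + 0x0100). Function names here are
// hypothetical, chosen only for this sketch.
#include <cstddef>
#include <cstdint>
// Computes what one 8-byte table row contains for a given 8-bit mask.
static void TableRowForMask(uint64_t mask_bits, uint8_t row[8]) {
  size_t pos = 0;
  for (size_t lane = 0; lane < 8; ++lane) {
    if (mask_bits & (1ull << lane)) {
      row[pos++] = static_cast<uint8_t>(2 * lane);  // byte offset of u16 lane
    }
  }
  while (pos < 8) row[pos++] = 0;  // zero padding, as in the table below
}
// Expands a row into the 16 byte-shuffle indices used for the actual VTBL.
static void ByteIndicesFromRow(const uint8_t row[8], uint8_t idx[16]) {
  for (size_t i = 0; i < 8; ++i) {
    idx[2 * i + 0] = row[i];                            // low byte of the lane
    idx[2 * i + 1] = static_cast<uint8_t>(row[i] + 1);  // high byte (+0x0100)
  }
}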
+ alignas(16) constexpr uint8_t table[256 * 8] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, + 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, + 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, + 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, + 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, + 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, + 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, + 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, + 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, + 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, + 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, + 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, + 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, + 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, + 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, + 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, + 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, + 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, + 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, + 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, + 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, + 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, + 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, + 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, + 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, + 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, + 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, + 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, + 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, + 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, + 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, + 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, + 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, + 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, + 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, + 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, + 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, + 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, + 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, + 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, + 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, + 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, + 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, + 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, + 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, + 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, + 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, + 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, + 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, + 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, + 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, + 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, + 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, + 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, + 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, + 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, + 0, 2, 
4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, + 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, + 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, + 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, + 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, + 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, + 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, + 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, + 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, + 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, + 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, + 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, + 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, + 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, + 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, + 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, + 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, + 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, + 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, + 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, + 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, + 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, + 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, + 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, + 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, + 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, + 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, + 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, + 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, + 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, + 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, + 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, + 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, + 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, + 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, + 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, + 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, + 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, + 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, + 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, + 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, + 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, + 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, + 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, + 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, + 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, + 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, + 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, + 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, + 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, + 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, + 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, + 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, + 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); + const Vec128 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template +HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<4> /*tag*/, + const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // 
There are only 4 lanes, so we can afford to load the index vector directly. @@ -3742,7 +4113,8 @@ HWY_INLINE Vec128 Idx32x4FromBits( #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64 template -HWY_INLINE Vec128 Idx64x2FromBits(const uint64_t mask_bits) { +HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<8> /*tag*/, + const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. @@ -3761,59 +4133,15 @@ HWY_INLINE Vec128 Idx64x2FromBits( // Helper function called by both Compress and CompressStore - avoids a // redundant BitsFromMask in the latter. - -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx32x4FromBits(mask_bits); - return TableLookupBytes(v, idx); -} -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx32x4FromBits(mask_bits); - return TableLookupBytes(v, idx); -} - -#if HWY_CAP_INTEGER64 - -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx64x2FromBits(mask_bits); - return TableLookupBytes(v, idx); -} -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx64x2FromBits(mask_bits); - return TableLookupBytes(v, idx); -} - -#endif - -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx32x4FromBits(mask_bits); - const Simd df; - const Simd di; - return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); -} - -#if HWY_CAP_FLOAT64 - -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx64x2FromBits(mask_bits); - const Simd df; - const Simd di; - return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); +template +HWY_API Vec128 Compress(Vec128 v, const uint64_t mask_bits) { + const auto idx = + detail::IdxFromBits(hwy::SizeTag(), mask_bits); + using D = Simd; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } -#endif - } // namespace detail template @@ -3831,6 +4159,79 @@ HWY_API size_t CompressStore(Vec128 v0, + const Vec128 v1, + const Vec128 v2, + Full128 /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw}; + vst3q_u8(unaligned, triple); +} + +// 64 bits +HWY_API void StoreInterleaved3(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; + vst3_u8(unaligned, triple); +} + +// <= 32 bits: avoid writing more than N bytes by copying to buffer +template +HWY_API void StoreInterleaved3(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + alignas(16) uint8_t buf[24]; + const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; + vst3_u8(buf, triple); + CopyBytes(buf, unaligned); +} + +// ------------------------------ StoreInterleaved4 + +// 128 bits +HWY_API void StoreInterleaved4(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, + const Vec128 v3, + Full128 /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; + vst4q_u8(unaligned, quad); +} + +// 64 bits +HWY_API void StoreInterleaved4(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, + const Vec128 v3, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; + vst4_u8(unaligned, quad); +} + 
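// Illustrative sketch (not part of the patch): how the StoreInterleaved4
// overloads added above are meant to be used, e.g. writing planar R/G/B/A
// bytes as interleaved RGBA via vst4q_u8. A minimal sketch assuming the usual
// static-dispatch pattern from hwy/highway.h and that num is a multiple of the
// vector length; the function and variable names are hypothetical.
#include <cstddef>
#include <cstdint>
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace example {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;
void PlanarToRgba(const uint8_t* HWY_RESTRICT r, const uint8_t* HWY_RESTRICT g,
                  const uint8_t* HWY_RESTRICT b, const uint8_t* HWY_RESTRICT a,
                  size_t num, uint8_t* HWY_RESTRICT rgba) {
  const hn::Full128<uint8_t> d;  // 16 bytes per iteration on NEON
  for (size_t i = 0; i < num; i += hn::Lanes(d)) {
    // Four planar registers in, one interleaved r,g,b,a,r,g,b,a,... store out.
    hn::StoreInterleaved4(hn::LoadU(d, r + i), hn::LoadU(d, g + i),
                          hn::LoadU(d, b + i), hn::LoadU(d, a + i), d,
                          rgba + 4 * i);
  }
}
}  // namespace HWY_NAMESPACE
}  // namespace example
HWY_AFTER_NAMESPACE();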
+// <= 32 bits: avoid writing more than N bytes by copying to buffer +template +HWY_API void StoreInterleaved4(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, + const Vec128 v3, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + alignas(16) uint8_t buf[32]; + const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; + vst4_u8(buf, quad); + CopyBytes(buf, unaligned); +} + // ================================================== Operator wrapper // These apply to all x86_*-inl.h because there are no restrictions on V. @@ -3885,7 +4286,8 @@ HWY_API auto Le(V a, V b) -> decltype(a return a <= b; } -#if !defined(__aarch64__) +namespace detail { // for code folding +#if HWY_ARCH_ARM_V7 #undef vuzp1_s8 #undef vuzp1_u8 #undef vuzp1_s16 @@ -3972,6 +4374,7 @@ HWY_API auto Le(V a, V b) -> decltype(a #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 #undef HWY_NEON_DEF_FUNCTION_UINTS #undef HWY_NEON_EVAL +} // namespace detail // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/arm_neon-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h.12 2021-06-02 10:56:05.230904367 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -39,6 +39,11 @@ using TFromV = TFromD>; hwy::EnableIf>() && !IsFloat>()>* = nullptr #define HWY_IF_FLOAT_V(V) hwy::EnableIf>()>* = nullptr +// kShift = log2 of multiplier: 0 for m1, 1 for m2, -2 for mf4 +template +using Full = Simd> (-kShift)) + : (HWY_LANES(T) << kShift)>; + // ================================================== MACROS // Generate specializations and function definitions using X macros. Although @@ -58,29 +63,30 @@ namespace detail { // for code folding // For given SEW, iterate over all LMUL. Precompute SEW/LMUL => MLEN because the // preprocessor cannot easily do it. 
-#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 1, 8, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 2, 4, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 4, 2, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 8, 1, NAME, OP) - -#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 1, 16, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 2, 8, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 4, 4, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 8, 2, NAME, OP) - -#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 1, 32, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 2, 16, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 4, 8, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 8, 4, NAME, OP) - -#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, 1, 64, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, 2, 32, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, 4, 16, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, 8, 8, NAME, OP) +// TODO(janwas): GCC does not yet support fractional LMUL +#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, m1, /*kShift=*/0, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, m2, /*kShift=*/1, /*MLEN=*/4, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, m4, /*kShift=*/2, /*MLEN=*/2, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, m8, /*kShift=*/3, /*MLEN=*/1, NAME, OP) + +#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, m1, /*kShift=*/0, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, m2, /*kShift=*/1, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, m4, /*kShift=*/2, /*MLEN=*/4, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, m8, /*kShift=*/3, /*MLEN=*/2, NAME, OP) + +#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, m1, /*kShift=*/0, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, m2, /*kShift=*/1, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, m4, /*kShift=*/2, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, m8, /*kShift=*/3, /*MLEN=*/4, NAME, OP) + +#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, m1, /*kShift=*/0, /*MLEN=*/64, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, m2, /*kShift=*/1, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, m4, /*kShift=*/2, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, m8, /*kShift=*/3, /*MLEN=*/8, NAME, OP) // SEW for unsigned: #define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP) \ @@ -153,63 +159,61 @@ namespace detail { // for code folding // Assemble types for use in x-macros #define HWY_RVV_T(BASE, SEW) BASE##SEW##_t -#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##m##LMUL -#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##m##LMUL##_t +#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##LMUL +#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t #define HWY_RVV_M(MLEN) vbool##MLEN##_t } // namespace detail // TODO(janwas): remove typedefs and only use HWY_RVV_V etc. directly -// TODO(janwas): do we want fractional LMUL? (can encode as negative) -// Mixed-precision code can use LMUL 1..8 and that should be enough unless they -// need many registers. -#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - using HWY_RVV_D(CHAR, SEW, LMUL) = \ - Simd; \ - using V##CHAR##SEW##m##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \ - template <> \ - struct DFromV_t { \ - using Lane = HWY_RVV_T(BASE, SEW); \ - using type = Simd; \ +// Until we have full intrinsic support for fractional LMUL, mixed-precision +// code can use LMUL 1..8 (adequate unless they need many registers). 
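// Illustrative sketch (not part of the patch): the arithmetic the X-macro
// tables above precompute. kShift is log2(LMUL), so the lane count of
// Full<T, kShift> scales by 2^kShift relative to m1 (a negative kShift would
// express fractional LMUL once the intrinsics support it), and the mask type
// is vbool<MLEN>_t with MLEN = SEW / LMUL. The constants below are stand-ins
// for this sketch only, not hwy definitions.
#include <cstddef>
constexpr size_t LanesForShift(size_t lanes_m1, int kShift) {
  return (kShift < 0) ? (lanes_m1 >> (-kShift)) : (lanes_m1 << kShift);
}
constexpr size_t MLenFor(size_t sew, size_t lmul) { return sew / lmul; }
static_assert(LanesForShift(16, 0) == 16, "m1: lane count unchanged");
static_assert(LanesForShift(16, 3) == 128, "m8: 8x the lanes of m1");
static_assert(LanesForShift(16, -2) == 4, "mf4: not yet emitted above");
static_assert(MLenFor(8, 1) == 8, "matches HWY_RVV_FOREACH_08, m1");
static_assert(MLenFor(8, 8) == 1, "matches HWY_RVV_FOREACH_08, m8");
static_assert(MLenFor(64, 2) == 32, "matches HWY_RVV_FOREACH_64, m2");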
+#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + using HWY_RVV_D(CHAR, SEW, LMUL) = Full; \ + using V##CHAR##SEW##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \ + template <> \ + struct DFromV_t { \ + using Lane = HWY_RVV_T(BASE, SEW); \ + using type = Full; \ }; using Vf16m1 = vfloat16m1_t; using Vf16m2 = vfloat16m2_t; using Vf16m4 = vfloat16m4_t; using Vf16m8 = vfloat16m8_t; -using Df16m1 = Simd; -using Df16m2 = Simd; -using Df16m4 = Simd; -using Df16m8 = Simd; +using Df16m1 = Full; +using Df16m2 = Full; +using Df16m4 = Full; +using Df16m8 = Full; HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _) #undef HWY_SPECIALIZE // vector = f(d), e.g. Zero -#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(CHAR, SEW, LMUL) d) { \ (void)Lanes(d); \ - return v##OP##_##CHAR##SEW##m##LMUL(); \ + return v##OP##_##CHAR##SEW##LMUL(); \ } // vector = f(vector), e.g. Not -#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return v##OP##_v_##CHAR##SEW##m##LMUL(v); \ + return v##OP##_v_##CHAR##SEW##LMUL(v); \ } // vector = f(vector, scalar), e.g. detail::Add -#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ - return v##OP##_##CHAR##SEW##m##LMUL(a, b); \ +#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ + return v##OP##_##CHAR##SEW##LMUL(a, b); \ } // vector = f(vector, vector), e.g. Add -#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ - return v##OP##_vv_##CHAR##SEW##m##LMUL(a, b); \ + return v##OP##_vv_##CHAR##SEW##LMUL(a, b); \ } // ================================================== INIT @@ -218,9 +222,9 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _) // WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL! // vlenb is not exposed through intrinsics and vreadvl is not VLMAX. -#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \ - return v##OP##SEW##m##LMUL(); \ +#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \ + return v##OP##SEW##LMUL(); \ } HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e) @@ -233,19 +237,31 @@ HWY_RVV_FOREACH(HWY_RVV_RETV_ARGD, Zero, template using VFromD = decltype(Zero(D())); +// Partial +template +HWY_API VFromD> Zero(Simd /*tag*/) { + return Zero(Full()); +} + // ------------------------------ Set // vector = f(d, scalar), e.g. 
Set -#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, HWY_RVV_T(BASE, SEW) arg) { \ (void)Lanes(d); \ - return v##OP##_##CHAR##SEW##m##LMUL(arg); \ + return v##OP##_##CHAR##SEW##LMUL(arg); \ } HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x) HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f) #undef HWY_RVV_SET +// Partial vectors +template +HWY_API VFromD> Set(Simd /*tag*/, T arg) { + return Set(Full(), arg); +} + // ------------------------------ Undefined // RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized @@ -265,7 +281,7 @@ HWY_API VFromD Undefined(D d) { namespace detail { // u8: no change -#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ return v; \ @@ -276,25 +292,25 @@ namespace detail { } // Other integers -#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return v##OP##_v_##CHAR##SEW##m##LMUL##_u8m##LMUL(v); \ - } \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ - HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \ - return v##OP##_v_u8m##LMUL##_##CHAR##SEW##m##LMUL(v); \ +#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \ + } \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ + HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \ + return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \ } // Float: first cast to/from unsigned -#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return v##OP##_v_u##SEW##m##LMUL##_u8m##LMUL( \ - v##OP##_v_f##SEW##m##LMUL##_u##SEW##m##LMUL(v)); \ - } \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ - HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \ - return v##OP##_v_u##SEW##m##LMUL##_f##SEW##m##LMUL( \ - v##OP##_v_u8m##LMUL##_u##SEW##m##LMUL(v)); \ +#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \ + v##OP##_v_f##SEW##LMUL##_u##SEW##LMUL(v)); \ + } \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ + HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \ + return v##OP##_v_u##SEW##LMUL##_f##SEW##LMUL( \ + v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \ } HWY_RVV_FOREACH_U08(HWY_RVV_CAST_NOP, _, _) @@ -315,6 +331,12 @@ HWY_API VFromD BitCast(D d, FromV v) return detail::BitCastFromByte(d, detail::BitCastToByte(v)); } +// Partial +template +HWY_API VFromD> BitCast(Simd /*tag*/, FromV v) { + return BitCast(Full(), v); +} + namespace detail { template >> @@ -336,6 +358,12 @@ HWY_API VFromD Iota0(const D /*d*/) return BitCastToUnsigned(Iota0(DU())); } +// Partial +template +HWY_API VFromD> Iota0(Simd /*tag*/) { + return Iota0(Full()); +} + } // namespace detail // ================================================== LOGICAL @@ -370,11 +398,11 @@ HWY_API V And(const V a, const V b) { // ------------------------------ Or // Scalar argument plus mask. Used by VecFromMask. 
-#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_T(BASE, SEW) imm, \ HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff) { \ - return v##OP##_##CHAR##SEW##m##LMUL##_m(mask, maskedoff, v, imm); \ + return v##OP##_##CHAR##SEW##LMUL##_m(mask, maskedoff, v, imm); \ } namespace detail { @@ -466,14 +494,14 @@ HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, // ------------------------------ ShiftLeft[Same] // Intrinsics do not define .vi forms, so use .vx instead. -#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return v##OP##_vx_##CHAR##SEW##m##LMUL(v, kBits); \ - } \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ - return v##OP##_vx_##CHAR##SEW##m##LMUL(v, static_cast(bits)); \ +#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits); \ + } \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ + return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast(bits)); \ } HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll) @@ -486,19 +514,18 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRi #undef HWY_RVV_SHIFT // ------------------------------ Shl -#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ - return v##OP##_vv_##CHAR##SEW##m##LMUL(v, bits); \ + return v##OP##_vv_##CHAR##SEW##LMUL(v, bits); \ } HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll) -#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ - return v##OP##_vv_##CHAR##SEW##m##LMUL(v, \ - detail::BitCastToUnsigned(bits)); \ + return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits)); \ } HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll) @@ -569,11 +596,11 @@ HWY_API V ApproximateReciprocalSqrt(cons // ------------------------------ MulAdd // Note: op is still named vv, not vvv. -#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \ HWY_RVV_V(BASE, SEW, LMUL) add) { \ - return v##OP##_vv_##CHAR##SEW##m##LMUL(add, mul, x); \ + return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x); \ } HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc) @@ -596,11 +623,11 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub // of all bits; SLEN 8 / LMUL 4 = half of all bits. 
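A standalone sketch of the arithmetic behind the mask-layout comment above, assuming a VLEN of 512 bits purely for illustration. A vector of SEW-bit lanes at a given LMUL needs VLEN*LMUL/SEW mask bits, i.e. an LMUL/SEW fraction of the mask register, and the reciprocal SEW/LMUL is the ratio carried by the MLEN macro argument (the vbool<MLEN>_t suffix used by the comparison intrinsics).

#include <cstdio>

int main() {
  const int vlen = 512;  // example VLEN in bits, not a real hardware query
  const struct { int sew, lmul; } cfgs[] = {{8, 1}, {8, 4}, {32, 2}};
  for (const auto& c : cfgs) {
    const int mask_bits = vlen * c.lmul / c.sew;  // bits actually used
    const int mlen = c.sew / c.lmul;              // vbool suffix / MLEN
    std::printf("e%dm%d: %d mask bits = 1/%d of VLEN, vbool%d_t\n",
                c.sew, c.lmul, mask_bits, mlen, mlen);
  }
  return 0;
}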
// mask = f(vector, vector) -#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_M(MLEN) \ NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ (void)Lanes(DFromV()); \ - return v##OP##_vv_##CHAR##SEW##m##LMUL##_b##MLEN(a, b); \ + return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b); \ } // ------------------------------ Eq @@ -675,11 +702,11 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xo #undef HWY_RVV_RETM_ARGMM // ------------------------------ IfThenElse -#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ - HWY_RVV_V(BASE, SEW, LMUL) no) { \ - return v##OP##_vvm_##CHAR##SEW##m##LMUL(m, no, yes); \ +#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ + HWY_RVV_V(BASE, SEW, LMUL) no) { \ + return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes); \ } HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge) @@ -710,7 +737,7 @@ template using MFromD = decltype(MaskFromVec(Zero(D()))); template -HWY_API MFromD RebindMask(const D d, const MFrom mask) { +HWY_API MFromD RebindMask(const D /*d*/, const MFrom mask) { // No need to check lane size/LMUL are the same: if not, casting MFrom to // MFromD would fail. return mask; @@ -774,17 +801,17 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, // ------------------------------ Load -#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ - (void)Lanes(d); \ - return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p); \ +#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ + (void)Lanes(d); \ + return v##OP##SEW##_v_##CHAR##SEW##LMUL(p); \ } HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le) #undef HWY_RVV_LOAD -// Partial load +// Partial template HWY_API VFromD> Load(Simd d, const T* HWY_RESTRICT p) { return Load(d, p); @@ -800,16 +827,22 @@ HWY_API VFromD LoadU(D d, const TFrom // ------------------------------ Store -#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ - HWY_RVV_D(CHAR, SEW, LMUL) d, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ - (void)Lanes(d); \ - return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p, v); \ +#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ + HWY_RVV_D(CHAR, SEW, LMUL) d, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ + (void)Lanes(d); \ + return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v); \ } HWY_RVV_FOREACH(HWY_RVV_RET_ARGVDP, Store, se) #undef HWY_RVV_RET_ARGVDP +// Partial +template +HWY_API void Store(VFromD> v, Simd d, T* HWY_RESTRICT p) { + return Store(v, Full(), p); +} + // ------------------------------ StoreU // RVV only requires lane alignment, not natural alignment of the entire vector. 
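The "Partial" overloads added throughout this patch (Zero, Set, BitCast, Load, Store, ...) all follow the same one-line pattern: a Simd<T, N> descriptor forwards to the Full<T> implementation. A minimal standalone sketch of that dispatch follows; Full, Simd and SetFirstLane are local stand-ins here, not the real Highway types.

#include <cstddef>
#include <cstdio>

template <typename T> struct Full {};            // full-vector descriptor
template <typename T, size_t N> struct Simd {};  // partial-vector descriptor

// The real work is implemented once, for the full descriptor...
template <typename T>
T SetFirstLane(Full<T> /*tag*/, T value) { return value; }

// ...and partial descriptors simply forward to it, as in the patch.
template <typename T, size_t N>
T SetFirstLane(Simd<T, N> /*tag*/, T value) {
  return SetFirstLane(Full<T>(), value);
}

int main() {
  std::printf("%.1f\n", SetFirstLane(Simd<float, 4>(), 2.5f));  // prints 2.5
  return 0;
}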
@@ -825,19 +858,62 @@ HWY_API void Stream(const V v, D d, T* H Store(v, d, aligned); } +// ------------------------------ ScatterOffset + +#define HWY_RVV_SCATTER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ + HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ + HWY_RVV_V(int, SEW, LMUL) offset) { \ + return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ + base, detail::BitCastToUnsigned(offset), v); \ + } +HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sx) +#undef HWY_RVV_SCATTER + +// Partial +template +HWY_API void ScatterOffset(VFromD> v, Simd d, + T* HWY_RESTRICT base, + VFromD, N>> offset) { + return ScatterOffset(v, Full(), base, offset); +} + +// ------------------------------ ScatterIndex + +template +HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, + const VFromD> index) { + return ScatterOffset(v, d, base, ShiftLeft<2>(index)); +} + +template +HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, + const VFromD> index) { + return ScatterOffset(v, d, base, ShiftLeft<3>(index)); +} + // ------------------------------ GatherOffset -#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ - HWY_RVV_V(int, SEW, LMUL) offset) { \ - return v##OP##ei##SEW##_v_##CHAR##SEW##m##LMUL( \ - base, detail::BitCastToUnsigned(offset)); \ +#define HWY_RVV_GATHER(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ + const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ + HWY_RVV_V(int, SEW, LMUL) offset) { \ + return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ + base, detail::BitCastToUnsigned(offset)); \ } HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lx) #undef HWY_RVV_GATHER +// Partial +template +HWY_API VFromD> GatherOffset(Simd d, + const T* HWY_RESTRICT base, + VFromD, N>> offset) { + return GatherOffset(Full(), base, offset); +} + // ------------------------------ GatherIndex template @@ -852,37 +928,101 @@ HWY_API VFromD GatherIndex(D d, const return GatherOffset(d, base, ShiftLeft<3>(index)); } -// ================================================== CONVERT +// ------------------------------ StoreInterleaved3 -// ------------------------------ PromoteTo U +#define HWY_RVV_STORE3(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API void NAME( \ + HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b, \ + HWY_RVV_V(BASE, SEW, LMUL) c, HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ + const v##BASE##SEW##LMUL##x3_t triple = \ + vcreate_##CHAR##SEW##LMUL##x3(a, b, c); \ + return v##OP##e8_v_##CHAR##SEW##LMUL##x3(unaligned, triple); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. 
+HWY_RVV_STORE3(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved3, sseg3) +HWY_RVV_STORE3(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved3, sseg3) -HWY_API Vu16m2 PromoteTo(Du16m2 /* d */, Vu8m1 v) { return vzext_vf2_u16m2(v); } -HWY_API Vu16m4 PromoteTo(Du16m4 /* d */, Vu8m2 v) { return vzext_vf2_u16m4(v); } -HWY_API Vu16m8 PromoteTo(Du16m8 /* d */, Vu8m4 v) { return vzext_vf2_u16m8(v); } +#undef HWY_RVV_STORE3 -HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, Vu8m1 v) { return vzext_vf4_u32m4(v); } -HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, Vu8m2 v) { return vzext_vf4_u32m8(v); } +// Partial +template +HWY_API void StoreInterleaved3(VFromD> v0, VFromD> v1, + VFromD> v2, Simd /*tag*/, + T* unaligned) { + return StoreInterleaved3(v0, v1, v2, Full(), unaligned); +} + +// ------------------------------ StoreInterleaved4 + +#define HWY_RVV_STORE4(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API void NAME( \ + HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ + HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \ + HWY_RVV_D(CHAR, SEW, LMUL) /* d */, \ + HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) { \ + const v##BASE##SEW##LMUL##x4_t quad = \ + vcreate_##CHAR##SEW##LMUL##x4(v0, v1, v2, v3); \ + return v##OP##e8_v_##CHAR##SEW##LMUL##x4(aligned, quad); \ + } +// Segments are limited to 8 registers, so we can only go up to LMUL=2. +HWY_RVV_STORE4(uint, u, 8, m1, /*kShift=*/0, 8, StoreInterleaved4, sseg4) +HWY_RVV_STORE4(uint, u, 8, m2, /*kShift=*/1, 4, StoreInterleaved4, sseg4) -HWY_API Vu32m2 PromoteTo(Du32m2 /* d */, const Vu16m1 v) { - return vzext_vf2_u32m2(v); -} -HWY_API Vu32m4 PromoteTo(Du32m4 /* d */, const Vu16m2 v) { - return vzext_vf2_u32m4(v); -} -HWY_API Vu32m8 PromoteTo(Du32m8 /* d */, const Vu16m4 v) { - return vzext_vf2_u32m8(v); -} +#undef HWY_RVV_STORE4 -HWY_API Vu64m2 PromoteTo(Du64m2 /* d */, const Vu32m1 v) { - return vzext_vf2_u64m2(v); -} -HWY_API Vu64m4 PromoteTo(Du64m4 /* d */, const Vu32m2 v) { - return vzext_vf2_u64m4(v); -} -HWY_API Vu64m8 PromoteTo(Du64m8 /* d */, const Vu32m4 v) { - return vzext_vf2_u64m8(v); +// Partial +template +HWY_API void StoreInterleaved4(VFromD> v0, VFromD> v1, + VFromD> v2, VFromD> v3, + Simd /*tag*/, T* unaligned) { + return StoreInterleaved4(v0, v1, v2, v3, Full(), unaligned); } +// ================================================== CONVERT + +#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN) \ + HWY_API HWY_RVV_V(BASE, BITS, LMUL) \ + PromoteTo(HWY_RVV_D(CHAR, BITS, LMUL) /*d*/, \ + HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \ + return OP##CHAR##BITS##LMUL(v); \ + } + +// TODO(janwas): GCC does not yet support fractional LMUL +#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ + /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2)*/ \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4) + +#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ + /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4)*/ \ + /*HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2)*/ \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1) \ + HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2) + +// ------------------------------ PromoteTo + +HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 16, uint, 8) +HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 32, uint, 16) 
+HWY_RVV_PROMOTE_X2(vzext_vf2_, uint, u, 64, uint, 32) +HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8) + +HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 16, int, 8) +HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 32, int, 16) +HWY_RVV_PROMOTE_X2(vsext_vf2_, int, i, 64, int, 32) +HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8) + +HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 32, float, 16) +HWY_RVV_PROMOTE_X2(vfwcvt_f_f_v_, float, f, 64, float, 32) + +// i32 to f64 +HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32) + +#undef HWY_RVV_PROMOTE_X4 +#undef HWY_RVV_PROMOTE_X2 +#undef HWY_RVV_PROMOTE + template HWY_API VFromD> PromoteTo(Simd d, VFromD> v) { @@ -901,67 +1041,6 @@ HWY_API VFromD> Promote return BitCast(d, PromoteTo(Simd(), v)); } -// ------------------------------ PromoteTo I - -HWY_API Vi16m2 PromoteTo(Di16m2 /* d */, Vi8m1 v) { return vsext_vf2_i16m2(v); } -HWY_API Vi16m4 PromoteTo(Di16m4 /* d */, Vi8m2 v) { return vsext_vf2_i16m4(v); } -HWY_API Vi16m8 PromoteTo(Di16m8 /* d */, Vi8m4 v) { return vsext_vf2_i16m8(v); } - -HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, Vi8m1 v) { return vsext_vf4_i32m4(v); } -HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, Vi8m2 v) { return vsext_vf4_i32m8(v); } - -HWY_API Vi32m2 PromoteTo(Di32m2 /* d */, const Vi16m1 v) { - return vsext_vf2_i32m2(v); -} -HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, const Vi16m2 v) { - return vsext_vf2_i32m4(v); -} -HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, const Vi16m4 v) { - return vsext_vf2_i32m8(v); -} - -HWY_API Vi64m2 PromoteTo(Di64m2 /* d */, const Vi32m1 v) { - return vsext_vf2_i64m2(v); -} -HWY_API Vi64m4 PromoteTo(Di64m4 /* d */, const Vi32m2 v) { - return vsext_vf2_i64m4(v); -} -HWY_API Vi64m8 PromoteTo(Di64m8 /* d */, const Vi32m4 v) { - return vsext_vf2_i64m8(v); -} - -// ------------------------------ PromoteTo F - -HWY_API Vf32m2 PromoteTo(Df32m2 /* d */, const Vf16m1 v) { - return vfwcvt_f_f_v_f32m2(v); -} -HWY_API Vf32m4 PromoteTo(Df32m4 /* d */, const Vf16m2 v) { - return vfwcvt_f_f_v_f32m4(v); -} -HWY_API Vf32m8 PromoteTo(Df32m8 /* d */, const Vf16m4 v) { - return vfwcvt_f_f_v_f32m8(v); -} - -HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vf32m1 v) { - return vfwcvt_f_f_v_f64m2(v); -} -HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vf32m2 v) { - return vfwcvt_f_f_v_f64m4(v); -} -HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vf32m4 v) { - return vfwcvt_f_f_v_f64m8(v); -} - -HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vi32m1 v) { - return vfwcvt_f_x_v_f64m2(v); -} -HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vi32m2 v) { - return vfwcvt_f_x_v_f64m4(v); -} -HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vi32m4 v) { - return vfwcvt_f_x_v_f64m8(v); -} - // ------------------------------ DemoteTo U // First clamp negative numbers to zero to match x86 packus. @@ -1062,19 +1141,19 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */, // ------------------------------ ConvertTo F -#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, HWY_RVV_V(int, SEW, LMUL) v) { \ - return vfcvt_f_x_v_f##SEW##m##LMUL(v); \ + return vfcvt_f_x_v_f##SEW##LMUL(v); \ } \ /* Truncates (rounds toward zero). */ \ HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(i, SEW, LMUL) /* d */, \ HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return vfcvt_rtz_x_f_v_i##SEW##m##LMUL(v); \ + return vfcvt_rtz_x_f_v_i##SEW##LMUL(v); \ } \ /* Uses default rounding mode. 
*/ \ HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return vfcvt_x_f_v_i##SEW##m##LMUL(v); \ + return vfcvt_x_f_v_i##SEW##LMUL(v); \ } // API only requires f32 but we provide f64 for internal use (otherwise, it @@ -1082,16 +1161,23 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */, HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _) #undef HWY_RVV_CONVERT +// Partial +template +HWY_API VFromD> ConvertTo(Simd /*tag*/, FromV v) { + return ConvertTo(Full(), v); +} + // ================================================== SWIZZLE // ------------------------------ Compress -#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ - return v##OP##_vm_##CHAR##SEW##m##LMUL(mask, v, v); \ +#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ + return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v); \ } +HWY_RVV_FOREACH_UI16(HWY_RVV_COMPRESS, Compress, compress) HWY_RVV_FOREACH_UI32(HWY_RVV_COMPRESS, Compress, compress) HWY_RVV_FOREACH_UI64(HWY_RVV_COMPRESS, Compress, compress) HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress) @@ -1121,10 +1207,10 @@ HWY_API VFromD SetTableIndices(D d, // <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX // to 2048! We could instead use vrgatherei16. -#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ - return v##OP##_vv_##CHAR##SEW##m##LMUL(v, idx); \ + return v##OP##_vv_##CHAR##SEW##LMUL(v, idx); \ } HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather) @@ -1216,7 +1302,6 @@ HWY_API V OffsetsOf128BitBlocks(const D using T = MakeUnsigned>; return detail::And(iota0, static_cast(~(LanesPerBlock(d) - 1))); } - } // namespace detail template @@ -1244,9 +1329,9 @@ HWY_API V Broadcast(const V v) { // ------------------------------ GetLane -#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return v##OP##_s_##CHAR##SEW##m##LMUL##_##CHAR##SEW(v); \ +#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); \ } HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x) @@ -1255,11 +1340,12 @@ HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetL // ------------------------------ ShiftLeftLanes -// vector = f(vector, size_t) -#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t lanes) { \ - return v##OP##_vx_##CHAR##SEW##m##LMUL(v, v, lanes); \ +// vector = f(vector, vector, size_t) +#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \ + size_t lanes) { \ + return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes); \ } namespace detail { @@ -1270,7 +1356,7 @@ template HWY_API V ShiftLeftLanes(const V v) { using D = DFromV; const RebindToSigned di; - const auto shifted = detail::SlideUp(v, kLanes); + const auto shifted = detail::SlideUp(v, v, kLanes); // Match x86 semantics by zeroing lower lanes in 
128-bit blocks constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di); const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1); @@ -1300,7 +1386,7 @@ template HWY_API V ShiftRightLanes(const V v) { using D = DFromV; const RebindToSigned di; - const auto shifted = detail::SlideDown(v, kLanes); + const auto shifted = detail::SlideDown(v, v, kLanes); // Match x86 semantics by zeroing upper lanes in 128-bit blocks constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di); const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1); @@ -1342,7 +1428,7 @@ HWY_API V ConcatUpperLower(const V hi, c template HWY_API V ConcatLowerLower(const V hi, const V lo) { // Move lower half into upper - const auto hi_up = detail::SlideUp(hi, Lanes(DFromV()) / 2); + const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV()) / 2); return ConcatUpperLower(hi_up, lo); } @@ -1351,7 +1437,7 @@ HWY_API V ConcatLowerLower(const V hi, c template HWY_API V ConcatUpperUpper(const V hi, const V lo) { // Move upper half into lower - const auto lo_down = detail::SlideDown(lo, Lanes(DFromV()) / 2); + const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV()) / 2); return ConcatUpperLower(hi, lo_down); } @@ -1360,8 +1446,8 @@ HWY_API V ConcatUpperUpper(const V hi, c template HWY_API V ConcatLowerUpper(const V hi, const V lo) { // Move half of both inputs to the other half - const auto hi_up = detail::SlideUp(hi, Lanes(DFromV()) / 2); - const auto lo_down = detail::SlideDown(lo, Lanes(DFromV()) / 2); + const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV()) / 2); + const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV()) / 2); return ConcatUpperLower(hi_up, lo_down); } @@ -1428,61 +1514,55 @@ HWY_API V Combine(const V a, const V b) // ================================================== REDUCE // vector = f(vector, zero_m1) -#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, 1) v0) { \ - vsetvlmax_e##SEW##m##LMUL(); \ - return Set(HWY_RVV_D(CHAR, SEW, LMUL)(), \ - GetLane(v##OP##_vs_##CHAR##SEW##m##LMUL##_##CHAR##SEW##m1( \ - v0, v, v0))); \ +#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \ + vsetvlmax_e##SEW##LMUL(); \ + return Set( \ + HWY_RVV_D(CHAR, SEW, LMUL)(), \ + GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(v0, v, v0))); \ } // ------------------------------ SumOfLanes namespace detail { - HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum) HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredsum) - } // namespace detail template HWY_API V SumOfLanes(const V v) { using T = TFromV; - const auto v0 = Zero(Simd()); // always m1 + const auto v0 = Zero(Full()); // always m1 return detail::RedSum(v, v0); } // ------------------------------ MinOfLanes namespace detail { - HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu) HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin) HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin) - } // namespace detail template HWY_API V MinOfLanes(const V v) { using T = TFromV; - const Simd d1; // always m1 + const Full d1; // always m1 const auto neutral = Set(d1, HighestValue()); return detail::RedMin(v, neutral); } // ------------------------------ MaxOfLanes namespace detail { - HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu) HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax) HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, 
RedMax, fredmax) - } // namespace detail template HWY_API V MaxOfLanes(const V v) { using T = TFromV; - const Simd d1; // always m1 + const Full d1; // always m1 const auto neutral = Set(d1, LowestValue()); return detail::RedMax(v, neutral); } @@ -1507,7 +1587,7 @@ HWY_API VFromD LoadDup128(D d, const #define HWY_RVV_STORE_MASK_BITS(MLEN, NAME, OP) \ HWY_API size_t StoreMaskBits(HWY_RVV_M(MLEN) m, uint8_t* p) { \ /* LMUL=1 is always enough */ \ - Simd d8; \ + Full d8; \ const size_t num_bytes = (Lanes(d8) + MLEN - 1) / MLEN; \ /* TODO(janwas): how to convert vbool* to vuint?*/ \ /*Store(m, d8, p);*/ \ @@ -1518,6 +1598,22 @@ HWY_API VFromD LoadDup128(D d, const HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, _, _) #undef HWY_RVV_STORE_MASK_BITS +// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp) + +// Disallow for 8-bit because Iota is likely to overflow. +template +HWY_API MFromD FirstN(const D d, const size_t n) { + const RebindToSigned di; + return RebindMask(d, Lt(BitCast(di, detail::Iota0(d)), Set(di, n))); +} + +template +HWY_API MFromD FirstN(const D d, const size_t n) { + const auto zero = Zero(d); + const auto one = Set(d, 1); + return Eq(detail::SlideUp(one, zero, n), one); +} + // ------------------------------ Neg template @@ -1526,9 +1622,9 @@ HWY_API V Neg(const V v) { } // vector = f(vector), but argument is repeated -#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return v##OP##_vv_##CHAR##SEW##m##LMUL(v, v); \ + return v##OP##_vv_##CHAR##SEW##LMUL(v, v); \ } HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn) @@ -1565,7 +1661,6 @@ template HWY_API auto UseInt(const V v) -> decltype(MaskFromVec(v)) { return Lt(Abs(v), Set(DFromV(), MantissaEnd>())); } - } // namespace detail template @@ -1636,10 +1731,8 @@ HWY_API VFromD Iota(const D d, TFromD // Using vwmul does not work for m8, so use mulh instead. Highway only provides // MulHigh for 16-bit, so use a private wrapper. namespace detail { - HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu) HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh) - } // namespace detail template @@ -1649,7 +1742,7 @@ HWY_API VFromD> dw; - return BitCast(dw, OddEven(detail::SlideUp(hi, 1), lo)); + return BitCast(dw, OddEven(detail::SlideUp(hi, hi, 1), lo)); } // ================================================== END MACROS diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/rvv-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h.12 2021-06-02 10:56:05.237904402 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -19,7 +19,6 @@ #include #include // std::min -#include #include "hwy/base.h" #include "hwy/ops/shared-inl.h" @@ -199,7 +198,7 @@ HWY_API Vec1 BroadcastSignBit(const V template HWY_API Mask1 RebindMask(Sisd /*tag*/, Mask1 m) { static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); - return Mask1(m.raw); + return Mask1{m.bits}; } // v must be 0 or FF..FF. 
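The FirstN note in the RVV section above ("Disallow for 8-bit because Iota is likely to overflow") can be seen with plain integers: once a vector holds more than 256 byte lanes, an 8-bit lane index wraps around and the comparison against n goes wrong. A standalone sketch with hypothetical lane counts:

#include <cstdint>
#include <cstdio>

int main() {
  const int lanes = 512;  // e.g. VLEN=512 bits at LMUL=8 gives 512 byte lanes
  const int n = 300;      // want the first 300 lanes active
  const uint8_t n8 = static_cast<uint8_t>(n);  // what an 8-bit Set(d, n) holds
  int correct = 0, via_u8 = 0;
  for (int i = 0; i < lanes; ++i) {
    const uint8_t iota8 = static_cast<uint8_t>(i);  // 8-bit Iota wraps at 256
    correct += (i < n);
    via_u8 += (iota8 < n8);
  }
  std::printf("expected %d active lanes, 8-bit Iota/compare gives %d\n",
              correct, via_u8);
  return 0;
}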
@@ -224,6 +223,11 @@ Vec1 VecFromMask(Sisd /* tag */, c return v; } +template +HWY_INLINE Mask1 FirstN(Sisd /*tag*/, size_t n) { + return Mask1::FromBool(n != 0); +} + // Returns mask ? yes : no. template HWY_INLINE Vec1 IfThenElse(const Mask1 mask, const Vec1 yes, @@ -357,9 +361,9 @@ HWY_INLINE Vec1 operator>>(const Vec1 template HWY_INLINE Vec1 operator+(Vec1 a, Vec1 b) { - const uint64_t a64 = static_cast(a.raw); - const uint64_t b64 = static_cast(b.raw); - return Vec1((a64 + b64) & ~T(0)); + const uint64_t a64 = static_cast(a.raw); + const uint64_t b64 = static_cast(b.raw); + return Vec1(static_cast((a64 + b64) & static_cast(~T(0)))); } HWY_INLINE Vec1 operator+(const Vec1 a, const Vec1 b) { return Vec1(a.raw + b.raw); @@ -370,9 +374,9 @@ HWY_INLINE Vec1 operator+(const template HWY_INLINE Vec1 operator-(Vec1 a, Vec1 b) { - const uint64_t a64 = static_cast(a.raw); - const uint64_t b64 = static_cast(b.raw); - return Vec1((a64 - b64) & ~T(0)); + const uint64_t a64 = static_cast(a.raw); + const uint64_t b64 = static_cast(b.raw); + return Vec1(static_cast((a64 - b64) & static_cast(~T(0)))); } HWY_INLINE Vec1 operator-(const Vec1 a, const Vec1 b) { return Vec1(a.raw - b.raw); @@ -388,21 +392,25 @@ HWY_INLINE Vec1 operator-(const // Unsigned HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)); + return Vec1( + static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); } HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)); + return Vec1( + static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535))); } // Signed HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)); + return Vec1( + static_cast(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); } HWY_INLINE Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)); + return Vec1( + static_cast(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767))); } // ------------------------------ Saturating subtraction @@ -412,21 +420,25 @@ HWY_INLINE Vec1 SaturatedAdd(co // Unsigned HWY_INLINE Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)); + return Vec1( + static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); } HWY_INLINE Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)); + return Vec1( + static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535))); } // Signed HWY_INLINE Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)); + return Vec1( + static_cast(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); } HWY_INLINE Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)); + return Vec1( + static_cast(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767))); } // ------------------------------ Average @@ -435,11 +447,11 @@ HWY_INLINE Vec1 SaturatedSub(co HWY_INLINE Vec1 AverageRound(const Vec1 a, const Vec1 b) { - return Vec1((a.raw + b.raw + 1) / 2); + return Vec1(static_cast((a.raw + b.raw + 1) / 2)); } HWY_INLINE Vec1 AverageRound(const Vec1 a, const Vec1 b) { - return Vec1((a.raw + b.raw + 1) / 2); + return Vec1(static_cast((a.raw + b.raw + 1) / 2)); } // ------------------------------ Absolute value @@ -514,15 +526,15 @@ HWY_INLINE Vec1 operator/(const Vec1< // Returns the upper 16 bits of a * b in each lane. 
HWY_INLINE Vec1 MulHigh(const Vec1 a, const Vec1 b) { - return Vec1((a.raw * b.raw) >> 16); + return Vec1(static_cast((a.raw * b.raw) >> 16)); } HWY_INLINE Vec1 MulHigh(const Vec1 a, const Vec1 b) { // Cast to uint32_t first to prevent overflow. Otherwise the result of // uint16_t * uint16_t is in "int" which may overflow. In practice the result // is the same but this way it is also defined. - return Vec1( - (static_cast(a.raw) * static_cast(b.raw)) >> 16); + return Vec1(static_cast( + (static_cast(a.raw) * static_cast(b.raw)) >> 16)); } // Multiplies even lanes (0, 2 ..) and returns the double-wide result. @@ -617,6 +629,31 @@ HWY_INLINE Vec1 Round(const Vec1 v return Vec1(static_cast(rounded)); } +// Round-to-nearest even. +HWY_INLINE Vec1 NearestInt(const Vec1 v) { + using T = float; + using TI = int32_t; + + const T abs = Abs(v).raw; + const bool signbit = std::signbit(v.raw); + + if (!(abs < MantissaEnd())) { // Huge or NaN + // Check if too large to cast or NaN + if (!(abs <= static_cast(LimitsMax()))) { + return Vec1(signbit ? LimitsMin() : LimitsMax()); + } + return Vec1(static_cast(v.raw)); + } + const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5); + const TI rounded = static_cast(v.raw + bias); + if (rounded == 0) return Vec1(0); + // Round to even + if ((rounded & 1) && std::abs(static_cast(rounded) - v.raw) == T(0.5)) { + return Vec1(rounded - (signbit ? -1 : 1)); + } + return Vec1(rounded); +} + template HWY_INLINE Vec1 Trunc(const Vec1 v) { using TI = MakeSigned; @@ -641,7 +678,8 @@ V Ceiling(const V v) { Bits bits; CopyBytes(&v, &bits); - const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias; + const int exponent = + static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); // Already an integer. if (exponent >= kMantissaBits) return v; // |v| <= 1 => 0 or 1. @@ -672,7 +710,8 @@ V Floor(const V v) { Bits bits; CopyBytes(&v, &bits); - const int exponent = ((bits >> kMantissaBits) & kExponentMask) - kBias; + const int exponent = + static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); // Already an integer. if (exponent >= kMantissaBits) return v; // |v| <= 1 => -1 or 0. 
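A standalone check of the tie-to-even fix-up used by the new scalar NearestInt above: add a +/-0.5 bias, truncate, then step exact .5 ties that landed on an odd integer back toward the even neighbor. This sketch covers small-magnitude inputs only; the patched function additionally handles huge values and NaN.

#include <cmath>
#include <cstdint>
#include <cstdio>

static int32_t RoundTiesToEven(float v) {
  const bool neg = std::signbit(v);
  const float bias = neg ? -0.5f : 0.5f;
  int32_t rounded = static_cast<int32_t>(v + bias);
  if (rounded == 0) return 0;
  // Exact .5 ties that truncated onto an odd integer: move toward even.
  if ((rounded & 1) && std::fabs(static_cast<float>(rounded) - v) == 0.5f) {
    rounded -= neg ? -1 : 1;
  }
  return rounded;
}

int main() {
  const float inputs[] = {2.5f, 3.5f, -2.5f, 0.5f, 1.25f};
  for (float x : inputs) {
    std::printf("%5.2f -> %d (nearbyint: %.0f)\n", x,
                static_cast<int>(RoundTiesToEven(x)), std::nearbyint(x));
  }
  return 0;
}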
@@ -772,6 +811,26 @@ HWY_INLINE void StoreU(const Vec1 v, return Store(v, d, p); } +// ------------------------------ StoreInterleaved3 + +HWY_API void StoreInterleaved3(const Vec1 v0, const Vec1 v1, + const Vec1 v2, Sisd d, + uint8_t* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 1); + StoreU(v2, d, unaligned + 2); +} + +HWY_API void StoreInterleaved4(const Vec1 v0, const Vec1 v1, + const Vec1 v2, const Vec1 v3, + Sisd d, + uint8_t* HWY_RESTRICT unaligned) { + StoreU(v0, d, unaligned + 0); + StoreU(v1, d, unaligned + 1); + StoreU(v2, d, unaligned + 2); + StoreU(v3, d, unaligned + 3); +} + // ------------------------------ Stream template @@ -779,12 +838,29 @@ HWY_INLINE void Stream(const Vec1 v, return Store(v, d, aligned); } +// ------------------------------ Scatter + +template +HWY_INLINE void ScatterOffset(Vec1 v, Sisd d, T* base, + const Vec1 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + uint8_t* const base8 = reinterpret_cast(base) + offset.raw; + return Store(v, d, reinterpret_cast(base8)); +} + +template +HWY_INLINE void ScatterIndex(Vec1 v, Sisd d, T* HWY_RESTRICT base, + const Vec1 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return Store(v, d, base + index.raw); +} + // ------------------------------ Gather template HWY_INLINE Vec1 GatherOffset(Sisd d, const T* base, const Vec1 offset) { - static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs"); + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); const uintptr_t addr = reinterpret_cast(base) + offset.raw; return Load(d, reinterpret_cast(addr)); } @@ -792,7 +868,7 @@ HWY_INLINE Vec1 GatherOffset(Sisd template HWY_INLINE Vec1 GatherIndex(Sisd d, const T* HWY_RESTRICT base, const Vec1 index) { - static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx"); + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); return Load(d, base + index.raw); } @@ -833,15 +909,20 @@ HWY_INLINE Vec1 DemoteTo(Sisd static HWY_INLINE Vec1 PromoteTo(Sisd /* tag */, const Vec1 v) { +#if HWY_NATIVE_FLOAT16 uint16_t bits16; CopyBytes<2>(&v.raw, &bits16); +#else + const uint16_t bits16 = v.raw.bits; +#endif const uint32_t sign = bits16 >> 15; const uint32_t biased_exp = (bits16 >> 10) & 0x1F; const uint32_t mantissa = bits16 & 0x3FF; // Subnormal or zero if (biased_exp == 0) { - const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024)); + const float subnormal = + (1.0f / 16384) * (static_cast(mantissa) * (1.0f / 1024)); return Vec1(sign ? -subnormal : subnormal); } @@ -867,8 +948,12 @@ static HWY_INLINE Vec1 Demote // Tiny or zero => zero. Vec1 out; if (exp < -24) { - bits32 = 0; - CopyBytes<2>(&bits32, &out); +#if HWY_NATIVE_FLOAT16 + const uint16_t zero = 0; + CopyBytes<2>(&zero, &out.raw); +#else + out.raw.bits = 0; +#endif return out; } @@ -890,7 +975,12 @@ static HWY_INLINE Vec1 Demote HWY_DASSERT(mantissa16 < 1024); const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; HWY_DASSERT(bits16 < 0x10000); - CopyBytes<2>(&bits16, &out); +#if HWY_NATIVE_FLOAT16 + const uint16_t narrowed = static_cast(bits16); // big-endian safe + CopyBytes<2>(&narrowed, &out.raw); +#else + out.raw.bits = static_cast(bits16); +#endif return out; } @@ -919,18 +1009,6 @@ HWY_INLINE Vec1 U8FromU32(const return DemoteTo(Sisd(), v); } -// Approximation of round-to-nearest for numbers representable as int32_t. 
-HWY_INLINE Vec1 NearestInt(const Vec1 v) { - const float f = v.raw; - if (std::isinf(f) || - std::fabs(f) > static_cast(LimitsMax())) { - return Vec1(std::signbit(f) ? LimitsMin() - : LimitsMax()); - } - const float bias = f < 0.0f ? -0.5f : 0.5f; - return Vec1(static_cast(f + bias)); -} - // ================================================== SWIZZLE // Unsupported: Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle*, diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/scalar-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h.12 2021-06-02 10:56:05.224904336 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -31,11 +31,6 @@ #undef HWY_ALIGN #undef HWY_LANES -#undef HWY_GATHER_LANES -#undef HWY_VARIABLE_SHIFT_LANES -#undef HWY_COMPARE64_LANES -#undef HWY_MINMAX64_LANES - #undef HWY_CAP_INTEGER64 #undef HWY_CAP_FLOAT64 #undef HWY_CAP_GE256 @@ -53,11 +48,6 @@ #define HWY_ALIGN alignas(16) #define HWY_LANES(T) (16 / sizeof(T)) -#define HWY_GATHER_LANES(T) 1 -#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) -#define HWY_COMPARE64_LANES 2 -#define HWY_MINMAX64_LANES 1 - #define HWY_CAP_INTEGER64 1 #define HWY_CAP_FLOAT64 1 #define HWY_CAP_GE256 0 @@ -73,11 +63,6 @@ #define HWY_ALIGN alignas(32) #define HWY_LANES(T) (32 / sizeof(T)) -#define HWY_GATHER_LANES(T) HWY_LANES(T) -#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) -#define HWY_COMPARE64_LANES 4 -#define HWY_MINMAX64_LANES 1 - #define HWY_CAP_INTEGER64 1 #define HWY_CAP_FLOAT64 1 #define HWY_CAP_GE256 1 @@ -96,11 +81,6 @@ #define HWY_ALIGN alignas(64) #define HWY_LANES(T) (64 / sizeof(T)) -#define HWY_GATHER_LANES(T) HWY_LANES(T) -#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) -#define HWY_COMPARE64_LANES 8 -#define HWY_MINMAX64_LANES 8 - #define HWY_CAP_INTEGER64 1 #define HWY_CAP_FLOAT64 1 #define HWY_CAP_GE256 1 @@ -121,11 +101,6 @@ #define HWY_ALIGN alignas(16) #define HWY_LANES(T) (16 / sizeof(T)) -#define HWY_GATHER_LANES(T) 1 -#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) -#define HWY_COMPARE64_LANES 2 -#define HWY_MINMAX64_LANES 2 - #define HWY_CAP_INTEGER64 1 #define HWY_CAP_FLOAT64 1 #define HWY_CAP_GE256 0 @@ -142,19 +117,14 @@ #define HWY_ALIGN alignas(16) #define HWY_LANES(T) (16 / sizeof(T)) -#define HWY_GATHER_LANES(T) 1 -#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) -#define HWY_MINMAX64_LANES 2 -#define HWY_COMPARE64_LANES 2 - #define HWY_CAP_INTEGER64 1 #define HWY_CAP_GE256 0 #define HWY_CAP_GE512 0 -#ifdef __arm__ -#define HWY_CAP_FLOAT64 0 -#else +#if HWY_ARCH_ARM_A64 #define HWY_CAP_FLOAT64 1 +#else +#define HWY_CAP_FLOAT64 0 #endif #define HWY_NAMESPACE N_NEON @@ -162,17 +132,34 @@ // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. //----------------------------------------------------------------------------- +// SVE[2] +#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE + +// SVE only requires lane alignment, not natural alignment of the entire vector. +#define HWY_ALIGN alignas(8) +// Upper bound, not the actual lane count! 
+#define HWY_LANES(T) (256 / sizeof(T)) + +#define HWY_CAP_INTEGER64 1 +#define HWY_CAP_FLOAT64 1 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#if HWY_TARGET == HWY_SVE2 +#define HWY_NAMESPACE N_SVE2 +#else +#define HWY_NAMESPACE N_SVE +#endif + +// HWY_TARGET_STR remains undefined - TODO(janwas): attribute for SVE? + +//----------------------------------------------------------------------------- // WASM #elif HWY_TARGET == HWY_WASM #define HWY_ALIGN alignas(16) #define HWY_LANES(T) (16 / sizeof(T)) -#define HWY_GATHER_LANES(T) 1 -#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) -#define HWY_COMPARE64_LANES 2 -#define HWY_MINMAX64_LANES 2 - #define HWY_CAP_INTEGER64 0 #define HWY_CAP_FLOAT64 0 #define HWY_CAP_GE256 0 @@ -194,11 +181,6 @@ // mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h. #define HWY_LANES(T) (4096 / sizeof(T)) -#define HWY_GATHER_LANES(T) HWY_LANES(T) -#define HWY_VARIABLE_SHIFT_LANES(T) HWY_LANES(T) -// Cannot use HWY_LANES/sizeof here because these are used in an #if. -#define HWY_COMPARE64_LANES 256 -#define HWY_MINMAX64_LANES 256 #define HWY_CAP_INTEGER64 1 #define HWY_CAP_FLOAT64 1 @@ -215,13 +197,9 @@ #elif HWY_TARGET == HWY_SCALAR #define HWY_ALIGN +// For internal use only; use Lanes(d) instead. #define HWY_LANES(T) 1 -#define HWY_GATHER_LANES(T) 1 -#define HWY_VARIABLE_SHIFT_LANES(T) 1 -#define HWY_COMPARE64_LANES 1 -#define HWY_MINMAX64_LANES 1 - #define HWY_CAP_INTEGER64 1 #define HWY_CAP_FLOAT64 1 #define HWY_CAP_GE256 0 @@ -265,3 +243,7 @@ #else #define HWY_ATTR #endif + +// DEPRECATED +#undef HWY_GATHER_LANES +#define HWY_GATHER_LANES(T) HWY_LANES(T) diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/set_macros-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h.12 2021-06-02 10:56:05.235904392 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -14,6 +14,8 @@ // Per-target definitions shared by ops/*.h and user code. +#include + // Separate header because foreach_target.h re-enables its include guard. #include "hwy/ops/set_macros-inl.h" @@ -106,7 +108,7 @@ HWY_INLINE HWY_MAYBE_UNUSED constexpr si } // Targets with non-constexpr Lanes define this themselves. -#if HWY_TARGET != HWY_RVV +#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE // (Potentially) non-constant actual size of the vector at runtime, subject to // the limit imposed by the Simd. Useful for advancing loop counters. 
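Because Lanes(d) is a runtime value on RVV (and now SVE), loops advance by it rather than by a compile-time lane count. A minimal strip-mining sketch of that usage follows; Lanes() below is a stand-in returning a fixed number, not the real Highway call, and the inner loop stands in for a vector Load/op/Store.

#include <cstddef>
#include <cstdio>
#include <vector>

static size_t Lanes() { return 8; }  // stand-in for the runtime Lanes(d)

int main() {
  std::vector<float> in(100, 1.0f), out(100);
  size_t i = 0;
  for (; i + Lanes() <= in.size(); i += Lanes()) {
    // Full vectors: process Lanes() elements per iteration.
    for (size_t j = 0; j < Lanes(); ++j) out[i + j] = in[i + j] * 2.0f;
  }
  for (; i < in.size(); ++i) out[i] = in[i] * 2.0f;  // scalar remainder
  std::printf("out[99] = %.1f\n", out[99]);
  return 0;
}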
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/shared-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h.12 2021-06-02 10:56:05.242904427 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -19,8 +19,6 @@ #include #include -#include - #include "hwy/base.h" #include "hwy/ops/shared-inl.h" @@ -177,6 +175,16 @@ HWY_API Vec128 Undefined(Simd +Vec128 Iota(const Simd d, const T2 first) { + HWY_ALIGN T lanes[16 / sizeof(T)]; + for (size_t i = 0; i < 16 / sizeof(T); ++i) { + lanes[i] = static_cast(first + static_cast(i)); + } + return Load(d, lanes); +} + // ================================================== ARITHMETIC // ------------------------------ Addition @@ -273,24 +281,24 @@ HWY_API Vec128 operator-(const template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { - return Vec128{wasm_u8x16_add_saturate(a.raw, b.raw)}; + return Vec128{wasm_u8x16_add_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { - return Vec128{wasm_u16x8_add_saturate(a.raw, b.raw)}; + return Vec128{wasm_u16x8_add_sat(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { - return Vec128{wasm_i8x16_add_saturate(a.raw, b.raw)}; + return Vec128{wasm_i8x16_add_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { - return Vec128{wasm_i16x8_add_saturate(a.raw, b.raw)}; + return Vec128{wasm_i16x8_add_sat(a.raw, b.raw)}; } // ------------------------------ Saturating subtraction @@ -301,24 +309,24 @@ HWY_API Vec128 SaturatedAdd( template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { - return Vec128{wasm_u8x16_sub_saturate(a.raw, b.raw)}; + return Vec128{wasm_u8x16_sub_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { - return Vec128{wasm_u16x8_sub_saturate(a.raw, b.raw)}; + return Vec128{wasm_u16x8_sub_sat(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { - return Vec128{wasm_i8x16_sub_saturate(a.raw, b.raw)}; + return Vec128{wasm_i8x16_sub_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { - return Vec128{wasm_i16x8_sub_saturate(a.raw, b.raw)}; + return Vec128{wasm_i16x8_sub_sat(a.raw, b.raw)}; } // ------------------------------ Average @@ -352,6 +360,12 @@ template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i32x4_abs(v.raw)}; } +template +HWY_API Vec128 Abs(const Vec128 v) { + // TODO(janwas): use wasm_i64x2_abs when available + const Vec128 mask = wasm_i64x2_shr(v.raw, 63); + return ((v ^ mask) - mask); +} template HWY_API Vec128 Abs(const Vec128 v) { @@ -396,9 +410,38 @@ HWY_API Vec128 ShiftRight(co return Vec128{wasm_i32x4_shr(v.raw, kBits)}; } +// 8-bit +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + const Simd d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); +} + +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + const Simd d8; + // Use raw instead of BitCast to support N=1. 
+ const Vec128 shifted{ + ShiftRight(Vec128{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + const Simd di; + const Simd du; + const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + // ------------------------------ Shift lanes by same variable #bits -// Unsigned (no u8) +// Unsigned template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { @@ -420,7 +463,7 @@ HWY_API Vec128 ShiftRightSa return Vec128{wasm_u32x4_shr(v.raw, bits)}; } -// Signed (no i8) +// Signed template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { @@ -442,6 +485,35 @@ HWY_API Vec128 ShiftRightSam return Vec128{wasm_i32x4_shr(v.raw, bits)}; } +// 8-bit +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { + const Simd d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ + ShiftLeftSame(Vec128>{v.raw}, bits).raw}; + return shifted & Set(d8, (0xFF << bits) & 0xFF); +} + +template +HWY_API Vec128 ShiftRightSame(Vec128 v, + const int bits) { + const Simd d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ + ShiftRightSame(Vec128{v.raw}, bits).raw}; + return shifted & Set(d8, 0xFF >> bits); +} + +template +HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { + const Simd di; + const Simd du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + // ------------------------------ Minimum // Unsigned @@ -607,29 +679,29 @@ template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { // TODO(eustas): replace, when implemented in WASM. - const auto al = wasm_i32x4_widen_low_u16x8(a.raw); - const auto ah = wasm_i32x4_widen_high_u16x8(a.raw); - const auto bl = wasm_i32x4_widen_low_u16x8(b.raw); - const auto bh = wasm_i32x4_widen_high_u16x8(b.raw); + const auto al = wasm_u32x4_extend_low_u16x8(a.raw); + const auto ah = wasm_u32x4_extend_high_u16x8(a.raw); + const auto bl = wasm_u32x4_extend_low_u16x8(b.raw); + const auto bh = wasm_u32x4_extend_high_u16x8(b.raw); const auto l = wasm_i32x4_mul(al, bl); const auto h = wasm_i32x4_mul(ah, bh); // TODO(eustas): shift-right + narrow? return Vec128{ - wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; + wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { // TODO(eustas): replace, when implemented in WASM. - const auto al = wasm_i32x4_widen_low_i16x8(a.raw); - const auto ah = wasm_i32x4_widen_high_i16x8(a.raw); - const auto bl = wasm_i32x4_widen_low_i16x8(b.raw); - const auto bh = wasm_i32x4_widen_high_i16x8(b.raw); + const auto al = wasm_i32x4_extend_low_i16x8(a.raw); + const auto ah = wasm_i32x4_extend_high_i16x8(a.raw); + const auto bl = wasm_i32x4_extend_low_i16x8(b.raw); + const auto bh = wasm_i32x4_extend_high_i16x8(b.raw); const auto l = wasm_i32x4_mul(al, bl); const auto h = wasm_i32x4_mul(ah, bh); // TODO(eustas): shift-right + narrow? return Vec128{ - wasm_v16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; + wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } // Multiplies even lanes (0, 2 ..) and returns the double-width result. 
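The emulated 8-bit arithmetic shift right added for WASM above relies on a logical (unsigned) shift followed by an XOR/subtract of the shifted sign bit. A standalone scalar check of that trick against the compiler's ordinary signed shift:

#include <cstdint>
#include <cstdio>

static int8_t ArithShiftRight8(int8_t x, int bits) {
  const uint8_t logical = static_cast<uint8_t>(static_cast<uint8_t>(x) >> bits);
  const uint8_t sign = static_cast<uint8_t>(0x80u >> bits);  // shifted sign bit
  // XOR then subtract restores the sign extension lost by the logical shift.
  return static_cast<int8_t>(static_cast<uint8_t>((logical ^ sign) - sign));
}

int main() {
  for (int8_t x : {int8_t(-128), int8_t(-1), int8_t(100)}) {
    for (int bits : {1, 3, 7}) {
      std::printf("%4d >> %d : trick=%4d  reference=%4d\n", x, bits,
                  ArithShiftRight8(x, bits), static_cast<int>(x >> bits));
    }
  }
  return 0;
}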
@@ -765,53 +837,76 @@ HWY_API Vec128 ApproximateReci // Toward nearest integer, ties to even template HWY_API Vec128 Round(const Vec128 v) { - // TODO(eustas): is it f32x4.nearest? (not implemented yet) - alignas(16) float input[4]; - alignas(16) float output[4]; - wasm_v128_store(input, v.raw); - for (size_t i = 0; i < 4; ++i) { - output[i] = std::nearbyint(input[i]); - } - return Vec128{wasm_v128_load(output)}; + // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not + // yet have an instruction for that (f32x4.nearest is not implemented). We + // rely on rounding after addition with a large value such that no mantissa + // bits remain (assuming the current mode is nearest-even). We may need a + // compiler flag for precise floating-point to prevent "optimizing" this out. + const Simd df; + const auto max = Set(df, MantissaEnd()); + const auto large = CopySignToAbs(max, v); + const auto added = large + v; + const auto rounded = added - large; + + // Keep original if NaN or the magnitude is large (already an int). + return IfThenElse(Abs(v) < max, rounded, v); } +namespace detail { + +// Truncating to integer and converting back to float is correct except when the +// input magnitude is large, in which case the input was already an integer +// (because mantissa >> exponent is zero). +template +HWY_API Mask128 UseInt(const Vec128 v) { + return Abs(v) < Set(Simd(), MantissaEnd()); +} + +} // namespace detail + // Toward zero, aka truncate template HWY_API Vec128 Trunc(const Vec128 v) { // TODO(eustas): is it f32x4.trunc? (not implemented yet) - alignas(16) float input[4]; - alignas(16) float output[4]; - wasm_v128_store(input, v.raw); - for (size_t i = 0; i < 4; ++i) { - output[i] = std::trunc(input[i]); - } - return Vec128{wasm_v128_load(output)}; + const Simd df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); } // Toward +infinity, aka ceiling template -HWY_API Vec128 Ceil(const Vec128 v) { +HWY_INLINE Vec128 Ceil(const Vec128 v) { // TODO(eustas): is it f32x4.ceil? (not implemented yet) - alignas(16) float input[4]; - alignas(16) float output[4]; - wasm_v128_store(input, v.raw); - for (size_t i = 0; i < 4; ++i) { - output[i] = std::ceil(input[i]); - } - return Vec128{wasm_v128_load(output)}; + const Simd df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a positive non-integer ends up smaller; if so, add 1. + const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); + + return IfThenElse(detail::UseInt(v), int_f - neg1, v); } // Toward -infinity, aka floor template -HWY_API Vec128 Floor(const Vec128 v) { +HWY_INLINE Vec128 Floor(const Vec128 v) { // TODO(eustas): is it f32x4.floor? (not implemented yet) - alignas(16) float input[4]; - alignas(16) float output[4]; - wasm_v128_store(input, v.raw); - for (size_t i = 0; i < 4; ++i) { - output[i] = std::floor(input[i]); - } - return Vec128{wasm_v128_load(output)}; + const Simd df; + const RebindToSigned di; + + const auto integer = ConvertTo(di, v); // round toward 0 + const auto int_f = ConvertTo(df, integer); + + // Truncating a negative non-integer ends up larger; if so, subtract 1. 
+ const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); + + return IfThenElse(detail::UseInt(v), int_f + neg1, v); } // ================================================== COMPARE @@ -902,12 +997,12 @@ HWY_API Mask128 operator>(co // Otherwise, the lower half decides. const auto m_eq = a32 == b32; - const auto lo_in_hi = wasm_v32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0); + const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0); const auto lo_gt = And(m_eq, lo_in_hi); const auto gt = Or(lo_gt, m_gt); // Copy result in upper 32 bits to lower 32 bits. - return Mask128{wasm_v32x4_shuffle(gt, gt, 3, 3, 1, 1)}; + return Mask128{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)}; } template @@ -935,6 +1030,14 @@ HWY_API Mask128 operator>=(con return Mask128{wasm_f32x4_ge(a.raw, b.raw)}; } +// ------------------------------ FirstN (Iota, Lt) + +template +HWY_API Mask128 FirstN(const Simd d, size_t num) { + const RebindToSigned di; // Signed comparisons may be cheaper. + return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); +} + // ================================================== LOGICAL // ------------------------------ Not @@ -1015,7 +1118,7 @@ HWY_API Vec128 BroadcastSignBit(co } template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { - return VecFromMask(v < Zero(Simd())); + return VecFromMask(Simd(), v < Zero(Simd())); } // ------------------------------ Mask @@ -1278,26 +1381,73 @@ HWY_API void Stream(Vec128 v, Simd wasm_v128_store(aligned, v.raw); } -// ------------------------------ Gather +// ------------------------------ Scatter (Store) + +template +HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Simd(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template +HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Index index_lanes[N]; + Store(index, Simd(), index_lanes); + + for (size_t i = 0; i < N; ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +// ------------------------------ Gather (Load/Store) template HWY_API Vec128 GatherOffset(const Simd d, const T* HWY_RESTRICT base, const Vec128 offset) { - static_assert(N == 1, "Wasm does not support full gather"); - static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); - const uintptr_t address = reinterpret_cast(base) + GetLane(offset); - T val; - CopyBytes(reinterpret_cast(address), &val); - return Set(d, val); + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Simd(), offset_lanes); + + alignas(16) T lanes[N]; + const uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); } template HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, const Vec128 index) { - static_assert(N == 1, "Wasm does not support full gather"); - static_assert(sizeof(T) == sizeof(Index), "T must match Index"); - return Set(d, base[GetLane(index)]); + static_assert(sizeof(T) == sizeof(Index), "Must match for 
portability"); + + alignas(16) Index index_lanes[N]; + Store(index, Simd(), index_lanes); + + alignas(16) T lanes[N]; + for (size_t i = 0; i < N; ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); } // ================================================== SWIZZLE @@ -1346,12 +1496,12 @@ HWY_API Vec128 LowerHalf(Vec12 template HWY_API Vec128 UpperHalf(Vec128 v) { // TODO(eustas): use swizzle? - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; } template <> HWY_INLINE Vec128 UpperHalf(Vec128 v) { // TODO(eustas): use swizzle? - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; } // ------------------------------ Shift vector by constant #bytes @@ -1366,64 +1516,64 @@ HWY_API Vec128 ShiftLeftBytes(const V return v; case 1: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)}; case 2: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)}; case 3: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)}; case 4: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)}; case 5: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}; case 6: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; case 7: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; case 8: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; case 9: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; case 10: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; case 11: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; case 12: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; case 13: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; case 14: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1)}; case 15: - return 
Vec128{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0)}; } @@ -1447,69 +1597,69 @@ HWY_API Vec128 ShiftRightBytes(const return v; case 1: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)}; case 2: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16)}; case 3: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16)}; case 4: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16)}; case 5: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16)}; case 6: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16)}; case 7: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16)}; case 8: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16)}; case 9: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16)}; case 10: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16)}; case 11: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16)}; case 12: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16)}; case 13: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16)}; case 14: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16)}; case 15: - return Vec128{wasm_v8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, + return Vec128{wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16)}; } @@ -1535,72 +1685,72 @@ HWY_API Vec128 CombineShiftRightBytes return lo; case 1: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)}; case 2: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17)}; case 3: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)}; case 4: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)}; case 5: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)}; case 6: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)}; case 7: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22)}; case 8: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)}; case 9: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)}; case 10: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)}; case 11: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)}; case 12: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)}; case 13: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)}; case 14: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)}; case 15: - return Vec128{wasm_v8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, + return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)}; } @@ -1613,28 +1763,28 @@ HWY_API Vec128 CombineShiftRightBytes template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{wasm_v16x8_shuffle( + return Vec128{wasm_i16x8_shuffle( v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{ - wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; + wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; } // Signed template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{wasm_v16x8_shuffle( + return Vec128{wasm_i16x8_shuffle( v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{ - wasm_v32x4_shuffle(v.raw, v.raw, 
kLane, kLane, kLane, kLane)}; + wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; } // Float @@ -1642,7 +1792,7 @@ template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{ - wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; + wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; } // ------------------------------ Shuffle bytes with variable indices @@ -1652,16 +1802,23 @@ HWY_API Vec128 Broadcast(const template HWY_API Vec128 TableLookupBytes(const Vec128 bytes, const Vec128 from) { - // TODO(eustas): use swizzle? (shuffle does not work for variable indices) +// Not yet available in all engines, see +// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md +// V8 implementation of this had a bug, fixed on 2021-04-03: +// https://chromium-review.googlesource.com/c/v8/v8/+/2822951 +#if 0 + return Vec128{wasm_i8x16_swizzle(bytes.raw, from.raw)}; +#else alignas(16) uint8_t control[16]; alignas(16) uint8_t input[16]; alignas(16) uint8_t output[16]; wasm_v128_store(control, from.raw); wasm_v128_store(input, bytes.raw); for (size_t i = 0; i < 16; ++i) { - output[i] = input[control[i]]; + output[i] = control[i] < 16 ? input[control[i]] : 0; } return Vec128{wasm_v128_load(output)}; +#endif } // ------------------------------ Hard-coded shuffles @@ -1673,101 +1830,102 @@ HWY_API Vec128 TableLookupBytes(co // Swap 32-bit halves in 64-bit halves. HWY_API Vec128 Shuffle2301(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; } HWY_API Vec128 Shuffle2301(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; } HWY_API Vec128 Shuffle2301(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; } // Swap 64-bit halves HWY_API Vec128 Shuffle1032(const Vec128 v) { - return Vec128{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)}; + return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } HWY_API Vec128 Shuffle1032(const Vec128 v) { - return Vec128{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)}; + return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } HWY_API Vec128 Shuffle1032(const Vec128 v) { - return Vec128{wasm_v64x2_shuffle(v.raw, v.raw, 1, 0)}; + return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } // Rotate right 32 bits HWY_API Vec128 Shuffle0321(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; } HWY_API Vec128 Shuffle0321(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; } HWY_API Vec128 Shuffle0321(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; } // Rotate left 32 bits HWY_API Vec128 Shuffle2103(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; } HWY_API Vec128 Shuffle2103(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; } HWY_API Vec128 Shuffle2103(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, 
v.raw, 3, 0, 1, 2)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; } // Reverse HWY_API Vec128 Shuffle0123(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; } HWY_API Vec128 Shuffle0123(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; } HWY_API Vec128 Shuffle0123(const Vec128 v) { - return Vec128{wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; + return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; } // ------------------------------ TableLookupLanes // Returned by SetTableIndices for use by TableLookupLanes. -template +template struct Indices128 { __v128_u raw; }; -template -HWY_API Indices128 SetTableIndices(Full128, const int32_t* idx) { +template +HWY_API Indices128 SetTableIndices(Simd d, const int32_t* idx) { #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) - const size_t N = 16 / sizeof(T); for (size_t i = 0; i < N; ++i) { HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); } #endif - const Full128 d8; - alignas(16) uint8_t control[16]; // = Lanes() - for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { - const size_t idx_lane = idx_byte / sizeof(T); - const size_t mod = idx_byte % sizeof(T); - control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; + const Repartition d8; + alignas(16) uint8_t control[16] = {0}; + for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + control[idx_lane * sizeof(T) + idx_byte] = + static_cast(idx[idx_lane] * sizeof(T) + idx_byte); + } } - return Indices128{Load(d8, control).raw}; + return Indices128{Load(d8, control).raw}; } -HWY_API Vec128 TableLookupLanes(const Vec128 v, - const Indices128 idx) { - return TableLookupBytes(v, Vec128{idx.raw}); +template +HWY_API Vec128 TableLookupLanes( + const Vec128 v, const Indices128 idx) { + return TableLookupBytes(v, Vec128{idx.raw}); } - -HWY_API Vec128 TableLookupLanes(const Vec128 v, - const Indices128 idx) { - return TableLookupBytes(v, Vec128{idx.raw}); +template +HWY_API Vec128 TableLookupLanes(const Vec128 v, + const Indices128 idx) { + return TableLookupBytes(v, Vec128{idx.raw}); } - -HWY_API Vec128 TableLookupLanes(const Vec128 v, - const Indices128 idx) { - const Full128 di; - const Full128 df; +template +HWY_API Vec128 TableLookupLanes(const Vec128 v, + const Indices128 idx) { + const Simd di; + const Simd df; return BitCast(df, - TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); + TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); } // ------------------------------ Zip lanes @@ -1778,33 +1936,33 @@ HWY_API Vec128 TableLookupLanes(c template HWY_API Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128{wasm_v8x16_shuffle( + return Vec128{wasm_i8x16_shuffle( a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } template HWY_API Vec128 ZipLower(const Vec128 a, const Vec128 b) { return Vec128{ - wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; + wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } template HWY_API Vec128 ZipLower(const Vec128 a, const Vec128 b) { - return Vec128{wasm_v8x16_shuffle( + return Vec128{wasm_i8x16_shuffle( a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } template HWY_API Vec128 ZipLower(const Vec128 a, const Vec128 b) { return Vec128{ - wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; + 
wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } template HWY_API Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, + return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } @@ -1812,13 +1970,13 @@ template HWY_API Vec128 ZipUpper(const Vec128 a, const Vec128 b) { return Vec128{ - wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; + wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } template HWY_API Vec128 ZipUpper(const Vec128 a, const Vec128 b) { - return Vec128{wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, + return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } @@ -1826,7 +1984,7 @@ template HWY_API Vec128 ZipUpper(const Vec128 a, const Vec128 b) { return Vec128{ - wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; + wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } // ------------------------------ Interleave lanes @@ -1842,17 +2000,17 @@ HWY_API Vec128 InterleaveLower(const template <> HWY_INLINE Vec128 InterleaveLower( const Vec128 a, const Vec128 b) { - return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template <> HWY_INLINE Vec128 InterleaveLower(const Vec128 a, const Vec128 b) { - return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template <> HWY_INLINE Vec128 InterleaveLower(const Vec128 a, const Vec128 b) { - return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template @@ -1862,17 +2020,17 @@ HWY_API Vec128 InterleaveUpper(const template <> HWY_INLINE Vec128 InterleaveUpper( const Vec128 a, const Vec128 b) { - return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template <> HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, const Vec128 b) { - return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template <> HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, const Vec128 b) { - return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } // ------------------------------ Blocks @@ -1880,13 +2038,13 @@ HWY_INLINE Vec128 InterleaveUpper // hiH,hiL loH,loL |-> hiL,loL (= lower halves) template HWY_API Vec128 ConcatLowerLower(const Vec128 hi, const Vec128 lo) { - return Vec128{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 2)}; + return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; } // hiH,hiL loH,loL |-> hiH,loH (= upper halves) template HWY_API Vec128 ConcatUpperUpper(const Vec128 hi, const Vec128 lo) { - return Vec128{wasm_v64x2_shuffle(lo.raw, hi.raw, 1, 3)}; + return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; } // hiH,hiL loH,loL |-> hiL,loH (= inner halves) @@ -1898,7 +2056,7 @@ HWY_API Vec128 ConcatLowerUpper(const // hiH,hiL loH,loL |-> hiH,loL (= outer halves) template HWY_API Vec128 ConcatUpperLower(const Vec128 hi, const Vec128 lo) { - return Vec128{wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 3)}; + return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 3)}; } // ------------------------------ Odd/even lanes @@ -1917,12 +2075,12 @@ HWY_API Vec128 odd_even_impl(hwy::Siz template HWY_API Vec128 
odd_even_impl(hwy::SizeTag<2> /* tag */, const Vec128 a, const Vec128 b) { - return Vec128{wasm_v16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; + return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; } template HWY_API Vec128 odd_even_impl(hwy::SizeTag<4> /* tag */, const Vec128 a, const Vec128 b) { - return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } // TODO(eustas): implement // template @@ -1939,7 +2097,7 @@ HWY_API Vec128 OddEven(const Vec128 HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { - return Vec128{wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; + return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } // ================================================== CONVERT @@ -1950,52 +2108,52 @@ HWY_INLINE Vec128 OddEven( template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { - return Vec128{wasm_i16x8_widen_low_u8x16(v.raw)}; + return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{ - wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))}; + wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { - return Vec128{wasm_i16x8_widen_low_u8x16(v.raw)}; + return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{ - wasm_i32x4_widen_low_u16x8(wasm_i16x8_widen_low_u8x16(v.raw))}; + wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { - return Vec128{wasm_i32x4_widen_low_u16x8(v.raw)}; + return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { - return Vec128{wasm_i32x4_widen_low_u16x8(v.raw)}; + return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; } // Signed: replicate sign bit. template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { - return Vec128{wasm_i16x8_widen_low_i8x16(v.raw)}; + return Vec128{wasm_i16x8_extend_low_i8x16(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{ - wasm_i32x4_widen_low_i16x8(wasm_i16x8_widen_low_i8x16(v.raw))}; + wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { - return Vec128{wasm_i32x4_widen_low_i16x8(v.raw)}; + return Vec128{wasm_i32x4_extend_low_i16x8(v.raw)}; } template @@ -2122,7 +2280,7 @@ HWY_API Vec128 U8FromU32(con wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } -// ------------------------------ Convert i32 <=> f32 +// ------------------------------ Convert i32 <=> f32 (Round) template HWY_API Vec128 ConvertTo(Simd /* tag */, @@ -2133,33 +2291,16 @@ HWY_API Vec128 ConvertTo(Simd< template HWY_API Vec128 ConvertTo(Simd /* tag */, const Vec128 v) { - return Vec128{wasm_i32x4_trunc_saturate_f32x4(v.raw)}; + return Vec128{wasm_i32x4_trunc_sat_f32x4(v.raw)}; } template HWY_API Vec128 NearestInt(const Vec128 v) { - const __f32x4 c00 = wasm_f32x4_splat(0.0f); - const __f32x4 corr = wasm_f32x4_convert_i32x4(wasm_f32x4_le(v.raw, c00)); - const __f32x4 c05 = wasm_f32x4_splat(0.5f); - // +0.5 for non-negative lane, -0.5 for other. - const __f32x4 delta = wasm_f32x4_add(c05, corr); - // Shift input by 0.5 away from 0. 
- const __f32x4 fixed = wasm_f32x4_add(v.raw, delta); - return Vec128{wasm_i32x4_trunc_saturate_f32x4(fixed)}; + return ConvertTo(Simd(), Round(v)); } // ================================================== MISC -// Returns a vector with lane i=[0, N) set to "first" + i. -template -Vec128 Iota(const Simd d, const T2 first) { - HWY_ALIGN T lanes[16 / sizeof(T)]; - for (size_t i = 0; i < 16 / sizeof(T); ++i) { - lanes[i] = static_cast(first + static_cast(i)); - } - return Load(d, lanes); -} - // ------------------------------ Mask namespace detail { @@ -2167,20 +2308,13 @@ namespace detail { template HWY_API uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { - const __i8x16 slice = - wasm_i8x16_make(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8); - // Each u32 lane has byte[i] = (1 << i) or 0. - const __i8x16 v8_4_2_1 = wasm_v128_and(mask.raw, slice); - // OR together 4 bytes of each u32 to get the 4 bits. - const __i16x8 v2_1_z_z = wasm_i32x4_shl(v8_4_2_1, 16); - const __i16x8 v82_41_2_1 = wasm_v128_or(v8_4_2_1, v2_1_z_z); - const __i16x8 v41_2_1_0 = wasm_i32x4_shl(v82_41_2_1, 8); - const __i16x8 v8421_421_21_10 = wasm_v128_or(v82_41_2_1, v41_2_1_0); - const __i16x8 nibble_per_u32 = wasm_i32x4_shr(v8421_421_21_10, 24); - // Assemble four nibbles into 16 bits. - alignas(16) uint32_t lanes[4]; - wasm_v128_store(lanes, nibble_per_u32); - return lanes[0] | (lanes[1] << 4) | (lanes[2] << 8) | (lanes[3] << 12); + alignas(16) uint64_t lanes[2]; + wasm_v128_store(lanes, mask.raw); + + constexpr uint64_t kMagic = 0x103070F1F3F80ULL; + const uint64_t lo = ((lanes[0] * kMagic) >> 56); + const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; + return (hi + lo); } template @@ -2241,8 +2375,7 @@ constexpr __i8x16 BytesAbove() { template HWY_API uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive( - BitsFromMask(hwy::SizeTag(), mask)); + return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); } template @@ -2290,7 +2423,15 @@ HWY_API size_t CountTrue(const Mask128 HWY_API bool AllFalse(const Mask128 m) { - return !wasm_i8x16_any_true(m.raw); +#if 0 + // Casting followed by wasm_i8x16_any_true results in wasm error: + // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128 + const auto v8 = BitCast(Full128(), VecFromMask(Full128(), m)); + return !wasm_i8x16_any_true(v8.raw); +#else + return (wasm_i64x2_extract_lane(m.raw, 0) | + wasm_i64x2_extract_lane(m.raw, 1)) == 0; +#endif } // Full vector, type-dependent @@ -2336,6 +2477,139 @@ HWY_API bool AllTrue(const Mask128 namespace detail { template +HWY_INLINE Vec128 Idx16x8FromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd d; + const Rebind d8; + const Simd du; + + // We need byte indices for TableLookupBytes (one vector's worth for each of + // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We + // can instead store lane indices and convert to byte indices (2*lane + 0..1), + // with the doubling baked into the table. Unpacking nibbles is likely more + // costly than the higher cache footprint from storing bytes. 
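A minimal scalar sketch of what the lookup below computes (illustrative only; ExpandToByteIndices is not part of the patch): each table row stores eight already-doubled lane offsets, and the ZipLower/0x0100 step after the table duplicates every offset into a byte pair and bumps the upper byte by one, yielding the {2*lane, 2*lane+1} byte indices that TableLookupBytes consumes.

    #include <cstddef>
    #include <cstdint>

    // Illustrative only: expand one 8-entry table row (doubled lane offsets)
    // into the 16 byte indices used to compress 16-bit lanes.
    void ExpandToByteIndices(const uint8_t lane_offsets[8], uint8_t byte_idx[16]) {
      for (size_t i = 0; i < 8; ++i) {
        byte_idx[2 * i + 0] = lane_offsets[i];                            // low byte of the lane
        byte_idx[2 * i + 1] = static_cast<uint8_t>(lane_offsets[i] + 1);  // high byte of the lane
      }
    }
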
+ alignas(16) constexpr uint8_t table[256 * 8] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, + 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, + 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, + 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, + 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, + 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, + 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, + 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, + 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, + 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, + 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, + 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, + 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, + 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, + 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, + 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, + 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, + 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, + 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, + 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, + 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, + 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, + 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, + 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, + 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, + 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, + 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, + 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, + 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, + 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, + 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, + 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, + 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, + 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, + 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, + 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, + 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, + 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, + 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, + 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, + 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, + 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, + 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, + 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, + 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, + 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, + 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, + 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, + 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, + 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, + 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, + 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, + 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, + 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, + 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, + 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, + 0, 2, 
4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, + 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, + 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, + 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, + 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, + 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, + 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, + 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, + 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, + 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, + 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, + 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, + 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, + 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, + 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, + 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, + 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, + 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, + 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, + 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, + 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, + 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, + 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, + 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, + 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, + 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, + 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, + 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, + 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, + 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, + 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, + 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, + 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, + 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, + 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, + 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, + 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, + 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, + 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, + 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, + 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, + 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, + 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, + 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, + 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, + 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, + 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, + 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, + 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, + 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, + 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, + 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, + 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, + 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; + const Vec128 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); @@ -2383,57 +2657,37 @@ 
HWY_INLINE Vec128 Idx64x2FromBits( #endif -// Helper function called by both Compress and CompressStore - avoids a +// Helper functions called by both Compress and CompressStore - avoids a // redundant BitsFromMask in the latter. -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx32x4FromBits(mask_bits); - return TableLookupBytes(v, idx); +template +HWY_API Vec128 Compress(hwy::SizeTag<2> /*tag*/, Vec128 v, + const uint64_t mask_bits) { + const auto idx = detail::Idx16x8FromBits(mask_bits); + using D = Simd; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx32x4FromBits(mask_bits); - return TableLookupBytes(v, idx); + +template +HWY_API Vec128 Compress(hwy::SizeTag<4> /*tag*/, Vec128 v, + const uint64_t mask_bits) { + const auto idx = detail::Idx32x4FromBits(mask_bits); + using D = Simd; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } -#if HWY_CAP_INTEGER64 +#if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64 -template -HWY_API Vec128 Compress(Vec128 v, +template +HWY_API Vec128 Compress(hwy::SizeTag<8> /*tag*/, + Vec128 v, const uint64_t mask_bits) { const auto idx = detail::Idx64x2FromBits(mask_bits); - return TableLookupBytes(v, idx); -} -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx64x2FromBits(mask_bits); - return TableLookupBytes(v, idx); -} - -#endif - -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx32x4FromBits(mask_bits); - const Simd df; - const Simd di; - return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); -} - -#if HWY_CAP_FLOAT64 - -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { - const auto idx = detail::Idx64x2FromBits(mask_bits); - const Simd df; - const Simd di; - return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); + using D = Simd; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } #endif @@ -2442,7 +2696,8 @@ HWY_API Vec128 Compress(Vec12 template HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { - return detail::Compress(v, detail::BitsFromMask(mask)); + return detail::Compress(hwy::SizeTag(), v, + detail::BitsFromMask(mask)); } // ------------------------------ CompressStore @@ -2451,63 +2706,284 @@ template HWY_API size_t CompressStore(Vec128 v, const Mask128 mask, Simd d, T* HWY_RESTRICT aligned) { const uint64_t mask_bits = detail::BitsFromMask(mask); - Store(detail::Compress(v, mask_bits), d, aligned); + Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); return PopCount(mask_bits); } +// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, +// TableLookupBytes) + +// 128 bits +HWY_API void StoreInterleaved3(const Vec128 a, const Vec128 b, + const Vec128 c, Full128 d, + uint8_t* HWY_RESTRICT unaligned) { + const auto k5 = Set(d, 5); + const auto k6 = Set(d, 6); + + // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. + // 0x80 so lanes to be filled from other vectors are 0 for blending. 
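For reference, a scalar sketch of the layout the shuffle tables below produce (illustrative only; StoreInterleaved3Scalar is not part of the patch): the three shuffled-and-OR'd vectors write the bytes of a, b and c interleaved as a0 b0 c0 a1 b1 c1 ... a15 b15 c15 across 48 output bytes.

    #include <cstddef>
    #include <cstdint>

    // Illustrative only: per-byte reference for the full-vector StoreInterleaved3.
    void StoreInterleaved3Scalar(const uint8_t a[16], const uint8_t b[16],
                                 const uint8_t c[16], uint8_t* out) {
      for (size_t i = 0; i < 16; ++i) {
        out[3 * i + 0] = a[i];
        out[3 * i + 1] = b[i];
        out[3 * i + 2] = c[i];
      }
    }
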
+ alignas(16) static constexpr uint8_t tbl_r0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_g0[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + const auto shuf_r0 = Load(d, tbl_r0); + const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB + const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); + const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0 + const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0. + const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0.. + const auto int0 = r0 | g0 | b0; + StoreU(int0, d, unaligned + 0 * 16); + + // Second vector: g10,r10, bgr[9:6], b5,g5 + const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. + const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 + const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. + const auto r1 = TableLookupBytes(a, shuf_r1); + const auto g1 = TableLookupBytes(b, shuf_g1); + const auto b1 = TableLookupBytes(c, shuf_b1); + const auto int1 = r1 | g1 | b1; + StoreU(int1, d, unaligned + 1 * 16); + + // Third vector: bgr[15:11], b10 + const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. + const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. + const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A + const auto r2 = TableLookupBytes(a, shuf_r2); + const auto g2 = TableLookupBytes(b, shuf_g2); + const auto b2 = TableLookupBytes(c, shuf_b2); + const auto int2 = r2 | g2 | b2; + StoreU(int2, d, unaligned + 2 * 16); +} + +// 64 bits +HWY_API void StoreInterleaved3(const Vec128 a, + const Vec128 b, + const Vec128 c, Simd d, + uint8_t* HWY_RESTRICT unaligned) { + // Use full vectors for the shuffles and first result. + const Full128 d_full; + const auto k5 = Set(d_full, 5); + const auto k6 = Set(d_full, 6); + + const Vec128 full_a{a.raw}; + const Vec128 full_b{b.raw}; + const Vec128 full_c{c.raw}; + + // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. + // 0x80 so lanes to be filled from other vectors are 0 for blending. + alignas(16) static constexpr uint8_t tbl_r0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_g0[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + const auto shuf_r0 = Load(d_full, tbl_r0); + const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB + const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); + const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0 + const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0. + const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0.. + const auto int0 = r0 | g0 | b0; + StoreU(int0, d_full, unaligned + 0 * 16); + + // Second (HALF) vector: bgr[7:6], b5,g5 + const auto shuf_r1 = shuf_b0 + k6; // ..7..6.. + const auto shuf_g1 = shuf_r0 + k5; // .7..6..5 + const auto shuf_b1 = shuf_g0 + k5; // 7..6..5. 
+ const auto r1 = TableLookupBytes(full_a, shuf_r1); + const auto g1 = TableLookupBytes(full_b, shuf_g1); + const auto b1 = TableLookupBytes(full_c, shuf_b1); + const decltype(Zero(d)) int1{(r1 | g1 | b1).raw}; + StoreU(int1, d, unaligned + 1 * 16); +} + +// <= 32 bits +template +HWY_API void StoreInterleaved3(const Vec128 a, + const Vec128 b, + const Vec128 c, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + // Use full vectors for the shuffles and result. + const Full128 d_full; + + const Vec128 full_a{a.raw}; + const Vec128 full_b{b.raw}; + const Vec128 full_c{c.raw}; + + // Shuffle (a,b,c) vector bytes to bgr[3:0]. + // 0x80 so lanes to be filled from other vectors are 0 for blending. + alignas(16) static constexpr uint8_t tbl_r0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, // + 0x80, 0x80, 0x80, 0x80}; + const auto shuf_r0 = Load(d_full, tbl_r0); + const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0); + const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0); + const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0 + const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0. + const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0.. + const auto int0 = r0 | g0 | b0; + alignas(16) uint8_t buf[16]; + StoreU(int0, d_full, buf); + CopyBytes(buf, unaligned); +} + +// ------------------------------ StoreInterleaved4 + +// 128 bits +HWY_API void StoreInterleaved4(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, + const Vec128 v3, Full128 d, + uint8_t* HWY_RESTRICT unaligned) { + // let a,b,c,d denote v0..3. + const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 + const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 + const auto ba8 = ZipUpper(v0, v1); + const auto dc8 = ZipUpper(v2, v3); + const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 + const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 + const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8 + const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC + StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16); + StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16); + StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16); + StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16); +} + +// 64 bits +HWY_API void StoreInterleaved4(const Vec128 in0, + const Vec128 in1, + const Vec128 in2, + const Vec128 in3, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. + const Vec128 v0{in0.raw}; + const Vec128 v1{in1.raw}; + const Vec128 v2{in2.raw}; + const Vec128 v3{in3.raw}; + // let a,b,c,d denote v0..3. + const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 + const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 + const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 + const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 + const Full128 d_full; + StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16); + StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16); +} + +// <= 32 bits +template +HWY_API void StoreInterleaved4(const Vec128 in0, + const Vec128 in1, + const Vec128 in2, + const Vec128 in3, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. + const Vec128 v0{in0.raw}; + const Vec128 v1{in1.raw}; + const Vec128 v2{in2.raw}; + const Vec128 v3{in3.raw}; + // let a,b,c,d denote v0..3. + const auto ba0 = ZipLower(v0, v1); // b3 a3 .. b0 a0 + const auto dc0 = ZipLower(v2, v3); // d3 c3 .. 
d0 c0 + const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 + alignas(16) uint8_t buf[16]; + const Full128 d_full; + StoreU(BitCast(d_full, dcba_0), d_full, buf); + CopyBytes<4 * N>(buf, unaligned); +} + // ------------------------------ Reductions namespace detail { -// For u32/i32/f32. -template -HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, - const Vec128 v3210) { +// N=1 for any T: no-op +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} + +// u32/i32/f32: + +// N=2 +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; +} +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); +} +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); +} + +// N=4 (full) +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = v3210 + v1032; const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return v20_31_20_31 + v31_20_31_20; } -template -HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, - const Vec128 v3210) { +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Min(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Min(v20_31_20_31, v31_20_31_20); } -template -HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, - const Vec128 v3210) { +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Max(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Max(v20_31_20_31, v31_20_31_20); } -// For u64/i64/f64. -template -HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, - const Vec128 v10) { +// u64/i64/f64: + +// N=2 (full) +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return v10 + v01; } -template -HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, - const Vec128 v10) { +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Min(v10, v01); } -template -HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, - const Vec128 v10) { +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Max(v10, v01); } } // namespace detail -// Supported for u/i/f 32/64. Returns the sum in each lane. +// Supported for u/i/f 32/64. Returns the same value in each lane. 
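A short usage sketch under stated assumptions (the static-dispatch setup from hwy/highway.h; SumArray4 is an illustrative name, not part of the patch): because the reduction is broadcast to every lane, callers read it back with GetLane.

    #include "hwy/highway.h"

    // Sketch under the above assumptions: sum four floats via SumOfLanes.
    HWY_ATTR float SumArray4(const float* HWY_RESTRICT p) {
      namespace hn = hwy::HWY_NAMESPACE;                  // static-dispatch target namespace
      const hn::Full128<float> d;
      const auto sums = hn::SumOfLanes(hn::LoadU(d, p));  // total present in every lane
      return hn::GetLane(sums);                           // extract lane 0
    }
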
template HWY_API Vec128 SumOfLanes(const Vec128 v) { return detail::SumOfLanes(hwy::SizeTag(), v); diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/wasm_128-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h.12 2021-06-02 10:56:05.240904417 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -154,27 +154,28 @@ HWY_API Vec128 Zero(Simd HWY_API Vec128 Set(Simd /* tag */, const uint8_t t) { - return Vec128{_mm_set1_epi8(t)}; + return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT } template HWY_API Vec128 Set(Simd /* tag */, const uint16_t t) { - return Vec128{_mm_set1_epi16(t)}; + return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT } template HWY_API Vec128 Set(Simd /* tag */, const uint32_t t) { - return Vec128{_mm_set1_epi32(t)}; + return Vec128{_mm_set1_epi32(static_cast(t))}; } template HWY_API Vec128 Set(Simd /* tag */, const uint64_t t) { - return Vec128{_mm_set1_epi64x(t)}; + return Vec128{ + _mm_set1_epi64x(static_cast(t))}; // NOLINT } template HWY_API Vec128 Set(Simd /* tag */, const int8_t t) { - return Vec128{_mm_set1_epi8(t)}; + return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT } template HWY_API Vec128 Set(Simd /* tag */, const int16_t t) { - return Vec128{_mm_set1_epi16(t)}; + return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT } template HWY_API Vec128 Set(Simd /* tag */, const int32_t t) { @@ -182,7 +183,8 @@ HWY_API Vec128 Set(Simd HWY_API Vec128 Set(Simd /* tag */, const int64_t t) { - return Vec128{_mm_set1_epi64x(t)}; + return Vec128{ + _mm_set1_epi64x(static_cast(t))}; // NOLINT } template HWY_API Vec128 Set(Simd /* tag */, const float t) { @@ -510,7 +512,8 @@ HWY_API Mask128 Xor(const Mask128< template HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); - return Mask128{m.raw}; + const Simd d; + return MaskFromVec(BitCast(Simd(), VecFromMask(d, m))); } // ------------------------------ Equality @@ -683,6 +686,14 @@ HWY_API Mask128 operator>=(co return Mask128{_mm_cmpge_pd(a.raw, b.raw)}; } +// ------------------------------ FirstN (Iota, Lt) + +template +HWY_API Mask128 FirstN(const Simd d, size_t num) { + const RebindToSigned di; // Signed comparisons are cheaper. + return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); +} + // ================================================== ARITHMETIC // ------------------------------ Addition @@ -894,7 +905,7 @@ template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{_mm_abs_epi32(v.raw)}; } - +// i64 is implemented after BroadcastSignBit. template HWY_API Vec128 Abs(const Vec128 v) { const Vec128 mask{_mm_set1_epi32(0x7FFFFFFF)}; @@ -959,7 +970,6 @@ HWY_API Vec128 Mu // ------------------------------ ShiftLeft -// Unsigned template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{_mm_slli_epi16(v.raw, kBits)}; @@ -988,6 +998,16 @@ HWY_API Vec128 ShiftLeft(con return Vec128{_mm_slli_epi64(v.raw, kBits)}; } +template +HWY_API Vec128 ShiftLeft(const Vec128 v) { + const Simd d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; + return kBits == 1 + ? 
(v + v) + : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); +} + // ------------------------------ ShiftRight template @@ -1004,6 +1024,15 @@ HWY_API Vec128 ShiftRight(c } template +HWY_API Vec128 ShiftRight(const Vec128 v) { + const Simd d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ + ShiftRight(Vec128{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{_mm_srai_epi16(v.raw, kBits)}; } @@ -1012,6 +1041,15 @@ HWY_API Vec128 ShiftRight(co return Vec128{_mm_srai_epi32(v.raw, kBits)}; } +template +HWY_API Vec128 ShiftRight(const Vec128 v) { + const Simd di; + const Simd du; + const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + // i64 is implemented after BroadcastSignBit. // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) @@ -1039,15 +1077,24 @@ HWY_API Vec128 BroadcastSign return VecFromMask(v < Zero(Simd())); #else // Efficient Gt() requires SSE4.2 but we only have SSE4.1. BLENDVPD requires - // two constants and domain crossing. 32-bit compare only requires Zero() - // plus a shuffle to replicate the upper 32 bits. + // two constants and domain crossing. 32-bit shift avoids generating a zero. const Simd d32; - const auto sign = BitCast(d32, v) < Zero(d32); + const auto sign = ShiftRight<31>(BitCast(d32, v)); return Vec128{ _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; #endif } +template +HWY_API Vec128 Abs(const Vec128 v) { +#if HWY_TARGET == HWY_AVX3 + return Vec128{_mm_abs_epi64(v.raw)}; +#else + const auto zero = Zero(Simd()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + template HWY_API Vec128 ShiftRight(const Vec128 v) { #if HWY_TARGET == HWY_AVX3 @@ -1097,6 +1144,15 @@ HWY_API Vec128 ShiftLeftSame return Vec128{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; } +template +HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { + const Simd d8; + // Use raw instead of BitCast to support N=1. + const Vec128 shifted{ + ShiftLeftSame(Vec128>{v.raw}, bits).raw}; + return shifted & Set(d8, (0xFF << bits) & 0xFF); +} + // ------------------------------ ShiftRightSame (BroadcastSignBit) template @@ -1116,6 +1172,16 @@ HWY_API Vec128 ShiftRightSa } template +HWY_API Vec128 ShiftRightSame(Vec128 v, + const int bits) { + const Simd d8; + // Use raw instead of BitCast to support N=1. 
+ const Vec128 shifted{ + ShiftRightSame(Vec128{v.raw}, bits).raw}; + return shifted & Set(d8, 0xFF >> bits); +} + +template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; @@ -1140,6 +1206,15 @@ HWY_API Vec128 ShiftRightSam #endif } +template +HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { + const Simd di; + const Simd du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + // ------------------------------ Negate template @@ -1729,32 +1804,196 @@ HWY_API void Stream(const Vec128 +HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128 v, + Simd /* tag */, T* HWY_RESTRICT base, + const Vec128 offset) { + if (N == 4) { + _mm_i32scatter_epi32(base, offset.raw, v.raw, 1); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1); + } +} +template +HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128 v, + Simd /* tag */, T* HWY_RESTRICT base, + const Vec128 index) { + if (N == 4) { + _mm_i32scatter_epi32(base, index.raw, v.raw, 4); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4); + } +} + +template +HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128 v, + Simd /* tag */, T* HWY_RESTRICT base, + const Vec128 offset) { + if (N == 2) { + _mm_i64scatter_epi64(base, offset.raw, v.raw, 1); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1); + } +} +template +HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128 v, + Simd /* tag */, T* HWY_RESTRICT base, + const Vec128 index) { + if (N == 2) { + _mm_i64scatter_epi64(base, index.raw, v.raw, 8); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8); + } +} + +} // namespace detail + +template +HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); +} +template +HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); +} + +template +HWY_INLINE void ScatterOffset(Vec128 v, Simd /* tag */, + float* HWY_RESTRICT base, + const Vec128 offset) { + if (N == 4) { + _mm_i32scatter_ps(base, offset.raw, v.raw, 1); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1); + } +} +template +HWY_INLINE void ScatterIndex(Vec128 v, Simd /* tag */, + float* HWY_RESTRICT base, + const Vec128 index) { + if (N == 4) { + _mm_i32scatter_ps(base, index.raw, v.raw, 4); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4); + } +} + +template +HWY_INLINE void ScatterOffset(Vec128 v, Simd /* tag */, + double* HWY_RESTRICT base, + const Vec128 offset) { + if (N == 2) { + _mm_i64scatter_pd(base, offset.raw, v.raw, 1); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1); + } +} +template +HWY_INLINE void ScatterIndex(Vec128 v, Simd /* tag */, + double* HWY_RESTRICT base, + const Vec128 
index) { + if (N == 2) { + _mm_i64scatter_pd(base, index.raw, v.raw, 8); + } else { + const __mmask8 mask = (1u << N) - 1; + _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8); + } +} +#else // HWY_TARGET == HWY_AVX3 + +template +HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Simd(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template +HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, + const Vec128 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) T lanes[N]; + Store(v, d, lanes); + + alignas(16) Index index_lanes[N]; + Store(index, Simd(), index_lanes); + + for (size_t i = 0; i < N; ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +#endif + +// ------------------------------ Gather (Load/Store) + #if HWY_TARGET == HWY_SSE4 template HWY_API Vec128 GatherOffset(const Simd d, const T* HWY_RESTRICT base, const Vec128 offset) { - static_assert(N == 1, "SSE4 does not support full gather"); - static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); - const uintptr_t address = reinterpret_cast(base) + GetLane(offset); - T val; - CopyBytes(reinterpret_cast(address), &val); - return Set(d, val); + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + alignas(16) Offset offset_lanes[N]; + Store(offset, Simd(), offset_lanes); + + alignas(16) T lanes[N]; + const uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); + } + return Load(d, lanes); } template HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, const Vec128 index) { - static_assert(N == 1, "SSE4 does not support full gather"); - static_assert(sizeof(T) == sizeof(Index), "T must match Index"); - return Set(d, base[GetLane(index)]); + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + alignas(16) Index index_lanes[N]; + Store(index, Simd(), index_lanes); + + alignas(16) T lanes[N]; + for (size_t i = 0; i < N; ++i) { + lanes[i] = base[index_lanes[i]]; + } + return Load(d, lanes); } #else @@ -1832,6 +2071,8 @@ HWY_API Vec128 GatherIndex(Si #endif // HWY_TARGET != HWY_SSE4 +HWY_DIAGNOSTICS(pop) + // ================================================== SWIZZLE // ------------------------------ Extract half @@ -1859,10 +2100,10 @@ HWY_INLINE Vec128 UpperHalf(V // ------------------------------ Shift vector by constant #bytes // 0x01..0F, kBytes = 1 => 0x02..0F00 -template -HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { +template +HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - return Vec128{_mm_slli_si128(v.raw, kBytes)}; + return Vec128{_mm_slli_si128(v.raw, kBytes)}; } template @@ -1873,10 +2114,10 @@ HWY_API Vec128 ShiftLeftLanes(cons } // 0x01..0F, kBytes = 1 => 0x0001..0E -template -HWY_API Vec128 ShiftRightBytes(const Vec128 v) { +template +HWY_API Vec128 ShiftRightBytes(const Vec128 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - return Vec128{_mm_srli_si128(v.raw, kBytes)}; + return Vec128{_mm_srli_si128(v.raw, kBytes)}; } template @@ -2041,44 +2282,47 @@ HWY_API Vec128 Shuffle0123(const 
// ------------------------------ TableLookupLanes // Returned by SetTableIndices for use by TableLookupLanes. -template +template struct Indices128 { __m128i raw; }; -template -HWY_API Indices128 SetTableIndices(Full128, const int32_t* idx) { +template +HWY_API Indices128 SetTableIndices(Simd d, const int32_t* idx) { #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) - const size_t N = 16 / sizeof(T); for (size_t i = 0; i < N; ++i) { HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); } #endif - const Full128 d8; - alignas(16) uint8_t control[16]; - for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { - const size_t idx_lane = idx_byte / sizeof(T); - const size_t mod = idx_byte % sizeof(T); - control[idx_byte] = static_cast(idx[idx_lane] * sizeof(T) + mod); + const Repartition d8; + alignas(16) uint8_t control[16] = {0}; + for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { + for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { + control[idx_lane * sizeof(T) + idx_byte] = + static_cast(idx[idx_lane] * sizeof(T) + idx_byte); + } } - return Indices128{Load(d8, control).raw}; + return Indices128{Load(d8, control).raw}; } -HWY_API Vec128 TableLookupLanes(const Vec128 v, - const Indices128 idx) { - return TableLookupBytes(v, Vec128{idx.raw}); +template +HWY_API Vec128 TableLookupLanes( + const Vec128 v, const Indices128 idx) { + return TableLookupBytes(v, Vec128{idx.raw}); } -HWY_API Vec128 TableLookupLanes(const Vec128 v, - const Indices128 idx) { - return TableLookupBytes(v, Vec128{idx.raw}); +template +HWY_API Vec128 TableLookupLanes(const Vec128 v, + const Indices128 idx) { + return TableLookupBytes(v, Vec128{idx.raw}); } -HWY_API Vec128 TableLookupLanes(const Vec128 v, - const Indices128 idx) { - const Full128 di; - const Full128 df; +template +HWY_API Vec128 TableLookupLanes(const Vec128 v, + const Indices128 idx) { + const Simd di; + const Simd df; return BitCast(df, - TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); + TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); } // ------------------------------ Interleave lanes @@ -2286,47 +2530,47 @@ HWY_INLINE Vec128 ConcatUpperLow namespace detail { -template -HWY_API Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, - const Vec128 b) { - const Full128 d; - const Full128 d8; +template +HWY_API Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, + const Vec128 b) { + const Simd d; + const Repartition d8; alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); } -template -HWY_API Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, - const Vec128 b) { - return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; +template +HWY_API Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, + const Vec128 b) { + return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; } -template -HWY_API Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, - const Vec128 b) { - return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x33)}; +template +HWY_API Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, + const Vec128 b) { + return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x33)}; } -template -HWY_API Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, - const Vec128 b) { - return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x0F)}; +template +HWY_API Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, + const Vec128 b) { + return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x0F)}; } } // namespace detail -template 
-HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { +template +HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { return detail::OddEven(hwy::SizeTag(), a, b); } -template <> -HWY_INLINE Vec128 OddEven(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; +template +HWY_INLINE Vec128 OddEven(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; } -template <> -HWY_INLINE Vec128 OddEven(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_blend_pd(a.raw, b.raw, 1)}; +template +HWY_INLINE Vec128 OddEven(const Vec128 a, + const Vec128 b) { + return Vec128{_mm_blend_pd(a.raw, b.raw, 1)}; } // ------------------------------ Shl (ZipLower, Mul) @@ -2764,7 +3008,7 @@ HWY_API Vec128 U8FromU32(con return LowerHalf(LowerHalf(BitCast(d8, quad))); } -// ------------------------------ Convert integer <=> floating point +// ------------------------------ Integer <=> fp (ShiftRight, OddEven) template HWY_API Vec128 ConvertTo(Simd /* tag */, @@ -2779,13 +3023,20 @@ HWY_API Vec128 ConvertTo(Simd (void)dd; return Vec128{_mm_cvtepi64_pd(v.raw)}; #else - alignas(16) int64_t lanes_i[2]; - Store(v, Simd(), lanes_i); - alignas(16) double lanes_d[2]; - for (size_t i = 0; i < N; ++i) { - lanes_d[i] = static_cast(lanes_i[i]); - } - return Load(dd, lanes_d); + // Based on wim's approach (https://stackoverflow.com/questions/41144668/) + const Repartition d32; + const Repartition d64; + + // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 + const auto k84_63 = Set(d64, 0x4530000080000000ULL); + const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); + + // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) + const auto k52 = Set(d32, 0x43300000); + const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); + + const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); + return (v_upper - k84_63_52) + v_lower; // order matters! #endif } @@ -2922,6 +3173,142 @@ HWY_API size_t CountTrue(const Mask128 +HWY_INLINE Vec128 Idx16x8FromBits(const uint64_t mask_bits) { + HWY_DASSERT(mask_bits < 256); + const Simd d; + const Rebind d8; + const Simd du; + + // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need + // byte indices for PSHUFB (one vector's worth for each of 256 combinations of + // 8 mask bits). Loading them directly would require 4 KiB. We can instead + // store lane indices and convert to byte indices (2*lane + 0..1), with the + // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane + // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. + // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles + // is likely more costly than the higher cache footprint from storing bytes. 
+ alignas(16) constexpr uint8_t table[256 * 8] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, + 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, + 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, + 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, + 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, + 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, + 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, + 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, + 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, + 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, + 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, + 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, + 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, + 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, + 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, + 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, + 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, + 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, + 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, + 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, + 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, + 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, + 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, + 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, + 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, + 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, + 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, + 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, + 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, + 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, + 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, + 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, + 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, + 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, + 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, + 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, + 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, + 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, + 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, + 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, + 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, + 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, + 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, + 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, + 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, + 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, + 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, + 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, + 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, + 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, + 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, + 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, + 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, + 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, + 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, + 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, + 0, 2, 
4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, + 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, + 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, + 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, + 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, + 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, + 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, + 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, + 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, + 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, + 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, + 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, + 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, + 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, + 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, + 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, + 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, + 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, + 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, + 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, + 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, + 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, + 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, + 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, + 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, + 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, + 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, + 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, + 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, + 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, + 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, + 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, + 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, + 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, + 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, + 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, + 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, + 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, + 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, + 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, + 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, + 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, + 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, + 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, + 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, + 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, + 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, + 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, + 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, + 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, + 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, + 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, + 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, + 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; + + const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; + const Vec128 pairs = ZipLower(byte_idx, byte_idx); + return BitCast(d, pairs + Set(du, 0x0100)); +} + +template HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); @@ -2968,71 +3355,42 @@ 
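As the comment above explains, the 16-bit table stores one byte per lane (the lane index already doubled, i.e. 2*lane), and each entry is then expanded to the two byte indices 2*lane+0 and 2*lane+1 that PSHUFB needs: ZipLower pairs every byte with itself, and adding 0x0100 to each u16 lane increments only the upper copy. A scalar sketch of that expansion, assuming the same 8-bytes-per-mask table layout:

#include <cstddef>
#include <cstdint>

// Expand 8 per-lane byte entries (even values 0, 2, ..., 14) into 16 PSHUFB
// control bytes: lane k contributes byte indices table[k] + 0 and table[k] + 1.
inline void ExpandLaneIdxToByteIdx(const uint8_t lane_idx[8],
                                   uint8_t byte_idx[16]) {
  for (size_t k = 0; k < 8; ++k) {
    // Equivalent to ZipLower(v, v) followed by adding 0x0100 to each u16 lane
    // (little-endian: the +1 lands on the upper of the two duplicated bytes).
    byte_idx[2 * k + 0] = static_cast<uint8_t>(lane_idx[k] + 0);
    byte_idx[2 * k + 1] = static_cast<uint8_t>(lane_idx[k] + 1);
  }
}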
HWY_INLINE Vec128 Idx64x2FromBits( // Helper function called by both Compress and CompressStore - avoids a // redundant BitsFromMask in the latter. -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { -#if HWY_TARGET == HWY_AVX3 - return Vec128{_mm_maskz_compress_epi32(mask_bits, v.raw)}; -#else - const auto idx = detail::Idx32x4FromBits(mask_bits); - return TableLookupBytes(v, idx); -#endif -} -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { -#if HWY_TARGET == HWY_AVX3 - return Vec128{_mm_maskz_compress_epi32(mask_bits, v.raw)}; -#else - const auto idx = detail::Idx32x4FromBits(mask_bits); - return TableLookupBytes(v, idx); -#endif -} - -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { -#if HWY_TARGET == HWY_AVX3 - return Vec128{_mm_maskz_compress_epi64(mask_bits, v.raw)}; -#else - const auto idx = detail::Idx64x2FromBits(mask_bits); - return TableLookupBytes(v, idx); -#endif -} -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { -#if HWY_TARGET == HWY_AVX3 - return Vec128{_mm_maskz_compress_epi64(mask_bits, v.raw)}; -#else - const auto idx = detail::Idx64x2FromBits(mask_bits); - return TableLookupBytes(v, idx); -#endif +template +HWY_API Vec128 Compress(hwy::SizeTag<2> /*tag*/, Vec128 v, + const uint64_t mask_bits) { + const auto idx = detail::Idx16x8FromBits(mask_bits); + using D = Simd; + const RebindToSigned di; + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { +template +HWY_API Vec128 Compress(hwy::SizeTag<4> /*tag*/, Vec128 v, + const uint64_t mask_bits) { + using D = Simd; + using TI = MakeSigned; + const Rebind di; #if HWY_TARGET == HWY_AVX3 - return Vec128{_mm_maskz_compress_ps(mask_bits, v.raw)}; + return BitCast(D(), Vec128{_mm_maskz_compress_epi32( + mask_bits, BitCast(di, v).raw)}); #else - const auto idx = detail::Idx32x4FromBits(mask_bits); - const Simd df; - const Simd di; - return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); + const auto idx = detail::Idx32x4FromBits(mask_bits); + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); #endif } -template -HWY_API Vec128 Compress(Vec128 v, - const uint64_t mask_bits) { +template +HWY_API Vec128 Compress(hwy::SizeTag<8> /*tag*/, Vec128 v, + const uint64_t mask_bits) { + using D = Simd; + using TI = MakeSigned; + const Rebind di; #if HWY_TARGET == HWY_AVX3 - return Vec128{_mm_maskz_compress_pd(mask_bits, v.raw)}; + return BitCast(D(), Vec128{_mm_maskz_compress_epi64( + mask_bits, BitCast(di, v).raw)}); #else - const auto idx = detail::Idx64x2FromBits(mask_bits); - const Simd df; - const Simd di; - return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); + const auto idx = detail::Idx64x2FromBits(mask_bits); + return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); #endif } @@ -3040,7 +3398,8 @@ HWY_API Vec128 Compress(Vec12 template HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { - return detail::Compress(v, detail::BitsFromMask(mask)); + return detail::Compress(hwy::SizeTag(), v, + detail::BitsFromMask(mask)); } // ------------------------------ CompressStore @@ -3050,63 +3409,285 @@ HWY_API size_t CompressStore(Vec128 d, T* HWY_RESTRICT aligned) { const uint64_t mask_bits = detail::BitsFromMask(mask); // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). 
- Store(detail::Compress(v, mask_bits), d, aligned); + Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); return PopCount(mask_bits); } +// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, +// TableLookupBytes) + +// 128 bits +HWY_API void StoreInterleaved3(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, Full128 d, + uint8_t* HWY_RESTRICT unaligned) { + const auto k5 = Set(d, 5); + const auto k6 = Set(d, 6); + + // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0]. + // 0x80 so lanes to be filled from other vectors are 0 for blending. + alignas(16) static constexpr uint8_t tbl_r0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_g0[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + const auto shuf_r0 = Load(d, tbl_r0); + const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB + const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); + const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0 + const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0. + const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0.. + const auto int0 = r0 | g0 | b0; + StoreU(int0, d, unaligned + 0 * 16); + + // Second vector: g10,r10, bgr[9:6], b5,g5 + const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. + const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 + const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. + const auto r1 = TableLookupBytes(v0, shuf_r1); + const auto g1 = TableLookupBytes(v1, shuf_g1); + const auto b1 = TableLookupBytes(v2, shuf_b1); + const auto int1 = r1 | g1 | b1; + StoreU(int1, d, unaligned + 1 * 16); + + // Third vector: bgr[15:11], b10 + const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. + const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. + const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A + const auto r2 = TableLookupBytes(v0, shuf_r2); + const auto g2 = TableLookupBytes(v1, shuf_g2); + const auto b2 = TableLookupBytes(v2, shuf_b2); + const auto int2 = r2 | g2 | b2; + StoreU(int2, d, unaligned + 2 * 16); +} + +// 64 bits +HWY_API void StoreInterleaved3(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, Simd d, + uint8_t* HWY_RESTRICT unaligned) { + // Use full vectors for the shuffles and first result. + const Full128 d_full; + const auto k5 = Set(d_full, 5); + const auto k6 = Set(d_full, 6); + + const Vec128 full_a{v0.raw}; + const Vec128 full_b{v1.raw}; + const Vec128 full_c{v2.raw}; + + // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0]. + // 0x80 so lanes to be filled from other vectors are 0 for blending. + alignas(16) static constexpr uint8_t tbl_r0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_g0[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + const auto shuf_r0 = Load(d_full, tbl_r0); + const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB + const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); + const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0 + const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0. + const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0.. 
+ const auto int0 = r0 | g0 | b0; + StoreU(int0, d_full, unaligned + 0 * 16); + + // Second (HALF) vector: bgr[7:6], b5,g5 + const auto shuf_r1 = shuf_b0 + k6; // ..7..6.. + const auto shuf_g1 = shuf_r0 + k5; // .7..6..5 + const auto shuf_b1 = shuf_g0 + k5; // 7..6..5. + const auto r1 = TableLookupBytes(full_a, shuf_r1); + const auto g1 = TableLookupBytes(full_b, shuf_g1); + const auto b1 = TableLookupBytes(full_c, shuf_b1); + const decltype(Zero(d)) int1{(r1 | g1 | b1).raw}; + StoreU(int1, d, unaligned + 1 * 16); +} + +// <= 32 bits +template +HWY_API void StoreInterleaved3(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + // Use full vectors for the shuffles and result. + const Full128 d_full; + + const Vec128 full_a{v0.raw}; + const Vec128 full_b{v1.raw}; + const Vec128 full_c{v2.raw}; + + // Shuffle (v0,v1,v2) vector bytes to bgr[3:0]. + // 0x80 so lanes to be filled from other vectors are 0 for blending. + alignas(16) static constexpr uint8_t tbl_r0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, // + 0x80, 0x80, 0x80, 0x80}; + const auto shuf_r0 = Load(d_full, tbl_r0); + const auto shuf_g0 = CombineShiftRightBytes<15>(shuf_r0, shuf_r0); + const auto shuf_b0 = CombineShiftRightBytes<14>(shuf_r0, shuf_r0); + const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0 + const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0. + const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0.. + const auto int0 = r0 | g0 | b0; + alignas(16) uint8_t buf[16]; + StoreU(int0, d_full, buf); + CopyBytes(buf, unaligned); +} + +// ------------------------------ StoreInterleaved4 + +// 128 bits +HWY_API void StoreInterleaved4(const Vec128 v0, + const Vec128 v1, + const Vec128 v2, + const Vec128 v3, Full128 d, + uint8_t* HWY_RESTRICT unaligned) { + // let a,b,c,d denote v0..3. + const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 + const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 + const auto ba8 = ZipUpper(v0, v1); + const auto dc8 = ZipUpper(v2, v3); + const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 + const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 + const auto dcba_8 = ZipLower(ba8, dc8); // d..aB d..a8 + const auto dcba_C = ZipUpper(ba8, dc8); // d..aF d..aC + StoreU(BitCast(d, dcba_0), d, unaligned + 0 * 16); + StoreU(BitCast(d, dcba_4), d, unaligned + 1 * 16); + StoreU(BitCast(d, dcba_8), d, unaligned + 2 * 16); + StoreU(BitCast(d, dcba_C), d, unaligned + 3 * 16); +} + +// 64 bits +HWY_API void StoreInterleaved4(const Vec128 in0, + const Vec128 in1, + const Vec128 in2, + const Vec128 in3, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. + const Vec128 v0{in0.raw}; + const Vec128 v1{in1.raw}; + const Vec128 v2{in2.raw}; + const Vec128 v3{in3.raw}; + // let a,b,c,d denote v0..3. + const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 + const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 + const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 + const auto dcba_4 = ZipUpper(ba0, dc0); // d..a7 d..a4 + const Full128 d_full; + StoreU(BitCast(d_full, dcba_0), d_full, unaligned + 0 * 16); + StoreU(BitCast(d_full, dcba_4), d_full, unaligned + 1 * 16); +} + +// <= 32 bits +template +HWY_API void StoreInterleaved4(const Vec128 in0, + const Vec128 in1, + const Vec128 in2, + const Vec128 in3, + Simd /*tag*/, + uint8_t* HWY_RESTRICT unaligned) { + // Use full vectors to reduce the number of stores. 
+ const Vec128 v0{in0.raw}; + const Vec128 v1{in1.raw}; + const Vec128 v2{in2.raw}; + const Vec128 v3{in3.raw}; + // let a,b,c,d denote v0..3. + const auto ba0 = ZipLower(v0, v1); // b3 a3 .. b0 a0 + const auto dc0 = ZipLower(v2, v3); // d3 c3 .. d0 c0 + const auto dcba_0 = ZipLower(ba0, dc0); // d..a3 d..a0 + alignas(16) uint8_t buf[16]; + const Full128 d_full; + StoreU(BitCast(d_full, dcba_0), d_full, buf); + CopyBytes<4 * N>(buf, unaligned); +} + // ------------------------------ Reductions namespace detail { -// For u32/i32/f32. -template -HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, - const Vec128 v3210) { +// N=1 for any T: no-op +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, + const Vec128 v) { + return v; +} + +// u32/i32/f32: + +// N=2 +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; +} +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); +} +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, + const Vec128 v10) { + return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); +} + +// N=4 (full) +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = v3210 + v1032; const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return v20_31_20_31 + v31_20_31_20; } -template -HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, - const Vec128 v3210) { +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Min(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Min(v20_31_20_31, v31_20_31_20); } -template -HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, - const Vec128 v3210) { +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Max(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Max(v20_31_20_31, v31_20_31_20); } -// For u64/i64/f64. -template -HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, - const Vec128 v10) { +// u64/i64/f64: + +// N=2 (full) +template +HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return v10 + v01; } -template -HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, - const Vec128 v10) { +template +HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Min(v10, v01); } -template -HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, - const Vec128 v10) { +template +HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Max(v10, v01); } } // namespace detail -// Supported for u/i/f 32/64. Returns the sum in each lane. +// Supported for u/i/f 32/64. Returns the same value in each lane. 
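The N=1, N=2 and full-vector reductions above all follow the usual log2(N) shuffle-and-combine pattern, and the result ends up broadcast to every lane rather than left only in lane 0. A sketch of the full 4-lane f32 sum written directly with SSE intrinsics (illustrative only; the patch itself stays within the wrapper ops):

#include <xmmintrin.h>

// Sum of the four f32 lanes, broadcast to all lanes.
static inline __m128 SumOfLanes4(__m128 v3210) {
  // Swap the two 64-bit halves, then add: lanes hold pairwise sums.
  const __m128 v1032 = _mm_shuffle_ps(v3210, v3210, _MM_SHUFFLE(1, 0, 3, 2));
  const __m128 v31_20 = _mm_add_ps(v3210, v1032);
  // Rotate by one lane and add again: every lane now holds the total.
  const __m128 v20_31 = _mm_shuffle_ps(v31_20, v31_20, _MM_SHUFFLE(0, 3, 2, 1));
  return _mm_add_ps(v20_31, v31_20);
}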
template HWY_API Vec128 SumOfLanes(const Vec128 v) { return detail::SumOfLanes(hwy::SizeTag(), v); diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_128-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h.12 2021-06-02 10:56:05.234904387 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -20,6 +20,20 @@ // particular, "Broadcast", pack and zip behavior may be surprising. #include // AVX2+ + +#if defined(_MSC_VER) && defined(__clang__) +// Including should be enough, but Clang's headers helpfully skip +// including these headers when _MSC_VER is defined, like when using clang-cl. +// Include these directly here. +#include +// avxintrin defines __m256i and must come before avx2intrin. +#include +#include // _pext_u64 +#include +#include +#include +#endif + #include #include @@ -148,23 +162,24 @@ HWY_API Vec256 Set(Full256{_mm256_set1_epi16(static_cast(t))}; // NOLINT } HWY_API Vec256 Set(Full256 /* tag */, const uint32_t t) { - return Vec256{_mm256_set1_epi32(static_cast(t))}; // NOLINT + return Vec256{_mm256_set1_epi32(static_cast(t))}; } HWY_API Vec256 Set(Full256 /* tag */, const uint64_t t) { return Vec256{ _mm256_set1_epi64x(static_cast(t))}; // NOLINT } HWY_API Vec256 Set(Full256 /* tag */, const int8_t t) { - return Vec256{_mm256_set1_epi8(t)}; + return Vec256{_mm256_set1_epi8(static_cast(t))}; // NOLINT } HWY_API Vec256 Set(Full256 /* tag */, const int16_t t) { - return Vec256{_mm256_set1_epi16(t)}; + return Vec256{_mm256_set1_epi16(static_cast(t))}; // NOLINT } HWY_API Vec256 Set(Full256 /* tag */, const int32_t t) { return Vec256{_mm256_set1_epi32(t)}; } HWY_API Vec256 Set(Full256 /* tag */, const int64_t t) { - return Vec256{_mm256_set1_epi64x(t)}; + return Vec256{ + _mm256_set1_epi64x(static_cast(t))}; // NOLINT } HWY_API Vec256 Set(Full256 /* tag */, const float t) { return Vec256{_mm256_set1_ps(t)}; @@ -340,6 +355,8 @@ HWY_API Vec256 VecFromMask(Full256 return Vec256{v.raw}; } +// ------------------------------ IfThenElse + // mask ? yes : no template HWY_API Vec256 IfThenElse(const Mask256 mask, const Vec256 yes, @@ -412,9 +429,9 @@ HWY_API Mask256 Xor(const Mask256 // Comparisons fill a lane with 1-bits if the condition is true, else 0. template -HWY_API Mask256 RebindMask(Full256 /*tag*/, Mask256 m) { +HWY_API Mask256 RebindMask(Full256 d_to, Mask256 m) { static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); - return Mask256{m.raw}; + return MaskFromVec(BitCast(d_to, VecFromMask(Full256(), m))); } // ------------------------------ Equality @@ -670,6 +687,14 @@ HWY_API Vec256 Max(const Vec256< return Vec256{_mm256_max_pd(a.raw, b.raw)}; } +// ------------------------------ FirstN (Iota, Lt) + +template +HWY_API Mask256 FirstN(const Full256 d, size_t n) { + const RebindToSigned di; // Signed comparisons are cheaper. + return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(n))); +} + // ================================================== ARITHMETIC // ------------------------------ Addition @@ -832,7 +857,13 @@ HWY_API Vec256 AverageRound(co // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. HWY_API Vec256 Abs(const Vec256 v) { +#if HWY_COMPILER_MSVC + // Workaround for incorrect codegen? 
(wrong result) + const auto zero = Zero(Full256()); + return Vec256{_mm256_max_epi8(v.raw, (zero - v).raw)}; +#else return Vec256{_mm256_abs_epi8(v.raw)}; +#endif } HWY_API Vec256 Abs(const Vec256 v) { return Vec256{_mm256_abs_epi16(v.raw)}; @@ -840,6 +871,7 @@ HWY_API Vec256 Abs(const Vec256 HWY_API Vec256 Abs(const Vec256 v) { return Vec256{_mm256_abs_epi32(v.raw)}; } +// i64 is implemented after BroadcastSignBit. HWY_API Vec256 Abs(const Vec256 v) { const Vec256 mask{_mm256_set1_epi32(0x7FFFFFFF)}; @@ -925,6 +957,16 @@ HWY_API Vec256 ShiftLeft(const return Vec256{_mm256_slli_epi64(v.raw, kBits)}; } +template +HWY_API Vec256 ShiftLeft(const Vec256 v) { + const Full256 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); +} + // ------------------------------ ShiftRight template @@ -943,6 +985,14 @@ HWY_API Vec256 ShiftRight(cons } template +HWY_API Vec256 ShiftRight(const Vec256 v) { + const Full256 d8; + // Use raw instead of BitCast to support N=1. + const Vec256 shifted{ShiftRight(Vec256{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template HWY_API Vec256 ShiftRight(const Vec256 v) { return Vec256{_mm256_srai_epi16(v.raw, kBits)}; } @@ -952,6 +1002,15 @@ HWY_API Vec256 ShiftRight(const return Vec256{_mm256_srai_epi32(v.raw, kBits)}; } +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + const Full256 di; + const Full256 du; + const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + // i64 is implemented after BroadcastSignBit. // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) @@ -989,6 +1048,15 @@ HWY_API Vec256 ShiftRight(const #endif } +HWY_API Vec256 Abs(const Vec256 v) { +#if HWY_TARGET == HWY_AVX3 + return Vec256{_mm256_abs_epi64(v.raw)}; +#else + const auto zero = Zero(Full256()); + return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); +#endif +} + // ------------------------------ ShiftLeftSame HWY_API Vec256 ShiftLeftSame(const Vec256 v, @@ -1016,6 +1084,14 @@ HWY_API Vec256 ShiftLeftSame(co return Vec256{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; } +template +HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { + const Full256 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); + return shifted & Set(d8, (0xFF << bits) & 0xFF); +} + // ------------------------------ ShiftRightSame (BroadcastSignBit) HWY_API Vec256 ShiftRightSame(const Vec256 v, @@ -1031,6 +1107,13 @@ HWY_API Vec256 ShiftRightSame( return Vec256{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; } +HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { + const Full256 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); + return shifted & Set(d8, 0xFF >> bits); +} + HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) { return Vec256{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; @@ -1053,6 +1136,14 @@ HWY_API Vec256 ShiftRightSame(c #endif } +HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { + const Full256 di; + const Full256 du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + // 
------------------------------ Negate template @@ -1335,6 +1426,123 @@ HWY_API void Stream(const Vec256 _mm256_stream_pd(aligned, v.raw); } +// ------------------------------ Scatter + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +#if HWY_TARGET == HWY_AVX3 +namespace detail { + +template +HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256 v, + Full256 /* tag */, T* HWY_RESTRICT base, + const Vec256 offset) { + _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1); +} +template +HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256 v, + Full256 /* tag */, T* HWY_RESTRICT base, + const Vec256 index) { + _mm256_i32scatter_epi32(base, index.raw, v.raw, 4); +} + +template +HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256 v, + Full256 /* tag */, T* HWY_RESTRICT base, + const Vec256 offset) { + _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1); +} +template +HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256 v, + Full256 /* tag */, T* HWY_RESTRICT base, + const Vec256 index) { + _mm256_i64scatter_epi64(base, index.raw, v.raw, 8); +} + +} // namespace detail + +template +HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); +} +template +HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); +} + +template <> +HWY_INLINE void ScatterOffset(Vec256 v, Full256 /* tag */, + float* HWY_RESTRICT base, + const Vec256 offset) { + _mm256_i32scatter_ps(base, offset.raw, v.raw, 1); +} +template <> +HWY_INLINE void ScatterIndex(Vec256 v, Full256 /* tag */, + float* HWY_RESTRICT base, + const Vec256 index) { + _mm256_i32scatter_ps(base, index.raw, v.raw, 4); +} + +template <> +HWY_INLINE void ScatterOffset(Vec256 v, + Full256 /* tag */, + double* HWY_RESTRICT base, + const Vec256 offset) { + _mm256_i64scatter_pd(base, offset.raw, v.raw, 1); +} +template <> +HWY_INLINE void ScatterIndex(Vec256 v, + Full256 /* tag */, + double* HWY_RESTRICT base, + const Vec256 index) { + _mm256_i64scatter_pd(base, index.raw, v.raw, 8); +} + +#else + +template +HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + + constexpr size_t N = 32 / sizeof(T); + alignas(32) T lanes[N]; + Store(v, d, lanes); + + alignas(32) Offset offset_lanes[N]; + Store(offset, Simd(), offset_lanes); + + uint8_t* base_bytes = reinterpret_cast(base); + for (size_t i = 0; i < N; ++i) { + CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); + } +} + +template +HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, + const Vec256 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + + constexpr size_t N = 32 / sizeof(T); + alignas(32) T lanes[N]; + Store(v, d, lanes); + + alignas(32) Index index_lanes[N]; + Store(index, Simd(), index_lanes); + + for (size_t i = 0; i < N; ++i) { + base[index_lanes[i]] = lanes[i]; + } +} + +#endif + // ------------------------------ Gather namespace detail { @@ -1374,13 +1582,13 @@ HWY_API Vec256 GatherIndex(hwy::SizeT template HWY_API Vec256 
GatherOffset(Full256 d, const T* HWY_RESTRICT base, const Vec256 offset) { - static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs"); + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); return detail::GatherOffset(hwy::SizeTag(), d, base, offset); } template HWY_API Vec256 GatherIndex(Full256 d, const T* HWY_RESTRICT base, const Vec256 index) { - static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx"); + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); return detail::GatherIndex(hwy::SizeTag(), d, base, index); } @@ -1410,6 +1618,8 @@ HWY_INLINE Vec256 GatherIndex{_mm256_i64gather_pd(base, index.raw, 8)}; } +HWY_DIAGNOSTICS(pop) + // ================================================== SWIZZLE template @@ -1861,38 +2071,26 @@ HWY_API Vec256 ZipUpper(const V return Vec256{_mm256_unpackhi_epi32(a.raw, b.raw)}; } -// ------------------------------ Blocks +// ------------------------------ Blocks (LowerHalf, ZeroExtendVector) + +// _mm256_broadcastsi128_si256 has 7 cycle latency. _mm256_permute2x128_si256 is +// slow on Zen1 (8 uops); we can avoid it for LowerLower and UpperLower, and on +// UpperUpper at the cost of one extra cycle/instruction. // hiH,hiL loH,loL |-> hiL,loL (= lower halves) template HWY_API Vec256 ConcatLowerLower(const Vec256 hi, const Vec256 lo) { - return Vec256{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x20)}; + return Vec256{_mm256_inserti128_si256(lo.raw, LowerHalf(hi).raw, 1)}; } template <> HWY_INLINE Vec256 ConcatLowerLower(const Vec256 hi, const Vec256 lo) { - return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x20)}; + return Vec256{_mm256_insertf128_ps(lo.raw, LowerHalf(hi).raw, 1)}; } template <> HWY_INLINE Vec256 ConcatLowerLower(const Vec256 hi, const Vec256 lo) { - return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x20)}; -} - -// hiH,hiL loH,loL |-> hiH,loH (= upper halves) -template -HWY_API Vec256 ConcatUpperUpper(const Vec256 hi, const Vec256 lo) { - return Vec256{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)}; -} -template <> -HWY_INLINE Vec256 ConcatUpperUpper(const Vec256 hi, - const Vec256 lo) { - return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)}; -} -template <> -HWY_INLINE Vec256 ConcatUpperUpper(const Vec256 hi, - const Vec256 lo) { - return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)}; + return Vec256{_mm256_insertf128_pd(lo.raw, LowerHalf(hi).raw, 1)}; } // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) @@ -1927,6 +2125,12 @@ HWY_INLINE Vec256 ConcatUpperLow return Vec256{_mm256_blend_pd(hi.raw, lo.raw, 3)}; } +// hiH,hiL loH,loL |-> hiH,loH (= upper halves) +template +HWY_API Vec256 ConcatUpperUpper(const Vec256 hi, const Vec256 lo) { + return ConcatUpperLower(hi, ZeroExtendVector(UpperHalf(lo))); +} + // ------------------------------ Odd/even lanes namespace detail { @@ -2211,11 +2415,18 @@ HWY_API Vec128 DemoteTo(Full128< _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))}; } + // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'". + // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here. 
+HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion") + HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; } +HWY_DIAGNOSTICS(pop) + HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{_mm256_cvtpd_ps(v.raw)}; @@ -2241,7 +2452,7 @@ HWY_API Vec128 U8FromU32(con return BitCast(Simd(), pair); } -// ------------------------------ Convert integer <=> floating point +// ------------------------------ Integer <=> fp (ShiftRight, OddEven) HWY_API Vec256 ConvertTo(Full256 /* tag */, const Vec256 v) { @@ -2253,13 +2464,20 @@ HWY_API Vec256 ConvertTo(Full256 (void)dd; return Vec256{_mm256_cvtepi64_pd(v.raw)}; #else - alignas(32) int64_t lanes_i[4]; - Store(v, Full256(), lanes_i); - alignas(32) double lanes_d[4]; - for (size_t i = 0; i < 4; ++i) { - lanes_d[i] = static_cast(lanes_i[i]); - } - return Load(dd, lanes_d); + // Based on wim's approach (https://stackoverflow.com/questions/41144668/) + const Repartition d32; + const Repartition d64; + + // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 + const auto k84_63 = Set(d64, 0x4530000080000000ULL); + const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); + + // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) + const auto k52 = Set(d32, 0x43300000); + const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); + + const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); + return (v_upper - k84_63_52) + v_lower; // order matters! #endif } @@ -2334,8 +2552,7 @@ HWY_API uint64_t BitsFromMask(hwy::SizeT const auto compressed = _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0)); return static_cast(_mm256_movemask_epi8(compressed)); - -#endif +#endif // HWY_ARCH_X86_64 } template @@ -2473,75 +2690,100 @@ HWY_INLINE Vec256 Idx64x4FromB return Load(d32, packed_array + 8 * mask_bits); } -// Helper function called by both Compress and CompressStore - avoids a +// Helper functions called by both Compress and CompressStore - avoids a // redundant BitsFromMask in the latter. 
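The branch-free i64 -> f64 path above (wim's trick) encodes the low 32 bits as a double offset by 2^52 and the sign-adjusted high 32 bits as a double offset by 2^84 + 2^63, then removes both offsets with one magic constant so that hi*2^32 + lo remains. A scalar sketch of the same bit manipulation (plain C++, assuming IEEE-754 doubles; BitsToDouble/I64ToF64 are illustrative helpers):

#include <cstdint>
#include <cstring>

static inline double BitsToDouble(uint64_t bits) {
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d;
}

// Should match static_cast<double>(x), i.e. the correctly rounded conversion;
// exact whenever |x| < 2^53.
static inline double I64ToF64(int64_t x) {
  const uint64_t u = static_cast<uint64_t>(x);
  // Low 32 bits with exponent 2^52: value is 2^52 + lo.
  const double v_lower = BitsToDouble(0x4330000000000000ULL | (u & 0xFFFFFFFFu));
  // High 32 bits, sign bit toggled, with exponent 2^84:
  // value is 2^84 + 2^63 + hi*2^32 (hi interpreted as signed).
  const double v_upper = BitsToDouble(0x4530000080000000ULL ^ (u >> 32));
  // Subtract 2^84 + 2^63 + 2^52 so only hi*2^32 + lo remains.
  const double k84_63_52 = BitsToDouble(0x4530000080100000ULL);
  return (v_upper - k84_63_52) + v_lower;  // order matters
}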
-HWY_API Vec256 Compress(Vec256 v, - const uint64_t mask_bits) { -#if HWY_TARGET == HWY_AVX3 - return Vec256{ - _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)}; -#else - const Vec256 idx = detail::Idx32x8FromBits(mask_bits); - return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; -#endif -} -HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { +template +HWY_API Vec256 Compress(hwy::SizeTag<4> /*tag*/, Vec256 v, + const uint64_t mask_bits) { + const auto vu = BitCast(Full256(), v); #if HWY_TARGET == HWY_AVX3 - return Vec256{ - _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), v.raw)}; + const __m256i ret = + _mm256_maskz_compress_epi32(static_cast<__mmask8>(mask_bits), vu.raw); #else const Vec256 idx = detail::Idx32x8FromBits(mask_bits); - return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; + const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw); #endif + return BitCast(Full256(), Vec256{ret}); } -HWY_API Vec256 Compress(Vec256 v, - const uint64_t mask_bits) { -#if HWY_TARGET == HWY_AVX3 - return Vec256{ - _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)}; -#else - const Vec256 idx = detail::Idx64x4FromBits(mask_bits); - return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; -#endif -} -HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { +template +HWY_API Vec256 Compress(hwy::SizeTag<8> /*tag*/, Vec256 v, + const uint64_t mask_bits) { + const auto vu = BitCast(Full256(), v); #if HWY_TARGET == HWY_AVX3 - return Vec256{ - _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), v.raw)}; + const __m256i ret = + _mm256_maskz_compress_epi64(static_cast<__mmask8>(mask_bits), vu.raw); #else const Vec256 idx = detail::Idx64x4FromBits(mask_bits); - return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; + const __m256i ret = _mm256_permutevar8x32_epi32(vu.raw, idx.raw); #endif + return BitCast(Full256(), Vec256{ret}); } -HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { -#if HWY_TARGET == HWY_AVX3 - return Vec256{ - _mm256_maskz_compress_ps(static_cast<__mmask8>(mask_bits), v.raw)}; -#else - const Vec256 idx = detail::Idx32x8FromBits(mask_bits); - return Vec256{_mm256_permutevar8x32_ps(v.raw, idx.raw)}; -#endif -} +// Otherwise, defined in x86_512-inl.h so it can use wider vectors. +#if HWY_TARGET != HWY_AVX3 -HWY_API Vec256 Compress(Vec256 v, const uint64_t mask_bits) { -#if HWY_TARGET == HWY_AVX3 - return Vec256{ - _mm256_maskz_compress_pd(static_cast<__mmask8>(mask_bits), v.raw)}; -#else - const Vec256 idx = detail::Idx64x4FromBits(mask_bits); - return Vec256{_mm256_castsi256_pd( - _mm256_permutevar8x32_epi32(_mm256_castpd_si256(v.raw), idx.raw))}; -#endif +// LUTs are infeasible for 2^16 possible masks. Promoting to 32-bit and using +// the native Compress is probably more efficient than 2 LUTs. 
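For reference, the operation every variant here implements is simple in scalar terms: lanes whose mask bit is set are packed toward lane 0 (the AVX-512 maskz forms zero the remaining lanes; the table-based paths leave them unspecified). A scalar sketch, assuming bit i of mask_bits corresponds to lane i:

#include <cstddef>
#include <cstdint>

// Writes the selected lanes of `in` contiguously to the front of `out` and
// returns how many were written (the count CompressStore also returns).
template <typename T, size_t N>
size_t CompressScalar(const T (&in)[N], uint64_t mask_bits, T (&out)[N]) {
  size_t count = 0;
  for (size_t i = 0; i < N; ++i) {
    if ((mask_bits >> i) & 1) {
      out[count++] = in[i];
    }
  }
  return count;
}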
+template +HWY_API Vec256 Compress(hwy::SizeTag<2> /*tag*/, Vec256 v, + const uint64_t mask_bits) { + using D = Full256; + const Rebind du; + const Repartition dw; + const auto vu16 = BitCast(du, v); // (required for float16_t inputs) + const auto promoted0 = PromoteTo(dw, LowerHalf(vu16)); + const auto promoted1 = PromoteTo(dw, UpperHalf(vu16)); + + const uint64_t mask_bits0 = mask_bits & 0xFF; + const uint64_t mask_bits1 = mask_bits >> 8; + const auto compressed0 = Compress(hwy::SizeTag<4>(), promoted0, mask_bits0); + const auto compressed1 = Compress(hwy::SizeTag<4>(), promoted1, mask_bits1); + + const Half dh; + const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0)); + const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1)); + + const size_t count0 = PopCount(mask_bits0); + // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with + // VPERMD for shifting at 4 byte granularity. + alignas(32) constexpr int32_t iota4[16] = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7}; + const auto indices = SetTableIndices(dw, iota4 + 8 - count0 / 2); + const auto shift1_multiple4 = + BitCast(du, TableLookupLanes(BitCast(dw, demoted1), indices)); + + // Whole-register unconditional shift by 2 bytes. + // TODO(janwas): slow on AMD, use 2 shifts + permq + OR instead? + const __m256i lo_zz = _mm256_permute2x128_si256(shift1_multiple4.raw, + shift1_multiple4.raw, 0x08); + const auto shift1_multiple2 = + Vec256{_mm256_alignr_epi8(shift1_multiple4.raw, lo_zz, 14)}; + + // Make the shift conditional on the lower bit of count0. + const auto m_odd = TestBit(Set(du, count0), Set(du, 1)); + const auto shifted1 = IfThenElse(m_odd, shift1_multiple2, shift1_multiple4); + + // Blend the lower and shifted upper parts. + constexpr uint16_t on = 0xFFFF; + alignas(32) constexpr uint16_t lower_lanes[32] = {HWY_REP4(on), HWY_REP4(on), + HWY_REP4(on), HWY_REP4(on)}; + const auto m_lower = MaskFromVec(LoadU(du, lower_lanes + 16 - count0)); + return BitCast(D(), IfThenElse(m_lower, demoted0, shifted1)); } +#endif // HWY_TARGET != HWY_AVX3 + } // namespace detail +// Otherwise, defined in x86_512-inl.h after detail::Compress. +#if HWY_TARGET != HWY_AVX3 + template HWY_API Vec256 Compress(Vec256 v, const Mask256 mask) { - return detail::Compress(v, detail::BitsFromMask(mask)); + return detail::Compress(hwy::SizeTag(), v, + detail::BitsFromMask(mask)); } // ------------------------------ CompressStore @@ -2550,10 +2792,101 @@ template HWY_API size_t CompressStore(Vec256 v, const Mask256 mask, Full256 d, T* HWY_RESTRICT aligned) { const uint64_t mask_bits = detail::BitsFromMask(mask); - Store(detail::Compress(v, mask_bits), d, aligned); + // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but + // using StoreU to concatenate the results would cause page faults if + // `aligned` is the last valid vector. Instead rely on in-register splicing. + Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); return PopCount(mask_bits); } +#endif // HWY_TARGET != HWY_AVX3 + +// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, +// TableLookupBytes, ConcatUpperLower) + +HWY_API void StoreInterleaved3(const Vec256 v0, + const Vec256 v1, + const Vec256 v2, Full256 d, + uint8_t* HWY_RESTRICT unaligned) { + const auto k5 = Set(d, 5); + const auto k6 = Set(d, 6); + + // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0]. + // 0x80 so lanes to be filled from other vectors are 0 for blending. 
+ alignas(16) static constexpr uint8_t tbl_r0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_g0[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + const auto shuf_r0 = LoadDup128(d, tbl_r0); + const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5 + const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); + const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0 + const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0. + const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0.. + const auto interleaved_10_00 = r0 | g0 | b0; + + // Second vector: g10,r10, bgr[9:6], b5,g5 + const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. + const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 + const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. + const auto r1 = TableLookupBytes(v0, shuf_r1); + const auto g1 = TableLookupBytes(v1, shuf_g1); + const auto b1 = TableLookupBytes(v2, shuf_b1); + const auto interleaved_15_05 = r1 | g1 | b1; + + // We want to write the lower halves of the interleaved vectors, then the + // upper halves. We could obtain 10_05 and 15_0A via ConcatUpperLower, but + // that would require two ununaligned stores. For the lower halves, we can + // merge two 128-bit stores for the same swizzling cost: + const auto out0 = ConcatLowerLower(interleaved_15_05, interleaved_10_00); + StoreU(out0, d, unaligned + 0 * 32); + + // Third vector: bgr[15:11], b10 + const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. + const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. + const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A + const auto r2 = TableLookupBytes(v0, shuf_r2); + const auto g2 = TableLookupBytes(v1, shuf_g2); + const auto b2 = TableLookupBytes(v2, shuf_b2); + const auto interleaved_1A_0A = r2 | g2 | b2; + + const auto out1 = ConcatUpperLower(interleaved_10_00, interleaved_1A_0A); + StoreU(out1, d, unaligned + 1 * 32); + + const auto out2 = ConcatUpperUpper(interleaved_1A_0A, interleaved_15_05); + StoreU(out2, d, unaligned + 2 * 32); +} + +// ------------------------------ StoreInterleaved4 + +HWY_API void StoreInterleaved4(const Vec256 v0, + const Vec256 v1, + const Vec256 v2, + const Vec256 v3, Full256 d, + uint8_t* HWY_RESTRICT unaligned) { + // let a,b,c,d denote v0..3. + const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 + const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 + const auto ba8 = ZipUpper(v0, v1); + const auto dc8 = ZipUpper(v2, v3); + const auto dcba_0 = ZipLower(ba0, dc0); // d..a13 d..a10 | d..a03 d..a00 + const auto dcba_4 = ZipUpper(ba0, dc0); // d..a17 d..a14 | d..a07 d..a04 + const auto dcba_8 = ZipLower(ba8, dc8); // d..a1B d..a18 | d..a0B d..a08 + const auto dcba_C = ZipUpper(ba8, dc8); // d..a1F d..a1C | d..a0F d..a0C + // Write lower halves, then upper. 
vperm2i128 is slow on Zen1 but we can + // efficiently combine two lower halves into 256 bits: + const auto out0 = BitCast(d, ConcatLowerLower(dcba_4, dcba_0)); + const auto out1 = BitCast(d, ConcatLowerLower(dcba_C, dcba_8)); + StoreU(out0, d, unaligned + 0 * 32); + StoreU(out1, d, unaligned + 1 * 32); + const auto out2 = BitCast(d, ConcatUpperUpper(dcba_4, dcba_0)); + const auto out3 = BitCast(d, ConcatUpperUpper(dcba_C, dcba_8)); + StoreU(out2, d, unaligned + 2 * 32); + StoreU(out3, d, unaligned + 3 * 32); +} + // ------------------------------ Reductions namespace detail { diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_256-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h.12 2021-06-02 10:56:05.218904306 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -19,6 +19,23 @@ // particular, "Broadcast", pack and zip behavior may be surprising. #include // AVX2+ +#if defined(_MSC_VER) && defined(__clang__) +// Including should be enough, but Clang's headers helpfully skip +// including these headers when _MSC_VER is defined, like when using clang-cl. +// Include these directly here. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include @@ -100,9 +117,8 @@ struct RawMask512<8> { // Mask register: one bit per lane. template class Mask512 { - using Raw = typename RawMask512::type; - public: + using Raw = typename RawMask512::type; Raw raw; }; @@ -167,23 +183,24 @@ HWY_API Vec512 Set(Full512{_mm512_set1_epi16(static_cast(t))}; // NOLINT } HWY_API Vec512 Set(Full512 /* tag */, const uint32_t t) { - return Vec512{_mm512_set1_epi32(static_cast(t))}; // NOLINT + return Vec512{_mm512_set1_epi32(static_cast(t))}; } HWY_API Vec512 Set(Full512 /* tag */, const uint64_t t) { return Vec512{ _mm512_set1_epi64(static_cast(t))}; // NOLINT } HWY_API Vec512 Set(Full512 /* tag */, const int8_t t) { - return Vec512{_mm512_set1_epi8(t)}; + return Vec512{_mm512_set1_epi8(static_cast(t))}; // NOLINT } HWY_API Vec512 Set(Full512 /* tag */, const int16_t t) { - return Vec512{_mm512_set1_epi16(t)}; + return Vec512{_mm512_set1_epi16(static_cast(t))}; // NOLINT } HWY_API Vec512 Set(Full512 /* tag */, const int32_t t) { return Vec512{_mm512_set1_epi32(t)}; } HWY_API Vec512 Set(Full512 /* tag */, const int64_t t) { - return Vec512{_mm512_set1_epi64(t)}; + return Vec512{ + _mm512_set1_epi64(static_cast(t))}; // NOLINT } HWY_API Vec512 Set(Full512 /* tag */, const float t) { return Vec512{_mm512_set1_ps(t)}; @@ -329,7 +346,45 @@ HWY_API Vec512 CopySignToAbs(const Ve return CopySign(abs, sign); } -// ------------------------------ Select/blend +// ------------------------------ FirstN + +// Possibilities for constructing a bitmask of N ones: +// - kshift* only consider the lowest byte of the shift count, so they would +// not correctly handle large n. +// - Scalar shifts >= 64 are UB. +// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However, +// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds. + +#if HWY_ARCH_X86_32 +namespace detail { + +// 32 bit mask is sufficient for lane size >= 2. 
+template +HWY_API Mask512 FirstN(size_t n) { + using Bits = typename Mask512::Raw; + return Mask512{static_cast(_bzhi_u32(~uint32_t(0), n))}; +} + +template +HWY_API Mask512 FirstN(size_t n) { + const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0); + return Mask512{static_cast<__mmask64>(bits)}; +} + +} // namespace detail +#endif // HWY_ARCH_X86_32 + +template +HWY_API Mask512 FirstN(const Full512 /*tag*/, size_t n) { +#if HWY_ARCH_X86_64 + using Bits = typename Mask512::Raw; + return Mask512{static_cast(_bzhi_u64(~uint64_t(0), n))}; +#else + return detail::FirstN(n); +#endif // HWY_ARCH_X86_64 +} + +// ------------------------------ IfThenElse // Returns mask ? b : a. @@ -626,7 +681,13 @@ HWY_API Vec512 AverageRound(co // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. HWY_API Vec512 Abs(const Vec512 v) { +#if HWY_COMPILER_MSVC + // Workaround for incorrect codegen? (untested due to internal compiler error) + const auto zero = Zero(Full512()); + return Vec512{_mm512_max_epi8(v.raw, (zero - v).raw)}; +#else return Vec512{_mm512_abs_epi8(v.raw)}; +#endif } HWY_API Vec512 Abs(const Vec512 v) { return Vec512{_mm512_abs_epi16(v.raw)}; @@ -634,6 +695,9 @@ HWY_API Vec512 Abs(const Vec512 HWY_API Vec512 Abs(const Vec512 v) { return Vec512{_mm512_abs_epi32(v.raw)}; } +HWY_API Vec512 Abs(const Vec512 v) { + return Vec512{_mm512_abs_epi64(v.raw)}; +} // These aren't native instructions, they also involve AND with constant. HWY_API Vec512 Abs(const Vec512 v) { @@ -675,6 +739,16 @@ HWY_API Vec512 ShiftLeft(const return Vec512{_mm512_slli_epi64(v.raw, kBits)}; } +template +HWY_API Vec512 ShiftLeft(const Vec512 v) { + const Full512 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); + return kBits == 1 + ? (v + v) + : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); +} + // ------------------------------ ShiftRight template @@ -693,6 +767,14 @@ HWY_API Vec512 ShiftRight(cons } template +HWY_API Vec512 ShiftRight(const Vec512 v) { + const Full512 d8; + // Use raw instead of BitCast to support N=1. 
+ const Vec512 shifted{ShiftRight(Vec512{v.raw}).raw}; + return shifted & Set(d8, 0xFF >> kBits); +} + +template HWY_API Vec512 ShiftRight(const Vec512 v) { return Vec512{_mm512_srai_epi16(v.raw, kBits)}; } @@ -707,6 +789,15 @@ HWY_API Vec512 ShiftRight(const return Vec512{_mm512_srai_epi64(v.raw, kBits)}; } +template +HWY_API Vec512 ShiftRight(const Vec512 v) { + const Full512 di; + const Full512 du; + const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + // ------------------------------ ShiftLeftSame HWY_API Vec512 ShiftLeftSame(const Vec512 v, @@ -734,6 +825,14 @@ HWY_API Vec512 ShiftLeftSame(co return Vec512{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; } +template +HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { + const Full512 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); + return shifted & Set(d8, (0xFF << bits) & 0xFF); +} + // ------------------------------ ShiftRightSame HWY_API Vec512 ShiftRightSame(const Vec512 v, @@ -749,6 +848,13 @@ HWY_API Vec512 ShiftRightSame( return Vec512{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; } +HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { + const Full512 d8; + const RepartitionToWide d16; + const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); + return shifted & Set(d8, 0xFF >> bits); +} + HWY_API Vec512 ShiftRightSame(const Vec512 v, const int bits) { return Vec512{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; @@ -763,6 +869,14 @@ HWY_API Vec512 ShiftRightSame(c return Vec512{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; } +HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { + const Full512 di; + const Full512 du; + const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); + const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); + return (shifted ^ shifted_sign) - shifted_sign; +} + // ------------------------------ Shl HWY_API Vec512 operator<<(const Vec512 v, @@ -1046,6 +1160,10 @@ HWY_API Vec512 ApproximateRecipro // ------------------------------ Floating-point rounding +// Work around warnings in the intrinsic definitions (passing -1 as a mask). +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + // Toward nearest integer, tie to even HWY_API Vec512 Round(const Vec512 v) { return Vec512{_mm512_roundscale_ps( @@ -1086,6 +1204,8 @@ HWY_API Vec512 Floor(const Vec51 _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; } +HWY_DIAGNOSTICS(pop) + // ================================================== COMPARE // Comparisons set a mask bit to 1 if the condition is true, else 0. @@ -1678,6 +1798,83 @@ HWY_API void Stream(const Vec512 _mm512_stream_pd(aligned, v.raw); } +// ------------------------------ Scatter + +// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
+HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + +namespace detail { + +template +HWY_API void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512 v, + Full512 /* tag */, T* HWY_RESTRICT base, + const Vec512 offset) { + _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1); +} +template +HWY_API void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512 v, + Full512 /* tag */, T* HWY_RESTRICT base, + const Vec512 index) { + _mm512_i32scatter_epi32(base, index.raw, v.raw, 4); +} + +template +HWY_API void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512 v, + Full512 /* tag */, T* HWY_RESTRICT base, + const Vec512 offset) { + _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1); +} +template +HWY_API void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512 v, + Full512 /* tag */, T* HWY_RESTRICT base, + const Vec512 index) { + _mm512_i64scatter_epi64(base, index.raw, v.raw, 8); +} + +} // namespace detail + +template +HWY_API void ScatterOffset(Vec512 v, Full512 d, T* HWY_RESTRICT base, + const Vec512 offset) { + static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); + return detail::ScatterOffset(hwy::SizeTag(), v, d, base, offset); +} +template +HWY_API void ScatterIndex(Vec512 v, Full512 d, T* HWY_RESTRICT base, + const Vec512 index) { + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); + return detail::ScatterIndex(hwy::SizeTag(), v, d, base, index); +} + +template <> +HWY_INLINE void ScatterOffset(Vec512 v, Full512 /* tag */, + float* HWY_RESTRICT base, + const Vec512 offset) { + _mm512_i32scatter_ps(base, offset.raw, v.raw, 1); +} +template <> +HWY_INLINE void ScatterIndex(Vec512 v, Full512 /* tag */, + float* HWY_RESTRICT base, + const Vec512 index) { + _mm512_i32scatter_ps(base, index.raw, v.raw, 4); +} + +template <> +HWY_INLINE void ScatterOffset(Vec512 v, + Full512 /* tag */, + double* HWY_RESTRICT base, + const Vec512 offset) { + _mm512_i64scatter_pd(base, offset.raw, v.raw, 1); +} +template <> +HWY_INLINE void ScatterIndex(Vec512 v, + Full512 /* tag */, + double* HWY_RESTRICT base, + const Vec512 index) { + _mm512_i64scatter_pd(base, index.raw, v.raw, 8); +} + // ------------------------------ Gather namespace detail { @@ -1713,13 +1910,13 @@ HWY_API Vec512 GatherIndex(hwy::SizeT template HWY_API Vec512 GatherOffset(Full512 d, const T* HWY_RESTRICT base, const Vec512 offset) { - static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs"); +static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); return detail::GatherOffset(hwy::SizeTag(), d, base, offset); } template HWY_API Vec512 GatherIndex(Full512 d, const T* HWY_RESTRICT base, const Vec512 index) { - static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx"); + static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); return detail::GatherIndex(hwy::SizeTag(), d, base, index); } @@ -1749,6 +1946,8 @@ HWY_INLINE Vec512 GatherIndex{_mm512_i64gather_pd(index.raw, base, 8)}; } +HWY_DIAGNOSTICS(pop) + // ================================================== SWIZZLE template @@ -2439,7 +2638,11 @@ HWY_API Vec256 DemoteTo(Full256< HWY_API Vec256 DemoteTo(Full256 /* tag */, const Vec512 v) { + // Work around warnings in the intrinsic definitions (passing -1 as a mask). 
+ HWY_DIAGNOSTICS(push) + HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") return Vec256{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; + HWY_DIAGNOSTICS(pop) } HWY_API Vec256 DemoteTo(Full256 /* tag */, @@ -2633,8 +2836,81 @@ HWY_API Vec512 Compress(Vec512{_mm512_maskz_compress_pd(mask.raw, v.raw)}; } +namespace detail { + +// Ignore IDE redefinition error for these two functions: if this header is +// included, then the functions weren't actually defined in x86_256-inl.h. +template +HWY_API Vec256 Compress(hwy::SizeTag<2> /*tag*/, Vec256 v, + const uint64_t mask_bits) { + using D = Full256; + const Rebind du; + const Rebind dw; // 512-bit, not 256! + const auto vu16 = BitCast(du, v); // (required for float16_t inputs) + const Mask512 mask{static_cast<__mmask16>(mask_bits)}; + return BitCast(D(), DemoteTo(du, Compress(PromoteTo(dw, vu16), mask))); +} + +} // namespace detail + +template +HWY_API Vec256 Compress(Vec256 v, const Mask256 mask) { + return detail::Compress(hwy::SizeTag(), v, + detail::BitsFromMask(mask)); +} + +// Expands to 32-bit, compresses, concatenate demoted halves. +template +HWY_API Vec512 Compress(Vec512 v, const Mask512 mask) { + using D = Full512; + const Rebind du; + const Repartition dw; + const auto vu16 = BitCast(du, v); // (required for float16_t inputs) + const auto promoted0 = PromoteTo(dw, LowerHalf(vu16)); + const auto promoted1 = PromoteTo(dw, UpperHalf(vu16)); + + const Mask512 mask0{static_cast<__mmask16>(mask.raw & 0xFFFF)}; + const Mask512 mask1{static_cast<__mmask16>(mask.raw >> 16)}; + const auto compressed0 = Compress(promoted0, mask0); + const auto compressed1 = Compress(promoted1, mask1); + + const Half dh; + const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0)); + const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1)); + + // Concatenate into single vector by shifting upper with writemask. + const size_t num0 = CountTrue(mask0); + const __mmask32 m_upper = ~((1u << num0) - 1); + alignas(64) uint16_t iota[64] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + const auto idx = LoadU(du, iota + 32 - num0); + return Vec512{_mm512_mask_permutexvar_epi16(demoted0.raw, m_upper, idx.raw, + demoted1.raw)}; +} + // ------------------------------ CompressStore +template +HWY_API size_t CompressStore(Vec256 v, const Mask256 mask, Full256 d, + T* HWY_RESTRICT aligned) { + const uint64_t mask_bits = detail::BitsFromMask(mask); + Store(detail::Compress(hwy::SizeTag(), v, mask_bits), d, aligned); + return PopCount(mask_bits); +} + +template +HWY_API size_t CompressStore(Vec512 v, const Mask512 mask, Full512 d, + T* HWY_RESTRICT aligned) { + // NOTE: it is tempting to split inputs into two halves for 16-bit lanes, but + // using StoreU to concatenate the results would cause page faults if + // `aligned` is the last valid vector. Instead rely on in-register splicing. + Store(Compress(v, mask), d, aligned); + return CountTrue(mask); +} + HWY_API size_t CompressStore(Vec512 v, const Mask512 mask, Full512 /* tag */, uint32_t* HWY_RESTRICT aligned) { @@ -2675,6 +2951,98 @@ HWY_API size_t CompressStore(Vec512 a, const Vec512 b, + const Vec512 c, Full512 d, + uint8_t* HWY_RESTRICT unaligned) { + const auto k5 = Set(d, 5); + const auto k6 = Set(d, 6); + + // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. 
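// Illustrative scalar reference for the result this function must produce
// (not part of the patch): bytes of a, b, c are written as a0 b0 c0 a1 b1 c1
// ..., i.e. 3*N contiguous bytes for N input lanes.
const auto scalar_interleave3 = [](const uint8_t* ap, const uint8_t* bp,
                                   const uint8_t* cp, size_t n, uint8_t* out) {
  for (size_t i = 0; i < n; ++i) {
    out[3 * i + 0] = ap[i];
    out[3 * i + 1] = bp[i];
    out[3 * i + 2] = cp[i];
  }
};
// The shuffles below achieve the same layout without leaving the registers.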
+ // 0x80 so lanes to be filled from other vectors are 0 for blending. + alignas(16) static constexpr uint8_t tbl_r0[16] = { + 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // + 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; + alignas(16) static constexpr uint8_t tbl_g0[16] = { + 0x80, 0, 0x80, 0x80, 1, 0x80, // + 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; + const auto shuf_r0 = LoadDup128(d, tbl_r0); + const auto shuf_g0 = LoadDup128(d, tbl_g0); // cannot reuse r0 due to 5 + const auto shuf_b0 = CombineShiftRightBytes<15>(shuf_g0, shuf_g0); + const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0 + const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0. + const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0.. + const auto i = (r0 | g0 | b0).raw; // low byte in each 128bit: 30 20 10 00 + + // Second vector: g10,r10, bgr[9:6], b5,g5 + const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. + const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 + const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. + const auto r1 = TableLookupBytes(a, shuf_r1); + const auto g1 = TableLookupBytes(b, shuf_g1); + const auto b1 = TableLookupBytes(c, shuf_b1); + const auto j = (r1 | g1 | b1).raw; // low byte in each 128bit: 35 25 15 05 + + // Third vector: bgr[15:11], b10 + const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. + const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. + const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A + const auto r2 = TableLookupBytes(a, shuf_r2); + const auto g2 = TableLookupBytes(b, shuf_g2); + const auto b2 = TableLookupBytes(c, shuf_b2); + const auto k = (r2 | g2 | b2).raw; // low byte in each 128bit: 3A 2A 1A 0A + + // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns". + const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0)); + const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1)); + const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2)); + + // Alternating order, most-significant 128 bits from the second arg. + const __mmask8 m = 0xCC; + const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1); + const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2); + const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0); + + StoreU(Vec512{i1_k0_j0_i0}, d, unaligned + 0 * 64); // 10 0A 05 00 + StoreU(Vec512{j2_i2_k1_j1}, d, unaligned + 1 * 64); // 25 20 1A 15 + StoreU(Vec512{k3_j3_i3_k2}, d, unaligned + 2 * 64); // 3A 35 30 2A +} + +// ------------------------------ StoreInterleaved4 + +HWY_API void StoreInterleaved4(const Vec512 v0, + const Vec512 v1, + const Vec512 v2, + const Vec512 v3, Full512 d, + uint8_t* HWY_RESTRICT unaligned) { + // let a,b,c,d denote v0..3. + const auto ba0 = ZipLower(v0, v1); // b7 a7 .. b0 a0 + const auto dc0 = ZipLower(v2, v3); // d7 c7 .. d0 c0 + const auto ba8 = ZipUpper(v0, v1); + const auto dc8 = ZipUpper(v2, v3); + const auto i = ZipLower(ba0, dc0).raw; // 4x128bit: d..a3 d..a0 + const auto j = ZipUpper(ba0, dc0).raw; // 4x128bit: d..a7 d..a4 + const auto k = ZipLower(ba8, dc8).raw; // 4x128bit: d..aB d..a8 + const auto l = ZipUpper(ba8, dc8).raw; // 4x128bit: d..aF d..aC + // 128-bit blocks were independent until now; transpose 4x4. 
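// Illustrative scalar reference for the end result (not part of the patch):
// bytes of v0..v3 are written as v0[0] v1[0] v2[0] v3[0] v0[1] ... so that
// each group of four output bytes holds one lane from every input.
const auto scalar_interleave4 = [](const uint8_t* p0, const uint8_t* p1,
                                   const uint8_t* p2, const uint8_t* p3,
                                   size_t n, uint8_t* out) {
  for (size_t i = 0; i < n; ++i) {
    out[4 * i + 0] = p0[i];
    out[4 * i + 1] = p1[i];
    out[4 * i + 2] = p2[i];
    out[4 * i + 3] = p3[i];
  }
};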
+ const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0)); + const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0)); + const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2)); + const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2)); + constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0); + constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1); + const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20); + const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31); + const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20); + const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31); + StoreU(Vec512{l0_k0_j0_i0}, d, unaligned + 0 * 64); + StoreU(Vec512{l1_k1_j1_i1}, d, unaligned + 1 * 64); + StoreU(Vec512{l2_k2_j2_i2}, d, unaligned + 2 * 64); + StoreU(Vec512{l3_k3_j3_i3}, d, unaligned + 3 * 64); +} + // ------------------------------ Reductions // Returns the sum in each lane. diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/ops/x86_512-inl.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc.12 2021-06-02 10:56:05.281904625 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.cc 2021-05-31 10:37:11.000000000 -0400 @@ -28,12 +28,12 @@ #if HWY_ARCH_X86 #include -#ifdef _MSC_VER +#if HWY_COMPILER_MSVC #include -#else +#else // HWY_COMPILER_MSVC #include -#endif -#endif +#endif // HWY_COMPILER_MSVC +#endif // HWY_ARCH_X86 namespace hwy { namespace { @@ -48,13 +48,13 @@ bool IsBitSet(const uint32_t reg, const // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). void Cpuid(const uint32_t level, const uint32_t count, uint32_t* HWY_RESTRICT abcd) { -#ifdef _MSC_VER +#if HWY_COMPILER_MSVC int regs[4]; __cpuidex(regs, level, count); for (int i = 0; i < 4; ++i) { abcd[i] = regs[i]; } -#else +#else // HWY_COMPILER_MSVC uint32_t a; uint32_t b; uint32_t c; @@ -64,22 +64,22 @@ void Cpuid(const uint32_t level, const u abcd[1] = b; abcd[2] = c; abcd[3] = d; -#endif +#endif // HWY_COMPILER_MSVC } // Returns the lower 32 bits of extended control register 0. // Requires CPU support for "OSXSAVE" (see below). 
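// Illustrative sketch of how callers typically interpret XCR0 (bit meanings
// are architectural; the exact checks used elsewhere in this file are not
// reproduced here): bit 1 = SSE/XMM state, bit 2 = AVX/YMM state,
// bits 5..7 = AVX-512 state.
static inline bool OsSavesAvxState(uint32_t xcr0) {
  return (xcr0 & 0x6) == 0x6;
}
static inline bool OsSavesAvx512State(uint32_t xcr0) {
  return (xcr0 & 0xE6) == 0xE6;  // AVX-512 state plus the SSE/AVX bits
}
// ReadXCR0() below supplies the value such checks consume.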
uint32_t ReadXCR0() { -#ifdef _MSC_VER +#if HWY_COMPILER_MSVC return static_cast(_xgetbv(0)); -#else +#else // HWY_COMPILER_MSVC uint32_t xcr0, xcr0_high; const uint32_t index = 0; asm volatile(".byte 0x0F, 0x01, 0xD0" : "=a"(xcr0), "=d"(xcr0_high) : "c"(index)); return xcr0; -#endif +#endif // HWY_COMPILER_MSVC } #endif // HWY_ARCH_X86 @@ -126,7 +126,7 @@ constexpr uint32_t kAVX512VL = 1u << 13; constexpr uint32_t kAVX512DQ = 1u << 14; constexpr uint32_t kAVX512BW = 1u << 15; constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW; -#endif +#endif // HWY_ARCH_X86 } // namespace diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h.12 2021-06-02 10:56:05.267904554 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets.h 2021-05-31 10:37:11.000000000 -0400 @@ -65,7 +65,9 @@ // HWY_MAX_DYNAMIC_TARGETS in total. #define HWY_HIGHEST_TARGET_BIT_X86 9 -// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium +#define HWY_SVE2 0x400 +#define HWY_SVE 0x800 +// 0x1000 reserved for Helium #define HWY_NEON 0x2000 #define HWY_HIGHEST_TARGET_BIT_ARM 13 @@ -90,6 +92,9 @@ // 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved #define HWY_SCALAR 0x20000000 + +#define HWY_HIGHEST_TARGET_BIT_SCALAR 29 + // Cannot use higher values, otherwise HWY_TARGETS computation might overflow. //------------------------------------------------------------------------------ @@ -106,25 +111,26 @@ #ifndef HWY_BROKEN_TARGETS // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid -// SSE4 codegen (msan failure), so disable all those targets. +// SSE4 codegen (possibly only for msan), so disable all those targets. #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) -// TODO: Disable all non-scalar targets for every build target once we have -// clang-7 enabled in our builders. -#ifdef MEMORY_SANITIZER #define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3) -#else -#define HWY_BROKEN_TARGETS 0 -#endif // This entails a major speed reduction, so warn unless the user explicitly // opts in to scalar-only. #if !defined(HWY_COMPILE_ONLY_SCALAR) #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.") #endif -// MSVC, or 32-bit may fail to compile AVX2/3. -#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32 +// 32-bit may fail to compile AVX2/3. +#elif HWY_ARCH_X86_32 #define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3) -#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds") + +// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16 +#elif HWY_COMPILER_MSVC != 0 +#define HWY_BROKEN_TARGETS (HWY_AVX3) + +// armv7be has not been tested and is not yet supported. +#elif HWY_ARCH_ARM_V7 && (defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN)) +#define HWY_BROKEN_TARGETS (HWY_NEON) #else #define HWY_BROKEN_TARGETS 0 @@ -145,53 +151,74 @@ // user to override this without any guarantee of success. #ifndef HWY_BASELINE_TARGETS -#ifdef __wasm_simd128__ +// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with +// HWY_TARGET == HWY_SCALAR. 
+ +#if HWY_ARCH_WASM && defined(__wasm_simd128__) #define HWY_BASELINE_WASM HWY_WASM #else #define HWY_BASELINE_WASM 0 #endif -#ifdef __VSX__ +// Avoid choosing the PPC target until we have an implementation. +#if HWY_ARCH_PPC && defined(__VSX__) && 0 #define HWY_BASELINE_PPC8 HWY_PPC8 #else #define HWY_BASELINE_PPC8 0 #endif -// GCC 4.5.4 only defines the former; 5.4 defines both. -#if defined(__ARM_NEON__) || defined(__ARM_NEON) +// Avoid choosing the SVE[2] targets the implementation is ready. +#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) && 0 +#define HWY_BASELINE_SVE2 HWY_SVE2 +#else +#define HWY_BASELINE_SVE2 0 +#endif + +#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE) && 0 +#define HWY_BASELINE_SVE HWY_SVE +#else +#define HWY_BASELINE_SVE 0 +#endif + +// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both. +#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON)) #define HWY_BASELINE_NEON HWY_NEON #else #define HWY_BASELINE_NEON 0 #endif -#ifdef __SSE4_1__ +// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means +// we at least get SSE4 on machines supporting AVX but not AVX2. +// https://stackoverflow.com/questions/18563978/ +#if HWY_ARCH_X86 && \ + (defined(__SSE4_1__) || (HWY_COMPILER_MSVC != 0 && defined(__AVX__))) #define HWY_BASELINE_SSE4 HWY_SSE4 #else #define HWY_BASELINE_SSE4 0 #endif -#ifdef __AVX2__ +#if HWY_ARCH_X86 && defined(__AVX2__) #define HWY_BASELINE_AVX2 HWY_AVX2 #else #define HWY_BASELINE_AVX2 0 #endif -#ifdef __AVX512F__ +#if HWY_ARCH_X86 && defined(__AVX512F__) #define HWY_BASELINE_AVX3 HWY_AVX3 #else #define HWY_BASELINE_AVX3 0 #endif -#ifdef __riscv_vector +#if HWY_ARCH_RVV && defined(__riscv_vector) #define HWY_BASELINE_RVV HWY_RVV #else #define HWY_BASELINE_RVV 0 #endif #define HWY_BASELINE_TARGETS \ - (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \ - HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \ - HWY_BASELINE_RVV) + (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \ + HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE4 | \ + HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | HWY_BASELINE_RVV) #endif // HWY_BASELINE_TARGETS @@ -242,13 +269,12 @@ #define HWY_TARGETS HWY_STATIC_TARGET // 3) For tests: include all attainable targets (in particular: scalar) -#elif defined(HWY_COMPILE_ALL_ATTAINABLE) +#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST) #define HWY_TARGETS HWY_ATTAINABLE_TARGETS // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by // excluding superseded targets, in particular scalar. #else - #define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1)) #endif // target policy @@ -323,6 +349,10 @@ static inline HWY_MAYBE_UNUSED const cha #endif #if HWY_ARCH_ARM + case HWY_SVE2: + return "SVE2"; + case HWY_SVE: + return "SVE"; case HWY_NEON: return "Neon"; #endif @@ -346,7 +376,7 @@ static inline HWY_MAYBE_UNUSED const cha return "Scalar"; default: - return "?"; + return "Unknown"; // must satisfy gtest IsValidParamName() } } @@ -405,21 +435,17 @@ static inline HWY_MAYBE_UNUSED const cha nullptr, /* SSE3 */ \ nullptr /* SSE2 */ -#endif // HWY_ARCH_X86 - -#if HWY_ARCH_ARM +#elif HWY_ARCH_ARM // See HWY_ARCH_X86 above for details. 
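// Illustrative sketch of the bit ordering these dispatch tables rely on (not
// part of the patch): better targets have lower bit values, so the default
// policy above, (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1)), keeps
// the baseline and everything better while dropping superseded targets.
static_assert((HWY_SVE2 | HWY_SVE) < HWY_NEON,
              "better ARM targets use lower bits");
static_assert((HWY_SCALAR & (2 * HWY_NEON - 1)) == 0,
              "a NEON baseline excludes the superseded scalar target");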
#define HWY_MAX_DYNAMIC_TARGETS 4 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM #define HWY_CHOOSE_TARGET_LIST(func_name) \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ + HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \ + HWY_CHOOSE_SVE(func_name), /* SVE */ \ nullptr, /* reserved */ \ HWY_CHOOSE_NEON(func_name) /* NEON */ -#endif // HWY_ARCH_ARM - -#if HWY_ARCH_PPC +#elif HWY_ARCH_PPC // See HWY_ARCH_X86 above for details. #define HWY_MAX_DYNAMIC_TARGETS 5 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC @@ -430,9 +456,7 @@ static inline HWY_MAYBE_UNUSED const cha nullptr, /* VSX */ \ nullptr /* AltiVec */ -#endif // HWY_ARCH_PPC - -#if HWY_ARCH_WASM +#elif HWY_ARCH_WASM // See HWY_ARCH_X86 above for details. #define HWY_MAX_DYNAMIC_TARGETS 4 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM @@ -442,9 +466,7 @@ static inline HWY_MAYBE_UNUSED const cha nullptr, /* reserved */ \ HWY_CHOOSE_WASM(func_name) /* WASM */ -#endif // HWY_ARCH_WASM - -#if HWY_ARCH_RVV +#elif HWY_ARCH_RVV // See HWY_ARCH_X86 above for details. #define HWY_MAX_DYNAMIC_TARGETS 4 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV @@ -454,7 +476,12 @@ static inline HWY_MAYBE_UNUSED const cha nullptr, /* reserved */ \ HWY_CHOOSE_RVV(func_name) /* RVV */ -#endif // HWY_ARCH_RVV +#else +// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though +// still creating single-entry tables in HWY_EXPORT to ensure portability. +#define HWY_MAX_DYNAMIC_TARGETS 1 +#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR +#endif struct ChosenTarget { public: diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets.hE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc.12 2021-06-02 10:56:05.264904539 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -35,19 +35,19 @@ DECLARE_FUNCTION(SCALAR) HWY_EXPORT(FakeFunction); void CheckFakeFunction() { -#define CHECK_ARRAY_ENTRY(TGT) \ - if ((HWY_TARGETS & HWY_##TGT) != 0) { \ - hwy::SetSupportedTargetsForTest(HWY_##TGT); \ - /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \ - /* the pointer to the already cached function. */ \ - hwy::chosen_target.Update(); \ - EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ - /* Calling DeInit() will test that the initializer function */ \ - /* also calls the right function. */ \ - hwy::chosen_target.DeInit(); \ - EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ - /* Second call uses the cached value from the previous call. */ \ - EXPECT_EQ(HWY_##TGT, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ +#define CHECK_ARRAY_ENTRY(TGT) \ + if ((HWY_TARGETS & HWY_##TGT) != 0) { \ + hwy::SetSupportedTargetsForTest(HWY_##TGT); \ + /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \ + /* the pointer to the already cached function. */ \ + hwy::chosen_target.Update(); \ + EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ + /* Calling DeInit() will test that the initializer function */ \ + /* also calls the right function. */ \ + hwy::chosen_target.DeInit(); \ + EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ + /* Second call uses the cached value from the previous call. 
*/ \ + EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ } CHECK_ARRAY_ENTRY(AVX3) CHECK_ARRAY_ENTRY(AVX2) diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/targets_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc.12 2021-06-02 10:56:05.251904473 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -16,7 +16,6 @@ #include #include -#include #include #undef HWY_TARGET_INCLUDE @@ -173,16 +172,8 @@ struct TestFloatAbs { }; HWY_NOINLINE void TestAllAbs() { - const ForPartialVectors test; - test(int8_t()); - test(int16_t()); - test(int32_t()); - - const ForPartialVectors test_float; - test_float(float()); -#if HWY_CAP_FLOAT64 - test_float(double()); -#endif + ForSignedTypes(ForPartialVectors()); + ForFloatTypes(ForPartialVectors()); } template @@ -199,6 +190,45 @@ struct TestLeftShifts { const size_t N = Lanes(d); auto expected = AllocateAligned(N); + const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // 0 + HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); + HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); + + // 1 + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? T(i) - T(N) : T(i); + expected[i] = T(TU(value) << 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); + + // max + for (size_t i = 0; i < N; ++i) { + const T value = kSigned ? T(i) - T(N) : T(i); + expected[i] = T(TU(value) << kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); + } +}; + +template +struct TestVariableLeftShifts { + template + HWY_NOINLINE void operator()(T t, D d) { + if (kSigned) { + // Also test positive values + TestVariableLeftShifts()(t, d); + } + + using TI = MakeSigned; + using TU = MakeUnsigned; + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + const auto v0 = Zero(d); const auto v1 = Set(d, 1); const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift @@ -209,8 +239,6 @@ struct TestLeftShifts { const auto large_shifts = max_shift - small_shifts; // Same: 0 - HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); - HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0)); // Same: 1 @@ -218,8 +246,6 @@ struct TestLeftShifts { const T value = kSigned ? T(i) - T(N) : T(i); expected[i] = T(TU(value) << 1); } - HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); - HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1)); // Same: max @@ -227,8 +253,6 @@ struct TestLeftShifts { const T value = kSigned ? 
T(i) - T(N) : T(i); expected[i] = T(TU(value) << kMaxShift); } - HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft(values)); - HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift)); // Variable: small @@ -252,6 +276,37 @@ struct TestUnsignedRightShifts { const size_t N = Lanes(d); auto expected = AllocateAligned(N); + const auto values = Iota(d, 0); + + const T kMax = LimitsMax(); + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // Shift by 0 + HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); + HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); + + // Shift by 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); + + // max + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> kMaxShift); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift)); + } +}; + +struct TestVariableUnsignedRightShifts { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + const auto v0 = Zero(d); const auto v1 = Set(d, 1); const auto values = Iota(d, 0); @@ -265,21 +320,15 @@ struct TestUnsignedRightShifts { const auto large_shifts = max_shift - small_shifts; // Same: 0 - HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); - HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0)); // Same: 1 for (size_t i = 0; i < N; ++i) { - expected[i] = T(i >> 1); + expected[i] = T(T(i & kMax) >> 1); } - HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); - HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1)); // Same: max - HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(values)); - HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift)); // Variable: small @@ -296,33 +345,120 @@ struct TestUnsignedRightShifts { } }; -struct TestSignedRightShifts { +template +T RightShiftNegative(T val) { + // C++ shifts are implementation-defined for negative numbers, and we have + // seen divisions replaced with shifts, so resort to bit operations. + using TU = hwy::MakeUnsigned; + TU bits; + CopyBytes(&val, &bits); + + const TU shifted = bits >> kAmount; + + const TU all = ~TU(0); + const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount; + const TU sign_extended = static_cast((all << num_zero) & LimitsMax()); + + bits = shifted | sign_extended; + CopyBytes(&bits, &val); + return val; +} + +class TestSignedRightShifts { + public: template - HWY_NOINLINE void operator()(T t, D d) { - // Also test positive values - TestUnsignedRightShifts()(t, d); + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + constexpr T kMin = LimitsMin(); + constexpr T kMax = LimitsMax(); + constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // First test positive values, negative are checked below. 
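// Illustrative alternative to RightShiftNegative above (not part of the
// patch): the same result via a logical shift plus sign extension, which is
// the (shifted ^ shifted_sign) - shifted_sign pattern used by the 8-bit SIMD
// ShiftRight implementations.
const auto arithmetic_shr8 = [](int8_t x, int k) {
  const uint8_t logical = static_cast<uint8_t>(static_cast<uint8_t>(x) >> k);
  const uint8_t sign = static_cast<uint8_t>(0x80 >> k);  // old sign-bit slot
  return static_cast<int8_t>((logical ^ sign) - sign);   // sign-extend
};
// arithmetic_shr8(int8_t(-2), 1) == -1, as does RightShiftNegative<1>.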
+ const auto v0 = Zero(d); + const auto values = Iota(d, 0) & Set(d, kMax); + + // Shift by 0 + HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); + HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); + + // Shift by 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); + + // max + HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(values)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); + + // Even negative value + Test<0>(kMin, d, __LINE__); + Test<1>(kMin, d, __LINE__); + Test<2>(kMin, d, __LINE__); + Test(kMin, d, __LINE__); + + const T odd = static_cast(kMin + 1); + Test<0>(odd, d, __LINE__); + Test<1>(odd, d, __LINE__); + Test<2>(odd, d, __LINE__); + Test(odd, d, __LINE__); + } + + private: + template + void Test(T val, D d, int line) { + const auto expected = Set(d, RightShiftNegative(val)); + const auto in = Set(d, val); + const char* file = __FILE__; + AssertVecEqual(d, expected, ShiftRight(in), file, line); + AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line); + } +}; +struct TestVariableSignedRightShifts { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { using TU = MakeUnsigned; const size_t N = Lanes(d); auto expected = AllocateAligned(N); constexpr T kMin = LimitsMin(); - const auto values = Iota(d, kMin); + constexpr T kMax = LimitsMax(); constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; + + // First test positive values, negative are checked below. + const auto v0 = Zero(d); + const auto positive = Iota(d, 0) & Set(d, kMax); + + // Shift by 0 + HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive)); + HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0)); + + // Shift by 1 + for (size_t i = 0; i < N; ++i) { + expected[i] = T(T(i & kMax) >> 1); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive)); + HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1)); + + // max + HWY_ASSERT_VEC_EQ(d, v0, ShiftRight(positive)); + HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift)); + const auto max_shift = Set(d, kMaxShift); const auto small_shifts = And(Iota(d, 0), max_shift); const auto large_shifts = max_shift - small_shifts; - // Test varying values to shift + const auto negative = Iota(d, kMin); + + // Test varying negative to shift for (size_t i = 0; i < N; ++i) { - // We want a right-shift here, which is undefined behavior for negative - // numbers. Since we want (-1)>>1 to be -1, we need to adjust rounding if - // minT is odd and negative. - T minT = static_cast(kMin + i); - expected[i] = T(minT / 2 + (minT < 0 ? 
minT % 2 : 0)); + expected[i] = RightShiftNegative<1>(static_cast(kMin + i)); } - HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, Set(d, 1))); + HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1))); // Shift MSB right by small amounts for (size_t i = 0; i < N; ++i) { @@ -343,6 +479,13 @@ struct TestSignedRightShifts { }; HWY_NOINLINE void TestAllShifts() { + ForUnsignedTypes(ForPartialVectors>()); + ForSignedTypes(ForPartialVectors>()); + ForUnsignedTypes(ForPartialVectors()); + ForSignedTypes(ForPartialVectors()); +} + +HWY_NOINLINE void TestAllVariableShifts() { const ForPartialVectors> shl_u; const ForPartialVectors> shl_s; const ForPartialVectors shr_u; @@ -821,6 +964,40 @@ HWY_NOINLINE void TestAllRound() { ForFloatTypes(ForPartialVectors()); } +struct TestNearestInt { + template + HWY_NOINLINE void operator()(TF tf, const DF df) { + using TI = MakeSigned; + const RebindToSigned di; + + size_t padded; + auto in = RoundTestCases(tf, df, padded); + auto expected = AllocateAligned(padded); + + constexpr double max = static_cast(LimitsMax()); + for (size_t i = 0; i < padded; ++i) { + if (std::isnan(in[i])) { + // We replace NaN with 0 below (no_nan) + expected[i] = 0; + } else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) { + // Avoid undefined result for lrintf + expected[i] = std::signbit(in[i]) ? LimitsMin() : LimitsMax(); + } else { + expected[i] = lrintf(in[i]); + } + } + for (size_t i = 0; i < padded; i += Lanes(df)) { + const auto v = Load(df, &in[i]); + const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df)); + HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan)); + } + } +}; + +HWY_NOINLINE void TestAllNearestInt() { + ForPartialVectors()(float()); +} + struct TestTrunc { template HWY_NOINLINE void operator()(T t, D d) { @@ -909,8 +1086,7 @@ struct TestSumOfLanes { }; HWY_NOINLINE void TestAllSumOfLanes() { - // Only full vectors because lanes in partial vectors are undefined. - const ForFullVectors sum; + const ForPartialVectors sum; // No u8/u16/i8/i16. sum(uint32_t()); @@ -976,9 +1152,8 @@ struct TestMaxOfLanes { }; HWY_NOINLINE void TestAllMinMaxOfLanes() { - // Only full vectors because lanes in partial vectors are undefined. - const ForFullVectors min; - const ForFullVectors max; + const ForPartialVectors min; + const ForPartialVectors max; // No u8/u16/i8/i16. 
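// Illustrative scalar reference for these reductions (not part of the patch):
// SumOfLanes ("sum in each lane" per the ops header) and MinOfLanes /
// MaxOfLanes broadcast the reduced value, so the expected vector is
// Set(d, reduction). For uint32_t:
const auto scalar_min = [](const uint32_t* lanes, size_t n) {
  uint32_t m = lanes[0];
  for (size_t i = 1; i < n; ++i) m = lanes[i] < m ? lanes[i] : m;
  return m;
};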
min(uint32_t()); @@ -1044,10 +1219,12 @@ HWY_NOINLINE void TestAllNeg() { HWY_AFTER_NAMESPACE(); #if HWY_ONCE +namespace hwy { HWY_BEFORE_TEST(HwyArithmeticTest); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs); @@ -1062,10 +1239,11 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRound); +HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNearestInt); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff); HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg); -HWY_AFTER_TEST(); +} // namespace hwy #endif diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/arithmetic_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc.12 2021-06-02 10:56:05.252904478 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -272,13 +272,14 @@ HWY_NOINLINE void TestAllCombineShiftRig HWY_AFTER_NAMESPACE(); #if HWY_ONCE +namespace hwy { HWY_BEFORE_TEST(HwyCombineTest); HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf); HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf); HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector); HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine); HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight); -HWY_AFTER_TEST(); +} // namespace hwy #endif #else diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/combine_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc.12 2021-06-02 10:56:05.249904463 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -206,11 +206,12 @@ HWY_NOINLINE void TestAllWeakFloat() { HWY_AFTER_NAMESPACE(); #if HWY_ONCE +namespace hwy { HWY_BEFORE_TEST(HwyCompareTest); HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask); HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality); HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt); HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat); HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat); -HWY_AFTER_TEST(); +} // namespace hwy #endif diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/compare_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12 
chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc.12 2021-06-02 10:56:05.261904523 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -16,8 +16,6 @@ #include #include -#include - #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "tests/convert_test.cc" #include "hwy/foreach_target.h" @@ -547,37 +545,6 @@ HWY_NOINLINE void TestAllI32F64() { #endif } -struct TestNearestInt { - template - HWY_NOINLINE void operator()(TI /*unused*/, const DI di) { - using TF = MakeFloat; - const Rebind df; - const size_t N = Lanes(df); - - // Integer positive - HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 4.0f))); - - // Integer negative - HWY_ASSERT_VEC_EQ(di, Iota(di, -32), NearestInt(Iota(df, -32.0f))); - - // Above positive - HWY_ASSERT_VEC_EQ(di, Iota(di, 2), NearestInt(Iota(df, 2.001f))); - - // Below positive - HWY_ASSERT_VEC_EQ(di, Iota(di, 4), NearestInt(Iota(df, 3.9999f))); - - const TF eps = static_cast(0.0001); - // Above negative - HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) + eps))); - - // Below negative - HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), NearestInt(Iota(df, -TF(N) - eps))); - } -}; - -HWY_NOINLINE void TestAllNearestInt() { - ForPartialVectors()(int32_t()); -} // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -585,6 +552,7 @@ HWY_NOINLINE void TestAllNearestInt() { HWY_AFTER_NAMESPACE(); #if HWY_ONCE +namespace hwy { HWY_BEFORE_TEST(HwyConvertTest); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo); @@ -596,6 +564,5 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, Te HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64); -HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllNearestInt); -HWY_AFTER_TEST(); +} // namespace hwy #endif diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/convert_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.cc diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/hwy_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.cc diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/list_targets.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc.12 2021-06-02 10:56:05.245904442 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -14,6 +14,7 @@ #include #include +#include // memcmp #include "hwy/base.h" @@ -159,6 +160,30 @@ HWY_NOINLINE void TestAllCopySign() { ForFloatTypes(ForPartialVectors()); } +struct TestFirstN { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto mask_lanes = AllocateAligned(N); + + 
// NOTE: reverse polarity (mask is true iff mask_lanes[i] == 0) because we + // cannot reliably compare against all bits set (NaN for float types). + const T off = 1; + + for (size_t len = 0; len <= N; ++len) { + for (size_t i = 0; i < N; ++i) { + mask_lanes[i] = i < len ? T(0) : off; + } + const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d)); + HWY_ASSERT_MASK_EQ(d, mask, FirstN(d, len)); + } + } +}; + +HWY_NOINLINE void TestAllFirstN() { + ForAllTypes(ForPartialVectors()); +} + struct TestIfThenElse { template HWY_NOINLINE void operator()(T /*unused*/, D d) { @@ -208,15 +233,56 @@ HWY_NOINLINE void TestAllIfThenElse() { ForAllTypes(ForPartialVectors()); } -// Also tests MaskFromVec/VecFromMask +struct TestMaskVec { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + const size_t N = Lanes(d); + auto mask_lanes = AllocateAligned(N); + + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < 100; ++rep) { + for (size_t i = 0; i < N; ++i) { + mask_lanes[i] = static_cast(Random32(&rng) & 1); + } + + const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d))); + HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); + } + } +}; + +HWY_NOINLINE void TestAllMaskVec() { + const ForPartialVectors test; + + test(uint16_t()); + test(int16_t()); + // TODO(janwas): float16_t - cannot compare yet + + test(uint32_t()); + test(int32_t()); + test(float()); + +#if HWY_CAP_INTEGER64 + test(uint64_t()); + test(int64_t()); +#endif +#if HWY_CAP_FLOAT64 + test(double()); +#endif +} + struct TestCompress { template HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; + using TU = MakeUnsigned; + const Rebind du; const size_t N = Lanes(d); auto in_lanes = AllocateAligned(N); - auto mask_lanes = AllocateAligned(N); + auto mask_lanes = AllocateAligned(N); auto expected = AllocateAligned(N); auto actual = AllocateAligned(N); @@ -224,35 +290,56 @@ struct TestCompress { for (size_t rep = 0; rep < 100; ++rep) { size_t expected_pos = 0; for (size_t i = 0; i < N; ++i) { - in_lanes[i] = static_cast(Random32(&rng)); - mask_lanes[i] = static_cast(Random32(&rng) & 1); + const uint64_t bits = Random32(&rng); + in_lanes[i] = T(); // cannot initialize float16_t directly. + CopyBytes(&bits, &in_lanes[i]); + mask_lanes[i] = static_cast(Random32(&rng) & 1); if (mask_lanes[i] == 0) { // Zero means true (easier to compare) expected[expected_pos++] = in_lanes[i]; } } const auto in = Load(d, in_lanes.get()); - const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d)); + const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du))); - HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); Store(Compress(in, mask), d, actual.get()); // Upper lanes are undefined. for (size_t i = 0; i < expected_pos; ++i) { - HWY_ASSERT(actual[i] == expected[i]); + HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0); } // Also check CompressStore in the same way. 
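// Scalar reference for what Compress/CompressStore must produce (sketch, not
// part of the patch): lanes whose mask is true are packed to the front in
// their original order; CompressStore additionally returns how many were
// packed, which is compared against num_written below.
const auto scalar_compress = [](const T* in, const bool* m, size_t n, T* out) {
  size_t pos = 0;
  for (size_t i = 0; i < n; ++i) {
    if (m[i]) out[pos++] = in[i];
  }
  return pos;
};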
- std::fill(actual.get(), actual.get() + N, T(0)); + memset(actual.get(), 0, N * sizeof(T)); const size_t num_written = CompressStore(in, mask, d, actual.get()); HWY_ASSERT_EQ(expected_pos, num_written); for (size_t i = 0; i < expected_pos; ++i) { - HWY_ASSERT_EQ(expected[i], actual[i]); + HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0); } } } }; #if 0 +namespace detail { // for code folding +void PrintCompress16x8Tables() { + constexpr size_t N = 8; // 128-bit SIMD + for (uint64_t code = 0; code < 1ull << N; ++code) { + std::array indices{0}; + size_t pos = 0; + for (size_t i = 0; i < N; ++i) { + if (code & (1ull << i)) { + indices[pos++] = i; + } + } + + // Doubled (for converting lane to byte indices) + for (size_t i = 0; i < N; ++i) { + printf("%d,", 2 * indices[i]); + } + } + printf("\n"); +} + // Compressed to nibbles void PrintCompress32x8Tables() { constexpr size_t N = 8; // AVX2 @@ -340,16 +427,22 @@ void PrintCompress64x2Tables() { } printf("\n"); } - +} // namespace detail #endif HWY_NOINLINE void TestAllCompress() { - // PrintCompress32x8Tables(); - // PrintCompress64x4Tables(); - // PrintCompress32x4Tables(); - // PrintCompress64x2Tables(); + // detail::PrintCompress32x8Tables(); + // detail::PrintCompress64x4Tables(); + // detail::PrintCompress32x4Tables(); + // detail::PrintCompress64x2Tables(); + // detail::PrintCompress16x8Tables(); const ForPartialVectors test; + + test(uint16_t()); + test(int16_t()); + test(float16_t()); + test(uint32_t()); test(int32_t()); test(float()); @@ -358,7 +451,6 @@ HWY_NOINLINE void TestAllCompress() { test(uint64_t()); test(int64_t()); #endif - #if HWY_CAP_FLOAT64 test(double()); #endif @@ -432,7 +524,7 @@ struct TestTestBit { }; HWY_NOINLINE void TestAllTestBit() { - ForIntegerTypes(ForFullVectors()); + ForIntegerTypes(ForPartialVectors()); } struct TestAllTrueFalse { @@ -445,6 +537,8 @@ struct TestAllTrueFalse { auto lanes = AllocateAligned(N); std::fill(lanes.get(), lanes.get() + N, T(0)); + auto mask_lanes = AllocateAligned(N); + HWY_ASSERT(AllTrue(Eq(v, zero))); HWY_ASSERT(!AllFalse(Eq(v, zero))); @@ -456,7 +550,13 @@ struct TestAllTrueFalse { for (size_t i = 0; i < N; ++i) { lanes[i] = T(1); v = Load(d, lanes.get()); - HWY_ASSERT(!AllTrue(Eq(v, zero))); + + // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be. + // Assigning to an lvalue is insufficient but storing to memory prevents + // the bug; so does Print of VecFromMask(d, Eq(v, zero)). 
+ Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get()); + HWY_ASSERT(!AllTrue(MaskFromVec(Load(d, mask_lanes.get())))); + HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero))); lanes[i] = T(-1); @@ -596,7 +696,7 @@ struct TestLogicalMask { }; HWY_NOINLINE void TestAllLogicalMask() { - ForAllTypes(ForFullVectors()); + ForAllTypes(ForPartialVectors()); } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -604,11 +704,14 @@ HWY_NOINLINE void TestAllLogicalMask() { HWY_AFTER_NAMESPACE(); #if HWY_ONCE +namespace hwy { HWY_BEFORE_TEST(HwyLogicalTest); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllFirstN); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit); @@ -617,5 +720,5 @@ HWY_EXPORT_AND_TEST_P(HwyLogicalTest, Te HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask); -HWY_AFTER_TEST(); +} // namespace hwy #endif diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/logical_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc.12 2021-06-02 10:56:05.247904453 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Ensure incompabilities with Windows macros (e.g. #define StoreFence) are +// detected. Must come before Highway headers. 
+#if defined(_WIN32) || defined(_WIN64) +#include +#endif + #include #include @@ -76,6 +82,119 @@ HWY_NOINLINE void TestAllLoadStore() { ForAllTypes(ForPartialVectors()); } +struct TestStoreInterleaved3 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + + RandomState rng; + + // Data to be interleaved + auto bytes = AllocateAligned(3 * N); + for (size_t i = 0; i < 3 * N; ++i) { + bytes[i] = static_cast(Random32(&rng) & 0xFF); + } + const auto in0 = Load(d, &bytes[0 * N]); + const auto in1 = Load(d, &bytes[1 * N]); + const auto in2 = Load(d, &bytes[2 * N]); + + // Interleave here, ensure vector results match scalar + auto expected = AllocateAligned(4 * N); + auto actual_aligned = AllocateAligned(4 * N + 1); + T* actual = actual_aligned.get() + 1; + + for (size_t rep = 0; rep < 100; ++rep) { + for (size_t i = 0; i < N; ++i) { + expected[3 * i + 0] = bytes[0 * N + i]; + expected[3 * i + 1] = bytes[1 * N + i]; + expected[3 * i + 2] = bytes[2 * N + i]; + // Ensure we do not write more than 3*N bytes + expected[3 * N + i] = actual[3 * N + i] = 0; + } + StoreInterleaved3(in0, in1, in2, d, actual); + size_t pos = 0; + if (!BytesEqual(expected.get(), actual, 4 * N, &pos)) { + Print(d, "in0", in0, pos / 3); + Print(d, "in1", in1, pos / 3); + Print(d, "in2", in2, pos / 3); + const size_t i = pos - pos % 3; + fprintf(stderr, "interleaved %d %d %d %d %d %d\n", actual[i], + actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4], + actual[i + 5]); + HWY_ASSERT(false); + } + } + } +}; + +HWY_NOINLINE void TestAllStoreInterleaved3() { +#if HWY_TARGET == HWY_RVV + // Segments are limited to 8 registers, so we can only go up to LMUL=2. + const ForExtendableVectors test; +#else + const ForPartialVectors test; +#endif + test(uint8_t()); +} + +struct TestStoreInterleaved4 { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + + RandomState rng; + + // Data to be interleaved + auto bytes = AllocateAligned(4 * N); + for (size_t i = 0; i < 4 * N; ++i) { + bytes[i] = static_cast(Random32(&rng) & 0xFF); + } + const auto in0 = Load(d, &bytes[0 * N]); + const auto in1 = Load(d, &bytes[1 * N]); + const auto in2 = Load(d, &bytes[2 * N]); + const auto in3 = Load(d, &bytes[3 * N]); + + // Interleave here, ensure vector results match scalar + auto expected = AllocateAligned(5 * N); + auto actual_aligned = AllocateAligned(5 * N + 1); + T* actual = actual_aligned.get() + 1; + + for (size_t rep = 0; rep < 100; ++rep) { + for (size_t i = 0; i < N; ++i) { + expected[4 * i + 0] = bytes[0 * N + i]; + expected[4 * i + 1] = bytes[1 * N + i]; + expected[4 * i + 2] = bytes[2 * N + i]; + expected[4 * i + 3] = bytes[3 * N + i]; + // Ensure we do not write more than 4*N bytes + expected[4 * N + i] = actual[4 * N + i] = 0; + } + StoreInterleaved4(in0, in1, in2, in3, d, actual); + size_t pos = 0; + if (!BytesEqual(expected.get(), actual, 5 * N, &pos)) { + Print(d, "in0", in0, pos / 4); + Print(d, "in1", in1, pos / 4); + Print(d, "in2", in2, pos / 4); + Print(d, "in3", in3, pos / 4); + const size_t i = pos; + fprintf(stderr, "interleaved %d %d %d %d %d %d %d %d\n", actual[i], + actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4], + actual[i + 5], actual[i + 6], actual[i + 7]); + HWY_ASSERT(false); + } + } + } +}; + +HWY_NOINLINE void TestAllStoreInterleaved4() { +#if HWY_TARGET == HWY_RVV + // Segments are limited to 8 registers, so we can only go up to LMUL=2. 
+ const ForExtendableVectors test; +#else + const ForPartialVectors test; +#endif + test(uint8_t()); +} + struct TestLoadDup128 { template HWY_NOINLINE void operator()(T /*unused*/, D d) { @@ -86,13 +205,14 @@ struct TestLoadDup128 { for (size_t i = 0; i < N128; ++i) { lanes[i] = static_cast(1 + i); } - const auto v = LoadDup128(d, lanes); + const size_t N = Lanes(d); - auto out = AllocateAligned(N); - Store(v, d, out.get()); + auto expected = AllocateAligned(N); for (size_t i = 0; i < N; ++i) { - HWY_ASSERT_EQ(T(i % N128 + 1), out[i]); + expected[i] = static_cast(i % N128 + 1); } + + HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes)); #else (void)d; #endif @@ -136,6 +256,84 @@ HWY_NOINLINE void TestAllStream() { ForFloatTypes(test); } +// Assumes little-endian byte order! +struct TestScatter { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using Offset = MakeSigned; + + const size_t N = Lanes(d); + const size_t range = 4 * N; // number of items to scatter + const size_t max_bytes = range * sizeof(T); // upper bound on offset + + RandomState rng; + + // Data to be scattered + auto bytes = AllocateAligned(max_bytes); + for (size_t i = 0; i < max_bytes; ++i) { + bytes[i] = static_cast(Random32(&rng) & 0xFF); + } + const auto data = Load(d, reinterpret_cast(bytes.get())); + + // Scatter into these regions, ensure vector results match scalar + auto expected = AllocateAligned(range); + auto actual = AllocateAligned(range); + + const Rebind d_offsets; + auto offsets = AllocateAligned(N); // or indices + + for (size_t rep = 0; rep < 100; ++rep) { + // Byte offsets + std::fill(expected.get(), expected.get() + range, T(0)); + std::fill(actual.get(), actual.get() + range, T(0)); + for (size_t i = 0; i < N; ++i) { + offsets[i] = + static_cast(Random32(&rng) % (max_bytes - sizeof(T))); + CopyBytes( + bytes.get() + i * sizeof(T), + reinterpret_cast(expected.get()) + offsets[i]); + } + const auto voffsets = Load(d_offsets, offsets.get()); + ScatterOffset(data, d, actual.get(), voffsets); + if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { + Print(d, "Data", data); + Print(d_offsets, "Offsets", voffsets); + HWY_ASSERT(false); + } + + // Indices + std::fill(expected.get(), expected.get() + range, T(0)); + std::fill(actual.get(), actual.get() + range, T(0)); + for (size_t i = 0; i < N; ++i) { + offsets[i] = static_cast(Random32(&rng) % range); + CopyBytes(bytes.get() + i * sizeof(T), + &expected[offsets[i]]); + } + const auto vindices = Load(d_offsets, offsets.get()); + ScatterIndex(data, d, actual.get(), vindices); + if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { + Print(d, "Data", data); + Print(d_offsets, "Indices", vindices); + HWY_ASSERT(false); + } + } + } +}; + +HWY_NOINLINE void TestAllScatter() { + // No u8,u16,i8,i16. + const ForPartialVectors test; + test(uint32_t()); + test(int32_t()); + +#if HWY_CAP_INTEGER64 + test(uint64_t()); + test(int64_t()); +#endif + + ForFloatTypes(test); +} + struct TestGather { template HWY_NOINLINE void operator()(T /*unused*/, D d) { @@ -183,21 +381,15 @@ struct TestGather { HWY_NOINLINE void TestAllGather() { // No u8,u16,i8,i16. 
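// Illustrative scalar model of the Offset/Index variants exercised by the
// Scatter and Gather tests here (not part of the patch): *Offset addresses in
// bytes, *Index in elements, so offset == index * sizeof(T). For int32_t:
const auto scalar_gather_index = [](const int32_t* base, const int32_t* index,
                                    size_t n, int32_t* out) {
  for (size_t i = 0; i < n; ++i) out[i] = base[index[i]];
};
const auto scalar_gather_offset = [](const int32_t* base,
                                     const int32_t* offset, size_t n,
                                     int32_t* out) {
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < n; ++i) {
    CopyBytes<sizeof(int32_t)>(bytes + offset[i], &out[i]);  // byte offsets
  }
};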
- const ForPartialVectors test32; - test32(uint32_t()); - test32(int32_t()); + const ForPartialVectors test; + test(uint32_t()); + test(int32_t()); #if HWY_CAP_INTEGER64 - const ForPartialVectors test64; - test64(uint64_t()); - test64(int64_t()); -#endif - - ForPartialVectors()(float()); - -#if HWY_CAP_FLOAT64 - ForPartialVectors()(double()); + test(uint64_t()); + test(int64_t()); #endif + ForFloatTypes(test); } HWY_NOINLINE void TestAllCache() { @@ -206,6 +398,7 @@ HWY_NOINLINE void TestAllCache() { int test = 0; Prefetch(&test); FlushCacheline(&test); + Pause(); } // NOLINTNEXTLINE(google-readability-namespace-comments) @@ -214,11 +407,15 @@ HWY_NOINLINE void TestAllCache() { HWY_AFTER_NAMESPACE(); #if HWY_ONCE +namespace hwy { HWY_BEFORE_TEST(HwyMemoryTest); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache); -HWY_AFTER_TEST(); +} // namespace hwy #endif diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/memory_test.ccE diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc --- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc.12 2021-06-02 10:56:05.259904513 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.cc 2021-05-31 10:37:11.000000000 -0400 @@ -223,6 +223,7 @@ struct TestTableLookupBytes { HWY_NOINLINE void TestAllTableLookupBytes() { ForIntegerTypes(ForPartialVectors()); } + struct TestTableLookupLanes { #if HWY_TARGET == HWY_RVV using Index = uint32_t; @@ -242,12 +243,13 @@ struct TestTableLookupLanes { if (N <= 8) { // Test all permutations for (size_t i0 = 0; i0 < N; ++i0) { idx[0] = static_cast(i0); + for (size_t i1 = 0; i1 < N; ++i1) { - idx[1] = static_cast(i1); + if (N >= 2) idx[1] = static_cast(i1); for (size_t i2 = 0; i2 < N; ++i2) { - idx[2] = static_cast(i2); + if (N >= 4) idx[2] = static_cast(i2); for (size_t i3 = 0; i3 < N; ++i3) { - idx[3] = static_cast(i3); + if (N >= 4) idx[3] = static_cast(i3); for (size_t i = 0; i < N; ++i) { expected[i] = static_cast(idx[i] + 1); // == v[idx[i]] @@ -286,7 +288,7 @@ struct TestTableLookupLanes { }; HWY_NOINLINE void TestAllTableLookupLanes() { - const ForFullVectors test; + const ForPartialVectors test; test(uint32_t()); test(int32_t()); test(float()); @@ -624,6 +626,7 @@ HWY_NOINLINE void TestAllOddEven() { HWY_AFTER_NAMESPACE(); #if HWY_ONCE +namespace hwy { HWY_BEFORE_TEST(HwySwizzleTest); HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftBytes); HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftLanes); @@ -637,5 +640,5 @@ HWY_EXPORT_AND_TEST_P(HwySwizzleTest, Te HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatLowerUpper); HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatUpperLower); HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven); -HWY_AFTER_TEST(); +} // namespace hwy #endif diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/swizzle_test.ccE diff -up 
chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h --- chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h.12 2021-06-02 10:56:05.254904488 -0400 +++ chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.h 2021-05-31 10:37:11.000000000 -0400 @@ -23,7 +23,6 @@ #include #include -#include // isfinite #include #include #include // std::forward @@ -73,7 +72,8 @@ class TestWithParamTarget : public testi // Function to convert the test parameter of a TestWithParamTarget for // displaying it in the gtest test name. -std::string TestParamTargetName(const testing::TestParamInfo& info) { +static inline std::string TestParamTargetName( + const testing::TestParamInfo& info) { return TargetName(info.param); } @@ -157,31 +157,10 @@ std::string TestParamTargetNameAndT( static_assert(true, "For requiring trailing semicolon") #define HWY_BEFORE_TEST(suite) \ - namespace hwy { \ class suite : public hwy::TestWithParamTarget {}; \ HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \ static_assert(true, "For requiring trailing semicolon") -#define HWY_AFTER_TEST() \ - } /* namespace hwy */ \ - static_assert(true, "For requiring trailing semicolon") - -// Calls test for each enabled and available target. -template -HWY_NOINLINE void RunTest(const Func& func, Args&&... args) { - SetSupportedTargetsForTest(0); - auto targets = SupportedAndGeneratedTargets(); - - for (uint32_t target : targets) { - SetSupportedTargetsForTest(target); - fprintf(stderr, "Testing for target %s.\n", - TargetName(static_cast(target))); - func(std::forward(args)...); - } - // Disable the mask after the test. - SetSupportedTargetsForTest(0); -} - // 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937, // which triggers a compiler bug. class RandomState { @@ -223,9 +202,11 @@ static HWY_INLINE uint32_t Random32(Rand // built-in types. template inline void PreventElision(T&& output) { -#ifndef _MSC_VER +#if HWY_COMPILER_MSVC + (void)output; +#else // HWY_COMPILER_MSVC asm volatile("" : "+r"(output) : : "memory"); -#endif +#endif // HWY_COMPILER_MSVC } // Returns a name for the vector/part/scalar. The type prefix is u/i/f for @@ -234,23 +215,34 @@ inline void PreventElision(T&& output) { // understanding which instantiation of a generic test failed. template static inline std::string TypeName(T /*unused*/, size_t N) { - std::string prefix(IsFloat() ? "f" : (IsSigned() ? "i" : "u")); - prefix += std::to_string(sizeof(T) * 8); - - // Scalars: omit the xN suffix. - if (N == 1) return prefix; - - return prefix + 'x' + std::to_string(N); + const char prefix = IsFloat() ? 'f' : (IsSigned() ? 'i' : 'u'); + char name[64]; + // Omit the xN suffix for scalars. 
+ if (N == 1) { + snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8); + } else { + snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N); + } + return name; } // String comparison template -inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size) { +inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size, + size_t* pos = nullptr) { const uint8_t* bytes1 = reinterpret_cast(p1); const uint8_t* bytes2 = reinterpret_cast(p2); for (size_t i = 0; i < size; ++i) { - if (bytes1[i] != bytes2[i]) return false; + if (bytes1[i] != bytes2[i]) { + fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i, + size, bytes1[i], bytes2[i], TypeName(T1(), 1).c_str(), + TypeName(T2(), 1).c_str()); + if (pos != nullptr) { + *pos = i; + } + return false; + } } return true; } @@ -287,11 +279,11 @@ HWY_NOINLINE void Print(const D d, const auto lanes = AllocateAligned(N); Store(v, d, lanes.get()); const size_t begin = static_cast(std::max(0, lane - 2)); - const size_t end = std::min(begin + 5, N); + const size_t end = std::min(begin + 7, N); fprintf(stderr, "%s %s [%zu+ ->]:\n ", TypeName(T(), N).c_str(), caption, begin); for (size_t i = begin; i < end; ++i) { - fprintf(stderr, "%s,", std::to_string(lanes[i]).c_str()); + fprintf(stderr, "%g,", double(lanes[i])); } if (begin >= end) fprintf(stderr, "(out of bounds)"); fprintf(stderr, "\n"); @@ -352,10 +344,12 @@ HWY_NOINLINE void AssertEqual(const T ex const char* filename = "", const int line = -1, const size_t lane = 0) { if (!IsEqual(expected, actual)) { - const std::string expected_str = std::to_string(expected); - const std::string actual_str = std::to_string(actual); - NotifyFailure(filename, line, type_name.c_str(), lane, expected_str.c_str(), - actual_str.c_str()); + char expected_str[100]; + snprintf(expected_str, sizeof(expected_str), "%g", double(expected)); + char actual_str[100]; + snprintf(actual_str, sizeof(actual_str), "%g", double(actual)); + NotifyFailure(filename, line, type_name.c_str(), lane, expected_str, + actual_str); } } @@ -382,9 +376,15 @@ HWY_NOINLINE void AssertVecEqual(D d, co fprintf(stderr, "\n\n"); Print(d, "expect", expected, i); Print(d, "actual", actual, i); + + char expected_str[100]; + snprintf(expected_str, sizeof(expected_str), "%g", + double(expected_lanes[i])); + char actual_str[100]; + snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i])); + NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i, - std::to_string(expected_lanes[i]).c_str(), - std::to_string(actual_lanes[i]).c_str()); + expected_str, actual_str); } } } @@ -458,11 +458,8 @@ struct ForeachSizeR +// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes]. +template struct ForPartialVectors { template void operator()(T /*unused*/) const { @@ -470,8 +467,8 @@ struct ForPartialVectors { // Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full. ForeachSizeR::Do(); #else - ForeachSizeR::Do(); + ForeachSizeR::Do(); #endif } }; @@ -505,33 +502,19 @@ struct ForGE128Vectors { } }; -// Calls Test for all powers of two in [128 bits, max bits/2]. -template +// Calls Test for all vectors that can be expanded by kFactor. +template struct ForExtendableVectors { template void operator()(T /*unused*/) const { #if HWY_TARGET == HWY_RVV - ForeachSizeR::Do(); + ForeachSizeR::Do(); #else - ForeachSizeR::Do(); #endif } }; - -// Calls Test for full vectors only. 
-template <class Test>
-struct ForFullVectors {
-  template <typename T>
-  void operator()(T t) const {
-#if HWY_TARGET == HWY_RVV
-    ForeachSizeR::Do();
-    (void)t;
-#else
-    Test()(t, HWY_FULL(T)());
-#endif
-  }
-};
 
 // Type lists to shorten call sites:
diff -up chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE.12 chromium-91.0.4472.77/third_party/highway/src/hwy/tests/test_util-inl.hE
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.in
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy.pc.inE
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.in
diff -up chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE.12 chromium-91.0.4472.77/third_party/highway/src/libhwy-test.pc.inE
diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSE
diff -up chromium-91.0.4472.77/third_party/highway/src/LICENSEE.12 chromium-91.0.4472.77/third_party/highway/src/LICENSEE
diff -up chromium-91.0.4472.77/third_party/highway/src/Makefile.12 chromium-91.0.4472.77/third_party/highway/src/Makefile
diff -up chromium-91.0.4472.77/third_party/highway/src/MakefileE.12 chromium-91.0.4472.77/third_party/highway/src/MakefileE
diff -up chromium-91.0.4472.77/third_party/highway/src/README.md.12 chromium-91.0.4472.77/third_party/highway/src/README.md
--- chromium-91.0.4472.77/third_party/highway/src/README.md.12	2021-06-02 10:56:05.295904696 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/README.md	2021-05-31 10:37:11.000000000 -0400
@@ -15,15 +15,19 @@ applying the same operation to 'lanes'.
 ## Current status
 
 Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD.
-A port to RVV is in progress.
+Ports to RVV and SVE/SVE2 are in progress.
 
 Version 0.11 is considered stable enough to use in other projects, and is
 expected to remain backwards compatible unless serious issues are discovered
 while implementing SVE/RVV targets. After these targets are added, Highway will
 reach version 1.0.
 
-Continuous integration tests use a recent version of Clang and older version of
-MSVC (VS2015). Also periodically tested on Clang 7-11 and GCC 8, 9 and 10.2.1.
+Continuous integration tests build with a recent version of Clang (running on
+x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).
+
+Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
+GCC cross-compile and QEMU. See the
+[testing process](g3doc/release_testing_process.md) for details.
 
 The `contrib` directory contains SIMD-related utilities: an image class with
 aligned rows, and a math library (16 functions already implemented, mostly
@@ -62,9 +66,11 @@ To test on all the attainable targets fo
 default configuration skips baseline targets (e.g. scalar) that are superseded
 by another baseline target.
 
+Bazel is also supported for building, but it is not as widely used/tested.
+
 ## Quick start
 
-You can use the `skeleton` examples inside examples/ as a starting point.
+You can use the `benchmark` inside examples/ as a starting point.
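In practice the quick start the README points at comes down to: include hwy/highway.h, pick a descriptor, and strip-mine the loop by Lanes(d). The following is a minimal sketch in the spirit of the quick-reference example; it is illustrative only (not part of this patch), and the namespace, function and array names are placeholders. Size is assumed to be a multiple of the vector length for brevity.

  #include "hwy/highway.h"

  HWY_BEFORE_NAMESPACE();
  namespace example {
  namespace HWY_NAMESPACE {

  using namespace hwy::HWY_NAMESPACE;

  // Computes x[i] = mul[i] * x[i] + add[i] for `size` elements.
  void MulAddLoop(const float* HWY_RESTRICT mul, const float* HWY_RESTRICT add,
                  const size_t size, float* HWY_RESTRICT x) {
    const HWY_FULL(float) d;  // descriptor: full vector of float
    for (size_t i = 0; i < size; i += Lanes(d)) {
      const auto v_mul = Load(d, mul + i);
      const auto v_add = Load(d, add + i);
      const auto v_x = Load(d, x + i);
      Store(MulAdd(v_mul, v_x, v_add), d, x + i);
    }
  }

  }  // namespace HWY_NAMESPACE
  }  // namespace example
  HWY_AFTER_NAMESPACE();

With static dispatch like this, HWY_FULL(float) resolves to the best target enabled at compile time; the foreach_target.h machinery used by the test files in this patch is only needed when dispatching across several targets at runtime.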
 
 A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
 and their parameters, and the [instruction_matrix][instmtx] indicates the
diff -up chromium-91.0.4472.77/third_party/highway/src/README.mdE.12 chromium-91.0.4472.77/third_party/highway/src/README.mdE
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.bat
--- chromium-91.0.4472.77/third_party/highway/src/run_tests.bat.12	2021-06-02 10:56:05.293904685 -0400
+++ chromium-91.0.4472.77/third_party/highway/src/run_tests.bat	2021-05-31 10:37:11.000000000 -0400
@@ -2,9 +2,9 @@ REM Switch directory of this batch file
 cd %~dp0
 
-if not exist build mkdir build
+if not exist build_win mkdir build_win
 
-cd build
+cd build_win
 cmake .. -G Ninja || goto error
 ninja || goto error
 ctest -j || goto error
 
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.batE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.batE
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.sh.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.sh
diff -up chromium-91.0.4472.77/third_party/highway/src/run_tests.shE.12 chromium-91.0.4472.77/third_party/highway/src/run_tests.shE
diff -up chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time.12 chromium-91.0.4472.77/third_party/llvm/libcxx/test/std/utilities/time/time.hms/time
diff -up chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1.12 chromium-91.0.4472.77/third_party/llvm/llvm/test/tools/gold/X86/v1
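Taken together, the memory_test.cc and swizzle_test.cc hunks above move every test file to one pattern: the per-target test body lives in namespace hwy::HWY_NAMESPACE and is re-included once per target through foreach_target.h, while the HWY_ONCE block now wraps HWY_BEFORE_TEST/HWY_EXPORT_AND_TEST_P in an explicit `namespace hwy` instead of relying on the removed HWY_AFTER_TEST() macro. The skeleton below is a minimal sketch of that layout; the file name, the TestDouble functor and the v + v check are invented for illustration and are not part of the patch.

  // Hypothetical test file; the HWY_TARGET_INCLUDE path must match the
  // location under the build's include root.
  #undef HWY_TARGET_INCLUDE
  #define HWY_TARGET_INCLUDE "hwy/tests/demo_test.cc"
  #include "hwy/foreach_target.h"  // re-includes this file once per target
  #include "hwy/highway.h"
  #include "hwy/tests/test_util-inl.h"

  HWY_BEFORE_NAMESPACE();
  namespace hwy {
  namespace HWY_NAMESPACE {

  struct TestDouble {
    template <class T, class D>
    HWY_NOINLINE void operator()(T /*unused*/, D d) {
      const size_t N = Lanes(d);
      auto expected = AllocateAligned<T>(N);
      for (size_t i = 0; i < N; ++i) expected[i] = T(2);
      const auto v = Set(d, T(1));
      HWY_ASSERT_VEC_EQ(d, expected.get(), v + v);
    }
  };

  HWY_NOINLINE void TestAllDouble() {
    ForAllTypes(ForPartialVectors<TestDouble>());
  }

  }  // namespace HWY_NAMESPACE
  }  // namespace hwy
  HWY_AFTER_NAMESPACE();

  #if HWY_ONCE
  namespace hwy {
  HWY_BEFORE_TEST(HwyDemoTest);
  HWY_EXPORT_AND_TEST_P(HwyDemoTest, TestAllDouble);
  }  // namespace hwy
  #endif

ForAllTypes and ForPartialVectors come from test_util-inl.h as patched above: the functor is invoked once per supported lane count, which is how the new StoreInterleaved and Scatter tests also cover partial vectors rather than only full ones.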