diff --git a/chromium-92-clang-format.patch b/chromium-92-clang-format.patch new file mode 100644 index 0000000..0ba7ee2 --- /dev/null +++ b/chromium-92-clang-format.patch @@ -0,0 +1,25 @@ +--- a/buildtools/linux64/clang-format.orig 2021-08-23 09:18:56.269570955 +0200 ++++ b/buildtools/linux64/clang-format 2021-08-23 09:17:55.531190516 +0200 +@@ -10,9 +10,9 @@ + args = sys.argv[1:] + inputfiles = [a for a in args if not a.startswith('-')] + +-contents = '' ++contents = b'' + if '-' in args or not inputfiles: +- contents = sys.stdin.read() ++ contents = sys.stdin.buffer.read() + + # Tarball builds may or may not have depot_tools in $PATH. In the former case, + # running 'clang-format' will call back into this script infinitely. Strip off +@@ -34,8 +34,8 @@ + stdout, stderr = proc.communicate(input=contents) + # Ignore if clang-format fails. Eg: it may be too old to support C++14. + if proc.returncode == 0: +- sys.stdout.write(stdout) +- sys.stderr.write(stderr) ++ sys.stdout.buffer.write(stdout) ++ sys.stderr.buffer.write(stderr) + sys.exit(0) + except OSError: + # Ignore if clang-format is not installed. diff --git a/chromium-92-v8-constexpr.patch b/chromium-92-v8-constexpr.patch new file mode 100644 index 0000000..cbe1347 --- /dev/null +++ b/chromium-92-v8-constexpr.patch @@ -0,0 +1,17 @@ +GCC: make VRegister::from_code() constexpr on aarch64 + +LiftoffRegister::gp() and LiftoffRegister::fp() are constexpr. +Therefore, VRegister::from_code() needs to be constexpr as well. +diff --git a/v8/src/codegen/arm64/register-arm64.h b/v8/src/codegen/arm64/register-arm64.h +index 1150daf..21007a5 100644 +--- a/v8/src/codegen/arm64/register-arm64.h ++++ b/v8/src/codegen/arm64/register-arm64.h +@@ -413,7 +413,7 @@ class VRegister : public CPURegister { + static constexpr int kMaxNumRegisters = kNumberOfVRegisters; + STATIC_ASSERT(kMaxNumRegisters == kDoubleAfterLast); + +- static VRegister from_code(int code) { ++ static constexpr VRegister from_code(int code) { + // Always return a D register. + return VRegister::Create(code, kDRegSizeInBits); + } diff --git a/chromium-92.0.4515.107-EnumTable-crash.patch b/chromium-92.0.4515.107-EnumTable-crash.patch new file mode 100644 index 0000000..e9eaab6 --- /dev/null +++ b/chromium-92.0.4515.107-EnumTable-crash.patch @@ -0,0 +1,70 @@ +diff -up chromium-92.0.4515.107/components/cast_channel/enum_table.h.EnumTable-crash chromium-92.0.4515.107/components/cast_channel/enum_table.h +--- chromium-92.0.4515.107/components/cast_channel/enum_table.h.EnumTable-crash 2021-07-19 14:45:12.000000000 -0400 ++++ chromium-92.0.4515.107/components/cast_channel/enum_table.h 2021-07-26 17:41:21.987375385 -0400 +@@ -212,7 +212,7 @@ class + + template + friend class EnumTable; +- DISALLOW_COPY_AND_ASSIGN(GenericEnumTableEntry); ++ DISALLOW_ASSIGN(GenericEnumTableEntry); + }; + + // Yes, these constructors really needs to be inlined. Even though they look +@@ -250,8 +250,7 @@ class EnumTable { + // Constructor for regular entries. 
+ constexpr Entry(E value, base::StringPiece str) + : GenericEnumTableEntry(static_cast(value), str) {} +- +- DISALLOW_COPY_AND_ASSIGN(Entry); ++ DISALLOW_ASSIGN(Entry); + }; + + static_assert(sizeof(E) <= sizeof(int32_t), +@@ -306,15 +305,14 @@ class EnumTable { + if (is_sorted_) { + const std::size_t index = static_cast(value); + if (ANALYZER_ASSUME_TRUE(index < data_.size())) { +- const auto& entry = data_.begin()[index]; ++ const auto& entry = data_[index]; + if (ANALYZER_ASSUME_TRUE(entry.has_str())) + return entry.str(); + } + return absl::nullopt; + } + return GenericEnumTableEntry::FindByValue( +- reinterpret_cast(data_.begin()), +- data_.size(), static_cast(value)); ++ &data_[0], data_.size(), static_cast(value)); + } + + // This overload of GetString is designed for cases where the argument is a +@@ -342,8 +340,7 @@ class EnumTable { + // enum value directly. + absl::optional GetEnum(base::StringPiece str) const { + auto* entry = GenericEnumTableEntry::FindByString( +- reinterpret_cast(data_.begin()), +- data_.size(), str); ++ &data_[0], data_.size(), str); + return entry ? static_cast(entry->value) : absl::optional(); + } + +@@ -358,7 +355,7 @@ class EnumTable { + // Align the data on a cache line boundary. + alignas(64) + #endif +- std::initializer_list data_; ++ const std::vector data_; + bool is_sorted_; + + constexpr EnumTable(std::initializer_list data, bool is_sorted) +@@ -370,8 +367,8 @@ class EnumTable { + + for (std::size_t i = 0; i < data.size(); i++) { + for (std::size_t j = i + 1; j < data.size(); j++) { +- const Entry& ei = data.begin()[i]; +- const Entry& ej = data.begin()[j]; ++ const Entry& ei = data[i]; ++ const Entry& ej = data[j]; + DCHECK(ei.value != ej.value) + << "Found duplicate enum values at indices " << i << " and " << j; + DCHECK(!(ei.has_str() && ej.has_str() && ei.str() == ej.str())) diff --git a/chromium-92.0.4515.107-gn-gcc-cleanup.patch b/chromium-92.0.4515.107-gn-gcc-cleanup.patch new file mode 100644 index 0000000..c61b51b --- /dev/null +++ b/chromium-92.0.4515.107-gn-gcc-cleanup.patch @@ -0,0 +1,45 @@ +diff -up chromium-92.0.4515.107/tools/gn/src/gn/err.h.gn-gcc-cleanup chromium-92.0.4515.107/tools/gn/src/gn/err.h +--- chromium-92.0.4515.107/tools/gn/src/gn/err.h.gn-gcc-cleanup 2021-07-19 14:54:04.000000000 -0400 ++++ chromium-92.0.4515.107/tools/gn/src/gn/err.h 2021-07-26 17:23:54.477420431 -0400 +@@ -56,7 +56,7 @@ class Err { + const std::string& help_text = std::string()); + + Err(const Err& other); +- ++ Err& operator=(const Err& other) = default; + ~Err(); + + bool has_error() const { return has_error_; } +diff -up chromium-92.0.4515.107/tools/gn/src/gn/label_pattern.h.gn-gcc-cleanup chromium-92.0.4515.107/tools/gn/src/gn/label_pattern.h +--- chromium-92.0.4515.107/tools/gn/src/gn/label_pattern.h.gn-gcc-cleanup 2021-07-26 17:23:54.478420447 -0400 ++++ chromium-92.0.4515.107/tools/gn/src/gn/label_pattern.h 2021-07-26 17:26:36.904894419 -0400 +@@ -33,6 +33,7 @@ class LabelPattern { + std::string_view name, + const Label& toolchain_label); + LabelPattern(const LabelPattern& other); ++ LabelPattern& operator=(const LabelPattern& other) = default; + ~LabelPattern(); + + // Converts the given input string to a pattern. 
This does special stuff +diff -up chromium-92.0.4515.107/tools/gn/src/gn/substitution_list.h.gn-gcc-cleanup chromium-92.0.4515.107/tools/gn/src/gn/substitution_list.h +--- chromium-92.0.4515.107/tools/gn/src/gn/substitution_list.h.gn-gcc-cleanup 2021-07-19 14:54:04.000000000 -0400 ++++ chromium-92.0.4515.107/tools/gn/src/gn/substitution_list.h 2021-07-26 17:23:54.478420447 -0400 +@@ -15,6 +15,7 @@ class SubstitutionList { + public: + SubstitutionList(); + SubstitutionList(const SubstitutionList& other); ++ SubstitutionList& operator=(const SubstitutionList& other) = default; + ~SubstitutionList(); + + bool Parse(const Value& value, Err* err); +diff -up chromium-92.0.4515.107/tools/gn/src/gn/substitution_pattern.h.gn-gcc-cleanup chromium-92.0.4515.107/tools/gn/src/gn/substitution_pattern.h +--- chromium-92.0.4515.107/tools/gn/src/gn/substitution_pattern.h.gn-gcc-cleanup 2021-07-19 14:54:04.000000000 -0400 ++++ chromium-92.0.4515.107/tools/gn/src/gn/substitution_pattern.h 2021-07-26 17:23:54.478420447 -0400 +@@ -35,6 +35,7 @@ class SubstitutionPattern { + + SubstitutionPattern(); + SubstitutionPattern(const SubstitutionPattern& other); ++ SubstitutionPattern& operator=(const SubstitutionPattern& other) = default; + ~SubstitutionPattern(); + + // Parses the given string and fills in the pattern. The pattern must only diff --git a/chromium-92.0.4515.107-norar.patch b/chromium-92.0.4515.107-norar.patch new file mode 100644 index 0000000..311caf6 --- /dev/null +++ b/chromium-92.0.4515.107-norar.patch @@ -0,0 +1,90 @@ +diff -up chromium-92.0.4515.107/chrome/common/safe_browsing/BUILD.gn.nounrar chromium-92.0.4515.107/chrome/common/safe_browsing/BUILD.gn +--- chromium-92.0.4515.107/chrome/common/safe_browsing/BUILD.gn.nounrar 2021-07-19 14:45:10.000000000 -0400 ++++ chromium-92.0.4515.107/chrome/common/safe_browsing/BUILD.gn 2021-07-26 16:44:53.670761825 -0400 +@@ -43,39 +43,6 @@ if (safe_browsing_mode == 1) { + public_deps = [ "//components/safe_browsing/core:csd_proto" ] + } + +- source_set("rar_analyzer") { +- sources = [ +- "rar_analyzer.cc", +- "rar_analyzer.h", +- ] +- +- deps = [ +- ":archive_analyzer_results", +- ":download_type_util", +- "//base", +- "//base:i18n", +- "//components/safe_browsing/core:features", +- "//components/safe_browsing/core:file_type_policies", +- "//third_party/unrar:unrar", +- ] +- +- defines = [ +- "_FILE_OFFSET_BITS=64", +- "LARGEFILE_SOURCE", +- "RAR_SMP", +- "SILENT", +- +- # The following is set to disable certain macro definitions in the unrar +- # source code. +- "CHROMIUM_UNRAR", +- +- # Disables exceptions in unrar, replaces them with process termination. 
+- "UNRAR_NO_EXCEPTIONS",
+- ]
+-
+- public_deps = [ "//components/safe_browsing/core:csd_proto" ]
+- }
+-
+ if (is_mac) {
+ source_set("disk_image_type_sniffer_mac") {
+ sources = [
+@@ -145,7 +112,6 @@ source_set("safe_browsing") {
+ ":archive_analyzer_results",
+ ":binary_feature_extractor",
+ ":download_type_util",
+- ":rar_analyzer",
+ "//components/safe_browsing/core:features",
+ ]
+
+diff -up chromium-92.0.4515.107/chrome/common/safe_browsing/DEPS.nounrar chromium-92.0.4515.107/chrome/common/safe_browsing/DEPS
+--- chromium-92.0.4515.107/chrome/common/safe_browsing/DEPS.nounrar 2021-07-19 14:45:10.000000000 -0400
++++ chromium-92.0.4515.107/chrome/common/safe_browsing/DEPS 2021-07-26 16:44:53.670761825 -0400
+@@ -1,6 +1,5 @@
+ include_rules = [
+ "+components/safe_browsing",
+ "+third_party/protobuf",
+- "+third_party/unrar",
+ "+third_party/zlib",
+ ]
+diff -up chromium-92.0.4515.107/chrome/services/file_util/BUILD.gn.nounrar chromium-92.0.4515.107/chrome/services/file_util/BUILD.gn
+--- chromium-92.0.4515.107/chrome/services/file_util/BUILD.gn.nounrar 2021-07-26 16:44:53.670761825 -0400
++++ chromium-92.0.4515.107/chrome/services/file_util/BUILD.gn 2021-07-26 16:48:21.283924750 -0400
+@@ -50,7 +50,6 @@ source_set("file_util") {
+ deps += [
+ "//chrome/common/safe_browsing",
+ "//chrome/common/safe_browsing:archive_analyzer_results",
+- "//chrome/common/safe_browsing:rar_analyzer",
+ ]
+ }
+
+diff -up chromium-92.0.4515.107/chrome/services/file_util/safe_archive_analyzer.cc.nounrar chromium-92.0.4515.107/chrome/services/file_util/safe_archive_analyzer.cc
+--- chromium-92.0.4515.107/chrome/services/file_util/safe_archive_analyzer.cc.nounrar 2021-07-19 14:45:11.000000000 -0400
++++ chromium-92.0.4515.107/chrome/services/file_util/safe_archive_analyzer.cc 2021-07-26 16:44:53.670761825 -0400
+@@ -45,10 +45,14 @@ void SafeArchiveAnalyzer::AnalyzeDmgFile
+ void SafeArchiveAnalyzer::AnalyzeRarFile(base::File rar_file,
+ base::File temporary_file,
+ AnalyzeRarFileCallback callback) {
++#if 0
+ DCHECK(rar_file.IsValid());
+
+ safe_browsing::ArchiveAnalyzerResults results;
+ safe_browsing::rar_analyzer::AnalyzeRarFile(
+ std::move(rar_file), std::move(temporary_file), &results);
+ std::move(callback).Run(results);
++#else
++ NOTREACHED();
++#endif
+ }
diff --git a/chromium-92.0.4515.107-py2-bootstrap.patch b/chromium-92.0.4515.107-py2-bootstrap.patch
new file mode 100644
index 0000000..ea09033
--- /dev/null
+++ b/chromium-92.0.4515.107-py2-bootstrap.patch
@@ -0,0 +1,24 @@
+diff -up chromium-92.0.4515.107/third_party/catapult/common/py_vulcanize/py_vulcanize/generate.py.py2 chromium-92.0.4515.107/third_party/catapult/common/py_vulcanize/py_vulcanize/generate.py
+--- chromium-92.0.4515.107/third_party/catapult/common/py_vulcanize/py_vulcanize/generate.py.py2 2021-07-19 14:47:19.000000000 -0400
++++ chromium-92.0.4515.107/third_party/catapult/common/py_vulcanize/py_vulcanize/generate.py 2021-07-26 17:02:23.160750472 -0400
+@@ -83,7 +83,7 @@ def _MinifyJS(input_js):
+
+ with tempfile.NamedTemporaryFile() as _:
+ args = [
+- 'python',
++ 'python2',
+ rjsmin_path
+ ]
+ p = subprocess.Popen(args,
+diff -up chromium-92.0.4515.107/tools/gn/bootstrap/bootstrap.py.py2 chromium-92.0.4515.107/tools/gn/bootstrap/bootstrap.py
+--- chromium-92.0.4515.107/tools/gn/bootstrap/bootstrap.py.py2 2021-07-19 14:45:43.000000000 -0400
++++ chromium-92.0.4515.107/tools/gn/bootstrap/bootstrap.py 2021-07-26 17:02:23.160750472 -0400
+@@ -130,7 +130,7 @@ def main(argv):
+ if not options.debug:
+ gn_gen_args += ' is_debug=false'
+ subprocess.check_call([
+- gn_path, 'gen', out_dir,
++ gn_path, 'gen', out_dir, ' --script-executable=/usr/bin/python2',
+ '--args=%s' % gn_gen_args, "--root=" + SRC_ROOT
+ ])
+
diff --git a/chromium-92.0.4515.107-py3-bootstrap.patch b/chromium-92.0.4515.107-py3-bootstrap.patch
new file mode 100644
index 0000000..8455813
--- /dev/null
+++ b/chromium-92.0.4515.107-py3-bootstrap.patch
@@ -0,0 +1,24 @@
+diff -up chromium-92.0.4515.107/third_party/catapult/common/py_vulcanize/py_vulcanize/generate.py.py2 chromium-92.0.4515.107/third_party/catapult/common/py_vulcanize/py_vulcanize/generate.py
+--- chromium-92.0.4515.107/third_party/catapult/common/py_vulcanize/py_vulcanize/generate.py.py2 2021-07-19 14:47:19.000000000 -0400
++++ chromium-92.0.4515.107/third_party/catapult/common/py_vulcanize/py_vulcanize/generate.py 2021-07-26 17:02:23.160750472 -0400
+@@ -83,7 +83,7 @@ def _MinifyJS(input_js):
+
+ with tempfile.NamedTemporaryFile() as _:
+ args = [
+- 'python',
++ 'python3',
+ rjsmin_path
+ ]
+ p = subprocess.Popen(args,
+diff -up chromium-92.0.4515.107/tools/gn/bootstrap/bootstrap.py.py2 chromium-92.0.4515.107/tools/gn/bootstrap/bootstrap.py
+--- chromium-92.0.4515.107/tools/gn/bootstrap/bootstrap.py.py2 2021-07-19 14:45:43.000000000 -0400
++++ chromium-92.0.4515.107/tools/gn/bootstrap/bootstrap.py 2021-07-26 17:02:23.160750472 -0400
+@@ -130,7 +130,7 @@ def main(argv):
+ if not options.debug:
+ gn_gen_args += ' is_debug=false'
+ subprocess.check_call([
+- gn_path, 'gen', out_dir,
++ gn_path, 'gen', out_dir, ' --script-executable=/usr/bin/python3',
+ '--args=%s' % gn_gen_args, "--root=" + SRC_ROOT
+ ])
+
diff --git a/chromium-92.0.4515.107-py3-fixes.patch b/chromium-92.0.4515.107-py3-fixes.patch
new file mode 100644
index 0000000..81ae7f6
--- /dev/null
+++ b/chromium-92.0.4515.107-py3-fixes.patch
@@ -0,0 +1,17 @@
+diff -up chromium-92.0.4515.107/third_party/jinja2/tests.py.py3 chromium-92.0.4515.107/third_party/jinja2/tests.py
+--- chromium-92.0.4515.107/third_party/jinja2/tests.py.py3 2021-07-28 15:53:45.670961029 -0400
++++ chromium-92.0.4515.107/third_party/jinja2/tests.py 2021-07-28 15:55:56.637013096 -0400
+@@ -10,7 +10,12 @@
+ """
+ import operator
+ import re
+-from collections import Mapping
++import sys
++if sys.version_info[:2] >= (3, 8): # pragma: no cover
++ from collections.abc import Mapping
++else: # pragma: no cover
++ from collections import Mapping
++
+ from jinja2.runtime import Undefined
+ from jinja2._compat import text_type, string_types, integer_types
+ import decimal
diff --git a/chromium-92.0.4515.107-rawhide-gcc-std-max-fix.patch b/chromium-92.0.4515.107-rawhide-gcc-std-max-fix.patch
new file mode 100644
index 0000000..bfe17b9
--- /dev/null
+++ b/chromium-92.0.4515.107-rawhide-gcc-std-max-fix.patch
@@ -0,0 +1,13 @@
+diff -up chromium-92.0.4515.107/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc.sigstkszfix chromium-92.0.4515.107/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc
+diff -up chromium-92.0.4515.107/third_party/breakpad/breakpad/src/client/linux/handler/exception_handler.cc.sigstkszfix chromium-92.0.4515.107/third_party/breakpad/breakpad/src/client/linux/handler/exception_handler.cc
+--- chromium-92.0.4515.107/third_party/breakpad/breakpad/src/client/linux/handler/exception_handler.cc.sigstkszfix 2021-07-19 14:47:20.000000000 -0400
++++ chromium-92.0.4515.107/third_party/breakpad/breakpad/src/client/linux/handler/exception_handler.cc 2021-07-26 17:28:50.155924005 -0400
+@@ -138,7 +138,7 @@ void InstallAlternateStackLocked() {
+ // SIGSTKSZ may be too small to prevent the signal handlers from overrunning
+ // the alternative stack. Ensure that the size of the alternative stack is
+ // large enough.
+- static const unsigned kSigStackSize = std::max(16384, SIGSTKSZ);
++ static const unsigned kSigStackSize = std::max(static_cast<unsigned>(16384), SIGSTKSZ);
+
+ // Only set an alternative stack if there isn't already one, or if the current
+ // one is too small.
diff --git a/chromium-92.0.4515.107-rhel8-force-disable-use_gnome_keyring.patch b/chromium-92.0.4515.107-rhel8-force-disable-use_gnome_keyring.patch
new file mode 100644
index 0000000..9724c44
--- /dev/null
+++ b/chromium-92.0.4515.107-rhel8-force-disable-use_gnome_keyring.patch
@@ -0,0 +1,12 @@
+diff -up chromium-92.0.4515.107/components/os_crypt/features.gni.disblegnomekeyring chromium-92.0.4515.107/components/os_crypt/features.gni
+--- chromium-92.0.4515.107/components/os_crypt/features.gni.disblegnomekeyring 2021-07-26 22:31:54.887207201 -0400
++++ chromium-92.0.4515.107/components/os_crypt/features.gni 2021-07-26 22:35:00.879013268 -0400
+@@ -8,7 +8,7 @@ import("//build/config/ui.gni")
+ declare_args() {
+ # Whether to use libgnome-keyring (deprecated by libsecret).
+ # See http://crbug.com/466975 and http://crbug.com/355223.
+- use_gnome_keyring = (is_linux || is_chromeos_lacros) && use_glib
++ use_gnome_keyring = false
+
+ # Whether to make account and service names for the crypto key storage
+ # configurable at runtime for embedders.
diff --git a/chromium-92.0.4515.107-sandbox-clone3.patch b/chromium-92.0.4515.107-sandbox-clone3.patch
new file mode 100644
index 0000000..a439935
--- /dev/null
+++ b/chromium-92.0.4515.107-sandbox-clone3.patch
@@ -0,0 +1,16 @@
+diff -up chromium-92.0.4515.107/sandbox/linux/seccomp-bpf-helpers/baseline_policy.cc.clone3 chromium-92.0.4515.107/sandbox/linux/seccomp-bpf-helpers/baseline_policy.cc
+--- chromium-92.0.4515.107/sandbox/linux/seccomp-bpf-helpers/baseline_policy.cc.clone3 2021-08-16 09:05:35.836277326 -0400
++++ chromium-92.0.4515.107/sandbox/linux/seccomp-bpf-helpers/baseline_policy.cc 2021-08-16 09:06:17.420502628 -0400
+@@ -178,6 +178,12 @@ ResultExpr EvaluateSyscallImpl(int fs_de
+ return RestrictCloneToThreadsAndEPERMFork();
+ }
+
++ // clone3 takes a pointer argument which we cannot examine, so return ENOSYS
++ // to force the libc to use clone. See https://crbug.com/1213452.
++ if (sysno == __NR_clone3) {
++ return Error(ENOSYS);
++ }
++
+ if (sysno == __NR_fcntl)
+ return RestrictFcntlCommands();
+
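Note on the clone3 patch above (this note is commentary for readers of the series, not part of any patch): a seccomp-bpf filter can inspect only the scalar arguments of a syscall, never the memory behind a pointer, and clone3 passes all of its options through a struct clone_args pointer. Failing it with ENOSYS makes glibc fall back to plain clone, whose flags word the baseline policy can vet. The sketch below shows the same technique against the raw kernel interface rather than Chromium's policy DSL; the function name, the 435 fallback constant, and the omission of an architecture check are illustrative assumptions, not taken from the patch.

// deny_clone3.cc - illustrative sketch only, not part of the Chromium sandbox.
#include <errno.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <sys/syscall.h>

#ifndef __NR_clone3
#define __NR_clone3 435  // clone3 has the same number on all Linux architectures
#endif

// Installs a filter that fails clone3 with ENOSYS and allows everything else.
// A production filter would also validate seccomp_data.arch.
int DenyClone3() {
  sock_filter filter[] = {
      // Load the syscall number from the seccomp_data the kernel passes in.
      BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(seccomp_data, nr)),
      // If it is clone3, fall through to the ENOSYS return; otherwise skip it.
      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clone3, 0, 1),
      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (ENOSYS & SECCOMP_RET_DATA)),
      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
  };
  sock_fprog prog = {sizeof(filter) / sizeof(filter[0]), filter};
  // no_new_privs is required before an unprivileged process may install a filter.
  if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) return -1;
  return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}

Chromium's sandbox expresses the same policy declaratively as the Error(ENOSYS) result in the hunk above.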
diff --git a/chromium-92.0.4515.107-update-highway-0.12.2.patch b/chromium-92.0.4515.107-update-highway-0.12.2.patch
new file mode 100644
index 0000000..fca308b
--- /dev/null
+++ b/chromium-92.0.4515.107-update-highway-0.12.2.patch
@@ -0,0 +1,4199 @@
+diff -up chromium-92.0.4515.107/third_party/highway/src/CMakeLists.txt.in.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/CMakeLists.txt.in
+diff -up chromium-92.0.4515.107/third_party/highway/src/CMakeLists.txt.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/CMakeLists.txt
+--- chromium-92.0.4515.107/third_party/highway/src/CMakeLists.txt.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400
++++ chromium-92.0.4515.107/third_party/highway/src/CMakeLists.txt 2021-07-26 17:13:36.158002603 -0400
+@@ -19,7 +19,7 @@ if(POLICY CMP0083)
+ cmake_policy(SET CMP0083 NEW)
+ endif()
+
+-project(hwy VERSION 0.1)
++project(hwy VERSION 0.12.2) # Keep in sync with highway.h version
+
+ set(CMAKE_CXX_STANDARD 11)
+ set(CMAKE_CXX_EXTENSIONS OFF)
+@@ -40,6 +40,8 @@ if (NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE RelWithDebInfo)
+ endif()
+
++set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?")
++
+ include(CheckCXXSourceCompiles)
+ check_cxx_source_compiles(
+ "int main() {
+@@ -51,10 +53,13 @@ check_cxx_source_compiles(
+ HWY_EMSCRIPTEN
+ )
+
++set(HWY_CONTRIB_SOURCES
++ hwy/contrib/image/image.cc
++ hwy/contrib/image/image.h
++ hwy/contrib/math/math-inl.h
++)
++
+ set(HWY_SOURCES
+- contrib/image/image.cc
+- contrib/image/image.h
+- contrib/math/math-inl.h
+ hwy/aligned_allocator.cc
+ hwy/aligned_allocator.h
+ hwy/base.h
+@@ -64,6 +69,7 @@ set(HWY_SOURCES
+ hwy/nanobenchmark.cc
+ hwy/nanobenchmark.h
+ hwy/ops/arm_neon-inl.h
++ hwy/ops/arm_sve-inl.h
+ hwy/ops/scalar-inl.h
+ hwy/ops/set_macros-inl.h
+ hwy/ops/shared-inl.h
+@@ -146,13 +152,28 @@ else()
+ -fno-exceptions
+ )
+ endif()
+-endif()
++
++ if (HWY_CMAKE_ARM7)
++ list(APPEND HWY_FLAGS
++ -march=armv7-a
++ -mfpu=neon-vfpv4
++ -mfloat-abi=hard # must match the toolchain specified as CXX=
++ -mfp16-format=ieee # required for vcvt_f32_f16
++ )
++ endif() # HWY_CMAKE_ARM7
++
++endif() # !MSVC
+
+ add_library(hwy STATIC ${HWY_SOURCES})
+ target_compile_options(hwy PRIVATE ${HWY_FLAGS})
+ set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
+ target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
+
++add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES})
++target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
++set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
++target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
++
+ # -------------------------------------------------------- install library
+ install(TARGETS hwy
+ DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+@@ -166,9 +187,21 @@ foreach (source ${HWY_SOURCES})
+ endif()
+ endforeach()
+
+-# Add a pkg-config file for libhwy and the test library.
++install(TARGETS hwy_contrib
++ DESTINATION "${CMAKE_INSTALL_LIBDIR}")
++# Install all the headers keeping the relative path to the current directory
++# when installing them.
++foreach (source ${HWY_CONTRIB_SOURCES})
++ if ("${source}" MATCHES "\.h$")
++ get_filename_component(dirname "${source}" DIRECTORY)
++ install(FILES "${source}"
++ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
++ endif()
++endforeach()
++
++# Add a pkg-config file for libhwy and the contrib/test libraries.
+ set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
+-foreach (pc libhwy.pc libhwy-test.pc)
++foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
+ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
+ DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+@@ -251,8 +284,8 @@ endif()
+ endif() # HWY_SYSTEM_GTEST
+
+ set(HWY_TEST_FILES
+- contrib/image/image_test.cc
+- # contrib/math/math_test.cc
++ hwy/contrib/image/image_test.cc
++ # hwy/contrib/math/math_test.cc
+ hwy/aligned_allocator_test.cc
+ hwy/base_test.cc
+ hwy/highway_test.cc
+@@ -274,11 +307,16 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILE
+ get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
+ add_executable(${TESTNAME} ${TESTFILE})
+ target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})
++ # Test all targets, not just the best/baseline. This changes the default
++ # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can
++ # cause compile errors because only one may be set, and other CMakeLists.txt
++ # that include us may set them.
++ target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
+
+ if(HWY_SYSTEM_GTEST)
+- target_link_libraries(${TESTNAME} hwy GTest::GTest GTest::Main)
++ target_link_libraries(${TESTNAME} hwy hwy_contrib GTest::GTest GTest::Main)
+ else()
+- target_link_libraries(${TESTNAME} hwy gtest gtest_main)
++ target_link_libraries(${TESTNAME} hwy hwy_contrib gtest gtest_main)
+ endif()
+ # Output test targets in the test directory.
+ set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
+diff -up chromium-92.0.4515.107/third_party/highway/src/debian/changelog.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/debian/changelog
+diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/aligned_allocator.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/aligned_allocator.h
+--- chromium-92.0.4515.107/third_party/highway/src/hwy/aligned_allocator.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400
++++ chromium-92.0.4515.107/third_party/highway/src/hwy/aligned_allocator.h 2021-07-26 17:15:37.281847484 -0400
+@@ -111,6 +111,32 @@ AlignedUniquePtr MakeUniqueAligned(Ar
+ new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
+ }
+
++// Helpers for array allocators (avoids overflow)
++namespace detail {
++
++// Returns x such that 1u << x == n (if n is a power of two).
++static inline constexpr size_t ShiftCount(size_t n) {
++ return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
++}
++
++template <typename T>
++T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
++ constexpr size_t size = sizeof(T);
++
++ constexpr bool is_pow2 = (size & (size - 1)) == 0;
++ constexpr size_t bits = ShiftCount(size);
++ static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
++
++ const size_t bytes = is_pow2 ? items << bits : items * size;
++ const size_t check = is_pow2 ? bytes >> bits : bytes / size;
++ if (check != items) {
++ return nullptr; // overflowed
++ }
++ return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
++}
++
++} // namespace detail
++
+ // Aligned memory equivalent of make_unique for array types using the
+ // custom allocators alloc/free. This function calls the constructor with the
+ // passed Args... on every created item. The destructor of each element will be
+ template <typename T, typename... Args>
+ AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
+ size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
+- T* ptr =
+- static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque));
+- for (size_t i = 0; i < items; i++) {
+- new (ptr + i) T(std::forward<Args>(args)...);
++ T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
++ if (ptr != nullptr) {
++ for (size_t i = 0; i < items; i++) {
++ new (ptr + i) T(std::forward<Args>(args)...);
++ }
+ }
+ return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
+ }
+@@ -165,7 +192,7 @@ template
+ AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
+ FreePtr free, void* opaque) {
+ return AlignedFreeUniquePtr<T[]>(
+- static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)),
++ detail::AllocateAlignedItems<T>(items, alloc, opaque),
+ AlignedFreer(free, opaque));
+ }
+
+diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/aligned_allocator_test.cc.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/aligned_allocator_test.cc
+--- chromium-92.0.4515.107/third_party/highway/src/hwy/aligned_allocator_test.cc.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400
++++ chromium-92.0.4515.107/third_party/highway/src/hwy/aligned_allocator_test.cc 2021-07-26 17:16:43.672858709 -0400
+@@ -16,6 +16,7 @@
+
+ #include <stddef.h>
+
++#include <array>
+ #include <new>
+ #include <random>
+ #include <vector>
+@@ -87,6 +88,32 @@ TEST(AlignedAllocatorTest, FreeNullptr)
+ /*opaque_ptr=*/nullptr);
+ }
+
++TEST(AlignedAllocatorTest, Log2) {
++ EXPECT_EQ(0u, detail::ShiftCount(1));
++ EXPECT_EQ(1u, detail::ShiftCount(2));
++ EXPECT_EQ(3u, detail::ShiftCount(8));
++}
++
++// Allocator returns null when it detects overflow of items * sizeof(T).
++TEST(AlignedAllocatorTest, Overflow) {
++ constexpr size_t max = ~size_t(0);
++ constexpr size_t msb = (max >> 1) + 1;
++ using Size5 = std::array<uint8_t, 5>;
++ using Size10 = std::array<uint8_t, 10>;
++ EXPECT_EQ(nullptr,
++ detail::AllocateAlignedItems<uint32_t>(max / 2, nullptr, nullptr));
++ EXPECT_EQ(nullptr,
++ detail::AllocateAlignedItems<uint32_t>(max / 3, nullptr, nullptr));
++ EXPECT_EQ(nullptr,
++ detail::AllocateAlignedItems<Size5>(max / 4, nullptr, nullptr));
++ EXPECT_EQ(nullptr,
++ detail::AllocateAlignedItems<uint16_t>(msb, nullptr, nullptr));
++ EXPECT_EQ(nullptr,
++ detail::AllocateAlignedItems<uint16_t>(msb + 1, nullptr, nullptr));
++ EXPECT_EQ(nullptr,
++ detail::AllocateAlignedItems<Size10>(msb / 4, nullptr, nullptr));
++}
++
+ TEST(AlignedAllocatorTest, AllocDefaultPointers) {
+ const size_t kSize = 7777;
+ void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
+@@ -215,7 +242,8 @@ TEST(AlignedAllocatorTest, MakeUniqueAli
+ auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
+ 7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
+ &counter);
+- // An array shold still only call a single allocation.
++ ASSERT_NE(nullptr, arr.get());
++ // An array should still only call a single allocation.
+ EXPECT_EQ(1u, fake_alloc.PendingAllocs()); + EXPECT_EQ(7, counter); + for (size_t i = 0; i < 7; i++) { +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/base.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/base.h +--- chromium-92.0.4515.107/third_party/highway/src/hwy/base.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/base.h 2021-07-26 17:16:04.753265910 -0400 +@@ -203,6 +203,10 @@ + #define HWY_ARCH_X86_64 0 + #endif + ++#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64 ++#error "Cannot have both x86-32 and x86-64" ++#endif ++ + #if HWY_ARCH_X86_32 || HWY_ARCH_X86_64 + #define HWY_ARCH_X86 1 + #else +@@ -249,9 +253,11 @@ + #define HWY_ARCH_RVV 0 + #endif + ++// It is an error to detect multiple architectures at the same time, but OK to ++// detect none of the above. + #if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \ +- HWY_ARCH_RVV) != 1 +-#error "Must detect exactly one platform" ++ HWY_ARCH_RVV) > 1 ++#error "Must not detect more than one architecture" + #endif + + //------------------------------------------------------------------------------ +@@ -328,6 +334,12 @@ static constexpr HWY_MAYBE_UNUSED size_t + + // RVV already has a builtin type and the GCC intrinsics require it. + #if HWY_ARCH_RVV && HWY_COMPILER_GCC ++#define HWY_NATIVE_FLOAT16 1 ++#else ++#define HWY_NATIVE_FLOAT16 0 ++#endif ++ ++#if HWY_NATIVE_FLOAT16 + using float16_t = __fp16; + // Clang does not allow __fp16 arguments, but scalar.h requires LaneType + // arguments, so use a wrapper. +@@ -597,7 +609,7 @@ HWY_API size_t PopCount(uint64_t x) { + return static_cast(__builtin_popcountll(x)); + #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 + return _mm_popcnt_u64(x); +-#elif HWY_COMPILER_MSVC ++#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 + return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32)); + #else + x -= ((x >> 1) & 0x55555555U); +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/cache_control.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/cache_control.h +--- chromium-92.0.4515.107/third_party/highway/src/hwy/cache_control.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/cache_control.h 2021-07-26 17:16:26.004589594 -0400 +@@ -32,6 +32,14 @@ + #include // SSE2 + #endif + ++// Windows.h #defines these, which causes infinite recursion. Temporarily ++// undefine them in this header; these functions are anyway deprecated. ++// TODO(janwas): remove when these functions are removed. ++#pragma push_macro("LoadFence") ++#pragma push_macro("StoreFence") ++#undef LoadFence ++#undef StoreFence ++ + namespace hwy { + + // Even if N*sizeof(T) is smaller, Stream may write a multiple of this size. +@@ -83,6 +91,17 @@ HWY_INLINE HWY_ATTR_CACHE void FlushCach + #endif + } + ++// Reduces power consumption in spin-loops. No effect on non-x86. ++HWY_INLINE HWY_ATTR_CACHE void Pause() { ++#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) ++ _mm_pause(); ++#endif ++} ++ + } // namespace hwy + ++// TODO(janwas): remove when these functions are removed. (See above.) 
++#pragma pop_macro("StoreFence") ++#pragma pop_macro("LoadFence") ++ + #endif // HIGHWAY_HWY_CACHE_CONTROL_H_ +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/examples/skeleton.cc.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/examples/skeleton.cc +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/examples/skeleton_test.cc.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/examples/skeleton_test.cc +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/highway.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/highway.h +--- chromium-92.0.4515.107/third_party/highway/src/hwy/highway.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/highway.h 2021-07-26 17:16:58.109078590 -0400 +@@ -25,10 +25,10 @@ + + namespace hwy { + +-// API version (https://semver.org/) ++// API version (https://semver.org/); keep in sync with CMakeLists.txt. + #define HWY_MAJOR 0 + #define HWY_MINOR 12 +-#define HWY_PATCH 0 ++#define HWY_PATCH 2 + + //------------------------------------------------------------------------------ + // Shorthand for descriptors (defined in shared-inl.h) used to select overloads. +@@ -49,7 +49,7 @@ namespace hwy { + HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, )) + #define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__) + +-// Vector of up to MAX_N lanes. ++// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead. + #define HWY_CAPPED(T, MAX_N) \ + hwy::HWY_NAMESPACE::Simd + +@@ -75,6 +75,10 @@ namespace hwy { + #define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME + #elif HWY_STATIC_TARGET == HWY_NEON + #define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME ++#elif HWY_STATIC_TARGET == HWY_SVE ++#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME ++#elif HWY_STATIC_TARGET == HWY_SVE2 ++#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME + #elif HWY_STATIC_TARGET == HWY_PPC8 + #define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME + #elif HWY_STATIC_TARGET == HWY_SSE4 +@@ -143,6 +147,18 @@ FunctionCache Function + #define HWY_CHOOSE_NEON(FUNC_NAME) nullptr + #endif + ++#if HWY_TARGETS & HWY_SVE ++#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME ++#else ++#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr ++#endif ++ ++#if HWY_TARGETS & HWY_SVE2 ++#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME ++#else ++#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr ++#endif ++ + #if HWY_TARGETS & HWY_PPC8 + #define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME + #else +@@ -261,8 +277,11 @@ FunctionCache Function + #elif HWY_TARGET == HWY_AVX3 + #include "hwy/ops/x86_512-inl.h" + #elif HWY_TARGET == HWY_PPC8 ++#error "PPC is not yet supported" + #elif HWY_TARGET == HWY_NEON + #include "hwy/ops/arm_neon-inl.h" ++#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 ++#include "hwy/ops/arm_sve-inl.h" + #elif HWY_TARGET == HWY_WASM + #include "hwy/ops/wasm_128-inl.h" + #elif HWY_TARGET == HWY_RVV +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark.cc.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark.cc +--- chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark.cc.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark.cc 2021-07-26 17:17:12.094291603 -0400 +@@ -29,6 +29,22 @@ + #include + #include + ++#if defined(_WIN32) || defined(_WIN64) ++#ifndef 
NOMINMAX ++#define NOMINMAX ++#endif // NOMINMAX ++#include ++#endif ++ ++#if defined(__MACH__) ++#include ++#include ++#endif ++ ++#if defined(__HAIKU__) ++#include ++#endif ++ + #include "hwy/base.h" + #if HWY_ARCH_PPC + #include // NOLINT __ppc_get_timebase_freq +@@ -43,114 +59,13 @@ + #endif // HWY_ARCH_X86 + + namespace hwy { +-namespace platform { + namespace { +- +-#if HWY_ARCH_X86 +- +-void Cpuid(const uint32_t level, const uint32_t count, +- uint32_t* HWY_RESTRICT abcd) { +-#if HWY_COMPILER_MSVC +- int regs[4]; +- __cpuidex(regs, level, count); +- for (int i = 0; i < 4; ++i) { +- abcd[i] = regs[i]; +- } +-#else +- uint32_t a; +- uint32_t b; +- uint32_t c; +- uint32_t d; +- __cpuid_count(level, count, a, b, c, d); +- abcd[0] = a; +- abcd[1] = b; +- abcd[2] = c; +- abcd[3] = d; +-#endif +-} +- +-std::string BrandString() { +- char brand_string[49]; +- std::array abcd; +- +- // Check if brand string is supported (it is on all reasonable Intel/AMD) +- Cpuid(0x80000000U, 0, abcd.data()); +- if (abcd[0] < 0x80000004U) { +- return std::string(); +- } +- +- for (size_t i = 0; i < 3; ++i) { +- Cpuid(static_cast(0x80000002U + i), 0, abcd.data()); +- memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd)); +- } +- brand_string[48] = 0; +- return brand_string; +-} +- +-// Returns the frequency quoted inside the brand string. This does not +-// account for throttling nor Turbo Boost. +-double NominalClockRate() { +- const std::string& brand_string = BrandString(); +- // Brand strings include the maximum configured frequency. These prefixes are +- // defined by Intel CPUID documentation. +- const char* prefixes[3] = {"MHz", "GHz", "THz"}; +- const double multipliers[3] = {1E6, 1E9, 1E12}; +- for (size_t i = 0; i < 3; ++i) { +- const size_t pos_prefix = brand_string.find(prefixes[i]); +- if (pos_prefix != std::string::npos) { +- const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); +- if (pos_space != std::string::npos) { +- const std::string digits = +- brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); +- return std::stod(digits) * multipliers[i]; +- } +- } +- } +- +- return 0.0; +-} +- +-#endif // HWY_ARCH_X86 +- +-} // namespace +- +-// Returns tick rate. Invariant means the tick counter frequency is independent +-// of CPU throttling or sleep. May be expensive, caller should cache the result. +-double InvariantTicksPerSecond() { +-#if HWY_ARCH_PPC +- return __ppc_get_timebase_freq(); +-#elif HWY_ARCH_X86 +- // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. +- return NominalClockRate(); +-#else +- // Fall back to clock_gettime nanoseconds. +- return 1E9; +-#endif +-} +- +-} // namespace platform +-namespace { +- +-// Prevents the compiler from eliding the computations that led to "output". +-template +-inline void PreventElision(T&& output) { +-#if HWY_COMPILER_MSVC == 0 +- // Works by indicating to the compiler that "output" is being read and +- // modified. The +r constraint avoids unnecessary writes to memory, but only +- // works for built-in types (typically FuncOutput). +- asm volatile("" : "+r"(output) : : "memory"); +-#else +- // MSVC does not support inline assembly anymore (and never supported GCC's +- // RTL constraints). Self-assignment with #pragma optimize("off") might be +- // expected to prevent elision, but it does not with MSVC 2015. Type-punning +- // with volatile pointers generates inefficient code on MSVC 2017. 
+- static std::atomic dummy(T{}); +- dummy.store(output, std::memory_order_relaxed); +-#endif +-} +- + namespace timer { + ++// Ticks := platform-specific timer values (CPU cycles on x86). Must be ++// unsigned to guarantee wraparound on overflow. ++using Ticks = uint64_t; ++ + // Start/Stop return absolute timestamps and must be placed immediately before + // and after the region to measure. We provide separate Start/Stop functions + // because they use different fences. +@@ -202,8 +117,8 @@ namespace timer { + + // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, + // divide by InvariantTicksPerSecond. +-inline uint64_t Start64() { +- uint64_t t; ++inline Ticks Start() { ++ Ticks t; + #if HWY_ARCH_PPC + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); + #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC +@@ -228,8 +143,15 @@ inline uint64_t Start64() { + : "rdx", "memory", "cc"); + #elif HWY_ARCH_RVV + asm volatile("rdcycle %0" : "=r"(t)); +-#else +- // Fall back to OS - unsure how to reliably query cntvct_el0 frequency. ++#elif defined(_WIN32) || defined(_WIN64) ++ LARGE_INTEGER counter; ++ (void)QueryPerformanceCounter(&counter); ++ t = counter.QuadPart; ++#elif defined(__MACH__) ++ t = mach_absolute_time(); ++#elif defined(__HAIKU__) ++ t = system_time_nsecs(); // since boot ++#else // POSIX + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + t = ts.tv_sec * 1000000000LL + ts.tv_nsec; +@@ -237,7 +159,7 @@ inline uint64_t Start64() { + return t; + } + +-inline uint64_t Stop64() { ++inline Ticks Stop() { + uint64_t t; + #if HWY_ARCH_PPC + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +@@ -261,61 +183,7 @@ inline uint64_t Stop64() { + // "cc" = flags modified by SHL. + : "rcx", "rdx", "memory", "cc"); + #else +- t = Start64(); +-#endif +- return t; +-} +- +-// Returns a 32-bit timestamp with about 4 cycles less overhead than +-// Start64. Only suitable for measuring very short regions because the +-// timestamp overflows about once a second. +-inline uint32_t Start32() { +- uint32_t t; +-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC +- _ReadWriteBarrier(); +- _mm_lfence(); +- _ReadWriteBarrier(); +- t = static_cast(__rdtsc()); +- _ReadWriteBarrier(); +- _mm_lfence(); +- _ReadWriteBarrier(); +-#elif HWY_ARCH_X86_64 +- asm volatile( +- "lfence\n\t" +- "rdtsc\n\t" +- "lfence" +- : "=a"(t) +- : +- // "memory" avoids reordering. rdx = TSC >> 32. +- : "rdx", "memory"); +-#elif HWY_ARCH_RVV +- asm volatile("rdcycle %0" : "=r"(t)); +-#else +- t = static_cast(Start64()); +-#endif +- return t; +-} +- +-inline uint32_t Stop32() { +- uint32_t t; +-#if HWY_ARCH_X86 && HWY_COMPILER_MSVC +- _ReadWriteBarrier(); +- unsigned aux; +- t = static_cast(__rdtscp(&aux)); +- _ReadWriteBarrier(); +- _mm_lfence(); +- _ReadWriteBarrier(); +-#elif HWY_ARCH_X86_64 +- // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). +- asm volatile( +- "rdtscp\n\t" +- "lfence" +- : "=a"(t) +- : +- // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. +- : "rcx", "rdx", "memory"); +-#else +- t = static_cast(Stop64()); ++ t = Start(); + #endif + return t; + } +@@ -440,21 +308,130 @@ T MedianAbsoluteDeviation(const T* value + } + + } // namespace robust_statistics ++} // namespace ++namespace platform { ++namespace { + +-// Ticks := platform-specific timer values (CPU cycles on x86). Must be +-// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to +-// read than 64 bit. 
+-using Ticks = uint32_t; ++// Prevents the compiler from eliding the computations that led to "output". ++template ++inline void PreventElision(T&& output) { ++#if HWY_COMPILER_MSVC == 0 ++ // Works by indicating to the compiler that "output" is being read and ++ // modified. The +r constraint avoids unnecessary writes to memory, but only ++ // works for built-in types (typically FuncOutput). ++ asm volatile("" : "+r"(output) : : "memory"); ++#else ++ // MSVC does not support inline assembly anymore (and never supported GCC's ++ // RTL constraints). Self-assignment with #pragma optimize("off") might be ++ // expected to prevent elision, but it does not with MSVC 2015. Type-punning ++ // with volatile pointers generates inefficient code on MSVC 2017. ++ static std::atomic dummy(T{}); ++ dummy.store(output, std::memory_order_relaxed); ++#endif ++} ++ ++#if HWY_ARCH_X86 ++ ++void Cpuid(const uint32_t level, const uint32_t count, ++ uint32_t* HWY_RESTRICT abcd) { ++#if HWY_COMPILER_MSVC ++ int regs[4]; ++ __cpuidex(regs, level, count); ++ for (int i = 0; i < 4; ++i) { ++ abcd[i] = regs[i]; ++ } ++#else ++ uint32_t a; ++ uint32_t b; ++ uint32_t c; ++ uint32_t d; ++ __cpuid_count(level, count, a, b, c, d); ++ abcd[0] = a; ++ abcd[1] = b; ++ abcd[2] = c; ++ abcd[3] = d; ++#endif ++} ++ ++std::string BrandString() { ++ char brand_string[49]; ++ std::array abcd; ++ ++ // Check if brand string is supported (it is on all reasonable Intel/AMD) ++ Cpuid(0x80000000U, 0, abcd.data()); ++ if (abcd[0] < 0x80000004U) { ++ return std::string(); ++ } ++ ++ for (size_t i = 0; i < 3; ++i) { ++ Cpuid(static_cast(0x80000002U + i), 0, abcd.data()); ++ memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd)); ++ } ++ brand_string[48] = 0; ++ return brand_string; ++} ++ ++// Returns the frequency quoted inside the brand string. This does not ++// account for throttling nor Turbo Boost. ++double NominalClockRate() { ++ const std::string& brand_string = BrandString(); ++ // Brand strings include the maximum configured frequency. These prefixes are ++ // defined by Intel CPUID documentation. ++ const char* prefixes[3] = {"MHz", "GHz", "THz"}; ++ const double multipliers[3] = {1E6, 1E9, 1E12}; ++ for (size_t i = 0; i < 3; ++i) { ++ const size_t pos_prefix = brand_string.find(prefixes[i]); ++ if (pos_prefix != std::string::npos) { ++ const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); ++ if (pos_space != std::string::npos) { ++ const std::string digits = ++ brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); ++ return std::stod(digits) * multipliers[i]; ++ } ++ } ++ } ++ ++ return 0.0; ++} ++ ++#endif // HWY_ARCH_X86 ++ ++} // namespace ++ ++double InvariantTicksPerSecond() { ++#if HWY_ARCH_PPC ++ return __ppc_get_timebase_freq(); ++#elif HWY_ARCH_X86 ++ // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. ++ return NominalClockRate(); ++#elif defined(_WIN32) || defined(_WIN64) ++ LARGE_INTEGER freq; ++ (void)QueryPerformanceFrequency(&freq); ++ return double(freq.QuadPart); ++#elif defined(__MACH__) ++ // https://developer.apple.com/library/mac/qa/qa1398/_index.html ++ mach_timebase_info_data_t timebase; ++ (void)mach_timebase_info(&timebase); ++ return double(timebase.denom) / timebase.numer * 1E9; ++#else ++ // TODO(janwas): ARM? Unclear how to reliably query cntvct_el0 frequency. ++ return 1E9; // Haiku and clock_gettime return nanoseconds. ++#endif ++} + +-// Returns timer overhead / minimum measurable difference. 
+-Ticks TimerResolution() { ++double Now() { ++ static const double mul = 1.0 / InvariantTicksPerSecond(); ++ return static_cast(timer::Start()) * mul; ++} ++ ++uint64_t TimerResolution() { + // Nested loop avoids exceeding stack/L1 capacity. +- Ticks repetitions[Params::kTimerSamples]; ++ timer::Ticks repetitions[Params::kTimerSamples]; + for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) { +- Ticks samples[Params::kTimerSamples]; ++ timer::Ticks samples[Params::kTimerSamples]; + for (size_t i = 0; i < Params::kTimerSamples; ++i) { +- const Ticks t0 = timer::Start32(); +- const Ticks t1 = timer::Stop32(); ++ const timer::Ticks t0 = timer::Start(); ++ const timer::Ticks t1 = timer::Stop(); + samples[i] = t1 - t0; + } + repetitions[rep] = robust_statistics::Mode(samples); +@@ -462,18 +439,21 @@ Ticks TimerResolution() { + return robust_statistics::Mode(repetitions); + } + +-static const Ticks timer_resolution = TimerResolution(); ++} // namespace platform ++namespace { ++ ++static const timer::Ticks timer_resolution = platform::TimerResolution(); + + // Estimates the expected value of "lambda" values with a variable number of + // samples until the variability "rel_mad" is less than "max_rel_mad". + template +-Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, +- const Params& p, const Lambda& lambda) { ++timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, ++ const Params& p, const Lambda& lambda) { + // Choose initial samples_per_eval based on a single estimated duration. +- Ticks t0 = timer::Start32(); ++ timer::Ticks t0 = timer::Start(); + lambda(); +- Ticks t1 = timer::Stop32(); +- Ticks est = t1 - t0; ++ timer::Ticks t1 = timer::Stop(); ++ timer::Ticks est = t1 - t0; + static const double ticks_per_second = platform::InvariantTicksPerSecond(); + const size_t ticks_per_eval = + static_cast(ticks_per_second * p.seconds_per_eval); +@@ -481,21 +461,21 @@ Ticks SampleUntilStable(const double max + est == 0 ? p.min_samples_per_eval : ticks_per_eval / est; + samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval); + +- std::vector samples; ++ std::vector samples; + samples.reserve(1 + samples_per_eval); + samples.push_back(est); + + // Percentage is too strict for tiny differences, so also allow a small + // absolute "median absolute deviation". +- const Ticks max_abs_mad = (timer_resolution + 99) / 100; ++ const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100; + *rel_mad = 0.0; // ensure initialized + + for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) { + samples.reserve(samples.size() + samples_per_eval); + for (size_t i = 0; i < samples_per_eval; ++i) { +- t0 = timer::Start32(); ++ t0 = timer::Start(); + lambda(); +- t1 = timer::Stop32(); ++ t1 = timer::Stop(); + samples.push_back(t1 - t0); + } + +@@ -508,14 +488,14 @@ Ticks SampleUntilStable(const double max + NANOBENCHMARK_CHECK(est != 0); + + // Median absolute deviation (mad) is a robust measure of 'variability'. 
+- const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( ++ const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( + samples.data(), samples.size(), est); +- *rel_mad = static_cast(int(abs_mad)) / est; ++ *rel_mad = static_cast(abs_mad) / static_cast(est); + + if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) { + if (p.verbose) { +- printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n", +- samples.size(), est, abs_mad, *rel_mad * 100.0); ++ printf("%6zu samples => %5zu (abs_mad=%4zu, rel_mad=%4.2f%%)\n", ++ samples.size(), size_t(est), size_t(abs_mad), *rel_mad * 100.0); + } + return est; + } +@@ -539,29 +519,17 @@ InputVec UniqueInputs(const FuncInput* i + return unique; + } + +-// Returns how often we need to call func for sufficient precision, or zero +-// on failure (e.g. the elapsed time is too long for a 32-bit tick count). ++// Returns how often we need to call func for sufficient precision. + size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, + const Params& p) { + // Min elapsed ticks for any input. +- Ticks min_duration = ~0u; ++ timer::Ticks min_duration = ~timer::Ticks(0); + + for (const FuncInput input : unique) { +- // Make sure a 32-bit timer is sufficient. +- const uint64_t t0 = timer::Start64(); +- PreventElision(func(arg, input)); +- const uint64_t t1 = timer::Stop64(); +- const uint64_t elapsed = t1 - t0; +- if (elapsed >= (1ULL << 30)) { +- fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n", +- input); +- return 0; +- } +- + double rel_mad; +- const Ticks total = SampleUntilStable( ++ const timer::Ticks total = SampleUntilStable( + p.target_rel_mad, &rel_mad, p, +- [func, arg, input]() { PreventElision(func(arg, input)); }); ++ [func, arg, input]() { platform::PreventElision(func(arg, input)); }); + min_duration = std::min(min_duration, total - timer_resolution); + } + +@@ -571,8 +539,8 @@ size_t NumSkip(const Func func, const ui + const size_t num_skip = + min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration; + if (p.verbose) { +- printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution, +- max_skip, min_duration, num_skip); ++ printf("res=%zu max_skip=%zu min_dur=%zu num_skip=%zu\n", ++ size_t(timer_resolution), max_skip, size_t(min_duration), num_skip); + } + return num_skip; + } +@@ -637,13 +605,14 @@ void FillSubset(const InputVec& full, co + } + + // Returns total ticks elapsed for all inputs. +-Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs, +- const Params& p, double* max_rel_mad) { ++timer::Ticks TotalDuration(const Func func, const uint8_t* arg, ++ const InputVec* inputs, const Params& p, ++ double* max_rel_mad) { + double rel_mad; +- const Ticks duration = ++ const timer::Ticks duration = + SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() { + for (const FuncInput input : *inputs) { +- PreventElision(func(arg, input)); ++ platform::PreventElision(func(arg, input)); + } + }); + *max_rel_mad = std::max(*max_rel_mad, rel_mad); +@@ -657,19 +626,20 @@ HWY_NOINLINE FuncOutput EmptyFunc(const + + // Returns overhead of accessing inputs[] and calling a function; this will + // be deducted from future TotalDuration return values. +-Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) { ++timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs, ++ const Params& p) { + double rel_mad; + // Zero tolerance because repeatability is crucial and EmptyFunc is fast. 
+ return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() { + for (const FuncInput input : *inputs) { +- PreventElision(EmptyFunc(arg, input)); ++ platform::PreventElision(EmptyFunc(arg, input)); + } + }); + } + + } // namespace + +-int Unpredictable1() { return timer::Start64() != ~0ULL; } ++int Unpredictable1() { return timer::Start() != ~0ULL; } + + size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, + const size_t num_inputs, Result* results, const Params& p) { +@@ -685,32 +655,35 @@ size_t Measure(const Func func, const ui + ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p); + InputVec subset(full.size() - num_skip); + +- const Ticks overhead = Overhead(arg, &full, p); +- const Ticks overhead_skip = Overhead(arg, &subset, p); ++ const timer::Ticks overhead = Overhead(arg, &full, p); ++ const timer::Ticks overhead_skip = Overhead(arg, &subset, p); + if (overhead < overhead_skip) { +- fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead, +- overhead_skip); ++ fprintf(stderr, "Measurement failed: overhead %zu < %zu\n", ++ size_t(overhead), size_t(overhead_skip)); + return 0; + } + + if (p.verbose) { +- printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(), +- overhead, overhead_skip); ++ printf("#inputs=%5zu,%5zu overhead=%5zu,%5zu\n", full.size(), subset.size(), ++ size_t(overhead), size_t(overhead_skip)); + } + + double max_rel_mad = 0.0; +- const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); ++ const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); + + for (size_t i = 0; i < unique.size(); ++i) { + FillSubset(full, unique[i], num_skip, &subset); +- const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad); ++ const timer::Ticks total_skip = ++ TotalDuration(func, arg, &subset, p, &max_rel_mad); + + if (total < total_skip) { +- fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip); ++ fprintf(stderr, "Measurement failed: total %zu < %zu\n", size_t(total), ++ size_t(total_skip)); + return 0; + } + +- const Ticks duration = (total - overhead) - (total_skip - overhead_skip); ++ const timer::Ticks duration = ++ (total - overhead) - (total_skip - overhead_skip); + results[i].input = unique[i]; + results[i].ticks = static_cast(duration) * mul; + results[i].variability = static_cast(max_rel_mad); +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark.h +--- chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark.h 2021-07-26 17:17:12.094291603 -0400 +@@ -44,11 +44,6 @@ + // central tendency of the measurement samples with the "half sample mode", + // which is more robust to outliers and skewed data than the mean or median. + +-// WARNING if included from multiple translation units compiled with distinct +-// flags: this header requires textual inclusion and a predefined NB_NAMESPACE +-// macro that is unique to the current compile flags. We must also avoid +-// standard library headers such as vector and functional that define functions. +- + #include + #include + +@@ -79,6 +74,16 @@ namespace platform { + // This call may be expensive, callers should cache the result. + double InvariantTicksPerSecond(); + ++// Returns current timestamp [in seconds] relative to an unspecified origin. 
++// Features: monotonic (no negative elapsed time), steady (unaffected by system ++// time changes), high-resolution (on the order of microseconds). ++double Now(); ++ ++// Returns ticks elapsed in back to back timer calls, i.e. a function of the ++// timer resolution (minimum measurable difference) and overhead. ++// This call is expensive, callers should cache the result. ++uint64_t TimerResolution(); ++ + } // namespace platform + + // Returns 1, but without the compiler knowing what the value is. This prevents +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark_test.cc.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark_test.cc +--- chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark_test.cc.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/nanobenchmark_test.cc 2021-07-26 17:10:30.283171481 -0400 +@@ -15,11 +15,11 @@ + #include "hwy/nanobenchmark.h" + + #include +-#include // strtol +-#include // sleep + + #include + ++#include "hwy/tests/test_util-inl.h" ++ + namespace hwy { + namespace { + +@@ -31,6 +31,7 @@ FuncOutput Div(const void*, FuncInput in + + template + void MeasureDiv(const FuncInput (&inputs)[N]) { ++ printf("Measuring integer division (output on final two lines)\n"); + Result results[N]; + Params params; + params.max_evals = 4; // avoid test timeout +@@ -66,39 +67,14 @@ void MeasureRandom(const FuncInput (&inp + } + } + +-template +-void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) { +- printf("Expect a 'measurement failed' below:\n"); +- Result results[N]; +- +- const size_t num_results = Measure( +- [](const void*, const FuncInput input) -> FuncOutput { +- // Loop until the sleep succeeds (not interrupted by signal). We assume +- // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit. +- while (sleep(2) != 0) { +- } +- return input; +- }, +- nullptr, inputs, N, results); +- NANOBENCHMARK_CHECK(num_results == 0); +- (void)num_results; +-} +- +-void RunAll(const int argc, char** /*argv*/) { +- // unpredictable == 1 but the compiler doesn't know that. +- const int unpredictable = argc != 999; ++TEST(NanobenchmarkTest, RunAll) { ++ const int unpredictable = Unpredictable1(); // == 1, unknown to compiler. + static const FuncInput inputs[] = {static_cast(unpredictable) + 2, + static_cast(unpredictable + 9)}; + + MeasureDiv(inputs); + MeasureRandom(inputs); +- EnsureLongMeasurementFails(inputs); + } + + } // namespace + } // namespace hwy +- +-int main(int argc, char* argv[]) { +- hwy::RunAll(argc, argv); +- return 0; +-} +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/ops/arm_neon-inl.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/ops/arm_neon-inl.h +--- chromium-92.0.4515.107/third_party/highway/src/hwy/ops/arm_neon-inl.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/ops/arm_neon-inl.h 2021-07-26 17:20:19.294142914 -0400 +@@ -26,6 +26,8 @@ HWY_BEFORE_NAMESPACE(); + namespace hwy { + namespace HWY_NAMESPACE { + ++namespace detail { // for code folding and Raw128 ++ + // Macros used to define single and double function calls for multiple types + // for full and half vectors. These macros are undefined at the end of the file. 
+ +@@ -437,12 +439,14 @@ struct Raw128 { + using type = int8x8_t; + }; + ++} // namespace detail ++ + template + using Full128 = Simd; + + template + class Vec128 { +- using Raw = typename Raw128::type; ++ using Raw = typename detail::Raw128::type; + + public: + HWY_INLINE Vec128() {} +@@ -480,7 +484,8 @@ class Vec128 { + // FF..FF or 0, also for floating-point - see README. + template + class Mask128 { +- using Raw = typename Raw128::type; ++ // ARM C Language Extensions return and expect unsigned type. ++ using Raw = typename detail::Raw128, N>::type; + + public: + HWY_INLINE Mask128() {} +@@ -664,15 +669,25 @@ template + HWY_INLINE Vec128 Undefined(Simd /*d*/) { + HWY_DIAGNOSTICS(push) + HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") +- typename Raw128::type a; ++ typename detail::Raw128::type a; + return Vec128(a); + HWY_DIAGNOSTICS(pop) + } + +-// ------------------------------ Extract lane ++// Returns a vector with lane i=[0, N) set to "first" + i. ++template ++Vec128 Iota(const Simd d, const T2 first) { ++ HWY_ALIGN T lanes[16 / sizeof(T)]; ++ for (size_t i = 0; i < 16 / sizeof(T); ++i) { ++ lanes[i] = static_cast(first + static_cast(i)); ++ } ++ return Load(d, lanes); ++} ++ ++// ------------------------------ GetLane + + HWY_INLINE uint8_t GetLane(const Vec128 v) { +- return vget_lane_u8(vget_low_u8(v.raw), 0); ++ return vgetq_lane_u8(v.raw, 0); + } + template + HWY_INLINE uint8_t GetLane(const Vec128 v) { +@@ -680,7 +695,7 @@ HWY_INLINE uint8_t GetLane(const Vec128< + } + + HWY_INLINE int8_t GetLane(const Vec128 v) { +- return vget_lane_s8(vget_low_s8(v.raw), 0); ++ return vgetq_lane_s8(v.raw, 0); + } + template + HWY_INLINE int8_t GetLane(const Vec128 v) { +@@ -688,7 +703,7 @@ HWY_INLINE int8_t GetLane(const Vec128 v) { +- return vget_lane_u16(vget_low_u16(v.raw), 0); ++ return vgetq_lane_u16(v.raw, 0); + } + template + HWY_INLINE uint16_t GetLane(const Vec128 v) { +@@ -696,7 +711,7 @@ HWY_INLINE uint16_t GetLane(const Vec128 + } + + HWY_INLINE int16_t GetLane(const Vec128 v) { +- return vget_lane_s16(vget_low_s16(v.raw), 0); ++ return vgetq_lane_s16(v.raw, 0); + } + template + HWY_INLINE int16_t GetLane(const Vec128 v) { +@@ -704,7 +719,7 @@ HWY_INLINE int16_t GetLane(const Vec128< + } + + HWY_INLINE uint32_t GetLane(const Vec128 v) { +- return vget_lane_u32(vget_low_u32(v.raw), 0); ++ return vgetq_lane_u32(v.raw, 0); + } + template + HWY_INLINE uint32_t GetLane(const Vec128 v) { +@@ -712,7 +727,7 @@ HWY_INLINE uint32_t GetLane(const Vec128 + } + + HWY_INLINE int32_t GetLane(const Vec128 v) { +- return vget_lane_s32(vget_low_s32(v.raw), 0); ++ return vgetq_lane_s32(v.raw, 0); + } + template + HWY_INLINE int32_t GetLane(const Vec128 v) { +@@ -720,20 +735,20 @@ HWY_INLINE int32_t GetLane(const Vec128< + } + + HWY_INLINE uint64_t GetLane(const Vec128 v) { +- return vget_lane_u64(vget_low_u64(v.raw), 0); ++ return vgetq_lane_u64(v.raw, 0); + } + HWY_INLINE uint64_t GetLane(const Vec128 v) { + return vget_lane_u64(v.raw, 0); + } + HWY_INLINE int64_t GetLane(const Vec128 v) { +- return vget_lane_s64(vget_low_s64(v.raw), 0); ++ return vgetq_lane_s64(v.raw, 0); + } + HWY_INLINE int64_t GetLane(const Vec128 v) { + return vget_lane_s64(v.raw, 0); + } + + HWY_INLINE float GetLane(const Vec128 v) { +- return vget_lane_f32(vget_low_f32(v.raw), 0); ++ return vgetq_lane_f32(v.raw, 0); + } + HWY_INLINE float GetLane(const Vec128 v) { + return vget_lane_f32(v.raw, 0); +@@ -743,7 +758,7 @@ HWY_INLINE float GetLane(const Vec128 v) { +- return vget_lane_f64(vget_low_f64(v.raw), 
0); ++ return vgetq_lane_f64(v.raw, 0); + } + HWY_INLINE double GetLane(const Vec128 v) { + return vget_lane_f64(v.raw, 0); +@@ -785,8 +800,6 @@ HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSu + // ------------------------------ Average + + // Returns (a + b + 1) / 2 +- +-// Unsigned + HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) + HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) + +@@ -802,6 +815,7 @@ HWY_INLINE Vec128 Abs(const Vec + HWY_INLINE Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_s32(v.raw)); + } ++// i64 is implemented after BroadcastSignBit. + HWY_INLINE Vec128 Abs(const Vec128 v) { + return Vec128(vabsq_f32(v.raw)); + } +@@ -1184,21 +1198,34 @@ HWY_INLINE Vec128 ApproximateR + #if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) + #else +-// Emulated with approx reciprocal + Newton-Raphson + mul ++// Not defined on armv7: approximate ++namespace detail { ++ ++HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( ++ const Vec128 recip, const Vec128 divisor) { ++ return Vec128(vrecpsq_f32(recip.raw, divisor.raw)); ++} ++template ++HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( ++ const Vec128 recip, Vec128 divisor) { ++ return Vec128(vrecps_f32(recip.raw, divisor.raw)); ++} ++ ++} // namespace detail ++ + template + HWY_INLINE Vec128 operator/(const Vec128 a, + const Vec128 b) { + auto x = ApproximateReciprocal(b); +- // Newton-Raphson on 1/x - b +- const auto two = Set(Simd(), 2); +- x = x * (two - b * x); +- x = x * (two - b * x); +- x = x * (two - b * x); ++ x *= detail::ReciprocalNewtonRaphsonStep(x, b); ++ x *= detail::ReciprocalNewtonRaphsonStep(x, b); ++ x *= detail::ReciprocalNewtonRaphsonStep(x, b); + return a * x; + } + #endif + +-// Absolute value of difference. ++// ------------------------------ Absolute value of difference. ++ + HWY_INLINE Vec128 AbsDiff(const Vec128 a, const Vec128 b) { + return Vec128(vabdq_f32(a.raw, b.raw)); + } +@@ -1312,7 +1339,7 @@ HWY_INLINE Vec128 NegMulSub(c + } + #endif + +-// ------------------------------ Floating-point square root ++// ------------------------------ Floating-point square root (IfThenZeroElse) + + // Approximate reciprocal square root + HWY_INLINE Vec128 ApproximateReciprocalSqrt(const Vec128 v) { +@@ -1328,77 +1355,33 @@ HWY_INLINE Vec128 ApproximateR + #if HWY_ARCH_ARM_A64 + HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) + #else +-// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt. +-template +-HWY_INLINE Vec128 Sqrt(const Vec128 v) { +- auto b = v; +- auto Y = ApproximateReciprocalSqrt(v); +- auto x = v * Y; +- const auto half = Set(Simd(), 0.5); +- const auto oneandhalf = Set(Simd(), 1.5); +- for (size_t i = 0; i < 3; i++) { +- b = b * Y * Y; +- Y = oneandhalf - half * b; +- x = x * Y; +- } +- return IfThenZeroElse(v == Zero(Simd()), x); +-} +-#endif +- +-// ================================================== COMPARE +- +-// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
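The armv7 operator/ emulation above hinges on the semantics of vrecps (wrapped in ReciprocalNewtonRaphsonStep): the instruction returns 2 - recip * divisor, so multiplying the running estimate by that value is exactly one Newton-Raphson iteration x <- x * (2 - b * x) for 1/b, roughly doubling the number of correct bits per step. A standalone scalar model of the three refinement steps; the 0.3 seed stands in for the ~8-bit-accurate vrecpe estimate:

#include <stdio.h>

// Scalar equivalent of one vrecps step: 2 - recip * divisor.
static float ReciprocalStep(float recip, float divisor) {
  return 2.0f - recip * divisor;
}

int main() {
  const float b = 3.0f;
  float x = 0.3f;  // coarse initial estimate of 1/b
  for (int i = 0; i < 3; ++i) {
    x *= ReciprocalStep(x, b);
    printf("step %d: x = %.9f\n", i, x);
  }
  printf("exact: %.9f\n", 1.0f / b);  // a / b is then computed as a * x
  return 0;
}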
++namespace detail { + +-template +-HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { +- static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); +- return Mask128{m.raw}; ++HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, ++ const Vec128 recip) { ++ return Vec128(vrsqrtsq_f32(root.raw, recip.raw)); ++} ++template ++HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, ++ Vec128 recip) { ++ return Vec128(vrsqrts_f32(root.raw, recip.raw)); + } + +-#define HWY_NEON_BUILD_TPL_HWY_COMPARE +-#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 +-#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ +- const Vec128 a, const Vec128 b +-#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw +- +-// ------------------------------ Equality +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) +-#if HWY_ARCH_ARM_A64 +-HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) +-#else +-// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. +-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) +-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) +-#endif ++} // namespace detail + +-// ------------------------------ Strict inequality ++// Not defined on armv7: approximate ++template ++HWY_INLINE Vec128 Sqrt(const Vec128 v) { ++ auto recip = ApproximateReciprocalSqrt(v); + +-// Signed/float < (no unsigned) +-#if HWY_ARCH_ARM_A64 +-HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) +-#else +-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) +-#endif +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) ++ recip *= detail::ReciprocalSqrtStep(v * recip, recip); ++ recip *= detail::ReciprocalSqrtStep(v * recip, recip); ++ recip *= detail::ReciprocalSqrtStep(v * recip, recip); + +-// Signed/float > (no unsigned) +-#if HWY_ARCH_ARM_A64 +-HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE) +-#else +-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE) ++ const auto root = v * recip; ++ return IfThenZeroElse(v == Zero(Simd()), root); ++} + #endif +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE) +- +-// ------------------------------ Weak inequality +- +-// Float <= >= +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) +-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE) +- +-#undef HWY_NEON_BUILD_TPL_HWY_COMPARE +-#undef HWY_NEON_BUILD_RET_HWY_COMPARE +-#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE +-#undef HWY_NEON_BUILD_ARG_HWY_COMPARE + + // ================================================== LOGICAL + +@@ -1407,13 +1390,16 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operato + // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. 
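The armv7 Sqrt replacement above follows the same pattern with vrsqrts, which computes (3 - a * b) / 2. Feeding it a = v * recip and b = recip yields the Newton-Raphson update x <- x * (3 - v * x * x) / 2 for 1/sqrt(v); the final multiply by v produces sqrt(v), and the IfThenZeroElse guard handles v == 0, where the reciprocal estimate is infinite and v * recip would be NaN. A standalone scalar model:

#include <math.h>
#include <stdio.h>

// Scalar equivalent of one vrsqrts step: (3 - root * recip) / 2.
static float ReciprocalSqrtStep(float root, float recip) {
  return (3.0f - root * recip) * 0.5f;
}

int main() {
  const float v = 2.0f;
  float recip = 0.7f;  // coarse estimate of 1/sqrt(v), as from vrsqrte
  for (int i = 0; i < 3; ++i) {
    recip *= ReciprocalSqrtStep(v * recip, recip);
  }
  const float root = (v == 0.0f) ? 0.0f : v * recip;  // mirrors IfThenZeroElse
  printf("approx %.9f vs libm %.9f\n", root, sqrtf(v));
  return 0;
}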
+ template + HWY_INLINE Vec128 Not(const Vec128 v) { +- const Full128 d8; +- return Vec128(vmvnq_u8(BitCast(d8, v).raw)); ++ const Full128 d; ++ const Repartition d8; ++ return BitCast(d, Vec128(vmvnq_u8(BitCast(d8, v).raw))); + } + template + HWY_INLINE Vec128 Not(const Vec128 v) { +- const Repartition> d8; +- return Vec128(vmvn_u8(BitCast(d8, v).raw)); ++ const Simd d; ++ const Repartition d8; ++ using V8 = decltype(Zero(d8)); ++ return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); + } + + // ------------------------------ And +@@ -1513,33 +1499,38 @@ HWY_API Vec128 BroadcastSignBit(co + return ShiftRight(v); + } + +-// ------------------------------ Make mask ++// ================================================== MASK + +-template +-HWY_INLINE Mask128 TestBit(Vec128 v, Vec128 bit) { +- static_assert(!hwy::IsFloat(), "Only integer vectors supported"); +- return (v & bit) == bit; +-} ++// ------------------------------ To/from vector + +-// Mask and Vec are the same (true = FF..FF). ++// Mask and Vec have the same representation (true = FF..FF). + template + HWY_INLINE Mask128 MaskFromVec(const Vec128 v) { +- return Mask128(v.raw); ++ const Simd, N> du; ++ return Mask128(BitCast(du, v).raw); + } + ++// DEPRECATED + template + HWY_INLINE Vec128 VecFromMask(const Mask128 v) { +- return Vec128(v.raw); ++ return BitCast(Simd(), Vec128, N>(v.raw)); + } + + template +-HWY_INLINE Vec128 VecFromMask(Simd /* tag */, +- const Mask128 v) { +- return Vec128(v.raw); ++HWY_INLINE Vec128 VecFromMask(Simd d, const Mask128 v) { ++ return BitCast(d, Vec128, N>(v.raw)); ++} ++ ++// ------------------------------ RebindMask ++ ++template ++HWY_API Mask128 RebindMask(Simd dto, Mask128 m) { ++ static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); ++ return MaskFromVec(BitCast(dto, VecFromMask(Simd(), m))); + } + +-// IfThenElse(mask, yes, no) +-// Returns mask ? b : a. ++// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a. ++ + #define HWY_NEON_BUILD_TPL_HWY_IF + #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128 + #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ +@@ -1574,7 +1565,6 @@ HWY_INLINE Vec128 ZeroIfNegative(V + return Max(zero, v); + } + +- + // ------------------------------ Mask logical + + template +@@ -1607,30 +1597,183 @@ HWY_API Mask128 Xor(const Mask128< + return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); + } + +-// ------------------------------ Min (IfThenElse, BroadcastSignBit) ++// ================================================== COMPARE + +-namespace detail { ++// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
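The mask helpers above (MaskFromVec, VecFromMask, RebindMask) all rest on the representation restated here: a true lane is all 1-bits regardless of lane type, so a mask survives bitwise reinterpretation between lane types of equal size. A scalar sketch of that invariant, with memcpy standing in for BitCast:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main() {
  const uint32_t true_lane = 0xFFFFFFFFu;  // "true", also for float lanes
  float as_float;
  memcpy(&as_float, &true_lane, sizeof(as_float));  // u32 -> f32 reinterpret
  uint32_t back;
  memcpy(&back, &as_float, sizeof(back));           // f32 -> u32 reinterpret
  // No bits change, so masks over float and uint32_t lanes can describe the
  // same lanes; RebindMask is just this round-trip at vector width.
  printf("bits preserved: %s\n", back == true_lane ? "yes" : "no");
  return 0;
}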
++ ++// ------------------------------ Shuffle2301 (for i64 compares) ++ ++// Swap 32-bit halves in 64-bits ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64_u32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64_s32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64_f32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64q_u32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64q_s32(v.raw)); ++} ++HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { ++ return Vec128(vrev64q_f32(v.raw)); ++} ++ ++#define HWY_NEON_BUILD_TPL_HWY_COMPARE ++#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 ++#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ ++ const Vec128 a, const Vec128 b ++#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw + ++// ------------------------------ Equality ++HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) + #if HWY_ARCH_ARM_A64 ++HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) ++#else ++// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. ++HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) ++HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) ++#endif + +-HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { +- return Vec128(vcgtq_u64(a.raw, b.raw)); ++// ------------------------------ Strict inequality (signed, float) ++#if HWY_ARCH_ARM_A64 ++HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) ++#else ++HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) ++#endif ++HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) ++ ++// ------------------------------ Weak inequality (float) ++HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) ++ ++#undef HWY_NEON_BUILD_TPL_HWY_COMPARE ++#undef HWY_NEON_BUILD_RET_HWY_COMPARE ++#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE ++#undef HWY_NEON_BUILD_ARG_HWY_COMPARE ++ ++// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq) ++ ++#if HWY_ARCH_ARM_V7 ++ ++template ++HWY_INLINE Mask128 operator==(const Vec128 a, ++ const Vec128 b) { ++ const Simd d32; ++ const Simd d64; ++ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); ++ const auto cmp64 = cmp32 & Shuffle2301(cmp32); ++ return MaskFromVec(BitCast(d64, cmp64)); + } +-HWY_INLINE Vec128 Gt(Vec128 a, +- Vec128 b) { +- return Vec128(vcgt_u64(a.raw, b.raw)); ++ ++template ++HWY_INLINE Mask128 operator==(const Vec128 a, ++ const Vec128 b) { ++ const Simd d32; ++ const Simd d64; ++ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); ++ const auto cmp64 = cmp32 & Shuffle2301(cmp32); ++ return MaskFromVec(BitCast(d64, cmp64)); + } + +-HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { +- return Vec128(vcgtq_s64(a.raw, b.raw)); ++HWY_INLINE Mask128 operator<(const Vec128 a, ++ const Vec128 b) { ++ const int64x2_t sub = vqsubq_s64(a.raw, b.raw); ++ return MaskFromVec(BroadcastSignBit(Vec128(sub))); + } +-HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { +- return Vec128(vcgt_s64(a.raw, b.raw)); ++HWY_INLINE Mask128 operator<(const Vec128 a, ++ const Vec128 b) { ++ const int64x1_t sub = vqsub_s64(a.raw, b.raw); ++ return MaskFromVec(BroadcastSignBit(Vec128(sub))); + } + + #endif + +-} // namespace detail ++// ------------------------------ Reversed comparisons ++ ++template ++HWY_API Mask128 operator>(Vec128 a, Vec128 b) { ++ return operator<(b, a); ++} 
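Worth spelling out: the ARMv7 64-bit equality above compares as 32-bit lanes, then ANDs the result with its Shuffle2301 (half-swapped) copy, so a 64-bit lane stays all-ones only when both of its halves matched; operator< instead uses saturating subtraction, whose sign bit is set exactly when a < b (saturation prevents wraparound). A scalar model of the equality trick for one 64-bit lane:

#include <stdint.h>
#include <stdio.h>

int main() {
  const uint64_t a = 0x1111111122222222ull;
  const uint64_t b = 0x1111111133333333ull;
  // Per-half compare results, as produced by the 32-bit vceq.
  const uint32_t eq_lo = ((uint32_t)a == (uint32_t)b) ? 0xFFFFFFFFu : 0u;
  const uint32_t eq_hi = ((a >> 32) == (b >> 32)) ? 0xFFFFFFFFu : 0u;
  // Shuffle2301 swaps the halves; AND makes each half depend on both.
  const uint32_t both = eq_lo & eq_hi;
  const uint64_t lane_mask = ((uint64_t)both << 32) | both;
  printf("equality mask: %016llx\n", (unsigned long long)lane_mask);  // 0 here
  return 0;
}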
++template ++HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { ++ return operator<=(b, a); ++} ++ ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask128 FirstN(const Simd d, size_t num) { ++ const RebindToSigned di; // Signed comparisons are cheaper. ++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); ++} ++ ++// ------------------------------ TestBit (Eq) ++ ++#define HWY_NEON_BUILD_TPL_HWY_TESTBIT ++#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128 ++#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ ++ Vec128 v, Vec128 bit ++#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw ++ ++#if HWY_ARCH_ARM_A64 ++HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) ++#else ++// No 64-bit versions on armv7 ++HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) ++HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) ++ ++template ++HWY_INLINE Mask128 TestBit(Vec128 v, ++ Vec128 bit) { ++ return (v & bit) == bit; ++} ++template ++HWY_INLINE Mask128 TestBit(Vec128 v, ++ Vec128 bit) { ++ return (v & bit) == bit; ++} ++ ++#endif ++#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT ++#undef HWY_NEON_BUILD_RET_HWY_TESTBIT ++#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT ++#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT ++ ++// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) ++HWY_INLINE Vec128 Abs(const Vec128 v) { ++#if HWY_ARCH_ARM_A64 ++ return Vec128(vabsq_s64(v.raw)); ++#else ++ const auto zero = Zero(Full128()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++HWY_INLINE Vec128 Abs(const Vec128 v) { ++#if HWY_ARCH_ARM_A64 ++ return Vec128(vabs_s64(v.raw)); ++#else ++ const auto zero = Zero(Simd()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++ ++// ------------------------------ Min (IfThenElse, BroadcastSignBit) ++ ++#if HWY_ARCH_ARM_A64 ++ ++HWY_INLINE Mask128 operator<(Vec128 a, Vec128 b) { ++ return Mask128(vcltq_u64(a.raw, b.raw)); ++} ++HWY_INLINE Mask128 operator<(Vec128 a, ++ Vec128 b) { ++ return Mask128(vclt_u64(a.raw, b.raw)); ++} ++ ++#endif + + // Unsigned + HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2) +@@ -1639,7 +1782,7 @@ template + HWY_INLINE Vec128 Min(const Vec128 a, + const Vec128 b) { + #if HWY_ARCH_ARM_A64 +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); ++ return IfThenElse(b < a, b, a); + #else + const Simd du; + const Simd di; +@@ -1654,7 +1797,7 @@ template + HWY_INLINE Vec128 Min(const Vec128 a, + const Vec128 b) { + #if HWY_ARCH_ARM_A64 +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); ++ return IfThenElse(b < a, b, a); + #else + const Vec128 sign = detail::SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b); +@@ -1677,7 +1820,7 @@ template + HWY_INLINE Vec128 Max(const Vec128 a, + const Vec128 b) { + #if HWY_ARCH_ARM_A64 +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); ++ return IfThenElse(b < a, a, b); + #else + const Simd du; + const Simd di; +@@ -1692,7 +1835,7 @@ template + HWY_INLINE Vec128 Max(const Vec128 a, + const Vec128 b) { + #if HWY_ARCH_ARM_A64 +- return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); ++ return IfThenElse(b < a, a, b); + #else + const Vec128 sign = detail::SaturatedSub(a, b); + return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a); +@@ -1805,73 +1948,72 @@ HWY_INLINE Vec128 LoadU(Simd< + // we don't actually care what is in it, and we don't want + // to introduce extra overhead by initializing it to 
something. + +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint8_t* HWY_RESTRICT p) { +- uint32x2_t a = Undefined(d).raw; ++ uint32x2_t a = Undefined(Simd()).raw; + uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_u8_u32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint16_t* HWY_RESTRICT p) { +- uint32x2_t a = Undefined(d).raw; ++ uint32x2_t a = Undefined(Simd()).raw; + uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_u16_u32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint32_t* HWY_RESTRICT p) { +- uint32x2_t a = Undefined(d).raw; ++ uint32x2_t a = Undefined(Simd()).raw; + uint32x2_t b = vld1_lane_u32(p, a, 0); + return Vec128(b); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int8_t* HWY_RESTRICT p) { +- int32x2_t a = Undefined(d).raw; ++ int32x2_t a = Undefined(Simd()).raw; + int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_s8_s32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int16_t* HWY_RESTRICT p) { +- int32x2_t a = Undefined(d).raw; ++ int32x2_t a = Undefined(Simd()).raw; + int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_s16_s32(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int32_t* HWY_RESTRICT p) { +- int32x2_t a = Undefined(d).raw; ++ int32x2_t a = Undefined(Simd()).raw; + int32x2_t b = vld1_lane_s32(p, a, 0); + return Vec128(b); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const float* HWY_RESTRICT p) { +- float32x2_t a = Undefined(d).raw; ++ float32x2_t a = Undefined(Simd()).raw; + float32x2_t b = vld1_lane_f32(p, a, 0); + return Vec128(b); + } + + // ------------------------------ Load 16 + +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint8_t* HWY_RESTRICT p) { +- uint16x4_t a = Undefined(d).raw; ++ uint16x4_t a = Undefined(Simd()).raw; + uint16x4_t b = vld1_lane_u16(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_u8_u16(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const uint16_t* HWY_RESTRICT p) { +- uint16x4_t a = Undefined(d).raw; ++ uint16x4_t a = Undefined(Simd()).raw; + uint16x4_t b = vld1_lane_u16(p, a, 0); + return Vec128(b); + } +- +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int8_t* HWY_RESTRICT p) { +- int16x4_t a = Undefined(d).raw; ++ int16x4_t a = Undefined(Simd()).raw; + int16x4_t b = vld1_lane_s16(reinterpret_cast(p), a, 0); + return Vec128(vreinterpret_s8_s16(b)); + } +-HWY_INLINE Vec128 LoadU(Simd d, ++HWY_INLINE Vec128 LoadU(Simd /*tag*/, + const int16_t* HWY_RESTRICT p) { +- int16x4_t a = Undefined(d).raw; ++ int16x4_t a = Undefined(Simd()).raw; + int16x4_t b = vld1_lane_s16(p, a, 0); + return Vec128(b); + } +@@ -2009,12 +2151,12 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + uint8_t* HWY_RESTRICT p) { + uint32x2_t a = vreinterpret_u32_u8(v.raw); +- vst1_lane_u32(p, a, 0); ++ vst1_lane_u32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + uint16_t* HWY_RESTRICT p) { + uint32x2_t a = vreinterpret_u32_u16(v.raw); +- vst1_lane_u32(p, a, 0); ++ vst1_lane_u32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + uint32_t* HWY_RESTRICT p) { +@@ 
-2023,12 +2165,12 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + int8_t* HWY_RESTRICT p) { + int32x2_t a = vreinterpret_s32_s8(v.raw); +- vst1_lane_s32(p, a, 0); ++ vst1_lane_s32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + int16_t* HWY_RESTRICT p) { + int32x2_t a = vreinterpret_s32_s16(v.raw); +- vst1_lane_s32(p, a, 0); ++ vst1_lane_s32(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + int32_t* HWY_RESTRICT p) { +@@ -2044,7 +2186,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + uint8_t* HWY_RESTRICT p) { + uint16x4_t a = vreinterpret_u16_u8(v.raw); +- vst1_lane_u16(p, a, 0); ++ vst1_lane_u16(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + uint16_t* HWY_RESTRICT p) { +@@ -2053,7 +2195,7 @@ HWY_INLINE void StoreU(const Vec128 v, Simd, + int8_t* HWY_RESTRICT p) { + int16x4_t a = vreinterpret_s16_s8(v.raw); +- vst1_lane_s16(p, a, 0); ++ vst1_lane_s16(reinterpret_cast(p), a, 0); + } + HWY_INLINE void StoreU(const Vec128 v, Simd, + int16_t* HWY_RESTRICT p) { +@@ -2118,18 +2260,18 @@ HWY_INLINE Vec128 PromoteTo(Fu + const Vec128 v) { + return Vec128(vmovl_u32(v.raw)); + } +-HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, ++HWY_INLINE Vec128 PromoteTo(Full128 d, + const Vec128 v) { +- return Vec128(vmovl_u8(v.raw)); ++ return BitCast(d, Vec128(vmovl_u8(v.raw))); + } +-HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, ++HWY_INLINE Vec128 PromoteTo(Full128 d, + const Vec128 v) { + uint16x8_t a = vmovl_u8(v.raw); +- return Vec128(vreinterpretq_s32_u16(vmovl_u16(vget_low_u16(a)))); ++ return BitCast(d, Vec128(vmovl_u16(vget_low_u16(a)))); + } +-HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, ++HWY_INLINE Vec128 PromoteTo(Full128 d, + const Vec128 v) { +- return Vec128(vmovl_u16(v.raw)); ++ return BitCast(d, Vec128(vmovl_u16(v.raw))); + } + + // Unsigned: zero-extend to half vector. +@@ -2155,9 +2297,9 @@ HWY_INLINE Vec128 PromoteTo + return Vec128(vget_low_u64(vmovl_u32(v.raw))); + } + template +-HWY_INLINE Vec128 PromoteTo(Simd /* tag */, ++HWY_INLINE Vec128 PromoteTo(Simd d, + const Vec128 v) { +- return Vec128(vget_low_s16(vmovl_u8(v.raw))); ++ return BitCast(d, Vec128(vget_low_u16(vmovl_u8(v.raw)))); + } + template + HWY_INLINE Vec128 PromoteTo(Simd /* tag */, +@@ -2220,12 +2362,14 @@ HWY_INLINE Vec128 PromoteTo( + + HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, + const Vec128 v) { +- return Vec128(vcvt_f32_f16(vreinterpret_f16_u16(v.raw))); ++ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); ++ return Vec128(f32); + } + template + HWY_INLINE Vec128 PromoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128(vget_low_f32(vcvt_f32_f16(v.raw))); ++ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); ++ return Vec128(vget_low_f32(f32)); + } + + #else +@@ -2353,7 +2497,8 @@ HWY_INLINE Vec128 DemoteTo + template + HWY_INLINE Vec128 DemoteTo(Simd /* tag */, + const Vec128 v) { +- return Vec128{vcvt_f16_f32(vcombine_f32(v.raw, v.raw))}; ++ const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw)); ++ return Vec128(vreinterpret_u16_f16(f16)); + } + + #else +@@ -2965,33 +3110,58 @@ HWY_INLINE Vec128 TableLookupBytes + BitCast(d8, from).raw))); + } + +-// ------------------------------ Hard-coded shuffles ++// ------------------------------ TableLookupLanes + +-// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). +-// Shuffle0321 rotates one lane to the right (the previous least-significant +-// lane is now most-significant). 
These could also be implemented via +-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. ++// Returned by SetTableIndices for use by TableLookupLanes. ++template ++struct Indices128 { ++ typename detail::Raw128::type raw; ++}; + +-// Swap 32-bit halves in 64-bits +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64_u32(v.raw)); +-} +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64_s32(v.raw)); +-} +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64_f32(v.raw)); ++template ++HWY_INLINE Indices128 SetTableIndices(Simd d, const int32_t* idx) { ++#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) ++ for (size_t i = 0; i < N; ++i) { ++ HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); ++ } ++#endif ++ ++ const Repartition d8; ++ alignas(16) uint8_t control[16] = {0}; ++ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { ++ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { ++ control[idx_lane * sizeof(T) + idx_byte] = ++ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); ++ } ++ } ++ return Indices128{BitCast(d, Load(d8, control)).raw}; + } +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64q_u32(v.raw)); ++ ++template ++HWY_INLINE Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64q_s32(v.raw)); ++template ++HWY_INLINE Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { +- return Vec128(vrev64q_f32(v.raw)); ++template ++HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ const Simd di; ++ const auto idx_i = BitCast(di, Vec128{idx.raw}); ++ return BitCast(Simd(), TableLookupBytes(BitCast(di, v), idx_i)); + } + ++// ------------------------------ Other shuffles (TableLookupBytes) ++ ++// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). ++// Shuffle0321 rotates one lane to the right (the previous least-significant ++// lane is now most-significant). These could also be implemented via ++// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. ++ + // Swap 64-bit halves + template + HWY_INLINE Vec128 Shuffle1032(const Vec128 v) { +@@ -3029,49 +3199,6 @@ HWY_INLINE Vec128 Shuffle0123(const V + return TableLookupBytes(v, BitCast(d, Load(d8, bytes))); + } + +-// ------------------------------ TableLookupLanes +- +-// Returned by SetTableIndices for use by TableLookupLanes. 
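The new SetTableIndices above lowers lane indices to the byte granularity that TableLookupBytes operates on: lane index idx[i] expands to the sizeof(T) consecutive byte offsets idx[i]*sizeof(T) + 0 .. idx[i]*sizeof(T) + sizeof(T)-1. A standalone sketch of the expansion for four uint32_t lanes with indices {1, 0, 3, 2}:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main() {
  const int32_t idx[4] = {1, 0, 3, 2};         // lane indices
  const size_t kLaneBytes = sizeof(uint32_t);  // 4 bytes per lane
  uint8_t control[16] = {0};
  for (size_t lane = 0; lane < 4; ++lane) {
    for (size_t byte = 0; byte < kLaneBytes; ++byte) {
      control[lane * kLaneBytes + byte] =
          (uint8_t)(idx[lane] * kLaneBytes + byte);
    }
  }
  // Prints: 4 5 6 7 0 1 2 3 12 13 14 15 8 9 10 11
  for (size_t i = 0; i < 16; ++i) printf("%d ", control[i]);
  printf("\n");
  return 0;
}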
+-template +-struct Indices128 { +- typename Raw128::type raw; +-}; +- +-template +-HWY_INLINE Indices128 SetTableIndices(const Full128, const int32_t* idx) { +-#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) +- const size_t N = 16 / sizeof(T); +- for (size_t i = 0; i < N; ++i) { +- HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); +- } +-#endif +- +- const Full128 d8; +- alignas(16) uint8_t control[16]; +- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { +- const size_t idx_lane = idx_byte / sizeof(T); +- const size_t mod = idx_byte % sizeof(T); +- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; +- } +- return Indices128{BitCast(Full128(), Load(d8, control)).raw}; +-} +- +-HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128(idx.raw)); +-} +-HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128(idx.raw)); +-} +-HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- const Full128 di; +- const Full128 df; +- return BitCast(df, +- TableLookupBytes(BitCast(di, v), Vec128(idx.raw))); +-} +- + // ------------------------------ Interleave lanes + + // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides +@@ -3334,16 +3461,6 @@ HWY_INLINE Vec128 OddEven(const Vec12 + + // ================================================== MISC + +-// Returns a vector with lane i=[0, N) set to "first" + i. +-template +-Vec128 Iota(const Simd d, const T2 first) { +- HWY_ALIGN T lanes[16 / sizeof(T)]; +- for (size_t i = 0; i < 16 / sizeof(T); ++i) { +- lanes[i] = static_cast(first + static_cast(i)); +- } +- return Load(d, lanes); +-} +- + // ------------------------------ Scatter (Store) + + template +@@ -3413,52 +3530,44 @@ HWY_API Vec128 GatherIndex(const S + return Load(d, lanes); + } + +-// ------------------------------ ARMv7 int64 comparisons (requires Shuffle2301) ++// ------------------------------ Reductions + +-#if HWY_ARCH_ARM_V7 ++namespace detail { + +-template +-HWY_INLINE Mask128 operator==(const Vec128 a, +- const Vec128 b) { +- const Simd d32; +- const Simd d64; +- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); +- const auto cmp64 = cmp32 & Shuffle2301(cmp32); +- return MaskFromVec(BitCast(d64, cmp64)); ++// N=1 for any T: no-op ++template ++HWY_API Vec128 SumOfLanes(const Vec128 v) { ++ return v; + } +- +-template +-HWY_INLINE Mask128 operator==(const Vec128 a, +- const Vec128 b) { +- const Simd d32; +- const Simd d64; +- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); +- const auto cmp64 = cmp32 & Shuffle2301(cmp32); +- return MaskFromVec(BitCast(d64, cmp64)); ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; + } + +-HWY_INLINE Mask128 operator<(const Vec128 a, +- const Vec128 b) { +- const int64x2_t sub = vqsubq_s64(a.raw, b.raw); +- return MaskFromVec(BroadcastSignBit(Vec128(sub))); ++// u32/i32/f32: N=2 ++template ++HWY_API Vec128 SumOfLanes(const Vec128 v10) { ++ return v10 + Shuffle2301(v10); + } +-HWY_INLINE Mask128 operator<(const Vec128 a, +- const Vec128 b) { +- const int64x1_t sub = vqsub_s64(a.raw, b.raw); +- return MaskFromVec(BroadcastSignBit(Vec128(sub))); ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Min(v10, Shuffle2301(v10)); + } +- +-template 
+-HWY_INLINE Mask128 operator>(const Vec128 a, +- const Vec128 b) { +- return b < a; ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Max(v10, Shuffle2301(v10)); + } +-#endif +- +-// ------------------------------ Reductions + ++// full vectors + #if HWY_ARCH_ARM_A64 +-// Supported for 32b and 64b vector types. Returns the sum in each lane. + HWY_INLINE Vec128 SumOfLanes(const Vec128 v) { + return Vec128(vdupq_n_u32(vaddvq_u32(v.raw))); + } +@@ -3505,20 +3614,15 @@ HWY_INLINE Vec128 SumOfLanes(co + } + #endif + +-namespace detail { +- +-// For u32/i32/f32. +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); +@@ -3526,15 +3630,13 @@ HWY_API Vec128 MaxOfLanes(hwy::Siz + } + + // For u64/i64[/f64]. +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); + } +@@ -3542,6 +3644,10 @@ HWY_API Vec128 MaxOfLanes(hwy::Siz + } // namespace detail + + template ++HWY_API Vec128 SumOfLanes(const Vec128 v) { ++ return detail::SumOfLanes(v); ++} ++template + HWY_API Vec128 MinOfLanes(const Vec128 v) { + return detail::MinOfLanes(hwy::SizeTag(), v); + } +@@ -3569,13 +3675,13 @@ HWY_INLINE uint64_t BitsFromMask(hwy::Si + const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); + const uint8x8_t x4 = vpadd_u8(x2, x2); + const uint8x8_t x8 = vpadd_u8(x4, x4); +- return vreinterpret_u16_u8(x8)[0]; ++ return vget_lane_u64(vreinterpret_u64_u8(x8), 0); + #else + // Don't have vpaddq, so keep doubling lane size. 
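The armv7 branch that follows has no vpaddq, so it widens instead: each vpaddl sums adjacent lanes into lanes of twice the width, and because every input byte holds a distinct power of two (one bit of the final mask), the sums can never carry into each other. A standalone scalar model for one 8-byte half, assuming lanes 0, 2, 3, 5 and 7 of the mask are true:

#include <stdint.h>
#include <stdio.h>

int main() {
  // One byte per lane: the lane's bit value if the mask lane is true, else 0.
  const uint8_t bytes[8] = {1, 0, 4, 8, 0, 32, 0, 128};
  uint16_t x2[4];  // vpaddl: pairwise widening add
  for (int i = 0; i < 4; ++i) {
    x2[i] = (uint16_t)(bytes[2 * i] + bytes[2 * i + 1]);
  }
  const uint32_t x4[2] = {(uint32_t)x2[0] + x2[1], (uint32_t)x2[2] + x2[3]};
  const uint64_t x8 = (uint64_t)x4[0] + x4[1];
  printf("mask bits: 0x%02x\n", (unsigned)x8);  // 0xad = lanes 0,2,3,5,7
  return 0;
}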
+ const uint16x8_t x2 = vpaddlq_u8(values.raw); + const uint32x4_t x4 = vpaddlq_u16(x2); + const uint64x2_t x8 = vpaddlq_u32(x4); +- return (uint64_t(x8[1]) << 8) | x8[0]; ++ return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); + #endif + } + +@@ -3725,7 +3831,7 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + const int16x8_t x2 = vpaddlq_s8(ones); + const int32x4_t x4 = vpaddlq_s16(x2); + const int64x2_t x8 = vpaddlq_s32(x4); +- return x8[0] + x8[1]; ++ return vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1); + #endif + } + template +@@ -3739,7 +3845,7 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + #else + const int32x4_t x2 = vpaddlq_s16(ones); + const int64x2_t x4 = vpaddlq_s32(x2); +- return x4[0] + x4[1]; ++ return vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1); + #endif + } + +@@ -3753,7 +3859,7 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + return vaddvq_s32(ones); + #else + const int64x2_t x2 = vpaddlq_s32(ones); +- return x2[0] + x2[1]; ++ return vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1); + #endif + } + +@@ -3765,10 +3871,10 @@ HWY_INLINE size_t CountTrue(hwy::SizeTag + vnegq_s64(BitCast(di, VecFromMask(Full128(), mask)).raw); + return vaddvq_s64(ones); + #else +- const Full128 di; +- const int64x2_t ones = +- vshrq_n_u64(BitCast(di, VecFromMask(Full128(), mask)).raw, 63); +- return ones[0] + ones[1]; ++ const Full128 du; ++ const auto mask_u = VecFromMask(du, RebindMask(du, mask)); ++ const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); ++ return vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1); + #endif + } + +@@ -3798,11 +3904,13 @@ HWY_INLINE size_t StoreMaskBits(const Ma + template + HWY_INLINE bool AllFalse(const Mask128 m) { + #if HWY_ARCH_ARM_A64 +- return (vmaxvq_u32(m.raw) == 0); ++ const Full128 d32; ++ const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(Full128(), m))); ++ return (vmaxvq_u32(m32.raw) == 0); + #else + const auto v64 = BitCast(Full128(), VecFromMask(Full128(), m)); + uint32x2_t a = vqmovn_u64(v64.raw); +- return vreinterpret_u64_u32(a)[0] == 0; ++ return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0; + #endif + } + +@@ -4178,6 +4286,7 @@ HWY_API auto Le(V a, V b) -> decltype(a + return a <= b; + } + ++namespace detail { // for code folding + #if HWY_ARCH_ARM_V7 + #undef vuzp1_s8 + #undef vuzp1_u8 +@@ -4265,6 +4374,7 @@ HWY_API auto Le(V a, V b) -> decltype(a + #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 + #undef HWY_NEON_DEF_FUNCTION_UINTS + #undef HWY_NEON_EVAL ++} // namespace detail + + // NOLINTNEXTLINE(google-readability-namespace-comments) + } // namespace HWY_NAMESPACE +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/ops/rvv-inl.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/ops/rvv-inl.h +--- chromium-92.0.4515.107/third_party/highway/src/hwy/ops/rvv-inl.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/ops/rvv-inl.h 2021-07-26 17:10:30.290171587 -0400 +@@ -39,6 +39,11 @@ using TFromV = TFromD>; + hwy::EnableIf>() && !IsFloat>()>* = nullptr + #define HWY_IF_FLOAT_V(V) hwy::EnableIf>()>* = nullptr + ++// kShift = log2 of multiplier: 0 for m1, 1 for m2, -2 for mf4 ++template ++using Full = Simd> (-kShift)) ++ : (HWY_LANES(T) << kShift)>; ++ + // ================================================== MACROS + + // Generate specializations and function definitions using X macros. Although +@@ -58,29 +63,30 @@ namespace detail { // for code folding + + // For given SEW, iterate over all LMUL. 
Precompute SEW/LMUL => MLEN because the + // preprocessor cannot easily do it. +-#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 1, 8, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 2, 4, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 4, 2, NAME, OP) \ +- X_MACRO(BASE, CHAR, 8, 8, 1, NAME, OP) +- +-#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 1, 16, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 2, 8, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 4, 4, NAME, OP) \ +- X_MACRO(BASE, CHAR, 16, 8, 2, NAME, OP) +- +-#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 1, 32, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 2, 16, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 4, 8, NAME, OP) \ +- X_MACRO(BASE, CHAR, 32, 8, 4, NAME, OP) +- +-#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 1, 64, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 2, 32, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 4, 16, NAME, OP) \ +- X_MACRO(BASE, CHAR, 64, 8, 8, NAME, OP) ++// TODO(janwas): GCC does not yet support fractional LMUL ++#define HWY_RVV_FOREACH_08(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m1, /*kShift=*/0, /*MLEN=*/8, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m2, /*kShift=*/1, /*MLEN=*/4, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m4, /*kShift=*/2, /*MLEN=*/2, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 8, m8, /*kShift=*/3, /*MLEN=*/1, NAME, OP) ++ ++#define HWY_RVV_FOREACH_16(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m1, /*kShift=*/0, /*MLEN=*/16, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m2, /*kShift=*/1, /*MLEN=*/8, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m4, /*kShift=*/2, /*MLEN=*/4, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 16, m8, /*kShift=*/3, /*MLEN=*/2, NAME, OP) ++ ++#define HWY_RVV_FOREACH_32(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m1, /*kShift=*/0, /*MLEN=*/32, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m2, /*kShift=*/1, /*MLEN=*/16, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m4, /*kShift=*/2, /*MLEN=*/8, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 32, m8, /*kShift=*/3, /*MLEN=*/4, NAME, OP) ++ ++#define HWY_RVV_FOREACH_64(X_MACRO, BASE, CHAR, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m1, /*kShift=*/0, /*MLEN=*/64, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m2, /*kShift=*/1, /*MLEN=*/32, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m4, /*kShift=*/2, /*MLEN=*/16, NAME, OP) \ ++ X_MACRO(BASE, CHAR, 64, m8, /*kShift=*/3, /*MLEN=*/8, NAME, OP) + + // SEW for unsigned: + #define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP) \ +@@ -153,63 +159,61 @@ namespace detail { // for code folding + + // Assemble types for use in x-macros + #define HWY_RVV_T(BASE, SEW) BASE##SEW##_t +-#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##m##LMUL +-#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##m##LMUL##_t ++#define HWY_RVV_D(CHAR, SEW, LMUL) D##CHAR##SEW##LMUL ++#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t + #define HWY_RVV_M(MLEN) vbool##MLEN##_t + + } // namespace detail + + // TODO(janwas): remove typedefs and only use HWY_RVV_V etc. directly + +-// TODO(janwas): do we want fractional LMUL? (can encode as negative) +-// Mixed-precision code can use LMUL 1..8 and that should be enough unless they +-// need many registers. 
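A note on the new macro columns: kShift is log2 of the LMUL register-group multiplier (negative values would encode the fractional LMUL mentioned in the TODO), and MLEN is the RVV mask-type parameter, MLEN = SEW / LMUL (type vbool{MLEN}_t), which is precomputed in every X-macro row because the preprocessor cannot divide. A compile-time sketch of the relation using a small helper defined here, checked against the tables above:

#include <cstdio>

// MLEN = SEW / LMUL; with kShift = log2(LMUL), that is SEW >> kShift.
constexpr int Mlen(int sew, int shift) { return sew >> shift; }

int main() {
  static_assert(Mlen(8, 3) == 1, "e8m8  -> vbool1_t");
  static_assert(Mlen(16, 1) == 8, "e16m2 -> vbool8_t");
  static_assert(Mlen(64, 0) == 64, "e64m1 -> vbool64_t");
  std::printf("e32m4 -> vbool%d_t\n", Mlen(32, 2));  // vbool8_t
  return 0;
}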
+-#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- using HWY_RVV_D(CHAR, SEW, LMUL) = \ +- Simd; \ +- using V##CHAR##SEW##m##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \ +- template <> \ +- struct DFromV_t { \ +- using Lane = HWY_RVV_T(BASE, SEW); \ +- using type = Simd; \ ++// Until we have full intrinsic support for fractional LMUL, mixed-precision ++// code can use LMUL 1..8 (adequate unless they need many registers). ++#define HWY_SPECIALIZE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ using HWY_RVV_D(CHAR, SEW, LMUL) = Full; \ ++ using V##CHAR##SEW##LMUL = HWY_RVV_V(BASE, SEW, LMUL); \ ++ template <> \ ++ struct DFromV_t { \ ++ using Lane = HWY_RVV_T(BASE, SEW); \ ++ using type = Full; \ + }; + using Vf16m1 = vfloat16m1_t; + using Vf16m2 = vfloat16m2_t; + using Vf16m4 = vfloat16m4_t; + using Vf16m8 = vfloat16m8_t; +-using Df16m1 = Simd; +-using Df16m2 = Simd; +-using Df16m4 = Simd; +-using Df16m8 = Simd; ++using Df16m1 = Full; ++using Df16m2 = Full; ++using Df16m4 = Full; ++using Df16m8 = Full; + + HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _) + #undef HWY_SPECIALIZE + + // vector = f(d), e.g. Zero +-#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(CHAR, SEW, LMUL) d) { \ + (void)Lanes(d); \ +- return v##OP##_##CHAR##SEW##m##LMUL(); \ ++ return v##OP##_##CHAR##SEW##LMUL(); \ + } + + // vector = f(vector), e.g. Not +-#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_v_##CHAR##SEW##m##LMUL(v); \ ++ return v##OP##_v_##CHAR##SEW##LMUL(v); \ + } + + // vector = f(vector, scalar), e.g. detail::Add +-#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ +- return v##OP##_##CHAR##SEW##m##LMUL(a, b); \ ++#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ ++ return v##OP##_##CHAR##SEW##LMUL(a, b); \ + } + + // vector = f(vector, vector), e.g. Add +-#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(a, b); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(a, b); \ + } + + // ================================================== INIT +@@ -218,9 +222,9 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _) + + // WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL! + // vlenb is not exposed through intrinsics and vreadvl is not VLMAX. 
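For concreteness, one mechanical expansion of the HWY_RVV_LANES macro that follows, with BASE=uint, CHAR=u, SEW=8, LMUL=m1 and OP=setvlmax_e (bearing in mind the warning above that the intrinsic also changes VL as a side effect):

HWY_API size_t Lanes(Du8m1 /* d */) {
  return vsetvlmax_e8m1();  // VLMAX for SEW=8, LMUL=1
}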
+-#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \ +- return v##OP##SEW##m##LMUL(); \ ++#define HWY_RVV_LANES(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API size_t NAME(HWY_RVV_D(CHAR, SEW, LMUL) /* d */) { \ ++ return v##OP##SEW##LMUL(); \ + } + + HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e) +@@ -233,19 +237,31 @@ HWY_RVV_FOREACH(HWY_RVV_RETV_ARGD, Zero, + template + using VFromD = decltype(Zero(D())); + ++// Partial ++template ++HWY_API VFromD> Zero(Simd /*tag*/) { ++ return Zero(Full()); ++} ++ + // ------------------------------ Set + // vector = f(d, scalar), e.g. Set +-#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_SET(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, HWY_RVV_T(BASE, SEW) arg) { \ + (void)Lanes(d); \ +- return v##OP##_##CHAR##SEW##m##LMUL(arg); \ ++ return v##OP##_##CHAR##SEW##LMUL(arg); \ + } + + HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x) + HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f) + #undef HWY_RVV_SET + ++// Partial vectors ++template ++HWY_API VFromD> Set(Simd /*tag*/, T arg) { ++ return Set(Full(), arg); ++} ++ + // ------------------------------ Undefined + + // RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized +@@ -265,7 +281,7 @@ HWY_API VFromD Undefined(D d) { + namespace detail { + + // u8: no change +-#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_CAST_NOP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return v; \ +@@ -276,25 +292,25 @@ namespace detail { + } + + // Other integers +-#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_v_##CHAR##SEW##m##LMUL##_u8m##LMUL(v); \ +- } \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ +- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \ +- return v##OP##_v_u8m##LMUL##_##CHAR##SEW##m##LMUL(v); \ ++#define HWY_RVV_CAST_UI(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \ ++ } \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \ ++ return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \ + } + + // Float: first cast to/from unsigned +-#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API vuint8m##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_v_u##SEW##m##LMUL##_u8m##LMUL( \ +- v##OP##_v_f##SEW##m##LMUL##_u##SEW##m##LMUL(v)); \ +- } \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ +- HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8m##LMUL##_t v) { \ +- return v##OP##_v_u##SEW##m##LMUL##_f##SEW##m##LMUL( \ +- v##OP##_v_u8m##LMUL##_u##SEW##m##LMUL(v)); \ ++#define HWY_RVV_CAST_F(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API vuint8##LMUL##_t BitCastToByte(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_v_u##SEW##LMUL##_u8##LMUL( \ ++ v##OP##_v_f##SEW##LMUL##_u##SEW##LMUL(v)); \ ++ } \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ ++ HWY_RVV_D(CHAR, SEW, LMUL) /* d */, vuint8##LMUL##_t v) { \ ++ return v##OP##_v_u##SEW##LMUL##_f##SEW##LMUL( \ ++ 
v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \ + } + + HWY_RVV_FOREACH_U08(HWY_RVV_CAST_NOP, _, _) +@@ -315,6 +331,12 @@ HWY_API VFromD BitCast(D d, FromV v) + return detail::BitCastFromByte(d, detail::BitCastToByte(v)); + } + ++// Partial ++template ++HWY_API VFromD> BitCast(Simd /*tag*/, FromV v) { ++ return BitCast(Full(), v); ++} ++ + namespace detail { + + template >> +@@ -336,6 +358,12 @@ HWY_API VFromD Iota0(const D /*d*/) + return BitCastToUnsigned(Iota0(DU())); + } + ++// Partial ++template ++HWY_API VFromD> Iota0(Simd /*tag*/) { ++ return Iota0(Full()); ++} ++ + } // namespace detail + + // ================================================== LOGICAL +@@ -370,11 +398,11 @@ HWY_API V And(const V a, const V b) { + // ------------------------------ Or + + // Scalar argument plus mask. Used by VecFromMask. +-#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_OR_MASK(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_T(BASE, SEW) imm, \ + HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff) { \ +- return v##OP##_##CHAR##SEW##m##LMUL##_m(mask, maskedoff, v, imm); \ ++ return v##OP##_##CHAR##SEW##LMUL##_m(mask, maskedoff, v, imm); \ + } + + namespace detail { +@@ -466,14 +494,14 @@ HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, + // ------------------------------ ShiftLeft[Same] + + // Intrinsics do not define .vi forms, so use .vx instead. +-#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- template \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, kBits); \ +- } \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ +- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, static_cast(bits)); \ ++#define HWY_RVV_SHIFT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ template \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits); \ ++ } \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ ++ return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast(bits)); \ + } + + HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll) +@@ -486,19 +514,18 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRi + #undef HWY_RVV_SHIFT + + // ------------------------------ Shl +-#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, bits); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, bits); \ + } + + HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll) + +-#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, \ +- detail::BitCastToUnsigned(bits)); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits)); \ + } + + HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll) +@@ -569,11 +596,11 @@ HWY_API V ApproximateReciprocalSqrt(cons + + // ------------------------------ MulAdd + // Note: op is still named vv, not vvv. 
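One mechanical expansion of the FMA macro below, with BASE=float, CHAR=f, SEW=32, LMUL=m1 and OP=fmacc; RVV multiply-accumulate intrinsics take the accumulator first, which is why the macro reorders NAME's (mul, x, add) arguments into (add, mul, x):

HWY_API vfloat32m1_t MulAdd(vfloat32m1_t mul, vfloat32m1_t x,
                            vfloat32m1_t add) {
  return vfmacc_vv_f32m1(add, mul, x);  // add + mul * x
}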
+-#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_FMA(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \ + HWY_RVV_V(BASE, SEW, LMUL) add) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(add, mul, x); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x); \ + } + + HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc) +@@ -596,11 +623,11 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub + // of all bits; SLEN 8 / LMUL 4 = half of all bits. + + // mask = f(vector, vector) +-#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_M(MLEN) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ + (void)Lanes(DFromV()); \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL##_b##MLEN(a, b); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b); \ + } + + // ------------------------------ Eq +@@ -675,11 +702,11 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xo + #undef HWY_RVV_RETM_ARGMM + + // ------------------------------ IfThenElse +-#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ +- HWY_RVV_V(BASE, SEW, LMUL) no) { \ +- return v##OP##_vvm_##CHAR##SEW##m##LMUL(m, no, yes); \ ++#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ ++ HWY_RVV_V(BASE, SEW, LMUL) no) { \ ++ return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes); \ + } + + HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge) +@@ -774,17 +801,17 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, + + // ------------------------------ Load + +-#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \ +- const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ +- (void)Lanes(d); \ +- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p); \ ++#define HWY_RVV_LOAD(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_D(CHAR, SEW, LMUL) d, \ ++ const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ ++ (void)Lanes(d); \ ++ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p); \ + } + HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le) + #undef HWY_RVV_LOAD + +-// Partial load ++// Partial + template + HWY_API VFromD> Load(Simd d, const T* HWY_RESTRICT p) { + return Load(d, p); +@@ -800,16 +827,22 @@ HWY_API VFromD LoadU(D d, const TFrom + + // ------------------------------ Store + +-#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ +- HWY_RVV_D(CHAR, SEW, LMUL) d, \ +- HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ +- (void)Lanes(d); \ +- return v##OP##SEW##_v_##CHAR##SEW##m##LMUL(p, v); \ ++#define HWY_RVV_RET_ARGVDP(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ ++ HWY_RVV_D(CHAR, SEW, LMUL) d, \ ++ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ ++ (void)Lanes(d); \ ++ return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v); \ + } + HWY_RVV_FOREACH(HWY_RVV_RET_ARGVDP, Store, se) + #undef HWY_RVV_RET_ARGVDP + ++// Partial ++template ++HWY_API void Store(VFromD> v, Simd d, T* HWY_RESTRICT p) { ++ return Store(v, Full(), p); ++} ++ + // ------------------------------ StoreU + + // 
RVV only requires lane alignment, not natural alignment of the entire vector. +@@ -963,67 +996,6 @@ HWY_API VFromD> Promote + return BitCast(d, PromoteTo(Simd(), v)); + } + +-// ------------------------------ PromoteTo I +- +-HWY_API Vi16m2 PromoteTo(Di16m2 /* d */, Vi8m1 v) { return vsext_vf2_i16m2(v); } +-HWY_API Vi16m4 PromoteTo(Di16m4 /* d */, Vi8m2 v) { return vsext_vf2_i16m4(v); } +-HWY_API Vi16m8 PromoteTo(Di16m8 /* d */, Vi8m4 v) { return vsext_vf2_i16m8(v); } +- +-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, Vi8m1 v) { return vsext_vf4_i32m4(v); } +-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, Vi8m2 v) { return vsext_vf4_i32m8(v); } +- +-HWY_API Vi32m2 PromoteTo(Di32m2 /* d */, const Vi16m1 v) { +- return vsext_vf2_i32m2(v); +-} +-HWY_API Vi32m4 PromoteTo(Di32m4 /* d */, const Vi16m2 v) { +- return vsext_vf2_i32m4(v); +-} +-HWY_API Vi32m8 PromoteTo(Di32m8 /* d */, const Vi16m4 v) { +- return vsext_vf2_i32m8(v); +-} +- +-HWY_API Vi64m2 PromoteTo(Di64m2 /* d */, const Vi32m1 v) { +- return vsext_vf2_i64m2(v); +-} +-HWY_API Vi64m4 PromoteTo(Di64m4 /* d */, const Vi32m2 v) { +- return vsext_vf2_i64m4(v); +-} +-HWY_API Vi64m8 PromoteTo(Di64m8 /* d */, const Vi32m4 v) { +- return vsext_vf2_i64m8(v); +-} +- +-// ------------------------------ PromoteTo F +- +-HWY_API Vf32m2 PromoteTo(Df32m2 /* d */, const Vf16m1 v) { +- return vfwcvt_f_f_v_f32m2(v); +-} +-HWY_API Vf32m4 PromoteTo(Df32m4 /* d */, const Vf16m2 v) { +- return vfwcvt_f_f_v_f32m4(v); +-} +-HWY_API Vf32m8 PromoteTo(Df32m8 /* d */, const Vf16m4 v) { +- return vfwcvt_f_f_v_f32m8(v); +-} +- +-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vf32m1 v) { +- return vfwcvt_f_f_v_f64m2(v); +-} +-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vf32m2 v) { +- return vfwcvt_f_f_v_f64m4(v); +-} +-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vf32m4 v) { +- return vfwcvt_f_f_v_f64m8(v); +-} +- +-HWY_API Vf64m2 PromoteTo(Df64m2 /* d */, const Vi32m1 v) { +- return vfwcvt_f_x_v_f64m2(v); +-} +-HWY_API Vf64m4 PromoteTo(Df64m4 /* d */, const Vi32m2 v) { +- return vfwcvt_f_x_v_f64m4(v); +-} +-HWY_API Vf64m8 PromoteTo(Df64m8 /* d */, const Vi32m4 v) { +- return vfwcvt_f_x_v_f64m8(v); +-} +- + // ------------------------------ DemoteTo U + + // First clamp negative numbers to zero to match x86 packus. +@@ -1124,19 +1096,19 @@ HWY_API Vi32m4 DemoteTo(Di32m4 /* d */, + + // ------------------------------ ConvertTo F + +-#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_CONVERT(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ + HWY_RVV_D(CHAR, SEW, LMUL) /* d */, HWY_RVV_V(int, SEW, LMUL) v) { \ +- return vfcvt_f_x_v_f##SEW##m##LMUL(v); \ ++ return vfcvt_f_x_v_f##SEW##LMUL(v); \ + } \ + /* Truncates (rounds toward zero). */ \ + HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(i, SEW, LMUL) /* d */, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return vfcvt_rtz_x_f_v_i##SEW##m##LMUL(v); \ ++ return vfcvt_rtz_x_f_v_i##SEW##LMUL(v); \ + } \ + /* Uses default rounding mode. */ \ + HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return vfcvt_x_f_v_i##SEW##m##LMUL(v); \ ++ return vfcvt_x_f_v_i##SEW##LMUL(v); \ + } + + // API only requires f32 but we provide f64 for internal use (otherwise, it +@@ -1184,10 +1156,10 @@ HWY_API VFromD SetTableIndices(D d, + + // <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX + // to 2048! We could instead use vrgatherei16. 
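One mechanical expansion of the HWY_RVV_TABLE macro below, with BASE=uint, CHAR=u, SEW=32, LMUL=m1 and OP=rgather:

HWY_API vuint32m1_t TableLookupLanes(vuint32m1_t v, vuint32m1_t idx) {
  return vrgather_vv_u32m1(v, idx);  // result[i] = v[idx[i]]
}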
+-#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_TABLE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, idx); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, idx); \ + } + + HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather) +@@ -1279,7 +1251,6 @@ HWY_API V OffsetsOf128BitBlocks(const D + using T = MakeUnsigned>; + return detail::And(iota0, static_cast(~(LanesPerBlock(d) - 1))); + } +- + } // namespace detail + + template +@@ -1307,9 +1278,9 @@ HWY_API V Broadcast(const V v) { + + // ------------------------------ GetLane + +-#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_s_##CHAR##SEW##m##LMUL##_##CHAR##SEW(v); \ ++#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ ++ return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); \ + } + + HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x) +@@ -1318,11 +1289,12 @@ HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetL + + // ------------------------------ ShiftLeftLanes + +-// vector = f(vector, size_t) +-#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t lanes) { \ +- return v##OP##_vx_##CHAR##SEW##m##LMUL(v, v, lanes); \ ++// vector = f(vector, vector, size_t) ++#define HWY_RVV_SLIDE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \ ++ size_t lanes) { \ ++ return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes); \ + } + + namespace detail { +@@ -1333,7 +1305,7 @@ template + HWY_API V ShiftLeftLanes(const V v) { + using D = DFromV; + const RebindToSigned di; +- const auto shifted = detail::SlideUp(v, kLanes); ++ const auto shifted = detail::SlideUp(v, v, kLanes); + // Match x86 semantics by zeroing lower lanes in 128-bit blocks + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di); + const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1); +@@ -1363,7 +1335,7 @@ template + HWY_API V ShiftRightLanes(const V v) { + using D = DFromV; + const RebindToSigned di; +- const auto shifted = detail::SlideDown(v, kLanes); ++ const auto shifted = detail::SlideDown(v, v, kLanes); + // Match x86 semantics by zeroing upper lanes in 128-bit blocks + constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di); + const auto idx_mod = detail::And(detail::Iota0(di), kLanesPerBlock - 1); +@@ -1405,7 +1377,7 @@ HWY_API V ConcatUpperLower(const V hi, c + template + HWY_API V ConcatLowerLower(const V hi, const V lo) { + // Move lower half into upper +- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV()) / 2); ++ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV()) / 2); + return ConcatUpperLower(hi_up, lo); + } + +@@ -1414,7 +1386,7 @@ HWY_API V ConcatLowerLower(const V hi, c + template + HWY_API V ConcatUpperUpper(const V hi, const V lo) { + // Move upper half into lower +- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV()) / 2); ++ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV()) / 2); + return ConcatUpperLower(hi, lo_down); + } + +@@ -1423,8 +1395,8 @@ HWY_API V ConcatUpperUpper(const V hi, c + template + HWY_API V ConcatLowerUpper(const V 
hi, const V lo) { + // Move half of both inputs to the other half +- const auto hi_up = detail::SlideUp(hi, Lanes(DFromV()) / 2); +- const auto lo_down = detail::SlideDown(lo, Lanes(DFromV()) / 2); ++ const auto hi_up = detail::SlideUp(hi, hi, Lanes(DFromV()) / 2); ++ const auto lo_down = detail::SlideDown(lo, lo, Lanes(DFromV()) / 2); + return ConcatUpperLower(hi_up, lo_down); + } + +@@ -1491,61 +1463,55 @@ HWY_API V Combine(const V a, const V b) + // ================================================== REDUCE + + // vector = f(vector, zero_m1) +-#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ +- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ +- NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, 1) v0) { \ +- vsetvlmax_e##SEW##m##LMUL(); \ +- return Set(HWY_RVV_D(CHAR, SEW, LMUL)(), \ +- GetLane(v##OP##_vs_##CHAR##SEW##m##LMUL##_##CHAR##SEW##m1( \ +- v0, v, v0))); \ ++#define HWY_RVV_REDUCE(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ ++ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ ++ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \ ++ vsetvlmax_e##SEW##LMUL(); \ ++ return Set( \ ++ HWY_RVV_D(CHAR, SEW, LMUL)(), \ ++ GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(v0, v, v0))); \ + } + + // ------------------------------ SumOfLanes + + namespace detail { +- + HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum) + HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredsum) +- + } // namespace detail + + template + HWY_API V SumOfLanes(const V v) { + using T = TFromV; +- const auto v0 = Zero(Simd()); // always m1 ++ const auto v0 = Zero(Full()); // always m1 + return detail::RedSum(v, v0); + } + + // ------------------------------ MinOfLanes + namespace detail { +- + HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu) + HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin) + HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin) +- + } // namespace detail + + template + HWY_API V MinOfLanes(const V v) { + using T = TFromV; +- const Simd d1; // always m1 ++ const Full d1; // always m1 + const auto neutral = Set(d1, HighestValue()); + return detail::RedMin(v, neutral); + } + + // ------------------------------ MaxOfLanes + namespace detail { +- + HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu) + HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax) + HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax) +- + } // namespace detail + + template + HWY_API V MaxOfLanes(const V v) { + using T = TFromV; +- const Simd d1; // always m1 ++ const Full d1; // always m1 + const auto neutral = Set(d1, LowestValue()); + return detail::RedMax(v, neutral); + } +@@ -1570,7 +1536,7 @@ HWY_API VFromD LoadDup128(D d, const + #define HWY_RVV_STORE_MASK_BITS(MLEN, NAME, OP) \ + HWY_API size_t StoreMaskBits(HWY_RVV_M(MLEN) m, uint8_t* p) { \ + /* LMUL=1 is always enough */ \ +- Simd d8; \ ++ Full d8; \ + const size_t num_bytes = (Lanes(d8) + MLEN - 1) / MLEN; \ + /* TODO(janwas): how to convert vbool* to vuint?*/ \ + /*Store(m, d8, p);*/ \ +@@ -1581,6 +1547,22 @@ HWY_API VFromD LoadDup128(D d, const + HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, _, _) + #undef HWY_RVV_STORE_MASK_BITS + ++// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp) ++ ++// Disallow for 8-bit because Iota is likely to overflow. 
++template ++HWY_API MFromD FirstN(const D d, const size_t n) { ++ const RebindToSigned di; ++ return RebindMask(d, Lt(BitCast(di, detail::Iota0(d)), Set(di, n))); ++} ++ ++template ++HWY_API MFromD FirstN(const D d, const size_t n) { ++ const auto zero = Zero(d); ++ const auto one = Set(d, 1); ++ return Eq(detail::SlideUp(one, zero, n), one); ++} ++ + // ------------------------------ Neg + + template +@@ -1589,9 +1571,9 @@ HWY_API V Neg(const V v) { + } + + // vector = f(vector), but argument is repeated +-#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, MLEN, NAME, OP) \ ++#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, LMUL, SHIFT, MLEN, NAME, OP) \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ +- return v##OP##_vv_##CHAR##SEW##m##LMUL(v, v); \ ++ return v##OP##_vv_##CHAR##SEW##LMUL(v, v); \ + } + + HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn) +@@ -1628,7 +1610,6 @@ template + HWY_API auto UseInt(const V v) -> decltype(MaskFromVec(v)) { + return Lt(Abs(v), Set(DFromV(), MantissaEnd>())); + } +- + } // namespace detail + + template +@@ -1699,10 +1680,8 @@ HWY_API VFromD Iota(const D d, TFromD + // Using vwmul does not work for m8, so use mulh instead. Highway only provides + // MulHigh for 16-bit, so use a private wrapper. + namespace detail { +- + HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu) + HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh) +- + } // namespace detail + + template +@@ -1712,7 +1691,7 @@ HWY_API VFromD> dw; +- return BitCast(dw, OddEven(detail::SlideUp(hi, 1), lo)); ++ return BitCast(dw, OddEven(detail::SlideUp(hi, hi, 1), lo)); + } + + // ================================================== END MACROS +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/ops/x86_128-inl.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/ops/x86_128-inl.h +--- chromium-92.0.4515.107/third_party/highway/src/hwy/ops/x86_128-inl.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/ops/x86_128-inl.h 2021-07-26 17:19:52.153729522 -0400 +@@ -154,27 +154,28 @@ HWY_API Vec128 Zero(Simd + HWY_API Vec128 Set(Simd /* tag */, const uint8_t t) { +- return Vec128{_mm_set1_epi8(t)}; ++ return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const uint16_t t) { +- return Vec128{_mm_set1_epi16(t)}; ++ return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const uint32_t t) { +- return Vec128{_mm_set1_epi32(t)}; ++ return Vec128{_mm_set1_epi32(static_cast(t))}; + } + template + HWY_API Vec128 Set(Simd /* tag */, const uint64_t t) { +- return Vec128{_mm_set1_epi64x(t)}; ++ return Vec128{ ++ _mm_set1_epi64x(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const int8_t t) { +- return Vec128{_mm_set1_epi8(t)}; ++ return Vec128{_mm_set1_epi8(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const int16_t t) { +- return Vec128{_mm_set1_epi16(t)}; ++ return Vec128{_mm_set1_epi16(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const int32_t t) { +@@ -182,7 +183,8 @@ HWY_API Vec128 Set(Simd + HWY_API Vec128 Set(Simd /* tag */, const int64_t t) { +- return Vec128{_mm_set1_epi64x(t)}; ++ return Vec128{ ++ _mm_set1_epi64x(static_cast(t))}; // NOLINT + } + template + HWY_API Vec128 Set(Simd /* tag */, const float t) { +@@ -684,6 +686,14 @@ HWY_API Mask128 operator>=(co + return 
Mask128{_mm_cmpge_pd(a.raw, b.raw)}; + } + ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask128 FirstN(const Simd d, size_t num) { ++ const RebindToSigned di; // Signed comparisons are cheaper. ++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); ++} ++ + // ================================================== ARITHMETIC + + // ------------------------------ Addition +@@ -895,7 +905,7 @@ template + HWY_API Vec128 Abs(const Vec128 v) { + return Vec128{_mm_abs_epi32(v.raw)}; + } +- ++// i64 is implemented after BroadcastSignBit. + template + HWY_API Vec128 Abs(const Vec128 v) { + const Vec128 mask{_mm_set1_epi32(0x7FFFFFFF)}; +@@ -1067,15 +1077,24 @@ HWY_API Vec128 BroadcastSign + return VecFromMask(v < Zero(Simd())); + #else + // Efficient Gt() requires SSE4.2 but we only have SSE4.1. BLENDVPD requires +- // two constants and domain crossing. 32-bit compare only requires Zero() +- // plus a shuffle to replicate the upper 32 bits. ++ // two constants and domain crossing. 32-bit shift avoids generating a zero. + const Simd d32; +- const auto sign = BitCast(d32, v) < Zero(d32); ++ const auto sign = ShiftRight<31>(BitCast(d32, v)); + return Vec128{ + _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; + #endif + } + ++template ++HWY_API Vec128 Abs(const Vec128 v) { ++#if HWY_TARGET == HWY_AVX3 ++ return Vec128{_mm_abs_epi64(v.raw)}; ++#else ++ const auto zero = Zero(Simd()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++ + template + HWY_API Vec128 ShiftRight(const Vec128 v) { + #if HWY_TARGET == HWY_AVX3 +@@ -1787,6 +1806,10 @@ HWY_API void Stream(const Vec128 GatherIndex(Si + + #endif // HWY_TARGET != HWY_SSE4 + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== SWIZZLE + + // ------------------------------ Extract half +@@ -2075,10 +2100,10 @@ HWY_INLINE Vec128 UpperHalf(V + // ------------------------------ Shift vector by constant #bytes + + // 0x01..0F, kBytes = 1 => 0x02..0F00 +-template +-HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { ++template ++HWY_API Vec128 ShiftLeftBytes(const Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); +- return Vec128{_mm_slli_si128(v.raw, kBytes)}; ++ return Vec128{_mm_slli_si128(v.raw, kBytes)}; + } + + template +@@ -2089,10 +2114,10 @@ HWY_API Vec128 ShiftLeftLanes(cons + } + + // 0x01..0F, kBytes = 1 => 0x0001..0E +-template +-HWY_API Vec128 ShiftRightBytes(const Vec128 v) { ++template ++HWY_API Vec128 ShiftRightBytes(const Vec128 v) { + static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); +- return Vec128{_mm_srli_si128(v.raw, kBytes)}; ++ return Vec128{_mm_srli_si128(v.raw, kBytes)}; + } + + template +@@ -2257,44 +2282,47 @@ HWY_API Vec128 Shuffle0123(const + // ------------------------------ TableLookupLanes + + // Returned by SetTableIndices for use by TableLookupLanes. 
+-template ++template + struct Indices128 { + __m128i raw; + }; + +-template +-HWY_API Indices128 SetTableIndices(Full128, const int32_t* idx) { ++template ++HWY_API Indices128 SetTableIndices(Simd d, const int32_t* idx) { + #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) +- const size_t N = 16 / sizeof(T); + for (size_t i = 0; i < N; ++i) { + HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); + } + #endif + +- const Full128 d8; +- alignas(16) uint8_t control[16]; +- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { +- const size_t idx_lane = idx_byte / sizeof(T); +- const size_t mod = idx_byte % sizeof(T); +- control[idx_byte] = static_cast(idx[idx_lane] * sizeof(T) + mod); ++ const Repartition d8; ++ alignas(16) uint8_t control[16] = {0}; ++ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { ++ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { ++ control[idx_lane * sizeof(T) + idx_byte] = ++ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); ++ } + } +- return Indices128{Load(d8, control).raw}; ++ return Indices128{Load(d8, control).raw}; + } + +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes( ++ const Vec128 v, const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- return TableLookupBytes(v, Vec128{idx.raw}); ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ return TableLookupBytes(v, Vec128{idx.raw}); + } +-HWY_API Vec128 TableLookupLanes(const Vec128 v, +- const Indices128 idx) { +- const Full128 di; +- const Full128 df; ++template ++HWY_API Vec128 TableLookupLanes(const Vec128 v, ++ const Indices128 idx) { ++ const Simd di; ++ const Simd df; + return BitCast(df, +- TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); ++ TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); + } + + // ------------------------------ Interleave lanes +@@ -2502,47 +2530,47 @@ HWY_INLINE Vec128 ConcatUpperLow + + namespace detail { + +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, +- const Vec128 b) { +- const Full128 d; +- const Full128 d8; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ const Simd d; ++ const Repartition d8; + alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, + 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; + return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); + } +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; + } +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x33)}; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x33)}; + } +-template +-HWY_API Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x0F)}; ++template ++HWY_API Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x0F)}; + } + + 
} // namespace detail + +-template +-HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { ++template ++HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { + return detail::OddEven(hwy::SizeTag(), a, b); + } +-template <> +-HWY_INLINE Vec128 OddEven(const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; ++template ++HWY_INLINE Vec128 OddEven(const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; + } + +-template <> +-HWY_INLINE Vec128 OddEven(const Vec128 a, +- const Vec128 b) { +- return Vec128{_mm_blend_pd(a.raw, b.raw, 1)}; ++template ++HWY_INLINE Vec128 OddEven(const Vec128 a, ++ const Vec128 b) { ++ return Vec128{_mm_blend_pd(a.raw, b.raw, 1)}; + } + + // ------------------------------ Shl (ZipLower, Mul) +@@ -2980,7 +3008,7 @@ HWY_API Vec128 U8FromU32(con + return LowerHalf(LowerHalf(BitCast(d8, quad))); + } + +-// ------------------------------ Convert integer <=> floating point ++// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + + template + HWY_API Vec128 ConvertTo(Simd /* tag */, +@@ -2995,13 +3023,20 @@ HWY_API Vec128 ConvertTo(Simd + (void)dd; + return Vec128{_mm_cvtepi64_pd(v.raw)}; + #else +- alignas(16) int64_t lanes_i[2]; +- Store(v, Simd(), lanes_i); +- alignas(16) double lanes_d[2]; +- for (size_t i = 0; i < N; ++i) { +- lanes_d[i] = static_cast(lanes_i[i]); +- } +- return Load(dd, lanes_d); ++ // Based on wim's approach (https://stackoverflow.com/questions/41144668/) ++ const Repartition d32; ++ const Repartition d64; ++ ++ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 ++ const auto k84_63 = Set(d64, 0x4530000080000000ULL); ++ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); ++ ++ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) ++ const auto k52 = Set(d32, 0x43300000); ++ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); ++ ++ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); ++ return (v_upper - k84_63_52) + v_lower; // order matters! + #endif + } + +@@ -3572,55 +3607,87 @@ HWY_API void StoreInterleaved4(const Vec + + namespace detail { + +-// For u32/i32/f32. 
+-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++// N=1 for any T: no-op ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, ++ const Vec128 v) { ++ return v; ++} ++ ++// u32/i32/f32: ++ ++// N=2 ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; ++} ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, ++ const Vec128 v10) { ++ return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); ++} ++ ++// N=4 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = v3210 + v1032; + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return v20_31_20_31 + v31_20_31_20; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Min(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Min(v20_31_20_31, v31_20_31_20); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, +- const Vec128 v3210) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { + const Vec128 v1032 = Shuffle1032(v3210); + const Vec128 v31_20_31_20 = Max(v3210, v1032); + const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); + return Max(v20_31_20_31, v31_20_31_20); + } + +-// For u64/i64/f64. +-template +-HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++// u64/i64/f64: ++ ++// N=2 (full) ++template ++HWY_API Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return v10 + v01; + } +-template +-HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Min(v10, v01); + } +-template +-HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, +- const Vec128 v10) { ++template ++HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { + const Vec128 v01 = Shuffle01(v10); + return Max(v10, v01); + } + + } // namespace detail + +-// Supported for u/i/f 32/64. Returns the sum in each lane. ++// Supported for u/i/f 32/64. Returns the same value in each lane. + template + HWY_API Vec128 SumOfLanes(const Vec128 v) { + return detail::SumOfLanes(hwy::SizeTag(), v); +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/ops/x86_256-inl.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/ops/x86_256-inl.h +--- chromium-92.0.4515.107/third_party/highway/src/hwy/ops/x86_256-inl.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/ops/x86_256-inl.h 2021-07-26 17:19:30.740403369 -0400 +@@ -20,15 +20,18 @@ + // particular, "Broadcast", pack and zip behavior may be surprising. 
+ + #include // AVX2+ ++ + #if defined(_MSC_VER) && defined(__clang__) + // Including should be enough, but Clang's headers helpfully skip + // including these headers when _MSC_VER is defined, like when using clang-cl. + // Include these directly here. +-#include + #include ++// avxintrin defines __m256i and must come before avx2intrin. + #include ++#include // _pext_u64 + #include + #include ++#include + #endif + + #include +@@ -159,23 +162,24 @@ HWY_API Vec256 Set(Full256{_mm256_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const uint32_t t) { +- return Vec256{_mm256_set1_epi32(static_cast(t))}; // NOLINT ++ return Vec256{_mm256_set1_epi32(static_cast(t))}; + } + HWY_API Vec256 Set(Full256 /* tag */, const uint64_t t) { + return Vec256{ + _mm256_set1_epi64x(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const int8_t t) { +- return Vec256{_mm256_set1_epi8(t)}; ++ return Vec256{_mm256_set1_epi8(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const int16_t t) { +- return Vec256{_mm256_set1_epi16(t)}; ++ return Vec256{_mm256_set1_epi16(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const int32_t t) { + return Vec256{_mm256_set1_epi32(t)}; + } + HWY_API Vec256 Set(Full256 /* tag */, const int64_t t) { +- return Vec256{_mm256_set1_epi64x(t)}; ++ return Vec256{ ++ _mm256_set1_epi64x(static_cast(t))}; // NOLINT + } + HWY_API Vec256 Set(Full256 /* tag */, const float t) { + return Vec256{_mm256_set1_ps(t)}; +@@ -351,6 +355,8 @@ HWY_API Vec256 VecFromMask(Full256 + return Vec256{v.raw}; + } + ++// ------------------------------ IfThenElse ++ + // mask ? yes : no + template + HWY_API Vec256 IfThenElse(const Mask256 mask, const Vec256 yes, +@@ -681,6 +687,14 @@ HWY_API Vec256 Max(const Vec256< + return Vec256{_mm256_max_pd(a.raw, b.raw)}; + } + ++// ------------------------------ FirstN (Iota, Lt) ++ ++template ++HWY_API Mask256 FirstN(const Full256 d, size_t n) { ++ const RebindToSigned di; // Signed comparisons are cheaper. ++ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(n))); ++} ++ + // ================================================== ARITHMETIC + + // ------------------------------ Addition +@@ -843,7 +857,13 @@ HWY_API Vec256 AverageRound(co + + // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. + HWY_API Vec256 Abs(const Vec256 v) { ++#if HWY_COMPILER_MSVC ++ // Workaround for incorrect codegen? (wrong result) ++ const auto zero = Zero(Full256()); ++ return Vec256{_mm256_max_epi8(v.raw, (zero - v).raw)}; ++#else + return Vec256{_mm256_abs_epi8(v.raw)}; ++#endif + } + HWY_API Vec256 Abs(const Vec256 v) { + return Vec256{_mm256_abs_epi16(v.raw)}; +@@ -851,6 +871,7 @@ HWY_API Vec256 Abs(const Vec256 + HWY_API Vec256 Abs(const Vec256 v) { + return Vec256{_mm256_abs_epi32(v.raw)}; + } ++// i64 is implemented after BroadcastSignBit. 
+ + HWY_API Vec256 Abs(const Vec256 v) { + const Vec256 mask{_mm256_set1_epi32(0x7FFFFFFF)}; +@@ -1027,6 +1048,15 @@ HWY_API Vec256 ShiftRight(const + #endif + } + ++HWY_API Vec256 Abs(const Vec256 v) { ++#if HWY_TARGET == HWY_AVX3 ++ return Vec256{_mm256_abs_epi64(v.raw)}; ++#else ++ const auto zero = Zero(Full256()); ++ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); ++#endif ++} ++ + // ------------------------------ ShiftLeftSame + + HWY_API Vec256 ShiftLeftSame(const Vec256 v, +@@ -1398,6 +1428,10 @@ HWY_API void Stream(const Vec256 + + // ------------------------------ Scatter + ++// Work around warnings in the intrinsic definitions (passing -1 as a mask). ++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") ++ + #if HWY_TARGET == HWY_AVX3 + namespace detail { + +@@ -1584,6 +1618,8 @@ HWY_INLINE Vec256 GatherIndex{_mm256_i64gather_pd(base, index.raw, 8)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + // ================================================== SWIZZLE + + template +@@ -2379,11 +2415,18 @@ HWY_API Vec128 DemoteTo(Full128< + _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))}; + } + ++ // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'". ++ // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here. ++HWY_DIAGNOSTICS(push) ++HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion") ++ + HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; + } + ++HWY_DIAGNOSTICS(pop) ++ + HWY_API Vec128 DemoteTo(Full128 /* tag */, + const Vec256 v) { + return Vec128{_mm256_cvtpd_ps(v.raw)}; +@@ -2409,7 +2452,7 @@ HWY_API Vec128 U8FromU32(con + return BitCast(Simd(), pair); + } + +-// ------------------------------ Convert integer <=> floating point ++// ------------------------------ Integer <=> fp (ShiftRight, OddEven) + + HWY_API Vec256 ConvertTo(Full256 /* tag */, + const Vec256 v) { +@@ -2421,13 +2464,20 @@ HWY_API Vec256 ConvertTo(Full256 + (void)dd; + return Vec256{_mm256_cvtepi64_pd(v.raw)}; + #else +- alignas(32) int64_t lanes_i[4]; +- Store(v, Full256(), lanes_i); +- alignas(32) double lanes_d[4]; +- for (size_t i = 0; i < 4; ++i) { +- lanes_d[i] = static_cast(lanes_i[i]); +- } +- return Load(dd, lanes_d); ++ // Based on wim's approach (https://stackoverflow.com/questions/41144668/) ++ const Repartition d32; ++ const Repartition d64; ++ ++ // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 ++ const auto k84_63 = Set(d64, 0x4530000080000000ULL); ++ const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); ++ ++ // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) ++ const auto k52 = Set(d32, 0x43300000); ++ const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); ++ ++ const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); ++ return (v_upper - k84_63_52) + v_lower; // order matters! 
+ #endif + } + +@@ -2502,8 +2552,7 @@ HWY_API uint64_t BitsFromMask(hwy::SizeT + const auto compressed = + _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0)); + return static_cast(_mm256_movemask_epi8(compressed)); +- +-#endif ++#endif // HWY_ARCH_X86_64 + } + + template +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/targets.cc.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/targets.cc +--- chromium-92.0.4515.107/third_party/highway/src/hwy/targets.cc.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/targets.cc 2021-07-26 17:17:24.610482240 -0400 +@@ -32,8 +32,8 @@ + #include + #else // HWY_COMPILER_MSVC + #include +-#endif // HWY_COMPILER_MSVC +-#endif ++#endif // HWY_COMPILER_MSVC ++#endif // HWY_ARCH_X86 + + namespace hwy { + namespace { +@@ -126,7 +126,7 @@ constexpr uint32_t kAVX512VL = 1u << 13; + constexpr uint32_t kAVX512DQ = 1u << 14; + constexpr uint32_t kAVX512BW = 1u << 15; + constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW; +-#endif ++#endif // HWY_ARCH_X86 + + } // namespace + +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/targets.h.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/targets.h +--- chromium-92.0.4515.107/third_party/highway/src/hwy/targets.h.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/targets.h 2021-07-26 17:17:24.610482240 -0400 +@@ -65,7 +65,9 @@ + // HWY_MAX_DYNAMIC_TARGETS in total. + #define HWY_HIGHEST_TARGET_BIT_X86 9 + +-// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium ++#define HWY_SVE2 0x400 ++#define HWY_SVE 0x800 ++// 0x1000 reserved for Helium + #define HWY_NEON 0x2000 + + #define HWY_HIGHEST_TARGET_BIT_ARM 13 +@@ -90,6 +92,9 @@ + // 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved + + #define HWY_SCALAR 0x20000000 ++ ++#define HWY_HIGHEST_TARGET_BIT_SCALAR 29 ++ + // Cannot use higher values, otherwise HWY_TARGETS computation might overflow. + + //------------------------------------------------------------------------------ +@@ -106,25 +111,26 @@ + #ifndef HWY_BROKEN_TARGETS + + // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid +-// SSE4 codegen (msan failure), so disable all those targets. ++// SSE4 codegen (possibly only for msan), so disable all those targets. + #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) +-// TODO: Disable all non-scalar targets for every build target once we have +-// clang-7 enabled in our builders. +-#ifdef MEMORY_SANITIZER + #define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3) +-#else +-#define HWY_BROKEN_TARGETS 0 +-#endif + // This entails a major speed reduction, so warn unless the user explicitly + // opts in to scalar-only. + #if !defined(HWY_COMPILE_ONLY_SCALAR) + #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.") + #endif + +-// MSVC, or 32-bit may fail to compile AVX2/3. +-#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32 ++// 32-bit may fail to compile AVX2/3. ++#elif HWY_ARCH_X86_32 + #define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3) +-#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds") ++ ++// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16 ++#elif HWY_COMPILER_MSVC != 0 ++#define HWY_BROKEN_TARGETS (HWY_AVX3) ++ ++// armv7be has not been tested and is not yet supported. 
++#elif HWY_ARCH_ARM_V7 && (defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN))
++#define HWY_BROKEN_TARGETS (HWY_NEON)
+
+ #else
+ #define HWY_BROKEN_TARGETS 0
+@@ -145,53 +151,74 @@
+ // user to override this without any guarantee of success.
+ #ifndef HWY_BASELINE_TARGETS
+
+-#ifdef __wasm_simd128__
++// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
++// HWY_TARGET == HWY_SCALAR.
++
++#if HWY_ARCH_WASM && defined(__wasm_simd128__)
+ #define HWY_BASELINE_WASM HWY_WASM
+ #else
+ #define HWY_BASELINE_WASM 0
+ #endif
+
+-#ifdef __VSX__
++// Avoid choosing the PPC target until we have an implementation.
++#if HWY_ARCH_PPC && defined(__VSX__) && 0
+ #define HWY_BASELINE_PPC8 HWY_PPC8
+ #else
+ #define HWY_BASELINE_PPC8 0
+ #endif
+
+-// GCC 4.5.4 only defines the former; 5.4 defines both.
+-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
++// Avoid choosing the SVE[2] targets until the implementation is ready.
++#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) && 0
++#define HWY_BASELINE_SVE2 HWY_SVE2
++#else
++#define HWY_BASELINE_SVE2 0
++#endif
++
++#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE) && 0
++#define HWY_BASELINE_SVE HWY_SVE
++#else
++#define HWY_BASELINE_SVE 0
++#endif
++
++// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
++#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
+ #define HWY_BASELINE_NEON HWY_NEON
+ #else
+ #define HWY_BASELINE_NEON 0
+ #endif
+
+-#ifdef __SSE4_1__
++// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means
++// we at least get SSE4 on machines supporting AVX but not AVX2.
++// https://stackoverflow.com/questions/18563978/
++#if HWY_ARCH_X86 && \
++    (defined(__SSE4_1__) || (HWY_COMPILER_MSVC != 0 && defined(__AVX__)))
+ #define HWY_BASELINE_SSE4 HWY_SSE4
+ #else
+ #define HWY_BASELINE_SSE4 0
+ #endif
+
+-#ifdef __AVX2__
++#if HWY_ARCH_X86 && defined(__AVX2__)
+ #define HWY_BASELINE_AVX2 HWY_AVX2
+ #else
+ #define HWY_BASELINE_AVX2 0
+ #endif
+
+-#ifdef __AVX512F__
++#if HWY_ARCH_X86 && defined(__AVX512F__)
+ #define HWY_BASELINE_AVX3 HWY_AVX3
+ #else
+ #define HWY_BASELINE_AVX3 0
+ #endif
+
+-#ifdef __riscv_vector
++#if HWY_ARCH_RVV && defined(__riscv_vector)
+ #define HWY_BASELINE_RVV HWY_RVV
+ #else
+ #define HWY_BASELINE_RVV 0
+ #endif
+
+ #define HWY_BASELINE_TARGETS \
+-  (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \
+-   HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
+-   HWY_BASELINE_RVV)
++  (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
++   HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE4 | \
++   HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | HWY_BASELINE_RVV)
+
+ #endif  // HWY_BASELINE_TARGETS
+
+@@ -242,13 +269,12 @@
+ #define HWY_TARGETS HWY_STATIC_TARGET
+
+ // 3) For tests: include all attainable targets (in particular: scalar)
+-#elif defined(HWY_COMPILE_ALL_ATTAINABLE)
++#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
+ #define HWY_TARGETS HWY_ATTAINABLE_TARGETS
+
+ // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
+ // excluding superseded targets, in particular scalar.
+ #else +- + #define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1)) + + #endif // target policy +@@ -323,6 +349,10 @@ static inline HWY_MAYBE_UNUSED const cha + #endif + + #if HWY_ARCH_ARM ++ case HWY_SVE2: ++ return "SVE2"; ++ case HWY_SVE: ++ return "SVE"; + case HWY_NEON: + return "Neon"; + #endif +@@ -346,7 +376,7 @@ static inline HWY_MAYBE_UNUSED const cha + return "Scalar"; + + default: +- return "?"; ++ return "Unknown"; // must satisfy gtest IsValidParamName() + } + } + +@@ -405,21 +435,17 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* SSE3 */ \ + nullptr /* SSE2 */ + +-#endif // HWY_ARCH_X86 +- +-#if HWY_ARCH_ARM ++#elif HWY_ARCH_ARM + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 4 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM + #define HWY_CHOOSE_TARGET_LIST(func_name) \ +- nullptr, /* reserved */ \ +- nullptr, /* reserved */ \ ++ HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \ ++ HWY_CHOOSE_SVE(func_name), /* SVE */ \ + nullptr, /* reserved */ \ + HWY_CHOOSE_NEON(func_name) /* NEON */ + +-#endif // HWY_ARCH_ARM +- +-#if HWY_ARCH_PPC ++#elif HWY_ARCH_PPC + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 5 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC +@@ -430,9 +456,7 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* VSX */ \ + nullptr /* AltiVec */ + +-#endif // HWY_ARCH_PPC +- +-#if HWY_ARCH_WASM ++#elif HWY_ARCH_WASM + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 4 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM +@@ -442,9 +466,7 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* reserved */ \ + HWY_CHOOSE_WASM(func_name) /* WASM */ + +-#endif // HWY_ARCH_WASM +- +-#if HWY_ARCH_RVV ++#elif HWY_ARCH_RVV + // See HWY_ARCH_X86 above for details. + #define HWY_MAX_DYNAMIC_TARGETS 4 + #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV +@@ -454,7 +476,12 @@ static inline HWY_MAYBE_UNUSED const cha + nullptr, /* reserved */ \ + HWY_CHOOSE_RVV(func_name) /* RVV */ + +-#endif // HWY_ARCH_RVV ++#else ++// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though ++// still creating single-entry tables in HWY_EXPORT to ensure portability. ++#define HWY_MAX_DYNAMIC_TARGETS 1 ++#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR ++#endif + + struct ChosenTarget { + public: +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/tests/memory_test.cc.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/tests/memory_test.cc +--- chromium-92.0.4515.107/third_party/highway/src/hwy/tests/memory_test.cc.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/tests/memory_test.cc 2021-07-26 17:10:40.022319820 -0400 +@@ -12,6 +12,12 @@ + // See the License for the specific language governing permissions and + // limitations under the License. + ++// Ensure incompabilities with Windows macros (e.g. #define StoreFence) are ++// detected. Must come before Highway headers. 
++#if defined(_WIN32) || defined(_WIN64) ++#include ++#endif ++ + #include + #include + +@@ -199,13 +205,14 @@ struct TestLoadDup128 { + for (size_t i = 0; i < N128; ++i) { + lanes[i] = static_cast(1 + i); + } +- const auto v = LoadDup128(d, lanes); ++ + const size_t N = Lanes(d); +- auto out = AllocateAligned(N); +- Store(v, d, out.get()); ++ auto expected = AllocateAligned(N); + for (size_t i = 0; i < N; ++i) { +- HWY_ASSERT_EQ(T(i % N128 + 1), out[i]); ++ expected[i] = static_cast(i % N128 + 1); + } ++ ++ HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes)); + #else + (void)d; + #endif +@@ -327,6 +334,84 @@ HWY_NOINLINE void TestAllScatter() { + ForFloatTypes(test); + } + ++// Assumes little-endian byte order! ++struct TestScatter { ++ template ++ HWY_NOINLINE void operator()(T /*unused*/, D d) { ++ using Offset = MakeSigned; ++ ++ const size_t N = Lanes(d); ++ const size_t range = 4 * N; // number of items to scatter ++ const size_t max_bytes = range * sizeof(T); // upper bound on offset ++ ++ RandomState rng; ++ ++ // Data to be scattered ++ auto bytes = AllocateAligned(max_bytes); ++ for (size_t i = 0; i < max_bytes; ++i) { ++ bytes[i] = static_cast(Random32(&rng) & 0xFF); ++ } ++ const auto data = Load(d, reinterpret_cast(bytes.get())); ++ ++ // Scatter into these regions, ensure vector results match scalar ++ auto expected = AllocateAligned(range); ++ auto actual = AllocateAligned(range); ++ ++ const Rebind d_offsets; ++ auto offsets = AllocateAligned(N); // or indices ++ ++ for (size_t rep = 0; rep < 100; ++rep) { ++ // Byte offsets ++ std::fill(expected.get(), expected.get() + range, T(0)); ++ std::fill(actual.get(), actual.get() + range, T(0)); ++ for (size_t i = 0; i < N; ++i) { ++ offsets[i] = ++ static_cast(Random32(&rng) % (max_bytes - sizeof(T))); ++ CopyBytes( ++ bytes.get() + i * sizeof(T), ++ reinterpret_cast(expected.get()) + offsets[i]); ++ } ++ const auto voffsets = Load(d_offsets, offsets.get()); ++ ScatterOffset(data, d, actual.get(), voffsets); ++ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { ++ Print(d, "Data", data); ++ Print(d_offsets, "Offsets", voffsets); ++ HWY_ASSERT(false); ++ } ++ ++ // Indices ++ std::fill(expected.get(), expected.get() + range, T(0)); ++ std::fill(actual.get(), actual.get() + range, T(0)); ++ for (size_t i = 0; i < N; ++i) { ++ offsets[i] = static_cast(Random32(&rng) % range); ++ CopyBytes(bytes.get() + i * sizeof(T), ++ &expected[offsets[i]]); ++ } ++ const auto vindices = Load(d_offsets, offsets.get()); ++ ScatterIndex(data, d, actual.get(), vindices); ++ if (!BytesEqual(expected.get(), actual.get(), max_bytes)) { ++ Print(d, "Data", data); ++ Print(d_offsets, "Indices", vindices); ++ HWY_ASSERT(false); ++ } ++ } ++ } ++}; ++ ++HWY_NOINLINE void TestAllScatter() { ++ // No u8,u16,i8,i16. 
++ const ForPartialVectors test; ++ test(uint32_t()); ++ test(int32_t()); ++ ++#if HWY_CAP_INTEGER64 ++ test(uint64_t()); ++ test(int64_t()); ++#endif ++ ++ ForFloatTypes(test); ++} ++ + struct TestGather { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +@@ -391,6 +476,7 @@ HWY_NOINLINE void TestAllCache() { + int test = 0; + Prefetch(&test); + FlushCacheline(&test); ++ Pause(); + } + + // NOLINTNEXTLINE(google-readability-namespace-comments) +diff -up chromium-92.0.4515.107/third_party/highway/src/hwy/tests/swizzle_test.cc.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/hwy/tests/swizzle_test.cc +--- chromium-92.0.4515.107/third_party/highway/src/hwy/tests/swizzle_test.cc.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/hwy/tests/swizzle_test.cc 2021-07-26 17:10:40.023319835 -0400 +@@ -223,6 +223,7 @@ struct TestTableLookupBytes { + HWY_NOINLINE void TestAllTableLookupBytes() { + ForIntegerTypes(ForPartialVectors()); + } ++ + struct TestTableLookupLanes { + #if HWY_TARGET == HWY_RVV + using Index = uint32_t; +@@ -242,12 +243,13 @@ struct TestTableLookupLanes { + if (N <= 8) { // Test all permutations + for (size_t i0 = 0; i0 < N; ++i0) { + idx[0] = static_cast(i0); ++ + for (size_t i1 = 0; i1 < N; ++i1) { +- idx[1] = static_cast(i1); ++ if (N >= 2) idx[1] = static_cast(i1); + for (size_t i2 = 0; i2 < N; ++i2) { +- idx[2] = static_cast(i2); ++ if (N >= 4) idx[2] = static_cast(i2); + for (size_t i3 = 0; i3 < N; ++i3) { +- idx[3] = static_cast(i3); ++ if (N >= 4) idx[3] = static_cast(i3); + + for (size_t i = 0; i < N; ++i) { + expected[i] = static_cast(idx[i] + 1); // == v[idx[i]] +@@ -286,7 +288,7 @@ struct TestTableLookupLanes { + }; + + HWY_NOINLINE void TestAllTableLookupLanes() { +- const ForFullVectors test; ++ const ForPartialVectors test; + test(uint32_t()); + test(int32_t()); + test(float()); +diff -up chromium-92.0.4515.107/third_party/highway/src/README.md.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/README.md +--- chromium-92.0.4515.107/third_party/highway/src/README.md.update-highway-0.12.2 2021-07-26 17:10:40.838332249 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/README.md 2021-07-26 17:15:00.832292309 -0400 +@@ -15,7 +15,7 @@ applying the same operation to 'lanes'. + ## Current status + + Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD. +-A port to RVV is in progress. ++Ports to RVV and SVE/SVE2 are in progress. + + Version 0.11 is considered stable enough to use in other projects, and is + expected to remain backwards compatible unless serious issues are discovered +@@ -23,8 +23,11 @@ while implementing SVE/RVV targets. Afte + reach version 1.0. + + Continuous integration tests build with a recent version of Clang (running on +-x86 and QEMU for ARM) and MSVC from VS2015 (running on x86). Also periodically +-tested on x86 with Clang 7-11 and GCC 8, 9 and 10.2.1. ++x86 and QEMU for ARM) and MSVC from VS2015 (running on x86). ++ ++Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via ++GCC cross-compile and QEMU. See the ++[testing process](g3doc/release_testing_process.md) for details. + + The `contrib` directory contains SIMD-related utilities: an image class with + aligned rows, and a math library (16 functions already implemented, mostly +@@ -63,6 +66,8 @@ To test on all the attainable targets fo + default configuration skips baseline targets (e.g. 
scalar) that are superseded + by another baseline target. + ++Bazel is also supported for building, but it is not as widely used/tested. ++ + ## Quick start + + You can use the `benchmark` inside examples/ as a starting point. +diff -up chromium-92.0.4515.107/third_party/highway/src/run_tests.bat.update-highway-0.12.2 chromium-92.0.4515.107/third_party/highway/src/run_tests.bat +--- chromium-92.0.4515.107/third_party/highway/src/run_tests.bat.update-highway-0.12.2 2021-07-19 14:47:23.000000000 -0400 ++++ chromium-92.0.4515.107/third_party/highway/src/run_tests.bat 2021-07-26 17:14:47.466088723 -0400 +@@ -2,9 +2,9 @@ + REM Switch directory of this batch file + cd %~dp0 + +-if not exist build mkdir build ++if not exist build_win mkdir build_win + +-cd build ++cd build_win + cmake .. -G Ninja || goto error + ninja || goto error + ctest -j || goto error diff --git a/chromium-92.0.4515.107-widevine-other-locations.patch b/chromium-92.0.4515.107-widevine-other-locations.patch new file mode 100644 index 0000000..739778d --- /dev/null +++ b/chromium-92.0.4515.107-widevine-other-locations.patch @@ -0,0 +1,20 @@ +diff -up chromium-92.0.4515.107/chrome/common/chrome_paths.cc.widevine-other-locations chromium-92.0.4515.107/chrome/common/chrome_paths.cc +--- chromium-92.0.4515.107/chrome/common/chrome_paths.cc.widevine-other-locations 2021-07-26 16:50:41.815065696 -0400 ++++ chromium-92.0.4515.107/chrome/common/chrome_paths.cc 2021-07-26 16:58:08.334868284 -0400 +@@ -313,6 +313,16 @@ bool PathProvider(int key, base::FilePat + + #if BUILDFLAG(ENABLE_WIDEVINE) + case chrome::DIR_BUNDLED_WIDEVINE_CDM: ++ base::PathService::Get(base::DIR_HOME, &cur); ++ cur = cur.Append(FILE_PATH_LITERAL(".local/lib/libwidevinecdm.so")); ++ if (base::PathExists(cur)) { ++ break; ++ } ++ // Yes, this has an arch hardcoded in the path, but at this time, it is the only place to find libwidevinecdm.so ++ if (base::PathExists(base::FilePath(FILE_PATH_LITERAL("/opt/google/chrome/WidevineCdm/_platform_specific/linux_x64/libwidevinecdm.so")))) { ++ cur = base::FilePath(FILE_PATH_LITERAL("/opt/google/chrome/WidevineCdm/_platform_specific/linux_x64/libwidevinecdm.so")); ++ break; ++ } + if (!GetComponentDirectory(&cur)) + return false; + #if !BUILDFLAG(IS_CHROMEOS_ASH) diff --git a/chromium-freetype-2.11.patch b/chromium-freetype-2.11.patch new file mode 100644 index 0000000..aee6dc8 --- /dev/null +++ b/chromium-freetype-2.11.patch @@ -0,0 +1,50 @@ +--- a/third_party/skia/src/ports/SkFontHost_FreeType_common.cpp ++++ b/third_party/skia/src/ports/SkFontHost_FreeType_common.cpp +@@ -712,7 +712,11 @@ void colrv1_draw_paint(SkCanvas* canvas, + canvas->drawPaint(colrPaint); + break; + } ++#if FREETYPE_MAJOR == 2 && FREETYPE_MINOR >= 11 ++ case FT_COLR_PAINTFORMAT_TRANSFORM: ++#else + case FT_COLR_PAINTFORMAT_TRANSFORMED: ++#endif + case FT_COLR_PAINTFORMAT_TRANSLATE: + case FT_COLR_PAINTFORMAT_ROTATE: + case FT_COLR_PAINTFORMAT_SKEW: +@@ -759,10 +763,17 @@ void colrv1_transform(SkCanvas* canvas, FT_Face face, FT_COLR_Paint colrv1_paint + SkMatrix transform; + + switch (colrv1_paint.format) { ++#if FREETYPE_MAJOR == 2 && FREETYPE_MINOR >= 11 ++ case FT_COLR_PAINTFORMAT_TRANSFORM: { ++ transform = ToSkMatrix(colrv1_paint.u.transform.affine); ++ break; ++ } ++#else + case FT_COLR_PAINTFORMAT_TRANSFORMED: { + transform = ToSkMatrix(colrv1_paint.u.transformed.affine); + break; + } ++#endif + case FT_COLR_PAINTFORMAT_TRANSLATE: { + transform = SkMatrix::Translate( + SkFixedToScalar(colrv1_paint.u.translate.dx), +@@ -880,10 +891,17 @@ bool 
colrv1_traverse_paint(SkCanvas* canvas,
+ traverse_result = colrv1_start_glyph(canvas, palette, face, paint.u.colr_glyph.glyphID,
+ FT_COLOR_NO_ROOT_TRANSFORM);
+ break;
++#if FREETYPE_MAJOR == 2 && FREETYPE_MINOR >= 11
++ case FT_COLR_PAINTFORMAT_TRANSFORM:
++ colrv1_transform(canvas, face, paint);
++ traverse_result = colrv1_traverse_paint(canvas, palette, face,
++ paint.u.transform.paint, visited_set);
++#else
+ case FT_COLR_PAINTFORMAT_TRANSFORMED:
+ colrv1_transform(canvas, face, paint);
+ traverse_result = colrv1_traverse_paint(canvas, palette, face,
+ paint.u.transformed.paint, visited_set);
++#endif
+ break;
+ case FT_COLR_PAINTFORMAT_TRANSLATE:
+ colrv1_transform(canvas, face, paint);
diff --git a/chromium.spec b/chromium.spec
index e4af9ba..4f13b61 100644
--- a/chromium.spec
+++ b/chromium.spec
@@ -31,6 +31,16 @@
 # This doesn't work and it doesn't even build as of Chromium 83
 %global build_remoting 1
 
+# This will probably be truly possible with Chromium 93
+# Right now, we fake it a bit and pull in both python2 and python3 stacks. Sorry.
+%global build_with_python3 1
+
+%if 0%{?build_with_python3}
+%global chromium_pybin %{__python3}
+%else
+%global chromium_pybin %{__python2}
+%endif
+
 # We'd like to always have this on...
 # ... but the libva in EL7 (and EL8) is too old.
 %if 0%{?rhel} == 7 || 0%{?rhel} == 8
@@ -208,14 +218,14 @@ BuildRequires: libicu-devel >= 5.4
 %global chromoting_client_id %nil
 %endif
 
-%global majorversion 91
+%global majorversion 92
 
 %if %{freeworld}
 Name: chromium%{chromium_channel}%{nsuffix}
 %else
 Name: chromium%{chromium_channel}
 %endif
-Version: %{majorversion}.0.4472.164
+Version: %{majorversion}.0.4515.159
 Release: 1%{?dist}
 %if %{?freeworld}
 %if %{?shared}
@@ -244,18 +254,22 @@ Patch4: chromium-60.0.3112.78-jpeg-nomangle.patch
 # Do not mangle zlib
 Patch5: chromium-77.0.3865.75-no-zlib-mangle.patch
 # Do not use unrar code, it is non-free
-Patch6: chromium-89.0.4389.72-norar.patch
+Patch6: chromium-92.0.4515.107-norar.patch
 # Use Gentoo's Widevine hack
 # https://gitweb.gentoo.org/repo/gentoo.git/tree/www-client/chromium/files/chromium-widevine-r3.patch
 Patch7: chromium-71.0.3578.98-widevine-r3.patch
 # Disable fontconfig cache magic that breaks remoting
 Patch8: chromium-91.0.4472.77-disable-fontconfig-cache-magic.patch
 # drop rsp clobber, which breaks gcc9 (thanks to Jeff Law)
 Patch9: chromium-78.0.3904.70-gcc9-drop-rsp-clobber.patch
 # Try to load widevine from other places
-Patch10: chromium-89.0.4389.72-widevine-other-locations.patch
+Patch10: chromium-92.0.4515.107-widevine-other-locations.patch
 # Try to fix version.py for Rawhide
-Patch11: chromium-71.0.3578.98-py2-bootstrap.patch
+%if 0%{?build_with_python3}
+Patch11: chromium-92.0.4515.107-py3-bootstrap.patch
+%else
+Patch11: chromium-92.0.4515.107-py2-bootstrap.patch
+%endif
 # Add "Fedora" to the user agent string
 Patch12: chromium-86.0.4240.75-fedora-user-agent.patch
@@ -274,13 +288,11 @@ Patch57: chromium-89.0.4389.72-missing-cstring-header.patch
 # prepare for using system ffmpeg (clean)
 # http://svnweb.mageia.org/packages/cauldron/chromium-browser-stable/current/SOURCES/chromium-53-ffmpeg-no-deprecation-errors.patch?view=markup
 Patch58: chromium-53-ffmpeg-no-deprecation-errors.patch
-# https://github.com/stha09/chromium-patches/blob/master/chromium-91-pcscan-vector-types.patch
-Patch59: chromium-91-pcscan-vector-types.patch
 # https://github.com/stha09/chromium-patches/blob/master/chromium-91-libyuv-aarch64.patch
 Patch60: chromium-91-libyuv-aarch64.patch
 # Update third_party/highway to 0.12.2
 # this is needed for sane arm/aarch64
-Patch61: chromium-91.0.4472.77-update-highway-0.12.2.patch
+Patch61: chromium-92.0.4515.107-update-highway-0.12.2.patch
 # https://github.com/stha09/chromium-patches/blob/master/chromium-90-ruy-include.patch
 Patch62: chromium-90-ruy-include.patch
 # Extra CXXFLAGS for aarch64
@@ -290,7 +302,7 @@ Patch63: chromium-91.0.4472.77-aarch64-cxxflags-addition.patch
 Patch64: chromium-91.0.4472.77-java-only-allowed-in-android-builds.patch
 
 # Silence GCC warnings during gn compile
-Patch65: chromium-84.0.4147.105-gn-gcc-cleanup.patch
+Patch65: chromium-92.0.4515.107-gn-gcc-cleanup.patch
 # Fix missing cstring in remoting code
 Patch66: chromium-84.0.4147.125-remoting-cstring.patch
 # Apply fix_textrels hack for i686 (even without lld)
@@ -301,17 +313,27 @@ Patch68: chromium-84.0.4147.125-aarch64-clearkeycdm-binutils-workaround.patch
 # Thanks to Kevin Kofler for the fix.
 Patch75: chromium-90.0.4430.72-fstatfix.patch
 # Rawhide (f35) glibc defines SIGSTKSZ as a long instead of a constant
-Patch76: chromium-88.0.4324.182-rawhide-gcc-std-max-fix.patch
+Patch76: chromium-92.0.4515.107-rawhide-gcc-std-max-fix.patch
 # Fix symbol visibility with gcc on swiftshader's libEGL
 Patch77: chromium-88.0.4324.182-gcc-fix-swiftshader-libEGL-visibility.patch
 # Do not download proprietary widevine module in the background (thanks Debian)
 Patch79: chromium-90.0.4430.72-widevine-no-download.patch
 # Fix crashes with components/cast_*
 # Thanks to Gentoo
-Patch80: https://gitweb.gentoo.org/repo/gentoo.git/plain/www-client/chromium/files/chromium-89-EnumTable-crash.patch
-# Fix crashes with ThemeService, thanks OpenSUSE
-Patch81: chromium-91-1190561-boo1186948.patch
-
+Patch80: chromium-92.0.4515.107-EnumTable-crash.patch
+# https://github.com/stha09/chromium-patches/blob/master/chromium-92-v8-constexpr.patch
+Patch82: chromium-92-v8-constexpr.patch
+# Fixes for python3
+Patch83: chromium-92.0.4515.107-py3-fixes.patch
+# Fix build with Freetype 2.11
+Patch84: https://gitweb.gentoo.org/repo/gentoo.git/plain/www-client/chromium/files/chromium-freetype-2.11.patch
+# https://bugs.chromium.org/p/chromium/issues/detail?id=1213452
+# https://chromium.googlesource.com/chromium/src/sandbox/+/482404adee4fc0487452c7ae5ac9c192b0f4fd30%5E%21/#F0
+# Needed for F35+, but safe everywhere
+Patch85: chromium-92.0.4515.107-sandbox-clone3.patch
+# Clean up clang-format for python3
+# thanks to Jon Nettleton
+Patch86: chromium-92-clang-format.patch
 # Use lstdc++ on EPEL7 only
 Patch101: chromium-75.0.3770.100-epel7-stdc++.patch
@@ -339,7 +361,6 @@ Patch109: chromium-90.0.4430.93-epel7-erase-fix.patch
 # AARCH64 neon symbols need to be prefixed too to prevent multiple definition issue at linktime
 Patch110: chromium-90.0.4430.93-epel8-aarch64-libpng16-symbol-prefixes.patch
 
-
 # VAAPI
 # Upstream turned VAAPI on in Linux in 86
 Patch202: chromium-89.0.4389.72-enable-hardware-accelerated-mjpeg.patch
@@ -347,7 +368,7 @@ Patch203: chromium-86.0.4240.75-vaapi-i686-fpermissive.patch
 Patch205: chromium-86.0.4240.75-fix-vaapi-on-intel.patch
 
 # Apply these patches to work around EPEL8 issues
-Patch300: chromium-89.0.4389.82-rhel8-force-disable-use_gnome_keyring.patch
+Patch300: chromium-92.0.4515.107-rhel8-force-disable-use_gnome_keyring.patch
 
 # And fixes for new compilers
 Patch400: %{name}-gcc11.patch
@@ -463,6 +484,8 @@ BuildRequires: libstdc++-devel, openssl-devel
 # Fedora tries to use system libs whenever it can.
BuildRequires: bzip2-devel BuildRequires: dbus-glib-devel +# For eu-strip +BuildRequires: elfutils BuildRequires: elfutils-libelf-devel BuildRequires: flac-devel %if 0%{?bundlefreetype} @@ -529,17 +552,23 @@ BuildRequires: pkgconfig(gtk+-3.0) %else BuildRequires: pkgconfig(gtk+-2.0) %endif -BuildRequires: /usr/bin/python2 +BuildRequires: %{chromium_pybin} +# %%if ! %%{build_with_python3} BuildRequires: python2-devel +# %%else +BuildRequires: python3-devel +# %%endif + +# %%if 0%{?build_with_python3} %if 0%{?bundlepylibs} # Using bundled bits, do nothing. %else %if 0%{?fedora} -BuildRequires: python2-beautifulsoup4 -BuildRequires: python2-beautifulsoup -BuildRequires: python2-html5lib -BuildRequires: python2-markupsafe -BuildRequires: python2-ply +BuildRequires: python3-beautifulsoup4 +# BuildRequires: python2-beautifulsoup +BuildRequires: python3-html5lib +BuildRequires: python3-markupsafe +BuildRequires: python3-ply %else BuildRequires: python-beautifulsoup4 BuildRequires: python-BeautifulSoup @@ -547,8 +576,30 @@ BuildRequires: python-html5lib BuildRequires: python-markupsafe BuildRequires: python-ply %endif -BuildRequires: python2-simplejson +BuildRequires: python3-simplejson %endif +#%%else +%if 0%{?bundlepylibs} +# Using bundled bits, do nothing. +%else +%if 0%{?fedora} +BuildRequires: python2-beautifulsoup4 +BuildRequires: python2-beautifulsoup +BuildRequires: python2-html5lib +BuildRequires: python2-markupsafe +BuildRequires: python2-ply +%else +BuildRequires: python-beautifulsoup4 +BuildRequires: python-BeautifulSoup +BuildRequires: python-html5lib +BuildRequires: python-markupsafe +BuildRequires: python-ply +%endif +BuildRequires: python2-simplejson +%endif +# %%endif + + %if 0%{?bundlere2} # Using bundled bits, do nothing. %else @@ -859,7 +910,11 @@ Requires(post): systemd Requires(preun): systemd Requires(postun): systemd Requires: xorg-x11-server-Xvfb +%if 0%{?build_with_python3} +Requires: python3-psutil +%else Requires: python2-psutil +%endif %if 0%{?shared} Requires: chromium-libs%{_isa} = %{version}-%{release} %else @@ -916,7 +971,9 @@ udev. %patch8 -p1 -b .nofontconfigcache %patch9 -p1 -b .gcc9 %patch10 -p1 -b .widevine-other-locations -%patch11 -p1 -b .py2 +%if 0%{?build_with_python3} +%patch11 -p1 -b .py3 +%endif # Short term fixes (usually gcc and backports) %patch51 -p1 -b .gcc-remoting-constexpr @@ -928,7 +985,6 @@ udev. %patch56 -p1 -b .missing-cstdint %patch57 -p1 -b .missing-cstring %patch58 -p1 -b .ffmpeg-deprecations -%patch59 -p1 -b .pcscan-vector-types %patch60 -p1 -b .libyuv-aarch64 %patch61 -p1 -b .update-highway-0.12.2 %patch62 -p1 -b .ruy-include @@ -945,7 +1001,12 @@ udev. %patch77 -p1 -b .gcc-swiftshader-visibility %patch79 -p1 -b .widevine-no-download %patch80 -p1 -b .EnumTable-crash -%patch81 -p1 -b .ThemeService-crash +%patch82 -p1 -b .v8-constexpr +%patch83 -p1 -b .py3fixes +%patch84 -p1 -b .freetype-2.11 +%patch85 -p1 -b .clone3 +# Still using python2 in 92. +# %%patch86 -p1 -b .clang-format-py3 # Fedora branded user agent %if 0%{?fedora} @@ -984,7 +1045,11 @@ udev. # Change shebang in all relevant files in this directory and all subdirectories # See `man find` for how the `-exec command {} +` syntax works +%if 0%{?build_with_python3} +find -type f -exec sed -iE '1s=^#! */usr/bin/\(python\|env python\)[23]\?=#!%{__python3}=' {} + +%else find -type f -exec sed -iE '1s=^#! 
find -type f -exec sed -iE '1s=^#! */usr/bin/\(python\|env python\)[23]\?=#!%{__python2}=' {} +
+%endif

%if 0%{?asan}
export CC="clang"
@@ -1162,6 +1227,7 @@ build/linux/unbundle/remove_bundled_libraries.py \
'base/third_party/valgrind' \
'base/third_party/xdg_mime' \
'base/third_party/xdg_user_dirs' \
+ 'buildtools/third_party/eu-strip' \
'buildtools/third_party/libc++' \
'buildtools/third_party/libc++abi' \
'chrome/third_party/mozilla_security_manager' \
@@ -1222,7 +1288,7 @@ build/linux/unbundle/remove_bundled_libraries.py \
'third_party/devtools-frontend/src/front_end/third_party/axe-core' \
'third_party/devtools-frontend/src/front_end/third_party/chromium' \
'third_party/devtools-frontend/src/front_end/third_party/codemirror' \
- 'third_party/devtools-frontend/src/front_end/third_party/fabricjs' \
+ 'third_party/devtools-frontend/src/front_end/third_party/diff' \
'third_party/devtools-frontend/src/front_end/third_party/i18n' \
'third_party/devtools-frontend/src/front_end/third_party/intl-messageformat' \
'third_party/devtools-frontend/src/front_end/third_party/lighthouse' \
@@ -1375,7 +1441,6 @@ build/linux/unbundle/remove_bundled_libraries.py \
'third_party/tflite/src/third_party/eigen3' \
'third_party/tflite/src/third_party/fft2d' \
'third_party/tflite-support' \
- 'third_party/tint' \
'third_party/ukey2' \
'third_party/usb_ids' \
'third_party/usrsctp' \
@@ -1415,8 +1480,12 @@ build/linux/unbundle/remove_bundled_libraries.py \
%if ! 0%{?bundlepylibs}
# Look, I don't know. This package is spit and chewing gum. Sorry.
rm -rf third_party/markupsafe
+%if 0%{?build_with_python3}
+ln -s %{python3_sitearch}/markupsafe third_party/markupsafe
+%else
ln -s %{python2_sitearch}/markupsafe third_party/markupsafe
-# We should look on removing other python2 packages as well i.e. ply
+%endif
+# We should look at removing other python packages as well, e.g. ply
%endif

# Fix hardcoded path in remoting code
@@ -1493,6 +1562,11 @@ sed -i '/aarch64)/ a \ exec "/usr/bin/ninja-build" "$@";;\' ../depot_tool
%endif
sed -i 's|exec "${THIS_DIR}/ninja-linux${LONG_BIT}"|exec "/usr/bin/ninja-build"|g' ../depot_tools/ninja
+# Get rid of the pre-built eu-strip binary; it is x86_64-only and of mysterious origin
+rm -rf buildtools/third_party/eu-strip/bin/eu-strip
+# Replace it with a symlink to the Fedora copy
+ln -s %{_bindir}/eu-strip buildtools/third_party/eu-strip/bin/eu-strip
+
%if 0%{?rhel} == 7
. /opt/rh/devtoolset-%{dts_version}/enable
%endif
@@ -1502,24 +1576,29 @@ sed -i 's|exec "${THIS_DIR}/ninja-linux${LONG_BIT}"|exec "/usr/bin/ninja-build"|
%endif

# Check that there is no system 'google' module, shadowing bundled ones:
+%if 0%{?build_with_python3}
+if python3 -c 'import google ; print(google.__path__)' 2> /dev/null ; then \
+ echo "Python 3 'google' module is defined, this will shadow modules of this build"; \
+%else
if python2 -c 'import google ; print google.__path__' 2> /dev/null ; then \
echo "Python 2 'google' module is defined, this will shadow modules of this build"; \
+%endif
exit 1 ; \
fi

tools/gn/bootstrap/bootstrap.py -v --no-clean --gn-gen-args="$CHROMIUM_CORE_GN_DEFINES $CHROMIUM_BROWSER_GN_DEFINES"

-%{builddir}/gn --script-executable=/usr/bin/python2 gen --args="$CHROMIUM_CORE_GN_DEFINES $CHROMIUM_BROWSER_GN_DEFINES" %{builddir}
+%{builddir}/gn --script-executable=%{chromium_pybin} gen --args="$CHROMIUM_CORE_GN_DEFINES $CHROMIUM_BROWSER_GN_DEFINES" %{builddir}

%if %{freeworld}
# do not need to do headless gen
%else
%if %{build_headless}
-%{builddir}/gn --script-executable=/usr/bin/python2 gen --args="$CHROMIUM_CORE_GN_DEFINES $CHROMIUM_HEADLESS_GN_DEFINES" %{headlessbuilddir}
+%{builddir}/gn --script-executable=%{chromium_pybin} gen --args="$CHROMIUM_CORE_GN_DEFINES $CHROMIUM_HEADLESS_GN_DEFINES" %{headlessbuilddir}
%endif
%endif

%if %{build_remoting}
-%{builddir}/gn --script-executable=/usr/bin/python2 gen --args="$CHROMIUM_CORE_GN_DEFINES $CHROMIUM_BROWSER_GN_DEFINES" %{remotingbuilddir}
+%{builddir}/gn --script-executable=%{chromium_pybin} gen --args="$CHROMIUM_CORE_GN_DEFINES $CHROMIUM_BROWSER_GN_DEFINES" %{remotingbuilddir}
%endif

%if %{bundlelibusbx}
@@ -1555,7 +1634,8 @@ tar xf %{SOURCE20}
%global optflags %(echo %{optflags} | sed 's/-g /-g1 /')
%endif

-export PYTHONPATH="../../third_party/pyjson5/src:../../third_party/catapult/third_party/google-endpoints:../../xcb-proto-1.14"
+# export PYTHONPATH="../../third_party/pyjson5/src:../../third_party/catapult/third_party/google-endpoints:../../xcb-proto-1.14"
+export PYTHONPATH="../../third_party/pyjson5/src:../../xcb-proto-1.14"
echo

# Now do the full browser
@@ -2010,6 +2090,20 @@ getent group chrome-remote-desktop >/dev/null || groupadd -r chrome-remote-deskt
%changelog
+* Tue Aug 17 2021 Tom Callaway <spot@fedoraproject.org> - 92.0.4515.159-1
+- update to 92.0.4515.159
+
+* Mon Aug 16 2021 Tom Callaway <spot@fedoraproject.org> - 92.0.4515.131-1
+- update to 92.0.4515.131
+- apply upstream fix for clone3 crash
+
+* Mon Jul 26 2021 Tom Callaway <spot@fedoraproject.org> - 92.0.4515.107-1
+- update to 92.0.4515.107
+- drop python2 deps (finally)
+
+* Wed Jul 21 2021 Fedora Release Engineering <releng@fedoraproject.org> - 91.0.4472.164-2
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_35_Mass_Rebuild
+
* Fri Jul 16 2021 Tom Callaway <spot@fedoraproject.org> - 91.0.4472.164-1
- update to 91.0.4472.164
diff --git a/sources b/sources
index 2e8e02b..aef7285 100644
--- a/sources
+++ b/sources
@@ -20,4 +20,4 @@ SHA512 (xcb-proto-1.14.tar.xz) = de66d568163b6da2be9d6c59984f3afa3acd119a7813786
SHA512 (depot_tools.git-master.tar.gz) = dc323888812b66cc92c53a24a8a58ccf9e2961be67aa21852bd091b8b49569071f06ae9104cb58950e6253ac3a29f0db0663e9f35ef2b1ea28696efb38b42708
SHA512 (NotoSansSymbols2-Regular.ttf) = 2644b42c3fdccfe12395f9b61553aced169a0f1dc09f5a0fd7898e9d0a372ee4422b6b1cdab3c86ecc91db437e9ae8a951e64e85edc3ac9e9fca428852dbb2ad
SHA512 (NotoSansTibetan-Regular.ttf) = fb5a48fcaea80eebe7d692f6fcf00d59d47658a358d0ec8e046fc559873f88bd595b2da474d2826abd9e9305f3741c69058d867b1e6048f37fe7d71b5d3af36a
-SHA512 (chromium-91.0.4472.164-clean.tar.xz) = 71e3449e2042d83df50a7ea753e5b09bfc331640665f0d9b8d99a424b32a316a92c7bb2726ce51c5418936f423f2f7167fcfc57d51ca658e7e32347e8452efbc
+SHA512 (chromium-92.0.4515.159-clean.tar.xz) = e5062c35c55232f672008d7c4a06daa69a92e0ed4104ea78e60280e2d7d20bcbf1b52c33229fd3b81983b02cbc949d188e5385b6250662248e11660d1fced31d
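
Each line in sources maps a tarball to its SHA512 digest; fedpkg reads this file when fetching and verifying the lookaside-cache sources. As a minimal sketch (assuming the clean tarball sits in the working directory), the new entry can also be checked by hand by reshaping it into the "hash  filename" form that sha512sum -c expects:

  echo "e5062c35c55232f672008d7c4a06daa69a92e0ed4104ea78e60280e2d7d20bcbf1b52c33229fd3b81983b02cbc949d188e5385b6250662248e11660d1fced31d  chromium-92.0.4515.159-clean.tar.xz" | sha512sum -c -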