Sync with upstream branch release/2.34/master

Upstream commit: 55640ed3fde48360a8e8083be4843bd2dc7cecfe - i386: Regenerate ulps - linux: Fix missing internal 64 bit time_t stat usage - x86: Optimize L(less_vec) case in memcmp-evex-movbe.S - x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI - x86-64: Use notl in EVEX strcmp [BZ #28646] - x86: Shrink memcmp-sse4.S code size - x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h - x86: Optimize memmove-vec-unaligned-erms.S - x86-64: Replace movzx with movzbl - x86-64: Remove Prefer_AVX2_STRCMP - x86-64: Improve EVEX strcmp with masked load - x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S - x86: Optimize memset-vec-unaligned-erms.S - x86: Optimize memcmp-evex-movbe.S for frontend behavior and size - x86: Modify ENTRY in sysdep.h so that p2align can be specified - x86-64: Optimize load of all bits set into ZMM register [BZ #28252] - scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier - dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078) - INSTALL: Rephrase -with-default-link documentation - misc: Fix rare fortify crash on wchar funcs. [BZ 29030] - Default to --with-default-link=no (bug 25812) - scripts: Add glibcelf.py module
2022-04-27 22:27:50 -04:00 · 2022-04-27 22:27:50 -04:00 · 4e3257320c
parent a8db42ba53
commit 4e3257320c
23 changed files with 8751 additions and 4 deletions
--- a/glibc-upstream-2.34-167.patch
+++ b/glibc-upstream-2.34-167.patch
--- a/glibc-upstream-2.34-168.patch
+++ b/glibc-upstream-2.34-168.patch
@ -0,0 +1,407 @@
+commit f0c71b34f96c816292c49122d50da3a511b67bf2
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Mon Apr 11 11:30:31 2022 +0200
+
+    Default to --with-default-link=no (bug 25812)
+    
+    This is necessary to place the libio vtables into the RELRO segment.
+    New tests elf/tst-relro-ldso and elf/tst-relro-libc are added to
+    verify that this is what actually happens.
+    
+    The new tests fail on ia64 due to lack of (default) RELRO support
+    in binutils, so they are XFAILed there.
+    
+    (cherry picked from commit 198abcbb94618730dae1b3f4393efaa49e0ec8c7)
+
+diff --git a/INSTALL b/INSTALL
+index d8d4e9f155f56616..60d01568d77645c7 100644
+--- a/INSTALL
+++ b/INSTALL
+@@ -90,6 +90,12 @@ if 'CFLAGS' is specified it must enable optimization.  For example:
+      library will still be usable, but functionality may be lost--for
+      example, you can't build a shared libc with old binutils.
+ 
+'--with-default-link=FLAG'
+     With '--with-default-link=yes', the build system does not use a
+     custom linker script for linking shared objects.  The default for
+     FLAG is the opposite, 'no', because the custom linker script is
+     needed for full RELRO protection.
+
+ '--with-nonshared-cflags=CFLAGS'
+      Use additional compiler flags CFLAGS to build the parts of the
+      library which are always statically linked into applications and
+diff --git a/configure b/configure
+index 03f4e59e754b5463..34c64f8de44e3086 100755
+--- a/configure
+++ b/configure
+@@ -3373,7 +3373,7 @@ fi
+ if test "${with_default_link+set}" = set; then :
+   withval=$with_default_link; use_default_link=$withval
+ else
+-  use_default_link=default
+  use_default_link=no
+ fi
+ 
+ 
+@@ -6085,69 +6085,6 @@ fi
+ $as_echo "$libc_cv_hashstyle" >&6; }
+ 
+ 
+-# The linker's default -shared behavior is good enough if it
+-# does these things that our custom linker scripts ensure that
+-# all allocated NOTE sections come first.
+-if test "$use_default_link" = default; then
+-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sufficient default -shared layout" >&5
+-$as_echo_n "checking for sufficient default -shared layout... " >&6; }
+-if ${libc_cv_use_default_link+:} false; then :
+-  $as_echo_n "(cached) " >&6
+-else
+-    libc_cv_use_default_link=no
+-  cat > conftest.s <<\EOF
+-	  .section .note.a,"a",%note
+-	  .balign 4
+-	  .long 4,4,9
+-	  .string "GNU"
+-	  .string "foo"
+-	  .section .note.b,"a",%note
+-	  .balign 4
+-	  .long 4,4,9
+-	  .string "GNU"
+-	  .string "bar"
+-EOF
+-  if { ac_try='  ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&5'
+-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+-  (eval $ac_try) 2>&5
+-  ac_status=$?
+-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+-  test $ac_status = 0; }; } &&
+-       ac_try=`$READELF -S conftest.so | sed -n \
+-	 '${x;p;}
+-	  s/^ *\[ *[1-9][0-9]*\]  *\([^ ][^ ]*\)  *\([^ ][^ ]*\) .*$/\2 \1/
+-	  t a
+-	  b
+-	  : a
+-	  H'`
+-  then
+-    libc_seen_a=no libc_seen_b=no
+-    set -- $ac_try
+-    while test $# -ge 2 -a "$1" = NOTE; do
+-      case "$2" in
+-      .note.a) libc_seen_a=yes ;;
+-      .note.b) libc_seen_b=yes ;;
+-      esac
+-      shift 2
+-    done
+-    case "$libc_seen_a$libc_seen_b" in
+-    yesyes)
+-      libc_cv_use_default_link=yes
+-      ;;
+-    *)
+-      echo >&5 "\
+-$libc_seen_a$libc_seen_b from:
+-$ac_try"
+-      ;;
+-    esac
+-  fi
+-  rm -f conftest*
+-fi
+-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_use_default_link" >&5
+-$as_echo "$libc_cv_use_default_link" >&6; }
+-  use_default_link=$libc_cv_use_default_link
+-fi
+-
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GLOB_DAT reloc" >&5
+ $as_echo_n "checking for GLOB_DAT reloc... " >&6; }
+ if ${libc_cv_has_glob_dat+:} false; then :
+diff --git a/configure.ac b/configure.ac
+index eb9431875fae1b0e..2c69af0807266e7e 100644
+--- a/configure.ac
+++ b/configure.ac
+@@ -153,7 +153,7 @@ AC_ARG_WITH([default-link],
+ 	    AS_HELP_STRING([--with-default-link],
+ 			   [do not use explicit linker scripts]),
+ 	    [use_default_link=$withval],
+-	    [use_default_link=default])
+	    [use_default_link=no])
+ 
+ dnl Additional build flags injection.
+ AC_ARG_WITH([nonshared-cflags],
+@@ -1378,59 +1378,6 @@ fi
+ rm -f conftest*])
+ AC_SUBST(libc_cv_hashstyle)
+ 
+-# The linker's default -shared behavior is good enough if it
+-# does these things that our custom linker scripts ensure that
+-# all allocated NOTE sections come first.
+-if test "$use_default_link" = default; then
+-  AC_CACHE_CHECK([for sufficient default -shared layout],
+-		  libc_cv_use_default_link, [dnl
+-  libc_cv_use_default_link=no
+-  cat > conftest.s <<\EOF
+-	  .section .note.a,"a",%note
+-	  .balign 4
+-	  .long 4,4,9
+-	  .string "GNU"
+-	  .string "foo"
+-	  .section .note.b,"a",%note
+-	  .balign 4
+-	  .long 4,4,9
+-	  .string "GNU"
+-	  .string "bar"
+-EOF
+-  if AC_TRY_COMMAND([dnl
+-  ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD]) &&
+-       ac_try=`$READELF -S conftest.so | sed -n \
+-	 ['${x;p;}
+-	  s/^ *\[ *[1-9][0-9]*\]  *\([^ ][^ ]*\)  *\([^ ][^ ]*\) .*$/\2 \1/
+-	  t a
+-	  b
+-	  : a
+-	  H']`
+-  then
+-    libc_seen_a=no libc_seen_b=no
+-    set -- $ac_try
+-    while test $# -ge 2 -a "$1" = NOTE; do
+-      case "$2" in
+-      .note.a) libc_seen_a=yes ;;
+-      .note.b) libc_seen_b=yes ;;
+-      esac
+-      shift 2
+-    done
+-    case "$libc_seen_a$libc_seen_b" in
+-    yesyes)
+-      libc_cv_use_default_link=yes
+-      ;;
+-    *)
+-      echo >&AS_MESSAGE_LOG_FD "\
+-$libc_seen_a$libc_seen_b from:
+-$ac_try"
+-      ;;
+-    esac
+-  fi
+-  rm -f conftest*])
+-  use_default_link=$libc_cv_use_default_link
+-fi
+-
+ AC_CACHE_CHECK(for GLOB_DAT reloc,
+ 	       libc_cv_has_glob_dat, [dnl
+ cat > conftest.c <<EOF
+diff --git a/elf/Makefile b/elf/Makefile
+index 8afbe3f6ab259331..fec6e23b5b625e3b 100644
+--- a/elf/Makefile
+++ b/elf/Makefile
+@@ -504,6 +504,40 @@ tests-execstack-yes = \
+   # tests-execstack-yes
+ endif
+ endif
+
+tests-special += $(objpfx)tst-relro-ldso.out $(objpfx)tst-relro-libc.out
+$(objpfx)tst-relro-ldso.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
+  $(objpfx)ld.so
+	$(PYTHON) tst-relro-symbols.py $(objpfx)ld.so \
+	  --required=_rtld_global_ro \
+	  > $@ 2>&1; $(evaluate-test)
+# The optional symbols are present in libc only if the architecture has
+# the GLIBC_2.0 symbol set in libc.
+$(objpfx)tst-relro-libc.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
+  $(common-objpfx)libc.so
+	$(PYTHON) tst-relro-symbols.py $(common-objpfx)libc.so \
+	    --required=_IO_cookie_jumps \
+	    --required=_IO_file_jumps \
+	    --required=_IO_file_jumps_maybe_mmap \
+	    --required=_IO_file_jumps_mmap \
+	    --required=_IO_helper_jumps \
+	    --required=_IO_mem_jumps \
+	    --required=_IO_obstack_jumps \
+	    --required=_IO_proc_jumps \
+	    --required=_IO_str_chk_jumps \
+	    --required=_IO_str_jumps \
+	    --required=_IO_strn_jumps \
+	    --required=_IO_wfile_jumps \
+	    --required=_IO_wfile_jumps_maybe_mmap \
+	    --required=_IO_wfile_jumps_mmap \
+	    --required=_IO_wmem_jumps \
+	    --required=_IO_wstr_jumps \
+	    --required=_IO_wstrn_jumps \
+	    --optional=_IO_old_cookie_jumps \
+	    --optional=_IO_old_file_jumps \
+	    --optional=_IO_old_proc_jumps \
+	  > $@ 2>&1; $(evaluate-test)
+
+ tests += $(tests-execstack-$(have-z-execstack))
+ ifeq ($(run-built-tests),yes)
+ tests-special += \
+diff --git a/elf/tst-relro-symbols.py b/elf/tst-relro-symbols.py
+new file mode 100644
+index 0000000000000000..368ea3349f86bd81
+--- /dev/null
+++ b/elf/tst-relro-symbols.py
+@@ -0,0 +1,137 @@
+#!/usr/bin/python3
+# Verify that certain symbols are covered by RELRO.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+"""Analyze a (shared) object to verify that certain symbols are
+present and covered by the PT_GNU_RELRO segment.
+
+"""
+
+import argparse
+import os.path
+import sys
+
+# Make available glibc Python modules.
+sys.path.append(os.path.join(
+    os.path.dirname(os.path.realpath(__file__)), os.path.pardir, 'scripts'))
+
+import glibcelf
+
+def find_relro(path: str, img: glibcelf.Image) -> (int, int):
+    """Discover the address range of the PT_GNU_RELRO segment."""
+    for phdr in img.phdrs():
+        if phdr.p_type == glibcelf.Pt.PT_GNU_RELRO:
+            # The computation is not entirely accurate because
+            # _dl_protect_relro in elf/dl-reloc.c rounds both the
+            # start and end downwards using the run-time page size.
+            return phdr.p_vaddr, phdr.p_vaddr + phdr.p_memsz
+    sys.stdout.write('{}: error: no PT_GNU_RELRO segment\n'.format(path))
+    sys.exit(1)
+
+def check_in_relro(kind, relro_begin, relro_end, name, start, size, error):
+    """Call error() unless [start, start + size) lies within the RELRO range."""
+    end = start + size - 1  # Last byte; equal to start when size is 1.
+    if not (relro_begin <= start <= end < relro_end):
+        error(
+            '{} {!r} of size {} at 0x{:x} is not in RELRO range [0x{:x}, 0x{:x})'.format(
+                kind, name.decode('UTF-8'), size, start,
+                relro_begin, relro_end))
+
+def get_parser():
+    """Return an argument parser; --required/--optional may be repeated."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('object', help='path to object file to check')
+    # Append, so repeated options accumulate (store would keep only the last).
+    for kind in ('required', 'optional'):
+        parser.add_argument('--' + kind, metavar='NAME', default=[],
+                            action='append', help=kind + ' symbol names')
+    return parser
+
+def main(argv):
+    """Check the object named in argv; exit 1 on error, 77 if stripped."""
+    parser = get_parser()
+    opts = parser.parse_args(argv)
+    img = glibcelf.Image.readfile(opts.object)
+
+    required_symbols = frozenset([sym.encode('UTF-8')
+                                  for sym in opts.required])
+    optional_symbols = frozenset([sym.encode('UTF-8')
+                                  for sym in opts.optional])
+    check_symbols = required_symbols | optional_symbols
+
+    # Tracks the symbols in check_symbols that have been found.
+    symbols_found = set()
+
+    # Discover the extent of the RELRO segment.
+    relro_begin, relro_end = find_relro(opts.object, img)
+    symbol_table_found = False
+
+    errors = False
+    def error(msg: str) -> None:
+        """Record an error condition and write a message to standard output."""
+        nonlocal errors
+        errors = True
+        sys.stdout.write('{}: error: {}\n'.format(opts.object, msg))
+
+    # Iterate over section headers to find the symbol table.
+    for shdr in img.shdrs():
+        if shdr.sh_type == glibcelf.Sht.SHT_SYMTAB:
+            symbol_table_found = True
+            for sym in img.syms(shdr):
+                if sym.st_name in check_symbols:
+                    symbols_found.add(sym.st_name)
+
+                    # Validate symbol type, section, and size.
+                    if sym.st_info.type != glibcelf.Stt.STT_OBJECT:
+                        error('symbol {!r} has wrong type {}'.format(
+                            sym.st_name.decode('UTF-8'), sym.st_info.type))
+                    if sym.st_shndx in glibcelf.Shn:
+                        error('symbol {!r} has reserved section {}'.format(
+                            sym.st_name.decode('UTF-8'), sym.st_shndx))
+                        continue
+                    if sym.st_size == 0:
+                        error('symbol {!r} has size zero'.format(
+                            sym.st_name.decode('UTF-8')))
+                        continue
+
+                    check_in_relro('symbol', relro_begin, relro_end,
+                                   sym.st_name, sym.st_value, sym.st_size,
+                                   error)
+            continue # SHT_SYMTAB
+        if shdr.sh_name == b'.data.rel.ro' \
+           or shdr.sh_name.startswith(b'.data.rel.ro.'):
+            check_in_relro('section', relro_begin, relro_end,
+                           shdr.sh_name, shdr.sh_addr, shdr.sh_size,
+                           error)
+            continue
+
+    # Missing symbols are only an error if there was a table to search;
+    # a stripped object is reported as UNSUPPORTED (77) below instead.
+    if symbol_table_found:
+        for sym in sorted(required_symbols - symbols_found):
+            error('symbol {!r} not found'.format(sym.decode('UTF-8')))
+    if errors:
+        sys.exit(1)
+    if not symbol_table_found:
+        sys.stdout.write(
+            '{}: warning: no symbol table found (stripped object)\n'.format(
+                opts.object))
+        sys.exit(77)
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
+diff --git a/manual/install.texi b/manual/install.texi
+index 816b77a0a25a88a7..36a5af62bc5722b0 100644
+--- a/manual/install.texi
+++ b/manual/install.texi
+@@ -117,6 +117,12 @@ problem and suppress these constructs, so that the library will still be
+ usable, but functionality may be lost---for example, you can't build a
+ shared libc with old binutils.
+ 
+@item --with-default-link=@var{FLAG}
+With @code{--with-default-link=yes}, the build system does not use a
+custom linker script for linking shared objects.  The default for
+@var{FLAG} is the opposite, @samp{no}, because the custom linker script
+is needed for full RELRO protection.
+
+ @item --with-nonshared-cflags=@var{cflags}
+ Use additional compiler flags @var{cflags} to build the parts of the
+ library which are always statically linked into applications and
+diff --git a/sysdeps/unix/sysv/linux/ia64/Makefile b/sysdeps/unix/sysv/linux/ia64/Makefile
+index da85ba43e2d0ddef..c5cc41b3677d4a2a 100644
+--- a/sysdeps/unix/sysv/linux/ia64/Makefile
+++ b/sysdeps/unix/sysv/linux/ia64/Makefile
+@@ -1,3 +1,9 @@
+ifeq ($(subdir),elf)
+# ia64 does not support PT_GNU_RELRO.
+test-xfail-tst-relro-ldso = yes
+test-xfail-tst-relro-libc = yes
+endif
+
+ ifeq ($(subdir),misc)
+ sysdep_headers += sys/rse.h
+ endif
--- a/glibc-upstream-2.34-169.patch
+++ b/glibc-upstream-2.34-169.patch
@ -0,0 +1,87 @@
+commit ca0faa140ff8cebe4c041d935f0f5eb480873d99
+Author: Joan Bruguera <joanbrugueram@gmail.com>
+Date:   Mon Apr 11 19:49:56 2022 +0200
+
+    misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
+    
+    If `__glibc_objsize (__o) == (size_t) -1` (i.e. `__o` is unknown size), fortify
+    checks should pass, and `__whatever_alias` should be called.
+    
+    Previously, `__glibc_objsize (__o) == (size_t) -1` was explicitly checked, but
+    on commit a643f60c53876b, this was moved into `__glibc_safe_or_unknown_len`.
+    
+    A comment says the -1 case should work as: "The -1 check is redundant because
+    since it implies that __glibc_safe_len_cond is true.". But this fails when:
+    * `__s > 1`
+    * `__osz == -1` (i.e. unknown size at compile time)
+    * `__l` is big enough
+    * `__l * __s <= __osz` can be folded to a constant
+    (I only found this to be true for `mbsrtowcs` and other functions in wchar2.h)
+    
+    In this case `__l * __s <= __osz` is false, and `__whatever_chk_warn` will be
+    called by `__glibc_fortify` or `__glibc_fortify_n` and crash the program.
+    
+    This commit adds the explicit `__osz == -1` check again.
+    moc crashes on startup due to this, see: https://bugs.archlinux.org/task/74041
+    
+    Minimal test case (test.c):
+        #include <wchar.h>
+    
+        int main (void)
+        {
+            const char *hw = "HelloWorld";
+            mbsrtowcs (NULL, &hw, (size_t)-1, NULL);
+            return 0;
+        }
+    
+    Build with:
+        gcc -O2 -Wp,-D_FORTIFY_SOURCE=2 test.c -o test && ./test
+    
+    Output:
+        *** buffer overflow detected ***: terminated
+    
+    Fixes: BZ #29030
+    Signed-off-by: Joan Bruguera <joanbrugueram@gmail.com>
+    Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
+    (cherry picked from commit 33e03f9cd2be4f2cd62f93fda539cc07d9c8130e)
+
+diff --git a/debug/tst-fortify.c b/debug/tst-fortify.c
+index 8b5902423cf0ad88..fb02452f5993c594 100644
+--- a/debug/tst-fortify.c
+++ b/debug/tst-fortify.c
+@@ -1505,6 +1505,11 @@ do_test (void)
+       CHK_FAIL_END
+ #endif
+ 
+      /* Bug 29030 regression check */
+      cp = "HelloWorld";
+      if (mbsrtowcs (NULL, &cp, (size_t)-1, &s) != 10)
+        FAIL ();
+
+       cp = "A";
+       if (mbstowcs (wenough, cp, 10) != 1
+ 	  || wcscmp (wenough, L"A") != 0)
+diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
+index 515fb681a0547217..b36013b9a6b4d9c3 100644
+--- a/misc/sys/cdefs.h
+++ b/misc/sys/cdefs.h
+@@ -161,13 +161,13 @@
+    || (__builtin_constant_p (__l) && (__l) > 0))
+ 
+ /* Length is known to be safe at compile time if the __L * __S <= __OBJSZ
+-   condition can be folded to a constant and if it is true.  The -1 check is
+-   redundant because since it implies that __glibc_safe_len_cond is true.  */
+   condition can be folded to a constant and if it is true, or unknown (-1) */
+ #define __glibc_safe_or_unknown_len(__l, __s, __osz) \
+-  (__glibc_unsigned_or_positive (__l)					      \
+-   && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l),     \
+-						   __s, __osz))		      \
+-   && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), __s, __osz))
+  ((__osz) == (__SIZE_TYPE__) -1					      \
+   || (__glibc_unsigned_or_positive (__l)				      \
+       && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
+						       (__s), (__osz)))	      \
+       && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), (__s), (__osz))))
+ 
+ /* Conversely, we know at compile time that the length is unsafe if the
+    __L * __S <= __OBJSZ condition can be folded to a constant and if it is
--- a/glibc-upstream-2.34-170.patch
+++ b/glibc-upstream-2.34-170.patch
@ -0,0 +1,49 @@
+commit 0d477e92c49db2906b32e44135b98746ccc73c7b
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Tue Apr 26 14:22:10 2022 +0200
+
+    INSTALL: Rephrase -with-default-link documentation
+    
+    Reviewed-by: Carlos O'Donell <carlos@redhat.com>
+    (cherry picked from commit c935789bdf40ba22b5698da869d3a4789797e09f)
+
+diff --git a/INSTALL b/INSTALL
+index 60d01568d77645c7..10a3dcdc0a8db665 100644
+--- a/INSTALL
+++ b/INSTALL
+@@ -90,10 +90,10 @@ if 'CFLAGS' is specified it must enable optimization.  For example:
+      library will still be usable, but functionality may be lost--for
+      example, you can't build a shared libc with old binutils.
+ 
+-'--with-default-link=FLAG'
+-     With '--with-default-link=yes', the build system does not use a
+-     custom linker script for linking shared objects.  The default for
+-     FLAG is the opposite, 'no', because the custom linker script is
+'--with-default-link'
+     With '--with-default-link', the build system does not use a custom
+     linker script for linking shared objects.  The default is
+     '--without-default-link', because the custom linker script is
+      needed for full RELRO protection.
+ 
+ '--with-nonshared-cflags=CFLAGS'
+diff --git a/manual/install.texi b/manual/install.texi
+index 36a5af62bc5722b0..8e34ff7e1847f3ae 100644
+--- a/manual/install.texi
+++ b/manual/install.texi
+@@ -117,11 +117,11 @@ problem and suppress these constructs, so that the library will still be
+ usable, but functionality may be lost---for example, you can't build a
+ shared libc with old binutils.
+ 
+-@item --with-default-link=@var{FLAG}
+-With @code{--with-default-link=yes}, the build system does not use a
+-custom linker script for linking shared objects.  The default for
+-@var{FLAG} is the opposite, @samp{no}, because the custom linker script
+-is needed for full RELRO protection.
+@item --with-default-link
+With @code{--with-default-link}, the build system does not use a custom
+linker script for linking shared objects.  The default is
+@code{--without-default-link}, because the custom linker script is
+needed for full RELRO protection.
+ 
+ @item --with-nonshared-cflags=@var{cflags}
+ Use additional compiler flags @var{cflags} to build the parts of the
--- a/glibc-upstream-2.34-171.patch
+++ b/glibc-upstream-2.34-171.patch
@ -0,0 +1,377 @@
+commit bc56ab1f4aa937665034373d3e320d0779a839aa
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Tue Apr 26 14:23:02 2022 +0200
+
+    dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
+    
+    When audit modules are loaded, ld.so initialization is not yet
+    complete, and rtld_active () returns false even though ld.so is
+    mostly working.  Instead, the static dlopen hook is used, but that
+    does not work at all because this is not a static dlopen situation.
+    
+    Commit 466c1ea15f461edb8e3ffaf5d86d708876343bbf ("dlfcn: Rework
+    static dlopen hooks") moved the hook pointer into _rtld_global_ro,
+    which means that separate protection is not needed anymore and the
+    hook pointer can be checked directly.
+    
+    The guard for disabling libio vtable hardening in _IO_vtable_check
+    should stay for now.
+    
+    Fixes commit 8e1472d2c1e25e6eabc2059170731365f6d5b3d1 ("ld.so:
+    Examine GLRO to detect inactive loader [BZ #20204]").
+    
+    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
+    (cherry picked from commit 8dcb6d0af07fda3607b541857e4f3970a74ed55b)
+
+diff --git a/dlfcn/dladdr.c b/dlfcn/dladdr.c
+index 1cc305f0c46e7c3b..0d07ae1cd4dbb7a2 100644
+--- a/dlfcn/dladdr.c
+++ b/dlfcn/dladdr.c
+@@ -24,7 +24,7 @@ int
+ __dladdr (const void *address, Dl_info *info)
+ {
+ #ifdef SHARED
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dladdr (address, info);
+ #endif
+   return _dl_addr (address, info, NULL, NULL);
+diff --git a/dlfcn/dladdr1.c b/dlfcn/dladdr1.c
+index 78560dbac208c316..93ce68c1d6067fe2 100644
+--- a/dlfcn/dladdr1.c
+++ b/dlfcn/dladdr1.c
+@@ -24,7 +24,7 @@ int
+ __dladdr1 (const void *address, Dl_info *info, void **extra, int flags)
+ {
+ #ifdef SHARED
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dladdr1 (address, info, extra, flags);
+ #endif
+ 
+diff --git a/dlfcn/dlclose.c b/dlfcn/dlclose.c
+index 6a013a81bb648191..07ecb21bf7d43be4 100644
+--- a/dlfcn/dlclose.c
+++ b/dlfcn/dlclose.c
+@@ -24,7 +24,7 @@ int
+ __dlclose (void *handle)
+ {
+ #ifdef SHARED
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dlclose (handle);
+ #endif
+ 
+diff --git a/dlfcn/dlerror.c b/dlfcn/dlerror.c
+index 5047b140662bc33e..63da79c63000eef0 100644
+--- a/dlfcn/dlerror.c
+++ b/dlfcn/dlerror.c
+@@ -32,7 +32,7 @@ char *
+ __dlerror (void)
+ {
+ # ifdef SHARED
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dlerror ();
+ # endif
+ 
+diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c
+index c6f9a1da09ff8622..47d2daa96fa5986f 100644
+--- a/dlfcn/dlinfo.c
+++ b/dlfcn/dlinfo.c
+@@ -89,7 +89,7 @@ dlinfo_implementation (void *handle, int request, void *arg)
+ int
+ ___dlinfo (void *handle, int request, void *arg)
+ {
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dlinfo (handle, request, arg);
+   else
+     return dlinfo_implementation (handle, request, arg);
+diff --git a/dlfcn/dlmopen.c b/dlfcn/dlmopen.c
+index c171c8953da20fc7..2309224eb8484b1a 100644
+--- a/dlfcn/dlmopen.c
+++ b/dlfcn/dlmopen.c
+@@ -80,7 +80,7 @@ dlmopen_implementation (Lmid_t nsid, const char *file, int mode,
+ void *
+ ___dlmopen (Lmid_t nsid, const char *file, int mode)
+ {
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dlmopen (nsid, file, mode, RETURN_ADDRESS (0));
+   else
+     return dlmopen_implementation (nsid, file, mode, RETURN_ADDRESS (0));
+diff --git a/dlfcn/dlopen.c b/dlfcn/dlopen.c
+index e04b374b82b04337..9c59c751c4eaf7a7 100644
+--- a/dlfcn/dlopen.c
+++ b/dlfcn/dlopen.c
+@@ -75,7 +75,7 @@ dlopen_implementation (const char *file, int mode, void *dl_caller)
+ void *
+ ___dlopen (const char *file, int mode)
+ {
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
+   else
+     return dlopen_implementation (file, mode, RETURN_ADDRESS (0));
+diff --git a/dlfcn/dlopenold.c b/dlfcn/dlopenold.c
+index 9115501ac121eeca..c2f2a42194d50953 100644
+--- a/dlfcn/dlopenold.c
+++ b/dlfcn/dlopenold.c
+@@ -70,7 +70,7 @@ __dlopen_nocheck (const char *file, int mode)
+     mode |= RTLD_LAZY;
+   args.mode = mode;
+ 
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
+ 
+   return _dlerror_run (dlopen_doit, &args) ? NULL : args.new;
+diff --git a/dlfcn/dlsym.c b/dlfcn/dlsym.c
+index 43044cf7bb95801e..d3861170a7631d01 100644
+--- a/dlfcn/dlsym.c
+++ b/dlfcn/dlsym.c
+@@ -62,7 +62,7 @@ dlsym_implementation (void *handle, const char *name, void *dl_caller)
+ void *
+ ___dlsym (void *handle, const char *name)
+ {
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dlsym (handle, name, RETURN_ADDRESS (0));
+   else
+     return dlsym_implementation (handle, name, RETURN_ADDRESS (0));
+diff --git a/dlfcn/dlvsym.c b/dlfcn/dlvsym.c
+index 9b76f9afa513e11f..3af02109c306b800 100644
+--- a/dlfcn/dlvsym.c
+++ b/dlfcn/dlvsym.c
+@@ -65,7 +65,7 @@ dlvsym_implementation (void *handle, const char *name, const char *version,
+ void *
+ ___dlvsym (void *handle, const char *name, const char *version)
+ {
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->dlvsym (handle, name, version,
+ 					 RETURN_ADDRESS (0));
+   else
+diff --git a/elf/Makefile b/elf/Makefile
+index fec6e23b5b625e3b..c89a6a58690646ee 100644
+--- a/elf/Makefile
+++ b/elf/Makefile
+@@ -376,6 +376,7 @@ tests += \
+   tst-audit24d \
+   tst-audit25a \
+   tst-audit25b \
+  tst-audit26 \
+   tst-auditmany \
+   tst-auxobj \
+   tst-auxobj-dlopen \
+@@ -721,6 +722,7 @@ modules-names = \
+   tst-auditmod24c \
+   tst-auditmod24d \
+   tst-auditmod25 \
+  tst-auditmod26 \
+   tst-auxvalmod \
+   tst-big-note-lib \
+   tst-deep1mod1 \
+@@ -2194,6 +2196,10 @@ $(objpfx)tst-audit25b: $(objpfx)tst-audit25mod1.so \
+ LDFLAGS-tst-audit25b = -Wl,-z,now
+ tst-audit25b-ARGS = -- $(host-test-program-cmd)
+ 
+$(objpfx)tst-audit26.out: $(objpfx)tst-auditmod26.so
+$(objpfx)tst-auditmod26.so: $(libsupport)
+tst-audit26-ENV = LD_AUDIT=$(objpfx)tst-auditmod26.so
+
+ # tst-sonamemove links against an older implementation of the library.
+ LDFLAGS-tst-sonamemove-linkmod1.so = \
+   -Wl,--version-script=tst-sonamemove-linkmod1.map \
+diff --git a/elf/dl-libc.c b/elf/dl-libc.c
+index d5bc4a277f4c6ef3..db4342a3256921f0 100644
+--- a/elf/dl-libc.c
+++ b/elf/dl-libc.c
+@@ -157,7 +157,7 @@ __libc_dlopen_mode (const char *name, int mode)
+   args.caller_dlopen = RETURN_ADDRESS (0);
+ 
+ #ifdef SHARED
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->libc_dlopen_mode (name, mode);
+ #endif
+   return dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map;
+@@ -185,7 +185,7 @@ __libc_dlsym (void *map, const char *name)
+   args.name = name;
+ 
+ #ifdef SHARED
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->libc_dlsym (map, name);
+ #endif
+   return (dlerror_run (do_dlsym, &args) ? NULL
+@@ -199,7 +199,7 @@ void *
+ __libc_dlvsym (void *map, const char *name, const char *version)
+ {
+ #ifdef SHARED
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->libc_dlvsym (map, name, version);
+ #endif
+ 
+@@ -222,7 +222,7 @@ int
+ __libc_dlclose (void *map)
+ {
+ #ifdef SHARED
+-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
+     return GLRO (dl_dlfcn_hook)->libc_dlclose (map);
+ #endif
+   return dlerror_run (do_dlclose, map);
+diff --git a/elf/tst-audit26.c b/elf/tst-audit26.c
+new file mode 100644
+index 0000000000000000..3f920e83bac247a5
+--- /dev/null
+++ b/elf/tst-audit26.c
+@@ -0,0 +1,35 @@
+/* Check the usability of <dlfcn.h> functions in audit modules.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <gnu/lib-names.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+static int
+do_test (void)
+{
+  /* Check that the audit module has been loaded.  */
+  void *handle = xdlopen ("mapped to libc", RTLD_LOCAL | RTLD_NOW);
+  TEST_VERIFY (handle
+	       == xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD));
+
+  return 0;
+}
+
+#include <support/test-driver.c>
+diff --git a/elf/tst-auditmod26.c b/elf/tst-auditmod26.c
+new file mode 100644
+index 0000000000000000..db7ba95abec20f53
+--- /dev/null
+++ b/elf/tst-auditmod26.c
+@@ -0,0 +1,104 @@
+/* Check the usability of <dlfcn.h> functions in audit modules.  Audit module.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <first-versions.h>
+#include <gnu/lib-names.h>
+#include <link.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+unsigned int
+la_version (unsigned int current)
+{
+  /* Exercise various <dlfcn.h> functions.  */
+
+  /* Check dlopen, dlsym, dlclose.   */
+  void *handle = xdlopen (LIBM_SO, RTLD_LOCAL | RTLD_NOW);
+  void *ptr = xdlsym (handle, "sincos");
+  TEST_VERIFY (ptr != NULL);
+  ptr = dlsym (handle, "SINCOS");
+  TEST_VERIFY (ptr == NULL);
+  const char *message = dlerror ();
+  TEST_VERIFY (strstr (message, ": undefined symbol: SINCOS") != NULL);
+  ptr = dlsym (handle, "SINCOS");
+  TEST_VERIFY (ptr == NULL);
+  xdlclose (handle);
+  TEST_COMPARE_STRING (dlerror (), NULL);
+
+  handle = xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD);
+
+  /* Check dlvsym.  _exit is unlikely to gain another symbol
+     version.  */
+  TEST_VERIFY (xdlsym (handle, "_exit")
+               == xdlvsym (handle, "_exit", FIRST_VERSION_libc__exit_STRING));
+
+  /* Check dlinfo.  */
+  {
+    void *handle2 = NULL;
+    TEST_COMPARE (dlinfo (handle, RTLD_DI_LINKMAP, &handle2), 0);
+    TEST_VERIFY (handle2 == handle);
+  }
+
+  /* Check dladdr and dladdr1.  */
+  Dl_info info = { };
+  TEST_VERIFY (dladdr (&_exit, &info) != 0);
+  if (strcmp (info.dli_sname, "_Exit") != 0) /* _Exit is an alias.  */
+    TEST_COMPARE_STRING (info.dli_sname, "_exit");
+  TEST_VERIFY (info.dli_saddr == &_exit);
+  TEST_VERIFY (strstr (info.dli_fname, LIBC_SO));
+  void *extra_info;
+  memset (&info, 0, sizeof (info));
+  TEST_VERIFY (dladdr1 (&_exit, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
+  TEST_VERIFY (extra_info == handle);
+
+  /* Verify that dlmopen creates a new namespace.  */
+  void *dlmopen_handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW);
+  TEST_VERIFY (dlmopen_handle != handle);
+  memset (&info, 0, sizeof (info));
+  extra_info = NULL;
+  ptr = xdlsym (dlmopen_handle, "_exit");
+  TEST_VERIFY (dladdr1 (ptr, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
+  TEST_VERIFY (extra_info == dlmopen_handle);
+  xdlclose (dlmopen_handle);
+
+  /* Terminate the process with an error state.  This does not happen
+     automatically because the audit module state is not shared with
+     the main program.  */
+  if (support_record_failure_is_failed ())
+    {
+      fflush (stdout);
+      fflush (stderr);
+      _exit (1);
+    }
+
+  return LAV_CURRENT;
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  if (strcmp (name, "mapped to libc") == 0)
+    return (char *) LIBC_SO;
+  else
+    return (char *) name;
+}
--- a/glibc-upstream-2.34-172.patch
+++ b/glibc-upstream-2.34-172.patch
@ -0,0 +1,28 @@
+commit 83cc145830bdbefdabe03787ed884d548bea9c99
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Fri Apr 22 19:34:52 2022 +0200
+
+    scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
+    
+    enum.IntFlag and enum.EnumMeta._missing_ support are not part of
+    earlier Python versions.
+    
+    (cherry picked from commit b571f3adffdcbed23f35ea39b0ca43809dbb4f5b)
+
+diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py
+index 8f7d0ca184845714..da0d5380f33a195e 100644
+--- a/scripts/glibcelf.py
+++ b/scripts/glibcelf.py
+@@ -28,6 +28,12 @@ import collections
+ import enum
+ import struct
+ 
+if not hasattr(enum, 'IntFlag'):
+    import sys
+    sys.stdout.write(
+        'warning: glibcelf.py needs Python 3.6 for enum support\n')
+    sys.exit(77)
+
+ class _OpenIntEnum(enum.IntEnum):
+     """Integer enumeration that supports arbitrary int values."""
+     @classmethod
--- a/glibc-upstream-2.34-173.patch
+++ b/glibc-upstream-2.34-173.patch
@ -0,0 +1,254 @@
+commit 16245986fb9bfe396113fc7dfd1929f69a9e748e
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Aug 20 06:42:24 2021 -0700
+
+    x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
+    
+    Optimize loads of all bits set into ZMM register in AVX512 SVML codes
+    by replacing
+    
+            vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
+    
+    and
+    
+            vmovups   .L_2il0floatpacket.13(%rip), %zmmX
+    
+    with
+            vpternlogd $0xff, %zmmX, %zmmX, %zmmX
+    
+    This fixes BZ #28252.
+    
+    (cherry picked from commit 78c9ec9000f873abe7a15a91b87080a2e4308260)
+
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+index e68fcdbb16a79f36..58e588a3d42a8bc9 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+         vmovaps   %zmm0, %zmm8
+ 
+ /* Check for large arguments path */
+-        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
+        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
+ 
+ /*
+   ARGUMENT RANGE REDUCTION:
+@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_cos_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.16:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.16,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+index dfa2acafc486b56b..f5f117d474f66176 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+ 
+ /* preserve mantissa, set input exponent to 2^(-10) */
+         vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
+-        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
+         vpsrlq    $32, %zmm4, %zmm6
+ 
+ /* reciprocal approximation good to at least 11 bits */
+@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_log_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.12:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.12,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+index be8ab7c6e0e33819..48d251db16ccab9d 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+         andq      $-64, %rsp
+         subq      $1280, %rsp
+         movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
+-        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
+         vmovups __dAbsMask(%rax), %zmm7
+         vmovups __dInvPI(%rax), %zmm2
+         vmovups __dRShifter(%rax), %zmm1
+@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_sin_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.14:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.14,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+index 611887082a545854..a4944a4feef6aa98 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
+ 
+ /* SinPoly = SinR*SinPoly */
+         vfmadd213pd %zmm5, %zmm5, %zmm4
+-        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+ 
+ /* Update Cos result's sign */
+         vxorpd    %zmm2, %zmm1, %zmm1
+@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
+ ENTRY (_ZGVeN8vvv_sincos_skx)
+ WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
+ END (_ZGVeN8vvv_sincos_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.15:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.15,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+index f671d60d5dab5a0e..fe8474fed943e8ad 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+   X = X - Y*PI1 - Y*PI2 - Y*PI3
+  */
+         vmovaps   %zmm0, %zmm6
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
+        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
+         vmovups __sRShifter(%rax), %zmm3
+         vmovups __sPI1_FMA(%rax), %zmm5
+         vmovups __sA9_FMA(%rax), %zmm9
+@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16v_cosf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+index 637bfe3c06ab9ad4..229b7828cde04db2 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+         vmovaps   %zmm0, %zmm7
+ 
+ /* compare against threshold */
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+         vmovups __sInvLn2(%rax), %zmm4
+         vmovups __sShifter(%rax), %zmm1
+         vmovups __sLn2hi(%rax), %zmm6
+@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+ 
+ #endif
+ END (_ZGVeN16v_expf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+index 9d790fbf0ad6c8ec..fa2aae986f543582 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+         andq      $-64, %rsp
+         subq      $1280, %rsp
+         movq      __svml_slog_data@GOTPCREL(%rip), %rax
+-        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
+        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
+         vmovups _iBrkValue(%rax), %zmm4
+         vmovups _sPoly_7(%rax), %zmm8
+ 
+@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+ 
+ #endif
+ END (_ZGVeN16v_logf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.7:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.7,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+index c5c43c46ff7af5a3..6aea2a4f11d1f85f 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         vpsrlq    $32, %zmm3, %zmm2
+         vpmovqd   %zmm2, %ymm11
+         vcvtps2pd %ymm14, %zmm13
+-        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+         vmovaps   %zmm14, %zmm26
+         vpandd _ABSMASK(%rax), %zmm1, %zmm8
+         vpcmpd    $1, _INF(%rax), %zmm8, %k2
+@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         vpmovqd   %zmm11, %ymm5
+         vpxord    %zmm10, %zmm10, %zmm10
+         vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
+-        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
+        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
+         vpxord    %zmm11, %zmm11, %zmm11
+         vcvtdq2pd %ymm7, %zmm7
+         vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
+@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16vv_powf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.23:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.23,@object
+-.L_2il0floatpacket.24:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.24,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+index 9cf359c86ff9bd70..a446c504f63c9399 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+ 
+ /* Result sign calculations */
+         vpternlogd $150, %zmm0, %zmm14, %zmm1
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+ 
+ /* Add correction term 0.5 for cos() part */
+         vaddps    %zmm8, %zmm5, %zmm15
+@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
+ ENTRY (_ZGVeN16vvv_sincosf_skx)
+ WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
+ END (_ZGVeN16vvv_sincosf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+index bd05109a62181f22..c1b352d0ad1992cd 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
+ 
+ /* Check for large and special values */
+-        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+         vmovups __sAbsMask(%rax), %zmm5
+         vmovups __sInvPI(%rax), %zmm1
+         vmovups __sRShifter(%rax), %zmm2
+@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16v_sinf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.11:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.11,@object
--- a/glibc-upstream-2.34-174.patch
+++ b/glibc-upstream-2.34-174.patch
@ -0,0 +1,42 @@
+commit b5a44a6a471aafd3677659a610f32468c40a666b
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Tue Sep 21 18:31:49 2021 -0500
+
+    x86: Modify ENTRY in sysdep.h so that p2align can be specified
+    
+    No bug.
+    
+    This change adds a new macro ENTRY_P2ALIGN which takes a second
+    argument, log2 of the desired function alignment.
+    
+    The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
+    doesn't affect any existing functionality.
+    
+    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    (cherry picked from commit fc5bd179ef3a953dff8d1655bd530d0e230ffe71)
+
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index cac1d762fb3f99d0..937180c1bd791570 100644
+--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
+@@ -78,15 +78,18 @@ enum cf_protection_level
+ #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
+ 
+ /* Define an entry point visible from C.  */
+-#define	ENTRY(name)							      \
+#define	ENTRY_P2ALIGN(name, alignment)					      \
+   .globl C_SYMBOL_NAME(name);						      \
+   .type C_SYMBOL_NAME(name),@function;					      \
+-  .align ALIGNARG(4);							      \
+  .align ALIGNARG(alignment);						      \
+   C_LABEL(name)								      \
+   cfi_startproc;							      \
+   _CET_ENDBR;								      \
+   CALL_MCOUNT
+ 
+/* Common entry 16 byte aligns.  */
+#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
+
+ #undef	END
+ #define END(name)							      \
+   cfi_endproc;								      \
--- a/glibc-upstream-2.34-175.patch
+++ b/glibc-upstream-2.34-175.patch
@ -0,0 +1,653 @@
+commit 5ec3416853c4150c4d13312e05f93a053586d528
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Tue Sep 21 18:45:03 2021 -0500
+
+    x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
+    
+    No bug.
+    
+    The frontend optimizations are to:
+    1. Reorganize logically connected basic blocks so they are either in
+       the same cache line or adjacent cache lines.
+    2. Avoid cases when basic blocks unnecessarily cross cache lines.
+    3. Try and 32 byte align any basic blocks possible without sacrificing
+       code size. Smaller / Less hot basic blocks are used for this.
+    
+    Overall code size shrunk by 168 bytes. This should make up for any
+    extra costs due to aligning to 64 bytes.
+    
+    In general performance before deviated a great deal depending on
+    whether entry alignment % 64 was 0, 16, 32, or 48. These changes
+    essentially make it so that the current implementation is at least
+    equal to the best alignment of the original for any arguments.
+    
+    The only additional optimization is in the page cross case. Branch on
+    equals case was removed from the size == [4, 7] case. As well the [4,
+    7] and [2, 3] cases were swapped as [4, 7] is likely a more hot
+    argument size.
+    
+    test-memcmp and test-wmemcmp are both passing.
+    
+    (cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -34,7 +34,24 @@
+       area.
+    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+    1. All code sections that fit are able to run optimally out of the
+       LSD.
+    2. All code sections that fit are able to run optimally out of the
+       DSB
+    3. Basic blocks are contained in minimum number of fetch blocks
+       necessary.
+
+Latency:
+    1. Logically connected basic blocks are put in the same
+       cache-line.
+    2. Logically connected basic blocks that do not fit in the same
+       cache-line are put in adjacent lines. This can get beneficial
+       L2 spatial prefetching and L1 next-line prefetching.  */
+ 
+ # include <sysdep.h>
+ 
+@@ -47,9 +64,11 @@
+ # ifdef USE_AS_WMEMCMP
+ #  define CHAR_SIZE	4
+ #  define VPCMP	vpcmpd
+#  define VPTEST	vptestmd
+ # else
+ #  define CHAR_SIZE	1
+ #  define VPCMP	vpcmpub
+#  define VPTEST	vptestmb
+ # endif
+ 
+ # define VEC_SIZE	32
+@@ -75,7 +94,9 @@
+ */
+ 
+ 	.section .text.evex,"ax",@progbits
+-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+   frontend optimization.  */
+ENTRY_P2ALIGN (MEMCMP, 6)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
+ 	VPCMP	$4, (%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	/* NB: eax must be destination register if going to
+-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   L(return_vec_[0,2]). For L(return_vec_3) destination register
+ 	   must be ecx.  */
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_0)
+@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_3)
+ 
+-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
+-	   compare with zero to get a mask is needed.  */
+-	vpxorq	%XMM0, %XMM0, %XMM0
+-
+ 	/* Go to 4x VEC loop.  */
+ 	cmpq	$(CHAR_PER_VEC * 8), %rdx
+ 	ja	L(more_8x_vec)
+@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+ 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+-	   oring with YMM3. Result is stored in YMM4.  */
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
+-	VPCMP	$4, %YMM4, %YMM0, %k1
+	   oring with YMM1. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	 */
+	VPTEST	%YMM4, %YMM4, %k1
+	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_0_1_2_3)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+-	/* NB: aligning 32 here allows for the rest of the jump targets
+-	   to be tuned for 32 byte alignment. Most important this ensures
+-	   the L(more_8x_vec) loop is 32 byte aligned.  */
+-	.p2align 5
+-L(less_vec):
+-	/* Check if one or less CHAR. This is necessary for size = 0 but
+-	   is also faster for size = CHAR_SIZE.  */
+-	cmpl	$1, %edx
+-	jbe	L(one_or_less)
+	.p2align 4
+L(8x_end_return_vec_0_1_2_3):
+	movq	%rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPTEST	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 
+-	/* Check if loading one VEC from either s1 or s2 could cause a
+-	   page cross. This can have false positives but is by far the
+-	   fastest method.  */
+-	movl	%edi, %eax
+-	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(page_cross_less_vec)
+	VPTEST	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+ 
+-	/* No page cross possible.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMP	$4, (%rdi), %YMM2, %k1
+-	kmovd	%k1, %eax
+-	/* Create mask in ecx for potentially in bound matches.  */
+-	bzhil	%edx, %eax, %eax
+-	jnz	L(return_vec_0)
+	VPTEST	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	/* bsf saves 1 byte from tzcnt. This keeps L(return_vec_3) in one
+	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+	   line.  */
+	bsfl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+ 	ret
+ 
+ 	.p2align 4
+@@ -209,10 +240,11 @@ L(return_vec_0):
+ # endif
+ 	ret
+ 
+-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
+-	   which is good enough for a target not in a loop.  */
+	.p2align 4
+ L(return_vec_1):
+-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+ 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+@@ -226,10 +258,11 @@ L(return_vec_1):
+ # endif
+ 	ret
+ 
+-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
+-	   which is good enough for a target not in a loop.  */
+	.p2align 4,, 10
+ L(return_vec_2):
+-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+ 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+@@ -243,40 +276,6 @@ L(return_vec_2):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(8x_return_vec_0_1_2_3):
+-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+-	addq	%rdi, %rsi
+-L(return_vec_0_1_2_3):
+-	VPCMP	$4, %YMM1, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-
+-	VPCMP	$4, %YMM2, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_1)
+-
+-	VPCMP	$4, %YMM3, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_2)
+-L(return_vec_3):
+-	tzcntl	%ecx, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+-	xorl	%edx, %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+ 	.p2align 4
+ L(more_8x_vec):
+ 	/* Set end of s1 in rdx.  */
+@@ -288,21 +287,19 @@ L(more_8x_vec):
+ 	andq	$-VEC_SIZE, %rdi
+ 	/* Adjust because first 4x vec where check already.  */
+ 	subq	$-(VEC_SIZE * 4), %rdi
+
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	VMOVU	(%rsi, %rdi), %YMM1
+ 	vpxorq	(%rdi), %YMM1, %YMM1
+-
+ 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+ 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+-
+ 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+-
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(8x_return_vec_0_1_2_3)
+@@ -319,28 +316,25 @@ L(loop_4x_vec):
+ 	cmpl	$(VEC_SIZE * 2), %edi
+ 	jae	L(8x_last_2x_vec)
+ 
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
+ 	VMOVU	(%rsi, %rdx), %YMM1
+ 	vpxorq	(%rdx), %YMM1, %YMM1
+ 
+ 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+ 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+-
+-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+-
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+-	/* Restore s1 pointer to rdi.  */
+-	movq	%rdx, %rdi
+ 	testl	%ecx, %ecx
+-	jnz	L(8x_return_vec_0_1_2_3)
+	jnz	L(8x_end_return_vec_0_1_2_3)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+ 	/* Only entry is from L(more_8x_vec).  */
+-	.p2align 4
+	.p2align 4,, 10
+ L(8x_last_2x_vec):
+ 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ 	kmovd	%k1, %eax
+@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
+ 	jnz	L(8x_return_vec_3)
+ 	ret
+ 
+-	.p2align 4
+	/* Not ideally aligned (at offset +9 bytes in fetch block) but
+	   not aligning keeps it in the same cache line as
+	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+	   size.  */
+	.p2align 4,, 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
+ L(last_2x_vec):
+ 	/* Check second to last VEC.  */
+ 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+@@ -374,26 +392,49 @@ L(last_1x_vec):
+ 	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4
+-L(8x_return_vec_2):
+-	subq	$VEC_SIZE, %rdx
+-L(8x_return_vec_3):
+-	tzcntl	%eax, %eax
+	.p2align 4,, 10
+L(return_vec_1_end):
+	/* Use bsf to save code size. This is necessary to have
+	   L(one_or_less) fit in aligning bytes between.  */
+	bsfl	%eax, %eax
+	addl	%edx, %eax
+ # ifdef USE_AS_WMEMCMP
+-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+-	movl	(VEC_SIZE * 3)(%rax), %ecx
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+ 	setg	%dl
+ 	leal	-1(%rdx, %rdx), %eax
+ # else
+-	addq	%rdx, %rax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+-	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
+	/* NB: L(one_or_less) fits in alignment padding between
+	   L(return_vec_1_end) and L(return_vec_0_end).  */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
+# else
+L(one_or_less):
+	jb	L(zero)
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+	subl	%ecx, %eax
+	ret
+# endif
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+ 	.p2align 4
+ L(return_vec_0_end):
+ 	tzcntl	%eax, %eax
+@@ -412,23 +453,56 @@ L(return_vec_0_end):
+ 	ret
+ 
+ 	.p2align 4
+-L(return_vec_1_end):
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size == 0
+	   but is also faster for size == CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check if any matches were in bounds. Intentionally not
+	   storing result in eax to limit dependency chain if it goes to
+	   L(return_vec_0_lv).  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0_lv)
+	xorl	%eax, %eax
+	ret
+
+	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+	   the jump and ends up fitting in aligning bytes. As well fits on
+	   same cache line as L(less_vec) so also saves a line from having
+	   to be fetched on cold calls to memcmp.  */
+	.p2align 4,, 4
+L(return_vec_0_lv):
+ 	tzcntl	%eax, %eax
+-	addl	%edx, %eax
+ # ifdef USE_AS_WMEMCMP
+-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+ 	setg	%dl
+ 	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
+-
+ 	.p2align 4
+ L(page_cross_less_vec):
+ 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+@@ -439,108 +513,84 @@ L(page_cross_less_vec):
+ 	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
+-L(between_2_3):
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+-	ret
+-	.p2align 4
+-L(one_or_less):
+-	jb	L(zero)
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+	jb	L(between_2_3)
+
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* edx is guaranteed to be positive int32 in range [4, 7].  */
+	cmovne	%edx, %eax
+	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+	sbbl	%ecx, %ecx
+	/* If rax > rcx, then ecx is 0 and eax is positive. If rcx ==
+	   rax then eax and ecx are zero. If rax < rcx then ecx is -1 so
+	   eax doesn't matter.  */
+	orl	%ecx, %eax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 4,, 8
+ L(between_8_15):
+ # endif
+ 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+-	vmovq	(%rdi), %XMM1
+-	vmovq	(%rsi), %XMM2
+-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
+-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
+-	vmovq	(%rdi), %XMM1
+-	vmovq	(%rsi), %XMM2
+-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-	ret
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4
+	.p2align 4,, 8
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-	VMOVU	(%rsi), %XMM2
+-	VPCMP	$4, (%rdi), %XMM2, %k1
+
+	/* Use movups to save code size.  */
+	movups	(%rsi), %xmm2
+	VPCMP	$4, (%rdi), %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-
+	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-
+-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
+-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
+-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
+-	VPCMP	$4, (%rdi), %XMM2, %k1
+	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-	ret
+-
+-# ifdef USE_AS_WMEMCMP
+-	.p2align 4
+-L(one_or_less):
+-	jb	L(zero)
+-	movl	(%rdi), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+	jnz	L(return_vec_0_end)
+ 	ret
+-# else
+ 
+-	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	jz	L(zero_4_7)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
+ 	ret
+ # endif
+-
+ END (MEMCMP)
+ #endif
--- a/glibc-upstream-2.34-176.patch
+++ b/glibc-upstream-2.34-176.patch
@ -0,0 +1,497 @@
+commit 6d18a93dbbde2958001d65dff3080beed7ae675a
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Sep 20 16:20:15 2021 -0500
+
+    x86: Optimize memset-vec-unaligned-erms.S
+    
+    No bug.
+    
+    Optimizations are
+    
+    1. change control flow for L(more_2x_vec) to fall through to loop and
+       jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
+       size and saves jumps for length > 4x VEC_SIZE.
+    
+    2. For EVEX/AVX512 move L(less_vec) closer to entry.
+    
+    3. Avoid complex address mode for length > 2x VEC_SIZE
+    
+    4. Slightly better aligning code for the loop from the perspective of
+       code size and uops.
+    
+    5. Align targets so they make full use of their fetch block and if
+       possible cache line.
+    
+    6. Try and reduce total number of icache lines that will need to be
+       pulled in for a given length.
+    
+    7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
+       jumping to the stosb target in the sse2 code section will almost
+       certainly be to a new page. The new version does increase code size
+       marginally by duplicating the target but should get better iTLB
+       behavior as a result.
+    
+    test-memset, test-wmemset, and test-bzero are all passing.
+    
+    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    (cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 7d4a327eba29ecb4..0137eba4cdd9f830 100644
+--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
+@@ -18,13 +18,15 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
+#define USE_WITH_SSE2	1
+ 
+ #define VEC_SIZE	16
+#define MOV_SIZE	3
+#define RET_SIZE	1
+
+ #define VEC(i)		xmm##i
+-/* Don't use movups and movaps since it will get larger nop paddings for
+-   alignment.  */
+-#define VMOVU		movdqu
+-#define VMOVA		movdqa
+#define VMOVU     movups
+#define VMOVA     movaps
+ 
+ #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index ae0860f36a47d594..1af668af0aeda59e 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -1,8 +1,14 @@
+ #if IS_IN (libc)
+# define USE_WITH_AVX2	1
+
+ # define VEC_SIZE	32
+# define MOV_SIZE	4
+# define RET_SIZE	4
+
+ # define VEC(i)		ymm##i
+-# define VMOVU		vmovdqu
+-# define VMOVA		vmovdqa
+
+# define VMOVU     vmovdqu
+# define VMOVA     vmovdqa
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+   vmovd d, %xmm0; \
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 8ad842fc2f140527..f14d6f8493c21a36 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -1,11 +1,18 @@
+ #if IS_IN (libc)
+# define USE_WITH_AVX512	1
+
+ # define VEC_SIZE	64
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
+ # define XMM0		xmm16
+ # define YMM0		ymm16
+ # define VEC0		zmm16
+ # define VEC(i)		VEC##i
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
+ # define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 640f092903302ad0..64b09e77cc20cc42 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -1,11 +1,18 @@
+ #if IS_IN (libc)
+# define USE_WITH_EVEX	1
+
+ # define VEC_SIZE	32
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
+ # define XMM0		xmm16
+ # define YMM0		ymm16
+ # define VEC0		ymm16
+ # define VEC(i)		VEC##i
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
+ # define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index ff196844a093dc3b..e723413a664c088f 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -63,8 +63,27 @@
+ # endif
+ #endif
+ 
+#if VEC_SIZE == 64
+# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
+#else
+# define LOOP_4X_OFFSET	(0)
+#endif
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+# define END_REG	rcx
+# define LOOP_REG	rdi
+#else
+# define END_REG	rdi
+# define LOOP_REG	rdx
+#endif
+
+ #define PAGE_SIZE 4096
+ 
+/* Macro to calculate size of small memset block for aligning
+   purposes.  */
+#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
+
+
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -74,6 +93,7 @@
+ ENTRY (__bzero)
+ 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+ 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+	xorl	%esi, %esi
+ 	pxor	%XMM0, %XMM0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	jb	L(less_vec)
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(0), (%rdi)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(stosb_more_2x_vec):
+-	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+-	ja	L(stosb)
+-#else
+-	.p2align 4
+ #endif
+-L(more_2x_vec):
+-	/* Stores to first 2x VEC before cmp as any path forward will
+-	   require it.  */
+-	VMOVU	%VEC(0), (%rdi)
+-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_start)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-L(return):
+-#if VEC_SIZE > 16
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 10
+L(last_2x_vec):
+#ifdef USE_LESS_VEC_MASK_STORE
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+ #else
+-	ret
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+ #endif
+	VZEROUPPER_RETURN
+ 
+-L(loop_start):
+-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	jbe	L(loop_end)
+-	andq	$-(VEC_SIZE * 2), %rdi
+-	subq	$-(VEC_SIZE * 4), %rdi
+-	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+-	.p2align 4
+-L(loop):
+-	VMOVA	%VEC(0), (%rdi)
+-	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	subq	$-(VEC_SIZE * 4), %rdi
+-	cmpq	%rcx, %rdi
+-	jb	L(loop)
+-L(loop_end):
+-	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+-	       rdx as length is also unchanged.  */
+-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+-	VZEROUPPER_SHORT_RETURN
+-
+-	.p2align 4
+	/* If have AVX512 mask instructions put L(less_vec) close to
+	   entry as it doesn't take much space and is likely a hot target.
+	 */
+#ifdef USE_LESS_VEC_MASK_STORE
+	.p2align 4,, 10
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+ # endif
+-# ifdef USE_LESS_VEC_MASK_STORE
+ 	/* Clear high bits from edi. Only keeping bits relevant to page
+ 	   cross check. Note that we are using rax which is set in
+-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
+-	 */
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
+ 	andl	$(PAGE_SIZE - 1), %edi
+-	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
+-	   performance degradation when it has to fault supress.  */
+	/* Check if VEC_SIZE store cross page. Mask stores suffer
+	   serious performance degradation when it has to fault supress.
+	 */
+ 	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	/* This is generally considered a cold target.  */
+ 	ja	L(cross_page)
+ # if VEC_SIZE > 32
+ 	movq	$-1, %rcx
+@@ -247,58 +235,185 @@ L(less_vec):
+ 	bzhil	%edx, %ecx, %ecx
+ 	kmovd	%ecx, %k1
+ # endif
+-	vmovdqu8	%VEC(0), (%rax) {%k1}
+	vmovdqu8 %VEC(0), (%rax){%k1}
+ 	VZEROUPPER_RETURN
+ 
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* Include L(stosb_local) here if including L(less_vec) between
+	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+	   L(stosb_more_2x_vec) target.  */
+	.p2align 4,, 10
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+# endif
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ 	.p2align 4
+-L(cross_page):
+L(stosb_more_2x_vec):
+	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+	ja	L(stosb_local)
+#endif
+	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+	   and (4x, 8x] jump to target.  */
+L(more_2x_vec):
+
+	/* Two different methods of setting up pointers / compare. The
+	   two methods are based on the fact that EVEX/AVX512 mov
+	   instructions take more bytes then AVX2/SSE2 mov instructions. As
+	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
+	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
+	   this saves code size and keeps a few targets in one fetch block.
+	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+	   LOOP_4X_OFFSET) with LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), VEC_SIZE(%rax)
+
+
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+	addq	%rdx, %END_REG
+#endif
+
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
+
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
+	   extra offset to addresses in loop. Used for AVX512 to save space
+	   as no way to get (VEC_SIZE * 4) in imm8.  */
+# if LOOP_4X_OFFSET == 0
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+ # endif
+-# if VEC_SIZE > 32
+-	cmpb	$32, %dl
+-	jae	L(between_32_63)
+	/* Avoid imm32 compare here to save code size.  */
+	cmpq	%rdi, %rcx
+#else
+	addq	$-(VEC_SIZE * 4), %END_REG
+	cmpq	$(VEC_SIZE * 8), %rdx
+#endif
+	jbe	L(last_4x_vec)
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* Set LOOP_REG (rdx).  */
+	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
+#endif
+	/* Align dst for loop.  */
+	andq	$(VEC_SIZE * -2), %LOOP_REG
+	.p2align 4
+L(loop):
+	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+	cmpq	%END_REG, %LOOP_REG
+	jb	L(loop)
+	.p2align 4,, MOV_SIZE
+L(last_4x_vec):
+	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return):
+#if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
+	ret
+#endif
+
+	.p2align 4,, 10
+#ifndef USE_LESS_VEC_MASK_STORE
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+	   range for 2-byte jump encoding.  */
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+ # endif
+-# if VEC_SIZE > 16
+-	cmpb	$16, %dl
+	/* Define L(less_vec) only if not otherwise defined.  */
+	.p2align 4
+L(less_vec):
+#endif
+L(cross_page):
+#if VEC_SIZE > 32
+	cmpl	$32, %edx
+	jae	L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+	cmpl	$16, %edx
+ 	jae	L(between_16_31)
+-# endif
+-	MOVQ	%XMM0, %rcx
+-	cmpb	$8, %dl
+#endif
+	MOVQ	%XMM0, %rdi
+	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+-	cmpb	$4, %dl
+	cmpl	$4, %edx
+ 	jae	L(between_4_7)
+-	cmpb	$1, %dl
+	cmpl	$1, %edx
+ 	ja	L(between_2_3)
+-	jb	1f
+-	movb	%cl, (%rax)
+-1:
+	jb	L(return)
+	movb	%sil, (%rax)
+ 	VZEROUPPER_RETURN
+-# if VEC_SIZE > 32
+
+	/* Align small targets only if not doing so would cross a fetch
+	   line.  */
+#if VEC_SIZE > 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, -32(%rax,%rdx)
+ 	VMOVU	%YMM0, (%rax)
+	VMOVU	%YMM0, -32(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-# endif
+-# if VEC_SIZE > 16
+-	/* From 16 to 31.  No branch when size == 16.  */
+#endif
+
+#if VEC_SIZE >= 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ L(between_16_31):
+-	VMOVU	%XMM0, -16(%rax,%rdx)
+	/* From 16 to 31.  No branch when size == 16.  */
+ 	VMOVU	%XMM0, (%rax)
+	VMOVU	%XMM0, -16(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-# endif
+-	/* From 8 to 15.  No branch when size == 8.  */
+#endif
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ L(between_8_15):
+-	movq	%rcx, -8(%rax,%rdx)
+-	movq	%rcx, (%rax)
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	%rdi, (%rax)
+	movq	%rdi, -8(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%ecx, -4(%rax,%rdx)
+-	movl	%ecx, (%rax)
+	movl	%edi, (%rax)
+	movl	%edi, -4(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%cx, -2(%rax,%rdx)
+-	movw	%cx, (%rax)
+	movw	%di, (%rax)
+	movb	%dil, -1(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
--- a/glibc-upstream-2.34-177.patch
+++ b/glibc-upstream-2.34-177.patch
@ -0,0 +1,40 @@
+commit baf3ece63453adac59c5688930324a78ced5b2e4
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Sat Oct 23 01:26:47 2021 -0400
+
+    x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
+    
+    This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
+    
+    It could potentially be dangerous to use SSE2 if this function is ever
+    called without using 'vzeroupper' beforehand. While compilers appear
+    to use 'vzeroupper' before function calls if AVX2 has been used, using
+    SSE2 here is more brittle. Since it is not absolutely necessary it
+    should be avoided.
+    
+    It costs 2 extra bytes but the extra bytes should only eat into
+    alignment padding.
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    
+    (cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 2761b54f2e7dea9f..640f6757fac8a356 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -561,13 +561,13 @@ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 
+ 	/* Use movups to save code size.  */
+-	movups	(%rsi), %xmm2
+	vmovdqu	(%rsi), %xmm2
+ 	VPCMP	$4, (%rdi), %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ 	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+ 	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
--- a/glibc-upstream-2.34-178.patch
+++ b/glibc-upstream-2.34-178.patch
@ -0,0 +1,690 @@
+commit f35ad30da4880a1574996df0674986ecf82fa7ae
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Oct 29 12:40:20 2021 -0700
+
+    x86-64: Improve EVEX strcmp with masked load
+    
+    In strcmp-evex.S, to compare 2 32-byte strings, replace
+    
+            VMOVU   (%rdi, %rdx), %YMM0
+            VMOVU   (%rsi, %rdx), %YMM1
+            /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+            VPCMP   $4, %YMM0, %YMM1, %k0
+            VPCMP   $0, %YMMZERO, %YMM0, %k1
+            VPCMP   $0, %YMMZERO, %YMM1, %k2
+            /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+            kord    %k1, %k2, %k1
+            /* Each bit in K1 represents a NULL or a mismatch.  */
+            kord    %k0, %k1, %k1
+            kmovd   %k1, %ecx
+            testl   %ecx, %ecx
+            jne     L(last_vector)
+    
+    with
+    
+            VMOVU   (%rdi, %rdx), %YMM0
+            VPTESTM %YMM0, %YMM0, %k2
+            /* Each bit cleared in K1 represents a mismatch or a null CHAR
+               in YMM0 and 32 bytes at (%rsi, %rdx).  */
+            VPCMP   $0, (%rsi, %rdx), %YMM0, %k1{%k2}
+            kmovd   %k1, %ecx
+            incl    %ecx
+            jne     L(last_vector)
+    
+    It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
+    and Ice Lake.
+    
+    Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
+    (cherry picked from commit c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index d5aa6daa46c7ed25..82f12ac89bcae20b 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -41,6 +41,8 @@
+ # ifdef USE_AS_WCSCMP
+ /* Compare packed dwords.  */
+ #  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define VPTESTM	vptestmd
+ #  define SHIFT_REG32	r8d
+ #  define SHIFT_REG64	r8
+ /* 1 dword char == 4 bytes.  */
+@@ -48,6 +50,8 @@
+ # else
+ /* Compare packed bytes.  */
+ #  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define VPTESTM	vptestmb
+ #  define SHIFT_REG32	ecx
+ #  define SHIFT_REG64	rcx
+ /* 1 byte char == 1 byte.  */
+@@ -67,6 +71,9 @@
+ # define YMM5		ymm22
+ # define YMM6		ymm23
+ # define YMM7		ymm24
+# define YMM8		ymm25
+# define YMM9		ymm26
+# define YMM10		ymm27
+ 
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+@@ -76,7 +83,7 @@
+ /* The main idea of the string comparison (byte or dword) using 256-bit
+    EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
+    latter can be on either packed bytes or dwords depending on
+-   USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
+   USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
+    matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
+    KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
+    are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
+@@ -123,27 +130,21 @@ ENTRY (STRCMP)
+ 	jg	L(cross_page)
+ 	/* Start comparing 4 vectors.  */
+ 	VMOVU	(%rdi), %YMM0
+-	VMOVU	(%rsi), %YMM1
+ 
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k2
+ 
+-	/* Check for NULL in YMM0.  */
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	/* Check for NULL in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (%rsi).  */
+	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+ 
+-	/* Each bit in K1 represents:
+-	   1. A mismatch in YMM0 and YMM1.  Or
+-	   2. A NULL in YMM0 or YMM1.
+-	 */
+-	kord	%k0, %k1, %k1
+-
+-	ktestd	%k1, %k1
+-	je	L(next_3_vectors)
+ 	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+	je	L(next_3_vectors)
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -172,9 +173,7 @@ L(return):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -210,9 +209,7 @@ L(return_vec_size):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_2_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -248,9 +245,7 @@ L(return_2_vec_size):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_3_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -289,43 +284,45 @@ L(return_3_vec_size):
+ 	.p2align 4
+ L(next_3_vectors):
+ 	VMOVU	VEC_SIZE(%rdi), %YMM0
+-	VMOVU	VEC_SIZE(%rsi), %YMM1
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
+	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
+	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	jne	L(return_vec_size)
+ 
+-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM2
+-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM3
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM4
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM5
+-
+-	/* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
+-	VPCMP	$4, %YMM2, %YMM4, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM4, %k2
+-	/* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
+	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	jne	L(return_2_vec_size)
+ 
+-	/* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
+-	VPCMP	$4, %YMM3, %YMM5, %k0
+-	VPCMP	$0, %YMMZERO, %YMM3, %k1
+-	VPCMP	$0, %YMMZERO, %YMM5, %k2
+-	/* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
+	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
+	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	jne	L(return_3_vec_size)
+ L(main_loop_header):
+ 	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+@@ -375,56 +372,51 @@ L(back_to_loop):
+ 	VMOVA	VEC_SIZE(%rax), %YMM2
+ 	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
+ 	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
+-	VMOVU	(%rdx), %YMM1
+-	VMOVU	VEC_SIZE(%rdx), %YMM3
+-	VMOVU	(VEC_SIZE * 2)(%rdx), %YMM5
+-	VMOVU	(VEC_SIZE * 3)(%rdx), %YMM7
+-
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K4 represents a NULL or a mismatch in YMM0 and
+-	   YMM1.  */
+-	kord	%k0, %k1, %k4
+-
+-	VPCMP	$4, %YMM2, %YMM3, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM3, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K5 represents a NULL or a mismatch in YMM2 and
+-	   YMM3.  */
+-	kord	%k0, %k1, %k5
+-
+-	VPCMP	$4, %YMM4, %YMM5, %k0
+-	VPCMP	$0, %YMMZERO, %YMM4, %k1
+-	VPCMP	$0, %YMMZERO, %YMM5, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K6 represents a NULL or a mismatch in YMM4 and
+-	   YMM5.  */
+-	kord	%k0, %k1, %k6
+-
+-	VPCMP	$4, %YMM6, %YMM7, %k0
+-	VPCMP	$0, %YMMZERO, %YMM6, %k1
+-	VPCMP	$0, %YMMZERO, %YMM7, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K7 represents a NULL or a mismatch in YMM6 and
+-	   YMM7.  */
+-	kord	%k0, %k1, %k7
+-
+-	kord	%k4, %k5, %k0
+-	kord	%k6, %k7, %k1
+-
+-	/* Test each mask (32 bits) individually because for VEC_SIZE
+-	   == 32 is not possible to OR the four masks and keep all bits
+-	   in a 64-bit integer register, differing from SSE2 strcmp
+-	   where ORing is possible.  */
+-	kortestd %k0, %k1
+-	je	L(loop)
+-	ktestd	%k4, %k4
+
+	VPMINU	%YMM0, %YMM2, %YMM8
+	VPMINU	%YMM4, %YMM6, %YMM9
+
+	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
+	VPMINU	%YMM8, %YMM9, %YMM8
+
+	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
+	VPTESTM	%YMM8, %YMM8, %k1
+
+	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
+	vpxorq	(%rdx), %YMM0, %YMM1
+	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
+
+	vporq	%YMM1, %YMM3, %YMM9
+	vporq	%YMM5, %YMM7, %YMM10
+
+	/* A non-zero CHAR in YMM9 represents a mismatch.  */
+	vporq	%YMM9, %YMM10, %YMM9
+
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
+	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
+	kmovd   %k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+	je	 L(loop)
+
+	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k1
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+	   in YMM0 and (%rdx).  */
+	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+	kmovd	%k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	je	L(test_vec)
+-	kmovd	%k4, %edi
+-	tzcntl	%edi, %ecx
+	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %ecx
+@@ -466,9 +458,18 @@ L(test_vec):
+ 	cmpq	$VEC_SIZE, %r11
+ 	jbe	L(zero)
+ # endif
+-	ktestd	%k5, %k5
+	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
+	VPTESTM	%YMM2, %YMM2, %k1
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+	   in YMM2 and VEC_SIZE(%rdx).  */
+	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
+	kmovd	%k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	je	L(test_2_vec)
+-	kmovd	%k5, %ecx
+ 	tzcntl	%ecx, %edi
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -512,9 +513,18 @@ L(test_2_vec):
+ 	cmpq	$(VEC_SIZE * 2), %r11
+ 	jbe	L(zero)
+ # endif
+-	ktestd	%k6, %k6
+	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
+	VPTESTM	%YMM4, %YMM4, %k1
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
+	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
+	kmovd	%k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	je	L(test_3_vec)
+-	kmovd	%k6, %ecx
+ 	tzcntl	%ecx, %edi
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -558,8 +568,18 @@ L(test_3_vec):
+ 	cmpq	$(VEC_SIZE * 3), %r11
+ 	jbe	L(zero)
+ # endif
+-	kmovd	%k7, %esi
+-	tzcntl	%esi, %ecx
+	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
+	VPTESTM	%YMM6, %YMM6, %k1
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
+	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
+	kmovd	%k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %ecx
+@@ -615,39 +635,51 @@ L(loop_cross_page):
+ 
+ 	VMOVU	(%rax, %r10), %YMM2
+ 	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
+-	VMOVU	(%rdx, %r10), %YMM4
+-	VMOVU	VEC_SIZE(%rdx, %r10), %YMM5
+-
+-	VPCMP	$4, %YMM4, %YMM2, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM4, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch in YMM2 and
+-	   YMM4.  */
+-	kord	%k0, %k1, %k1
+-
+-	VPCMP	$4, %YMM5, %YMM3, %k3
+-	VPCMP	$0, %YMMZERO, %YMM3, %k4
+-	VPCMP	$0, %YMMZERO, %YMM5, %k5
+-	kord	%k4, %k5, %k4
+-	/* Each bit in K3 represents a NULL or a mismatch in YMM3 and
+-	   YMM5.  */
+-	kord	%k3, %k4, %k3
+
+	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
+	VPTESTM	%YMM2, %YMM2, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM2 and 32 bytes at (%rdx, %r10).  */
+	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
+	kmovd	%k1, %r9d
+	/* Don't use subl since it is the lower 16/32 bits of RDI
+	   below.  */
+	notl	%r9d
+# ifdef USE_AS_WCSCMP
+	/* Only last 8 bits are valid.  */
+	andl	$0xff, %r9d
+# endif
+
+	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
+	VPTESTM	%YMM3, %YMM3, %k4
+	/* Each bit cleared in K3 represents a mismatch or a null CHAR
+	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
+	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+	kmovd	%k3, %edi
+# ifdef USE_AS_WCSCMP
+	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+	notl	%edi
+	andl	$0xff, %edi
+# else
+	incl	%edi
+# endif
+ 
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in K1/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k2
+	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
+	sall	$8, %edi
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+ 	   bytes.  */
+ 	movl	%ecx, %SHIFT_REG32
+ 	sarl	$2, %SHIFT_REG32
+
+	/* Each bit in EDI represents a null CHAR or a mismatch.  */
+	orl	%r9d, %edi
+ # else
+-	kshiftlq $32, %k3, %k2
+-# endif
+	salq	$32, %rdi
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rdi
+	/* Each bit in RDI represents a null CHAR or a mismatch.  */
+	orq	%r9, %rdi
+# endif
+ 
+ 	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
+ 	shrxq	%SHIFT_REG64, %rdi, %rdi
+@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
+ 	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
+ 	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
+-	VMOVU	(VEC_SIZE * 2)(%rdx, %r10), %YMM2
+-	VMOVU	(VEC_SIZE * 3)(%rdx, %r10), %YMM3
+-
+-	VPCMP	$4, %YMM0, %YMM2, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM2, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
+-	   YMM2.  */
+-	kord	%k0, %k1, %k1
+-
+-	VPCMP	$4, %YMM1, %YMM3, %k3
+-	VPCMP	$0, %YMMZERO, %YMM1, %k4
+-	VPCMP	$0, %YMMZERO, %YMM3, %k5
+-	kord	%k4, %k5, %k4
+-	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
+-	   YMM3.  */
+-	kord	%k3, %k4, %k3
+ 
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
+	kmovd	%k1, %r9d
+	/* Don't use subl since it is the lower 16/32 bits of RDI
+	   below.  */
+	notl	%r9d
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in K1/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k2
+	/* Only last 8 bits are valid.  */
+	andl	$0xff, %r9d
+# endif
+
+	VPTESTM	%YMM1, %YMM1, %k4
+	/* Each bit cleared in K3 represents a mismatch or a null CHAR
+	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
+	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+	kmovd	%k3, %edi
+# ifdef USE_AS_WCSCMP
+	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+	notl	%edi
+	andl	$0xff, %edi
+ # else
+-	kshiftlq $32, %k3, %k2
+	incl	%edi
+ # endif
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rdi
+# ifdef USE_AS_WCSCMP
+	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
+	sall	$8, %edi
+
+	/* Each bit in EDI represents a null CHAR or a mismatch.  */
+	orl	%r9d, %edi
+# else
+	salq	$32, %rdi
+
+	/* Each bit in RDI represents a null CHAR or a mismatch.  */
+	orq	%r9, %rdi
+# endif
+ 
+ 	xorl	%r8d, %r8d
+ 	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
+@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
+ 	/* R8 has number of bytes skipped.  */
+ 	movl	%ecx, %r8d
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	/* NB: Divide shift count by 4 since each bit in RDI represent 4
+ 	   bytes.  */
+ 	sarl	$2, %ecx
+-# endif
+	/* Skip ECX bytes.  */
+	shrl	%cl, %edi
+# else
+ 	/* Skip ECX bytes.  */
+ 	shrq	%cl, %rdi
+# endif
+ 1:
+ 	/* Before jumping back to the loop, set ESI to the number of
+ 	   VEC_SIZE * 4 blocks before page crossing.  */
+@@ -818,7 +863,7 @@ L(cross_page_loop):
+ 	movzbl	(%rdi, %rdx), %eax
+ 	movzbl	(%rsi, %rdx), %ecx
+ # endif
+-	/* Check null char.  */
+	/* Check null CHAR.  */
+ 	testl	%eax, %eax
+ 	jne	L(cross_page_loop)
+ 	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+@@ -901,18 +946,17 @@ L(cross_page):
+ 	jg	L(cross_page_1_vector)
+ L(loop_1_vector):
+ 	VMOVU	(%rdi, %rdx), %YMM0
+-	VMOVU	(%rsi, %rdx), %YMM1
+-
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
+	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-	testl	%ecx, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	jne	L(last_vector)
+ 
+ 	addl	$VEC_SIZE, %edx
+@@ -931,18 +975,17 @@ L(cross_page_1_vector):
+ 	cmpl	$(PAGE_SIZE - 16), %eax
+ 	jg	L(cross_page_1_xmm)
+ 	VMOVU	(%rdi, %rdx), %XMM0
+-	VMOVU	(%rsi, %rdx), %XMM1
+-
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	korw	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korw	%k0, %k1, %k1
+-	kmovw	%k1, %ecx
+-	testl	%ecx, %ecx
+
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
+	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xf, %ecx
+# else
+	subl	$0xffff, %ecx
+# endif
+ 	jne	L(last_vector)
+ 
+ 	addl	$16, %edx
+@@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
+ 	vmovq	(%rdi, %rdx), %XMM0
+ 	vmovq	(%rsi, %rdx), %XMM1
+ 
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	kmovd	%k1, %ecx
+-
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in XMM0 and XMM1.  */
+	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+	kmovb	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	/* Only last 2 bits are valid.  */
+-	andl	$0x3, %ecx
+	subl	$0x3, %ecx
+ # else
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %ecx
+	subl	$0xff, %ecx
+ # endif
+-
+-	testl	%ecx, %ecx
+ 	jne	L(last_vector)
+ 
+ 	addl	$8, %edx
+@@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
+ 	vmovd	(%rdi, %rdx), %XMM0
+ 	vmovd	(%rsi, %rdx), %XMM1
+ 
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in XMM0 and XMM1.  */
+	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-
+ # ifdef USE_AS_WCSCMP
+-	/* Only the last bit is valid.  */
+-	andl	$0x1, %ecx
+	subl	$0x1, %ecx
+ # else
+-	/* Only last 4 bits are valid.  */
+-	andl	$0xf, %ecx
+	subl	$0xf, %ecx
+ # endif
+-
+-	testl	%ecx, %ecx
+ 	jne	L(last_vector)
+ 
+ 	addl	$4, %edx
--- a/glibc-upstream-2.34-179.patch
+++ b/glibc-upstream-2.34-179.patch
@ -0,0 +1,85 @@
+commit a182bb7a3922404f79def09d79ef89678b4049f0
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Oct 29 12:56:53 2021 -0700
+
+    x86-64: Remove Prefer_AVX2_STRCMP
+    
+    Remove Prefer_AVX2_STRCMP to enable EVEX strcmp.  When comparing 2 32-byte
+    strings, EVEX strcmp has been improved to require 1 load, 1 VPTESTM, 1
+    VPCMP, 1 KMOVD and 1 INCL instead of 2 loads, 3 VPCMPs, 2 KORDs, 1 KMOVD
+    and 1 TESTL while AVX2 strcmp requires 1 load, 2 VPCMPEQs, 1 VPMINU, 1
+    VPMOVMSKB and 1 TESTL.  EVEX strcmp is now faster than AVX2 strcmp by up
+    to 40% on Tiger Lake and Ice Lake.
+    
+    (cherry picked from commit 14dbbf46a007ae5df36646b51ad0c9e5f5259f30)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index de4e3c3b7258120d..f4d4049e391cbabd 100644
+--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
+@@ -574,14 +574,6 @@ disable_tsx:
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ 	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ 	      |= bit_arch_Prefer_No_VZEROUPPER;
+-
+-	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+-	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+-	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+-	     AVX2 strcmp is faster than EVEX strcmp.  */
+-	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+-	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
+-	      |= bit_arch_Prefer_AVX2_STRCMP;
+ 	}
+ 
+       /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
+diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
+index 58f2fad4323d5d91..957db3ad229ba39f 100644
+--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
+@@ -239,8 +239,6 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
+ 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ 						Fast_Copy_Backward,
+ 						disable, 18);
+-	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
+-		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
+ 	    }
+ 	  break;
+ 	case 19:
+diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+index 3bdc76cf71007948..8250bfcbecd29a9f 100644
+--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+@@ -31,5 +31,4 @@ BIT (Prefer_ERMS)
+ BIT (Prefer_No_AVX512)
+ BIT (MathVec_Prefer_No_AVX512)
+ BIT (Prefer_FSRM)
+-BIT (Prefer_AVX2_STRCMP)
+ BIT (Avoid_Short_Distance_REP_MOVSB)
+diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
+index 62b7abeeee646ab4..7c2901bf44456259 100644
+--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
+@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
+index 60ba0fe356b31779..f94a421784bfe923 100644
+--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
+@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
--- a/glibc-upstream-2.34-180.patch
+++ b/glibc-upstream-2.34-180.patch
@ -0,0 +1,48 @@
+commit 2e64237a8744dd50f9222293275fa52e7248ff76
+Author: Fangrui Song <maskray@google.com>
+Date:   Tue Nov 2 20:59:52 2021 -0700
+
+    x86-64: Replace movzx with movzbl
+    
+    Clang cannot assemble movzx in the AT&T dialect mode.
+    
+    ../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
+     movzx (%rsi), %ecx
+                   ^~~~
+    
+    Change movzx to movzbl, which follows the AT&T dialect and is used
+    elsewhere in the file.
+    
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    (cherry picked from commit 6720d36b6623c5e48c070d86acf61198b33e144e)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index bc19547b09639071..6197a723b9e0606e 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
+ 	.p2align 4
+ 	// XXX Same as code above
+ LABEL(Byte0):
+-	movzx	(%rsi), %ecx
+-	movzx	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+ 
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
+index 824e648230a15739..7f8a1bc756f86aee 100644
+--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
+@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
+ 
+ 	.p2align 4
+ LABEL(Byte0):
+-	movzx	(%rsi), %ecx
+-	movzx	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+ 
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
--- a/glibc-upstream-2.34-181.patch
+++ b/glibc-upstream-2.34-181.patch
@ -0,0 +1,843 @@
+commit a7392db2ff2b9dd906500941ac6361dbe2211b0d
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Nov 1 00:49:51 2021 -0500
+
+    x86: Optimize memmove-vec-unaligned-erms.S
+    
+    No bug.
+    
+    The optimizations are as follows:
+    
+    1) Always align entry to 64 bytes. This makes behavior more
+       predictable and makes other frontend optimizations easier.
+    
+    2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
+       significant benefits in the case that:
+            0 < (dst - src) < [256, 512]
+    
+    3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
+       improvement and for FSRM [-10%, 25%].
+    
+    In addition to these primary changes there is general cleanup
+    throughout to optimize the aligning routines and control flow logic.
+    
+    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    (cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6)
+
+diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
+index db106a7a1f23f268..b2b318084823dceb 100644
+--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
+@@ -25,7 +25,7 @@
+ /* Use movups and movaps for smaller code sizes.  */
+ #define VMOVU		movups
+ #define VMOVA		movaps
+-
+#define MOV_SIZE	3
+ #define SECTION(p)		p
+ 
+ #ifdef USE_MULTIARCH
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+index 1ec1962e861dbf63..67a55f0c85af841c 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+@@ -4,7 +4,7 @@
+ # define VMOVNT		vmovntdq
+ # define VMOVU		vmovdqu
+ # define VMOVA		vmovdqa
+-
+# define MOV_SIZE	4
+ # define ZERO_UPPER_VEC_REGISTERS_RETURN \
+   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ 
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+index e195e93f153c9512..975ae6c0515b83cb 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+@@ -4,7 +4,7 @@
+ # define VMOVNT		vmovntdq
+ # define VMOVU		vmovdqu
+ # define VMOVA		vmovdqa
+-
+# define MOV_SIZE	4
+ # define SECTION(p)		p##.avx
+ # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
+ 
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+index 848848ab39ff9326..0fa7126830af7acb 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+@@ -25,7 +25,7 @@
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
+ # define VZEROUPPER
+-
+# define MOV_SIZE	6
+ # define SECTION(p)		p##.evex512
+ # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
+ 
+diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+index 0cbce8f944da51a0..88715441feaaccf5 100644
+--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+@@ -25,7 +25,7 @@
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
+ # define VZEROUPPER
+-
+# define MOV_SIZE	6
+ # define SECTION(p)		p##.evex
+ # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
+ 
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index abde8438d41f2320..7b27cbdda5fb99f7 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -76,6 +76,25 @@
+ # endif
+ #endif
+ 
+/* Whether to align before movsb. Ultimately we want 64 byte
+   align and not worth it to load 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE > 16)
+/* Number of bytes to align movsb to.  */
+#define MOVSB_ALIGN_TO	64
+
+#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET	(4)
+#else
+# define SMALL_SIZE_OFFSET	(0)
+#endif
+
+ #ifndef PAGE_SIZE
+ # define PAGE_SIZE 4096
+ #endif
+@@ -199,25 +218,21 @@ L(start):
+ # endif
+ 	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(more_2x_vec)
+-#if !defined USE_MULTIARCH || !IS_IN (libc)
+-L(last_2x_vec):
+-#endif
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+-#if !defined USE_MULTIARCH || !IS_IN (libc)
+-L(nop):
+-	ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+ #else
+ 	VZEROUPPER_RETURN
+ #endif
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ END (MEMMOVE_SYMBOL (__memmove, unaligned))
+-
+ # if VEC_SIZE == 16
+ ENTRY (__mempcpy_chk_erms)
+ 	cmp	%RDX_LP, %RCX_LP
+@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ # endif
+ 
+-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
+ 	movq	%rdi, %rax
+ L(start_erms):
+ # ifdef __ILP32__
+@@ -298,310 +313,448 @@ L(start_erms):
+ # endif
+ 	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(movsb_more_2x_vec)
+-L(last_2x_vec):
+-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+-	VMOVU	(%rsi), %VEC(0)
+-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
+ 	VMOVU	%VEC(0), (%rdi)
+-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
+ L(return):
+-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+-#else
+# else
+ 	ret
+# endif
+ #endif
+ 
+-L(movsb):
+-	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+-	jae	L(more_8x_vec)
+-	cmpq	%rsi, %rdi
+-	jb	1f
+-	/* Source == destination is less common.  */
+-	je	L(nop)
+-	leaq	(%rsi,%rdx), %r9
+-	cmpq	%r9, %rdi
+-	/* Avoid slow backward REP MOVSB.  */
+-	jb	L(more_8x_vec_backward)
+-# if AVOID_SHORT_DISTANCE_REP_MOVSB
+-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+-	jz	3f
+-	movq	%rdi, %rcx
+-	subq	%rsi, %rcx
+-	jmp	2f
+-# endif
+-1:
+-# if AVOID_SHORT_DISTANCE_REP_MOVSB
+-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+-	jz	3f
+-	movq	%rsi, %rcx
+-	subq	%rdi, %rcx
+-2:
+-/* Avoid "rep movsb" if RCX, the distance between source and destination,
+-   is N*4GB + [1..63] with N >= 0.  */
+-	cmpl	$63, %ecx
+-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+-3:
+-# endif
+-	mov	%RDX_LP, %RCX_LP
+-	rep movsb
+-L(nop):
+#if LARGE_MOV_SIZE
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+	   ENTRY block and L(less_vec).  */
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	(%rsi), %ecx
+	movl	(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, (%rdi, %rdx)
+ 	ret
+ #endif
+ 
+	.p2align 4
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ # error Unsupported VEC_SIZE!
+ #endif
+ #if VEC_SIZE > 32
+-	cmpb	$32, %dl
+	cmpl	$32, %edx
+ 	jae	L(between_32_63)
+ #endif
+ #if VEC_SIZE > 16
+-	cmpb	$16, %dl
+	cmpl	$16, %edx
+ 	jae	L(between_16_31)
+ #endif
+-	cmpb	$8, %dl
+	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+-	cmpb	$4, %dl
+#if SMALL_MOV_SIZE
+	cmpl	$4, %edx
+#else
+	subq	$4, %rdx
+#endif
+ 	jae	L(between_4_7)
+-	cmpb	$1, %dl
+-	ja	L(between_2_3)
+-	jb	1f
+-	movzbl	(%rsi), %ecx
+	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
+	jl	L(copy_0)
+	movb	(%rsi), %cl
+	je	L(copy_1)
+	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
+ 	movb	%cl, (%rdi)
+-1:
+L(copy_0):
+ 	ret
+
+#if SMALL_MOV_SIZE
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi, %rdx)
+	movl	%esi, (%rdi)
+	ret
+#endif
+
+#if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+	.p2align 4,, 8
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi, %rdx)
+	/* No ymm registers have been touched.  */
+	ret
+#endif
+
+ #if VEC_SIZE > 32
+	.p2align 4,, 10
+ L(between_32_63):
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ 	VMOVU	(%rsi), %YMM0
+-	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	-32(%rsi, %rdx), %YMM1
+ 	VMOVU	%YMM0, (%rdi)
+-	VMOVU	%YMM1, -32(%rdi,%rdx)
+-	VZEROUPPER_RETURN
+-#endif
+-#if VEC_SIZE > 16
+-	/* From 16 to 31.  No branch when size == 16.  */
+-L(between_16_31):
+-	VMOVU	(%rsi), %XMM0
+-	VMOVU	-16(%rsi,%rdx), %XMM1
+-	VMOVU	%XMM0, (%rdi)
+-	VMOVU	%XMM1, -16(%rdi,%rdx)
+	VMOVU	%YMM1, -32(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+
+	.p2align 4,, 10
+ L(between_8_15):
+ 	/* From 8 to 15.  No branch when size == 8.  */
+-	movq	-8(%rsi,%rdx), %rcx
+	movq	-8(%rsi, %rdx), %rcx
+ 	movq	(%rsi), %rsi
+-	movq	%rcx, -8(%rdi,%rdx)
+ 	movq	%rsi, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
+ 	ret
+-L(between_4_7):
+-	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	-4(%rsi,%rdx), %ecx
+-	movl	(%rsi), %esi
+-	movl	%ecx, -4(%rdi,%rdx)
+-	movl	%esi, (%rdi)
+-	ret
+-L(between_2_3):
+-	/* From 2 to 3.  No branch when size == 2.  */
+-	movzwl	-2(%rsi,%rdx), %ecx
+-	movzwl	(%rsi), %esi
+-	movw	%cx, -2(%rdi,%rdx)
+-	movw	%si, (%rdi)
+-	ret
+ 
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
+
+	/* VEC(0) and VEC(1) have already been loaded.  */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ L(movsb_more_2x_vec):
+ 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+ 	ja	L(movsb)
+ #endif
+ L(more_2x_vec):
+-	/* More than 2 * VEC and there may be overlap between destination
+-	   and source.  */
+	/* More than 2 * VEC and there may be overlap between
+	   destination and source.  */
+ 	cmpq	$(VEC_SIZE * 8), %rdx
+ 	ja	L(more_8x_vec)
+	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	jbe	L(last_4x_vec)
+-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+-	VMOVU	(%rsi), %VEC(0)
+-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
+-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+ 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+ 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+-	VZEROUPPER_RETURN
+-L(last_4x_vec):
+-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+-	VMOVU	(%rsi), %VEC(0)
+-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+-	VMOVU	%VEC(0), (%rdi)
+-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ 
+	.p2align 4,, 4
+ L(more_8x_vec):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+ 	/* Check if non-temporal move candidate.  */
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_memcpy_2x)
+ #endif
+-	/* Entry if rdx is greater than non-temporal threshold but there
+-       is overlap.  */
+	/* To reach this point there cannot be overlap and dst > src. So
+	   check for overlap and src > dst in which case correctness
+	   requires forward copy. Otherwise decide between backward/forward
+	   copy depending on address aliasing.  */
+
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+	   but less than __x86_shared_non_temporal_threshold.  */
+ L(more_8x_vec_check):
+-	cmpq	%rsi, %rdi
+-	ja	L(more_8x_vec_backward)
+-	/* Source == destination is less common.  */
+-	je	L(nop)
+-	/* Load the first VEC and last 4 * VEC to support overlapping
+-	   addresses.  */
+-	VMOVU	(%rsi), %VEC(4)
+	/* rcx contains dst - src. Add back length (rdx).  */
+	leaq	(%rcx, %rdx), %r8
+	/* If r8 has different sign than rcx then there is overlap so we
+	   must do forward copy.  */
+	xorq	%rcx, %r8
+	/* Isolate just sign bit of r8.  */
+	shrq	$63, %r8
+	/* Get 4k difference dst - src.  */
+	andl	$(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero must do forward for correctness. Otherwise
+	   if ecx is zero there is 4k False Aliasing so do backward
+	   copy.  */
+	addl	%r8d, %ecx
+	jz	L(more_8x_vec_backward)
+
+	/* if rdx is greater than __x86_shared_non_temporal_threshold
+	   but there is overlap, or from short distance movsb.  */
+L(more_8x_vec_forward):
+	/* Load first and last 4 * VEC to support overlapping addresses.
+	 */
+
+	/* First vec was already loaded into VEC(0).  */
+ 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
+ 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save beginning of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+ 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+-	/* Save start and stop of the destination buffer.  */
+-	movq	%rdi, %r11
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
+-	/* Align destination for aligned stores in the loop.  Compute
+-	   how much destination is misaligned.  */
+-	movq	%rdi, %r8
+-	andq	$(VEC_SIZE - 1), %r8
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %r8
+-	/* Adjust source.  */
+-	subq	%r8, %rsi
+-	/* Adjust destination which should be aligned now.  */
+-	subq	%r8, %rdi
+-	/* Adjust length.  */
+-	addq	%r8, %rdx
+ 
+-	.p2align 4
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rcx, %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	addq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
+ L(loop_4x_vec_forward):
+ 	/* Copy 4 * VEC a time forward.  */
+-	VMOVU	(%rsi), %VEC(0)
+-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+ 	subq	$-(VEC_SIZE * 4), %rsi
+-	addq	$-(VEC_SIZE * 4), %rdx
+-	VMOVA	%VEC(0), (%rdi)
+-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (%rdi)
+	VMOVA	%VEC(2), VEC_SIZE(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
+ 	subq	$-(VEC_SIZE * 4), %rdi
+-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
+ 	ja	L(loop_4x_vec_forward)
+ 	/* Store the last 4 * VEC.  */
+-	VMOVU	%VEC(5), (%rcx)
+-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
+ 	/* Store the first VEC.  */
+-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(0), (%rcx)
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+	 */
+L(nop_backward):
+ 	VZEROUPPER_RETURN
+ 
+	.p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+	/* rcx contains dst - src. Test for dst == src to skip all of
+	   memmove.  */
+	testq	%rcx, %rcx
+	jz	L(nop_backward)
+ L(more_8x_vec_backward):
+ 	/* Load the first 4 * VEC and last VEC to support overlapping
+ 	   addresses.  */
+-	VMOVU	(%rsi), %VEC(4)
+
+	/* First vec was also loaded into VEC(0).  */
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Beginning of region for 4x backward copy stored in rcx.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
+-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
+-	/* Save stop of the destination buffer.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
+-	/* Align destination end for aligned stores in the loop.  Compute
+-	   how much destination end is misaligned.  */
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
+-	movq	%r11, %r9
+-	movq	%r11, %r8
+-	andq	$(VEC_SIZE - 1), %r8
+-	/* Adjust source.  */
+-	subq	%r8, %rcx
+-	/* Adjust the end of destination which should be aligned now.  */
+-	subq	%r8, %r9
+-	/* Adjust length.  */
+-	subq	%r8, %rdx
+-
+-	.p2align 4
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rcx
+	/* Restore src.  */
+	addq	%rcx, %rsi
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
+ L(loop_4x_vec_backward):
+ 	/* Copy 4 * VEC a time backward.  */
+-	VMOVU	(%rcx), %VEC(0)
+-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	addq	$-(VEC_SIZE * 4), %rcx
+-	addq	$-(VEC_SIZE * 4), %rdx
+-	VMOVA	%VEC(0), (%r9)
+-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	addq	$-(VEC_SIZE * 4), %r9
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	addq	$(VEC_SIZE * -4), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(loop_4x_vec_backward)
+ 	/* Store the first 4 * VEC.  */
+-	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+ 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+ 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+ 	/* Store the last VEC.  */
+-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VZEROUPPER_RETURN
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	/* L(skip_short_movsb_check) is only used with ERMS. Not for
+	   FSRM.  */
+	.p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* If CPU does not have FSRM two options for aligning. Align src
+	   if dst and src 4k alias. Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* Fall through. dst and src 4k alias. It's better to align src
+	   here because the bottleneck will be loads due to the false
+	   dependency on dst.  */
+
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because
+	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-1(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned src.
+	 */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+
+	rep	movsb
+
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+ 	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 12
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+	/* Save dest for storing aligning VECs later.  */
+	movq	%rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold most likely is
+	   candidate for NT moves as well.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src. Early check for backward copy
+	   conditions means only case of slow movsb with src = dst + [0,
+	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
+	   for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+#  endif
+# endif
+# if ALIGN_MOVSB
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Fall through means cpu has FSRM. In that case exclusively
+	   align destination.  */
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%r8, %rdx), %rcx
+	/* Finish aligning dst.  */
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.
+	 */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
+
+	rep	movsb
+
+	/* Store VECs loaded for aligning.  */
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else	/* !ALIGN_MOVSB.  */
+L(skip_short_movsb_check):
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
+ 
+	.p2align 4,, 10
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	.p2align 4
+L(large_memcpy_2x_check):
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_check)
+ L(large_memcpy_2x):
+-	/* Compute absolute value of difference between source and
+-	   destination.  */
+-	movq	%rdi, %r9
+-	subq	%rsi, %r9
+-	movq	%r9, %r8
+-	leaq	-1(%r9), %rcx
+-	sarq	$63, %r8
+-	xorq	%r8, %r9
+-	subq	%r8, %r9
+-	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache when
+-	   source is loaded.  */
+-	cmpq	%r9, %rdx
+-	ja	L(more_8x_vec_check)
+	/* To reach this point it is impossible for dst > src and
+	   overlap. Remaining to check is src > dst and overlap. rcx
+	   already contains dst - src. Negate rcx to get src - dst. If
+	   length > rcx then there is overlap and forward copy is best.  */
+	negq	%rcx
+	cmpq	%rcx, %rdx
+	ja	L(more_8x_vec_forward)
+ 
+ 	/* Cache align destination. First store the first 64 bytes then
+ 	   adjust alignments.  */
+-	VMOVU	(%rsi), %VEC(8)
+-#if VEC_SIZE < 64
+-	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+-#if VEC_SIZE < 32
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+-#endif
+-#endif
+-	VMOVU	%VEC(8), (%rdi)
+-#if VEC_SIZE < 64
+-	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+-#if VEC_SIZE < 32
+-	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+-#endif
+-#endif
+
+	/* First vec was also loaded into VEC(0).  */
+# if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+#  endif
+# endif
+	VMOVU	%VEC(0), (%rdi)
+# if VEC_SIZE < 64
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+#  if VEC_SIZE < 32
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+#  endif
+# endif
+
+ 	/* Adjust source, destination, and size.  */
+ 	movq	%rdi, %r8
+ 	andq	$63, %r8
+@@ -614,9 +767,13 @@ L(large_memcpy_2x):
+ 	/* Adjust length.  */
+ 	addq	%r8, %rdx
+ 
+-	/* Test if source and destination addresses will alias. If they do
+-	   the larger pipeline in large_memcpy_4x alleviated the
+	/* Test if source and destination addresses will alias. If they
+	   do the larger pipeline in large_memcpy_4x alleviated the
+ 	   performance drop.  */
+
+	/* ecx contains -(dst - src). not ecx will return dst - src - 1
+	   which works for testing aliasing.  */
+	notl	%ecx
+ 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+ 	jz	L(large_memcpy_4x)
+ 
+@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
+ 	/* ecx stores inner loop counter.  */
+ 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+ L(loop_large_memcpy_4x_inner):
+-	/* Only one prefetch set per page as doing 4 pages give more time
+-	   for prefetcher to keep up.  */
+	/* Only one prefetch set per page as doing 4 pages give more
+	   time for prefetcher to keep up.  */
+ 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+ 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+ 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
--- a/glibc-upstream-2.34-182.patch
+++ b/glibc-upstream-2.34-182.patch
@ -0,0 +1,131 @@
+commit cecbac52123456e2fbcff062a4165bf7b9174797
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Nov 1 00:49:52 2021 -0500
+
+    x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
+    
+    No bug.
+    
+    This patch doubles the rep_movsb_threshold when using ERMS. Based on
+    benchmarks the vector copy loop, especially now that it handles 4k
+    aliasing, is better for these medium-ranged sizes.
+    
+    On Skylake with ERMS:
+    
+    Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
+    4096,   0,      0,      0,      0.975
+    4096,   0,      0,      1,      0.953
+    4096,   12,     0,      0,      0.969
+    4096,   12,     0,      1,      0.872
+    4096,   44,     0,      0,      0.979
+    4096,   44,     0,      1,      0.83
+    4096,   0,      12,     0,      1.006
+    4096,   0,      12,     1,      0.989
+    4096,   0,      44,     0,      0.739
+    4096,   0,      44,     1,      0.942
+    4096,   12,     12,     0,      1.009
+    4096,   12,     12,     1,      0.973
+    4096,   44,     44,     0,      0.791
+    4096,   44,     44,     1,      0.961
+    4096,   2048,   0,      0,      0.978
+    4096,   2048,   0,      1,      0.951
+    4096,   2060,   0,      0,      0.986
+    4096,   2060,   0,      1,      0.963
+    4096,   2048,   12,     0,      0.971
+    4096,   2048,   12,     1,      0.941
+    4096,   2060,   12,     0,      0.977
+    4096,   2060,   12,     1,      0.949
+    8192,   0,      0,      0,      0.85
+    8192,   0,      0,      1,      0.845
+    8192,   13,     0,      0,      0.937
+    8192,   13,     0,      1,      0.939
+    8192,   45,     0,      0,      0.932
+    8192,   45,     0,      1,      0.927
+    8192,   0,      13,     0,      0.621
+    8192,   0,      13,     1,      0.62
+    8192,   0,      45,     0,      0.53
+    8192,   0,      45,     1,      0.516
+    8192,   13,     13,     0,      0.664
+    8192,   13,     13,     1,      0.659
+    8192,   45,     45,     0,      0.593
+    8192,   45,     45,     1,      0.575
+    8192,   2048,   0,      0,      0.854
+    8192,   2048,   0,      1,      0.834
+    8192,   2061,   0,      0,      0.863
+    8192,   2061,   0,      1,      0.857
+    8192,   2048,   13,     0,      0.63
+    8192,   2048,   13,     1,      0.629
+    8192,   2061,   13,     0,      0.627
+    8192,   2061,   13,     1,      0.62
+    
+    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
+   unsigned int minimum_rep_movsb_threshold;
+ #endif
+-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
+     threshold is 2048 * (VEC_SIZE / 16).  */
+   unsigned int rep_movsb_threshold;
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
+     {
+-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
+ #if HAVE_TUNABLES
+       minimum_rep_movsb_threshold = 64 * 8;
+ #endif
+@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
+ 				    AVX_Fast_Unaligned_Load))
+     {
+-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
+ #if HAVE_TUNABLES
+       minimum_rep_movsb_threshold = 32 * 8;
+ #endif
+diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
+index dd6e1d65c9490d4f..419313804d49cf65 100644
+--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
+@@ -32,17 +32,21 @@ glibc {
+     }
+     x86_rep_movsb_threshold {
+       type: SIZE_T
+-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
+-      # isn't faster on short data.  The memcpy micro benchmark in glibc
+-      # shows that 2KB is the approximate value above which REP MOVSB
+-      # becomes faster than SSE2 optimization on processors with Enhanced
+-      # REP MOVSB.  Since larger register size can move more data with a
+-      # single load and store, the threshold is higher with larger register
+-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
+-      # times of vector size and the default value is 2048 * (vector size
+-      # / 16), the default value and the minimum value must be updated at
+-      # run-time.  NB: Don't set the default value since we can't tell if
+-      # the tunable value is set by user or not [BZ #27069].
+      # Since there is overhead to set up REP MOVSB operation, REP
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
+      # in glibc shows that 2KB is the approximate value above which
+      # REP MOVSB becomes faster than SSE2 optimization on processors
+      # with Enhanced REP MOVSB.  Since larger register size can move
+      # more data with a single load and store, the threshold is
+      # higher with larger register size.  Micro benchmarks show AVX
+      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
+      # threshold is universally set at 2112 bytes.  Note: Since the
+      # REP MOVSB threshold must be greater than 8 times of vector
+      # size and the default value is 4096 * (vector size / 16), the
+      # default value and the minimum value must be updated at
+      # run-time.  NB: Don't set the default value since we can't tell
+      # if the tunable value is set by user or not [BZ #27069].
+       minval: 1
+     }
+     x86_rep_stosb_threshold {
--- a/glibc-upstream-2.34-183.patch
+++ b/glibc-upstream-2.34-183.patch
--- a/glibc-upstream-2.34-184.patch
+++ b/glibc-upstream-2.34-184.patch
@ -0,0 +1,104 @@
+commit 4bbd0f866ad0ff197f72346f776ebee9b7e1a706
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Fri Dec 3 15:29:25 2021 -0800
+
+    x86-64: Use notl in EVEX strcmp [BZ #28646]
+    
+    Must use notl %edi here as lower bits are for CHAR comparisons
+    potentially out of range thus can be 0 without indicating mismatch.
+    This fixes BZ #28646.
+    
+    Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
+    (cherry picked from commit 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02)
+
+diff --git a/string/test-strcmp.c b/string/test-strcmp.c
+index 7feababf4ddc5603..a0255b9625fbcedd 100644
+--- a/string/test-strcmp.c
+++ b/string/test-strcmp.c
+@@ -25,6 +25,7 @@
+ # define TEST_NAME "strcmp"
+ #endif
+ #include "test-string.h"
+#include <support/test-driver.h>
+ 
+ #ifdef WIDE
+ # include <wchar.h>
+@@ -392,6 +393,32 @@ check2 (void)
+ 	}
+ }
+ 
+static void
+check3 (void)
+{
+  size_t size = 0xd000 + 0x4000;
+  CHAR *s1, *s2;
+  CHAR *buffer1 = mmap (NULL, size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANON, -1, 0);
+  CHAR *buffer2 = mmap (NULL, size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANON, -1, 0);
+  if (buffer1 == MAP_FAILED || buffer2 == MAP_FAILED)
+    error (EXIT_UNSUPPORTED, errno, "mmap failed");
+  /* Test added with the BZ #28646 fix; strings sit at fixed offsets.  */
+  s1 = (CHAR *) (buffer1 + 0x8f8 / sizeof (CHAR));
+  s2 = (CHAR *) (buffer2 + 0xcff3 / sizeof (CHAR));
+
+  STRCPY(s1, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/util/PathDocFileFactory.java"));
+  STRCPY(s2, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/taglets/ThrowsTaglet.java"));
+  /* SIMPLE_STRCMP's result is the expected value passed to check_result.  */
+  int exp_result = SIMPLE_STRCMP (s1, s2);
+  FOR_EACH_IMPL (impl, 0)
+    check_result (impl, s1, s2, exp_result);
+
+  munmap ((void *) buffer1, size);
+  munmap ((void *) buffer2, size);
+}
+
+ int
+ test_main (void)
+ {
+@@ -400,6 +427,7 @@ test_main (void)
+   test_init ();
+   check();
+   check2 ();
+  check3 ();
+ 
+   printf ("%23s", "");
+   FOR_EACH_IMPL (impl, 0)
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 82f12ac89bcae20b..6f5c4bf984da2b80 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -656,12 +656,13 @@ L(loop_cross_page):
+ 	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
+ 	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+ 	kmovd	%k3, %edi
+    /* Must use notl %edi here as lower bits are for CHAR
+	   comparisons potentially out of range thus can be 0 without
+	   indicating mismatch.  */
+	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
+@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
+ 	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+ 	kmovd	%k3, %edi
+	/* Must use notl %edi here as lower bits are for CHAR
+	   comparisons potentially out of range thus can be 0 without
+	   indicating mismatch.  */
+	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
--- a/glibc-upstream-2.34-185.patch
+++ b/glibc-upstream-2.34-185.patch
@ -0,0 +1,30 @@
+commit f3a99b2216114f89b20329ae7664b764248b4bbd
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Mon Dec 6 07:14:12 2021 -0800
+
+    x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
+    
+    Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
+    they won't lower CPU frequency when ZMM load and store instructions are
+    used.
+    
+    (cherry picked from commit ceeffe968c01b1202e482f4855cb6baf5c6cb713)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index f4d4049e391cbabd..09590d8794b1c6fb 100644
+--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
+@@ -566,8 +566,11 @@ disable_tsx:
+ 	  |= bit_arch_Prefer_No_VZEROUPPER;
+       else
+ 	{
+-	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
+-	    |= bit_arch_Prefer_No_AVX512;
+	  /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
+	     when ZMM load and store instructions are used.  */
+	  if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
+	    cpu_features->preferred[index_arch_Prefer_No_AVX512]
+	      |= bit_arch_Prefer_No_AVX512;
+ 
+ 	  /* Avoid RTM abort triggered by VZEROUPPER inside a
+ 	     transactionally executing RTM region.  */
--- a/glibc-upstream-2.34-186.patch
+++ b/glibc-upstream-2.34-186.patch
@ -0,0 +1,384 @@
+commit c796418d00f65c8c5fbed477f3ba6da2bee64ece
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Fri Dec 24 18:54:41 2021 -0600
+
+    x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+    
+    No bug.
+    Optimizations are twofold.
+    
+    1) Replace page cross and 0/1 checks with masked load instructions in
+       L(less_vec). In applications this reduces branch-misses in the
+       hot [0, 32] case.
+    2) Change controlflow so that L(less_vec) case gets the fall through.
+    
+    Change 2) helps copies in the [0, 32] size range but comes at the cost
+    of copies in the [33, 64] size range.  From profiles of GCC and
+    Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
+    appears to be the right tradeoff.
+    
+    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    (cherry picked from commit abddd61de090ae84e380aff68a98bd94ef704667)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 640f6757fac8a356..d2899e7c7078cd41 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -62,15 +62,18 @@ Latency:
+ # define VMOVU		vmovdqu64
+ 
+ # ifdef USE_AS_WMEMCMP
+#  define VMOVU_MASK	vmovdqu32
+ #  define CHAR_SIZE	4
+ #  define VPCMP	vpcmpd
+ #  define VPTEST	vptestmd
+ # else
+#  define VMOVU_MASK	vmovdqu8
+ #  define CHAR_SIZE	1
+ #  define VPCMP	vpcmpub
+ #  define VPTEST	vptestmb
+ # endif
+ 
+
+ # define VEC_SIZE	32
+ # define PAGE_SIZE	4096
+ # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	movl	%edx, %edx
+ # endif
+ 	cmp	$CHAR_PER_VEC, %RDX_LP
+-	jb	L(less_vec)
+	/* Fall through for [0, VEC_SIZE] as it's the hottest.  */
+	ja	L(more_1x_vec)
+
+	/* Create mask for CHAR's we want to compare. This allows us to
+	   avoid having to include page cross logic.  */
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k2
+
+	/* Safe to load full ymm with mask.  */
+	VMOVU_MASK (%rsi), %YMM2{%k2}
+	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+	ret
+ 
+	.p2align 4
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+
+	.p2align 4
+L(more_1x_vec):
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	(%rsi), %YMM1
+ 	/* Use compare not equals to directly check for mismatch.  */
+-	VPCMP	$4, (%rdi), %YMM1, %k1
+	VPCMP	$4,(%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	/* NB: eax must be destination register if going to
+ 	   L(return_vec_[0,2]). For L(return_vec_3) destination register
+@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 
+ 	/* Check third and fourth VEC no matter what.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_3)
+@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+ 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+ 	   oring with YMM1. Result is stored in YMM4.  */
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ 
+ 	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+-	.p2align 4
+
+	.p2align 4,, 8
+ L(8x_end_return_vec_0_1_2_3):
+ 	movq	%rdx, %rdi
+ L(8x_return_vec_0_1_2_3):
+@@ -222,23 +262,6 @@ L(return_vec_3):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(return_vec_0):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+-	/* NB: no partial register stall here because xorl zero idiom
+-	   above.  */
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+ 
+ 	.p2align 4
+ L(return_vec_1):
+@@ -297,7 +320,7 @@ L(loop_4x_vec):
+ 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ 	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+@@ -324,7 +347,7 @@ L(loop_4x_vec):
+ 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+ 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ 	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+@@ -336,14 +359,14 @@ L(loop_4x_vec):
+ 	/* Only entry is from L(more_8x_vec).  */
+ 	.p2align 4,, 10
+ L(8x_last_2x_vec):
+-	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(8x_return_vec_2)
+ 	/* Naturally aligned to 16 bytes.  */
+ L(8x_last_1x_vec):
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+-	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(8x_return_vec_3)
+@@ -392,7 +415,9 @@ L(last_1x_vec):
+ 	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4,, 10
+
+	/* Don't align. Takes 2-fetch blocks either way and aligning
+	   will cause code to spill into another cacheline.  */
+ L(return_vec_1_end):
+ 	/* Use bsf to save code size. This is necessary to have
+ 	   L(one_or_less) fit in aligning bytes between.  */
+@@ -411,31 +436,8 @@ L(return_vec_1_end):
+ # endif
+ 	ret
+ 
+-	/* NB: L(one_or_less) fits in alignment padding between
+-	   L(return_vec_1_end) and L(return_vec_0_end).  */
+-# ifdef USE_AS_WMEMCMP
+-L(one_or_less):
+-	jb	L(zero)
+-	movl	(%rdi), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-	ret
+-# else
+-L(one_or_less):
+-	jb	L(zero)
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+-	ret
+-# endif
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-
+-	.p2align 4
+	/* Don't align. Takes 2-fetch blocks either way and aligning
+	   will cause code to spill into another cacheline.  */
+ L(return_vec_0_end):
+ 	tzcntl	%eax, %eax
+ 	addl	%edx, %eax
+@@ -451,146 +453,7 @@ L(return_vec_0_end):
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+	/* 1-byte until next cache line.  */
+ 
+-	.p2align 4
+-L(less_vec):
+-	/* Check if one or less CHAR. This is necessary for size == 0
+-	   but is also faster for size == CHAR_SIZE.  */
+-	cmpl	$1, %edx
+-	jbe	L(one_or_less)
+-
+-	/* Check if loading one VEC from either s1 or s2 could cause a
+-	   page cross. This can have false positives but is by far the
+-	   fastest method.  */
+-	movl	%edi, %eax
+-	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(page_cross_less_vec)
+-
+-	/* No page cross possible.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMP	$4, (%rdi), %YMM2, %k1
+-	kmovd	%k1, %eax
+-	/* Check if any matches where in bounds. Intentionally not
+-	   storing result in eax to limit dependency chain if it goes to
+-	   L(return_vec_0_lv).  */
+-	bzhil	%edx, %eax, %edx
+-	jnz	L(return_vec_0_lv)
+-	xorl	%eax, %eax
+-	ret
+-
+-	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+-	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+-	   the jump and ends up fitting in aligning bytes. As well fits on
+-	   same cache line as L(less_vec) so also saves a line from having
+-	   to be fetched on cold calls to memcmp.  */
+-	.p2align 4,, 4
+-L(return_vec_0_lv):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+-	/* NB: no partial register stall here because xorl zero idiom
+-	   above.  */
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(page_cross_less_vec):
+-	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+-	   bytes.  */
+-	cmpl	$(16 / CHAR_SIZE), %edx
+-	jae	L(between_16_31)
+-# ifndef USE_AS_WMEMCMP
+-	cmpl	$8, %edx
+-	jae	L(between_8_15)
+-	cmpl	$4, %edx
+-	jb	L(between_2_3)
+-
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	/* edx is guranteed to be positive int32 in range [4, 7].  */
+-	cmovne	%edx, %eax
+-	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+-	sbbl	%ecx, %ecx
+-	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+-	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+-	   eax doesn't matter.  */
+-	orl	%ecx, %eax
+-	ret
+-
+-	.p2align 4,, 8
+-L(between_8_15):
+-# endif
+-	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+-	vmovq	(%rdi), %xmm1
+-	vmovq	(%rsi), %xmm2
+-	VPCMP	$4, %xmm1, %xmm2, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_lv)
+-	/* Use overlapping loads to avoid branches.  */
+-	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+-	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+-	VPCMP	$4, %xmm1, %xmm2, %k1
+-	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_end)
+-	ret
+-
+-	.p2align 4,, 8
+-L(between_16_31):
+-	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-
+-	/* Use movups to save code size.  */
+-	vmovdqu	(%rsi), %xmm2
+-	VPCMP	$4, (%rdi), %xmm2, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_lv)
+-	/* Use overlapping loads to avoid branches.  */
+-	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+-	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+-	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_end)
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-L(between_2_3):
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+-	ret
+-# endif
+ END (MEMCMP)
+ #endif
--- a/glibc-upstream-2.34-187.patch
+++ b/glibc-upstream-2.34-187.patch
@ -0,0 +1,42 @@
+commit 9681691402052b727e01ae3375c73e0f76566593
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Wed Apr 27 13:59:26 2022 -0300
+
+    linux: Fix missing internal 64 bit time_t stat usage
+    
+    These are two missing spots initially done by 52a5fe70a2c77935.
+    
+    Checked on i686-linux-gnu.
+    
+    (cherry picked from commit 834ddd0432f68d6dc85b6aac95065721af0d86e9)
+
+diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c
+index 13160d32499c4e58..00e4ce7f80ee2dfe 100644
+--- a/sysdeps/unix/sysv/linux/faccessat.c
+++ b/sysdeps/unix/sysv/linux/faccessat.c
+@@ -39,8 +39,8 @@ __faccessat (int fd, const char *file, int mode, int flag)
+   if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
+     return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
+ 
+-  struct stat64 stats;
+-  if (__fstatat64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
+  struct __stat64_t64 stats;
+  if (__fstatat64_time64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
+     return -1;
+ 
+   mode &= (X_OK | W_OK | R_OK);	/* Clear any bogus bits. */
+diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c
+index b599a66c930cad4d..f79930303118ebcd 100644
+--- a/sysdeps/unix/sysv/linux/pathconf.c
+++ b/sysdeps/unix/sysv/linux/pathconf.c
+@@ -110,8 +110,8 @@ distinguish_extX (const struct statfs *fsbuf, const char *file, int fd)
+ 	      && strcmp (mntbuf.mnt_type, "ext4") != 0)
+ 	    continue;
+ 
+-	  struct stat64 fsst;
+-	  if (__stat64 (mntbuf.mnt_dir, &fsst) >= 0
+	  struct __stat64_t64 fsst;
+	  if (__stat64_time64 (mntbuf.mnt_dir, &fsst) >= 0
+ 	      && st.st_dev == fsst.st_dev)
+ 	    {
+ 	      if (strcmp (mntbuf.mnt_type, "ext4") == 0)
--- a/glibc-upstream-2.34-188.patch
+++ b/glibc-upstream-2.34-188.patch
@ -0,0 +1,39 @@
+commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe
+Author: Carlos O'Donell <carlos@redhat.com>
+Date:   Tue Apr 26 10:52:41 2022 -0400
+
+    i386: Regenerate ulps
+    
+    These failures were caught while building glibc master for Fedora
+    Rawhide which is built with '-mtune=generic -msse2 -mfpmath=sse'
+    using gcc 11.3 (gcc-11.3.1-2.fc35) on a Cascadelake Intel Xeon
+    processor.
+    
+    (cherry picked from commit e465d97653311c3687aee49de782177353acfe86)
+
+diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
+index 7601049110789201..84e6686eba5fe79a 100644
+--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
+@@ -668,7 +668,7 @@ ldouble: 4
+ 
+ Function: Imaginary part of "clog10":
+ double: 2
+-float: 1
+float: 2
+ float128: 2
+ ldouble: 2
+ 
+diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+index a39c89cec1141935..cc21e6907fe8b6a3 100644
+--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+@@ -668,7 +668,7 @@ ldouble: 4
+ 
+ Function: Imaginary part of "clog10":
+ double: 2
+-float: 1
+float: 2
+ float128: 2
+ ldouble: 2
+ 
--- a/glibc.spec
+++ b/glibc.spec
@ -148,7 +148,7 @@ end \
 Summary: The GNU libc libraries
 Name: glibc
 Version: %{glibcversion}
-Release: 30%{?dist}
+Release: 31%{?dist}

 # In general, GPLv2+ is used by programs, LGPLv2+ is used for
 # libraries.
@ -379,17 +379,17 @@ Patch175: glibc-rh2058224-2.patch
 Patch176: glibc-rh2058230.patch
 Patch177: glibc-rh2054789.patch
 Patch178: glibc-upstream-2.34-108.patch
-Patch179: glibc-upstream-2.34-110.patch
 # glibc-2.34-109-gd64b08d5ba only changes NEWS.
+Patch179: glibc-upstream-2.34-110.patch
 Patch180: glibc-upstream-2.34-111.patch
 Patch181: glibc-upstream-2.34-112.patch
 Patch182: glibc-upstream-2.34-113.patch
 Patch183: glibc-upstream-2.34-114.patch
+# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
+# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
 Patch184: glibc-upstream-2.34-117.patch
 Patch185: glibc-upstream-2.34-118.patch
 Patch186: glibc-upstream-2.34-119.patch
-# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
-# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
 Patch187: glibc-upstream-2.34-120.patch
 Patch188: glibc-upstream-2.34-121.patch
 Patch189: glibc-upstream-2.34-122.patch
@ -437,6 +437,28 @@ Patch229: glibc-upstream-2.34-163.patch
 Patch230: glibc-upstream-2.34-164.patch
 Patch231: glibc-upstream-2.34-165.patch
 Patch232: glibc-upstream-2.34-166.patch
+Patch233: glibc-upstream-2.34-167.patch
+Patch234: glibc-upstream-2.34-168.patch
+Patch235: glibc-upstream-2.34-169.patch
+Patch236: glibc-upstream-2.34-170.patch
+Patch237: glibc-upstream-2.34-171.patch
+Patch238: glibc-upstream-2.34-172.patch
+Patch239: glibc-upstream-2.34-173.patch
+Patch240: glibc-upstream-2.34-174.patch
+Patch241: glibc-upstream-2.34-175.patch
+Patch242: glibc-upstream-2.34-176.patch
+Patch243: glibc-upstream-2.34-177.patch
+Patch244: glibc-upstream-2.34-178.patch
+Patch245: glibc-upstream-2.34-179.patch
+Patch246: glibc-upstream-2.34-180.patch
+Patch247: glibc-upstream-2.34-181.patch
+Patch248: glibc-upstream-2.34-182.patch
+Patch249: glibc-upstream-2.34-183.patch
+Patch250: glibc-upstream-2.34-184.patch
+Patch251: glibc-upstream-2.34-185.patch
+Patch252: glibc-upstream-2.34-186.patch
+Patch253: glibc-upstream-2.34-187.patch
+Patch254: glibc-upstream-2.34-188.patch

 ##############################################################################
 # Continued list of core "glibc" package information:
@ -2493,6 +2515,32 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared

 %changelog
+* Wed Apr 27 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-31
+- Sync with upstream branch release/2.34/master,
+  commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe:
+- i386: Regenerate ulps
+- linux: Fix missing internal 64 bit time_t stat usage
+- x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+- x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
+- x86-64: Use notl in EVEX strcmp [BZ #28646]
+- x86: Shrink memcmp-sse4.S code size
+- x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
+- x86: Optimize memmove-vec-unaligned-erms.S
+- x86-64: Replace movzx with movzbl
+- x86-64: Remove Prefer_AVX2_STRCMP
+- x86-64: Improve EVEX strcmp with masked load
+- x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
+- x86: Optimize memset-vec-unaligned-erms.S
+- x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
+- x86: Modify ENTRY in sysdep.h so that p2align can be specified
+- x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
+- scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
+- dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
+- INSTALL: Rephrase -with-default-link documentation
+- misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
+- Default to --with-default-link=no (bug 25812)
+- scripts: Add glibcelf.py module
+
 * Thu Apr 21 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-30
 - Sync with upstream branch release/2.34/master,
  commit 71326f1f2fd09dafb9c34404765fb88129e94237: