Sync with upstream branch release/2.34/master

Upstream commit: 55640ed3fde48360a8e8083be4843bd2dc7cecfe - i386: Regenerate ulps - linux: Fix missing internal 64 bit time_t stat usage - x86: Optimize L(less_vec) case in memcmp-evex-movbe.S - x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI - x86-64: Use notl in EVEX strcmp [BZ #28646] - x86: Shrink memcmp-sse4.S code size - x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h - x86: Optimize memmove-vec-unaligned-erms.S - x86-64: Replace movzx with movzbl - x86-64: Remove Prefer_AVX2_STRCMP - x86-64: Improve EVEX strcmp with masked load - x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S - x86: Optimize memset-vec-unaligned-erms.S - x86: Optimize memcmp-evex-movbe.S for frontend behavior and size - x86: Modify ENTRY in sysdep.h so that p2align can be specified - x86-64: Optimize load of all bits set into ZMM register [BZ #28252] - scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier - dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078) - INSTALL: Rephrase -with-default-link documentation - misc: Fix rare fortify crash on wchar funcs. [BZ 29030] - Default to --with-default-link=no (bug 25812) - scripts: Add glibcelf.py module
2022-04-27 22:27:50 -04:00 · 2022-04-27 22:27:50 -04:00 · 4e3257320c
parent a8db42ba53
commit 4e3257320c
23 changed files with 8751 additions and 4 deletions
--- a/glibc-upstream-2.34-167.patch
+++ b/glibc-upstream-2.34-167.patch
--- a/glibc-upstream-2.34-168.patch
+++ b/glibc-upstream-2.34-168.patch
@ -0,0 +1,407 @@
 commit f0c71b34f96c816292c49122d50da3a511b67bf2
 Author: Florian Weimer <fweimer@redhat.com>
 Date:   Mon Apr 11 11:30:31 2022 +0200
    Default to --with-default-link=no (bug 25812)
    This is necessary to place the libio vtables into the RELRO segment.
    New tests elf/tst-relro-ldso and elf/tst-relro-libc are added to
    verify that this is what actually happens.
    The new tests fail on ia64 due to lack of (default) RELRO support
    in binutils, so they are XFAILed there.
    (cherry picked from commit 198abcbb94618730dae1b3f4393efaa49e0ec8c7)
 diff --git a/INSTALL b/INSTALL
 index d8d4e9f155f56616..60d01568d77645c7 100644
 --- a/INSTALL
 +++ b/INSTALL
@@ -90,6 +90,12 @@ if 'CFLAGS' is specified it must enable optimization.  For example:
      library will still be usable, but functionality may be lost--for
      example, you can't build a shared libc with old binutils.
 +'--with-default-link=FLAG'
 +     With '--with-default-link=yes', the build system does not use a
 +     custom linker script for linking shared objects.  The default for
 +     FLAG is the opposite, 'no', because the custom linker script is
 +     needed for full RELRO protection.
 +
 '--with-nonshared-cflags=CFLAGS'
      Use additional compiler flags CFLAGS to build the parts of the
      library which are always statically linked into applications and
 diff --git a/configure b/configure
 index 03f4e59e754b5463..34c64f8de44e3086 100755
 --- a/configure
 +++ b/configure
@@ -3373,7 +3373,7 @@ fi
 if test "${with_default_link+set}" = set; then :
   withval=$with_default_link; use_default_link=$withval
 else
 -  use_default_link=default
 +  use_default_link=no
 fi
@@ -6085,69 +6085,6 @@ fi
 $as_echo "$libc_cv_hashstyle" >&6; }
 -# The linker's default -shared behavior is good enough if it
 -# does these things that our custom linker scripts ensure that
 -# all allocated NOTE sections come first.
 -if test "$use_default_link" = default; then
 -  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sufficient default -shared layout" >&5
 -$as_echo_n "checking for sufficient default -shared layout... " >&6; }
 -if ${libc_cv_use_default_link+:} false; then :
 -  $as_echo_n "(cached) " >&6
 -else
 -    libc_cv_use_default_link=no
 -  cat > conftest.s <<\EOF
 -	  .section .note.a,"a",%note
 -	  .balign 4
 -	  .long 4,4,9
 -	  .string "GNU"
 -	  .string "foo"
 -	  .section .note.b,"a",%note
 -	  .balign 4
 -	  .long 4,4,9
 -	  .string "GNU"
 -	  .string "bar"
 -EOF
 -  if { ac_try='  ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&5'
 -  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
 -  (eval $ac_try) 2>&5
 -  ac_status=$?
 -  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
 -  test $ac_status = 0; }; } &&
 -       ac_try=`$READELF -S conftest.so | sed -n \
 -	 '${x;p;}
 -	  s/^ *\[ *[1-9][0-9]*\]  *\([^ ][^ ]*\)  *\([^ ][^ ]*\) .*$/\2 \1/
 -	  t a
 -	  b
 -	  : a
 -	  H'`
 -  then
 -    libc_seen_a=no libc_seen_b=no
 -    set -- $ac_try
 -    while test $# -ge 2 -a "$1" = NOTE; do
 -      case "$2" in
 -      .note.a) libc_seen_a=yes ;;
 -      .note.b) libc_seen_b=yes ;;
 -      esac
 -      shift 2
 -    done
 -    case "$libc_seen_a$libc_seen_b" in
 -    yesyes)
 -      libc_cv_use_default_link=yes
 -      ;;
 -    *)
 -      echo >&5 "\
 -$libc_seen_a$libc_seen_b from:
 -$ac_try"
 -      ;;
 -    esac
 -  fi
 -  rm -f conftest*
 -fi
 -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_use_default_link" >&5
 -$as_echo "$libc_cv_use_default_link" >&6; }
 -  use_default_link=$libc_cv_use_default_link
 -fi
 -
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GLOB_DAT reloc" >&5
 $as_echo_n "checking for GLOB_DAT reloc... " >&6; }
 if ${libc_cv_has_glob_dat+:} false; then :
 diff --git a/configure.ac b/configure.ac
 index eb9431875fae1b0e..2c69af0807266e7e 100644
 --- a/configure.ac
 +++ b/configure.ac
@@ -153,7 +153,7 @@ AC_ARG_WITH([default-link],
 	    AS_HELP_STRING([--with-default-link],
 			   [do not use explicit linker scripts]),
 	    [use_default_link=$withval],
 -	    [use_default_link=default])
 +	    [use_default_link=no])
 dnl Additional build flags injection.
 AC_ARG_WITH([nonshared-cflags],
@@ -1378,59 +1378,6 @@ fi
 rm -f conftest*])
 AC_SUBST(libc_cv_hashstyle)
 -# The linker's default -shared behavior is good enough if it
 -# does these things that our custom linker scripts ensure that
 -# all allocated NOTE sections come first.
 -if test "$use_default_link" = default; then
 -  AC_CACHE_CHECK([for sufficient default -shared layout],
 -		  libc_cv_use_default_link, [dnl
 -  libc_cv_use_default_link=no
 -  cat > conftest.s <<\EOF
 -	  .section .note.a,"a",%note
 -	  .balign 4
 -	  .long 4,4,9
 -	  .string "GNU"
 -	  .string "foo"
 -	  .section .note.b,"a",%note
 -	  .balign 4
 -	  .long 4,4,9
 -	  .string "GNU"
 -	  .string "bar"
 -EOF
 -  if AC_TRY_COMMAND([dnl
 -  ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD]) &&
 -       ac_try=`$READELF -S conftest.so | sed -n \
 -	 ['${x;p;}
 -	  s/^ *\[ *[1-9][0-9]*\]  *\([^ ][^ ]*\)  *\([^ ][^ ]*\) .*$/\2 \1/
 -	  t a
 -	  b
 -	  : a
 -	  H']`
 -  then
 -    libc_seen_a=no libc_seen_b=no
 -    set -- $ac_try
 -    while test $# -ge 2 -a "$1" = NOTE; do
 -      case "$2" in
 -      .note.a) libc_seen_a=yes ;;
 -      .note.b) libc_seen_b=yes ;;
 -      esac
 -      shift 2
 -    done
 -    case "$libc_seen_a$libc_seen_b" in
 -    yesyes)
 -      libc_cv_use_default_link=yes
 -      ;;
 -    *)
 -      echo >&AS_MESSAGE_LOG_FD "\
 -$libc_seen_a$libc_seen_b from:
 -$ac_try"
 -      ;;
 -    esac
 -  fi
 -  rm -f conftest*])
 -  use_default_link=$libc_cv_use_default_link
 -fi
 -
 AC_CACHE_CHECK(for GLOB_DAT reloc,
 	       libc_cv_has_glob_dat, [dnl
 cat > conftest.c <<EOF
 diff --git a/elf/Makefile b/elf/Makefile
 index 8afbe3f6ab259331..fec6e23b5b625e3b 100644
 --- a/elf/Makefile
 +++ b/elf/Makefile
@@ -504,6 +504,40 @@ tests-execstack-yes = \
   # tests-execstack-yes
 endif
 endif
 +
 +tests-special += $(objpfx)tst-relro-ldso.out $(objpfx)tst-relro-libc.out
 +$(objpfx)tst-relro-ldso.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
 +  $(objpfx)ld.so
 +	$(PYTHON) tst-relro-symbols.py $(objpfx)ld.so \
 +	  --required=_rtld_global_ro \
 +	  > $@ 2>&1; $(evaluate-test)
 +# The optional symbols are present in libc only if the architecture has
 +# the GLIBC_2.0 symbol set in libc.
 +$(objpfx)tst-relro-libc.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
 +  $(common-objpfx)libc.so
 +	$(PYTHON) tst-relro-symbols.py $(common-objpfx)libc.so \
 +	    --required=_IO_cookie_jumps \
 +	    --required=_IO_file_jumps \
 +	    --required=_IO_file_jumps_maybe_mmap \
 +	    --required=_IO_file_jumps_mmap \
 +	    --required=_IO_helper_jumps \
 +	    --required=_IO_mem_jumps \
 +	    --required=_IO_obstack_jumps \
 +	    --required=_IO_proc_jumps \
 +	    --required=_IO_str_chk_jumps \
 +	    --required=_IO_str_jumps \
 +	    --required=_IO_strn_jumps \
 +	    --required=_IO_wfile_jumps \
 +	    --required=_IO_wfile_jumps_maybe_mmap \
 +	    --required=_IO_wfile_jumps_mmap \
 +	    --required=_IO_wmem_jumps \
 +	    --required=_IO_wstr_jumps \
 +	    --required=_IO_wstrn_jumps \
 +	    --optional=_IO_old_cookie_jumps \
 +	    --optional=_IO_old_file_jumps \
 +	    --optional=_IO_old_proc_jumps \
 +	  > $@ 2>&1; $(evaluate-test)
 +
 tests += $(tests-execstack-$(have-z-execstack))
 ifeq ($(run-built-tests),yes)
 tests-special += \
 diff --git a/elf/tst-relro-symbols.py b/elf/tst-relro-symbols.py
 new file mode 100644
 index 0000000000000000..368ea3349f86bd81
 --- /dev/null
 +++ b/elf/tst-relro-symbols.py
@@ -0,0 +1,137 @@
 +#!/usr/bin/python3
 +# Verify that certain symbols are covered by RELRO.
 +# Copyright (C) 2022 Free Software Foundation, Inc.
 +# This file is part of the GNU C Library.
 +#
 +# The GNU C Library is free software; you can redistribute it and/or
 +# modify it under the terms of the GNU Lesser General Public
 +# License as published by the Free Software Foundation; either
 +# version 2.1 of the License, or (at your option) any later version.
 +#
 +# The GNU C Library is distributed in the hope that it will be useful,
 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +# Lesser General Public License for more details.
 +#
 +# You should have received a copy of the GNU Lesser General Public
 +# License along with the GNU C Library; if not, see
 +# <https://www.gnu.org/licenses/>.
 +
 +"""Analyze a (shared) object to verify that certain symbols are
 +present and covered by the PT_GNU_RELRO segment.
 +
 +"""
 +
 +import argparse
 +import os.path
 +import sys
 +
 +# Make available glibc Python modules.
 +sys.path.append(os.path.join(
 +    os.path.dirname(os.path.realpath(__file__)), os.path.pardir, 'scripts'))
 +
 +import glibcelf
 +
 +def find_relro(path: str, img: glibcelf.Image) -> (int, int):
 +    """Discover the address range of the PT_GNU_RELRO segment."""
 +    for phdr in img.phdrs():
 +        if phdr.p_type == glibcelf.Pt.PT_GNU_RELRO:
 +            # The computation is not entirely accurate because
 +            # _dl_protect_relro in elf/dl-reloc.c rounds both the
 +            # start and end downwards using the run-time page size.
 +            return phdr.p_vaddr, phdr.p_vaddr + phdr.p_memsz
 +    sys.stdout.write('{}: error: no PT_GNU_RELRO segment\n'.format(path))
 +    sys.exit(1)
 +
 +def check_in_relro(kind, relro_begin, relro_end, name, start, size, error):
 +    """Call error(msg) unless [start, start + size) lies within RELRO."""
 +    end = start + size - 1  # Inclusive end; one-byte objects have end == start.
 +    if not (relro_begin <= start <= end < relro_end):
 +        error(
 +            '{} {!r} of size {} at 0x{:x} is not in RELRO range [0x{:x}, 0x{:x})'.format(
 +                kind, name.decode('UTF-8'), size, start,
 +                relro_begin, relro_end))
 +
 +def get_parser():
 +    """Return the argument parser; --required/--optional may be repeated."""
 +    parser = argparse.ArgumentParser(description=__doc__)
 +    parser.add_argument('object', help='path to object file to check')
 +    parser.add_argument('--required', metavar='NAME', default=[],
 +                        help='required symbol names', action='append')
 +    parser.add_argument('--optional', metavar='NAME', default=[],
 +                        help='optional symbol names', action='append')
 +    return parser
 +
 +def main(argv):
 +    """Check OBJECT; exit 1 on errors, 77 if it has no symbol table."""
 +    parser = get_parser()
 +    opts = parser.parse_args(argv)
 +    img = glibcelf.Image.readfile(opts.object)
 +
 +    required_symbols = frozenset([sym.encode('UTF-8')
 +                                  for sym in opts.required])
 +    optional_symbols = frozenset([sym.encode('UTF-8')
 +                                  for sym in opts.optional])
 +    check_symbols = required_symbols | optional_symbols
 +
 +    # Tracks the symbols in check_symbols that have been found.
 +    symbols_found = set()
 +
 +    # Discover the extent of the RELRO segment.
 +    relro_begin, relro_end = find_relro(opts.object, img)
 +    symbol_table_found = False
 +
 +    errors = False
 +    def error(msg: str) -> None:
 +        """Record an error condition and write a message to standard output."""
 +        nonlocal errors
 +        errors = True
 +        sys.stdout.write('{}: error: {}\n'.format(opts.object, msg))
 +
 +    # Walk the section headers: symbol tables and .data.rel.ro* sections.
 +    for shdr in img.shdrs():
 +        if shdr.sh_type == glibcelf.Sht.SHT_SYMTAB:
 +            symbol_table_found = True
 +            for sym in img.syms(shdr):
 +                if sym.st_name in check_symbols:
 +                    symbols_found.add(sym.st_name)
 +
 +                    # Validate symbol type, section, and size.
 +                    if sym.st_info.type != glibcelf.Stt.STT_OBJECT:
 +                        error('symbol {!r} has wrong type {}'.format(
 +                            sym.st_name.decode('UTF-8'), sym.st_info.type))
 +                    if sym.st_shndx in glibcelf.Shn:
 +                        error('symbol {!r} has reserved section {}'.format(
 +                            sym.st_name.decode('UTF-8'), sym.st_shndx))
 +                        continue
 +                    if sym.st_size == 0:
 +                        error('symbol {!r} has size zero'.format(
 +                            sym.st_name.decode('UTF-8')))
 +                        continue
 +
 +                    check_in_relro('symbol', relro_begin, relro_end,
 +                                   sym.st_name, sym.st_value, sym.st_size,
 +                                   error)
 +            continue # SHT_SYMTAB
 +        if shdr.sh_name == b'.data.rel.ro' \
 +           or shdr.sh_name.startswith(b'.data.rel.ro.'):
 +            check_in_relro('section', relro_begin, relro_end,
 +                           shdr.sh_name, shdr.sh_addr, shdr.sh_size,
 +                           error)
 +            continue
 +
 +    if required_symbols - symbols_found:
 +        for sym in sorted(required_symbols - symbols_found):
 +            error('symbol {!r} not found'.format(sym.decode('UTF-8')))
 +
 +    if errors:  # Tested before the stripped-object check, so errors win.
 +        sys.exit(1)
 +
 +    if not symbol_table_found:
 +        sys.stdout.write(
 +            '{}: warning: no symbol table found (stripped object)\n'.format(
 +                opts.object))
 +        sys.exit(77)
 +
 +if __name__ == '__main__':
 +    main(sys.argv[1:])
 diff --git a/manual/install.texi b/manual/install.texi
 index 816b77a0a25a88a7..36a5af62bc5722b0 100644
 --- a/manual/install.texi
 +++ b/manual/install.texi
@@ -117,6 +117,12 @@ problem and suppress these constructs, so that the library will still be
 usable, but functionality may be lost---for example, you can't build a
 shared libc with old binutils.
 +@item --with-default-link=@var{FLAG}
 +With @code{--with-default-link=yes}, the build system does not use a
 +custom linker script for linking shared objects.  The default for
 +@var{FLAG} is the opposite, @samp{no}, because the custom linker script
 +is needed for full RELRO protection.
 +
 @item --with-nonshared-cflags=@var{cflags}
 Use additional compiler flags @var{cflags} to build the parts of the
 library which are always statically linked into applications and
 diff --git a/sysdeps/unix/sysv/linux/ia64/Makefile b/sysdeps/unix/sysv/linux/ia64/Makefile
 index da85ba43e2d0ddef..c5cc41b3677d4a2a 100644
 --- a/sysdeps/unix/sysv/linux/ia64/Makefile
 +++ b/sysdeps/unix/sysv/linux/ia64/Makefile
@@ -1,3 +1,9 @@
 +ifeq ($(subdir),elf)
 +# ia64 does not support PT_GNU_RELRO.
 +test-xfail-tst-relro-ldso = yes
 +test-xfail-tst-relro-libc = yes
 +endif
 +
 ifeq ($(subdir),misc)
 sysdep_headers += sys/rse.h
 endif
--- a/glibc-upstream-2.34-169.patch
+++ b/glibc-upstream-2.34-169.patch
@ -0,0 +1,87 @@
 commit ca0faa140ff8cebe4c041d935f0f5eb480873d99
 Author: Joan Bruguera <joanbrugueram@gmail.com>
 Date:   Mon Apr 11 19:49:56 2022 +0200
    misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
    If `__glibc_objsize (__o) == (size_t) -1` (i.e. `__o` is unknown size), fortify
    checks should pass, and `__whatever_alias` should be called.
    Previously, `__glibc_objsize (__o) == (size_t) -1` was explicitly checked, but
    on commit a643f60c53876b, this was moved into `__glibc_safe_or_unknown_len`.
    A comment says the -1 case should work as: "The -1 check is redundant because
    since it implies that __glibc_safe_len_cond is true.". But this fails when:
    * `__s > 1`
    * `__osz == -1` (i.e. unknown size at compile time)
    * `__l` is big enough
    * `__l * __s <= __osz` can be folded to a constant
    (I only found this to be true for `mbsrtowcs` and other functions in wchar2.h)
    In this case `__l * __s <= __osz` is false, and `__whatever_chk_warn` will be
    called by `__glibc_fortify` or `__glibc_fortify_n` and crash the program.
    This commit adds the explicit `__osz == -1` check again.
    moc crashes on startup due to this, see: https://bugs.archlinux.org/task/74041
    Minimal test case (test.c):
        #include <wchar.h>
        int main (void)
        {
            const char *hw = "HelloWorld";
            mbsrtowcs (NULL, &hw, (size_t)-1, NULL);
            return 0;
        }
    Build with:
        gcc -O2 -Wp,-D_FORTIFY_SOURCE=2 test.c -o test && ./test
    Output:
        *** buffer overflow detected ***: terminated
    Fixes: BZ #29030
    Signed-off-by: Joan Bruguera <joanbrugueram@gmail.com>
    Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
    (cherry picked from commit 33e03f9cd2be4f2cd62f93fda539cc07d9c8130e)
 diff --git a/debug/tst-fortify.c b/debug/tst-fortify.c
 index 8b5902423cf0ad88..fb02452f5993c594 100644
 --- a/debug/tst-fortify.c
 +++ b/debug/tst-fortify.c
@@ -1505,6 +1505,11 @@ do_test (void)
       CHK_FAIL_END
 #endif
 +      /* Bug 29030 regression check */
 +      cp = "HelloWorld";
 +      if (mbsrtowcs (NULL, &cp, (size_t)-1, &s) != 10)
 +        FAIL ();
 +
       cp = "A";
       if (mbstowcs (wenough, cp, 10) != 1
 	  || wcscmp (wenough, L"A") != 0)
 diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
 index 515fb681a0547217..b36013b9a6b4d9c3 100644
 --- a/misc/sys/cdefs.h
 +++ b/misc/sys/cdefs.h
@@ -161,13 +161,13 @@
    || (__builtin_constant_p (__l) && (__l) > 0))
 /* Length is known to be safe at compile time if the __L * __S <= __OBJSZ
 -   condition can be folded to a constant and if it is true.  The -1 check is
 -   redundant because since it implies that __glibc_safe_len_cond is true.  */
 +   condition can be folded to a constant and if it is true, or unknown (-1) */
 #define __glibc_safe_or_unknown_len(__l, __s, __osz) \
 -  (__glibc_unsigned_or_positive (__l)					      \
 -   && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l),     \
 -						   __s, __osz))		      \
 -   && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), __s, __osz))
 +  ((__osz) == (__SIZE_TYPE__) -1					      \
 +   || (__glibc_unsigned_or_positive (__l)				      \
 +       && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
 +						       (__s), (__osz)))	      \
 +       && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), (__s), (__osz))))
 /* Conversely, we know at compile time that the length is unsafe if the
    __L * __S <= __OBJSZ condition can be folded to a constant and if it is
--- a/glibc-upstream-2.34-170.patch
+++ b/glibc-upstream-2.34-170.patch
@ -0,0 +1,49 @@
 commit 0d477e92c49db2906b32e44135b98746ccc73c7b
 Author: Florian Weimer <fweimer@redhat.com>
 Date:   Tue Apr 26 14:22:10 2022 +0200
    INSTALL: Rephrase -with-default-link documentation
    Reviewed-by: Carlos O'Donell <carlos@redhat.com>
    (cherry picked from commit c935789bdf40ba22b5698da869d3a4789797e09f)
 diff --git a/INSTALL b/INSTALL
 index 60d01568d77645c7..10a3dcdc0a8db665 100644
 --- a/INSTALL
 +++ b/INSTALL
@@ -90,10 +90,10 @@ if 'CFLAGS' is specified it must enable optimization.  For example:
      library will still be usable, but functionality may be lost--for
      example, you can't build a shared libc with old binutils.
 -'--with-default-link=FLAG'
 -     With '--with-default-link=yes', the build system does not use a
 -     custom linker script for linking shared objects.  The default for
 -     FLAG is the opposite, 'no', because the custom linker script is
 +'--with-default-link'
 +     With '--with-default-link', the build system does not use a custom
 +     linker script for linking shared objects.  The default is
 +     '--without-default-link', because the custom linker script is
      needed for full RELRO protection.
 '--with-nonshared-cflags=CFLAGS'
 diff --git a/manual/install.texi b/manual/install.texi
 index 36a5af62bc5722b0..8e34ff7e1847f3ae 100644
 --- a/manual/install.texi
 +++ b/manual/install.texi
@@ -117,11 +117,11 @@ problem and suppress these constructs, so that the library will still be
 usable, but functionality may be lost---for example, you can't build a
 shared libc with old binutils.
 -@item --with-default-link=@var{FLAG}
 -With @code{--with-default-link=yes}, the build system does not use a
 -custom linker script for linking shared objects.  The default for
 -@var{FLAG} is the opposite, @samp{no}, because the custom linker script
 -is needed for full RELRO protection.
 +@item --with-default-link
 +With @code{--with-default-link}, the build system does not use a custom
 +linker script for linking shared objects.  The default is
 +@code{--without-default-link}, because the custom linker script is
 +needed for full RELRO protection.
 @item --with-nonshared-cflags=@var{cflags}
 Use additional compiler flags @var{cflags} to build the parts of the
--- a/glibc-upstream-2.34-171.patch
+++ b/glibc-upstream-2.34-171.patch
@ -0,0 +1,377 @@
 commit bc56ab1f4aa937665034373d3e320d0779a839aa
 Author: Florian Weimer <fweimer@redhat.com>
 Date:   Tue Apr 26 14:23:02 2022 +0200
    dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
    When audit modules are loaded, ld.so initialization is not yet
    complete, and rtld_active () returns false even though ld.so is
    mostly working.  Instead, the static dlopen hook is used, but that
    does not work at all because this is not a static dlopen situation.
    Commit 466c1ea15f461edb8e3ffaf5d86d708876343bbf ("dlfcn: Rework
    static dlopen hooks") moved the hook pointer into _rtld_global_ro,
    which means that separate protection is not needed anymore and the
    hook pointer can be checked directly.
    The guard for disabling libio vtable hardening in _IO_vtable_check
    should stay for now.
    Fixes commit 8e1472d2c1e25e6eabc2059170731365f6d5b3d1 ("ld.so:
    Examine GLRO to detect inactive loader [BZ #20204]").
    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
    (cherry picked from commit 8dcb6d0af07fda3607b541857e4f3970a74ed55b)
 diff --git a/dlfcn/dladdr.c b/dlfcn/dladdr.c
 index 1cc305f0c46e7c3b..0d07ae1cd4dbb7a2 100644
 --- a/dlfcn/dladdr.c
 +++ b/dlfcn/dladdr.c
@@ -24,7 +24,7 @@ int
 __dladdr (const void *address, Dl_info *info)
 {
 #ifdef SHARED
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dladdr (address, info);
 #endif
   return _dl_addr (address, info, NULL, NULL);
 diff --git a/dlfcn/dladdr1.c b/dlfcn/dladdr1.c
 index 78560dbac208c316..93ce68c1d6067fe2 100644
 --- a/dlfcn/dladdr1.c
 +++ b/dlfcn/dladdr1.c
@@ -24,7 +24,7 @@ int
 __dladdr1 (const void *address, Dl_info *info, void **extra, int flags)
 {
 #ifdef SHARED
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dladdr1 (address, info, extra, flags);
 #endif
 diff --git a/dlfcn/dlclose.c b/dlfcn/dlclose.c
 index 6a013a81bb648191..07ecb21bf7d43be4 100644
 --- a/dlfcn/dlclose.c
 +++ b/dlfcn/dlclose.c
@@ -24,7 +24,7 @@ int
 __dlclose (void *handle)
 {
 #ifdef SHARED
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlclose (handle);
 #endif
 diff --git a/dlfcn/dlerror.c b/dlfcn/dlerror.c
 index 5047b140662bc33e..63da79c63000eef0 100644
 --- a/dlfcn/dlerror.c
 +++ b/dlfcn/dlerror.c
@@ -32,7 +32,7 @@ char *
 __dlerror (void)
 {
 # ifdef SHARED
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlerror ();
 # endif
 diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c
 index c6f9a1da09ff8622..47d2daa96fa5986f 100644
 --- a/dlfcn/dlinfo.c
 +++ b/dlfcn/dlinfo.c
@@ -89,7 +89,7 @@ dlinfo_implementation (void *handle, int request, void *arg)
 int
 ___dlinfo (void *handle, int request, void *arg)
 {
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlinfo (handle, request, arg);
   else
     return dlinfo_implementation (handle, request, arg);
 diff --git a/dlfcn/dlmopen.c b/dlfcn/dlmopen.c
 index c171c8953da20fc7..2309224eb8484b1a 100644
 --- a/dlfcn/dlmopen.c
 +++ b/dlfcn/dlmopen.c
@@ -80,7 +80,7 @@ dlmopen_implementation (Lmid_t nsid, const char *file, int mode,
 void *
 ___dlmopen (Lmid_t nsid, const char *file, int mode)
 {
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlmopen (nsid, file, mode, RETURN_ADDRESS (0));
   else
     return dlmopen_implementation (nsid, file, mode, RETURN_ADDRESS (0));
 diff --git a/dlfcn/dlopen.c b/dlfcn/dlopen.c
 index e04b374b82b04337..9c59c751c4eaf7a7 100644
 --- a/dlfcn/dlopen.c
 +++ b/dlfcn/dlopen.c
@@ -75,7 +75,7 @@ dlopen_implementation (const char *file, int mode, void *dl_caller)
 void *
 ___dlopen (const char *file, int mode)
 {
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
   else
     return dlopen_implementation (file, mode, RETURN_ADDRESS (0));
 diff --git a/dlfcn/dlopenold.c b/dlfcn/dlopenold.c
 index 9115501ac121eeca..c2f2a42194d50953 100644
 --- a/dlfcn/dlopenold.c
 +++ b/dlfcn/dlopenold.c
@@ -70,7 +70,7 @@ __dlopen_nocheck (const char *file, int mode)
     mode |= RTLD_LAZY;
   args.mode = mode;
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
   return _dlerror_run (dlopen_doit, &args) ? NULL : args.new;
 diff --git a/dlfcn/dlsym.c b/dlfcn/dlsym.c
 index 43044cf7bb95801e..d3861170a7631d01 100644
 --- a/dlfcn/dlsym.c
 +++ b/dlfcn/dlsym.c
@@ -62,7 +62,7 @@ dlsym_implementation (void *handle, const char *name, void *dl_caller)
 void *
 ___dlsym (void *handle, const char *name)
 {
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlsym (handle, name, RETURN_ADDRESS (0));
   else
     return dlsym_implementation (handle, name, RETURN_ADDRESS (0));
 diff --git a/dlfcn/dlvsym.c b/dlfcn/dlvsym.c
 index 9b76f9afa513e11f..3af02109c306b800 100644
 --- a/dlfcn/dlvsym.c
 +++ b/dlfcn/dlvsym.c
@@ -65,7 +65,7 @@ dlvsym_implementation (void *handle, const char *name, const char *version,
 void *
 ___dlvsym (void *handle, const char *name, const char *version)
 {
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlvsym (handle, name, version,
 					 RETURN_ADDRESS (0));
   else
 diff --git a/elf/Makefile b/elf/Makefile
 index fec6e23b5b625e3b..c89a6a58690646ee 100644
 --- a/elf/Makefile
 +++ b/elf/Makefile
@@ -376,6 +376,7 @@ tests += \
   tst-audit24d \
   tst-audit25a \
   tst-audit25b \
 +  tst-audit26 \
   tst-auditmany \
   tst-auxobj \
   tst-auxobj-dlopen \
@@ -721,6 +722,7 @@ modules-names = \
   tst-auditmod24c \
   tst-auditmod24d \
   tst-auditmod25 \
 +  tst-auditmod26 \
   tst-auxvalmod \
   tst-big-note-lib \
   tst-deep1mod1 \
@@ -2194,6 +2196,10 @@ $(objpfx)tst-audit25b: $(objpfx)tst-audit25mod1.so \
 LDFLAGS-tst-audit25b = -Wl,-z,now
 tst-audit25b-ARGS = -- $(host-test-program-cmd)
 +$(objpfx)tst-audit26.out: $(objpfx)tst-auditmod26.so
 +$(objpfx)tst-auditmod26.so: $(libsupport)
 +tst-audit26-ENV = LD_AUDIT=$(objpfx)tst-auditmod26.so
 +
 # tst-sonamemove links against an older implementation of the library.
 LDFLAGS-tst-sonamemove-linkmod1.so = \
   -Wl,--version-script=tst-sonamemove-linkmod1.map \
 diff --git a/elf/dl-libc.c b/elf/dl-libc.c
 index d5bc4a277f4c6ef3..db4342a3256921f0 100644
 --- a/elf/dl-libc.c
 +++ b/elf/dl-libc.c
@@ -157,7 +157,7 @@ __libc_dlopen_mode (const char *name, int mode)
   args.caller_dlopen = RETURN_ADDRESS (0);
 #ifdef SHARED
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->libc_dlopen_mode (name, mode);
 #endif
   return dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map;
@@ -185,7 +185,7 @@ __libc_dlsym (void *map, const char *name)
   args.name = name;
 #ifdef SHARED
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->libc_dlsym (map, name);
 #endif
   return (dlerror_run (do_dlsym, &args) ? NULL
@@ -199,7 +199,7 @@ void *
 __libc_dlvsym (void *map, const char *name, const char *version)
 {
 #ifdef SHARED
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->libc_dlvsym (map, name, version);
 #endif
@@ -222,7 +222,7 @@ int
 __libc_dlclose (void *map)
 {
 #ifdef SHARED
 -  if (!rtld_active ())
 +  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->libc_dlclose (map);
 #endif
   return dlerror_run (do_dlclose, map);
 diff --git a/elf/tst-audit26.c b/elf/tst-audit26.c
 new file mode 100644
 index 0000000000000000..3f920e83bac247a5
 --- /dev/null
 +++ b/elf/tst-audit26.c
@@ -0,0 +1,35 @@
 +/* Check the usability of <dlfcn.h> functions in audit modules.
 +   Copyright (C) 2022 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <gnu/lib-names.h>
 +
 +#include <support/check.h>
 +#include <support/xdlfcn.h>
 +
 +static int
 +do_test (void)
 +{
 +  /* Check that the audit module has been loaded.  */
 +  void *handle = xdlopen ("mapped to libc", RTLD_LOCAL | RTLD_NOW);
 +  TEST_VERIFY (handle
 +	       == xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD));
 +
 +  return 0;
 +}
 +
 +#include <support/test-driver.c>
 diff --git a/elf/tst-auditmod26.c b/elf/tst-auditmod26.c
 new file mode 100644
 index 0000000000000000..db7ba95abec20f53
 --- /dev/null
 +++ b/elf/tst-auditmod26.c
@@ -0,0 +1,104 @@
 +/* Check the usability of <dlfcn.h> functions in audit modules.  Audit module.
 +   Copyright (C) 2022 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <dlfcn.h>
 +#include <first-versions.h>
 +#include <gnu/lib-names.h>
 +#include <link.h>
 +#include <stdio.h>
 +#include <string.h>
 +#include <unistd.h>
 +
 +#include <support/check.h>
 +#include <support/xdlfcn.h>
 +
 +unsigned int
 +la_version (unsigned int current)
 +{
 +  /* Exercise various <dlfcn.h> functions.  This audit hook is used
 +     as the place to call dlopen, dlsym, dlvsym, dlclose, dlerror,
 +     dlinfo, dladdr, dladdr1 and dlmopen from inside an audit module.
 +     CURRENT is not examined.  The function returns LAV_CURRENT, or
 +     terminates the process with status 1 if any of the checks below
 +     has recorded a failure.  */
 +
 +  /* Check dlopen, dlsym, dlclose.  Looking up the existing symbol
 +     "sincos" must succeed; looking up the nonexistent "SINCOS" must
 +     fail and leave a message for dlerror.  The failing lookup is
 +     then repeated before the handle is closed, after which dlerror
 +     is expected to return NULL (presumably to check that the pending
 +     error does not survive a successful dlclose -- TODO confirm).
 +     NOTE(review): MESSAGE is passed to strstr without a NULL
 +     check.  */
 +  void *handle = xdlopen (LIBM_SO, RTLD_LOCAL | RTLD_NOW);
 +  void *ptr = xdlsym (handle, "sincos");
 +  TEST_VERIFY (ptr != NULL);
 +  ptr = dlsym (handle, "SINCOS");
 +  TEST_VERIFY (ptr == NULL);
 +  const char *message = dlerror ();
 +  TEST_VERIFY (strstr (message, ": undefined symbol: SINCOS") != NULL);
 +  ptr = dlsym (handle, "SINCOS");
 +  TEST_VERIFY (ptr == NULL);
 +  xdlclose (handle);
 +  TEST_COMPARE_STRING (dlerror (), NULL);
 +
 +  handle = xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD);
 +
 +  /* Check dlvsym.  _exit is unlikely to gain another symbol
 +     version.  The lookups use the libc handle obtained just above
 +     with RTLD_NOLOAD; the unversioned and the versioned lookup are
 +     expected to return the same address.  */
 +  TEST_VERIFY (xdlsym (handle, "_exit")
 +               == xdlvsym (handle, "_exit", FIRST_VERSION_libc__exit_STRING));
 +
 +  /* Check dlinfo.  The value stored for RTLD_DI_LINKMAP is expected
 +     to be identical to the dlopen handle itself.  */
 +  {
 +    void *handle2 = NULL;
 +    TEST_COMPARE (dlinfo (handle, RTLD_DI_LINKMAP, &handle2), 0);
 +    TEST_VERIFY (handle2 == handle);
 +  }
 +
 +  /* Check dladdr and dladdr1.  Both are queried with the address of
 +     _exit.  For dladdr1 with RTLD_DL_LINKMAP, the value stored in
 +     EXTRA_INFO is expected to equal the libc handle.  */
 +  Dl_info info = { };
 +  TEST_VERIFY (dladdr (&_exit, &info) != 0);
 +  if (strcmp (info.dli_sname, "_Exit") != 0) /* _Exit is an alias.  */
 +    TEST_COMPARE_STRING (info.dli_sname, "_exit");
 +  TEST_VERIFY (info.dli_saddr == &_exit);
 +  TEST_VERIFY (strstr (info.dli_fname, LIBC_SO));
 +  void *extra_info;
 +  memset (&info, 0, sizeof (info));
 +  TEST_VERIFY (dladdr1 (&_exit, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
 +  TEST_VERIFY (extra_info == handle);
 +
 +  /* Verify that dlmopen creates a new namespace.  The handle returned
 +     for LM_ID_NEWLM must differ from the libc handle above, and the
 +     _exit found through it must be attributed by dladdr1 to the new
 +     handle rather than to the original libc.  */
 +  void *dlmopen_handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW);
 +  TEST_VERIFY (dlmopen_handle != handle);
 +  memset (&info, 0, sizeof (info));
 +  extra_info = NULL;
 +  ptr = xdlsym (dlmopen_handle, "_exit");
 +  TEST_VERIFY (dladdr1 (ptr, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
 +  TEST_VERIFY (extra_info == dlmopen_handle);
 +  xdlclose (dlmopen_handle);
 +
 +  /* Terminate the process with an error state.  This does not happen
 +     automatically because the audit module state is not shared with
 +     the main program.  The standard streams are flushed explicitly
 +     first because _exit is called directly afterwards.  */
 +  if (support_record_failure_is_failed ())
 +    {
 +      fflush (stdout);
 +      fflush (stderr);
 +      _exit (1);
 +    }
 +
 +  return LAV_CURRENT;
 +}
 +
 +char *
 +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
 +{
 +  if (strcmp (name, "mapped to libc") == 0) /* Placeholder name.  */
 +    return (char *) LIBC_SO; /* Redirect the placeholder to libc.  */
 +  else
 +    return (char *) name; /* Any other NAME is returned unchanged;
 +                             COOKIE and FLAG are not examined.  */
 +}
--- a/glibc-upstream-2.34-172.patch
+++ b/glibc-upstream-2.34-172.patch
@ -0,0 +1,28 @@
 commit 83cc145830bdbefdabe03787ed884d548bea9c99
 Author: Florian Weimer <fweimer@redhat.com>
 Date:   Fri Apr 22 19:34:52 2022 +0200
    scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
    enum.IntFlag and enum.EnumMeta._missing_ support are not part of
    earlier Python versions.
    (cherry picked from commit b571f3adffdcbed23f35ea39b0ca43809dbb4f5b)
 diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py
 index 8f7d0ca184845714..da0d5380f33a195e 100644
 --- a/scripts/glibcelf.py
 +++ b/scripts/glibcelf.py
@@ -28,6 +28,12 @@ import collections
 import enum
 import struct
 +if not hasattr(enum, 'IntFlag'):
 +    import sys
 +    sys.stdout.write(
 +        'warning: glibcelf.py needs Python 3.6 for enum support\n')
 +    sys.exit(77)
 +
 class _OpenIntEnum(enum.IntEnum):
     """Integer enumeration that supports arbitrary int values."""
     @classmethod
--- a/glibc-upstream-2.34-173.patch
+++ b/glibc-upstream-2.34-173.patch
@ -0,0 +1,254 @@
 commit 16245986fb9bfe396113fc7dfd1929f69a9e748e
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Fri Aug 20 06:42:24 2021 -0700
    x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
    Optimize loads of all bits set into ZMM register in AVX512 SVML codes
    by replacing
            vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
    and
            vmovups   .L_2il0floatpacket.13(%rip), %zmmX
    with
            vpternlogd $0xff, %zmmX, %zmmX, %zmmX
    This fixes BZ #28252.
    (cherry picked from commit 78c9ec9000f873abe7a15a91b87080a2e4308260)
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
 index e68fcdbb16a79f36..58e588a3d42a8bc9 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
         vmovaps   %zmm0, %zmm8
 /* Check for large arguments path */
 -        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
 +        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
 /*
   ARGUMENT RANGE REDUCTION:
@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN8v_cos_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.16:
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.16,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
 index dfa2acafc486b56b..f5f117d474f66176 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
 /* preserve mantissa, set input exponent to 2^(-10) */
         vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
 -        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
 +        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
         vpsrlq    $32, %zmm4, %zmm6
 /* reciprocal approximation good to at least 11 bits */
@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN8v_log_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.12:
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.12,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
 index be8ab7c6e0e33819..48d251db16ccab9d 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
         andq      $-64, %rsp
         subq      $1280, %rsp
         movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
 -        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
 +        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
         vmovups __dAbsMask(%rax), %zmm7
         vmovups __dInvPI(%rax), %zmm2
         vmovups __dRShifter(%rax), %zmm1
@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN8v_sin_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.14:
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.14,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
 index 611887082a545854..a4944a4feef6aa98 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
 /* SinPoly = SinR*SinPoly */
         vfmadd213pd %zmm5, %zmm5, %zmm4
 -        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
 +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
 /* Update Cos result's sign */
         vxorpd    %zmm2, %zmm1, %zmm1
@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
 ENTRY (_ZGVeN8vvv_sincos_skx)
 WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
 END (_ZGVeN8vvv_sincos_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.15:
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.15,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
 index f671d60d5dab5a0e..fe8474fed943e8ad 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
   X = X - Y*PI1 - Y*PI2 - Y*PI3
  */
         vmovaps   %zmm0, %zmm6
 -        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
 +        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
         vmovups __sRShifter(%rax), %zmm3
         vmovups __sPI1_FMA(%rax), %zmm5
         vmovups __sA9_FMA(%rax), %zmm9
@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN16v_cosf_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.13:
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.13,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
 index 637bfe3c06ab9ad4..229b7828cde04db2 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
         vmovaps   %zmm0, %zmm7
 /* compare against threshold */
 -        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
 +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
         vmovups __sInvLn2(%rax), %zmm4
         vmovups __sShifter(%rax), %zmm1
         vmovups __sLn2hi(%rax), %zmm6
@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
 #endif
 END (_ZGVeN16v_expf_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.13:
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.13,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
 index 9d790fbf0ad6c8ec..fa2aae986f543582 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
         andq      $-64, %rsp
         subq      $1280, %rsp
         movq      __svml_slog_data@GOTPCREL(%rip), %rax
 -        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
 +        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
         vmovups _iBrkValue(%rax), %zmm4
         vmovups _sPoly_7(%rax), %zmm8
@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
 #endif
 END (_ZGVeN16v_logf_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.7:
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.7,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
 index c5c43c46ff7af5a3..6aea2a4f11d1f85f 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
         vpsrlq    $32, %zmm3, %zmm2
         vpmovqd   %zmm2, %ymm11
         vcvtps2pd %ymm14, %zmm13
 -        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
 +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
         vmovaps   %zmm14, %zmm26
         vpandd _ABSMASK(%rax), %zmm1, %zmm8
         vpcmpd    $1, _INF(%rax), %zmm8, %k2
@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
         vpmovqd   %zmm11, %ymm5
         vpxord    %zmm10, %zmm10, %zmm10
         vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
 -        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
 +        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
         vpxord    %zmm11, %zmm11, %zmm11
         vcvtdq2pd %ymm7, %zmm7
         vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN16vv_powf_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.23:
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.23,@object
 -.L_2il0floatpacket.24:
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.24,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
 index 9cf359c86ff9bd70..a446c504f63c9399 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
 /* Result sign calculations */
         vpternlogd $150, %zmm0, %zmm14, %zmm1
 -        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
 +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
 /* Add correction term 0.5 for cos() part */
         vaddps    %zmm8, %zmm5, %zmm15
@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
 ENTRY (_ZGVeN16vvv_sincosf_skx)
 WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
 END (_ZGVeN16vvv_sincosf_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.13:
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.13,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
 index bd05109a62181f22..c1b352d0ad1992cd 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
 /* Check for large and special values */
 -        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
 +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
         vmovups __sAbsMask(%rax), %zmm5
         vmovups __sInvPI(%rax), %zmm1
         vmovups __sRShifter(%rax), %zmm2
@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN16v_sinf_skx)
 -
 -	.section .rodata, "a"
 -.L_2il0floatpacket.11:
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.11,@object
--- a/glibc-upstream-2.34-174.patch
+++ b/glibc-upstream-2.34-174.patch
@ -0,0 +1,42 @@
 commit b5a44a6a471aafd3677659a610f32468c40a666b
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Tue Sep 21 18:31:49 2021 -0500
    x86: Modify ENTRY in sysdep.h so that p2align can be specified
    No bug.
    This change adds a new macro ENTRY_P2ALIGN which takes a second
    argument, log2 of the desired function alignment.
    The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
    doesn't affect any existing functionality.
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    (cherry picked from commit fc5bd179ef3a953dff8d1655bd530d0e230ffe71)
 diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
 index cac1d762fb3f99d0..937180c1bd791570 100644
 --- a/sysdeps/x86/sysdep.h
 +++ b/sysdeps/x86/sysdep.h
@@ -78,15 +78,18 @@ enum cf_protection_level
 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
 /* Define an entry point visible from C.  */
 -#define	ENTRY(name)							      \
 +#define	ENTRY_P2ALIGN(name, alignment)					      \
   .globl C_SYMBOL_NAME(name);						      \
   .type C_SYMBOL_NAME(name),@function;					      \
 -  .align ALIGNARG(4);							      \
 +  .align ALIGNARG(alignment);						      \
   C_LABEL(name)								      \
   cfi_startproc;							      \
   _CET_ENDBR;								      \
   CALL_MCOUNT
 +/* Common entry point: aligned to 16 bytes (2^4).  */
 +#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
 +
 #undef	END
 #define END(name)							      \
   cfi_endproc;								      \
--- a/glibc-upstream-2.34-175.patch
+++ b/glibc-upstream-2.34-175.patch
@ -0,0 +1,653 @@
 commit 5ec3416853c4150c4d13312e05f93a053586d528
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Tue Sep 21 18:45:03 2021 -0500
    x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
    No bug.
    The frontend optimizations are to:
    1. Reorganize logically connected basic blocks so they are either in
       the same cache line or adjacent cache lines.
    2. Avoid cases when basic blocks unnecessarily cross cache lines.
    3. Try and 32 byte align any basic blocks possible without sacrificing
       code size. Smaller / Less hot basic blocks are used for this.
    Overall code size shrunk by 168 bytes. This should make up for any
    extra costs due to aligning to 64 bytes.
    In general performance before deviated a great deal depending on
    whether entry alignment % 64 was 0, 16, 32, or 48. These changes
    essentially make it so that the current implementation is at least
    equal to the best alignment of the original for any arguments.
    The only additional optimization is in the page cross case. Branch on
    equals case was removed from the size == [4, 7] case. As well the [4,
    7] and [2, 3] cases were swapped as [4, 7] is likely a more hot
    argument size.
    test-memcmp and test-wmemcmp are both passing.
    (cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
 diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
       area.
    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
 -   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
 +   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
 +
 +When possible the implementation tries to optimize for frontend in the
 +following ways:
 +Throughput:
 +    1. All code sections that fit are able to run optimally out of the
 +       LSD.
 +    2. All code sections that fit are able to run optimally out of the
 +       DSB
 +    3. Basic blocks are contained in minimum number of fetch blocks
 +       necessary.
 +
 +Latency:
 +    1. Logically connected basic blocks are put in the same
 +       cache-line.
 +    2. Logically connected basic blocks that do not fit in the same
 +       cache-line are put in adjacent lines. This can get beneficial
 +       L2 spatial prefetching and L1 next-line prefetching.  */
 # include <sysdep.h>
@@ -47,9 +64,11 @@
 # ifdef USE_AS_WMEMCMP
 #  define CHAR_SIZE	4
 #  define VPCMP	vpcmpd
 +#  define VPTEST	vptestmd
 # else
 #  define CHAR_SIZE	1
 #  define VPCMP	vpcmpub
 +#  define VPTEST	vptestmb
 # endif
 # define VEC_SIZE	32
@@ -75,7 +94,9 @@
 */
 	.section .text.evex,"ax",@progbits
 -ENTRY (MEMCMP)
 +/* Cache align memcmp entry. This allows for much more thorough
 +   frontend optimization.  */
 +ENTRY_P2ALIGN (MEMCMP, 6)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
 	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
 -	   L(return_vec_[0,2]). For L(return_vec_3 destination register
 +	   L(return_vec_[0,2]). For L(return_vec_3) destination register
 	   must be ecx.  */
 	testl	%eax, %eax
 	jnz	L(return_vec_0)
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
 -	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
 -	   compare with zero to get a mask is needed.  */
 -	vpxorq	%XMM0, %XMM0, %XMM0
 -
 	/* Go to 4x VEC loop.  */
 	cmpq	$(CHAR_PER_VEC * 8), %rdx
 	ja	L(more_8x_vec)
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
 -	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
 -	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
 -	   oring with YMM3. Result is stored in YMM4.  */
 -	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
 -	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
 -	VPCMP	$4, %YMM4, %YMM0, %k1
 +	   oring with YMM1. Result is stored in YMM4.  */
 +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 +
 +	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
 +	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 +
 +	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
 +	 */
 +	VPTEST	%YMM4, %YMM4, %k1
 +	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 -	/* NB: aligning 32 here allows for the rest of the jump targets
 -	   to be tuned for 32 byte alignment. Most important this ensures
 -	   the L(more_8x_vec) loop is 32 byte aligned.  */
 -	.p2align 5
 -L(less_vec):
 -	/* Check if one or less CHAR. This is necessary for size = 0 but
 -	   is also faster for size = CHAR_SIZE.  */
 -	cmpl	$1, %edx
 -	jbe	L(one_or_less)
 +	.p2align 4
 +L(8x_end_return_vec_0_1_2_3):
 +	movq	%rdx, %rdi
 +L(8x_return_vec_0_1_2_3):
 +	addq	%rdi, %rsi
 +L(return_vec_0_1_2_3):
 +	VPTEST	%YMM1, %YMM1, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_0)
 -	/* Check if loading one VEC from either s1 or s2 could cause a
 -	   page cross. This can have false positives but is by far the
 -	   fastest method.  */
 -	movl	%edi, %eax
 -	orl	%esi, %eax
 -	andl	$(PAGE_SIZE - 1), %eax
 -	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 -	jg	L(page_cross_less_vec)
 +	VPTEST	%YMM2, %YMM2, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_1)
 -	/* No page cross possible.  */
 -	VMOVU	(%rsi), %YMM2
 -	VPCMP	$4, (%rdi), %YMM2, %k1
 -	kmovd	%k1, %eax
 -	/* Create mask in ecx for potentially in bound matches.  */
 -	bzhil	%edx, %eax, %eax
 -	jnz	L(return_vec_0)
 +	VPTEST	%YMM3, %YMM3, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_2)
 +L(return_vec_3):
 +	/* bsf saves 1 byte from tzcnt. This keeps L(return_vec_3) in one
 +	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
 +	   line.  */
 +	bsfl	%ecx, %ecx
 +# ifdef USE_AS_WMEMCMP
 +	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
 +	xorl	%edx, %edx
 +	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 +	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
 +	subl	%ecx, %eax
 +# endif
 	ret
 	.p2align 4
@@ -209,10 +240,11 @@ L(return_vec_0):
 # endif
 	ret
 -	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
 -	   which is good enough for a target not in a loop.  */
 +	.p2align 4
 L(return_vec_1):
 -	tzcntl	%eax, %eax
 +	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
 +	   fetch block.  */
 +	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -226,10 +258,11 @@ L(return_vec_1):
 # endif
 	ret
 -	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
 -	   which is good enough for a target not in a loop.  */
 +	.p2align 4,, 10
 L(return_vec_2):
 -	tzcntl	%eax, %eax
 +	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
 +	   fetch block.  */
 +	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -243,40 +276,6 @@ L(return_vec_2):
 # endif
 	ret
 -	.p2align 4
 -L(8x_return_vec_0_1_2_3):
 -	/* Returning from L(more_8x_vec) requires restoring rsi.  */
 -	addq	%rdi, %rsi
 -L(return_vec_0_1_2_3):
 -	VPCMP	$4, %YMM1, %YMM0, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(return_vec_0)
 -
 -	VPCMP	$4, %YMM2, %YMM0, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(return_vec_1)
 -
 -	VPCMP	$4, %YMM3, %YMM0, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(return_vec_2)
 -L(return_vec_3):
 -	tzcntl	%ecx, %ecx
 -# ifdef USE_AS_WMEMCMP
 -	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
 -	xorl	%edx, %edx
 -	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
 -	setg	%dl
 -	leal	-1(%rdx, %rdx), %eax
 -# else
 -	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 -	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
 -	subl	%ecx, %eax
 -# endif
 -	ret
 -
 	.p2align 4
 L(more_8x_vec):
 	/* Set end of s1 in rdx.  */
@@ -288,21 +287,19 @@ L(more_8x_vec):
 	andq	$-VEC_SIZE, %rdi
 	/* Adjust because first 4x vec where check already.  */
 	subq	$-(VEC_SIZE * 4), %rdi
 +
 	.p2align 4
 L(loop_4x_vec):
 	VMOVU	(%rsi, %rdi), %YMM1
 	vpxorq	(%rdi), %YMM1, %YMM1
 -
 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
 -
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
 -	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 -
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
 -	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
 -	VPCMP	$4, %YMM4, %YMM0, %k1
 +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 +	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 +	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(8x_return_vec_0_1_2_3)
@@ -319,28 +316,25 @@ L(loop_4x_vec):
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 +	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
 +
 	VMOVU	(%rsi, %rdx), %YMM1
 	vpxorq	(%rdx), %YMM1, %YMM1
 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
 -
 -	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
 -	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 -
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
 -	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
 -	VPCMP	$4, %YMM4, %YMM0, %k1
 +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
 +	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 +	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 -	/* Restore s1 pointer to rdi.  */
 -	movq	%rdx, %rdi
 	testl	%ecx, %ecx
 -	jnz	L(8x_return_vec_0_1_2_3)
 +	jnz	L(8x_end_return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 	/* Only entry is from L(more_8x_vec).  */
 -	.p2align 4
 +	.p2align 4,, 10
 L(8x_last_2x_vec):
 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 	kmovd	%k1, %eax
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
 	jnz	L(8x_return_vec_3)
 	ret
 -	.p2align 4
 +	/* Not ideally aligned (at offset +9 bytes in fetch block) but
 +	   not aligning keeps it in the same cache line as
 +	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
 +	   size.  */
 +	.p2align 4,, 4
 +L(8x_return_vec_2):
 +	subq	$VEC_SIZE, %rdx
 +L(8x_return_vec_3):
 +	bsfl	%eax, %eax
 +# ifdef USE_AS_WMEMCMP
 +	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 +	movl	(VEC_SIZE * 3)(%rax), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	addq	%rdx, %rax
 +	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 +	movzbl	(VEC_SIZE * 3)(%rax), %eax
 +	subl	%ecx, %eax
 +# endif
 +	ret
 +
 +	.p2align 4,, 10
 L(last_2x_vec):
 	/* Check second to last VEC.  */
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
@@ -374,26 +392,49 @@ L(last_1x_vec):
 	jnz	L(return_vec_0_end)
 	ret
 -	.p2align 4
 -L(8x_return_vec_2):
 -	subq	$VEC_SIZE, %rdx
 -L(8x_return_vec_3):
 -	tzcntl	%eax, %eax
 +	.p2align 4,, 10
 +L(return_vec_1_end):
 +	/* Use bsf to save code size. This is necessary to have
 +	   L(one_or_less) fit in aligning bytes between.  */
 +	bsfl	%eax, %eax
 +	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
 -	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 -	movl	(VEC_SIZE * 3)(%rax), %ecx
 +	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
 -	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 +	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
 -	addq	%rdx, %rax
 -	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 -	movzbl	(VEC_SIZE * 3)(%rax), %eax
 +	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 +	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 +	/* NB: L(one_or_less) fits in alignment padding between
 +	   L(return_vec_1_end) and L(return_vec_0_end).  */
 +# ifdef USE_AS_WMEMCMP
 +L(one_or_less):
 +	jb	L(zero)
 +	movl	(%rdi), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(%rsi), %ecx
 +	je	L(zero)
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +	ret
 +# else
 +L(one_or_less):
 +	jb	L(zero)
 +	movzbl	(%rsi), %ecx
 +	movzbl	(%rdi), %eax
 +	subl	%ecx, %eax
 +	ret
 +# endif
 +L(zero):
 +	xorl	%eax, %eax
 +	ret
 +
 	.p2align 4
 L(return_vec_0_end):
 	tzcntl	%eax, %eax
@@ -412,23 +453,56 @@ L(return_vec_0_end):
 	ret
 	.p2align 4
 -L(return_vec_1_end):
 +L(less_vec):
 +	/* Check if one or less CHAR. This is necessary for size == 0
 +	   but is also faster for size == CHAR_SIZE.  */
 +	cmpl	$1, %edx
 +	jbe	L(one_or_less)
 +
 +	/* Check if loading one VEC from either s1 or s2 could cause a
 +	   page cross. This can have false positives but is by far the
 +	   fastest method.  */
 +	movl	%edi, %eax
 +	orl	%esi, %eax
 +	andl	$(PAGE_SIZE - 1), %eax
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 +	jg	L(page_cross_less_vec)
 +
 +	/* No page cross possible.  */
 +	VMOVU	(%rsi), %YMM2
 +	VPCMP	$4, (%rdi), %YMM2, %k1
 +	kmovd	%k1, %eax
 +	/* Check if any matches were in bounds. Intentionally not
 +	   storing result in eax to limit dependency chain if it goes to
 +	   L(return_vec_0_lv).  */
 +	bzhil	%edx, %eax, %edx
 +	jnz	L(return_vec_0_lv)
 +	xorl	%eax, %eax
 +	ret
 +
 +	/* Essentially duplicate of L(return_vec_0). Ends up not costing
 +	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
 +	   the jump and ends up fitting in aligning bytes. As well fits on
 +	   same cache line as L(less_vec) so also saves a line from having
 +	   to be fetched on cold calls to memcmp.  */
 +	.p2align 4,, 4
 +L(return_vec_0_lv):
 	tzcntl	%eax, %eax
 -	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
 -	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 +	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
 -	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 +	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
 +	/* NB: no partial register stall here because xorl zero idiom
 +	   above.  */
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
 -	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 -	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 +	movzbl	(%rsi, %rax), %ecx
 +	movzbl	(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 -
 	.p2align 4
 L(page_cross_less_vec):
 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
 	cmpl	$8, %edx
 	jae	L(between_8_15)
 	cmpl	$4, %edx
 -	jae	L(between_4_7)
 -L(between_2_3):
 -	/* Load as big endian to avoid branches.  */
 -	movzwl	(%rdi), %eax
 -	movzwl	(%rsi), %ecx
 -	shll	$8, %eax
 -	shll	$8, %ecx
 -	bswap	%eax
 -	bswap	%ecx
 -	movzbl	-1(%rdi, %rdx), %edi
 -	movzbl	-1(%rsi, %rdx), %esi
 -	orl	%edi, %eax
 -	orl	%esi, %ecx
 -	/* Subtraction is okay because the upper 8 bits are zero.  */
 -	subl	%ecx, %eax
 -	ret
 -	.p2align 4
 -L(one_or_less):
 -	jb	L(zero)
 -	movzbl	(%rsi), %ecx
 -	movzbl	(%rdi), %eax
 -	subl	%ecx, %eax
 +	jb	L(between_2_3)
 +
 +	/* Load as big endian with overlapping movbe to avoid branches.
 +	 */
 +	movbe	(%rdi), %eax
 +	movbe	(%rsi), %ecx
 +	shlq	$32, %rax
 +	shlq	$32, %rcx
 +	movbe	-4(%rdi, %rdx), %edi
 +	movbe	-4(%rsi, %rdx), %esi
 +	orq	%rdi, %rax
 +	orq	%rsi, %rcx
 +	subq	%rcx, %rax
 +	/* edx is guranteed to be positive int32 in range [4, 7].  */
 +	cmovne	%edx, %eax
 +	/* ecx is -1 if rcx > rax. Otherwise 0.  */
 +	sbbl	%ecx, %ecx
 +	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
 +	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
 +	   eax doesn't matter.  */
 +	orl	%ecx, %eax
 	ret
 -	.p2align 4
 +	.p2align 4,, 8
 L(between_8_15):
 # endif
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 -	vmovq	(%rdi), %XMM1
 -	vmovq	(%rsi), %XMM2
 -	VPCMP	$4, %XMM1, %XMM2, %k1
 +	vmovq	(%rdi), %xmm1
 +	vmovq	(%rsi), %xmm2
 +	VPCMP	$4, %xmm1, %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 -	jnz	L(return_vec_0)
 +	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
 -	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
 -	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
 -	vmovq	(%rdi), %XMM1
 -	vmovq	(%rsi), %XMM2
 -	VPCMP	$4, %XMM1, %XMM2, %k1
 +	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
 +	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
 +	VPCMP	$4, %xmm1, %xmm2, %k1
 +	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 -	jnz	L(return_vec_0)
 -	ret
 -
 -	.p2align 4
 -L(zero):
 -	xorl	%eax, %eax
 +	jnz	L(return_vec_0_end)
 	ret
 -	.p2align 4
 +	.p2align 4,, 8
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 -	VMOVU	(%rsi), %XMM2
 -	VPCMP	$4, (%rdi), %XMM2, %k1
 +
 +	/* Use movups to save code size.  */
 +	movups	(%rsi), %xmm2
 +	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 -	jnz	L(return_vec_0)
 -
 +	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
 -
 -	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
 -	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
 -	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
 -	VPCMP	$4, (%rdi), %XMM2, %k1
 +	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 +	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
 +	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 -	jnz	L(return_vec_0)
 -	ret
 -
 -# ifdef USE_AS_WMEMCMP
 -	.p2align 4
 -L(one_or_less):
 -	jb	L(zero)
 -	movl	(%rdi), %ecx
 -	xorl	%edx, %edx
 -	cmpl	(%rsi), %ecx
 -	je	L(zero)
 -	setg	%dl
 -	leal	-1(%rdx, %rdx), %eax
 +	jnz	L(return_vec_0_end)
 	ret
 -# else
 -	.p2align 4
 -L(between_4_7):
 -	/* Load as big endian with overlapping movbe to avoid branches.
 -	 */
 -	movbe	(%rdi), %eax
 -	movbe	(%rsi), %ecx
 -	shlq	$32, %rax
 -	shlq	$32, %rcx
 -	movbe	-4(%rdi, %rdx), %edi
 -	movbe	-4(%rsi, %rdx), %esi
 -	orq	%rdi, %rax
 -	orq	%rsi, %rcx
 -	subq	%rcx, %rax
 -	jz	L(zero_4_7)
 -	sbbl	%eax, %eax
 -	orl	$1, %eax
 -L(zero_4_7):
 +# ifndef USE_AS_WMEMCMP
 +L(between_2_3):
 +	/* Load as big endian to avoid branches.  */
 +	movzwl	(%rdi), %eax
 +	movzwl	(%rsi), %ecx
 +	shll	$8, %eax
 +	shll	$8, %ecx
 +	bswap	%eax
 +	bswap	%ecx
 +	movzbl	-1(%rdi, %rdx), %edi
 +	movzbl	-1(%rsi, %rdx), %esi
 +	orl	%edi, %eax
 +	orl	%esi, %ecx
 +	/* Subtraction is okay because the upper 8 bits are zero.  */
 +	subl	%ecx, %eax
 	ret
 # endif
 -
 END (MEMCMP)
 #endif
--- a/glibc-upstream-2.34-176.patch
+++ b/glibc-upstream-2.34-176.patch
@ -0,0 +1,497 @@
 commit 6d18a93dbbde2958001d65dff3080beed7ae675a
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Mon Sep 20 16:20:15 2021 -0500
    x86: Optimize memset-vec-unaligned-erms.S
    No bug.
    Optimizations are
    1. change control flow for L(more_2x_vec) to fall through to loop and
       jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
       size and saves jumps for length > 4x VEC_SIZE.
    2. For EVEX/AVX512 move L(less_vec) closer to entry.
    3. Avoid complex address mode for length > 2x VEC_SIZE
    4. Slightly better aligning code for the loop from the perspective of
       code size and uops.
    5. Align targets so they make full use of their fetch block and if
       possible cache line.
    6. Try and reduce total number of icache lines that will need to be
       pulled in for a given length.
    7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
       jumping to the stosb target in the sse2 code section will almost
       certainly be to a new page. The new version does increase code size
       marginally by duplicating the target but should get better iTLB
       behavior as a result.
    test-memset, test-wmemset, and test-bzero are all passing.
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e)
 diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
 index 7d4a327eba29ecb4..0137eba4cdd9f830 100644
 --- a/sysdeps/x86_64/memset.S
 +++ b/sysdeps/x86_64/memset.S
@@ -18,13 +18,15 @@
    <https://www.gnu.org/licenses/>.  */
 #include <sysdep.h>
 +#define USE_WITH_SSE2	1
 #define VEC_SIZE	16
 +#define MOV_SIZE	3
 +#define RET_SIZE	1
 +
 #define VEC(i)		xmm##i
 -/* Don't use movups and movaps since it will get larger nop paddings for
 -   alignment.  */
 -#define VMOVU		movdqu
 -#define VMOVA		movdqa
 +#define VMOVU     movups
 +#define VMOVA     movaps
 #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
 diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
 index ae0860f36a47d594..1af668af0aeda59e 100644
 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,8 +1,14 @@
 #if IS_IN (libc)
 +# define USE_WITH_AVX2	1
 +
 # define VEC_SIZE	32
 +# define MOV_SIZE	4
 +# define RET_SIZE	4
 +
 # define VEC(i)		ymm##i
 -# define VMOVU		vmovdqu
 -# define VMOVA		vmovdqa
 +
 +# define VMOVU     vmovdqu
 +# define VMOVA     vmovdqa
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
 diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 index 8ad842fc2f140527..f14d6f8493c21a36 100644
 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
 +# define USE_WITH_AVX512	1
 +
 # define VEC_SIZE	64
 +# define MOV_SIZE	6
 +# define RET_SIZE	1
 +
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		zmm16
 # define VEC(i)		VEC##i
 -# define VMOVU		vmovdqu64
 -# define VMOVA		vmovdqa64
 +
 +# define VMOVU     vmovdqu64
 +# define VMOVA     vmovdqa64
 +
 # define VZEROUPPER
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
 diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 index 640f092903302ad0..64b09e77cc20cc42 100644
 --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
 +# define USE_WITH_EVEX	1
 +
 # define VEC_SIZE	32
 +# define MOV_SIZE	6
 +# define RET_SIZE	1
 +
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		ymm16
 # define VEC(i)		VEC##i
 -# define VMOVU		vmovdqu64
 -# define VMOVA		vmovdqa64
 +
 +# define VMOVU     vmovdqu64
 +# define VMOVA     vmovdqa64
 +
 # define VZEROUPPER
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
 diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 index ff196844a093dc3b..e723413a664c088f 100644
 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,8 +63,27 @@
 # endif
 #endif
 +#if VEC_SIZE == 64
 +# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
 +#else
 +# define LOOP_4X_OFFSET	(0)
 +#endif
 +
 +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 +# define END_REG	rcx
 +# define LOOP_REG	rdi
 +#else
 +# define END_REG	rdi
 +# define LOOP_REG	rdx
 +#endif
 +
 #define PAGE_SIZE 4096
 +/* Macro to calculate size of small memset block for aligning
 +   purposes.  */
 +#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
 +
 +
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -74,6 +93,7 @@
 ENTRY (__bzero)
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
 +	xorl	%esi, %esi
 	pxor	%XMM0, %XMM0
 	jmp	L(entry_from_bzero)
 END (__bzero)
@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 -ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 +ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	jb	L(less_vec)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
 -	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 -	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 -	VMOVU	%VEC(0), (%rdi)
 +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
 +	 */
 +	VMOVU	%VEC(0), (%rax)
 +	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_RETURN
 -
 -	.p2align 4
 -L(stosb_more_2x_vec):
 -	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
 -	ja	L(stosb)
 -#else
 -	.p2align 4
 #endif
 -L(more_2x_vec):
 -	/* Stores to first 2x VEC before cmp as any path forward will
 -	   require it.  */
 -	VMOVU	%VEC(0), (%rdi)
 -	VMOVU	%VEC(0), VEC_SIZE(%rdi)
 -	cmpq	$(VEC_SIZE * 4), %rdx
 -	ja	L(loop_start)
 -	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 -	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 -L(return):
 -#if VEC_SIZE > 16
 -	ZERO_UPPER_VEC_REGISTERS_RETURN
 +
 +	.p2align 4,, 10
 +L(last_2x_vec):
 +#ifdef USE_LESS_VEC_MASK_STORE
 +	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
 +	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
 #else
 -	ret
 +	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 +	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
 #endif
 +	VZEROUPPER_RETURN
 -L(loop_start):
 -	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
 -	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
 -	cmpq	$(VEC_SIZE * 8), %rdx
 -	jbe	L(loop_end)
 -	andq	$-(VEC_SIZE * 2), %rdi
 -	subq	$-(VEC_SIZE * 4), %rdi
 -	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
 -	.p2align 4
 -L(loop):
 -	VMOVA	%VEC(0), (%rdi)
 -	VMOVA	%VEC(0), VEC_SIZE(%rdi)
 -	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
 -	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
 -	subq	$-(VEC_SIZE * 4), %rdi
 -	cmpq	%rcx, %rdi
 -	jb	L(loop)
 -L(loop_end):
 -	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
 -	       rdx as length is also unchanged.  */
 -	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
 -	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
 -	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
 -	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 -	VZEROUPPER_SHORT_RETURN
 -
 -	.p2align 4
 +	/* If have AVX512 mask instructions put L(less_vec) close to
 +	   entry as it doesn't take much space and is likely a hot target.
 +	 */
 +#ifdef USE_LESS_VEC_MASK_STORE
 +	.p2align 4,, 10
 L(less_vec):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
 # endif
 -# ifdef USE_LESS_VEC_MASK_STORE
 	/* Clear high bits from edi. Only keeping bits relevant to page
 	   cross check. Note that we are using rax which is set in
 -	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
 -	 */
 +	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
 	andl	$(PAGE_SIZE - 1), %edi
 -	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
 -	   performance degradation when it has to fault supress.  */
 +	/* Check if VEC_SIZE store cross page. Mask stores suffer
 +	   serious performance degradation when it has to fault supress.
 +	 */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
 +	/* This is generally considered a cold target.  */
 	ja	L(cross_page)
 # if VEC_SIZE > 32
 	movq	$-1, %rcx
@@ -247,58 +235,185 @@ L(less_vec):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
 -	vmovdqu8	%VEC(0), (%rax) {%k1}
 +	vmovdqu8 %VEC(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 +# if defined USE_MULTIARCH && IS_IN (libc)
 +	/* Include L(stosb_local) here if including L(less_vec) between
 +	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
 +	   L(stosb_more_2x_vec) target.  */
 +	.p2align 4,, 10
 +L(stosb_local):
 +	movzbl	%sil, %eax
 +	mov	%RDX_LP, %RCX_LP
 +	mov	%RDI_LP, %RDX_LP
 +	rep	stosb
 +	mov	%RDX_LP, %RAX_LP
 +	VZEROUPPER_RETURN
 +# endif
 +#endif
 +
 +#if defined USE_MULTIARCH && IS_IN (libc)
 	.p2align 4
 -L(cross_page):
 +L(stosb_more_2x_vec):
 +	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
 +	ja	L(stosb_local)
 +#endif
 +	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 +	   and (4x, 8x] jump to target.  */
 +L(more_2x_vec):
 +
 +	/* Two different methods of setting up pointers / compare. The
 +	   two methods are based on the fact that EVEX/AVX512 mov
 +	   instructions take more bytes then AVX2/SSE2 mov instructions. As
 +	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
 +	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
 +	   this saves code size and keeps a few targets in one fetch block.
 +	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
 +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 +	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
 +	   LOOP_4X_OFFSET) with LEA_BID.  */
 +
 +	/* END_REG is rcx for EVEX/AVX512.  */
 +	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
 +#endif
 +
 +	/* Stores to first 2x VEC before cmp as any path forward will
 +	   require it.  */
 +	VMOVU	%VEC(0), (%rax)
 +	VMOVU	%VEC(0), VEC_SIZE(%rax)
 +
 +
 +#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 +	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 +	addq	%rdx, %END_REG
 +#endif
 +
 +	cmpq	$(VEC_SIZE * 4), %rdx
 +	jbe	L(last_2x_vec)
 +
 +	/* Store next 2x vec regardless.  */
 +	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 +	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
 +
 +
 +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 +	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
 +	   extra offset to addresses in loop. Used for AVX512 to save space
 +	   as no way to get (VEC_SIZE * 4) in imm8.  */
 +# if LOOP_4X_OFFSET == 0
 +	subq	$-(VEC_SIZE * 4), %LOOP_REG
 # endif
 -# if VEC_SIZE > 32
 -	cmpb	$32, %dl
 -	jae	L(between_32_63)
 +	/* Avoid imm32 compare here to save code size.  */
 +	cmpq	%rdi, %rcx
 +#else
 +	addq	$-(VEC_SIZE * 4), %END_REG
 +	cmpq	$(VEC_SIZE * 8), %rdx
 +#endif
 +	jbe	L(last_4x_vec)
 +#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 +	/* Set LOOP_REG (rdx).  */
 +	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
 +#endif
 +	/* Align dst for loop.  */
 +	andq	$(VEC_SIZE * -2), %LOOP_REG
 +	.p2align 4
 +L(loop):
 +	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
 +	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
 +	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
 +	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
 +	subq	$-(VEC_SIZE * 4), %LOOP_REG
 +	cmpq	%END_REG, %LOOP_REG
 +	jb	L(loop)
 +	.p2align 4,, MOV_SIZE
 +L(last_4x_vec):
 +	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
 +	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
 +	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
 +	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
 +L(return):
 +#if VEC_SIZE > 16
 +	ZERO_UPPER_VEC_REGISTERS_RETURN
 +#else
 +	ret
 +#endif
 +
 +	.p2align 4,, 10
 +#ifndef USE_LESS_VEC_MASK_STORE
 +# if defined USE_MULTIARCH && IS_IN (libc)
 +	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
 +	   range for 2-byte jump encoding.  */
 +L(stosb_local):
 +	movzbl	%sil, %eax
 +	mov	%RDX_LP, %RCX_LP
 +	mov	%RDI_LP, %RDX_LP
 +	rep	stosb
 +	mov	%RDX_LP, %RAX_LP
 +	VZEROUPPER_RETURN
 # endif
 -# if VEC_SIZE > 16
 -	cmpb	$16, %dl
 +	/* Define L(less_vec) only if not otherwise defined.  */
 +	.p2align 4
 +L(less_vec):
 +#endif
 +L(cross_page):
 +#if VEC_SIZE > 32
 +	cmpl	$32, %edx
 +	jae	L(between_32_63)
 +#endif
 +#if VEC_SIZE > 16
 +	cmpl	$16, %edx
 	jae	L(between_16_31)
 -# endif
 -	MOVQ	%XMM0, %rcx
 -	cmpb	$8, %dl
 +#endif
 +	MOVQ	%XMM0, %rdi
 +	cmpl	$8, %edx
 	jae	L(between_8_15)
 -	cmpb	$4, %dl
 +	cmpl	$4, %edx
 	jae	L(between_4_7)
 -	cmpb	$1, %dl
 +	cmpl	$1, %edx
 	ja	L(between_2_3)
 -	jb	1f
 -	movb	%cl, (%rax)
 -1:
 +	jb	L(return)
 +	movb	%sil, (%rax)
 	VZEROUPPER_RETURN
 -# if VEC_SIZE > 32
 +
 +	/* Align small targets only if not doing so would cross a fetch
 +	   line.  */
 +#if VEC_SIZE > 32
 +	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
 -	VMOVU	%YMM0, -32(%rax,%rdx)
 	VMOVU	%YMM0, (%rax)
 +	VMOVU	%YMM0, -32(%rax, %rdx)
 	VZEROUPPER_RETURN
 -# endif
 -# if VEC_SIZE > 16
 -	/* From 16 to 31.  No branch when size == 16.  */
 +#endif
 +
 +#if VEC_SIZE >= 32
 +	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 L(between_16_31):
 -	VMOVU	%XMM0, -16(%rax,%rdx)
 +	/* From 16 to 31.  No branch when size == 16.  */
 	VMOVU	%XMM0, (%rax)
 +	VMOVU	%XMM0, -16(%rax, %rdx)
 	VZEROUPPER_RETURN
 -# endif
 -	/* From 8 to 15.  No branch when size == 8.  */
 +#endif
 +
 +	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_8_15):
 -	movq	%rcx, -8(%rax,%rdx)
 -	movq	%rcx, (%rax)
 +	/* From 8 to 15.  No branch when size == 8.  */
 +	movq	%rdi, (%rax)
 +	movq	%rdi, -8(%rax, %rdx)
 	VZEROUPPER_RETURN
 +
 +	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 -	movl	%ecx, -4(%rax,%rdx)
 -	movl	%ecx, (%rax)
 +	movl	%edi, (%rax)
 +	movl	%edi, -4(%rax, %rdx)
 	VZEROUPPER_RETURN
 +
 +	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
 -	movw	%cx, -2(%rax,%rdx)
 -	movw	%cx, (%rax)
 +	movw	%di, (%rax)
 +	movb	%dil, -1(%rax, %rdx)
 	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
--- a/glibc-upstream-2.34-177.patch
+++ b/glibc-upstream-2.34-177.patch
@ -0,0 +1,40 @@
 commit baf3ece63453adac59c5688930324a78ced5b2e4
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Sat Oct 23 01:26:47 2021 -0400
    x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
    This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
    It could potentially be dangerous to use SSE2 if this function is ever
    called without using 'vzeroupper' beforehand. While compilers appear
    to use 'vzeroupper' before function calls if AVX2 has been used, using
    SSE2 here is more brittle. Since it is not absolutely necessary it
    should be avoided.
    It costs 2 extra bytes but the extra bytes should only eat into
    alignment padding.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb)
 diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 index 2761b54f2e7dea9f..640f6757fac8a356 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -561,13 +561,13 @@ L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 	/* Use movups to save code size.  */
 -	movups	(%rsi), %xmm2
 +	vmovdqu	(%rsi), %xmm2
 	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
 -	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 +	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
 	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
--- a/glibc-upstream-2.34-178.patch
+++ b/glibc-upstream-2.34-178.patch
@ -0,0 +1,690 @@
 commit f35ad30da4880a1574996df0674986ecf82fa7ae
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Fri Oct 29 12:40:20 2021 -0700
    x86-64: Improve EVEX strcmp with masked load
    In strcmp-evex.S, to compare 2 32-byte strings, replace
            VMOVU   (%rdi, %rdx), %YMM0
            VMOVU   (%rsi, %rdx), %YMM1
            /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
            VPCMP   $4, %YMM0, %YMM1, %k0
            VPCMP   $0, %YMMZERO, %YMM0, %k1
            VPCMP   $0, %YMMZERO, %YMM1, %k2
            /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
            kord    %k1, %k2, %k1
            /* Each bit in K1 represents a NULL or a mismatch.  */
            kord    %k0, %k1, %k1
            kmovd   %k1, %ecx
            testl   %ecx, %ecx
            jne     L(last_vector)
    with
            VMOVU   (%rdi, %rdx), %YMM0
            VPTESTM %YMM0, %YMM0, %k2
            /* Each bit cleared in K1 represents a mismatch or a null CHAR
               in YMM0 and 32 bytes at (%rsi, %rdx).  */
            VPCMP   $0, (%rsi, %rdx), %YMM0, %k1{%k2}
            kmovd   %k1, %ecx
            incl    %ecx
            jne     L(last_vector)
    It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
    and Ice Lake.
    Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
    (cherry picked from commit c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
 index d5aa6daa46c7ed25..82f12ac89bcae20b 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -41,6 +41,8 @@
 # ifdef USE_AS_WCSCMP
 /* Compare packed dwords.  */
 #  define VPCMP		vpcmpd
 +#  define VPMINU	vpminud
 +#  define VPTESTM	vptestmd
 #  define SHIFT_REG32	r8d
 #  define SHIFT_REG64	r8
 /* 1 dword char == 4 bytes.  */
@@ -48,6 +50,8 @@
 # else
 /* Compare packed bytes.  */
 #  define VPCMP		vpcmpb
 +#  define VPMINU	vpminub
 +#  define VPTESTM	vptestmb
 #  define SHIFT_REG32	ecx
 #  define SHIFT_REG64	rcx
 /* 1 byte char == 1 byte.  */
@@ -67,6 +71,9 @@
 # define YMM5		ymm22
 # define YMM6		ymm23
 # define YMM7		ymm24
 +# define YMM8		ymm25
 +# define YMM9		ymm26
 +# define YMM10		ymm27
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -76,7 +83,7 @@
 /* The main idea of the string comparison (byte or dword) using 256-bit
    EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
    latter can be on either packed bytes or dwords depending on
 -   USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
 +   USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
    matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
    KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
    are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
@@ -123,27 +130,21 @@ ENTRY (STRCMP)
 	jg	L(cross_page)
 	/* Start comparing 4 vectors.  */
 	VMOVU	(%rdi), %YMM0
 -	VMOVU	(%rsi), %YMM1
 -	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
 -	VPCMP	$4, %YMM0, %YMM1, %k0
 +	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 +	VPTESTM	%YMM0, %YMM0, %k2
 -	/* Check for NULL in YMM0.  */
 -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 -	/* Check for NULL in YMM1.  */
 -	VPCMP	$0, %YMMZERO, %YMM1, %k2
 -	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
 -	kord	%k1, %k2, %k1
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in YMM0 and 32 bytes at (%rsi).  */
 +	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
 -	/* Each bit in K1 represents:
 -	   1. A mismatch in YMM0 and YMM1.  Or
 -	   2. A NULL in YMM0 or YMM1.
 -	 */
 -	kord	%k0, %k1, %k1
 -
 -	ktestd	%k1, %k1
 -	je	L(next_3_vectors)
 	kmovd	%k1, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 +	je	L(next_3_vectors)
 	tzcntl	%ecx, %edx
 # ifdef USE_AS_WCSCMP
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
@@ -172,9 +173,7 @@ L(return):
 # endif
 	ret
 -	.p2align 4
 L(return_vec_size):
 -	kmovd	%k1, %ecx
 	tzcntl	%ecx, %edx
 # ifdef USE_AS_WCSCMP
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
@@ -210,9 +209,7 @@ L(return_vec_size):
 # endif
 	ret
 -	.p2align 4
 L(return_2_vec_size):
 -	kmovd	%k1, %ecx
 	tzcntl	%ecx, %edx
 # ifdef USE_AS_WCSCMP
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
@@ -248,9 +245,7 @@ L(return_2_vec_size):
 # endif
 	ret
 -	.p2align 4
 L(return_3_vec_size):
 -	kmovd	%k1, %ecx
 	tzcntl	%ecx, %edx
 # ifdef USE_AS_WCSCMP
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
@@ -289,43 +284,45 @@ L(return_3_vec_size):
 	.p2align 4
 L(next_3_vectors):
 	VMOVU	VEC_SIZE(%rdi), %YMM0
 -	VMOVU	VEC_SIZE(%rsi), %YMM1
 -	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
 -	VPCMP	$4, %YMM0, %YMM1, %k0
 -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 -	VPCMP	$0, %YMMZERO, %YMM1, %k2
 -	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	kord	%k0, %k1, %k1
 -	ktestd	%k1, %k1
 +	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 +	VPTESTM	%YMM0, %YMM0, %k2
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
 +	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
 +	kmovd	%k1, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 	jne	L(return_vec_size)
 -	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM2
 -	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM3
 -	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM4
 -	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM5
 -
 -	/* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
 -	VPCMP	$4, %YMM2, %YMM4, %k0
 -	VPCMP	$0, %YMMZERO, %YMM2, %k1
 -	VPCMP	$0, %YMMZERO, %YMM4, %k2
 -	/* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	kord	%k0, %k1, %k1
 -	ktestd	%k1, %k1
 +	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
 +	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 +	VPTESTM	%YMM0, %YMM0, %k2
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
 +	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
 +	kmovd	%k1, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 	jne	L(return_2_vec_size)
 -	/* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
 -	VPCMP	$4, %YMM3, %YMM5, %k0
 -	VPCMP	$0, %YMMZERO, %YMM3, %k1
 -	VPCMP	$0, %YMMZERO, %YMM5, %k2
 -	/* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	kord	%k0, %k1, %k1
 -	ktestd	%k1, %k1
 +	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
 +	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 +	VPTESTM	%YMM0, %YMM0, %k2
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
 +	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
 +	kmovd	%k1, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 	jne	L(return_3_vec_size)
 L(main_loop_header):
 	leaq	(VEC_SIZE * 4)(%rdi), %rdx
@@ -375,56 +372,51 @@ L(back_to_loop):
 	VMOVA	VEC_SIZE(%rax), %YMM2
 	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
 	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
 -	VMOVU	(%rdx), %YMM1
 -	VMOVU	VEC_SIZE(%rdx), %YMM3
 -	VMOVU	(VEC_SIZE * 2)(%rdx), %YMM5
 -	VMOVU	(VEC_SIZE * 3)(%rdx), %YMM7
 -
 -	VPCMP	$4, %YMM0, %YMM1, %k0
 -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 -	VPCMP	$0, %YMMZERO, %YMM1, %k2
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K4 represents a NULL or a mismatch in YMM0 and
 -	   YMM1.  */
 -	kord	%k0, %k1, %k4
 -
 -	VPCMP	$4, %YMM2, %YMM3, %k0
 -	VPCMP	$0, %YMMZERO, %YMM2, %k1
 -	VPCMP	$0, %YMMZERO, %YMM3, %k2
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K5 represents a NULL or a mismatch in YMM2 and
 -	   YMM3.  */
 -	kord	%k0, %k1, %k5
 -
 -	VPCMP	$4, %YMM4, %YMM5, %k0
 -	VPCMP	$0, %YMMZERO, %YMM4, %k1
 -	VPCMP	$0, %YMMZERO, %YMM5, %k2
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K6 represents a NULL or a mismatch in YMM4 and
 -	   YMM5.  */
 -	kord	%k0, %k1, %k6
 -
 -	VPCMP	$4, %YMM6, %YMM7, %k0
 -	VPCMP	$0, %YMMZERO, %YMM6, %k1
 -	VPCMP	$0, %YMMZERO, %YMM7, %k2
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K7 represents a NULL or a mismatch in YMM6 and
 -	   YMM7.  */
 -	kord	%k0, %k1, %k7
 -
 -	kord	%k4, %k5, %k0
 -	kord	%k6, %k7, %k1
 -
 -	/* Test each mask (32 bits) individually because for VEC_SIZE
 -	   == 32 is not possible to OR the four masks and keep all bits
 -	   in a 64-bit integer register, differing from SSE2 strcmp
 -	   where ORing is possible.  */
 -	kortestd %k0, %k1
 -	je	L(loop)
 -	ktestd	%k4, %k4
 +
 +	VPMINU	%YMM0, %YMM2, %YMM8
 +	VPMINU	%YMM4, %YMM6, %YMM9
 +
 +	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
 +	VPMINU	%YMM8, %YMM9, %YMM8
 +
 +	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
 +	VPTESTM	%YMM8, %YMM8, %k1
 +
 +	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
 +	vpxorq	(%rdx), %YMM0, %YMM1
 +	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
 +	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
 +	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
 +
 +	vporq	%YMM1, %YMM3, %YMM9
 +	vporq	%YMM5, %YMM7, %YMM10
 +
 +	/* A non-zero CHAR in YMM9 represents a mismatch.  */
 +	vporq	%YMM9, %YMM10, %YMM9
 +
 +	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
 +	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
 +	kmovd   %k0, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 +	je	 L(loop)
 +
 +	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
 +	VPTESTM	%YMM0, %YMM0, %k1
 +	/* Each bit cleared in K0 represents a mismatch or a null CHAR
 +	   in YMM0 and (%rdx).  */
 +	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
 +	kmovd	%k0, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 	je	L(test_vec)
 -	kmovd	%k4, %edi
 -	tzcntl	%edi, %ecx
 +	tzcntl	%ecx, %ecx
 # ifdef USE_AS_WCSCMP
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 	sall	$2, %ecx
@@ -466,9 +458,18 @@ L(test_vec):
 	cmpq	$VEC_SIZE, %r11
 	jbe	L(zero)
 # endif
 -	ktestd	%k5, %k5
 +	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
 +	VPTESTM	%YMM2, %YMM2, %k1
 +	/* Each bit cleared in K0 represents a mismatch or a null CHAR
 +	   in YMM2 and VEC_SIZE(%rdx).  */
 +	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
 +	kmovd	%k0, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 	je	L(test_2_vec)
 -	kmovd	%k5, %ecx
 	tzcntl	%ecx, %edi
 # ifdef USE_AS_WCSCMP
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
@@ -512,9 +513,18 @@ L(test_2_vec):
 	cmpq	$(VEC_SIZE * 2), %r11
 	jbe	L(zero)
 # endif
 -	ktestd	%k6, %k6
 +	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
 +	VPTESTM	%YMM4, %YMM4, %k1
 +	/* Each bit cleared in K0 represents a mismatch or a null CHAR
 +	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
 +	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
 +	kmovd	%k0, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 	je	L(test_3_vec)
 -	kmovd	%k6, %ecx
 	tzcntl	%ecx, %edi
 # ifdef USE_AS_WCSCMP
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
@@ -558,8 +568,18 @@ L(test_3_vec):
 	cmpq	$(VEC_SIZE * 3), %r11
 	jbe	L(zero)
 # endif
 -	kmovd	%k7, %esi
 -	tzcntl	%esi, %ecx
 +	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
 +	VPTESTM	%YMM6, %YMM6, %k1
 +	/* Each bit cleared in K0 represents a mismatch or a null CHAR
 +	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
 +	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
 +	kmovd	%k0, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 +	tzcntl	%ecx, %ecx
 # ifdef USE_AS_WCSCMP
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 	sall	$2, %ecx
@@ -615,39 +635,51 @@ L(loop_cross_page):
 	VMOVU	(%rax, %r10), %YMM2
 	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
 -	VMOVU	(%rdx, %r10), %YMM4
 -	VMOVU	VEC_SIZE(%rdx, %r10), %YMM5
 -
 -	VPCMP	$4, %YMM4, %YMM2, %k0
 -	VPCMP	$0, %YMMZERO, %YMM2, %k1
 -	VPCMP	$0, %YMMZERO, %YMM4, %k2
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K1 represents a NULL or a mismatch in YMM2 and
 -	   YMM4.  */
 -	kord	%k0, %k1, %k1
 -
 -	VPCMP	$4, %YMM5, %YMM3, %k3
 -	VPCMP	$0, %YMMZERO, %YMM3, %k4
 -	VPCMP	$0, %YMMZERO, %YMM5, %k5
 -	kord	%k4, %k5, %k4
 -	/* Each bit in K3 represents a NULL or a mismatch in YMM3 and
 -	   YMM5.  */
 -	kord	%k3, %k4, %k3
 +
 +	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
 +	VPTESTM	%YMM2, %YMM2, %k2
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in YMM2 and 32 bytes at (%rdx, %r10).  */
 +	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
 +	kmovd	%k1, %r9d
 +	/* Don't use subl since it is the lower 16/32 bits of RDI
 +	   below.  */
 +	notl	%r9d
 +# ifdef USE_AS_WCSCMP
 +	/* Only last 8 bits are valid.  */
 +	andl	$0xff, %r9d
 +# endif
 +
 +	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
 +	VPTESTM	%YMM3, %YMM3, %k4
 +	/* Each bit cleared in K3 represents a mismatch or a null CHAR
 +	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
 +	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
 +	kmovd	%k3, %edi
 +# ifdef USE_AS_WCSCMP
 +	/* Don't use subl since it is the upper 8 bits of EDI below.  */
 +	notl	%edi
 +	andl	$0xff, %edi
 +# else
 +	incl	%edi
 +# endif
 # ifdef USE_AS_WCSCMP
 -	/* NB: Each bit in K1/K3 represents 4-byte element.  */
 -	kshiftlw $8, %k3, %k2
 +	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
 +	sall	$8, %edi
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 	   bytes.  */
 	movl	%ecx, %SHIFT_REG32
 	sarl	$2, %SHIFT_REG32
 +
 +	/* Each bit in EDI represents a null CHAR or a mismatch.  */
 +	orl	%r9d, %edi
 # else
 -	kshiftlq $32, %k3, %k2
 -# endif
 +	salq	$32, %rdi
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	korq	%k1, %k2, %k1
 -	kmovq	%k1, %rdi
 +	/* Each bit in RDI represents a null CHAR or a mismatch.  */
 +	orq	%r9, %rdi
 +# endif
 	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
 	shrxq	%SHIFT_REG64, %rdi, %rdi
@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
 	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
 	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
 	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
 -	VMOVU	(VEC_SIZE * 2)(%rdx, %r10), %YMM2
 -	VMOVU	(VEC_SIZE * 3)(%rdx, %r10), %YMM3
 -
 -	VPCMP	$4, %YMM0, %YMM2, %k0
 -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 -	VPCMP	$0, %YMMZERO, %YMM2, %k2
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
 -	   YMM2.  */
 -	kord	%k0, %k1, %k1
 -
 -	VPCMP	$4, %YMM1, %YMM3, %k3
 -	VPCMP	$0, %YMMZERO, %YMM1, %k4
 -	VPCMP	$0, %YMMZERO, %YMM3, %k5
 -	kord	%k4, %k5, %k4
 -	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
 -	   YMM3.  */
 -	kord	%k3, %k4, %k3
 +	VPTESTM	%YMM0, %YMM0, %k2
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
 +	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
 +	kmovd	%k1, %r9d
 +	/* Don't use subl since it is the lower 16/32 bits of RDI
 +	   below.  */
 +	notl	%r9d
 # ifdef USE_AS_WCSCMP
 -	/* NB: Each bit in K1/K3 represents 4-byte element.  */
 -	kshiftlw $8, %k3, %k2
 +	/* Only last 8 bits are valid.  */
 +	andl	$0xff, %r9d
 +# endif
 +
 +	VPTESTM	%YMM1, %YMM1, %k4
 +	/* Each bit cleared in K3 represents a mismatch or a null CHAR
 +	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
 +	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
 +	kmovd	%k3, %edi
 +# ifdef USE_AS_WCSCMP
 +	/* Don't use subl since it is the upper 8 bits of EDI below.  */
 +	notl	%edi
 +	andl	$0xff, %edi
 # else
 -	kshiftlq $32, %k3, %k2
 +	incl	%edi
 # endif
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	korq	%k1, %k2, %k1
 -	kmovq	%k1, %rdi
 +# ifdef USE_AS_WCSCMP
 +	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
 +	sall	$8, %edi
 +
 +	/* Each bit in EDI represents a null CHAR or a mismatch.  */
 +	orl	%r9d, %edi
 +# else
 +	salq	$32, %rdi
 +
 +	/* Each bit in RDI represents a null CHAR or a mismatch.  */
 +	orq	%r9, %rdi
 +# endif
 	xorl	%r8d, %r8d
 	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
 	/* R8 has number of bytes skipped.  */
 	movl	%ecx, %r8d
 # ifdef USE_AS_WCSCMP
 -	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 +	/* NB: Divide shift count by 4 since each bit in RDI represents 4
 	   bytes.  */
 	sarl	$2, %ecx
 -# endif
 +	/* Skip ECX bytes.  */
 +	shrl	%cl, %edi
 +# else
 	/* Skip ECX bytes.  */
 	shrq	%cl, %rdi
 +# endif
 1:
 	/* Before jumping back to the loop, set ESI to the number of
 	   VEC_SIZE * 4 blocks before page crossing.  */
@@ -818,7 +863,7 @@ L(cross_page_loop):
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %ecx
 # endif
 -	/* Check null char.  */
 +	/* Check null CHAR.  */
 	testl	%eax, %eax
 	jne	L(cross_page_loop)
 	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
@@ -901,18 +946,17 @@ L(cross_page):
 	jg	L(cross_page_1_vector)
 L(loop_1_vector):
 	VMOVU	(%rdi, %rdx), %YMM0
 -	VMOVU	(%rsi, %rdx), %YMM1
 -
 -	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
 -	VPCMP	$4, %YMM0, %YMM1, %k0
 -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 -	VPCMP	$0, %YMMZERO, %YMM1, %k2
 -	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	kord	%k0, %k1, %k1
 +
 +	VPTESTM	%YMM0, %YMM0, %k2
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
 +	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
 	kmovd	%k1, %ecx
 -	testl	%ecx, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xff, %ecx
 +# else
 +	incl	%ecx
 +# endif
 	jne	L(last_vector)
 	addl	$VEC_SIZE, %edx
@@ -931,18 +975,17 @@ L(cross_page_1_vector):
 	cmpl	$(PAGE_SIZE - 16), %eax
 	jg	L(cross_page_1_xmm)
 	VMOVU	(%rdi, %rdx), %XMM0
 -	VMOVU	(%rsi, %rdx), %XMM1
 -
 -	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
 -	VPCMP	$4, %XMM0, %XMM1, %k0
 -	VPCMP	$0, %XMMZERO, %XMM0, %k1
 -	VPCMP	$0, %XMMZERO, %XMM1, %k2
 -	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
 -	korw	%k1, %k2, %k1
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	korw	%k0, %k1, %k1
 -	kmovw	%k1, %ecx
 -	testl	%ecx, %ecx
 +
 +	VPTESTM	%YMM0, %YMM0, %k2
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
 +	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
 +	kmovd	%k1, %ecx
 +# ifdef USE_AS_WCSCMP
 +	subl	$0xf, %ecx
 +# else
 +	subl	$0xffff, %ecx
 +# endif
 	jne	L(last_vector)
 	addl	$16, %edx
@@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
 	vmovq	(%rdi, %rdx), %XMM0
 	vmovq	(%rsi, %rdx), %XMM1
 -	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
 -	VPCMP	$4, %XMM0, %XMM1, %k0
 -	VPCMP	$0, %XMMZERO, %XMM0, %k1
 -	VPCMP	$0, %XMMZERO, %XMM1, %k2
 -	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	kord	%k0, %k1, %k1
 -	kmovd	%k1, %ecx
 -
 +	VPTESTM	%YMM0, %YMM0, %k2
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in XMM0 and XMM1.  */
 +	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
 +	kmovb	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 -	/* Only last 2 bits are valid.  */
 -	andl	$0x3, %ecx
 +	subl	$0x3, %ecx
 # else
 -	/* Only last 8 bits are valid.  */
 -	andl	$0xff, %ecx
 +	subl	$0xff, %ecx
 # endif
 -
 -	testl	%ecx, %ecx
 	jne	L(last_vector)
 	addl	$8, %edx
@@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
 	vmovd	(%rdi, %rdx), %XMM0
 	vmovd	(%rsi, %rdx), %XMM1
 -	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
 -	VPCMP	$4, %XMM0, %XMM1, %k0
 -	VPCMP	$0, %XMMZERO, %XMM0, %k1
 -	VPCMP	$0, %XMMZERO, %XMM1, %k2
 -	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
 -	kord	%k1, %k2, %k1
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	kord	%k0, %k1, %k1
 +	VPTESTM	%YMM0, %YMM0, %k2
 +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 +	   in XMM0 and XMM1.  */
 +	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
 	kmovd	%k1, %ecx
 -
 # ifdef USE_AS_WCSCMP
 -	/* Only the last bit is valid.  */
 -	andl	$0x1, %ecx
 +	subl	$0x1, %ecx
 # else
 -	/* Only last 4 bits are valid.  */
 -	andl	$0xf, %ecx
 +	subl	$0xf, %ecx
 # endif
 -
 -	testl	%ecx, %ecx
 	jne	L(last_vector)
 	addl	$4, %edx
--- a/glibc-upstream-2.34-179.patch
+++ b/glibc-upstream-2.34-179.patch
@ -0,0 +1,85 @@
 commit a182bb7a3922404f79def09d79ef89678b4049f0
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Fri Oct 29 12:56:53 2021 -0700
    x86-64: Remove Prefer_AVX2_STRCMP
    Remove Prefer_AVX2_STRCMP to enable EVEX strcmp.  When comparing 2 32-byte
    strings, EVEX strcmp has been improved to require 1 load, 1 VPTESTM, 1
    VPCMP, 1 KMOVD and 1 INCL instead of 2 loads, 3 VPCMPs, 2 KORDs, 1 KMOVD
    and 1 TESTL while AVX2 strcmp requires 1 load, 2 VPCMPEQs, 1 VPMINU, 1
    VPMOVMSKB and 1 TESTL.  EVEX strcmp is now faster than AVX2 strcmp by up
    to 40% on Tiger Lake and Ice Lake.
    (cherry picked from commit 14dbbf46a007ae5df36646b51ad0c9e5f5259f30)
 diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
 index de4e3c3b7258120d..f4d4049e391cbabd 100644
 --- a/sysdeps/x86/cpu-features.c
 +++ b/sysdeps/x86/cpu-features.c
@@ -574,14 +574,6 @@ disable_tsx:
 	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
 	      |= bit_arch_Prefer_No_VZEROUPPER;
 -
 -	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
 -	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
 -	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
 -	     AVX2 strcmp is faster than EVEX strcmp.  */
 -	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
 -	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
 -	      |= bit_arch_Prefer_AVX2_STRCMP;
 	}
       /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
 diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
 index 58f2fad4323d5d91..957db3ad229ba39f 100644
 --- a/sysdeps/x86/cpu-tunables.c
 +++ b/sysdeps/x86/cpu-tunables.c
@@ -239,8 +239,6 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
 						Fast_Copy_Backward,
 						disable, 18);
 -	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
 -		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
 	    }
 	  break;
 	case 19:
 diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
 index 3bdc76cf71007948..8250bfcbecd29a9f 100644
 --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
 +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -31,5 +31,4 @@ BIT (Prefer_ERMS)
 BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_FSRM)
 -BIT (Prefer_AVX2_STRCMP)
 BIT (Avoid_Short_Distance_REP_MOVSB)
 diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
 index 62b7abeeee646ab4..7c2901bf44456259 100644
 --- a/sysdeps/x86_64/multiarch/strcmp.c
 +++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
 -	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
 +	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	return OPTIMIZE (evex);
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
 index 60ba0fe356b31779..f94a421784bfe923 100644
 --- a/sysdeps/x86_64/multiarch/strncmp.c
 +++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
 -	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
 +	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	return OPTIMIZE (evex);
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
--- a/glibc-upstream-2.34-180.patch
+++ b/glibc-upstream-2.34-180.patch
@ -0,0 +1,48 @@
 commit 2e64237a8744dd50f9222293275fa52e7248ff76
 Author: Fangrui Song <maskray@google.com>
 Date:   Tue Nov 2 20:59:52 2021 -0700
    x86-64: Replace movzx with movzbl
    Clang cannot assemble movzx in the AT&T dialect mode.
    ../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
     movzx (%rsi), %ecx
                   ^~~~
    Change movzx to movzbl, which follows the AT&T dialect and is used
    elsewhere in the file.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit 6720d36b6623c5e48c070d86acf61198b33e144e)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
 index bc19547b09639071..6197a723b9e0606e 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
 	.p2align 4
 	// XXX Same as code above
 LABEL(Byte0):
 -	movzx	(%rsi), %ecx
 -	movzx	(%rdi), %eax
 +	movzbl	(%rsi), %ecx
 +	movzbl	(%rdi), %eax
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
 index 824e648230a15739..7f8a1bc756f86aee 100644
 --- a/sysdeps/x86_64/strcmp.S
 +++ b/sysdeps/x86_64/strcmp.S
@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
 	.p2align 4
 LABEL(Byte0):
 -	movzx	(%rsi), %ecx
 -	movzx	(%rdi), %eax
 +	movzbl	(%rsi), %ecx
 +	movzbl	(%rdi), %eax
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
--- a/glibc-upstream-2.34-181.patch
+++ b/glibc-upstream-2.34-181.patch
@ -0,0 +1,843 @@
 commit a7392db2ff2b9dd906500941ac6361dbe2211b0d
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Mon Nov 1 00:49:51 2021 -0500
    x86: Optimize memmove-vec-unaligned-erms.S
    No bug.
    The optimizations are as follows:
    1) Always align entry to 64 bytes. This makes behavior more
       predictable and makes other frontend optimizations easier.
    2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
       significant benefits in the case that:
            0 < (dst - src) < [256, 512]
    3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
       improvement and for FSRM [-10%, 25%].
    In addition to these primary changes there is general cleanup
    throughout to optimize the aligning routines and control flow logic.
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6)
 diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
 index db106a7a1f23f268..b2b318084823dceb 100644
 --- a/sysdeps/x86_64/memmove.S
 +++ b/sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@
 /* Use movups and movaps for smaller code sizes.  */
 #define VMOVU		movups
 #define VMOVA		movaps
 -
 +#define MOV_SIZE	3
 #define SECTION(p)		p
 #ifdef USE_MULTIARCH
 diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
 index 1ec1962e861dbf63..67a55f0c85af841c 100644
 --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
 +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
 -
 +# define MOV_SIZE	4
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
 index e195e93f153c9512..975ae6c0515b83cb 100644
 --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
 -
 +# define MOV_SIZE	4
 # define SECTION(p)		p##.avx
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
 index 848848ab39ff9326..0fa7126830af7acb 100644
 --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
 -
 +# define MOV_SIZE	6
 # define SECTION(p)		p##.evex512
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
 index 0cbce8f944da51a0..88715441feaaccf5 100644
 --- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
 -
 +# define MOV_SIZE	6
 # define SECTION(p)		p##.evex
 # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 index abde8438d41f2320..7b27cbdda5fb99f7 100644
 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -76,6 +76,25 @@
 # endif
 #endif
 +/* Whether to align before movsb.  Ultimately we want 64-byte
 +   alignment, and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
 +#define ALIGN_MOVSB	(VEC_SIZE > 16)
 +/* Number of bytes to align movsb to.  */
 +#define MOVSB_ALIGN_TO	64
 +
 +#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
 +#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
 +
 +#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
 +# error MOV_SIZE Unknown
 +#endif
 +
 +#if LARGE_MOV_SIZE
 +# define SMALL_SIZE_OFFSET	(4)
 +#else
 +# define SMALL_SIZE_OFFSET	(0)
 +#endif
 +
 #ifndef PAGE_SIZE
 # define PAGE_SIZE 4096
 #endif
@@ -199,25 +218,21 @@ L(start):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 +	/* Load regardless.  */
 +	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 -#if !defined USE_MULTIARCH || !IS_IN (libc)
 -L(last_2x_vec):
 -#endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 -	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
 -#if !defined USE_MULTIARCH || !IS_IN (libc)
 -L(nop):
 -	ret
 +#if !(defined USE_MULTIARCH && IS_IN (libc))
 +	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
 	VZEROUPPER_RETURN
 #endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
 -
 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
 	cmp	%RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -298,310 +313,448 @@ L(start_erms):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 +	/* Load regardless.  */
 +	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 -L(last_2x_vec):
 -	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
 -	VMOVU	(%rsi), %VEC(0)
 -	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
 +	 */
 +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 -	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
 +	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
 -#if VEC_SIZE > 16
 +# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 -#else
 +# else
 	ret
 +# endif
 #endif
 -L(movsb):
 -	cmp     __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
 -	jae	L(more_8x_vec)
 -	cmpq	%rsi, %rdi
 -	jb	1f
 -	/* Source == destination is less common.  */
 -	je	L(nop)
 -	leaq	(%rsi,%rdx), %r9
 -	cmpq	%r9, %rdi
 -	/* Avoid slow backward REP MOVSB.  */
 -	jb	L(more_8x_vec_backward)
 -# if AVOID_SHORT_DISTANCE_REP_MOVSB
 -	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
 -	jz	3f
 -	movq	%rdi, %rcx
 -	subq	%rsi, %rcx
 -	jmp	2f
 -# endif
 -1:
 -# if AVOID_SHORT_DISTANCE_REP_MOVSB
 -	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
 -	jz	3f
 -	movq	%rsi, %rcx
 -	subq	%rdi, %rcx
 -2:
 -/* Avoid "rep movsb" if RCX, the distance between source and destination,
 -   is N*4GB + [1..63] with N >= 0.  */
 -	cmpl	$63, %ecx
 -	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
 -3:
 -# endif
 -	mov	%RDX_LP, %RCX_LP
 -	rep movsb
 -L(nop):
 +#if LARGE_MOV_SIZE
 +	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
 +	   ENTRY block and L(less_vec).  */
 +	.p2align 4,, 8
 +L(between_4_7):
 +	/* From 4 to 7.  No branch when size == 4.  */
 +	movl	(%rsi), %ecx
 +	movl	(%rsi, %rdx), %esi
 +	movl	%ecx, (%rdi)
 +	movl	%esi, (%rdi, %rdx)
 	ret
 #endif
 +	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
 #if VEC_SIZE > 32
 -	cmpb	$32, %dl
 +	cmpl	$32, %edx
 	jae	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
 -	cmpb	$16, %dl
 +	cmpl	$16, %edx
 	jae	L(between_16_31)
 #endif
 -	cmpb	$8, %dl
 +	cmpl	$8, %edx
 	jae	L(between_8_15)
 -	cmpb	$4, %dl
 +#if SMALL_MOV_SIZE
 +	cmpl	$4, %edx
 +#else
 +	subq	$4, %rdx
 +#endif
 	jae	L(between_4_7)
 -	cmpb	$1, %dl
 -	ja	L(between_2_3)
 -	jb	1f
 -	movzbl	(%rsi), %ecx
 +	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
 +	jl	L(copy_0)
 +	movb	(%rsi), %cl
 +	je	L(copy_1)
 +	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
 +	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
 +L(copy_1):
 	movb	%cl, (%rdi)
 -1:
 +L(copy_0):
 	ret
 +
 +#if SMALL_MOV_SIZE
 +	.p2align 4,, 8
 +L(between_4_7):
 +	/* From 4 to 7.  No branch when size == 4.  */
 +	movl	-4(%rsi, %rdx), %ecx
 +	movl	(%rsi), %esi
 +	movl	%ecx, -4(%rdi, %rdx)
 +	movl	%esi, (%rdi)
 +	ret
 +#endif
 +
 +#if VEC_SIZE > 16
 +	/* From 16 to 31.  No branch when size == 16.  */
 +	.p2align 4,, 8
 +L(between_16_31):
 +	vmovdqu	(%rsi), %xmm0
 +	vmovdqu	-16(%rsi, %rdx), %xmm1
 +	vmovdqu	%xmm0, (%rdi)
 +	vmovdqu	%xmm1, -16(%rdi, %rdx)
 +	/* No ymm registers have been touched.  */
 +	ret
 +#endif
 +
 #if VEC_SIZE > 32
 +	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 	VMOVU	(%rsi), %YMM0
 -	VMOVU	-32(%rsi,%rdx), %YMM1
 +	VMOVU	-32(%rsi, %rdx), %YMM1
 	VMOVU	%YMM0, (%rdi)
 -	VMOVU	%YMM1, -32(%rdi,%rdx)
 -	VZEROUPPER_RETURN
 -#endif
 -#if VEC_SIZE > 16
 -	/* From 16 to 31.  No branch when size == 16.  */
 -L(between_16_31):
 -	VMOVU	(%rsi), %XMM0
 -	VMOVU	-16(%rsi,%rdx), %XMM1
 -	VMOVU	%XMM0, (%rdi)
 -	VMOVU	%XMM1, -16(%rdi,%rdx)
 +	VMOVU	%YMM1, -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 +
 +	.p2align 4,, 10
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 -	movq	-8(%rsi,%rdx), %rcx
 +	movq	-8(%rsi, %rdx), %rcx
 	movq	(%rsi), %rsi
 -	movq	%rcx, -8(%rdi,%rdx)
 	movq	%rsi, (%rdi)
 +	movq	%rcx, -8(%rdi, %rdx)
 	ret
 -L(between_4_7):
 -	/* From 4 to 7.  No branch when size == 4.  */
 -	movl	-4(%rsi,%rdx), %ecx
 -	movl	(%rsi), %esi
 -	movl	%ecx, -4(%rdi,%rdx)
 -	movl	%esi, (%rdi)
 -	ret
 -L(between_2_3):
 -	/* From 2 to 3.  No branch when size == 2.  */
 -	movzwl	-2(%rsi,%rdx), %ecx
 -	movzwl	(%rsi), %esi
 -	movw	%cx, -2(%rdi,%rdx)
 -	movw	%si, (%rdi)
 -	ret
 +	.p2align 4,, 10
 +L(last_4x_vec):
 +	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
 +
 +	/* VEC(0) and VEC(1) have already been loaded.  */
 +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
 +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
 +	VMOVU	%VEC(0), (%rdi)
 +	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 +	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
 +	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
 +	VZEROUPPER_RETURN
 +
 +	.p2align 4
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 	ja	L(movsb)
 #endif
 L(more_2x_vec):
 -	/* More than 2 * VEC and there may be overlap between destination
 -	   and source.  */
 +	/* More than 2 * VEC and there may be overlap between
 +	   destination and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
 +	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
 +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
 -	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
 -	VMOVU	(%rsi), %VEC(0)
 -	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 +	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 -	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
 -	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
 -	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
 -	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
 +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
 +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
 +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
 +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
 -	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
 -	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
 -	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
 -	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
 -	VZEROUPPER_RETURN
 -L(last_4x_vec):
 -	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
 -	VMOVU	(%rsi), %VEC(0)
 -	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 -	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
 -	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
 -	VMOVU	%VEC(0), (%rdi)
 -	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 -	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
 -	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
 +	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
 +	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
 +	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
 +	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 +	.p2align 4,, 4
 L(more_8x_vec):
 +	movq	%rdi, %rcx
 +	subq	%rsi, %rcx
 +	/* Go to backwards temporal copy if overlap no matter what as
 +	   backward REP MOVSB is slow and we don't want to use NT stores if
 +	   there is overlap.  */
 +	cmpq	%rdx, %rcx
 +	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
 +	jb	L(more_8x_vec_backward_check_nop)
 	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
 -	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
 +	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
 -	/* Entry if rdx is greater than non-temporal threshold but there
 -       is overlap.  */
 +	/* To reach this point there cannot be overlap and dst > src. So
 +	   check for overlap and src > dst in which case correctness
 +	   requires forward copy. Otherwise decide between backward/forward
 +	   copy depending on address aliasing.  */
 +
 +	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
 +	   but less than __x86_shared_non_temporal_threshold.  */
 L(more_8x_vec_check):
 -	cmpq	%rsi, %rdi
 -	ja	L(more_8x_vec_backward)
 -	/* Source == destination is less common.  */
 -	je	L(nop)
 -	/* Load the first VEC and last 4 * VEC to support overlapping
 -	   addresses.  */
 -	VMOVU	(%rsi), %VEC(4)
 +	/* rcx contains dst - src. Add back length (rdx).  */
 +	leaq	(%rcx, %rdx), %r8
 +	/* If r8 has different sign than rcx then there is overlap so we
 +	   must do forward copy.  */
 +	xorq	%rcx, %r8
 +	/* Isolate just sign bit of r8.  */
 +	shrq	$63, %r8
 +	/* Get 4k difference dst - src.  */
 +	andl	$(PAGE_SIZE - 256), %ecx
 +	/* If r8 is non-zero we must do a forward copy for correctness.
 +	   Otherwise, if ecx is zero there is 4k false aliasing, so do a
 +	   backward copy.  */
 +	addl	%r8d, %ecx
 +	jz	L(more_8x_vec_backward)
 +
 +	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
 +	   but there is overlap, or from short distance movsb.  */
 +L(more_8x_vec_forward):
 +	/* Load first and last 4 * VEC to support overlapping addresses.
 +	 */
 +
 +	/* First vec was already loaded into VEC(0).  */
 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
 +	/* Save beginning of dst.  */
 +	movq	%rdi, %rcx
 +	/* Align dst to VEC_SIZE - 1.  */
 +	orq	$(VEC_SIZE - 1), %rdi
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
 -	/* Save start and stop of the destination buffer.  */
 -	movq	%rdi, %r11
 -	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
 -	/* Align destination for aligned stores in the loop.  Compute
 -	   how much destination is misaligned.  */
 -	movq	%rdi, %r8
 -	andq	$(VEC_SIZE - 1), %r8
 -	/* Get the negative of offset for alignment.  */
 -	subq	$VEC_SIZE, %r8
 -	/* Adjust source.  */
 -	subq	%r8, %rsi
 -	/* Adjust destination which should be aligned now.  */
 -	subq	%r8, %rdi
 -	/* Adjust length.  */
 -	addq	%r8, %rdx
 -	.p2align 4
 +	/* Subtract dst from src. Add back after dst aligned.  */
 +	subq	%rcx, %rsi
 +	/* Finish aligning dst.  */
 +	incq	%rdi
 +	/* Restore src adjusted with new value for aligned dst.  */
 +	addq	%rdi, %rsi
 +	/* Store end of buffer minus tail in rdx.  */
 +	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
 +
 +	/* Don't use multi-byte nop to align.  */
 +	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
 -	VMOVU	(%rsi), %VEC(0)
 -	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 -	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 -	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 +	VMOVU	(%rsi), %VEC(1)
 +	VMOVU	VEC_SIZE(%rsi), %VEC(2)
 +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
 +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
 	subq	$-(VEC_SIZE * 4), %rsi
 -	addq	$-(VEC_SIZE * 4), %rdx
 -	VMOVA	%VEC(0), (%rdi)
 -	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 -	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 -	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
 +	VMOVA	%VEC(1), (%rdi)
 +	VMOVA	%VEC(2), VEC_SIZE(%rdi)
 +	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
 +	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
 -	cmpq	$(VEC_SIZE * 4), %rdx
 +	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
 -	VMOVU	%VEC(5), (%rcx)
 -	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
 -	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
 -	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
 +	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
 +	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
 +	VMOVU	%VEC(7), VEC_SIZE(%rdx)
 +	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
 -	VMOVU	%VEC(4), (%r11)
 +	VMOVU	%VEC(0), (%rcx)
 +	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
 +	 */
 +L(nop_backward):
 	VZEROUPPER_RETURN
 +	.p2align 4,, 8
 +L(more_8x_vec_backward_check_nop):
 +	/* rcx contains dst - src. Test for dst == src to skip all of
 +	   memmove.  */
 +	testq	%rcx, %rcx
 +	jz	L(nop_backward)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
 -	VMOVU	(%rsi), %VEC(4)
 +
 +	/* First vec was also loaded into VEC(0).  */
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
 +	/* Beginning of region for 4x backward copy stored in rcx.  */
 +	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
 -	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
 -	/* Save stop of the destination buffer.  */
 -	leaq	-VEC_SIZE(%rdi, %rdx), %r11
 -	/* Align destination end for aligned stores in the loop.  Compute
 -	   how much destination end is misaligned.  */
 -	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
 -	movq	%r11, %r9
 -	movq	%r11, %r8
 -	andq	$(VEC_SIZE - 1), %r8
 -	/* Adjust source.  */
 -	subq	%r8, %rcx
 -	/* Adjust the end of destination which should be aligned now.  */
 -	subq	%r8, %r9
 -	/* Adjust length.  */
 -	subq	%r8, %rdx
 -
 -	.p2align 4
 +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
 +	/* Subtract dst from src. Add back after dst aligned.  */
 +	subq	%rdi, %rsi
 +	/* Align dst.  */
 +	andq	$-(VEC_SIZE), %rcx
 +	/* Restore src.  */
 +	addq	%rcx, %rsi
 +
 +	/* Don't use multi-byte nop to align.  */
 +	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
 -	VMOVU	(%rcx), %VEC(0)
 -	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
 -	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
 -	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
 -	addq	$-(VEC_SIZE * 4), %rcx
 -	addq	$-(VEC_SIZE * 4), %rdx
 -	VMOVA	%VEC(0), (%r9)
 -	VMOVA	%VEC(1), -VEC_SIZE(%r9)
 -	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
 -	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
 -	addq	$-(VEC_SIZE * 4), %r9
 -	cmpq	$(VEC_SIZE * 4), %rdx
 -	ja	L(loop_4x_vec_backward)
 +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
 +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 +	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
 +	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
 +	addq	$(VEC_SIZE * -4), %rsi
 +	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
 +	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
 +	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
 +	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
 +	addq	$(VEC_SIZE * -4), %rcx
 +	cmpq	%rcx, %rdi
 +	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
 -	VMOVU	%VEC(4), (%rdi)
 +	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
 -	VMOVU	%VEC(8), (%r11)
 +	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
 +	VZEROUPPER_RETURN
 +
 +#if defined USE_MULTIARCH && IS_IN (libc)
 +	/* L(skip_short_movsb_check) is only used with ERMS. Not for
 +	   FSRM.  */
 +	.p2align 5,, 16
 +# if ALIGN_MOVSB
 +L(skip_short_movsb_check):
 +#  if MOVSB_ALIGN_TO > VEC_SIZE
 +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 +#  endif
 +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 +#   error Unsupported MOVSB_ALIGN_TO
 +#  endif
 +	/* If CPU does not have FSRM two options for aligning. Align src
 +	   if dst and src 4k alias. Otherwise align dst.  */
 +	testl	$(PAGE_SIZE - 512), %ecx
 +	jnz	L(movsb_align_dst)
 +	/* Fall through. dst and src 4k alias. It's better to align src
 +	   here because the bottleneck will be loads due to the false
 +	   dependency on dst.  */
 +
 +	/* rcx already has dst - src.  */
 +	movq	%rcx, %r9
 +	/* Add src to len. Subtract back after src aligned. -1 because
 +	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
 +	leaq	-1(%rsi, %rdx), %rcx
 +	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
 +	orq	$(MOVSB_ALIGN_TO - 1), %rsi
 +	/* Restore dst and len adjusted with new values for aligned src.
 +	 */
 +	leaq	1(%rsi, %r9), %rdi
 +	subq	%rsi, %rcx
 +	/* Finish aligning src.  */
 +	incq	%rsi
 +
 +	rep	movsb
 +
 +	VMOVU	%VEC(0), (%r8)
 +#  if MOVSB_ALIGN_TO > VEC_SIZE
 +	VMOVU	%VEC(1), VEC_SIZE(%r8)
 +#  endif
 	VZEROUPPER_RETURN
 +# endif
 +
 +	.p2align 4,, 12
 +L(movsb):
 +	movq	%rdi, %rcx
 +	subq	%rsi, %rcx
 +	/* Go to backwards temporal copy if overlap no matter what as
 +	   backward REP MOVSB is slow and we don't want to use NT stores if
 +	   there is overlap.  */
 +	cmpq	%rdx, %rcx
 +	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
 +	jb	L(more_8x_vec_backward_check_nop)
 +# if ALIGN_MOVSB
 +	/* Save dest for storing aligning VECs later.  */
 +	movq	%rdi, %r8
 +# endif
 +	/* If above __x86_rep_movsb_stop_threshold most likely is
 +	   candidate for NT moves as well.  */
 +	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
 +	jae	L(large_memcpy_2x_check)
 +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
 +	/* Only avoid short movsb if CPU has FSRM.  */
 +	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
 +	jz	L(skip_short_movsb_check)
 +#  if AVOID_SHORT_DISTANCE_REP_MOVSB
 +	/* Avoid "rep movsb" if RCX, the distance between source and
 +	   destination, is N*4GB + [1..63] with N >= 0.  */
 +
 +	/* ecx contains dst - src. Early check for backward copy
 +	   conditions means only case of slow movsb with src = dst + [0,
 +	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
 +	   for that case.  */
 +	cmpl	$-64, %ecx
 +	ja	L(more_8x_vec_forward)
 +#  endif
 +# endif
 +# if ALIGN_MOVSB
 +#  if MOVSB_ALIGN_TO > VEC_SIZE
 +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 +#  endif
 +#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
 +#   error Unsupported MOVSB_ALIGN_TO
 +#  endif
 +	/* Fall through means cpu has FSRM. In that case exclusively
 +	   align destination.  */
 +L(movsb_align_dst):
 +	/* Subtract dst from src. Add back after dst aligned.  */
 +	subq	%rdi, %rsi
 +	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
 +	addq	$(MOVSB_ALIGN_TO - 1), %rdi
 +	/* Add dst to len. Subtract back after dst aligned.  */
 +	leaq	(%r8, %rdx), %rcx
 +	/* Finish aligning dst.  */
 +	andq	$-(MOVSB_ALIGN_TO), %rdi
 +	/* Restore src and len adjusted with new values for aligned dst.
 +	 */
 +	addq	%rdi, %rsi
 +	subq	%rdi, %rcx
 +
 +	rep	movsb
 +
 +	/* Store VECs loaded for aligning.  */
 +	VMOVU	%VEC(0), (%r8)
 +#  if MOVSB_ALIGN_TO > VEC_SIZE
 +	VMOVU	%VEC(1), VEC_SIZE(%r8)
 +#  endif
 +	VZEROUPPER_RETURN
 +# else	/* !ALIGN_MOVSB.  */
 +L(skip_short_movsb_check):
 +	mov	%RDX_LP, %RCX_LP
 +	rep	movsb
 +	ret
 +# endif
 +#endif
 +	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 -	.p2align 4
 +L(large_memcpy_2x_check):
 +	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 +	jb	L(more_8x_vec_check)
 L(large_memcpy_2x):
 -	/* Compute absolute value of difference between source and
 -	   destination.  */
 -	movq	%rdi, %r9
 -	subq	%rsi, %r9
 -	movq	%r9, %r8
 -	leaq	-1(%r9), %rcx
 -	sarq	$63, %r8
 -	xorq	%r8, %r9
 -	subq	%r8, %r9
 -	/* Don't use non-temporal store if there is overlap between
 -	   destination and source since destination may be in cache when
 -	   source is loaded.  */
 -	cmpq	%r9, %rdx
 -	ja	L(more_8x_vec_check)
 +	/* To reach this point it is impossible for dst > src and
 +	   overlap. Remaining to check is src > dst and overlap. rcx
 +	   already contains dst - src. Negate rcx to get src - dst. If
 +	   length > rcx then there is overlap and forward copy is best.  */
 +	negq	%rcx
 +	cmpq	%rcx, %rdx
 +	ja	L(more_8x_vec_forward)
 	/* Cache align destination. First store the first 64 bytes then
 	   adjust alignments.  */
 -	VMOVU	(%rsi), %VEC(8)
 -#if VEC_SIZE < 64
 -	VMOVU	VEC_SIZE(%rsi), %VEC(9)
 -#if VEC_SIZE < 32
 -	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
 -	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
 -#endif
 -#endif
 -	VMOVU	%VEC(8), (%rdi)
 -#if VEC_SIZE < 64
 -	VMOVU	%VEC(9), VEC_SIZE(%rdi)
 -#if VEC_SIZE < 32
 -	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
 -	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
 -#endif
 -#endif
 +
 +	/* First vec was also loaded into VEC(0).  */
 +# if VEC_SIZE < 64
 +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 +#  if VEC_SIZE < 32
 +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 +#  endif
 +# endif
 +	VMOVU	%VEC(0), (%rdi)
 +# if VEC_SIZE < 64
 +	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 +#  if VEC_SIZE < 32
 +	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 +	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
 +#  endif
 +# endif
 +
 	/* Adjust source, destination, and size.  */
 	movq	%rdi, %r8
 	andq	$63, %r8
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
 	/* Adjust length.  */
 	addq	%r8, %rdx
 -	/* Test if source and destination addresses will alias. If they do
 -	   the larger pipeline in large_memcpy_4x alleviated the
 +	/* Test if source and destination addresses will alias. If they
 +	   do the larger pipeline in large_memcpy_4x alleviated the
 	   performance drop.  */
 +
 +	/* ecx contains -(dst - src). not ecx will return dst - src - 1
 +	   which works for testing aliasing.  */
 +	notl	%ecx
 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 	jz	L(large_memcpy_4x)
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
 	/* ecx stores inner loop counter.  */
 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 L(loop_large_memcpy_4x_inner):
 -	/* Only one prefetch set per page as doing 4 pages give more time
 -	   for prefetcher to keep up.  */
 +	/* Only one prefetch set per page as doing 4 pages give more
 +	   time for prefetcher to keep up.  */
 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
--- a/glibc-upstream-2.34-182.patch
+++ b/glibc-upstream-2.34-182.patch
@ -0,0 +1,131 @@
 commit cecbac52123456e2fbcff062a4165bf7b9174797
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Mon Nov 1 00:49:52 2021 -0500
    x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
    No bug.
    This patch doubles the rep_movsb_threshold when using ERMS. Based on
    benchmarks the vector copy loop, especially now that it handles 4k
    aliasing, is better for these medium ranges.
    On Skylake with ERMS:
    Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
    4096,   0,      0,      0,      0.975
    4096,   0,      0,      1,      0.953
    4096,   12,     0,      0,      0.969
    4096,   12,     0,      1,      0.872
    4096,   44,     0,      0,      0.979
    4096,   44,     0,      1,      0.83
    4096,   0,      12,     0,      1.006
    4096,   0,      12,     1,      0.989
    4096,   0,      44,     0,      0.739
    4096,   0,      44,     1,      0.942
    4096,   12,     12,     0,      1.009
    4096,   12,     12,     1,      0.973
    4096,   44,     44,     0,      0.791
    4096,   44,     44,     1,      0.961
    4096,   2048,   0,      0,      0.978
    4096,   2048,   0,      1,      0.951
    4096,   2060,   0,      0,      0.986
    4096,   2060,   0,      1,      0.963
    4096,   2048,   12,     0,      0.971
    4096,   2048,   12,     1,      0.941
    4096,   2060,   12,     0,      0.977
    4096,   2060,   12,     1,      0.949
    8192,   0,      0,      0,      0.85
    8192,   0,      0,      1,      0.845
    8192,   13,     0,      0,      0.937
    8192,   13,     0,      1,      0.939
    8192,   45,     0,      0,      0.932
    8192,   45,     0,      1,      0.927
    8192,   0,      13,     0,      0.621
    8192,   0,      13,     1,      0.62
    8192,   0,      45,     0,      0.53
    8192,   0,      45,     1,      0.516
    8192,   13,     13,     0,      0.664
    8192,   13,     13,     1,      0.659
    8192,   45,     45,     0,      0.593
    8192,   45,     45,     1,      0.575
    8192,   2048,   0,      0,      0.854
    8192,   2048,   0,      1,      0.834
    8192,   2061,   0,      0,      0.863
    8192,   2061,   0,      1,      0.857
    8192,   2048,   13,     0,      0.63
    8192,   2048,   13,     1,      0.629
    8192,   2061,   13,     0,      0.627
    8192,   2061,   13,     1,      0.62
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
 diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
 index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
 --- a/sysdeps/x86/dl-cacheinfo.h
 +++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
 #endif
 -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
 +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
 +     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
 +     threshold is 2048 * (VEC_SIZE / 16).  */
   unsigned int rep_movsb_threshold;
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
 -      rep_movsb_threshold = 2048 * (64 / 16);
 +      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
 -      rep_movsb_threshold = 2048 * (32 / 16);
 +      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
 diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
 index dd6e1d65c9490d4f..419313804d49cf65 100644
 --- a/sysdeps/x86/dl-tunables.list
 +++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
     }
     x86_rep_movsb_threshold {
       type: SIZE_T
 -      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
 -      # isn't faster on short data.  The memcpy micro benchmark in glibc
 -      # shows that 2KB is the approximate value above which REP MOVSB
 -      # becomes faster than SSE2 optimization on processors with Enhanced
 -      # REP MOVSB.  Since larger register size can move more data with a
 -      # single load and store, the threshold is higher with larger register
 -      # size.  Note: Since the REP MOVSB threshold must be greater than 8
 -      # times of vector size and the default value is 2048 * (vector size
 -      # / 16), the default value and the minimum value must be updated at
 -      # run-time.  NB: Don't set the default value since we can't tell if
 -      # the tunable value is set by user or not [BZ #27069].
 +      # Since there is overhead to set up REP MOVSB operation, REP
 +      # MOVSB isn't faster on short data.  The memcpy micro benchmark
 +      # in glibc shows that 2KB is the approximate value above which
 +      # REP MOVSB becomes faster than SSE2 optimization on processors
 +      # with Enhanced REP MOVSB.  Since larger register size can move
 +      # more data with a single load and store, the threshold is
 +      # higher with larger register size.  Micro benchmarks show AVX
 +      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
 +      # threshold is extrapolated to 16KB.  For machines with FSRM the
 +      # threshold is universally set at 2112 bytes.  Note: Since the
 +      # REP MOVSB threshold must be greater than 8 times of vector
 +      # size and the default value is 4096 * (vector size / 16), the
 +      # default value and the minimum value must be updated at
 +      # run-time.  NB: Don't set the default value since we can't tell
 +      # if the tunable value is set by user or not [BZ #27069].
       minval: 1
     }
     x86_rep_stosb_threshold {
--- a/glibc-upstream-2.34-183.patch
+++ b/glibc-upstream-2.34-183.patch
--- a/glibc-upstream-2.34-184.patch
+++ b/glibc-upstream-2.34-184.patch
@ -0,0 +1,104 @@
 commit 4bbd0f866ad0ff197f72346f776ebee9b7e1a706
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Fri Dec 3 15:29:25 2021 -0800
    x86-64: Use notl in EVEX strcmp [BZ #28646]
    Must use notl %edi here as lower bits are for CHAR comparisons
    potentially out of range thus can be 0 without indicating mismatch.
    This fixes BZ #28646.
    Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02)
 diff --git a/string/test-strcmp.c b/string/test-strcmp.c
 index 7feababf4ddc5603..a0255b9625fbcedd 100644
 --- a/string/test-strcmp.c
 +++ b/string/test-strcmp.c
@@ -25,6 +25,7 @@
 # define TEST_NAME "strcmp"
 #endif
 #include "test-string.h"
 +#include <support/test-driver.h>
 #ifdef WIDE
 # include <wchar.h>
@@ -392,6 +393,32 @@ check2 (void)
 	}
 }
 +static void
 +check3 (void)
 +{
 +  size_t size = 0xd000 + 0x4000;
 +  CHAR *s1, *s2;
 +  CHAR *buffer1 = mmap (NULL, size, PROT_READ | PROT_WRITE,
 +			MAP_PRIVATE | MAP_ANON, -1, 0);
 +  CHAR *buffer2 = mmap (NULL, size, PROT_READ | PROT_WRITE,
 +			MAP_PRIVATE | MAP_ANON, -1, 0);
 +  if (buffer1 == MAP_FAILED || buffer2 == MAP_FAILED)
 +    error (EXIT_UNSUPPORTED, errno, "mmap failed");
 +
 +  s1 = (CHAR *) (buffer1 + 0x8f8 / sizeof (CHAR));
 +  s2 = (CHAR *) (buffer2 + 0xcff3 / sizeof (CHAR));
 +
 +  STRCPY(s1, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/util/PathDocFileFactory.java"));
 +  STRCPY(s2, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/taglets/ThrowsTaglet.java"));
 +
 +  int exp_result = SIMPLE_STRCMP (s1, s2);
 +  FOR_EACH_IMPL (impl, 0)
 +    check_result (impl, s1, s2, exp_result);
 +
 +  munmap ((void *) buffer1, size);
 +  munmap ((void *) buffer2, size);
 +}
 +
 int
 test_main (void)
 {
@@ -400,6 +427,7 @@ test_main (void)
   test_init ();
   check();
   check2 ();
 +  check3 ();
   printf ("%23s", "");
   FOR_EACH_IMPL (impl, 0)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
 index 82f12ac89bcae20b..6f5c4bf984da2b80 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -656,12 +656,13 @@ L(loop_cross_page):
 	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
 	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
 	kmovd	%k3, %edi
 +	/* Must use notl %edi here as lower bits are for CHAR
 +	   comparisons potentially out of range thus can be 0 without
 +	   indicating mismatch.  */
 +	notl	%edi
 # ifdef USE_AS_WCSCMP
 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
 -	notl	%edi
 	andl	$0xff, %edi
 -# else
 -	incl	%edi
 # endif
 # ifdef USE_AS_WCSCMP
@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
 	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
 	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
 	kmovd	%k3, %edi
 +	/* Must use notl %edi here as lower bits are for CHAR
 +	   comparisons potentially out of range thus can be 0 without
 +	   indicating mismatch.  */
 +	notl	%edi
 # ifdef USE_AS_WCSCMP
 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
 -	notl	%edi
 	andl	$0xff, %edi
 -# else
 -	incl	%edi
 # endif
 # ifdef USE_AS_WCSCMP
--- a/glibc-upstream-2.34-185.patch
+++ b/glibc-upstream-2.34-185.patch
@ -0,0 +1,30 @@
 commit f3a99b2216114f89b20329ae7664b764248b4bbd
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Mon Dec 6 07:14:12 2021 -0800
    x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
    Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
    they won't lower CPU frequency when ZMM load and store instructions are
    used.
    (cherry picked from commit ceeffe968c01b1202e482f4855cb6baf5c6cb713)
 diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
 index f4d4049e391cbabd..09590d8794b1c6fb 100644
 --- a/sysdeps/x86/cpu-features.c
 +++ b/sysdeps/x86/cpu-features.c
@@ -566,8 +566,11 @@ disable_tsx:
 	  |= bit_arch_Prefer_No_VZEROUPPER;
       else
 	{
 -	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
 -	    |= bit_arch_Prefer_No_AVX512;
 +	  /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
 +	     when ZMM load and store instructions are used.  */
 +	  if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
 +	    cpu_features->preferred[index_arch_Prefer_No_AVX512]
 +	      |= bit_arch_Prefer_No_AVX512;
 	  /* Avoid RTM abort triggered by VZEROUPPER inside a
 	     transactionally executing RTM region.  */
--- a/glibc-upstream-2.34-186.patch
+++ b/glibc-upstream-2.34-186.patch
@ -0,0 +1,384 @@
 commit c796418d00f65c8c5fbed477f3ba6da2bee64ece
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Fri Dec 24 18:54:41 2021 -0600
    x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
    No bug.
    Optimizations are twofold.
    1) Replace page cross and 0/1 checks with masked load instructions in
       L(less_vec). In applications this reduces branch-misses in the
       hot [0, 32] case.
    2) Change controlflow so that L(less_vec) case gets the fall through.
    Change 2) helps copies in the [0, 32] size range but comes at the cost
    of copies in the [33, 64] size range.  From profiles of GCC and
    Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
    appears to be the right tradeoff.
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit abddd61de090ae84e380aff68a98bd94ef704667)
 diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 index 640f6757fac8a356..d2899e7c7078cd41 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -62,15 +62,18 @@ Latency:
 # define VMOVU		vmovdqu64
 # ifdef USE_AS_WMEMCMP
 +#  define VMOVU_MASK	vmovdqu32
 #  define CHAR_SIZE	4
 #  define VPCMP	vpcmpd
 #  define VPTEST	vptestmd
 # else
 +#  define VMOVU_MASK	vmovdqu8
 #  define CHAR_SIZE	1
 #  define VPCMP	vpcmpub
 #  define VPTEST	vptestmb
 # endif
 +
 # define VEC_SIZE	32
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 	movl	%edx, %edx
 # endif
 	cmp	$CHAR_PER_VEC, %RDX_LP
 -	jb	L(less_vec)
 +	/* Fall through for [0, VEC_SIZE] as it's the hottest.  */
 +	ja	L(more_1x_vec)
 +
 +	/* Create mask for CHAR's we want to compare. This allows us to
 +	   avoid having to include page cross logic.  */
 +	movl	$-1, %ecx
 +	bzhil	%edx, %ecx, %ecx
 +	kmovd	%ecx, %k2
 +
 +	/* Safe to load full ymm with mask.  */
 +	VMOVU_MASK (%rsi), %YMM2{%k2}
 +	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_0)
 +	ret
 +	.p2align 4
 +L(return_vec_0):
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
 +	/* NB: no partial register stall here because xorl zero idiom
 +	   above.  */
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	(%rsi, %rax), %ecx
 +	movzbl	(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 +	ret
 +
 +
 +	.p2align 4
 +L(more_1x_vec):
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %YMM1
 	/* Use compare not equals to directly check for mismatch.  */
 -	VPCMP	$4, (%rdi), %YMM1, %k1
 +	VPCMP	$4,(%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
 	   L(return_vec_[0,2]). For L(return_vec_3) destination register
@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 	/* Check third and fourth VEC no matter what.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 -	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
 +	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 -	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
 +	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
 	   oring with YMM1. Result is stored in YMM4.  */
 -	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 +	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 	/* NB: eax must be zero to reach here.  */
 	ret
 -	.p2align 4
 +
 +	.p2align 4,, 8
 L(8x_end_return_vec_0_1_2_3):
 	movq	%rdx, %rdi
 L(8x_return_vec_0_1_2_3):
@@ -222,23 +262,6 @@ L(return_vec_3):
 # endif
 	ret
 -	.p2align 4
 -L(return_vec_0):
 -	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCMP
 -	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 -	xorl	%edx, %edx
 -	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
 -	/* NB: no partial register stall here because xorl zero idiom
 -	   above.  */
 -	setg	%dl
 -	leal	-1(%rdx, %rdx), %eax
 -# else
 -	movzbl	(%rsi, %rax), %ecx
 -	movzbl	(%rdi, %rax), %eax
 -	subl	%ecx, %eax
 -# endif
 -	ret
 	.p2align 4
 L(return_vec_1):
@@ -297,7 +320,7 @@ L(loop_4x_vec):
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
 -	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 +	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
@@ -324,7 +347,7 @@ L(loop_4x_vec):
 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
 -	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
 +	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
@@ -336,14 +359,14 @@ L(loop_4x_vec):
 	/* Only entry is from L(more_8x_vec).  */
 	.p2align 4,, 10
 L(8x_last_2x_vec):
 -	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 +	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(8x_return_vec_2)
 	/* Naturally aligned to 16 bytes.  */
 L(8x_last_1x_vec):
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
 -	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
 +	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(8x_return_vec_3)
@@ -392,7 +415,9 @@ L(last_1x_vec):
 	jnz	L(return_vec_0_end)
 	ret
 -	.p2align 4,, 10
 +
 +	/* Don't align. Takes 2-fetch blocks either way and aligning
 +	   will cause code to spill into another cacheline.  */
 L(return_vec_1_end):
 	/* Use bsf to save code size. This is necessary to have
 	   L(one_or_less) fit in aligning bytes between.  */
@@ -411,31 +436,8 @@ L(return_vec_1_end):
 # endif
 	ret
 -	/* NB: L(one_or_less) fits in alignment padding between
 -	   L(return_vec_1_end) and L(return_vec_0_end).  */
 -# ifdef USE_AS_WMEMCMP
 -L(one_or_less):
 -	jb	L(zero)
 -	movl	(%rdi), %ecx
 -	xorl	%edx, %edx
 -	cmpl	(%rsi), %ecx
 -	je	L(zero)
 -	setg	%dl
 -	leal	-1(%rdx, %rdx), %eax
 -	ret
 -# else
 -L(one_or_less):
 -	jb	L(zero)
 -	movzbl	(%rsi), %ecx
 -	movzbl	(%rdi), %eax
 -	subl	%ecx, %eax
 -	ret
 -# endif
 -L(zero):
 -	xorl	%eax, %eax
 -	ret
 -
 -	.p2align 4
 +	/* Don't align. Takes 2-fetch blocks either way and aligning
 +	   will cause code to spill into another cacheline.  */
 L(return_vec_0_end):
 	tzcntl	%eax, %eax
 	addl	%edx, %eax
@@ -451,146 +453,7 @@ L(return_vec_0_end):
 	subl	%ecx, %eax
 # endif
 	ret
 +	/* 1-byte until next cache line.  */
 -	.p2align 4
 -L(less_vec):
 -	/* Check if one or less CHAR. This is necessary for size == 0
 -	   but is also faster for size == CHAR_SIZE.  */
 -	cmpl	$1, %edx
 -	jbe	L(one_or_less)
 -
 -	/* Check if loading one VEC from either s1 or s2 could cause a
 -	   page cross. This can have false positives but is by far the
 -	   fastest method.  */
 -	movl	%edi, %eax
 -	orl	%esi, %eax
 -	andl	$(PAGE_SIZE - 1), %eax
 -	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 -	jg	L(page_cross_less_vec)
 -
 -	/* No page cross possible.  */
 -	VMOVU	(%rsi), %YMM2
 -	VPCMP	$4, (%rdi), %YMM2, %k1
 -	kmovd	%k1, %eax
 -	/* Check if any matches where in bounds. Intentionally not
 -	   storing result in eax to limit dependency chain if it goes to
 -	   L(return_vec_0_lv).  */
 -	bzhil	%edx, %eax, %edx
 -	jnz	L(return_vec_0_lv)
 -	xorl	%eax, %eax
 -	ret
 -
 -	/* Essentially duplicate of L(return_vec_0). Ends up not costing
 -	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
 -	   the jump and ends up fitting in aligning bytes. As well fits on
 -	   same cache line as L(less_vec) so also saves a line from having
 -	   to be fetched on cold calls to memcmp.  */
 -	.p2align 4,, 4
 -L(return_vec_0_lv):
 -	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCMP
 -	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 -	xorl	%edx, %edx
 -	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
 -	/* NB: no partial register stall here because xorl zero idiom
 -	   above.  */
 -	setg	%dl
 -	leal	-1(%rdx, %rdx), %eax
 -# else
 -	movzbl	(%rsi, %rax), %ecx
 -	movzbl	(%rdi, %rax), %eax
 -	subl	%ecx, %eax
 -# endif
 -	ret
 -
 -	.p2align 4
 -L(page_cross_less_vec):
 -	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
 -	   bytes.  */
 -	cmpl	$(16 / CHAR_SIZE), %edx
 -	jae	L(between_16_31)
 -# ifndef USE_AS_WMEMCMP
 -	cmpl	$8, %edx
 -	jae	L(between_8_15)
 -	cmpl	$4, %edx
 -	jb	L(between_2_3)
 -
 -	/* Load as big endian with overlapping movbe to avoid branches.
 -	 */
 -	movbe	(%rdi), %eax
 -	movbe	(%rsi), %ecx
 -	shlq	$32, %rax
 -	shlq	$32, %rcx
 -	movbe	-4(%rdi, %rdx), %edi
 -	movbe	-4(%rsi, %rdx), %esi
 -	orq	%rdi, %rax
 -	orq	%rsi, %rcx
 -	subq	%rcx, %rax
 -	/* edx is guranteed to be positive int32 in range [4, 7].  */
 -	cmovne	%edx, %eax
 -	/* ecx is -1 if rcx > rax. Otherwise 0.  */
 -	sbbl	%ecx, %ecx
 -	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
 -	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
 -	   eax doesn't matter.  */
 -	orl	%ecx, %eax
 -	ret
 -
 -	.p2align 4,, 8
 -L(between_8_15):
 -# endif
 -	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 -	vmovq	(%rdi), %xmm1
 -	vmovq	(%rsi), %xmm2
 -	VPCMP	$4, %xmm1, %xmm2, %k1
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -	jnz	L(return_vec_0_lv)
 -	/* Use overlapping loads to avoid branches.  */
 -	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
 -	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
 -	VPCMP	$4, %xmm1, %xmm2, %k1
 -	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -	jnz	L(return_vec_0_end)
 -	ret
 -
 -	.p2align 4,, 8
 -L(between_16_31):
 -	/* From 16 to 31 bytes.  No branch when size == 16.  */
 -
 -	/* Use movups to save code size.  */
 -	vmovdqu	(%rsi), %xmm2
 -	VPCMP	$4, (%rdi), %xmm2, %k1
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -	jnz	L(return_vec_0_lv)
 -	/* Use overlapping loads to avoid branches.  */
 -	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 -	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
 -	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -	jnz	L(return_vec_0_end)
 -	ret
 -
 -# ifndef USE_AS_WMEMCMP
 -L(between_2_3):
 -	/* Load as big endian to avoid branches.  */
 -	movzwl	(%rdi), %eax
 -	movzwl	(%rsi), %ecx
 -	shll	$8, %eax
 -	shll	$8, %ecx
 -	bswap	%eax
 -	bswap	%ecx
 -	movzbl	-1(%rdi, %rdx), %edi
 -	movzbl	-1(%rsi, %rdx), %esi
 -	orl	%edi, %eax
 -	orl	%esi, %ecx
 -	/* Subtraction is okay because the upper 8 bits are zero.  */
 -	subl	%ecx, %eax
 -	ret
 -# endif
 END (MEMCMP)
 #endif
--- a/glibc-upstream-2.34-187.patch
+++ b/glibc-upstream-2.34-187.patch
@ -0,0 +1,42 @@
 commit 9681691402052b727e01ae3375c73e0f76566593
 Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
 Date:   Wed Apr 27 13:59:26 2022 -0300
    linux: Fix missing internal 64 bit time_t stat usage
    These are two missing spots initially done by 52a5fe70a2c77935.
    Checked on i686-linux-gnu.
    (cherry picked from commit 834ddd0432f68d6dc85b6aac95065721af0d86e9)
 diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c
 index 13160d32499c4e58..00e4ce7f80ee2dfe 100644
 --- a/sysdeps/unix/sysv/linux/faccessat.c
 +++ b/sysdeps/unix/sysv/linux/faccessat.c
@@ -39,8 +39,8 @@ __faccessat (int fd, const char *file, int mode, int flag)
   if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
     return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
 -  struct stat64 stats;
 -  if (__fstatat64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
 +  struct __stat64_t64 stats;
 +  if (__fstatat64_time64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
     return -1;
   mode &= (X_OK | W_OK | R_OK);	/* Clear any bogus bits. */
 diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c
 index b599a66c930cad4d..f79930303118ebcd 100644
 --- a/sysdeps/unix/sysv/linux/pathconf.c
 +++ b/sysdeps/unix/sysv/linux/pathconf.c
@@ -110,8 +110,8 @@ distinguish_extX (const struct statfs *fsbuf, const char *file, int fd)
 	      && strcmp (mntbuf.mnt_type, "ext4") != 0)
 	    continue;
 -	  struct stat64 fsst;
 -	  if (__stat64 (mntbuf.mnt_dir, &fsst) >= 0
 +	  struct __stat64_t64 fsst;
 +	  if (__stat64_time64 (mntbuf.mnt_dir, &fsst) >= 0
 	      && st.st_dev == fsst.st_dev)
 	    {
 	      if (strcmp (mntbuf.mnt_type, "ext4") == 0)
--- a/glibc-upstream-2.34-188.patch
+++ b/glibc-upstream-2.34-188.patch
@ -0,0 +1,39 @@
 commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe
 Author: Carlos O'Donell <carlos@redhat.com>
 Date:   Tue Apr 26 10:52:41 2022 -0400
    i386: Regenerate ulps
    These failures were caught while building glibc master for Fedora
    Rawhide which is built with '-mtune=generic -msse2 -mfpmath=sse'
    using gcc 11.3 (gcc-11.3.1-2.fc35) on a Cascadelake Intel Xeon
    processor.
    (cherry picked from commit e465d97653311c3687aee49de782177353acfe86)
 diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
 index 7601049110789201..84e6686eba5fe79a 100644
 --- a/sysdeps/i386/fpu/libm-test-ulps
 +++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -668,7 +668,7 @@ ldouble: 4
 Function: Imaginary part of "clog10":
 double: 2
 -float: 1
 +float: 2
 float128: 2
 ldouble: 2
 diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
 index a39c89cec1141935..cc21e6907fe8b6a3 100644
 --- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
 +++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -668,7 +668,7 @@ ldouble: 4
 Function: Imaginary part of "clog10":
 double: 2
 -float: 1
 +float: 2
 float128: 2
 ldouble: 2
--- a/glibc.spec
+++ b/glibc.spec
@ -148,7 +148,7 @@ end \
 Summary: The GNU libc libraries
 Name: glibc
 Version: %{glibcversion}
-Release: 30%{?dist}
+Release: 31%{?dist}
 # In general, GPLv2+ is used by programs, LGPLv2+ is used for
 # libraries.
@ -379,17 +379,17 @@ Patch175: glibc-rh2058224-2.patch
 Patch176: glibc-rh2058230.patch
 Patch177: glibc-rh2054789.patch
 Patch178: glibc-upstream-2.34-108.patch
-Patch179: glibc-upstream-2.34-110.patch
 # glibc-2.34-109-gd64b08d5ba only changes NEWS.
+Patch179: glibc-upstream-2.34-110.patch
 Patch180: glibc-upstream-2.34-111.patch
 Patch181: glibc-upstream-2.34-112.patch
 Patch182: glibc-upstream-2.34-113.patch
 Patch183: glibc-upstream-2.34-114.patch
+# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
+# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
 Patch184: glibc-upstream-2.34-117.patch
 Patch185: glibc-upstream-2.34-118.patch
 Patch186: glibc-upstream-2.34-119.patch
-# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
-# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
 Patch187: glibc-upstream-2.34-120.patch
 Patch188: glibc-upstream-2.34-121.patch
 Patch189: glibc-upstream-2.34-122.patch
@ -437,6 +437,28 @@ Patch229: glibc-upstream-2.34-163.patch
 Patch230: glibc-upstream-2.34-164.patch
 Patch231: glibc-upstream-2.34-165.patch
 Patch232: glibc-upstream-2.34-166.patch
 Patch233: glibc-upstream-2.34-167.patch
 Patch234: glibc-upstream-2.34-168.patch
 Patch235: glibc-upstream-2.34-169.patch
 Patch236: glibc-upstream-2.34-170.patch
 Patch237: glibc-upstream-2.34-171.patch
 Patch238: glibc-upstream-2.34-172.patch
 Patch239: glibc-upstream-2.34-173.patch
 Patch240: glibc-upstream-2.34-174.patch
 Patch241: glibc-upstream-2.34-175.patch
 Patch242: glibc-upstream-2.34-176.patch
 Patch243: glibc-upstream-2.34-177.patch
 Patch244: glibc-upstream-2.34-178.patch
 Patch245: glibc-upstream-2.34-179.patch
 Patch246: glibc-upstream-2.34-180.patch
 Patch247: glibc-upstream-2.34-181.patch
 Patch248: glibc-upstream-2.34-182.patch
 Patch249: glibc-upstream-2.34-183.patch
 Patch250: glibc-upstream-2.34-184.patch
 Patch251: glibc-upstream-2.34-185.patch
 Patch252: glibc-upstream-2.34-186.patch
 Patch253: glibc-upstream-2.34-187.patch
 Patch254: glibc-upstream-2.34-188.patch
 ##############################################################################
 # Continued list of core "glibc" package information:
@ -2493,6 +2515,32 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 %changelog
 * Wed Apr 27 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-31
 - Sync with upstream branch release/2.34/master,
  commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe:
 - i386: Regenerate ulps
 - linux: Fix missing internal 64 bit time_t stat usage
 - x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
 - x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
 - x86-64: Use notl in EVEX strcmp [BZ #28646]
 - x86: Shrink memcmp-sse4.S code size
 - x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
 - x86: Optimize memmove-vec-unaligned-erms.S
 - x86-64: Replace movzx with movzbl
 - x86-64: Remove Prefer_AVX2_STRCMP
 - x86-64: Improve EVEX strcmp with masked load
 - x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
 - x86: Optimize memset-vec-unaligned-erms.S
 - x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
 - x86: Modify ENTRY in sysdep.h so that p2align can be specified
 - x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
 - scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
 - dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
 - INSTALL: Rephrase -with-default-link documentation
 - misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
 - Default to --with-default-link=no (bug 25812)
 - scripts: Add glibcelf.py module
 * Thu Apr 21 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-30
 - Sync with upstream branch release/2.34/master,
  commit 71326f1f2fd09dafb9c34404765fb88129e94237: