diff --git a/glibc-upstream-2.34-167.patch b/glibc-upstream-2.34-167.patch new file mode 100644 index 0000000..e00042d --- /dev/null +++ b/glibc-upstream-2.34-167.patch @@ -0,0 +1,1446 @@ +commit 3e0a91b79b409a6a4113a0fdb08221c0bb29cfce +Author: Florian Weimer +Date: Mon Apr 11 11:28:08 2022 +0200 + + scripts: Add glibcelf.py module + + Hopefully, this will lead to tests that are easier to maintain. The + current approach of parsing readelf -W output using regular expressions + is not necessarily easier than parsing the ELF data directly. + + This module is still somewhat incomplete (e.g., coverage of relocation + types and versioning information is missing), but it is sufficient to + perform basic symbol analysis or program header analysis. + + The EM_* mapping for architecture-specific constant classes (e.g., + SttX86_64) is not yet implemented. The classes are defined for the + benefit of elf/tst-glibcelf.py. + + Reviewed-by: Siddhesh Poyarekar + (cherry picked from commit 30035d67728a846fa39749cd162afd278ac654c4) + +diff --git a/elf/Makefile b/elf/Makefile +index 8e2dd91c583f9a62..8afbe3f6ab259331 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -1053,6 +1053,13 @@ CFLAGS-tst-prelink.c += -fno-pie + tst-prelink-no-pie = yes + endif + ++tests-special += $(objpfx)tst-glibcelf.out ++$(objpfx)tst-glibcelf.out: tst-glibcelf.py elf.h $(..)/scripts/glibcelf.py \ ++ $(..)/scripts/glibcextract.py ++ PYTHONPATH=$(..)scripts $(PYTHON) tst-glibcelf.py \ ++ --cc="$(CC) $(patsubst -DMODULE_NAME=%,-DMODULE_NAME=testsuite,$(CPPFLAGS))" \ ++ < /dev/null > $@ 2>&1; $(evaluate-test) ++ + # The test requires shared _and_ PIE because the executable + # unit test driver must be able to link with the shared object + # that is going to eventually go into an installed DSO. +diff --git a/elf/tst-glibcelf.py b/elf/tst-glibcelf.py +new file mode 100644 +index 0000000000000000..bf15a3bad4479e08 +--- /dev/null ++++ b/elf/tst-glibcelf.py +@@ -0,0 +1,260 @@ ++#!/usr/bin/python3 ++# Verify scripts/glibcelf.py contents against elf/elf.h. ++# Copyright (C) 2022 Free Software Foundation, Inc. ++# This file is part of the GNU C Library. ++# ++# The GNU C Library is free software; you can redistribute it and/or ++# modify it under the terms of the GNU Lesser General Public ++# License as published by the Free Software Foundation; either ++# version 2.1 of the License, or (at your option) any later version. ++# ++# The GNU C Library is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# Lesser General Public License for more details. ++# ++# You should have received a copy of the GNU Lesser General Public ++# License along with the GNU C Library; if not, see ++# . ++ ++import argparse ++import enum ++import sys ++ ++import glibcelf ++import glibcextract ++ ++errors_encountered = 0 ++ ++def error(message): ++ global errors_encountered ++ sys.stdout.write('error: {}\n'.format(message)) ++ errors_encountered += 1 ++ ++# The enum constants in glibcelf are expected to have exactly these ++# prefixes. 
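++#
++# For example, 'PT_GNU_RELRO' carries the PT_ prefix and 'SHF_ALLOC'
++# the SHF_ prefix; relocation constants such as 'R_X86_64_RELATIVE'
++# have no expected prefix because glibcelf does not cover them yet.
++# A sketch of the lookup that find_constant_prefix (below) performs:
++#
++#   >>> find_constant_prefix('PT_GNU_RELRO')
++#   'PT_'
++#   >>> find_constant_prefix('R_X86_64_RELATIVE') is None
++#   True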
++expected_constant_prefixes = tuple( ++ 'ELFCLASS ELFDATA EM_ ET_ DT_ PF_ PT_ SHF_ SHN_ SHT_ STB_ STT_'.split()) ++ ++def find_constant_prefix(name): ++ """Returns a matching prefix from expected_constant_prefixes or None.""" ++ for prefix in expected_constant_prefixes: ++ if name.startswith(prefix): ++ return prefix ++ return None ++ ++def find_enum_types(): ++ """A generator for OpenIntEnum and IntFlag classes in glibcelf.""" ++ for obj in vars(glibcelf).values(): ++ if isinstance(obj, type) and obj.__bases__[0] in ( ++ glibcelf._OpenIntEnum, enum.Enum, enum.IntFlag): ++ yield obj ++ ++def check_duplicates(): ++ """Verifies that enum types do not have duplicate values. ++ ++ Different types must have different member names, too. ++ ++ """ ++ global_seen = {} ++ for typ in find_enum_types(): ++ seen = {} ++ last = None ++ for (name, e) in typ.__members__.items(): ++ if e.value in seen: ++ error('{} has {}={} and {}={}'.format( ++ typ, seen[e.value], e.value, name, e.value)) ++ last = e ++ else: ++ seen[e.value] = name ++ if last is not None and last.value > e.value: ++ error('{} has {}={} after {}={}'.format( ++ typ, name, e.value, last.name, last.value)) ++ if name in global_seen: ++ error('{} used in {} and {}'.format( ++ name, global_seen[name], typ)) ++ else: ++ global_seen[name] = typ ++ ++def check_constant_prefixes(): ++ """Check that the constant prefixes match expected_constant_prefixes.""" ++ seen = set() ++ for typ in find_enum_types(): ++ typ_prefix = None ++ for val in typ: ++ prefix = find_constant_prefix(val.name) ++ if prefix is None: ++ error('constant {!r} for {} has unknown prefix'.format( ++ val, typ)) ++ break ++ elif typ_prefix is None: ++ typ_prefix = prefix ++ seen.add(typ_prefix) ++ elif prefix != typ_prefix: ++ error('prefix {!r} for constant {!r}, expected {!r}'.format( ++ prefix, val, typ_prefix)) ++ if typ_prefix is None: ++ error('empty enum type {}'.format(typ)) ++ ++ for prefix in sorted(set(expected_constant_prefixes) - seen): ++ error('missing constant prefix {!r}'.format(prefix)) ++ # Reverse difference is already covered inside the loop. ++ ++def find_elf_h_constants(cc): ++ """Returns a dictionary of relevant constants from .""" ++ return glibcextract.compute_macro_consts( ++ source_text='#include ', ++ cc=cc, ++ macro_re='|'.join( ++ prefix + '.*' for prefix in expected_constant_prefixes)) ++ ++# The first part of the pair is a name of an constant that is ++# dropped from glibcelf. The second part is the constant as it is ++# used in . ++glibcelf_skipped_aliases = ( ++ ('EM_ARC_A5', 'EM_ARC_COMPACT'), ++ ('PF_PARISC_SBP', 'PF_HP_SBP') ++) ++ ++# Constants that provide little value and are not included in ++# glibcelf: *LO*/*HI* range constants, *NUM constants counting the ++# number of constants. Also includes the alias names from ++# glibcelf_skipped_aliases. 
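++#
++# A rough sketch of the set construction below: the alias names from
++# the table above are merged with the literal skip list, so both
++# 'EM_ARC_A5' and e.g. 'DT_NUM' end up being skipped:
++#
++#   >>> sorted(frozenset(['EM_ARC_A5', 'PF_PARISC_SBP']) | {'DT_NUM'})
++#   ['DT_NUM', 'EM_ARC_A5', 'PF_PARISC_SBP']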
++glibcelf_skipped_constants = frozenset(
++    [e[0] for e in glibcelf_skipped_aliases]) | frozenset("""
++DT_AARCH64_NUM
++DT_ADDRNUM
++DT_ADDRRNGHI
++DT_ADDRRNGLO
++DT_ALPHA_NUM
++DT_ENCODING
++DT_EXTRANUM
++DT_HIOS
++DT_HIPROC
++DT_IA_64_NUM
++DT_LOOS
++DT_LOPROC
++DT_MIPS_NUM
++DT_NUM
++DT_PPC64_NUM
++DT_PPC_NUM
++DT_PROCNUM
++DT_SPARC_NUM
++DT_VALNUM
++DT_VALRNGHI
++DT_VALRNGLO
++DT_VERSIONTAGNUM
++ELFCLASSNUM
++ELFDATANUM
++ET_HIOS
++ET_HIPROC
++ET_LOOS
++ET_LOPROC
++ET_NUM
++PF_MASKOS
++PF_MASKPROC
++PT_HIOS
++PT_HIPROC
++PT_HISUNW
++PT_LOOS
++PT_LOPROC
++PT_LOSUNW
++SHF_MASKOS
++SHF_MASKPROC
++SHN_HIOS
++SHN_HIPROC
++SHN_HIRESERVE
++SHN_LOOS
++SHN_LOPROC
++SHN_LORESERVE
++SHT_HIOS
++SHT_HIPROC
++SHT_HISUNW
++SHT_HIUSER
++SHT_LOOS
++SHT_LOPROC
++SHT_LOSUNW
++SHT_LOUSER
++SHT_NUM
++STB_HIOS
++STB_HIPROC
++STB_LOOS
++STB_LOPROC
++STB_NUM
++STT_HIOS
++STT_HIPROC
++STT_LOOS
++STT_LOPROC
++STT_NUM
++""".strip().split())
++
++def check_constant_values(cc):
++    """Checks the values of constants against glibcelf."""
++
++    glibcelf_constants = {
++        e.name: e for typ in find_enum_types() for e in typ}
++    elf_h_constants = find_elf_h_constants(cc=cc)
++
++    missing_in_glibcelf = (set(elf_h_constants) - set(glibcelf_constants)
++                           - glibcelf_skipped_constants)
++    for name in sorted(missing_in_glibcelf):
++        error('constant {} is missing from glibcelf'.format(name))
++
++    unexpected_in_glibcelf = \
++        set(glibcelf_constants) & glibcelf_skipped_constants
++    for name in sorted(unexpected_in_glibcelf):
++        error('constant {} is supposed to be filtered from glibcelf'.format(
++            name))
++
++    missing_in_elf_h = set(glibcelf_constants) - set(elf_h_constants)
++    for name in sorted(missing_in_elf_h):
++        error('constant {} is missing from <elf.h>'.format(name))
++
++    expected_in_elf_h = glibcelf_skipped_constants - set(elf_h_constants)
++    for name in expected_in_elf_h:
++        error('filtered constant {} is missing from <elf.h>'.format(name))
++
++    for alias_name, name_in_glibcelf in glibcelf_skipped_aliases:
++        if name_in_glibcelf not in glibcelf_constants:
++            error('alias value {} for {} not in glibcelf'.format(
++                name_in_glibcelf, alias_name))
++        elif (int(elf_h_constants[alias_name])
++              != glibcelf_constants[name_in_glibcelf].value):
++            error('<elf.h> has {}={}, glibcelf has {}={}'.format(
++                alias_name, elf_h_constants[alias_name],
++                name_in_glibcelf, glibcelf_constants[name_in_glibcelf]))
++
++    # Check for value mismatches:
++    for name in sorted(set(glibcelf_constants) & set(elf_h_constants)):
++        glibcelf_value = glibcelf_constants[name].value
++        elf_h_value = int(elf_h_constants[name])
++        # On 32-bit architectures <elf.h> has some constants that are
++        # parsed as signed, while they are unsigned in glibcelf.  So
++        # far, this only affects some flag constants, so special-case
++        # them here.
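++        # Concretely, SHF_EXCLUDE is 1 << 31: glibcelf stores
++        # 2147483648, while a 32-bit <elf.h> may yield -2147483648.
++        # The special case below accepts exactly that pairing:
++        #
++        #   >>> (1 << 31, -(1 << 31))
++        #   (2147483648, -2147483648)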
++ if (glibcelf_value != elf_h_value ++ and not (isinstance(glibcelf_constants[name], enum.IntFlag) ++ and glibcelf_value == 1 << 31 ++ and elf_h_value == -(1 << 31))): ++ error('{}: glibcelf has {!r}, has {!r}'.format( ++ name, glibcelf_value, elf_h_value)) ++ ++def main(): ++ """The main entry point.""" ++ parser = argparse.ArgumentParser( ++ description="Check glibcelf.py and elf.h against each other.") ++ parser.add_argument('--cc', metavar='CC', ++ help='C compiler (including options) to use') ++ args = parser.parse_args() ++ ++ check_duplicates() ++ check_constant_prefixes() ++ check_constant_values(cc=args.cc) ++ ++ if errors_encountered > 0: ++ print("note: errors encountered:", errors_encountered) ++ sys.exit(1) ++ ++if __name__ == '__main__': ++ main() +diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py +new file mode 100644 +index 0000000000000000..8f7d0ca184845714 +--- /dev/null ++++ b/scripts/glibcelf.py +@@ -0,0 +1,1135 @@ ++#!/usr/bin/python3 ++# ELF support functionality for Python. ++# Copyright (C) 2022 Free Software Foundation, Inc. ++# This file is part of the GNU C Library. ++# ++# The GNU C Library is free software; you can redistribute it and/or ++# modify it under the terms of the GNU Lesser General Public ++# License as published by the Free Software Foundation; either ++# version 2.1 of the License, or (at your option) any later version. ++# ++# The GNU C Library is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# Lesser General Public License for more details. ++# ++# You should have received a copy of the GNU Lesser General Public ++# License along with the GNU C Library; if not, see ++# . ++ ++"""Basic ELF parser. ++ ++Use Image.readfile(path) to read an ELF file into memory and begin ++parsing it. ++ ++""" ++ ++import collections ++import enum ++import struct ++ ++class _OpenIntEnum(enum.IntEnum): ++ """Integer enumeration that supports arbitrary int values.""" ++ @classmethod ++ def _missing_(cls, value): ++ # See enum.IntFlag._create_pseudo_member_. This allows ++ # creating of enum constants with arbitrary integer values. ++ pseudo_member = int.__new__(cls, value) ++ pseudo_member._name_ = None ++ pseudo_member._value_ = value ++ return pseudo_member ++ ++ def __repr__(self): ++ name = self._name_ ++ if name is not None: ++ # The names have prefixes like SHT_, implying their type. ++ return name ++ return '{}({})'.format(self.__class__.__name__, self._value_) ++ ++ def __str__(self): ++ name = self._name_ ++ if name is not None: ++ return name ++ return str(self._value_) ++ ++class ElfClass(_OpenIntEnum): ++ """ELF word size. Type of EI_CLASS values.""" ++ ELFCLASSNONE = 0 ++ ELFCLASS32 = 1 ++ ELFCLASS64 = 2 ++ ++class ElfData(_OpenIntEnum): ++ """ELF endianess. Type of EI_DATA values.""" ++ ELFDATANONE = 0 ++ ELFDATA2LSB = 1 ++ ELFDATA2MSB = 2 ++ ++class Machine(_OpenIntEnum): ++ """ELF machine type. 
Type of values in Ehdr.e_machine field.""" ++ EM_NONE = 0 ++ EM_M32 = 1 ++ EM_SPARC = 2 ++ EM_386 = 3 ++ EM_68K = 4 ++ EM_88K = 5 ++ EM_IAMCU = 6 ++ EM_860 = 7 ++ EM_MIPS = 8 ++ EM_S370 = 9 ++ EM_MIPS_RS3_LE = 10 ++ EM_PARISC = 15 ++ EM_VPP500 = 17 ++ EM_SPARC32PLUS = 18 ++ EM_960 = 19 ++ EM_PPC = 20 ++ EM_PPC64 = 21 ++ EM_S390 = 22 ++ EM_SPU = 23 ++ EM_V800 = 36 ++ EM_FR20 = 37 ++ EM_RH32 = 38 ++ EM_RCE = 39 ++ EM_ARM = 40 ++ EM_FAKE_ALPHA = 41 ++ EM_SH = 42 ++ EM_SPARCV9 = 43 ++ EM_TRICORE = 44 ++ EM_ARC = 45 ++ EM_H8_300 = 46 ++ EM_H8_300H = 47 ++ EM_H8S = 48 ++ EM_H8_500 = 49 ++ EM_IA_64 = 50 ++ EM_MIPS_X = 51 ++ EM_COLDFIRE = 52 ++ EM_68HC12 = 53 ++ EM_MMA = 54 ++ EM_PCP = 55 ++ EM_NCPU = 56 ++ EM_NDR1 = 57 ++ EM_STARCORE = 58 ++ EM_ME16 = 59 ++ EM_ST100 = 60 ++ EM_TINYJ = 61 ++ EM_X86_64 = 62 ++ EM_PDSP = 63 ++ EM_PDP10 = 64 ++ EM_PDP11 = 65 ++ EM_FX66 = 66 ++ EM_ST9PLUS = 67 ++ EM_ST7 = 68 ++ EM_68HC16 = 69 ++ EM_68HC11 = 70 ++ EM_68HC08 = 71 ++ EM_68HC05 = 72 ++ EM_SVX = 73 ++ EM_ST19 = 74 ++ EM_VAX = 75 ++ EM_CRIS = 76 ++ EM_JAVELIN = 77 ++ EM_FIREPATH = 78 ++ EM_ZSP = 79 ++ EM_MMIX = 80 ++ EM_HUANY = 81 ++ EM_PRISM = 82 ++ EM_AVR = 83 ++ EM_FR30 = 84 ++ EM_D10V = 85 ++ EM_D30V = 86 ++ EM_V850 = 87 ++ EM_M32R = 88 ++ EM_MN10300 = 89 ++ EM_MN10200 = 90 ++ EM_PJ = 91 ++ EM_OPENRISC = 92 ++ EM_ARC_COMPACT = 93 ++ EM_XTENSA = 94 ++ EM_VIDEOCORE = 95 ++ EM_TMM_GPP = 96 ++ EM_NS32K = 97 ++ EM_TPC = 98 ++ EM_SNP1K = 99 ++ EM_ST200 = 100 ++ EM_IP2K = 101 ++ EM_MAX = 102 ++ EM_CR = 103 ++ EM_F2MC16 = 104 ++ EM_MSP430 = 105 ++ EM_BLACKFIN = 106 ++ EM_SE_C33 = 107 ++ EM_SEP = 108 ++ EM_ARCA = 109 ++ EM_UNICORE = 110 ++ EM_EXCESS = 111 ++ EM_DXP = 112 ++ EM_ALTERA_NIOS2 = 113 ++ EM_CRX = 114 ++ EM_XGATE = 115 ++ EM_C166 = 116 ++ EM_M16C = 117 ++ EM_DSPIC30F = 118 ++ EM_CE = 119 ++ EM_M32C = 120 ++ EM_TSK3000 = 131 ++ EM_RS08 = 132 ++ EM_SHARC = 133 ++ EM_ECOG2 = 134 ++ EM_SCORE7 = 135 ++ EM_DSP24 = 136 ++ EM_VIDEOCORE3 = 137 ++ EM_LATTICEMICO32 = 138 ++ EM_SE_C17 = 139 ++ EM_TI_C6000 = 140 ++ EM_TI_C2000 = 141 ++ EM_TI_C5500 = 142 ++ EM_TI_ARP32 = 143 ++ EM_TI_PRU = 144 ++ EM_MMDSP_PLUS = 160 ++ EM_CYPRESS_M8C = 161 ++ EM_R32C = 162 ++ EM_TRIMEDIA = 163 ++ EM_QDSP6 = 164 ++ EM_8051 = 165 ++ EM_STXP7X = 166 ++ EM_NDS32 = 167 ++ EM_ECOG1X = 168 ++ EM_MAXQ30 = 169 ++ EM_XIMO16 = 170 ++ EM_MANIK = 171 ++ EM_CRAYNV2 = 172 ++ EM_RX = 173 ++ EM_METAG = 174 ++ EM_MCST_ELBRUS = 175 ++ EM_ECOG16 = 176 ++ EM_CR16 = 177 ++ EM_ETPU = 178 ++ EM_SLE9X = 179 ++ EM_L10M = 180 ++ EM_K10M = 181 ++ EM_AARCH64 = 183 ++ EM_AVR32 = 185 ++ EM_STM8 = 186 ++ EM_TILE64 = 187 ++ EM_TILEPRO = 188 ++ EM_MICROBLAZE = 189 ++ EM_CUDA = 190 ++ EM_TILEGX = 191 ++ EM_CLOUDSHIELD = 192 ++ EM_COREA_1ST = 193 ++ EM_COREA_2ND = 194 ++ EM_ARCV2 = 195 ++ EM_OPEN8 = 196 ++ EM_RL78 = 197 ++ EM_VIDEOCORE5 = 198 ++ EM_78KOR = 199 ++ EM_56800EX = 200 ++ EM_BA1 = 201 ++ EM_BA2 = 202 ++ EM_XCORE = 203 ++ EM_MCHP_PIC = 204 ++ EM_INTELGT = 205 ++ EM_KM32 = 210 ++ EM_KMX32 = 211 ++ EM_EMX16 = 212 ++ EM_EMX8 = 213 ++ EM_KVARC = 214 ++ EM_CDP = 215 ++ EM_COGE = 216 ++ EM_COOL = 217 ++ EM_NORC = 218 ++ EM_CSR_KALIMBA = 219 ++ EM_Z80 = 220 ++ EM_VISIUM = 221 ++ EM_FT32 = 222 ++ EM_MOXIE = 223 ++ EM_AMDGPU = 224 ++ EM_RISCV = 243 ++ EM_BPF = 247 ++ EM_CSKY = 252 ++ EM_NUM = 253 ++ EM_ALPHA = 0x9026 ++ ++class Et(_OpenIntEnum): ++ """ELF file type. 
Type of ET_* values and the Ehdr.e_type field.""" ++ ET_NONE = 0 ++ ET_REL = 1 ++ ET_EXEC = 2 ++ ET_DYN = 3 ++ ET_CORE = 4 ++ ++class Shn(_OpenIntEnum): ++ """ELF reserved section indices.""" ++ SHN_UNDEF = 0 ++ SHN_BEFORE = 0xff00 ++ SHN_AFTER = 0xff01 ++ SHN_ABS = 0xfff1 ++ SHN_COMMON = 0xfff2 ++ SHN_XINDEX = 0xffff ++ ++class ShnMIPS(enum.Enum): ++ """Supplemental SHN_* constants for EM_MIPS.""" ++ SHN_MIPS_ACOMMON = 0xff00 ++ SHN_MIPS_TEXT = 0xff01 ++ SHN_MIPS_DATA = 0xff02 ++ SHN_MIPS_SCOMMON = 0xff03 ++ SHN_MIPS_SUNDEFINED = 0xff04 ++ ++class ShnPARISC(enum.Enum): ++ """Supplemental SHN_* constants for EM_PARISC.""" ++ SHN_PARISC_ANSI_COMMON = 0xff00 ++ SHN_PARISC_HUGE_COMMON = 0xff01 ++ ++class Sht(_OpenIntEnum): ++ """ELF section types. Type of SHT_* values.""" ++ SHT_NULL = 0 ++ SHT_PROGBITS = 1 ++ SHT_SYMTAB = 2 ++ SHT_STRTAB = 3 ++ SHT_RELA = 4 ++ SHT_HASH = 5 ++ SHT_DYNAMIC = 6 ++ SHT_NOTE = 7 ++ SHT_NOBITS = 8 ++ SHT_REL = 9 ++ SHT_SHLIB = 10 ++ SHT_DYNSYM = 11 ++ SHT_INIT_ARRAY = 14 ++ SHT_FINI_ARRAY = 15 ++ SHT_PREINIT_ARRAY = 16 ++ SHT_GROUP = 17 ++ SHT_SYMTAB_SHNDX = 18 ++ SHT_GNU_ATTRIBUTES = 0x6ffffff5 ++ SHT_GNU_HASH = 0x6ffffff6 ++ SHT_GNU_LIBLIST = 0x6ffffff7 ++ SHT_CHECKSUM = 0x6ffffff8 ++ SHT_SUNW_move = 0x6ffffffa ++ SHT_SUNW_COMDAT = 0x6ffffffb ++ SHT_SUNW_syminfo = 0x6ffffffc ++ SHT_GNU_verdef = 0x6ffffffd ++ SHT_GNU_verneed = 0x6ffffffe ++ SHT_GNU_versym = 0x6fffffff ++ ++class ShtALPHA(enum.Enum): ++ """Supplemental SHT_* constants for EM_ALPHA.""" ++ SHT_ALPHA_DEBUG = 0x70000001 ++ SHT_ALPHA_REGINFO = 0x70000002 ++ ++class ShtARM(enum.Enum): ++ """Supplemental SHT_* constants for EM_ARM.""" ++ SHT_ARM_EXIDX = 0x70000001 ++ SHT_ARM_PREEMPTMAP = 0x70000002 ++ SHT_ARM_ATTRIBUTES = 0x70000003 ++ ++class ShtCSKY(enum.Enum): ++ """Supplemental SHT_* constants for EM_CSKY.""" ++ SHT_CSKY_ATTRIBUTES = 0x70000001 ++ ++class ShtIA_64(enum.Enum): ++ """Supplemental SHT_* constants for EM_IA_64.""" ++ SHT_IA_64_EXT = 0x70000000 ++ SHT_IA_64_UNWIND = 0x70000001 ++ ++class ShtMIPS(enum.Enum): ++ """Supplemental SHT_* constants for EM_MIPS.""" ++ SHT_MIPS_LIBLIST = 0x70000000 ++ SHT_MIPS_MSYM = 0x70000001 ++ SHT_MIPS_CONFLICT = 0x70000002 ++ SHT_MIPS_GPTAB = 0x70000003 ++ SHT_MIPS_UCODE = 0x70000004 ++ SHT_MIPS_DEBUG = 0x70000005 ++ SHT_MIPS_REGINFO = 0x70000006 ++ SHT_MIPS_PACKAGE = 0x70000007 ++ SHT_MIPS_PACKSYM = 0x70000008 ++ SHT_MIPS_RELD = 0x70000009 ++ SHT_MIPS_IFACE = 0x7000000b ++ SHT_MIPS_CONTENT = 0x7000000c ++ SHT_MIPS_OPTIONS = 0x7000000d ++ SHT_MIPS_SHDR = 0x70000010 ++ SHT_MIPS_FDESC = 0x70000011 ++ SHT_MIPS_EXTSYM = 0x70000012 ++ SHT_MIPS_DENSE = 0x70000013 ++ SHT_MIPS_PDESC = 0x70000014 ++ SHT_MIPS_LOCSYM = 0x70000015 ++ SHT_MIPS_AUXSYM = 0x70000016 ++ SHT_MIPS_OPTSYM = 0x70000017 ++ SHT_MIPS_LOCSTR = 0x70000018 ++ SHT_MIPS_LINE = 0x70000019 ++ SHT_MIPS_RFDESC = 0x7000001a ++ SHT_MIPS_DELTASYM = 0x7000001b ++ SHT_MIPS_DELTAINST = 0x7000001c ++ SHT_MIPS_DELTACLASS = 0x7000001d ++ SHT_MIPS_DWARF = 0x7000001e ++ SHT_MIPS_DELTADECL = 0x7000001f ++ SHT_MIPS_SYMBOL_LIB = 0x70000020 ++ SHT_MIPS_EVENTS = 0x70000021 ++ SHT_MIPS_TRANSLATE = 0x70000022 ++ SHT_MIPS_PIXIE = 0x70000023 ++ SHT_MIPS_XLATE = 0x70000024 ++ SHT_MIPS_XLATE_DEBUG = 0x70000025 ++ SHT_MIPS_WHIRL = 0x70000026 ++ SHT_MIPS_EH_REGION = 0x70000027 ++ SHT_MIPS_XLATE_OLD = 0x70000028 ++ SHT_MIPS_PDR_EXCEPTION = 0x70000029 ++ SHT_MIPS_XHASH = 0x7000002b ++ ++class ShtPARISC(enum.Enum): ++ """Supplemental SHT_* constants for EM_PARISC.""" ++ SHT_PARISC_EXT = 0x70000000 ++ SHT_PARISC_UNWIND = 0x70000001 ++ 
SHT_PARISC_DOC = 0x70000002 ++ ++class Pf(enum.IntFlag): ++ """Program header flags. Type of Phdr.p_flags values.""" ++ PF_X = 1 ++ PF_W = 2 ++ PF_R = 4 ++ ++class PfARM(enum.IntFlag): ++ """Supplemental PF_* flags for EM_ARM.""" ++ PF_ARM_SB = 0x10000000 ++ PF_ARM_PI = 0x20000000 ++ PF_ARM_ABS = 0x40000000 ++ ++class PfPARISC(enum.IntFlag): ++ """Supplemental PF_* flags for EM_PARISC.""" ++ PF_HP_PAGE_SIZE = 0x00100000 ++ PF_HP_FAR_SHARED = 0x00200000 ++ PF_HP_NEAR_SHARED = 0x00400000 ++ PF_HP_CODE = 0x01000000 ++ PF_HP_MODIFY = 0x02000000 ++ PF_HP_LAZYSWAP = 0x04000000 ++ PF_HP_SBP = 0x08000000 ++ ++class PfIA_64(enum.IntFlag): ++ """Supplemental PF_* flags for EM_IA_64.""" ++ PF_IA_64_NORECOV = 0x80000000 ++ ++class PfMIPS(enum.IntFlag): ++ """Supplemental PF_* flags for EM_MIPS.""" ++ PF_MIPS_LOCAL = 0x10000000 ++ ++class Shf(enum.IntFlag): ++ """Section flags. Type of Shdr.sh_type values.""" ++ SHF_WRITE = 1 << 0 ++ SHF_ALLOC = 1 << 1 ++ SHF_EXECINSTR = 1 << 2 ++ SHF_MERGE = 1 << 4 ++ SHF_STRINGS = 1 << 5 ++ SHF_INFO_LINK = 1 << 6 ++ SHF_LINK_ORDER = 1 << 7 ++ SHF_OS_NONCONFORMING = 256 ++ SHF_GROUP = 1 << 9 ++ SHF_TLS = 1 << 10 ++ SHF_COMPRESSED = 1 << 11 ++ SHF_GNU_RETAIN = 1 << 21 ++ SHF_ORDERED = 1 << 30 ++ SHF_EXCLUDE = 1 << 31 ++ ++class ShfALPHA(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_ALPHA.""" ++ SHF_ALPHA_GPREL = 0x10000000 ++ ++class ShfARM(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_ARM.""" ++ SHF_ARM_ENTRYSECT = 0x10000000 ++ SHF_ARM_COMDEF = 0x80000000 ++ ++class ShfIA_64(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_IA_64.""" ++ SHF_IA_64_SHORT = 0x10000000 ++ SHF_IA_64_NORECOV = 0x20000000 ++ ++class ShfMIPS(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_MIPS.""" ++ SHF_MIPS_GPREL = 0x10000000 ++ SHF_MIPS_MERGE = 0x20000000 ++ SHF_MIPS_ADDR = 0x40000000 ++ SHF_MIPS_STRINGS = 0x80000000 ++ SHF_MIPS_NOSTRIP = 0x08000000 ++ SHF_MIPS_LOCAL = 0x04000000 ++ SHF_MIPS_NAMES = 0x02000000 ++ SHF_MIPS_NODUPE = 0x01000000 ++ ++class ShfPARISC(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_PARISC.""" ++ SHF_PARISC_SHORT = 0x20000000 ++ SHF_PARISC_HUGE = 0x40000000 ++ SHF_PARISC_SBP = 0x80000000 ++ ++class Stb(_OpenIntEnum): ++ """ELF symbol binding type.""" ++ STB_LOCAL = 0 ++ STB_GLOBAL = 1 ++ STB_WEAK = 2 ++ STB_GNU_UNIQUE = 10 ++ STB_MIPS_SPLIT_COMMON = 13 ++ ++class Stt(_OpenIntEnum): ++ """ELF symbol type.""" ++ STT_NOTYPE = 0 ++ STT_OBJECT = 1 ++ STT_FUNC = 2 ++ STT_SECTION = 3 ++ STT_FILE = 4 ++ STT_COMMON = 5 ++ STT_TLS = 6 ++ STT_GNU_IFUNC = 10 ++ ++class SttARM(enum.Enum): ++ """Supplemental STT_* constants for EM_ARM.""" ++ STT_ARM_TFUNC = 13 ++ STT_ARM_16BIT = 15 ++ ++class SttPARISC(enum.Enum): ++ """Supplemental STT_* constants for EM_PARISC.""" ++ STT_HP_OPAQUE = 11 ++ STT_HP_STUB = 12 ++ STT_PARISC_MILLICODE = 13 ++ ++class SttSPARC(enum.Enum): ++ """Supplemental STT_* constants for EM_SPARC.""" ++ STT_SPARC_REGISTER = 13 ++ ++class SttX86_64(enum.Enum): ++ """Supplemental STT_* constants for EM_X86_64.""" ++ SHT_X86_64_UNWIND = 0x70000001 ++ ++class Pt(_OpenIntEnum): ++ """ELF program header types. 
Type of Phdr.p_type.""" ++ PT_NULL = 0 ++ PT_LOAD = 1 ++ PT_DYNAMIC = 2 ++ PT_INTERP = 3 ++ PT_NOTE = 4 ++ PT_SHLIB = 5 ++ PT_PHDR = 6 ++ PT_TLS = 7 ++ PT_NUM = 8 ++ PT_GNU_EH_FRAME = 0x6474e550 ++ PT_GNU_STACK = 0x6474e551 ++ PT_GNU_RELRO = 0x6474e552 ++ PT_GNU_PROPERTY = 0x6474e553 ++ PT_SUNWBSS = 0x6ffffffa ++ PT_SUNWSTACK = 0x6ffffffb ++ ++class PtARM(enum.Enum): ++ """Supplemental PT_* constants for EM_ARM.""" ++ PT_ARM_EXIDX = 0x70000001 ++ ++class PtIA_64(enum.Enum): ++ """Supplemental PT_* constants for EM_IA_64.""" ++ PT_IA_64_HP_OPT_ANOT = 0x60000012 ++ PT_IA_64_HP_HSL_ANOT = 0x60000013 ++ PT_IA_64_HP_STACK = 0x60000014 ++ PT_IA_64_ARCHEXT = 0x70000000 ++ PT_IA_64_UNWIND = 0x70000001 ++ ++class PtMIPS(enum.Enum): ++ """Supplemental PT_* constants for EM_MIPS.""" ++ PT_MIPS_REGINFO = 0x70000000 ++ PT_MIPS_RTPROC = 0x70000001 ++ PT_MIPS_OPTIONS = 0x70000002 ++ PT_MIPS_ABIFLAGS = 0x70000003 ++ ++class PtPARISC(enum.Enum): ++ """Supplemental PT_* constants for EM_PARISC.""" ++ PT_HP_TLS = 0x60000000 ++ PT_HP_CORE_NONE = 0x60000001 ++ PT_HP_CORE_VERSION = 0x60000002 ++ PT_HP_CORE_KERNEL = 0x60000003 ++ PT_HP_CORE_COMM = 0x60000004 ++ PT_HP_CORE_PROC = 0x60000005 ++ PT_HP_CORE_LOADABLE = 0x60000006 ++ PT_HP_CORE_STACK = 0x60000007 ++ PT_HP_CORE_SHM = 0x60000008 ++ PT_HP_CORE_MMF = 0x60000009 ++ PT_HP_PARALLEL = 0x60000010 ++ PT_HP_FASTBIND = 0x60000011 ++ PT_HP_OPT_ANNOT = 0x60000012 ++ PT_HP_HSL_ANNOT = 0x60000013 ++ PT_HP_STACK = 0x60000014 ++ PT_PARISC_ARCHEXT = 0x70000000 ++ PT_PARISC_UNWIND = 0x70000001 ++ ++class Dt(_OpenIntEnum): ++ """ELF dynamic segment tags. Type of Dyn.d_val.""" ++ DT_NULL = 0 ++ DT_NEEDED = 1 ++ DT_PLTRELSZ = 2 ++ DT_PLTGOT = 3 ++ DT_HASH = 4 ++ DT_STRTAB = 5 ++ DT_SYMTAB = 6 ++ DT_RELA = 7 ++ DT_RELASZ = 8 ++ DT_RELAENT = 9 ++ DT_STRSZ = 10 ++ DT_SYMENT = 11 ++ DT_INIT = 12 ++ DT_FINI = 13 ++ DT_SONAME = 14 ++ DT_RPATH = 15 ++ DT_SYMBOLIC = 16 ++ DT_REL = 17 ++ DT_RELSZ = 18 ++ DT_RELENT = 19 ++ DT_PLTREL = 20 ++ DT_DEBUG = 21 ++ DT_TEXTREL = 22 ++ DT_JMPREL = 23 ++ DT_BIND_NOW = 24 ++ DT_INIT_ARRAY = 25 ++ DT_FINI_ARRAY = 26 ++ DT_INIT_ARRAYSZ = 27 ++ DT_FINI_ARRAYSZ = 28 ++ DT_RUNPATH = 29 ++ DT_FLAGS = 30 ++ DT_PREINIT_ARRAY = 32 ++ DT_PREINIT_ARRAYSZ = 33 ++ DT_SYMTAB_SHNDX = 34 ++ DT_GNU_PRELINKED = 0x6ffffdf5 ++ DT_GNU_CONFLICTSZ = 0x6ffffdf6 ++ DT_GNU_LIBLISTSZ = 0x6ffffdf7 ++ DT_CHECKSUM = 0x6ffffdf8 ++ DT_PLTPADSZ = 0x6ffffdf9 ++ DT_MOVEENT = 0x6ffffdfa ++ DT_MOVESZ = 0x6ffffdfb ++ DT_FEATURE_1 = 0x6ffffdfc ++ DT_POSFLAG_1 = 0x6ffffdfd ++ DT_SYMINSZ = 0x6ffffdfe ++ DT_SYMINENT = 0x6ffffdff ++ DT_GNU_HASH = 0x6ffffef5 ++ DT_TLSDESC_PLT = 0x6ffffef6 ++ DT_TLSDESC_GOT = 0x6ffffef7 ++ DT_GNU_CONFLICT = 0x6ffffef8 ++ DT_GNU_LIBLIST = 0x6ffffef9 ++ DT_CONFIG = 0x6ffffefa ++ DT_DEPAUDIT = 0x6ffffefb ++ DT_AUDIT = 0x6ffffefc ++ DT_PLTPAD = 0x6ffffefd ++ DT_MOVETAB = 0x6ffffefe ++ DT_SYMINFO = 0x6ffffeff ++ DT_VERSYM = 0x6ffffff0 ++ DT_RELACOUNT = 0x6ffffff9 ++ DT_RELCOUNT = 0x6ffffffa ++ DT_FLAGS_1 = 0x6ffffffb ++ DT_VERDEF = 0x6ffffffc ++ DT_VERDEFNUM = 0x6ffffffd ++ DT_VERNEED = 0x6ffffffe ++ DT_VERNEEDNUM = 0x6fffffff ++ DT_AUXILIARY = 0x7ffffffd ++ DT_FILTER = 0x7fffffff ++ ++class DtAARCH64(enum.Enum): ++ """Supplemental DT_* constants for EM_AARCH64.""" ++ DT_AARCH64_BTI_PLT = 0x70000001 ++ DT_AARCH64_PAC_PLT = 0x70000003 ++ DT_AARCH64_VARIANT_PCS = 0x70000005 ++ ++class DtALPHA(enum.Enum): ++ """Supplemental DT_* constants for EM_ALPHA.""" ++ DT_ALPHA_PLTRO = 0x70000000 ++ ++class DtALTERA_NIOS2(enum.Enum): ++ """Supplemental DT_* constants for 
EM_ALTERA_NIOS2.""" ++ DT_NIOS2_GP = 0x70000002 ++ ++class DtIA_64(enum.Enum): ++ """Supplemental DT_* constants for EM_IA_64.""" ++ DT_IA_64_PLT_RESERVE = 0x70000000 ++ ++class DtMIPS(enum.Enum): ++ """Supplemental DT_* constants for EM_MIPS.""" ++ DT_MIPS_RLD_VERSION = 0x70000001 ++ DT_MIPS_TIME_STAMP = 0x70000002 ++ DT_MIPS_ICHECKSUM = 0x70000003 ++ DT_MIPS_IVERSION = 0x70000004 ++ DT_MIPS_FLAGS = 0x70000005 ++ DT_MIPS_BASE_ADDRESS = 0x70000006 ++ DT_MIPS_MSYM = 0x70000007 ++ DT_MIPS_CONFLICT = 0x70000008 ++ DT_MIPS_LIBLIST = 0x70000009 ++ DT_MIPS_LOCAL_GOTNO = 0x7000000a ++ DT_MIPS_CONFLICTNO = 0x7000000b ++ DT_MIPS_LIBLISTNO = 0x70000010 ++ DT_MIPS_SYMTABNO = 0x70000011 ++ DT_MIPS_UNREFEXTNO = 0x70000012 ++ DT_MIPS_GOTSYM = 0x70000013 ++ DT_MIPS_HIPAGENO = 0x70000014 ++ DT_MIPS_RLD_MAP = 0x70000016 ++ DT_MIPS_DELTA_CLASS = 0x70000017 ++ DT_MIPS_DELTA_CLASS_NO = 0x70000018 ++ DT_MIPS_DELTA_INSTANCE = 0x70000019 ++ DT_MIPS_DELTA_INSTANCE_NO = 0x7000001a ++ DT_MIPS_DELTA_RELOC = 0x7000001b ++ DT_MIPS_DELTA_RELOC_NO = 0x7000001c ++ DT_MIPS_DELTA_SYM = 0x7000001d ++ DT_MIPS_DELTA_SYM_NO = 0x7000001e ++ DT_MIPS_DELTA_CLASSSYM = 0x70000020 ++ DT_MIPS_DELTA_CLASSSYM_NO = 0x70000021 ++ DT_MIPS_CXX_FLAGS = 0x70000022 ++ DT_MIPS_PIXIE_INIT = 0x70000023 ++ DT_MIPS_SYMBOL_LIB = 0x70000024 ++ DT_MIPS_LOCALPAGE_GOTIDX = 0x70000025 ++ DT_MIPS_LOCAL_GOTIDX = 0x70000026 ++ DT_MIPS_HIDDEN_GOTIDX = 0x70000027 ++ DT_MIPS_PROTECTED_GOTIDX = 0x70000028 ++ DT_MIPS_OPTIONS = 0x70000029 ++ DT_MIPS_INTERFACE = 0x7000002a ++ DT_MIPS_DYNSTR_ALIGN = 0x7000002b ++ DT_MIPS_INTERFACE_SIZE = 0x7000002c ++ DT_MIPS_RLD_TEXT_RESOLVE_ADDR = 0x7000002d ++ DT_MIPS_PERF_SUFFIX = 0x7000002e ++ DT_MIPS_COMPACT_SIZE = 0x7000002f ++ DT_MIPS_GP_VALUE = 0x70000030 ++ DT_MIPS_AUX_DYNAMIC = 0x70000031 ++ DT_MIPS_PLTGOT = 0x70000032 ++ DT_MIPS_RWPLT = 0x70000034 ++ DT_MIPS_RLD_MAP_REL = 0x70000035 ++ DT_MIPS_XHASH = 0x70000036 ++ ++class DtPPC(enum.Enum): ++ """Supplemental DT_* constants for EM_PPC.""" ++ DT_PPC_GOT = 0x70000000 ++ DT_PPC_OPT = 0x70000001 ++ ++class DtPPC64(enum.Enum): ++ """Supplemental DT_* constants for EM_PPC64.""" ++ DT_PPC64_GLINK = 0x70000000 ++ DT_PPC64_OPD = 0x70000001 ++ DT_PPC64_OPDSZ = 0x70000002 ++ DT_PPC64_OPT = 0x70000003 ++ ++class DtSPARC(enum.Enum): ++ """Supplemental DT_* constants for EM_SPARC.""" ++ DT_SPARC_REGISTER = 0x70000001 ++ ++class StInfo: ++ """ELF symbol binding and type. Type of the Sym.st_info field.""" ++ def __init__(self, arg0, arg1=None): ++ if isinstance(arg0, int) and arg1 is None: ++ self.bind = Stb(arg0 >> 4) ++ self.type = Stt(arg0 & 15) ++ else: ++ self.bind = Stb(arg0) ++ self.type = Stt(arg1) ++ ++ def value(self): ++ """Returns the raw value for the bind/type combination.""" ++ return (self.bind.value() << 4) | (self.type.value()) ++ ++# Type in an ELF file. Used for deserialization. ++_Layout = collections.namedtuple('_Layout', 'unpack size') ++ ++def _define_layouts(baseclass: type, layout32: str, layout64: str, ++ types=None, fields32=None): ++ """Assign variants dict to baseclass. ++ ++ The variants dict is indexed by (ElfClass, ElfData) pairs, and its ++ values are _Layout instances. ++ ++ """ ++ struct32 = struct.Struct(layout32) ++ struct64 = struct.Struct(layout64) ++ ++ # Check that the struct formats yield the right number of components. 
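++    # For example, the 32-bit Dyn layout '2i' is 8 bytes and unpacks
++    # to two components, matching the two Dyn fields (d_tag, d_val):
++    #
++    #   >>> len(struct.Struct('<2i').unpack(b' ' * 8))
++    #   2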
++ for s in (struct32, struct64): ++ example = s.unpack(b' ' * s.size) ++ if len(example) != len(baseclass._fields): ++ raise ValueError('{!r} yields wrong field count: {} != {}'.format( ++ s.format, len(example), len(baseclass._fields))) ++ ++ # Check that field names in types are correct. ++ if types is None: ++ types = () ++ for n in types: ++ if n not in baseclass._fields: ++ raise ValueError('{} does not have field {!r}'.format( ++ baseclass.__name__, n)) ++ ++ if fields32 is not None \ ++ and set(fields32) != set(baseclass._fields): ++ raise ValueError('{!r} is not a permutation of the fields {!r}'.format( ++ fields32, baseclass._fields)) ++ ++ def unique_name(name, used_names = (set((baseclass.__name__,)) ++ | set(baseclass._fields) ++ | {n.__name__ ++ for n in (types or {}).values()})): ++ """Find a name that is not used for a class or field name.""" ++ candidate = name ++ n = 0 ++ while candidate in used_names: ++ n += 1 ++ candidate = '{}{}'.format(name, n) ++ used_names.add(candidate) ++ return candidate ++ ++ blob_name = unique_name('blob') ++ struct_unpack_name = unique_name('struct_unpack') ++ comps_name = unique_name('comps') ++ ++ layouts = {} ++ for (bits, elfclass, layout, fields) in ( ++ (32, ElfClass.ELFCLASS32, layout32, fields32), ++ (64, ElfClass.ELFCLASS64, layout64, None), ++ ): ++ for (elfdata, structprefix, funcsuffix) in ( ++ (ElfData.ELFDATA2LSB, '<', 'LE'), ++ (ElfData.ELFDATA2MSB, '>', 'BE'), ++ ): ++ env = { ++ baseclass.__name__: baseclass, ++ struct_unpack_name: struct.unpack, ++ } ++ ++ # Add the type converters. ++ if types: ++ for cls in types.values(): ++ env[cls.__name__] = cls ++ ++ funcname = ''.join( ++ ('unpack_', baseclass.__name__, str(bits), funcsuffix)) ++ ++ code = ''' ++def {funcname}({blob_name}): ++'''.format(funcname=funcname, blob_name=blob_name) ++ ++ indent = ' ' * 4 ++ unpack_call = '{}({!r}, {})'.format( ++ struct_unpack_name, structprefix + layout, blob_name) ++ field_names = ', '.join(baseclass._fields) ++ if types is None and fields is None: ++ code += '{}return {}({})\n'.format( ++ indent, baseclass.__name__, unpack_call) ++ else: ++ # Destructuring tuple assignment. ++ if fields is None: ++ code += '{}{} = {}\n'.format( ++ indent, field_names, unpack_call) ++ else: ++ # Use custom field order. ++ code += '{}{} = {}\n'.format( ++ indent, ', '.join(fields), unpack_call) ++ ++ # Perform the type conversions. ++ for n in baseclass._fields: ++ if n in types: ++ code += '{}{} = {}({})\n'.format( ++ indent, n, types[n].__name__, n) ++ # Create the named tuple. ++ code += '{}return {}({})\n'.format( ++ indent, baseclass.__name__, field_names) ++ ++ exec(code, env) ++ layouts[(elfclass, elfdata)] = _Layout( ++ env[funcname], struct.calcsize(layout)) ++ baseclass.layouts = layouts ++ ++ ++# Corresponds to EI_* indices into Elf*_Ehdr.e_indent. ++class Ident(collections.namedtuple('Ident', ++ 'ei_mag ei_class ei_data ei_version ei_osabi ei_abiversion ei_pad')): ++ ++ def __new__(cls, *args): ++ """Construct an object from a blob or its constituent fields.""" ++ if len(args) == 1: ++ return cls.unpack(args[0]) ++ return cls.__base__.__new__(cls, *args) ++ ++ @staticmethod ++ def unpack(blob: memoryview) -> 'Ident': ++ """Parse raws data into a tuple.""" ++ ei_mag, ei_class, ei_data, ei_version, ei_osabi, ei_abiversion, \ ++ ei_pad = struct.unpack('4s5B7s', blob) ++ return Ident(ei_mag, ElfClass(ei_class), ElfData(ei_data), ++ ei_version, ei_osabi, ei_abiversion, ei_pad) ++ size = 16 ++ ++# Corresponds to Elf32_Ehdr and Elf64_Ehdr. 
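++# A hedged usage sketch for the layouts attached below (assuming a
++# little-endian ELFCLASS64 header blob in 'data'):
++#
++#   layout = Ehdr.layouts[(ElfClass.ELFCLASS64, ElfData.ELFDATA2LSB)]
++#   ehdr = layout.unpack(data[:layout.size])   # a 64-byte Elf64_Ehdr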
++Ehdr = collections.namedtuple('Ehdr', ++ 'e_ident e_type e_machine e_version e_entry e_phoff e_shoff e_flags' ++ + ' e_ehsize e_phentsize e_phnum e_shentsize e_shnum e_shstrndx') ++_define_layouts(Ehdr, ++ layout32='16s2H5I6H', ++ layout64='16s2HI3QI6H', ++ types=dict(e_ident=Ident, ++ e_machine=Machine, ++ e_type=Et, ++ e_shstrndx=Shn)) ++ ++# Corresponds to Elf32_Phdr and Elf64_Pdhr. Order follows the latter. ++Phdr = collections.namedtuple('Phdr', ++ 'p_type p_flags p_offset p_vaddr p_paddr p_filesz p_memsz p_align') ++_define_layouts(Phdr, ++ layout32='8I', ++ fields32=('p_type', 'p_offset', 'p_vaddr', 'p_paddr', ++ 'p_filesz', 'p_memsz', 'p_flags', 'p_align'), ++ layout64='2I6Q', ++ types=dict(p_type=Pt, p_flags=Pf)) ++ ++ ++# Corresponds to Elf32_Shdr and Elf64_Shdr. ++class Shdr(collections.namedtuple('Shdr', ++ 'sh_name sh_type sh_flags sh_addr sh_offset sh_size sh_link sh_info' ++ + ' sh_addralign sh_entsize')): ++ def resolve(self, strtab: 'StringTable') -> 'Shdr': ++ """Resolve sh_name using a string table.""" ++ return self.__class__(strtab.get(self[0]), *self[1:]) ++_define_layouts(Shdr, ++ layout32='10I', ++ layout64='2I4Q2I2Q', ++ types=dict(sh_type=Sht, ++ sh_flags=Shf, ++ sh_link=Shn)) ++ ++# Corresponds to Elf32_Dyn and Elf64_Dyn. The nesting through the ++# d_un union is skipped, and d_ptr is missing (its representation in ++# Python would be identical to d_val). ++Dyn = collections.namedtuple('Dyn', 'd_tag d_val') ++_define_layouts(Dyn, ++ layout32='2i', ++ layout64='2q', ++ types=dict(d_tag=Dt)) ++ ++# Corresponds to Elf32_Sym and Elf64_Sym. ++class Sym(collections.namedtuple('Sym', ++ 'st_name st_info st_other st_shndx st_value st_size')): ++ def resolve(self, strtab: 'StringTable') -> 'Sym': ++ """Resolve st_name using a string table.""" ++ return self.__class__(strtab.get(self[0]), *self[1:]) ++_define_layouts(Sym, ++ layout32='3I2BH', ++ layout64='I2BH2Q', ++ fields32=('st_name', 'st_value', 'st_size', 'st_info', ++ 'st_other', 'st_shndx'), ++ types=dict(st_shndx=Shn, ++ st_info=StInfo)) ++ ++# Corresponds to Elf32_Rel and Elf64_Rel. ++Rel = collections.namedtuple('Rel', 'r_offset r_info') ++_define_layouts(Rel, ++ layout32='2I', ++ layout64='2Q') ++ ++# Corresponds to Elf32_Rel and Elf64_Rel. ++Rela = collections.namedtuple('Rela', 'r_offset r_info r_addend') ++_define_layouts(Rela, ++ layout32='3I', ++ layout64='3Q') ++ ++class StringTable: ++ """ELF string table.""" ++ def __init__(self, blob): ++ """Create a new string table backed by the data in the blob. ++ ++ blob: a memoryview-like object ++ ++ """ ++ self.blob = blob ++ ++ def get(self, index) -> bytes: ++ """Returns the null-terminated byte string at the index.""" ++ blob = self.blob ++ endindex = index ++ while True: ++ if blob[endindex] == 0: ++ return bytes(blob[index:endindex]) ++ endindex += 1 ++ ++class Image: ++ """ELF image parser.""" ++ def __init__(self, image): ++ """Create an ELF image from binary image data. ++ ++ image: a memoryview-like object that supports efficient range ++ subscripting. ++ ++ """ ++ self.image = image ++ ident = self.read(Ident, 0) ++ classdata = (ident.ei_class, ident.ei_data) ++ # Set self.Ehdr etc. to the subtypes with the right parsers. 
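++        # With the parsers in place, self.read(self.Ehdr, 0) below
++        # decodes the header in the file's own word size and byte
++        # order.  A sketch of typical use (the path is illustrative):
++        #
++        #   img = Image.readfile('/lib64/libc.so.6')
++        #   img.ehdr.e_machine == Machine.EM_X86_64   # on x86-64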
++ for typ in (Ehdr, Phdr, Shdr, Dyn, Sym, Rel, Rela): ++ setattr(self, typ.__name__, typ.layouts.get(classdata, None)) ++ ++ if self.Ehdr is not None: ++ self.ehdr = self.read(self.Ehdr, 0) ++ self._shdr_num = self._compute_shdr_num() ++ else: ++ self.ehdr = None ++ self._shdr_num = 0 ++ ++ self._section = {} ++ self._stringtab = {} ++ ++ if self._shdr_num > 0: ++ self._shdr_strtab = self._find_shdr_strtab() ++ else: ++ self._shdr_strtab = None ++ ++ @staticmethod ++ def readfile(path: str) -> 'Image': ++ """Reads the ELF file at the specified path.""" ++ with open(path, 'rb') as inp: ++ return Image(memoryview(inp.read())) ++ ++ def _compute_shdr_num(self) -> int: ++ """Computes the actual number of section headers.""" ++ shnum = self.ehdr.e_shnum ++ if shnum == 0: ++ if self.ehdr.e_shoff == 0 or self.ehdr.e_shentsize == 0: ++ # No section headers. ++ return 0 ++ # Otherwise the extension mechanism is used (which may be ++ # needed because e_shnum is just 16 bits). ++ return self.read(self.Shdr, self.ehdr.e_shoff).sh_size ++ return shnum ++ ++ def _find_shdr_strtab(self) -> StringTable: ++ """Finds the section header string table (maybe via extensions).""" ++ shstrndx = self.ehdr.e_shstrndx ++ if shstrndx == Shn.SHN_XINDEX: ++ shstrndx = self.read(self.Shdr, self.ehdr.e_shoff).sh_link ++ return self._find_stringtab(shstrndx) ++ ++ def read(self, typ: type, offset:int ): ++ """Reads an object at a specific offset. ++ ++ The type must have been enhanced using _define_variants. ++ ++ """ ++ return typ.unpack(self.image[offset: offset + typ.size]) ++ ++ def phdrs(self) -> Phdr: ++ """Generator iterating over the program headers.""" ++ if self.ehdr is None: ++ return ++ size = self.ehdr.e_phentsize ++ if size != self.Phdr.size: ++ raise ValueError('Unexpected Phdr size in ELF header: {} != {}' ++ .format(size, self.Phdr.size)) ++ ++ offset = self.ehdr.e_phoff ++ for _ in range(self.ehdr.e_phnum): ++ yield self.read(self.Phdr, offset) ++ offset += size ++ ++ def shdrs(self, resolve: bool=True) -> Shdr: ++ """Generator iterating over the section headers. ++ ++ If resolve, section names are automatically translated ++ using the section header string table. ++ ++ """ ++ if self._shdr_num == 0: ++ return ++ ++ size = self.ehdr.e_shentsize ++ if size != self.Shdr.size: ++ raise ValueError('Unexpected Shdr size in ELF header: {} != {}' ++ .format(size, self.Shdr.size)) ++ ++ offset = self.ehdr.e_shoff ++ for _ in range(self._shdr_num): ++ shdr = self.read(self.Shdr, offset) ++ if resolve: ++ shdr = shdr.resolve(self._shdr_strtab) ++ yield shdr ++ offset += size ++ ++ def dynamic(self) -> Dyn: ++ """Generator iterating over the dynamic segment.""" ++ for phdr in self.phdrs(): ++ if phdr.p_type == Pt.PT_DYNAMIC: ++ # Pick the first dynamic segment, like the loader. ++ if phdr.p_filesz == 0: ++ # Probably separated debuginfo. ++ return ++ offset = phdr.p_offset ++ end = offset + phdr.p_memsz ++ size = self.Dyn.size ++ while True: ++ next_offset = offset + size ++ if next_offset > end: ++ raise ValueError( ++ 'Dynamic segment size {} is not a multiple of Dyn size {}'.format( ++ phdr.p_memsz, size)) ++ yield self.read(self.Dyn, offset) ++ if next_offset == end: ++ return ++ offset = next_offset ++ ++ def syms(self, shdr: Shdr, resolve: bool=True) -> Sym: ++ """A generator iterating over a symbol table. ++ ++ If resolve, symbol names are automatically translated using ++ the string table for the symbol table. 
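++
++        A sketch of typical use, assuming img is an Image instance:
++
++            for shdr in img.shdrs():
++                if shdr.sh_type == Sht.SHT_SYMTAB:
++                    for sym in img.syms(shdr):
++                        print(sym.st_name)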
++ ++ """ ++ assert shdr.sh_type == Sht.SHT_SYMTAB ++ size = shdr.sh_entsize ++ if size != self.Sym.size: ++ raise ValueError('Invalid symbol table entry size {}'.format(size)) ++ offset = shdr.sh_offset ++ end = shdr.sh_offset + shdr.sh_size ++ if resolve: ++ strtab = self._find_stringtab(shdr.sh_link) ++ while offset < end: ++ sym = self.read(self.Sym, offset) ++ if resolve: ++ sym = sym.resolve(strtab) ++ yield sym ++ offset += size ++ if offset != end: ++ raise ValueError('Symbol table is not a multiple of entry size') ++ ++ def lookup_string(self, strtab_index: int, strtab_offset: int) -> bytes: ++ """Looks up a string in a string table identified by its link index.""" ++ try: ++ strtab = self._stringtab[strtab_index] ++ except KeyError: ++ strtab = self._find_stringtab(strtab_index) ++ return strtab.get(strtab_offset) ++ ++ def find_section(self, shndx: Shn) -> Shdr: ++ """Returns the section header for the indexed section. ++ ++ The section name is not resolved. ++ """ ++ try: ++ return self._section[shndx] ++ except KeyError: ++ pass ++ if shndx in Shn: ++ raise ValueError('Reserved section index {}'.format(shndx)) ++ idx = shndx.value ++ if idx < 0 or idx > self._shdr_num: ++ raise ValueError('Section index {} out of range [0, {})'.format( ++ idx, self._shdr_num)) ++ shdr = self.read( ++ self.Shdr, self.ehdr.e_shoff + idx * self.Shdr.size) ++ self._section[shndx] = shdr ++ return shdr ++ ++ def _find_stringtab(self, sh_link: int) -> StringTable: ++ if sh_link in self._stringtab: ++ return self._stringtab ++ if sh_link < 0 or sh_link >= self._shdr_num: ++ raise ValueError('Section index {} out of range [0, {})'.format( ++ sh_link, self._shdr_num)) ++ shdr = self.read( ++ self.Shdr, self.ehdr.e_shoff + sh_link * self.Shdr.size) ++ if shdr.sh_type != Sht.SHT_STRTAB: ++ raise ValueError( ++ 'Section {} is not a string table: {}'.format( ++ sh_link, shdr.sh_type)) ++ strtab = StringTable( ++ self.image[shdr.sh_offset:shdr.sh_offset + shdr.sh_size]) ++ # This could retrain essentially arbitrary amounts of data, ++ # but caching string tables seems important for performance. ++ self._stringtab[sh_link] = strtab ++ return strtab ++ ++ ++__all__ = [name for name in dir() if name[0].isupper()] diff --git a/glibc-upstream-2.34-168.patch b/glibc-upstream-2.34-168.patch new file mode 100644 index 0000000..49e07b7 --- /dev/null +++ b/glibc-upstream-2.34-168.patch @@ -0,0 +1,407 @@ +commit f0c71b34f96c816292c49122d50da3a511b67bf2 +Author: Florian Weimer +Date: Mon Apr 11 11:30:31 2022 +0200 + + Default to --with-default-link=no (bug 25812) + + This is necessary to place the libio vtables into the RELRO segment. + New tests elf/tst-relro-ldso and elf/tst-relro-libc are added to + verify that this is what actually happens. + + The new tests fail on ia64 due to lack of (default) RELRO support + inbutils, so they are XFAILed there. + + (cherry picked from commit 198abcbb94618730dae1b3f4393efaa49e0ec8c7) + +diff --git a/INSTALL b/INSTALL +index d8d4e9f155f56616..60d01568d77645c7 100644 +--- a/INSTALL ++++ b/INSTALL +@@ -90,6 +90,12 @@ if 'CFLAGS' is specified it must enable optimization. For example: + library will still be usable, but functionality may be lost--for + example, you can't build a shared libc with old binutils. + ++'--with-default-link=FLAG' ++ With '--with-default-link=yes', the build system does not use a ++ custom linker script for linking shared objects. The default for ++ FLAG is the opposite, 'no', because the custom linker script is ++ needed for full RELRO protection. 
++ + '--with-nonshared-cflags=CFLAGS' + Use additional compiler flags CFLAGS to build the parts of the + library which are always statically linked into applications and +diff --git a/configure b/configure +index 03f4e59e754b5463..34c64f8de44e3086 100755 +--- a/configure ++++ b/configure +@@ -3373,7 +3373,7 @@ fi + if test "${with_default_link+set}" = set; then : + withval=$with_default_link; use_default_link=$withval + else +- use_default_link=default ++ use_default_link=no + fi + + +@@ -6085,69 +6085,6 @@ fi + $as_echo "$libc_cv_hashstyle" >&6; } + + +-# The linker's default -shared behavior is good enough if it +-# does these things that our custom linker scripts ensure that +-# all allocated NOTE sections come first. +-if test "$use_default_link" = default; then +- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sufficient default -shared layout" >&5 +-$as_echo_n "checking for sufficient default -shared layout... " >&6; } +-if ${libc_cv_use_default_link+:} false; then : +- $as_echo_n "(cached) " >&6 +-else +- libc_cv_use_default_link=no +- cat > conftest.s <<\EOF +- .section .note.a,"a",%note +- .balign 4 +- .long 4,4,9 +- .string "GNU" +- .string "foo" +- .section .note.b,"a",%note +- .balign 4 +- .long 4,4,9 +- .string "GNU" +- .string "bar" +-EOF +- if { ac_try=' ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&5' +- { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 +- (eval $ac_try) 2>&5 +- ac_status=$? +- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 +- test $ac_status = 0; }; } && +- ac_try=`$READELF -S conftest.so | sed -n \ +- '${x;p;} +- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/ +- t a +- b +- : a +- H'` +- then +- libc_seen_a=no libc_seen_b=no +- set -- $ac_try +- while test $# -ge 2 -a "$1" = NOTE; do +- case "$2" in +- .note.a) libc_seen_a=yes ;; +- .note.b) libc_seen_b=yes ;; +- esac +- shift 2 +- done +- case "$libc_seen_a$libc_seen_b" in +- yesyes) +- libc_cv_use_default_link=yes +- ;; +- *) +- echo >&5 "\ +-$libc_seen_a$libc_seen_b from: +-$ac_try" +- ;; +- esac +- fi +- rm -f conftest* +-fi +-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_use_default_link" >&5 +-$as_echo "$libc_cv_use_default_link" >&6; } +- use_default_link=$libc_cv_use_default_link +-fi +- + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GLOB_DAT reloc" >&5 + $as_echo_n "checking for GLOB_DAT reloc... " >&6; } + if ${libc_cv_has_glob_dat+:} false; then : +diff --git a/configure.ac b/configure.ac +index eb9431875fae1b0e..2c69af0807266e7e 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -153,7 +153,7 @@ AC_ARG_WITH([default-link], + AS_HELP_STRING([--with-default-link], + [do not use explicit linker scripts]), + [use_default_link=$withval], +- [use_default_link=default]) ++ [use_default_link=no]) + + dnl Additional build flags injection. + AC_ARG_WITH([nonshared-cflags], +@@ -1378,59 +1378,6 @@ fi + rm -f conftest*]) + AC_SUBST(libc_cv_hashstyle) + +-# The linker's default -shared behavior is good enough if it +-# does these things that our custom linker scripts ensure that +-# all allocated NOTE sections come first. 
+-if test "$use_default_link" = default; then +- AC_CACHE_CHECK([for sufficient default -shared layout], +- libc_cv_use_default_link, [dnl +- libc_cv_use_default_link=no +- cat > conftest.s <<\EOF +- .section .note.a,"a",%note +- .balign 4 +- .long 4,4,9 +- .string "GNU" +- .string "foo" +- .section .note.b,"a",%note +- .balign 4 +- .long 4,4,9 +- .string "GNU" +- .string "bar" +-EOF +- if AC_TRY_COMMAND([dnl +- ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD]) && +- ac_try=`$READELF -S conftest.so | sed -n \ +- ['${x;p;} +- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/ +- t a +- b +- : a +- H']` +- then +- libc_seen_a=no libc_seen_b=no +- set -- $ac_try +- while test $# -ge 2 -a "$1" = NOTE; do +- case "$2" in +- .note.a) libc_seen_a=yes ;; +- .note.b) libc_seen_b=yes ;; +- esac +- shift 2 +- done +- case "$libc_seen_a$libc_seen_b" in +- yesyes) +- libc_cv_use_default_link=yes +- ;; +- *) +- echo >&AS_MESSAGE_LOG_FD "\ +-$libc_seen_a$libc_seen_b from: +-$ac_try" +- ;; +- esac +- fi +- rm -f conftest*]) +- use_default_link=$libc_cv_use_default_link +-fi +- + AC_CACHE_CHECK(for GLOB_DAT reloc, + libc_cv_has_glob_dat, [dnl + cat > conftest.c < $@ 2>&1; $(evaluate-test) ++# The optional symbols are present in libc only if the architecture has ++# the GLIBC_2.0 symbol set in libc. ++$(objpfx)tst-relro-libc.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \ ++ $(common-objpfx)libc.so ++ $(PYTHON) tst-relro-symbols.py $(common-objpfx)libc.so \ ++ --required=_IO_cookie_jumps \ ++ --required=_IO_file_jumps \ ++ --required=_IO_file_jumps_maybe_mmap \ ++ --required=_IO_file_jumps_mmap \ ++ --required=_IO_helper_jumps \ ++ --required=_IO_mem_jumps \ ++ --required=_IO_obstack_jumps \ ++ --required=_IO_proc_jumps \ ++ --required=_IO_str_chk_jumps \ ++ --required=_IO_str_jumps \ ++ --required=_IO_strn_jumps \ ++ --required=_IO_wfile_jumps \ ++ --required=_IO_wfile_jumps_maybe_mmap \ ++ --required=_IO_wfile_jumps_mmap \ ++ --required=_IO_wmem_jumps \ ++ --required=_IO_wstr_jumps \ ++ --required=_IO_wstrn_jumps \ ++ --optional=_IO_old_cookie_jumps \ ++ --optional=_IO_old_file_jumps \ ++ --optional=_IO_old_proc_jumps \ ++ > $@ 2>&1; $(evaluate-test) ++ + tests += $(tests-execstack-$(have-z-execstack)) + ifeq ($(run-built-tests),yes) + tests-special += \ +diff --git a/elf/tst-relro-symbols.py b/elf/tst-relro-symbols.py +new file mode 100644 +index 0000000000000000..368ea3349f86bd81 +--- /dev/null ++++ b/elf/tst-relro-symbols.py +@@ -0,0 +1,137 @@ ++#!/usr/bin/python3 ++# Verify that certain symbols are covered by RELRO. ++# Copyright (C) 2022 Free Software Foundation, Inc. ++# This file is part of the GNU C Library. ++# ++# The GNU C Library is free software; you can redistribute it and/or ++# modify it under the terms of the GNU Lesser General Public ++# License as published by the Free Software Foundation; either ++# version 2.1 of the License, or (at your option) any later version. ++# ++# The GNU C Library is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# Lesser General Public License for more details. ++# ++# You should have received a copy of the GNU Lesser General Public ++# License along with the GNU C Library; if not, see ++# . ++ ++"""Analyze a (shared) object to verify that certain symbols are ++present and covered by the PT_GNU_RELRO segment. 
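++
++A typical invocation, mirroring how elf/Makefile runs this script:
++
++    python3 tst-relro-symbols.py libc.so --required=_IO_file_jumps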
++ ++""" ++ ++import argparse ++import os.path ++import sys ++ ++# Make available glibc Python modules. ++sys.path.append(os.path.join( ++ os.path.dirname(os.path.realpath(__file__)), os.path.pardir, 'scripts')) ++ ++import glibcelf ++ ++def find_relro(path: str, img: glibcelf.Image) -> (int, int): ++ """Discover the address range of the PT_GNU_RELRO segment.""" ++ for phdr in img.phdrs(): ++ if phdr.p_type == glibcelf.Pt.PT_GNU_RELRO: ++ # The computation is not entirely accurate because ++ # _dl_protect_relro in elf/dl-reloc.c rounds both the ++ # start end and downwards using the run-time page size. ++ return phdr.p_vaddr, phdr.p_vaddr + phdr.p_memsz ++ sys.stdout.write('{}: error: no PT_GNU_RELRO segment\n'.format(path)) ++ sys.exit(1) ++ ++def check_in_relro(kind, relro_begin, relro_end, name, start, size, error): ++ """Check if a section or symbol falls within in the RELRO segment.""" ++ end = start + size - 1 ++ if not (relro_begin <= start < end < relro_end): ++ error( ++ '{} {!r} of size {} at 0x{:x} is not in RELRO range [0x{:x}, 0x{:x})'.format( ++ kind, name.decode('UTF-8'), start, size, ++ relro_begin, relro_end)) ++ ++def get_parser(): ++ """Return an argument parser for this script.""" ++ parser = argparse.ArgumentParser(description=__doc__) ++ parser.add_argument('object', help='path to object file to check') ++ parser.add_argument('--required', metavar='NAME', default=(), ++ help='required symbol names', nargs='*') ++ parser.add_argument('--optional', metavar='NAME', default=(), ++ help='required symbol names', nargs='*') ++ return parser ++ ++def main(argv): ++ """The main entry point.""" ++ parser = get_parser() ++ opts = parser.parse_args(argv) ++ img = glibcelf.Image.readfile(opts.object) ++ ++ required_symbols = frozenset([sym.encode('UTF-8') ++ for sym in opts.required]) ++ optional_symbols = frozenset([sym.encode('UTF-8') ++ for sym in opts.optional]) ++ check_symbols = required_symbols | optional_symbols ++ ++ # Tracks the symbols in check_symbols that have been found. ++ symbols_found = set() ++ ++ # Discover the extent of the RELRO segment. ++ relro_begin, relro_end = find_relro(opts.object, img) ++ symbol_table_found = False ++ ++ errors = False ++ def error(msg: str) -> None: ++ """Record an error condition and write a message to standard output.""" ++ nonlocal errors ++ errors = True ++ sys.stdout.write('{}: error: {}\n'.format(opts.object, msg)) ++ ++ # Iterate over section headers to find the symbol table. ++ for shdr in img.shdrs(): ++ if shdr.sh_type == glibcelf.Sht.SHT_SYMTAB: ++ symbol_table_found = True ++ for sym in img.syms(shdr): ++ if sym.st_name in check_symbols: ++ symbols_found.add(sym.st_name) ++ ++ # Validate symbol type, section, and size. 
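++                    # st_info packs binding and type into a single
++                    # byte; glibcelf.StInfo splits it, e.g.:
++                    #
++                    #   >>> glibcelf.StInfo(0x11).type
++                    #   STT_OBJECT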
++ if sym.st_info.type != glibcelf.Stt.STT_OBJECT: ++ error('symbol {!r} has wrong type {}'.format( ++ sym.st_name.decode('UTF-8'), sym.st_info.type)) ++ if sym.st_shndx in glibcelf.Shn: ++ error('symbol {!r} has reserved section {}'.format( ++ sym.st_name.decode('UTF-8'), sym.st_shndx)) ++ continue ++ if sym.st_size == 0: ++ error('symbol {!r} has size zero'.format( ++ sym.st_name.decode('UTF-8'))) ++ continue ++ ++ check_in_relro('symbol', relro_begin, relro_end, ++ sym.st_name, sym.st_value, sym.st_size, ++ error) ++ continue # SHT_SYMTAB ++ if shdr.sh_name == b'.data.rel.ro' \ ++ or shdr.sh_name.startswith(b'.data.rel.ro.'): ++ check_in_relro('section', relro_begin, relro_end, ++ shdr.sh_name, shdr.sh_addr, shdr.sh_size, ++ error) ++ continue ++ ++ if required_symbols - symbols_found: ++ for sym in sorted(required_symbols - symbols_found): ++ error('symbol {!r} not found'.format(sym.decode('UTF-8'))) ++ ++ if errors: ++ sys.exit(1) ++ ++ if not symbol_table_found: ++ sys.stdout.write( ++ '{}: warning: no symbol table found (stripped object)\n'.format( ++ opts.object)) ++ sys.exit(77) ++ ++if __name__ == '__main__': ++ main(sys.argv[1:]) +diff --git a/manual/install.texi b/manual/install.texi +index 816b77a0a25a88a7..36a5af62bc5722b0 100644 +--- a/manual/install.texi ++++ b/manual/install.texi +@@ -117,6 +117,12 @@ problem and suppress these constructs, so that the library will still be + usable, but functionality may be lost---for example, you can't build a + shared libc with old binutils. + ++@item --with-default-link=@var{FLAG} ++With @code{--with-default-link=yes}, the build system does not use a ++custom linker script for linking shared objects. The default for ++@var{FLAG} is the opposite, @samp{no}, because the custom linker script ++is needed for full RELRO protection. ++ + @item --with-nonshared-cflags=@var{cflags} + Use additional compiler flags @var{cflags} to build the parts of the + library which are always statically linked into applications and +diff --git a/sysdeps/unix/sysv/linux/ia64/Makefile b/sysdeps/unix/sysv/linux/ia64/Makefile +index da85ba43e2d0ddef..c5cc41b3677d4a2a 100644 +--- a/sysdeps/unix/sysv/linux/ia64/Makefile ++++ b/sysdeps/unix/sysv/linux/ia64/Makefile +@@ -1,3 +1,9 @@ ++ifeq ($(subdir),elf) ++# ia64 does not support PT_GNU_RELRO. ++test-xfail-tst-relro-ldso = yes ++test-xfail-tst-relro-libc = yes ++endif ++ + ifeq ($(subdir),misc) + sysdep_headers += sys/rse.h + endif diff --git a/glibc-upstream-2.34-169.patch b/glibc-upstream-2.34-169.patch new file mode 100644 index 0000000..63cb452 --- /dev/null +++ b/glibc-upstream-2.34-169.patch @@ -0,0 +1,87 @@ +commit ca0faa140ff8cebe4c041d935f0f5eb480873d99 +Author: Joan Bruguera +Date: Mon Apr 11 19:49:56 2022 +0200 + + misc: Fix rare fortify crash on wchar funcs. [BZ 29030] + + If `__glibc_objsize (__o) == (size_t) -1` (i.e. `__o` is unknown size), fortify + checks should pass, and `__whatever_alias` should be called. + + Previously, `__glibc_objsize (__o) == (size_t) -1` was explicitly checked, but + on commit a643f60c53876b, this was moved into `__glibc_safe_or_unknown_len`. + + A comment says the -1 case should work as: "The -1 check is redundant because + since it implies that __glibc_safe_len_cond is true.". But this fails when: + * `__s > 1` + * `__osz == -1` (i.e. 
unknown size at compile time)
+    * `__l` is big enough
+    * `__l * __s <= __osz` can be folded to a constant
+      (I only found this to be true for `mbsrtowcs` and other functions in wchar2.h)
+
+    In this case `__l * __s <= __osz` is false, and `__whatever_chk_warn` will be
+    called by `__glibc_fortify` or `__glibc_fortify_n` and crash the program.
+
+    This commit adds the explicit `__osz == -1` check again.
+    moc crashes on startup due to this, see: https://bugs.archlinux.org/task/74041
+
+    Minimal test case (test.c):
+    #include <wchar.h>
+
+    int main (void)
+    {
+        const char *hw = "HelloWorld";
+        mbsrtowcs (NULL, &hw, (size_t)-1, NULL);
+        return 0;
+    }
+
+    Build with:
+    gcc -O2 -Wp,-D_FORTIFY_SOURCE=2 test.c -o test && ./test
+
+    Output:
+    *** buffer overflow detected ***: terminated
+
+    Fixes: BZ #29030
+    Signed-off-by: Joan Bruguera
+    Signed-off-by: Siddhesh Poyarekar
+    (cherry picked from commit 33e03f9cd2be4f2cd62f93fda539cc07d9c8130e)
+
+diff --git a/debug/tst-fortify.c b/debug/tst-fortify.c
+index 8b5902423cf0ad88..fb02452f5993c594 100644
+--- a/debug/tst-fortify.c
++++ b/debug/tst-fortify.c
+@@ -1505,6 +1505,11 @@ do_test (void)
+       CHK_FAIL_END
+ #endif
+
++  /* Bug 29030 regresion check */
++  cp = "HelloWorld";
++  if (mbsrtowcs (NULL, &cp, (size_t)-1, &s) != 10)
++    FAIL ();
++
+   cp = "A";
+   if (mbstowcs (wenough, cp, 10) != 1
+       || wcscmp (wenough, L"A") != 0)
+diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
+index 515fb681a0547217..b36013b9a6b4d9c3 100644
+--- a/misc/sys/cdefs.h
++++ b/misc/sys/cdefs.h
+@@ -161,13 +161,13 @@
+      || (__builtin_constant_p (__l) && (__l) > 0))
+
+ /* Length is known to be safe at compile time if the __L * __S <= __OBJSZ
+-   condition can be folded to a constant and if it is true.  The -1 check is
+-   redundant because since it implies that __glibc_safe_len_cond is true.  */
++   condition can be folded to a constant and if it is true, or unknown (-1) */
+ #define __glibc_safe_or_unknown_len(__l, __s, __osz) \
+-  (__glibc_unsigned_or_positive (__l) \
+-   && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
+-                                                   __s, __osz)) \
+-   && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), __s, __osz))
++  ((__osz) == (__SIZE_TYPE__) -1 \
++   || (__glibc_unsigned_or_positive (__l) \
++       && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
++                                                       (__s), (__osz))) \
++       && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), (__s), (__osz))))
+
+ /* Conversely, we know at compile time that the length is unsafe if the
+    __L * __S <= __OBJSZ condition can be folded to a constant and if it is
diff --git a/glibc-upstream-2.34-170.patch b/glibc-upstream-2.34-170.patch
new file mode 100644
index 0000000..11aa68c
--- /dev/null
+++ b/glibc-upstream-2.34-170.patch
@@ -0,0 +1,49 @@
+commit 0d477e92c49db2906b32e44135b98746ccc73c7b
+Author: Florian Weimer
+Date:   Tue Apr 26 14:22:10 2022 +0200
+
+    INSTALL: Rephrase -with-default-link documentation
+
+    Reviewed-by: Carlos O'Donell
+    (cherry picked from commit c935789bdf40ba22b5698da869d3a4789797e09f)
+
+diff --git a/INSTALL b/INSTALL
+index 60d01568d77645c7..10a3dcdc0a8db665 100644
+--- a/INSTALL
++++ b/INSTALL
+@@ -90,10 +90,10 @@ if 'CFLAGS' is specified it must enable optimization.  For example:
+      library will still be usable, but functionality may be lost--for
+      example, you can't build a shared libc with old binutils.
+
+-'--with-default-link=FLAG'
+-     With '--with-default-link=yes', the build system does not use a
+-     custom linker script for linking shared objects.
The default for +- FLAG is the opposite, 'no', because the custom linker script is ++'--with-default-link' ++ With '--with-default-link', the build system does not use a custom ++ linker script for linking shared objects. The default is ++ '--without-default-link', because the custom linker script is + needed for full RELRO protection. + + '--with-nonshared-cflags=CFLAGS' +diff --git a/manual/install.texi b/manual/install.texi +index 36a5af62bc5722b0..8e34ff7e1847f3ae 100644 +--- a/manual/install.texi ++++ b/manual/install.texi +@@ -117,11 +117,11 @@ problem and suppress these constructs, so that the library will still be + usable, but functionality may be lost---for example, you can't build a + shared libc with old binutils. + +-@item --with-default-link=@var{FLAG} +-With @code{--with-default-link=yes}, the build system does not use a +-custom linker script for linking shared objects. The default for +-@var{FLAG} is the opposite, @samp{no}, because the custom linker script +-is needed for full RELRO protection. ++@item --with-default-link ++With @code{--with-default-link}, the build system does not use a custom ++linker script for linking shared objects. The default is ++@code{--without-default-link}, because the custom linker script is ++needed for full RELRO protection. + + @item --with-nonshared-cflags=@var{cflags} + Use additional compiler flags @var{cflags} to build the parts of the diff --git a/glibc-upstream-2.34-171.patch b/glibc-upstream-2.34-171.patch new file mode 100644 index 0000000..04e6898 --- /dev/null +++ b/glibc-upstream-2.34-171.patch @@ -0,0 +1,377 @@ +commit bc56ab1f4aa937665034373d3e320d0779a839aa +Author: Florian Weimer +Date: Tue Apr 26 14:23:02 2022 +0200 + + dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078) + + When audit modules are loaded, ld.so initialization is not yet + complete, and rtld_active () returns false even though ld.so is + mostly working. Instead, the static dlopen hook is used, but that + does not work at all because this is not a static dlopen situation. + + Commit 466c1ea15f461edb8e3ffaf5d86d708876343bbf ("dlfcn: Rework + static dlopen hooks") moved the hook pointer into _rtld_global_ro, + which means that separate protection is not needed anymore and the + hook pointer can be checked directly. + + The guard for disabling libio vtable hardening in _IO_vtable_check + should stay for now. + + Fixes commit 8e1472d2c1e25e6eabc2059170731365f6d5b3d1 ("ld.so: + Examine GLRO to detect inactive loader [BZ #20204]"). 
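
    To make the new dispatch concrete, here is a minimal C sketch of the pattern this patch switches to; the struct layout and the names `dl_dlfcn_hook`, `sketch_dlopen`, and `dlopen_implementation` are simplified illustrative stand-ins, not the real glibc internals:

```c
/* Minimal sketch of the new dispatch pattern; names and types are
   illustrative stand-ins, not actual glibc code.  */
#include <stddef.h>

struct dlfcn_hook
{
  void *(*dlopen) (const char *file, int mode, void *dl_caller);
};

/* Stand-in for GLRO (dl_dlfcn_hook): non-null only in the static
   dlopen case, which is exactly when the hook must be used.  */
static struct dlfcn_hook *dl_dlfcn_hook;

static void *
dlopen_implementation (const char *file, int mode, void *dl_caller)
{
  (void) file; (void) mode; (void) dl_caller;
  return NULL;	/* Placeholder for the real implementation.  */
}

void *
sketch_dlopen (const char *file, int mode)
{
  /* Before the fix: "if (!rtld_active ())", which is also true while
     audit modules are being loaded, so the hook path was taken even
     though no static dlopen hook was installed.  Checking the hook
     pointer directly only fires in the genuine static dlopen case.  */
  if (dl_dlfcn_hook != NULL)
    return dl_dlfcn_hook->dlopen (file, mode,
                                  __builtin_return_address (0));
  return dlopen_implementation (file, mode,
                                __builtin_return_address (0));
}
```

    The point of the change is visible in the condition: `rtld_active ()` is also false during audit-module loading, so the old test wrongly selected the hook path there, whereas the hook pointer itself is only non-null in a genuine static dlopen.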
+ + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 8dcb6d0af07fda3607b541857e4f3970a74ed55b) + +diff --git a/dlfcn/dladdr.c b/dlfcn/dladdr.c +index 1cc305f0c46e7c3b..0d07ae1cd4dbb7a2 100644 +--- a/dlfcn/dladdr.c ++++ b/dlfcn/dladdr.c +@@ -24,7 +24,7 @@ int + __dladdr (const void *address, Dl_info *info) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dladdr (address, info); + #endif + return _dl_addr (address, info, NULL, NULL); +diff --git a/dlfcn/dladdr1.c b/dlfcn/dladdr1.c +index 78560dbac208c316..93ce68c1d6067fe2 100644 +--- a/dlfcn/dladdr1.c ++++ b/dlfcn/dladdr1.c +@@ -24,7 +24,7 @@ int + __dladdr1 (const void *address, Dl_info *info, void **extra, int flags) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dladdr1 (address, info, extra, flags); + #endif + +diff --git a/dlfcn/dlclose.c b/dlfcn/dlclose.c +index 6a013a81bb648191..07ecb21bf7d43be4 100644 +--- a/dlfcn/dlclose.c ++++ b/dlfcn/dlclose.c +@@ -24,7 +24,7 @@ int + __dlclose (void *handle) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlclose (handle); + #endif + +diff --git a/dlfcn/dlerror.c b/dlfcn/dlerror.c +index 5047b140662bc33e..63da79c63000eef0 100644 +--- a/dlfcn/dlerror.c ++++ b/dlfcn/dlerror.c +@@ -32,7 +32,7 @@ char * + __dlerror (void) + { + # ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlerror (); + # endif + +diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c +index c6f9a1da09ff8622..47d2daa96fa5986f 100644 +--- a/dlfcn/dlinfo.c ++++ b/dlfcn/dlinfo.c +@@ -89,7 +89,7 @@ dlinfo_implementation (void *handle, int request, void *arg) + int + ___dlinfo (void *handle, int request, void *arg) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlinfo (handle, request, arg); + else + return dlinfo_implementation (handle, request, arg); +diff --git a/dlfcn/dlmopen.c b/dlfcn/dlmopen.c +index c171c8953da20fc7..2309224eb8484b1a 100644 +--- a/dlfcn/dlmopen.c ++++ b/dlfcn/dlmopen.c +@@ -80,7 +80,7 @@ dlmopen_implementation (Lmid_t nsid, const char *file, int mode, + void * + ___dlmopen (Lmid_t nsid, const char *file, int mode) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlmopen (nsid, file, mode, RETURN_ADDRESS (0)); + else + return dlmopen_implementation (nsid, file, mode, RETURN_ADDRESS (0)); +diff --git a/dlfcn/dlopen.c b/dlfcn/dlopen.c +index e04b374b82b04337..9c59c751c4eaf7a7 100644 +--- a/dlfcn/dlopen.c ++++ b/dlfcn/dlopen.c +@@ -75,7 +75,7 @@ dlopen_implementation (const char *file, int mode, void *dl_caller) + void * + ___dlopen (const char *file, int mode) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0)); + else + return dlopen_implementation (file, mode, RETURN_ADDRESS (0)); +diff --git a/dlfcn/dlopenold.c b/dlfcn/dlopenold.c +index 9115501ac121eeca..c2f2a42194d50953 100644 +--- a/dlfcn/dlopenold.c ++++ b/dlfcn/dlopenold.c +@@ -70,7 +70,7 @@ __dlopen_nocheck (const char *file, int mode) + mode |= RTLD_LAZY; + args.mode = mode; + +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0)); + + return _dlerror_run (dlopen_doit, &args) ? 
NULL : args.new; +diff --git a/dlfcn/dlsym.c b/dlfcn/dlsym.c +index 43044cf7bb95801e..d3861170a7631d01 100644 +--- a/dlfcn/dlsym.c ++++ b/dlfcn/dlsym.c +@@ -62,7 +62,7 @@ dlsym_implementation (void *handle, const char *name, void *dl_caller) + void * + ___dlsym (void *handle, const char *name) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlsym (handle, name, RETURN_ADDRESS (0)); + else + return dlsym_implementation (handle, name, RETURN_ADDRESS (0)); +diff --git a/dlfcn/dlvsym.c b/dlfcn/dlvsym.c +index 9b76f9afa513e11f..3af02109c306b800 100644 +--- a/dlfcn/dlvsym.c ++++ b/dlfcn/dlvsym.c +@@ -65,7 +65,7 @@ dlvsym_implementation (void *handle, const char *name, const char *version, + void * + ___dlvsym (void *handle, const char *name, const char *version) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlvsym (handle, name, version, + RETURN_ADDRESS (0)); + else +diff --git a/elf/Makefile b/elf/Makefile +index fec6e23b5b625e3b..c89a6a58690646ee 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -376,6 +376,7 @@ tests += \ + tst-audit24d \ + tst-audit25a \ + tst-audit25b \ ++ tst-audit26 \ + tst-auditmany \ + tst-auxobj \ + tst-auxobj-dlopen \ +@@ -721,6 +722,7 @@ modules-names = \ + tst-auditmod24c \ + tst-auditmod24d \ + tst-auditmod25 \ ++ tst-auditmod26 \ + tst-auxvalmod \ + tst-big-note-lib \ + tst-deep1mod1 \ +@@ -2194,6 +2196,10 @@ $(objpfx)tst-audit25b: $(objpfx)tst-audit25mod1.so \ + LDFLAGS-tst-audit25b = -Wl,-z,now + tst-audit25b-ARGS = -- $(host-test-program-cmd) + ++$(objpfx)tst-audit26.out: $(objpfx)tst-auditmod26.so ++$(objpfx)tst-auditmod26.so: $(libsupport) ++tst-audit26-ENV = LD_AUDIT=$(objpfx)tst-auditmod26.so ++ + # tst-sonamemove links against an older implementation of the library. + LDFLAGS-tst-sonamemove-linkmod1.so = \ + -Wl,--version-script=tst-sonamemove-linkmod1.map \ +diff --git a/elf/dl-libc.c b/elf/dl-libc.c +index d5bc4a277f4c6ef3..db4342a3256921f0 100644 +--- a/elf/dl-libc.c ++++ b/elf/dl-libc.c +@@ -157,7 +157,7 @@ __libc_dlopen_mode (const char *name, int mode) + args.caller_dlopen = RETURN_ADDRESS (0); + + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->libc_dlopen_mode (name, mode); + #endif + return dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map; +@@ -185,7 +185,7 @@ __libc_dlsym (void *map, const char *name) + args.name = name; + + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->libc_dlsym (map, name); + #endif + return (dlerror_run (do_dlsym, &args) ? NULL +@@ -199,7 +199,7 @@ void * + __libc_dlvsym (void *map, const char *name, const char *version) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->libc_dlvsym (map, name, version); + #endif + +@@ -222,7 +222,7 @@ int + __libc_dlclose (void *map) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->libc_dlclose (map); + #endif + return dlerror_run (do_dlclose, map); +diff --git a/elf/tst-audit26.c b/elf/tst-audit26.c +new file mode 100644 +index 0000000000000000..3f920e83bac247a5 +--- /dev/null ++++ b/elf/tst-audit26.c +@@ -0,0 +1,35 @@ ++/* Check the usability of functions in audit modules. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ /* Check that the audit module has been loaded. */ ++ void *handle = xdlopen ("mapped to libc", RTLD_LOCAL | RTLD_NOW); ++ TEST_VERIFY (handle ++ == xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD)); ++ ++ return 0; ++} ++ ++#include +diff --git a/elf/tst-auditmod26.c b/elf/tst-auditmod26.c +new file mode 100644 +index 0000000000000000..db7ba95abec20f53 +--- /dev/null ++++ b/elf/tst-auditmod26.c +@@ -0,0 +1,104 @@ ++/* Check the usability of functions in audit modules. Audit module. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++unsigned int ++la_version (unsigned int current) ++{ ++ /* Exercise various functions. */ ++ ++ /* Check dlopen, dlsym, dlclose. */ ++ void *handle = xdlopen (LIBM_SO, RTLD_LOCAL | RTLD_NOW); ++ void *ptr = xdlsym (handle, "sincos"); ++ TEST_VERIFY (ptr != NULL); ++ ptr = dlsym (handle, "SINCOS"); ++ TEST_VERIFY (ptr == NULL); ++ const char *message = dlerror (); ++ TEST_VERIFY (strstr (message, ": undefined symbol: SINCOS") != NULL); ++ ptr = dlsym (handle, "SINCOS"); ++ TEST_VERIFY (ptr == NULL); ++ xdlclose (handle); ++ TEST_COMPARE_STRING (dlerror (), NULL); ++ ++ handle = xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD); ++ ++ /* Check dlvsym. _exit is unlikely to gain another symbol ++ version. */ ++ TEST_VERIFY (xdlsym (handle, "_exit") ++ == xdlvsym (handle, "_exit", FIRST_VERSION_libc__exit_STRING)); ++ ++ /* Check dlinfo. */ ++ { ++ void *handle2 = NULL; ++ TEST_COMPARE (dlinfo (handle, RTLD_DI_LINKMAP, &handle2), 0); ++ TEST_VERIFY (handle2 == handle); ++ } ++ ++ /* Check dladdr and dladdr1. */ ++ Dl_info info = { }; ++ TEST_VERIFY (dladdr (&_exit, &info) != 0); ++ if (strcmp (info.dli_sname, "_Exit") != 0) /* _Exit is an alias. 
*/ ++ TEST_COMPARE_STRING (info.dli_sname, "_exit"); ++ TEST_VERIFY (info.dli_saddr == &_exit); ++ TEST_VERIFY (strstr (info.dli_fname, LIBC_SO)); ++ void *extra_info; ++ memset (&info, 0, sizeof (info)); ++ TEST_VERIFY (dladdr1 (&_exit, &info, &extra_info, RTLD_DL_LINKMAP) != 0); ++ TEST_VERIFY (extra_info == handle); ++ ++ /* Verify that dlmopen creates a new namespace. */ ++ void *dlmopen_handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW); ++ TEST_VERIFY (dlmopen_handle != handle); ++ memset (&info, 0, sizeof (info)); ++ extra_info = NULL; ++ ptr = xdlsym (dlmopen_handle, "_exit"); ++ TEST_VERIFY (dladdr1 (ptr, &info, &extra_info, RTLD_DL_LINKMAP) != 0); ++ TEST_VERIFY (extra_info == dlmopen_handle); ++ xdlclose (dlmopen_handle); ++ ++ /* Terminate the process with an error state. This does not happen ++ automatically because the audit module state is not shared with ++ the main program. */ ++ if (support_record_failure_is_failed ()) ++ { ++ fflush (stdout); ++ fflush (stderr); ++ _exit (1); ++ } ++ ++ return LAV_CURRENT; ++} ++ ++char * ++la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) ++{ ++ if (strcmp (name, "mapped to libc") == 0) ++ return (char *) LIBC_SO; ++ else ++ return (char *) name; ++} diff --git a/glibc-upstream-2.34-172.patch b/glibc-upstream-2.34-172.patch new file mode 100644 index 0000000..06dc695 --- /dev/null +++ b/glibc-upstream-2.34-172.patch @@ -0,0 +1,28 @@ +commit 83cc145830bdbefdabe03787ed884d548bea9c99 +Author: Florian Weimer +Date: Fri Apr 22 19:34:52 2022 +0200 + + scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier + + enum.IntFlag and enum.EnumMeta._missing_ support are not part of + earlier Python versions. + + (cherry picked from commit b571f3adffdcbed23f35ea39b0ca43809dbb4f5b) + +diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py +index 8f7d0ca184845714..da0d5380f33a195e 100644 +--- a/scripts/glibcelf.py ++++ b/scripts/glibcelf.py +@@ -28,6 +28,12 @@ import collections + import enum + import struct + ++if not hasattr(enum, 'IntFlag'): ++ import sys ++ sys.stdout.write( ++ 'warning: glibcelf.py needs Python 3.6 for enum support\n') ++ sys.exit(77) ++ + class _OpenIntEnum(enum.IntEnum): + """Integer enumeration that supports arbitrary int values.""" + @classmethod diff --git a/glibc-upstream-2.34-173.patch b/glibc-upstream-2.34-173.patch new file mode 100644 index 0000000..69a92b8 --- /dev/null +++ b/glibc-upstream-2.34-173.patch @@ -0,0 +1,254 @@ +commit 16245986fb9bfe396113fc7dfd1929f69a9e748e +Author: H.J. Lu +Date: Fri Aug 20 06:42:24 2021 -0700 + + x86-64: Optimize load of all bits set into ZMM register [BZ #28252] + + Optimize loads of all bits set into ZMM register in AVX512 SVML codes + by replacing + + vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX + + and + + vmovups .L_2il0floatpacket.13(%rip), %zmmX + + with + vpternlogd $0xff, %zmmX, %zmmX, %zmmX + + This fixes BZ #28252. 
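
    For readers cross-checking the idiom, here is an illustrative sketch (user code, not part of the patch) of the same all-ones trick using the AVX512F intrinsic that maps to `vpternlogd`: immediate `0xff` selects the three-input truth table that is 1 for every input combination, so the destination register becomes all-ones without loading a `.L_2il0floatpacket.*` constant from `.rodata`.

```c
/* Illustrative sketch, assuming GCC with AVX512F support; not part
   of the patch itself.  */
#include <immintrin.h>
#include <stdio.h>

__attribute__ ((target ("avx512f")))
static void
show_all_ones (void)
{
  __m512i x = _mm512_undefined_epi32 ();
  /* Equivalent of: vpternlogd $0xff, %zmmX, %zmmX, %zmmX.  The
     result is all-ones regardless of the register's prior contents,
     so no memory load is needed.  */
  __m512i ones = _mm512_ternarylogic_epi32 (x, x, x, 0xff);
  unsigned long long out[8];
  _mm512_storeu_si512 (out, ones);
  printf ("%llx\n", out[0]);	/* Prints ffffffffffffffff.  */
}

int
main (void)
{
  if (__builtin_cpu_supports ("avx512f"))
    show_all_ones ();
  return 0;
}
```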
+ + (cherry picked from commit 78c9ec9000f873abe7a15a91b87080a2e4308260) + +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +index e68fcdbb16a79f36..58e588a3d42a8bc9 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + vmovaps %zmm0, %zmm8 + + /* Check for large arguments path */ +- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 ++ vpternlogd $0xff, %zmm2, %zmm2, %zmm2 + + /* + ARGUMENT RANGE REDUCTION: +@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_cos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.16: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.16,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +index dfa2acafc486b56b..f5f117d474f66176 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + + /* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 +- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm1 + vpsrlq $32, %zmm4, %zmm6 + + /* reciprocal approximation good to at least 11 bits */ +@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_log_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.12: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.12,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +index be8ab7c6e0e33819..48d251db16ccab9d 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax +- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm14 + vmovups __dAbsMask(%rax), %zmm7 + vmovups __dInvPI(%rax), %zmm2 + vmovups __dRShifter(%rax), %zmm1 +@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_sin_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.14: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.14,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +index 611887082a545854..a4944a4feef6aa98 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos + + /* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm5, %zmm5, %zmm4 +- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + + /* Update Cos result's sign */ + vxorpd %zmm2, %zmm1, %zmm1 +@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl) + ENTRY (_ZGVeN8vvv_sincos_skx) + WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx + END (_ZGVeN8vvv_sincos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.15: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.15,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S 
b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +index f671d60d5dab5a0e..fe8474fed943e8ad 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 +- vmovups .L_2il0floatpacket.13(%rip), %zmm12 ++ vpternlogd $0xff, %zmm12, %zmm12, %zmm12 + vmovups __sRShifter(%rax), %zmm3 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sA9_FMA(%rax), %zmm9 +@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_cosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +index 637bfe3c06ab9ad4..229b7828cde04db2 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + vmovaps %zmm0, %zmm7 + + /* compare against threshold */ +- vmovups .L_2il0floatpacket.13(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + vmovups __sInvLn2(%rax), %zmm4 + vmovups __sShifter(%rax), %zmm1 + vmovups __sLn2hi(%rax), %zmm6 +@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + + #endif + END (_ZGVeN16v_expf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +index 9d790fbf0ad6c8ec..fa2aae986f543582 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax +- vmovups .L_2il0floatpacket.7(%rip), %zmm6 ++ vpternlogd $0xff, %zmm6, %zmm6, %zmm6 + vmovups _iBrkValue(%rax), %zmm4 + vmovups _sPoly_7(%rax), %zmm8 + +@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + + #endif + END (_ZGVeN16v_logf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.7: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.7,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +index c5c43c46ff7af5a3..6aea2a4f11d1f85f 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + vpsrlq $32, %zmm3, %zmm2 + vpmovqd %zmm2, %ymm11 + vcvtps2pd %ymm14, %zmm13 +- vmovups .L_2il0floatpacket.23(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovaps %zmm14, %zmm26 + vpandd _ABSMASK(%rax), %zmm1, %zmm8 + vpcmpd $1, _INF(%rax), %zmm8, %k2 +@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff 
_ZGVdN8vv_powf + vpmovqd %zmm11, %ymm5 + vpxord %zmm10, %zmm10, %zmm10 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} +- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 ++ vpternlogd $0xff, %zmm4, %zmm4, %zmm4 + vpxord %zmm11, %zmm11, %zmm11 + vcvtdq2pd %ymm7, %zmm7 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} +@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + jmp .LBL_2_7 + #endif + END (_ZGVeN16vv_powf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.23: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.23,@object +-.L_2il0floatpacket.24: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.24,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +index 9cf359c86ff9bd70..a446c504f63c9399 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf + + /* Result sign calculations */ + vpternlogd $150, %zmm0, %zmm14, %zmm1 +- vmovups .L_2il0floatpacket.13(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + + /* Add correction term 0.5 for cos() part */ + vaddps %zmm8, %zmm5, %zmm15 +@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl) + ENTRY (_ZGVeN16vvv_sincosf_skx) + WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx + END (_ZGVeN16vvv_sincosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +index bd05109a62181f22..c1b352d0ad1992cd 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + + /* Check for large and special values */ +- vmovups .L_2il0floatpacket.11(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovups __sAbsMask(%rax), %zmm5 + vmovups __sInvPI(%rax), %zmm1 + vmovups __sRShifter(%rax), %zmm2 +@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_sinf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.11: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.11,@object diff --git a/glibc-upstream-2.34-174.patch b/glibc-upstream-2.34-174.patch new file mode 100644 index 0000000..3bf44a8 --- /dev/null +++ b/glibc-upstream-2.34-174.patch @@ -0,0 +1,42 @@ +commit b5a44a6a471aafd3677659a610f32468c40a666b +Author: Noah Goldstein +Date: Tue Sep 21 18:31:49 2021 -0500 + + x86: Modify ENTRY in sysdep.h so that p2align can be specified + + No bug. + + This change adds a new macro ENTRY_P2ALIGN which takes a second + argument, log2 of the desired function alignment. + + The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this + doesn't affect any existing functionality. 
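
    As a quick illustration (plain user code, not glibc internals): the second argument follows the assembler's `.p2align` convention, i.e. it is the log2 of the byte alignment, so `ENTRY_P2ALIGN (name, 4)` keeps the historic 16-byte entry alignment while passing 6 cache-line-aligns an entry to 64 bytes, as the later memcmp patch does with `ENTRY_P2ALIGN (MEMCMP, 6)`.

```c
/* Illustration only: ENTRY_P2ALIGN's second argument is the log2 of
   the byte alignment, matching the assembler's .p2align directive.  */
#include <stdio.h>

static unsigned int
p2align_bytes (unsigned int log2_align)
{
  return 1u << log2_align;
}

int
main (void)
{
  /* ENTRY (name) == ENTRY_P2ALIGN (name, 4): 16-byte alignment.  */
  printf ("p2align 4 -> %u bytes\n", p2align_bytes (4));
  /* Later patches pass 6 to cache-line-align hot entry points.  */
  printf ("p2align 6 -> %u bytes\n", p2align_bytes (6));
  return 0;
}
```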
+ + Signed-off-by: Noah Goldstein + (cherry picked from commit fc5bd179ef3a953dff8d1655bd530d0e230ffe71) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index cac1d762fb3f99d0..937180c1bd791570 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -78,15 +78,18 @@ enum cf_protection_level + #define ASM_SIZE_DIRECTIVE(name) .size name,.-name; + + /* Define an entry point visible from C. */ +-#define ENTRY(name) \ ++#define ENTRY_P2ALIGN(name, alignment) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ +- .align ALIGNARG(4); \ ++ .align ALIGNARG(alignment); \ + C_LABEL(name) \ + cfi_startproc; \ + _CET_ENDBR; \ + CALL_MCOUNT + ++/* Common entry 16 byte aligns. */ ++#define ENTRY(name) ENTRY_P2ALIGN (name, 4) ++ + #undef END + #define END(name) \ + cfi_endproc; \ diff --git a/glibc-upstream-2.34-175.patch b/glibc-upstream-2.34-175.patch new file mode 100644 index 0000000..5ebf0b7 --- /dev/null +++ b/glibc-upstream-2.34-175.patch @@ -0,0 +1,653 @@ +commit 5ec3416853c4150c4d13312e05f93a053586d528 +Author: Noah Goldstein +Date: Tue Sep 21 18:45:03 2021 -0500 + + x86: Optimize memcmp-evex-movbe.S for frontend behavior and size + + No bug. + + The frontend optimizations are to: + 1. Reorganize logically connected basic blocks so they are either in + the same cache line or adjacent cache lines. + 2. Avoid cases when basic blocks unnecissarily cross cache lines. + 3. Try and 32 byte align any basic blocks possible without sacrificing + code size. Smaller / Less hot basic blocks are used for this. + + Overall code size shrunk by 168 bytes. This should make up for any + extra costs due to aligning to 64 bytes. + + In general performance before deviated a great deal dependending on + whether entry alignment % 64 was 0, 16, 32, or 48. These changes + essentially make it so that the current implementation is at least + equal to the best alignment of the original for any arguments. + + The only additional optimization is in the page cross case. Branch on + equals case was removed from the size == [4, 7] case. As well the [4, + 7] and [2, 3] case where swapped as [4, 7] is likely a more hot + argument size. + + test-memcmp and test-wmemcmp are both passing. + + (cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -34,7 +34,24 @@ + area. + 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. + 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. +- 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */ ++ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. ++ ++When possible the implementation tries to optimize for frontend in the ++following ways: ++Throughput: ++ 1. All code sections that fit are able to run optimally out of the ++ LSD. ++ 2. All code sections that fit are able to run optimally out of the ++ DSB ++ 3. Basic blocks are contained in minimum number of fetch blocks ++ necessary. ++ ++Latency: ++ 1. Logically connected basic blocks are put in the same ++ cache-line. ++ 2. Logically connected basic blocks that do not fit in the same ++ cache-line are put in adjacent lines. This can get beneficial ++ L2 spatial prefetching and L1 next-line prefetching. 
*/ + + # include + +@@ -47,9 +64,11 @@ + # ifdef USE_AS_WMEMCMP + # define CHAR_SIZE 4 + # define VPCMP vpcmpd ++# define VPTEST vptestmd + # else + # define CHAR_SIZE 1 + # define VPCMP vpcmpub ++# define VPTEST vptestmb + # endif + + # define VEC_SIZE 32 +@@ -75,7 +94,9 @@ + */ + + .section .text.evex,"ax",@progbits +-ENTRY (MEMCMP) ++/* Cache align memcmp entry. This allows for much more thorough ++ frontend optimization. */ ++ENTRY_P2ALIGN (MEMCMP, 6) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -89,7 +110,7 @@ ENTRY (MEMCMP) + VPCMP $4, (%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to +- L(return_vec_[0,2]). For L(return_vec_3 destination register ++ L(return_vec_[0,2]). For L(return_vec_3) destination register + must be ecx. */ + testl %eax, %eax + jnz L(return_vec_0) +@@ -121,10 +142,6 @@ ENTRY (MEMCMP) + testl %ecx, %ecx + jnz L(return_vec_3) + +- /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so +- compare with zero to get a mask is needed. */ +- vpxorq %XMM0, %XMM0, %XMM0 +- + /* Go to 4x VEC loop. */ + cmpq $(CHAR_PER_VEC * 8), %rdx + ja L(more_8x_vec) +@@ -148,47 +165,61 @@ ENTRY (MEMCMP) + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- /* Or together YMM1, YMM2, and YMM3 into YMM3. */ +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while +- oring with YMM3. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ +- VPCMP $4, %YMM4, %YMM0, %k1 ++ oring with YMM1. Result is stored in YMM4. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ ++ /* Or together YMM2, YMM3, and YMM4 into YMM4. */ ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ ++ /* Test YMM4 against itself. Store any CHAR mismatches in k1. ++ */ ++ VPTEST %YMM4, %YMM4, %k1 ++ /* k1 must go to ecx for L(return_vec_0_1_2_3). */ + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + +- /* NB: aligning 32 here allows for the rest of the jump targets +- to be tuned for 32 byte alignment. Most important this ensures +- the L(more_8x_vec) loop is 32 byte aligned. */ +- .p2align 5 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size = 0 but +- is also faster for size = CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) ++ .p2align 4 ++L(8x_end_return_vec_0_1_2_3): ++ movq %rdx, %rdi ++L(8x_return_vec_0_1_2_3): ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ VPTEST %YMM1, %YMM1, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. */ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) ++ VPTEST %YMM2, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) + +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Create mask in ecx for potentially in bound matches. */ +- bzhil %edx, %eax, %eax +- jnz L(return_vec_0) ++ VPTEST %YMM3, %YMM3, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ /* bsf saves 1 byte from tzcnt. 
This keep L(return_vec_3) in one ++ fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache ++ line. */ ++ bsfl %ecx, %ecx ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif + ret + + .p2align 4 +@@ -209,10 +240,11 @@ L(return_vec_0): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 1 +- which is good enough for a target not in a loop. */ ++ .p2align 4 + L(return_vec_1): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -226,10 +258,11 @@ L(return_vec_1): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 2 +- which is good enough for a target not in a loop. */ ++ .p2align 4,, 10 + L(return_vec_2): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -243,40 +276,6 @@ L(return_vec_2): + # endif + ret + +- .p2align 4 +-L(8x_return_vec_0_1_2_3): +- /* Returning from L(more_8x_vec) requires restoring rsi. */ +- addq %rdi, %rsi +-L(return_vec_0_1_2_3): +- VPCMP $4, %YMM1, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_0) +- +- VPCMP $4, %YMM2, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_1) +- +- VPCMP $4, %YMM3, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_2) +-L(return_vec_3): +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WMEMCMP +- movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax +- xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +- + .p2align 4 + L(more_8x_vec): + /* Set end of s1 in rdx. */ +@@ -288,21 +287,19 @@ L(more_8x_vec): + andq $-VEC_SIZE, %rdi + /* Adjust because first 4x vec where check already. 
*/ + subq $-(VEC_SIZE * 4), %rdi ++ + .p2align 4 + L(loop_4x_vec): + VMOVU (%rsi, %rdi), %YMM1 + vpxorq (%rdi), %YMM1, %YMM1 +- + VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 + vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 +- + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(8x_return_vec_0_1_2_3) +@@ -319,28 +316,25 @@ L(loop_4x_vec): + cmpl $(VEC_SIZE * 2), %edi + jae L(8x_last_2x_vec) + ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 ++ + VMOVU (%rsi, %rdx), %YMM1 + vpxorq (%rdx), %YMM1, %YMM1 + + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 +- +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 +- VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +- /* Restore s1 pointer to rdi. */ +- movq %rdx, %rdi + testl %ecx, %ecx +- jnz L(8x_return_vec_0_1_2_3) ++ jnz L(8x_end_return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + + /* Only entry is from L(more_8x_vec). */ +- .p2align 4 ++ .p2align 4,, 10 + L(8x_last_2x_vec): + VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax +@@ -355,7 +349,31 @@ L(8x_last_1x_vec): + jnz L(8x_return_vec_3) + ret + +- .p2align 4 ++ /* Not ideally aligned (at offset +9 bytes in fetch block) but ++ not aligning keeps it in the same cache line as ++ L(8x_last_1x/2x_vec) so likely worth it. As well, saves code ++ size. */ ++ .p2align 4,, 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ leaq (%rdx, %rax, CHAR_SIZE), %rax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ addq %rdx, %rax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4,, 10 + L(last_2x_vec): + /* Check second to last VEC. */ + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 +@@ -374,26 +392,49 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4 +-L(8x_return_vec_2): +- subq $VEC_SIZE, %rdx +-L(8x_return_vec_3): +- tzcntl %eax, %eax ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ /* Use bsf to save code size. This is necessary to have ++ L(one_or_less) fit in aligning bytes between. */ ++ bsfl %eax, %eax ++ addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- leaq (%rdx, %rax, CHAR_SIZE), %rax +- movl (VEC_SIZE * 3)(%rax), %ecx ++ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- addq %rdx, %rax +- movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx +- movzbl (VEC_SIZE * 3)(%rax), %eax ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + ++ /* NB: L(one_or_less) fits in alignment padding between ++ L(return_vec_1_end) and L(return_vec_0_end). 
*/ ++# ifdef USE_AS_WMEMCMP ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++ ret ++# else ++L(one_or_less): ++ jb L(zero) ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++ ret ++# endif ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(return_vec_0_end): + tzcntl %eax, %eax +@@ -412,23 +453,56 @@ L(return_vec_0_end): + ret + + .p2align 4 +-L(return_vec_1_end): ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size == 0 ++ but is also faster for size == CHAR_SIZE. */ ++ cmpl $1, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMP $4, (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check if any matches where in bounds. Intentionally not ++ storing result in eax to limit dependency chain if it goes to ++ L(return_vec_0_lv). */ ++ bzhil %edx, %eax, %edx ++ jnz L(return_vec_0_lv) ++ xorl %eax, %eax ++ ret ++ ++ /* Essentially duplicate of L(return_vec_0). Ends up not costing ++ any code as shrinks L(less_vec) by allowing 2-byte encoding of ++ the jump and ends up fitting in aligning bytes. As well fits on ++ same cache line as L(less_vec) so also saves a line from having ++ to be fetched on cold calls to memcmp. */ ++ .p2align 4,, 4 ++L(return_vec_0_lv): + tzcntl %eax, %eax +- addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ movl (%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx +- movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + +- + .p2align 4 + L(page_cross_less_vec): + /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +@@ -439,108 +513,84 @@ L(page_cross_less_vec): + cmpl $8, %edx + jae L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) +-L(between_2_3): +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax +- ret +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax ++ jb L(between_2_3) ++ ++ /* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* edx is guranteed to be positive int32 in range [4, 7]. */ ++ cmovne %edx, %eax ++ /* ecx is -1 if rcx > rax. Otherwise 0. */ ++ sbbl %ecx, %ecx ++ /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == ++ rax then eax and ecx are zero. If rax < rax then ecx is -1 so ++ eax doesn't matter. 
*/ ++ orl %ecx, %eax + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_8_15): + # endif + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq (%rdi), %xmm1 ++ vmovq (%rsi), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1 ++ vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax ++ jnz L(return_vec_0_end) + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ +- VMOVU (%rsi), %XMM2 +- VPCMP $4, (%rdi), %XMM2, %k1 ++ ++ /* Use movups to save code size. */ ++ movups (%rsi), %xmm2 ++ VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- +- VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 +- leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi +- VPCMP $4, (%rdi), %XMM2, %k1 ++ movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +-# ifdef USE_AS_WMEMCMP +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax ++ jnz L(return_vec_0_end) + ret +-# else + +- .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++# ifndef USE_AS_WMEMCMP ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax + ret + # endif +- + END (MEMCMP) + #endif diff --git a/glibc-upstream-2.34-176.patch b/glibc-upstream-2.34-176.patch new file mode 100644 index 0000000..74b18ab --- /dev/null +++ b/glibc-upstream-2.34-176.patch @@ -0,0 +1,497 @@ +commit 6d18a93dbbde2958001d65dff3080beed7ae675a +Author: Noah Goldstein +Date: Mon Sep 20 16:20:15 2021 -0500 + + x86: Optimize memset-vec-unaligned-erms.S + + No bug. + + Optimization are + + 1. change control flow for L(more_2x_vec) to fall through to loop and + jump for L(less_4x_vec) and L(less_8x_vec). This uses less code + size and saves jumps for length > 4x VEC_SIZE. + + 2. For EVEX/AVX512 move L(less_vec) closer to entry. + + 3. Avoid complex address mode for length > 2x VEC_SIZE + + 4. Slightly better aligning code for the loop from the perspective of + code size and uops. + + 5. 
Align targets so they make full use of their fetch block and if + possible cache line. + + 6. Try and reduce total number of icache lines that will need to be + pulled in for a given length. + + 7. Include "local" version of stosb target. For AVX2/EVEX/AVX512 + jumping to the stosb target in the sse2 code section will almost + certainly be to a new page. The new version does increase code size + marginally by duplicating the target but should get better iTLB + behavior as a result. + + test-memset, test-wmemset, and test-bzero are all passing. + + Signed-off-by: Noah Goldstein + Reviewed-by: H.J. Lu + (cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 7d4a327eba29ecb4..0137eba4cdd9f830 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -18,13 +18,15 @@ + . */ + + #include ++#define USE_WITH_SSE2 1 + + #define VEC_SIZE 16 ++#define MOV_SIZE 3 ++#define RET_SIZE 1 ++ + #define VEC(i) xmm##i +-/* Don't use movups and movaps since it will get larger nop paddings for +- alignment. */ +-#define VMOVU movdqu +-#define VMOVA movdqa ++#define VMOVU movups ++#define VMOVA movaps + + #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index ae0860f36a47d594..1af668af0aeda59e 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -1,8 +1,14 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX2 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 4 ++# define RET_SIZE 4 ++ + # define VEC(i) ymm##i +-# define VMOVU vmovdqu +-# define VMOVA vmovdqa ++ ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 8ad842fc2f140527..f14d6f8493c21a36 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX512 1 ++ + # define VEC_SIZE 64 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 zmm16 + # define VEC(i) VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 640f092903302ad0..64b09e77cc20cc42 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_EVEX 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 ymm16 + # define VEC(i) VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index ff196844a093dc3b..e723413a664c088f 100644 +--- 
a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -63,8 +63,27 @@ + # endif + #endif + ++#if VEC_SIZE == 64 ++# define LOOP_4X_OFFSET (VEC_SIZE * 4) ++#else ++# define LOOP_4X_OFFSET (0) ++#endif ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++# define END_REG rcx ++# define LOOP_REG rdi ++#else ++# define END_REG rdi ++# define LOOP_REG rdx ++#endif ++ + #define PAGE_SIZE 4096 + ++/* Macro to calculate size of small memset block for aligning ++ purposes. */ ++#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1) ++ ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -74,6 +93,7 @@ + ENTRY (__bzero) + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. */ ++ xorl %esi, %esi + pxor %XMM0, %XMM0 + jmp L(entry_from_bzero) + END (__bzero) +@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + +-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) ++ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(0), (%rdi) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. ++ */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + VZEROUPPER_RETURN +- +- .p2align 4 +-L(stosb_more_2x_vec): +- cmp __x86_rep_stosb_threshold(%rip), %RDX_LP +- ja L(stosb) +-#else +- .p2align 4 + #endif +-L(more_2x_vec): +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(0), VEC_SIZE(%rdi) +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_start) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +-L(return): +-#if VEC_SIZE > 16 +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ .p2align 4,, 10 ++L(last_2x_vec): ++#ifdef USE_LESS_VEC_MASK_STORE ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) + #else +- ret ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) + #endif ++ VZEROUPPER_RETURN + +-L(loop_start): +- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) +- cmpq $(VEC_SIZE * 8), %rdx +- jbe L(loop_end) +- andq $-(VEC_SIZE * 2), %rdi +- subq $-(VEC_SIZE * 4), %rdi +- leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx +- .p2align 4 +-L(loop): +- VMOVA %VEC(0), (%rdi) +- VMOVA %VEC(0), VEC_SIZE(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi) +- subq $-(VEC_SIZE * 4), %rdi +- cmpq %rcx, %rdi +- jb L(loop) +-L(loop_end): +- /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. +- rdx as length is also unchanged. */ +- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) +- VZEROUPPER_SHORT_RETURN +- +- .p2align 4 ++ /* If have AVX512 mask instructions put L(less_vec) close to ++ entry as it doesn't take much space and is likely a hot target. ++ */ ++#ifdef USE_LESS_VEC_MASK_STORE ++ .p2align 4,, 10 + L(less_vec): + /* Less than 1 VEC. 
*/ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! + # endif +-# ifdef USE_LESS_VEC_MASK_STORE + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. Note that we are using rax which is set in +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. +- */ ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ + andl $(PAGE_SIZE - 1), %edi +- /* Check if VEC_SIZE store cross page. Mask stores suffer serious +- performance degradation when it has to fault supress. */ ++ /* Check if VEC_SIZE store cross page. Mask stores suffer ++ serious performance degradation when it has to fault supress. ++ */ + cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ /* This is generally considered a cold target. */ + ja L(cross_page) + # if VEC_SIZE > 32 + movq $-1, %rcx +@@ -247,58 +235,185 @@ L(less_vec): + bzhil %edx, %ecx, %ecx + kmovd %ecx, %k1 + # endif +- vmovdqu8 %VEC(0), (%rax) {%k1} ++ vmovdqu8 %VEC(0), (%rax){%k1} + VZEROUPPER_RETURN + ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* Include L(stosb_local) here if including L(less_vec) between ++ L(stosb_more_2x_vec) and ENTRY. This is to cache align the ++ L(stosb_more_2x_vec) target. */ ++ .p2align 4,, 10 ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN ++# endif ++#endif ++ ++#if defined USE_MULTIARCH && IS_IN (libc) + .p2align 4 +-L(cross_page): ++L(stosb_more_2x_vec): ++ cmp __x86_rep_stosb_threshold(%rip), %RDX_LP ++ ja L(stosb_local) ++#endif ++ /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] ++ and (4x, 8x] jump to target. */ ++L(more_2x_vec): ++ ++ /* Two different methods of setting up pointers / compare. The ++ two methods are based on the fact that EVEX/AVX512 mov ++ instructions take more bytes then AVX2/SSE2 mov instructions. As ++ well that EVEX/AVX512 machines also have fast LEA_BID. Both ++ setup and END_REG to avoid complex address mode. For EVEX/AVX512 ++ this saves code size and keeps a few targets in one fetch block. ++ For AVX2/SSE2 this helps prevent AGU bottlenecks. */ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + ++ LOOP_4X_OFFSET) with LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ ++ /* Stores to first 2x VEC before cmp as any path forward will ++ require it. */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), VEC_SIZE(%rax) ++ ++ ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ ++ addq %rdx, %END_REG ++#endif ++ ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_2x_vec) ++ ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) ++ VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) ++ ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add ++ extra offset to addresses in loop. Used for AVX512 to save space ++ as no way to get (VEC_SIZE * 4) in imm8. */ ++# if LOOP_4X_OFFSET == 0 ++ subq $-(VEC_SIZE * 4), %LOOP_REG + # endif +-# if VEC_SIZE > 32 +- cmpb $32, %dl +- jae L(between_32_63) ++ /* Avoid imm32 compare here to save code size. */ ++ cmpq %rdi, %rcx ++#else ++ addq $-(VEC_SIZE * 4), %END_REG ++ cmpq $(VEC_SIZE * 8), %rdx ++#endif ++ jbe L(last_4x_vec) ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* Set LOOP_REG (rdx). 
*/ ++ leaq (VEC_SIZE * 4)(%rax), %LOOP_REG ++#endif ++ /* Align dst for loop. */ ++ andq $(VEC_SIZE * -2), %LOOP_REG ++ .p2align 4 ++L(loop): ++ VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG) ++ subq $-(VEC_SIZE * 4), %LOOP_REG ++ cmpq %END_REG, %LOOP_REG ++ jb L(loop) ++ .p2align 4,, MOV_SIZE ++L(last_4x_vec): ++ VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG) ++L(return): ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else ++ ret ++#endif ++ ++ .p2align 4,, 10 ++#ifndef USE_LESS_VEC_MASK_STORE ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in ++ range for 2-byte jump encoding. */ ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN + # endif +-# if VEC_SIZE > 16 +- cmpb $16, %dl ++ /* Define L(less_vec) only if not otherwise defined. */ ++ .p2align 4 ++L(less_vec): ++#endif ++L(cross_page): ++#if VEC_SIZE > 32 ++ cmpl $32, %edx ++ jae L(between_32_63) ++#endif ++#if VEC_SIZE > 16 ++ cmpl $16, %edx + jae L(between_16_31) +-# endif +- MOVQ %XMM0, %rcx +- cmpb $8, %dl ++#endif ++ MOVQ %XMM0, %rdi ++ cmpl $8, %edx + jae L(between_8_15) +- cmpb $4, %dl ++ cmpl $4, %edx + jae L(between_4_7) +- cmpb $1, %dl ++ cmpl $1, %edx + ja L(between_2_3) +- jb 1f +- movb %cl, (%rax) +-1: ++ jb L(return) ++ movb %sil, (%rax) + VZEROUPPER_RETURN +-# if VEC_SIZE > 32 ++ ++ /* Align small targets only if not doing so would cross a fetch ++ line. */ ++#if VEC_SIZE > 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, -32(%rax,%rdx) + VMOVU %YMM0, (%rax) ++ VMOVU %YMM0, -32(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +-# if VEC_SIZE > 16 +- /* From 16 to 31. No branch when size == 16. */ ++#endif ++ ++#if VEC_SIZE >= 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + L(between_16_31): +- VMOVU %XMM0, -16(%rax,%rdx) ++ /* From 16 to 31. No branch when size == 16. */ + VMOVU %XMM0, (%rax) ++ VMOVU %XMM0, -16(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +- /* From 8 to 15. No branch when size == 8. */ ++#endif ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_8_15): +- movq %rcx, -8(%rax,%rdx) +- movq %rcx, (%rax) ++ /* From 8 to 15. No branch when size == 8. */ ++ movq %rdi, (%rax) ++ movq %rdi, -8(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %ecx, -4(%rax,%rdx) +- movl %ecx, (%rax) ++ movl %edi, (%rax) ++ movl %edi, -4(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ +- movw %cx, -2(%rax,%rdx) +- movw %cx, (%rax) ++ movw %di, (%rax) ++ movb %dil, -1(%rax, %rdx) + VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/glibc-upstream-2.34-177.patch b/glibc-upstream-2.34-177.patch new file mode 100644 index 0000000..112bcad --- /dev/null +++ b/glibc-upstream-2.34-177.patch @@ -0,0 +1,40 @@ +commit baf3ece63453adac59c5688930324a78ced5b2e4 +Author: Noah Goldstein +Date: Sat Oct 23 01:26:47 2021 -0400 + + x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S + + This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'. + + it could potentially be dangerous to use SSE2 if this function is ever + called without using 'vzeroupper' beforehand. While compilers appear + to use 'vzeroupper' before function calls if AVX2 has been used, using + SSE2 here is more brittle. Since it is not absolutely necessary it + should be avoided. + + It costs 2-extra bytes but the extra bytes should only eat into + alignment padding. + Reviewed-by: H.J. Lu + + (cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 2761b54f2e7dea9f..640f6757fac8a356 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -561,13 +561,13 @@ L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + + /* Use movups to save code size. */ +- movups (%rsi), %xmm2 ++ vmovdqu (%rsi), %xmm2 + VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2 + VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 + addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax diff --git a/glibc-upstream-2.34-178.patch b/glibc-upstream-2.34-178.patch new file mode 100644 index 0000000..1540e2f --- /dev/null +++ b/glibc-upstream-2.34-178.patch @@ -0,0 +1,690 @@ +commit f35ad30da4880a1574996df0674986ecf82fa7ae +Author: H.J. Lu +Date: Fri Oct 29 12:40:20 2021 -0700 + + x86-64: Improve EVEX strcmp with masked load + + In strcmp-evex.S, to compare 2 32-byte strings, replace + + VMOVU (%rdi, %rdx), %YMM0 + VMOVU (%rsi, %rdx), %YMM1 + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + + with + + VMOVU (%rdi, %rdx), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi, %rdx). */ + VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx + incl %ecx + jne L(last_vector) + + It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake + and Ice Lake. + + Co-Authored-By: Noah Goldstein + (cherry picked from commit c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index d5aa6daa46c7ed25..82f12ac89bcae20b 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -41,6 +41,8 @@ + # ifdef USE_AS_WCSCMP + /* Compare packed dwords. 
*/ + # define VPCMP vpcmpd ++# define VPMINU vpminud ++# define VPTESTM vptestmd + # define SHIFT_REG32 r8d + # define SHIFT_REG64 r8 + /* 1 dword char == 4 bytes. */ +@@ -48,6 +50,8 @@ + # else + /* Compare packed bytes. */ + # define VPCMP vpcmpb ++# define VPMINU vpminub ++# define VPTESTM vptestmb + # define SHIFT_REG32 ecx + # define SHIFT_REG64 rcx + /* 1 byte char == 1 byte. */ +@@ -67,6 +71,9 @@ + # define YMM5 ymm22 + # define YMM6 ymm23 + # define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -76,7 +83,7 @@ + /* The main idea of the string comparison (byte or dword) using 256-bit + EVEX instructions consists of comparing (VPCMP) two ymm vectors. The + latter can be on either packed bytes or dwords depending on +- USE_AS_WCSCMP. In order to check the null char, algorithm keeps the ++ USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the + matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 + KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) + are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd +@@ -123,27 +130,21 @@ ENTRY (STRCMP) + jg L(cross_page) + /* Start comparing 4 vectors. */ + VMOVU (%rdi), %YMM0 +- VMOVU (%rsi), %YMM1 + +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 + +- /* Check for NULL in YMM0. */ +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- /* Check for NULL in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi). */ ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} + +- /* Each bit in K1 represents: +- 1. A mismatch in YMM0 and YMM1. Or +- 2. A NULL in YMM0 or YMM1. +- */ +- kord %k0, %k1, %k1 +- +- ktestd %k1, %k1 +- je L(next_3_vectors) + kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(next_3_vectors) + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -172,9 +173,7 @@ L(return): + # endif + ret + +- .p2align 4 + L(return_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -210,9 +209,7 @@ L(return_vec_size): + # endif + ret + +- .p2align 4 + L(return_2_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -248,9 +245,7 @@ L(return_2_vec_size): + # endif + ret + +- .p2align 4 + L(return_3_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -289,43 +284,45 @@ L(return_3_vec_size): + .p2align 4 + L(next_3_vectors): + VMOVU VEC_SIZE(%rdi), %YMM0 +- VMOVU VEC_SIZE(%rsi), %YMM1 +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. 
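The pattern introduced above, one VPTESTM plus one zero-masked VPCMP, is the heart of this patch: the resulting mask has a bit set exactly for lanes that are non-null in the first string and equal in the second, so 'incl %ecx' (or 'subl $0xff, %ecx' in the 8-lane wchar_t case) becomes zero precisely when the whole vector can be skipped. A hedged scalar C model of the byte case (block_has_diff_or_nul is an illustrative name):

#include <stdint.h>

/* Scalar model of VPTESTM + masked VPCMP over 32 byte lanes.  k1
   gets a bit per lane that is both non-null in s1 and equal in s2;
   the all-ones value 0xffffffff (and only that value) wraps to zero
   when incremented, which is what 'incl %ecx; jne ...' tests.  */
static int
block_has_diff_or_nul (const unsigned char *s1, const unsigned char *s2)
{
  uint32_t k1 = 0;
  for (int i = 0; i < 32; i++)
    if (s1[i] != 0 && s1[i] == s2[i])
      k1 |= (uint32_t) 1 << i;
  return (uint32_t) (k1 + 1) != 0;   /* non-zero: inspect this vector */
}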
*/ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_vec_size) + +- VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 +- VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 +- +- /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ +- VPCMP $4, %YMM2, %YMM4, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ ++ VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_2_vec_size) + +- /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ +- VPCMP $4, %YMM3, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM3, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_3_vec_size) + L(main_loop_header): + leaq (VEC_SIZE * 4)(%rdi), %rdx +@@ -375,56 +372,51 @@ L(back_to_loop): + VMOVA VEC_SIZE(%rax), %YMM2 + VMOVA (VEC_SIZE * 2)(%rax), %YMM4 + VMOVA (VEC_SIZE * 3)(%rax), %YMM6 +- VMOVU (%rdx), %YMM1 +- VMOVU VEC_SIZE(%rdx), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 +- VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 +- +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K4 represents a NULL or a mismatch in YMM0 and +- YMM1. */ +- kord %k0, %k1, %k4 +- +- VPCMP $4, %YMM2, %YMM3, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K5 represents a NULL or a mismatch in YMM2 and +- YMM3. */ +- kord %k0, %k1, %k5 +- +- VPCMP $4, %YMM4, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K6 represents a NULL or a mismatch in YMM4 and +- YMM5. */ +- kord %k0, %k1, %k6 +- +- VPCMP $4, %YMM6, %YMM7, %k0 +- VPCMP $0, %YMMZERO, %YMM6, %k1 +- VPCMP $0, %YMMZERO, %YMM7, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K7 represents a NULL or a mismatch in YMM6 and +- YMM7. */ +- kord %k0, %k1, %k7 +- +- kord %k4, %k5, %k0 +- kord %k6, %k7, %k1 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. 
*/ +- kortestd %k0, %k1 +- je L(loop) +- ktestd %k4, %k4 ++ ++ VPMINU %YMM0, %YMM2, %YMM8 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ ++ /* A zero CHAR in YMM8 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM8 ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ VPTESTM %YMM8, %YMM8, %k1 ++ ++ /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */ ++ vpxorq (%rdx), %YMM0, %YMM1 ++ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 ++ vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ ++ vporq %YMM1, %YMM3, %YMM9 ++ vporq %YMM5, %YMM7, %YMM10 ++ ++ /* A non-zero CHAR in YMM9 represents a mismatch. */ ++ vporq %YMM9, %YMM10, %YMM9 ++ ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ ++ VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(loop) ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM0 and (%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_vec) +- kmovd %k4, %edi +- tzcntl %edi, %ecx ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +@@ -466,9 +458,18 @@ L(test_vec): + cmpq $VEC_SIZE, %r11 + jbe L(zero) + # endif +- ktestd %k5, %k5 ++ /* Each bit set in K1 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM2 and VEC_SIZE(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_2_vec) +- kmovd %k5, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -512,9 +513,18 @@ L(test_2_vec): + cmpq $(VEC_SIZE * 2), %r11 + jbe L(zero) + # endif +- ktestd %k6, %k6 ++ /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ VPTESTM %YMM4, %YMM4, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM4 and (VEC_SIZE * 2)(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_3_vec) +- kmovd %k6, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -558,8 +568,18 @@ L(test_3_vec): + cmpq $(VEC_SIZE * 3), %r11 + jbe L(zero) + # endif +- kmovd %k7, %esi +- tzcntl %esi, %ecx ++ /* Each bit set in K1 represents a non-null CHAR in YMM6. */ ++ VPTESTM %YMM6, %YMM6, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM6 and (VEC_SIZE * 3)(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +@@ -615,39 +635,51 @@ L(loop_cross_page): + + VMOVU (%rax, %r10), %YMM2 + VMOVU VEC_SIZE(%rax, %r10), %YMM3 +- VMOVU (%rdx, %r10), %YMM4 +- VMOVU VEC_SIZE(%rdx, %r10), %YMM5 +- +- VPCMP $4, %YMM4, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM2 and +- YMM4. 
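The rewritten main loop above folds four vector compares into two reduction trees: unsigned min keeps any zero (NUL) lane alive, and xor-then-or keeps any mismatching lane alive, so one final masked compare answers both questions at once. A scalar C model of the reduction, assuming byte lanes (quad_block_clean is an illustrative name):

#include <stddef.h>

/* Scalar model of the VPMINU and VPXORQ/VPORQ trees over the
   4 * VEC_SIZE bytes handled per loop iteration.  The loop's back
   edge may be taken only when no lane is NUL and no lane differs.  */
static int
quad_block_clean (const unsigned char *a, const unsigned char *b,
                  size_t vec_size)
{
  unsigned char min_all = 0xff, any_diff = 0;
  for (size_t i = 0; i < 4 * vec_size; i++)
    {
      if (a[i] < min_all)        /* VPMINU tree: a zero survives */
        min_all = a[i];
      any_diff |= a[i] ^ b[i];   /* VPXORQ/VPORQ tree: a diff survives */
    }
  return min_all != 0 && any_diff == 0;   /* 1: take 'je L(loop)' */
}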
*/ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM5, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM3, %k4 +- VPCMP $0, %YMMZERO, %YMM5, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM3 and +- YMM5. */ +- kord %k3, %k4, %k3 ++ ++ /* Each bit set in K2 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM2 and 32 bytes at (%rdx, %r10). */ ++ VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. */ ++ notl %r9d ++# ifdef USE_AS_WCSCMP ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ /* Each bit set in K4 represents a non-null CHAR in YMM3. */ ++ VPTESTM %YMM3, %YMM3, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ ++ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. */ ++ notl %edi ++ andl $0xff, %edi ++# else ++ incl %edi ++# endif + + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG32 + sarl $2, %SHIFT_REG32 ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. */ ++ orl %r9d, %edi + # else +- kshiftlq $32, %k3, %k2 +-# endif ++ salq $32, %rdi + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ + shrxq %SHIFT_REG64, %rdi, %rdi +@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec): + /* The first VEC_SIZE * 2 bytes match or are ignored. */ + VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 + VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 +- VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 +- +- VPCMP $4, %YMM0, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM2, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM0 and +- YMM2. */ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM1, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM1, %k4 +- VPCMP $0, %YMMZERO, %YMM3, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM1 and +- YMM3. */ +- kord %k3, %k4, %k3 + ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. */ ++ notl %r9d + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ VPTESTM %YMM1, %YMM1, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. 
*/ ++ notl %edi ++ andl $0xff, %edi + # else +- kshiftlq $32, %k3, %k2 ++ incl %edi + # endif + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. */ ++ orl %r9d, %edi ++# else ++ salq $32, %rdi ++ ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + xorl %r8d, %r8d + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec): + /* R8 has number of bytes skipped. */ + movl %ecx, %r8d + # ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ /* NB: Divide shift count by 4 since each bit in RDI represent 4 + bytes. */ + sarl $2, %ecx +-# endif ++ /* Skip ECX bytes. */ ++ shrl %cl, %edi ++# else + /* Skip ECX bytes. */ + shrq %cl, %rdi ++# endif + 1: + /* Before jumping back to the loop, set ESI to the number of + VEC_SIZE * 4 blocks before page crossing. */ +@@ -818,7 +863,7 @@ L(cross_page_loop): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + # endif +- /* Check null char. */ ++ /* Check null CHAR. */ + testl %eax, %eax + jne L(cross_page_loop) + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +@@ -901,18 +946,17 @@ L(cross_page): + jg L(cross_page_1_vector) + L(loop_1_vector): + VMOVU (%rdi, %rdx), %YMM0 +- VMOVU (%rsi, %rdx), %YMM1 +- +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi, %rdx). */ ++ VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx +- testl %ecx, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(last_vector) + + addl $VEC_SIZE, %edx +@@ -931,18 +975,17 @@ L(cross_page_1_vector): + cmpl $(PAGE_SIZE - 16), %eax + jg L(cross_page_1_xmm) + VMOVU (%rdi, %rdx), %XMM0 +- VMOVU (%rsi, %rdx), %XMM1 +- +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- korw %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korw %k0, %k1, %k1 +- kmovw %k1, %ecx +- testl %ecx, %ecx ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and 16 bytes at (%rsi, %rdx). */ ++ VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xf, %ecx ++# else ++ subl $0xffff, %ecx ++# endif + jne L(last_vector) + + addl $16, %edx +@@ -965,25 +1008,16 @@ L(cross_page_1_xmm): + vmovq (%rdi, %rdx), %XMM0 + vmovq (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. 
*/ +- kord %k0, %k1, %k1 +- kmovd %k1, %ecx +- ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ kmovb %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* Only last 2 bits are valid. */ +- andl $0x3, %ecx ++ subl $0x3, %ecx + # else +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx ++ subl $0xff, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $8, %edx +@@ -1002,25 +1036,16 @@ L(cross_page_8bytes): + vmovd (%rdi, %rdx), %XMM0 + vmovd (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} + kmovd %k1, %ecx +- + # ifdef USE_AS_WCSCMP +- /* Only the last bit is valid. */ +- andl $0x1, %ecx ++ subl $0x1, %ecx + # else +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx ++ subl $0xf, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $4, %edx diff --git a/glibc-upstream-2.34-179.patch b/glibc-upstream-2.34-179.patch new file mode 100644 index 0000000..e9a4329 --- /dev/null +++ b/glibc-upstream-2.34-179.patch @@ -0,0 +1,85 @@ +commit a182bb7a3922404f79def09d79ef89678b4049f0 +Author: H.J. Lu +Date: Fri Oct 29 12:56:53 2021 -0700 + + x86-64: Remove Prefer_AVX2_STRCMP + + Remove Prefer_AVX2_STRCMP to enable EVEX strcmp. When comparing 2 32-byte + strings, EVEX strcmp has been improved to require 1 load, 1 VPTESTM, 1 + VPCMP, 1 KMOVD and 1 INCL instead of 2 loads, 3 VPCMPs, 2 KORDs, 1 KMOVD + and 1 TESTL while AVX2 strcmp requires 1 load, 2 VPCMPEQs, 1 VPMINU, 1 + VPMOVMSKB and 1 TESTL. EVEX strcmp is now faster than AVX2 strcmp by up + to 40% on Tiger Lake and Ice Lake. + + (cherry picked from commit 14dbbf46a007ae5df36646b51ad0c9e5f5259f30) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index de4e3c3b7258120d..f4d4049e391cbabd 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -574,14 +574,6 @@ disable_tsx: + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] + |= bit_arch_Prefer_No_VZEROUPPER; +- +- /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp +- requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp +- requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB, +- AVX2 strcmp is faster than EVEX strcmp. */ +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) +- cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP] +- |= bit_arch_Prefer_AVX2_STRCMP; + } + + /* Avoid avoid short distance REP MOVSB on processor with FSRM. 
*/ +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 58f2fad4323d5d91..957db3ad229ba39f 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -239,8 +239,6 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Fast_Copy_Backward, + disable, 18); +- CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH +- (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18); + } + break; + case 19: +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index 3bdc76cf71007948..8250bfcbecd29a9f 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -31,5 +31,4 @@ BIT (Prefer_ERMS) + BIT (Prefer_No_AVX512) + BIT (MathVec_Prefer_No_AVX512) + BIT (Prefer_FSRM) +-BIT (Prefer_AVX2_STRCMP) + BIT (Avoid_Short_Distance_REP_MOVSB) +diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c +index 62b7abeeee646ab4..7c2901bf44456259 100644 +--- a/sysdeps/x86_64/multiarch/strcmp.c ++++ b/sysdeps/x86_64/multiarch/strcmp.c +@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c +index 60ba0fe356b31779..f94a421784bfe923 100644 +--- a/sysdeps/x86_64/multiarch/strncmp.c ++++ b/sysdeps/x86_64/multiarch/strncmp.c +@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) diff --git a/glibc-upstream-2.34-180.patch b/glibc-upstream-2.34-180.patch new file mode 100644 index 0000000..9707cf2 --- /dev/null +++ b/glibc-upstream-2.34-180.patch @@ -0,0 +1,48 @@ +commit 2e64237a8744dd50f9222293275fa52e7248ff76 +Author: Fangrui Song +Date: Tue Nov 2 20:59:52 2021 -0700 + + x86-64: Replace movzx with movzbl + + Clang cannot assemble movzx in the AT&T dialect mode. + + ../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction + movzx (%rsi), %ecx + ^~~~ + + Change movzx to movzbl, which follows the AT&T dialect and is used + elsewhere in the file. + + Reviewed-by: H.J. 
Lu + (cherry picked from commit 6720d36b6623c5e48c070d86acf61198b33e144e) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index bc19547b09639071..6197a723b9e0606e 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz): + .p2align 4 + // XXX Same as code above + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index 824e648230a15739..7f8a1bc756f86aee 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz): + + .p2align 4 + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx diff --git a/glibc-upstream-2.34-181.patch b/glibc-upstream-2.34-181.patch new file mode 100644 index 0000000..36a401f --- /dev/null +++ b/glibc-upstream-2.34-181.patch @@ -0,0 +1,843 @@ +commit a7392db2ff2b9dd906500941ac6361dbe2211b0d +Author: Noah Goldstein +Date: Mon Nov 1 00:49:51 2021 -0500 + + x86: Optimize memmove-vec-unaligned-erms.S + + No bug. + + The optimizations are as follows: + + 1) Always align entry to 64 bytes. This makes behavior more + predictable and makes other frontend optimizations easier. + + 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have + significant benefits in the case that: + 0 < (dst - src) < [256, 512] + + 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%] + improvement and for FSRM [-10%, 25%]. + + In addition to these primary changes there is general cleanup + throughout to optimize the aligning routines and control flow logic. + + Signed-off-by: Noah Goldstein + Reviewed-by: H.J. Lu + (cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6) + +diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S +index db106a7a1f23f268..b2b318084823dceb 100644 +--- a/sysdeps/x86_64/memmove.S ++++ b/sysdeps/x86_64/memmove.S +@@ -25,7 +25,7 @@ + /* Use movups and movaps for smaller code sizes. 
*/ + #define VMOVU movups + #define VMOVA movaps +- ++#define MOV_SIZE 3 + #define SECTION(p) p + + #ifdef USE_MULTIARCH +diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +index 1ec1962e861dbf63..67a55f0c85af841c 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +@@ -4,7 +4,7 @@ + # define VMOVNT vmovntdq + # define VMOVU vmovdqu + # define VMOVA vmovdqa +- ++# define MOV_SIZE 4 + # define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S +index e195e93f153c9512..975ae6c0515b83cb 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S +@@ -4,7 +4,7 @@ + # define VMOVNT vmovntdq + # define VMOVU vmovdqu + # define VMOVA vmovdqa +- ++# define MOV_SIZE 4 + # define SECTION(p) p##.avx + # define MEMMOVE_SYMBOL(p,s) p##_avx_##s + +diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +index 848848ab39ff9326..0fa7126830af7acb 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +@@ -25,7 +25,7 @@ + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 + # define VZEROUPPER +- ++# define MOV_SIZE 6 + # define SECTION(p) p##.evex512 + # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + +diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +index 0cbce8f944da51a0..88715441feaaccf5 100644 +--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +@@ -25,7 +25,7 @@ + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 + # define VZEROUPPER +- ++# define MOV_SIZE 6 + # define SECTION(p) p##.evex + # define MEMMOVE_SYMBOL(p,s) p##_evex_##s + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index abde8438d41f2320..7b27cbdda5fb99f7 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -76,6 +76,25 @@ + # endif + #endif + ++/* Whether to align before movsb. Ultimately we want 64 byte ++ align and not worth it to load 4x VEC for VEC_SIZE == 16. */ ++#define ALIGN_MOVSB (VEC_SIZE > 16) ++/* Number of bytes to align movsb to. */ ++#define MOVSB_ALIGN_TO 64 ++ ++#define SMALL_MOV_SIZE (MOV_SIZE <= 4) ++#define LARGE_MOV_SIZE (MOV_SIZE > 4) ++ ++#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1 ++# error MOV_SIZE Unknown ++#endif ++ ++#if LARGE_MOV_SIZE ++# define SMALL_SIZE_OFFSET (4) ++#else ++# define SMALL_SIZE_OFFSET (0) ++#endif ++ + #ifndef PAGE_SIZE + # define PAGE_SIZE 4096 + #endif +@@ -199,25 +218,21 @@ L(start): + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ /* Load regardless. */ ++ VMOVU (%rsi), %VEC(0) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) +-#if !defined USE_MULTIARCH || !IS_IN (libc) +-L(last_2x_vec): +-#endif + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ +- VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +-#if !defined USE_MULTIARCH || !IS_IN (libc) +-L(nop): +- ret ++#if !(defined USE_MULTIARCH && IS_IN (libc)) ++ ZERO_UPPER_VEC_REGISTERS_RETURN + #else + VZEROUPPER_RETURN + #endif + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMMOVE_SYMBOL (__memmove, unaligned)) +- + # if VEC_SIZE == 16 + ENTRY (__mempcpy_chk_erms) + cmp %RDX_LP, %RCX_LP +@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + # endif + +-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) ++ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6) + movq %rdi, %rax + L(start_erms): + # ifdef __ILP32__ +@@ -298,310 +313,448 @@ L(start_erms): + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ /* Load regardless. */ ++ VMOVU (%rsi), %VEC(0) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(movsb_more_2x_vec) +-L(last_2x_vec): +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU (%rsi), %VEC(0) +- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. ++ */ ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) ++ VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx) + L(return): +-#if VEC_SIZE > 16 ++# if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +-#else ++# else + ret ++# endif + #endif + +-L(movsb): +- cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP +- jae L(more_8x_vec) +- cmpq %rsi, %rdi +- jb 1f +- /* Source == destination is less common. */ +- je L(nop) +- leaq (%rsi,%rdx), %r9 +- cmpq %r9, %rdi +- /* Avoid slow backward REP MOVSB. */ +- jb L(more_8x_vec_backward) +-# if AVOID_SHORT_DISTANCE_REP_MOVSB +- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) +- jz 3f +- movq %rdi, %rcx +- subq %rsi, %rcx +- jmp 2f +-# endif +-1: +-# if AVOID_SHORT_DISTANCE_REP_MOVSB +- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) +- jz 3f +- movq %rsi, %rcx +- subq %rdi, %rcx +-2: +-/* Avoid "rep movsb" if RCX, the distance between source and destination, +- is N*4GB + [1..63] with N >= 0. */ +- cmpl $63, %ecx +- jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ +-3: +-# endif +- mov %RDX_LP, %RCX_LP +- rep movsb +-L(nop): ++#if LARGE_MOV_SIZE ++ /* If LARGE_MOV_SIZE this fits in the aligning bytes between the ++ ENTRY block and L(less_vec). */ ++ .p2align 4,, 8 ++L(between_4_7): ++ /* From 4 to 7. No branch when size == 4. */ ++ movl (%rsi), %ecx ++ movl (%rsi, %rdx), %esi ++ movl %ecx, (%rdi) ++ movl %esi, (%rdi, %rdx) + ret + #endif + ++ .p2align 4 + L(less_vec): + /* Less than 1 VEC. */ + #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! 
+ #endif + #if VEC_SIZE > 32 +- cmpb $32, %dl ++ cmpl $32, %edx + jae L(between_32_63) + #endif + #if VEC_SIZE > 16 +- cmpb $16, %dl ++ cmpl $16, %edx + jae L(between_16_31) + #endif +- cmpb $8, %dl ++ cmpl $8, %edx + jae L(between_8_15) +- cmpb $4, %dl ++#if SMALL_MOV_SIZE ++ cmpl $4, %edx ++#else ++ subq $4, %rdx ++#endif + jae L(between_4_7) +- cmpb $1, %dl +- ja L(between_2_3) +- jb 1f +- movzbl (%rsi), %ecx ++ cmpl $(1 - SMALL_SIZE_OFFSET), %edx ++ jl L(copy_0) ++ movb (%rsi), %cl ++ je L(copy_1) ++ movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi ++ movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx) ++L(copy_1): + movb %cl, (%rdi) +-1: ++L(copy_0): + ret ++ ++#if SMALL_MOV_SIZE ++ .p2align 4,, 8 ++L(between_4_7): ++ /* From 4 to 7. No branch when size == 4. */ ++ movl -4(%rsi, %rdx), %ecx ++ movl (%rsi), %esi ++ movl %ecx, -4(%rdi, %rdx) ++ movl %esi, (%rdi) ++ ret ++#endif ++ ++#if VEC_SIZE > 16 ++ /* From 16 to 31. No branch when size == 16. */ ++ .p2align 4,, 8 ++L(between_16_31): ++ vmovdqu (%rsi), %xmm0 ++ vmovdqu -16(%rsi, %rdx), %xmm1 ++ vmovdqu %xmm0, (%rdi) ++ vmovdqu %xmm1, -16(%rdi, %rdx) ++ /* No ymm registers have been touched. */ ++ ret ++#endif ++ + #if VEC_SIZE > 32 ++ .p2align 4,, 10 + L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ + VMOVU (%rsi), %YMM0 +- VMOVU -32(%rsi,%rdx), %YMM1 ++ VMOVU -32(%rsi, %rdx), %YMM1 + VMOVU %YMM0, (%rdi) +- VMOVU %YMM1, -32(%rdi,%rdx) +- VZEROUPPER_RETURN +-#endif +-#if VEC_SIZE > 16 +- /* From 16 to 31. No branch when size == 16. */ +-L(between_16_31): +- VMOVU (%rsi), %XMM0 +- VMOVU -16(%rsi,%rdx), %XMM1 +- VMOVU %XMM0, (%rdi) +- VMOVU %XMM1, -16(%rdi,%rdx) ++ VMOVU %YMM1, -32(%rdi, %rdx) + VZEROUPPER_RETURN + #endif ++ ++ .p2align 4,, 10 + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +- movq -8(%rsi,%rdx), %rcx ++ movq -8(%rsi, %rdx), %rcx + movq (%rsi), %rsi +- movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) ++ movq %rcx, -8(%rdi, %rdx) + ret +-L(between_4_7): +- /* From 4 to 7. No branch when size == 4. */ +- movl -4(%rsi,%rdx), %ecx +- movl (%rsi), %esi +- movl %ecx, -4(%rdi,%rdx) +- movl %esi, (%rdi) +- ret +-L(between_2_3): +- /* From 2 to 3. No branch when size == 2. */ +- movzwl -2(%rsi,%rdx), %ecx +- movzwl (%rsi), %esi +- movw %cx, -2(%rdi,%rdx) +- movw %si, (%rdi) +- ret + ++ .p2align 4,, 10 ++L(last_4x_vec): ++ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ ++ ++ /* VEC(0) and VEC(1) have already been loaded. */ ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3) ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(1), VEC_SIZE(%rdi) ++ VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx) ++ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++ ++ .p2align 4 + #if defined USE_MULTIARCH && IS_IN (libc) + L(movsb_more_2x_vec): + cmp __x86_rep_movsb_threshold(%rip), %RDX_LP + ja L(movsb) + #endif + L(more_2x_vec): +- /* More than 2 * VEC and there may be overlap between destination +- and source. */ ++ /* More than 2 * VEC and there may be overlap between ++ destination and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) ++ /* Load VEC(1) regardless. VEC(0) has already been loaded. */ ++ VMOVU VEC_SIZE(%rsi), %VEC(1) + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) +- /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ +- VMOVU (%rsi), %VEC(0) +- VMOVU VEC_SIZE(%rsi), %VEC(1) ++ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. 
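The reworked L(copy_0)/L(copy_1) tail above handles 0 to 3 bytes with a single compare chain: load the head byte first, store an overlapping 2-byte tail for lengths 2 and 3, then store the head. A hedged C restatement for the SMALL_SIZE_OFFSET == 0 case (copy_0_to_3 is an illustrative name):

#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Model of the 0..3 byte memmove tail.  Loading the head before the
   tail store keeps it correct for overlapping src/dst.  */
static void
copy_0_to_3 (unsigned char *dst, const unsigned char *src, size_t n)
{
  if (n == 0)
    return;                           /* jl L(copy_0) */
  unsigned char head = src[0];        /* movb (%rsi), %cl */
  if (n > 1)                          /* je L(copy_1) when n == 1 */
    {
      uint16_t tail;
      memcpy (&tail, src + n - 2, 2); /* movzwl -2(%rsi, %rdx), %esi */
      memcpy (dst + n - 2, &tail, 2); /* movw %si, -2(%rdi, %rdx) */
    }
  dst[0] = head;                      /* movb %cl, (%rdi) */
}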
*/ + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) +- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) +- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) +- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) +- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5) ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6) ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) +- VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) +- VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) +- VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) +- VZEROUPPER_RETURN +-L(last_4x_vec): +- /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ +- VMOVU (%rsi), %VEC(0) +- VMOVU VEC_SIZE(%rsi), %VEC(1) +- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) +- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) +- VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(1), VEC_SIZE(%rdi) +- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) ++ VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx) ++ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx) ++ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx) + VZEROUPPER_RETURN + ++ .p2align 4,, 4 + L(more_8x_vec): ++ movq %rdi, %rcx ++ subq %rsi, %rcx ++ /* Go to backwards temporal copy if overlap no matter what as ++ backward REP MOVSB is slow and we don't want to use NT stores if ++ there is overlap. */ ++ cmpq %rdx, %rcx ++ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ ++ jb L(more_8x_vec_backward_check_nop) + /* Check if non-temporal move candidate. */ + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ +- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_memcpy_2x) + #endif +- /* Entry if rdx is greater than non-temporal threshold but there +- is overlap. */ ++ /* To reach this point there cannot be overlap and dst > src. So ++ check for overlap and src > dst in which case correctness ++ requires forward copy. Otherwise decide between backward/forward ++ copy depending on address aliasing. */ ++ ++ /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold ++ but less than __x86_shared_non_temporal_threshold. */ + L(more_8x_vec_check): +- cmpq %rsi, %rdi +- ja L(more_8x_vec_backward) +- /* Source == destination is less common. */ +- je L(nop) +- /* Load the first VEC and last 4 * VEC to support overlapping +- addresses. */ +- VMOVU (%rsi), %VEC(4) ++ /* rcx contains dst - src. Add back length (rdx). */ ++ leaq (%rcx, %rdx), %r8 ++ /* If r8 has different sign than rcx then there is overlap so we ++ must do forward copy. */ ++ xorq %rcx, %r8 ++ /* Isolate just sign bit of r8. */ ++ shrq $63, %r8 ++ /* Get 4k difference dst - src. */ ++ andl $(PAGE_SIZE - 256), %ecx ++ /* If r8 is non-zero must do foward for correctness. Otherwise ++ if ecx is non-zero there is 4k False Alaising so do backward ++ copy. */ ++ addl %r8d, %ecx ++ jz L(more_8x_vec_backward) ++ ++ /* if rdx is greater than __x86_shared_non_temporal_threshold ++ but there is overlap, or from short distance movsb. */ ++L(more_8x_vec_forward): ++ /* Load first and last 4 * VEC to support overlapping addresses. ++ */ ++ ++ /* First vec was already loaded into VEC(0). 
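The branch logic above is compact enough to be easy to misread, so here is a hedged C restatement of the direction choice (use_forward_copy is an illustrative name). By this point the dst > src overlapping case has already been routed backward by the earlier 'cmpq %rdx, %rcx; jb', so only two questions remain: does an src > dst overlap force a forward copy, and if not, does dst - src land in the 4 KiB false-aliasing window where a backward copy is cheaper?

#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096

/* Model of L(more_8x_vec_check).  diff == dst - src may be negative
   here.  */
static int
use_forward_copy (uintptr_t dst, uintptr_t src, size_t len)
{
  intptr_t diff = (intptr_t) (dst - src);        /* %rcx */
  intptr_t end = diff + (intptr_t) len;          /* %r8 = %rcx + %rdx */
  /* Differing signs of diff and diff + len mean dst < src < dst + len,
     so forward order is required for correctness ('shrq $63, %r8').  */
  int must_forward = (int) (((uintptr_t) (diff ^ end)) >> 63);
  /* Bits 8..11 of the distance all zero means src and dst sit within
     256 bytes of the same 4 KiB offset: 4k false aliasing, which the
     backward copy avoids ('andl $(PAGE_SIZE - 256), %ecx').  */
  int aliased_4k = (diff & (PAGE_SIZE - 256)) == 0;
  /* 'addl %r8d, %ecx; jz L(more_8x_vec_backward)'.  */
  return must_forward || !aliased_4k;
}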
*/ + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) ++ /* Save begining of dst. */ ++ movq %rdi, %rcx ++ /* Align dst to VEC_SIZE - 1. */ ++ orq $(VEC_SIZE - 1), %rdi + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) +- /* Save start and stop of the destination buffer. */ +- movq %rdi, %r11 +- leaq -VEC_SIZE(%rdi, %rdx), %rcx +- /* Align destination for aligned stores in the loop. Compute +- how much destination is misaligned. */ +- movq %rdi, %r8 +- andq $(VEC_SIZE - 1), %r8 +- /* Get the negative of offset for alignment. */ +- subq $VEC_SIZE, %r8 +- /* Adjust source. */ +- subq %r8, %rsi +- /* Adjust destination which should be aligned now. */ +- subq %r8, %rdi +- /* Adjust length. */ +- addq %r8, %rdx + +- .p2align 4 ++ /* Subtract dst from src. Add back after dst aligned. */ ++ subq %rcx, %rsi ++ /* Finish aligning dst. */ ++ incq %rdi ++ /* Restore src adjusted with new value for aligned dst. */ ++ addq %rdi, %rsi ++ /* Store end of buffer minus tail in rdx. */ ++ leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx ++ ++ /* Dont use multi-byte nop to align. */ ++ .p2align 4,, 11 + L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. */ +- VMOVU (%rsi), %VEC(0) +- VMOVU VEC_SIZE(%rsi), %VEC(1) +- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) +- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) ++ VMOVU (%rsi), %VEC(1) ++ VMOVU VEC_SIZE(%rsi), %VEC(2) ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4) + subq $-(VEC_SIZE * 4), %rsi +- addq $-(VEC_SIZE * 4), %rdx +- VMOVA %VEC(0), (%rdi) +- VMOVA %VEC(1), VEC_SIZE(%rdi) +- VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) +- VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) ++ VMOVA %VEC(1), (%rdi) ++ VMOVA %VEC(2), VEC_SIZE(%rdi) ++ VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi) + subq $-(VEC_SIZE * 4), %rdi +- cmpq $(VEC_SIZE * 4), %rdx ++ cmpq %rdi, %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ +- VMOVU %VEC(5), (%rcx) +- VMOVU %VEC(6), -VEC_SIZE(%rcx) +- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) +- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) ++ VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx) ++ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx) ++ VMOVU %VEC(7), VEC_SIZE(%rdx) ++ VMOVU %VEC(8), (%rdx) + /* Store the first VEC. */ +- VMOVU %VEC(4), (%r11) ++ VMOVU %VEC(0), (%rcx) ++ /* Keep L(nop_backward) target close to jmp for 2-byte encoding. ++ */ ++L(nop_backward): + VZEROUPPER_RETURN + ++ .p2align 4,, 8 ++L(more_8x_vec_backward_check_nop): ++ /* rcx contains dst - src. Test for dst == src to skip all of ++ memmove. */ ++ testq %rcx, %rcx ++ jz L(nop_backward) + L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ +- VMOVU (%rsi), %VEC(4) ++ ++ /* First vec was also loaded into VEC(0). */ + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) ++ /* Begining of region for 4x backward copy stored in rcx. */ ++ leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) +- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) +- /* Save stop of the destination buffer. */ +- leaq -VEC_SIZE(%rdi, %rdx), %r11 +- /* Align destination end for aligned stores in the loop. Compute +- how much destination end is misaligned. */ +- leaq -VEC_SIZE(%rsi, %rdx), %rcx +- movq %r11, %r9 +- movq %r11, %r8 +- andq $(VEC_SIZE - 1), %r8 +- /* Adjust source. */ +- subq %r8, %rcx +- /* Adjust the end of destination which should be aligned now. */ +- subq %r8, %r9 +- /* Adjust length. 
*/ +- subq %r8, %rdx +- +- .p2align 4 ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8) ++ /* Subtract dst from src. Add back after dst aligned. */ ++ subq %rdi, %rsi ++ /* Align dst. */ ++ andq $-(VEC_SIZE), %rcx ++ /* Restore src. */ ++ addq %rcx, %rsi ++ ++ /* Don't use multi-byte nop to align. */ ++ .p2align 4,, 11 + L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ +- VMOVU (%rcx), %VEC(0) +- VMOVU -VEC_SIZE(%rcx), %VEC(1) +- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) +- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) +- addq $-(VEC_SIZE * 4), %rcx +- addq $-(VEC_SIZE * 4), %rdx +- VMOVA %VEC(0), (%r9) +- VMOVA %VEC(1), -VEC_SIZE(%r9) +- VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) +- VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) +- addq $-(VEC_SIZE * 4), %r9 +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_4x_vec_backward) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1) ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) ++ VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3) ++ VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4) ++ addq $(VEC_SIZE * -4), %rsi ++ VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx) ++ VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx) ++ VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx) ++ VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx) ++ addq $(VEC_SIZE * -4), %rcx ++ cmpq %rcx, %rdi ++ jb L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ +- VMOVU %VEC(4), (%rdi) ++ VMOVU %VEC(0), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ +- VMOVU %VEC(8), (%r11) ++ VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi) ++ VZEROUPPER_RETURN ++ ++#if defined USE_MULTIARCH && IS_IN (libc) ++ /* L(skip_short_movsb_check) is only used with ERMS. Not for ++ FSRM. */ ++ .p2align 5,, 16 ++# if ALIGN_MOVSB ++L(skip_short_movsb_check): ++# if MOVSB_ALIGN_TO > VEC_SIZE ++ VMOVU VEC_SIZE(%rsi), %VEC(1) ++# endif ++# if MOVSB_ALIGN_TO > (VEC_SIZE * 2) ++# error Unsupported MOVSB_ALIGN_TO ++# endif ++ /* If CPU does not have FSRM two options for aligning. Align src ++ if dst and src 4k alias. Otherwise align dst. */ ++ testl $(PAGE_SIZE - 512), %ecx ++ jnz L(movsb_align_dst) ++ /* Fall through. dst and src 4k alias. It's better to align src ++ here because the bottleneck will be loads dues to the false ++ dependency on dst. */ ++ ++ /* rcx already has dst - src. */ ++ movq %rcx, %r9 ++ /* Add src to len. Subtract back after src aligned. -1 because ++ src is initially aligned to MOVSB_ALIGN_TO - 1. */ ++ leaq -1(%rsi, %rdx), %rcx ++ /* Inclusively align src to MOVSB_ALIGN_TO - 1. */ ++ orq $(MOVSB_ALIGN_TO - 1), %rsi ++ /* Restore dst and len adjusted with new values for aligned dst. ++ */ ++ leaq 1(%rsi, %r9), %rdi ++ subq %rsi, %rcx ++ /* Finish aligning src. */ ++ incq %rsi ++ ++ rep movsb ++ ++ VMOVU %VEC(0), (%r8) ++# if MOVSB_ALIGN_TO > VEC_SIZE ++ VMOVU %VEC(1), VEC_SIZE(%r8) ++# endif + VZEROUPPER_RETURN ++# endif ++ ++ .p2align 4,, 12 ++L(movsb): ++ movq %rdi, %rcx ++ subq %rsi, %rcx ++ /* Go to backwards temporal copy if overlap no matter what as ++ backward REP MOVSB is slow and we don't want to use NT stores if ++ there is overlap. */ ++ cmpq %rdx, %rcx ++ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ ++ jb L(more_8x_vec_backward_check_nop) ++# if ALIGN_MOVSB ++ /* Save dest for storing aligning VECs later. */ ++ movq %rdi, %r8 ++# endif ++ /* If above __x86_rep_movsb_stop_threshold most likely is ++ candidate for NT moves aswell. 
*/ ++ cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP ++ jae L(large_memcpy_2x_check) ++# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB ++ /* Only avoid short movsb if CPU has FSRM. */ ++ testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) ++ jz L(skip_short_movsb_check) ++# if AVOID_SHORT_DISTANCE_REP_MOVSB ++ /* Avoid "rep movsb" if RCX, the distance between source and ++ destination, is N*4GB + [1..63] with N >= 0. */ ++ ++ /* ecx contains dst - src. Early check for backward copy ++ conditions means only case of slow movsb with src = dst + [0, ++ 63] is ecx in [-63, 0]. Use unsigned comparison with -64 check ++ for that case. */ ++ cmpl $-64, %ecx ++ ja L(more_8x_vec_forward) ++# endif ++# endif ++# if ALIGN_MOVSB ++# if MOVSB_ALIGN_TO > VEC_SIZE ++ VMOVU VEC_SIZE(%rsi), %VEC(1) ++# endif ++# if MOVSB_ALIGN_TO > (VEC_SIZE * 2) ++# error Unsupported MOVSB_ALIGN_TO ++# endif ++ /* Fall through means cpu has FSRM. In that case exclusively ++ align destination. */ ++L(movsb_align_dst): ++ /* Subtract dst from src. Add back after dst aligned. */ ++ subq %rdi, %rsi ++ /* Exclusively align dst to MOVSB_ALIGN_TO (64). */ ++ addq $(MOVSB_ALIGN_TO - 1), %rdi ++ /* Add dst to len. Subtract back after dst aligned. */ ++ leaq (%r8, %rdx), %rcx ++ /* Finish aligning dst. */ ++ andq $-(MOVSB_ALIGN_TO), %rdi ++ /* Restore src and len adjusted with new values for aligned dst. ++ */ ++ addq %rdi, %rsi ++ subq %rdi, %rcx ++ ++ rep movsb ++ ++ /* Store VECs loaded for aligning. */ ++ VMOVU %VEC(0), (%r8) ++# if MOVSB_ALIGN_TO > VEC_SIZE ++ VMOVU %VEC(1), VEC_SIZE(%r8) ++# endif ++ VZEROUPPER_RETURN ++# else /* !ALIGN_MOVSB. */ ++L(skip_short_movsb_check): ++ mov %RDX_LP, %RCX_LP ++ rep movsb ++ ret ++# endif ++#endif + ++ .p2align 4,, 10 + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +- .p2align 4 ++L(large_memcpy_2x_check): ++ cmp __x86_rep_movsb_threshold(%rip), %RDX_LP ++ jb L(more_8x_vec_check) + L(large_memcpy_2x): +- /* Compute absolute value of difference between source and +- destination. */ +- movq %rdi, %r9 +- subq %rsi, %r9 +- movq %r9, %r8 +- leaq -1(%r9), %rcx +- sarq $63, %r8 +- xorq %r8, %r9 +- subq %r8, %r9 +- /* Don't use non-temporal store if there is overlap between +- destination and source since destination may be in cache when +- source is loaded. */ +- cmpq %r9, %rdx +- ja L(more_8x_vec_check) ++ /* To reach this point it is impossible for dst > src and ++ overlap. Remaining to check is src > dst and overlap. rcx ++ already contains dst - src. Negate rcx to get src - dst. If ++ length > rcx then there is overlap and forward copy is best. */ ++ negq %rcx ++ cmpq %rcx, %rdx ++ ja L(more_8x_vec_forward) + + /* Cache align destination. First store the first 64 bytes then + adjust alignments. */ +- VMOVU (%rsi), %VEC(8) +-#if VEC_SIZE < 64 +- VMOVU VEC_SIZE(%rsi), %VEC(9) +-#if VEC_SIZE < 32 +- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) +- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) +-#endif +-#endif +- VMOVU %VEC(8), (%rdi) +-#if VEC_SIZE < 64 +- VMOVU %VEC(9), VEC_SIZE(%rdi) +-#if VEC_SIZE < 32 +- VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) +-#endif +-#endif ++ ++ /* First vec was also loaded into VEC(0). 
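The destination-alignment dance above (subtract src, round dst, add src back) is easier to follow in C. This is a hedged outline only, assuming no overlap, a length well above the movsb threshold, and MOVSB_ALIGN_TO == 64; movsb_align_dst_model and rep_movsb stand in for the real labels and the 'rep movsb' instruction:

#include <string.h>
#include <stdint.h>
#include <stddef.h>

#define MOVSB_ALIGN_TO 64

static void
rep_movsb (char *dst, const char *src, size_t n)   /* stand-in */
{
  memcpy (dst, src, n);
}

/* Model of L(movsb_align_dst): save the unaligned head in vector
   registers (a buffer here), round dst up to 64 bytes, run rep movsb
   from the aligned point, then write the saved head last.  */
static void
movsb_align_dst_model (char *dst, const char *src, size_t len)
{
  char head[MOVSB_ALIGN_TO];
  memcpy (head, src, MOVSB_ALIGN_TO);          /* VMOVU %VEC(0)/%VEC(1) */
  char *adst = (char *) (((uintptr_t) dst + MOVSB_ALIGN_TO - 1)
                         & -(uintptr_t) MOVSB_ALIGN_TO);
  size_t skip = (size_t) (adst - dst);
  rep_movsb (adst, src + skip, len - skip);
  memcpy (dst, head, MOVSB_ALIGN_TO);          /* VMOVU %VEC(0), (%r8) */
}

Rewriting the head with the same bytes after rep movsb is harmless here because this path is only reached when the buffers do not overlap.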
*/ ++# if VEC_SIZE < 64 ++ VMOVU VEC_SIZE(%rsi), %VEC(1) ++# if VEC_SIZE < 32 ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) ++# endif ++# endif ++ VMOVU %VEC(0), (%rdi) ++# if VEC_SIZE < 64 ++ VMOVU %VEC(1), VEC_SIZE(%rdi) ++# if VEC_SIZE < 32 ++ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) ++ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) ++# endif ++# endif ++ + /* Adjust source, destination, and size. */ + movq %rdi, %r8 + andq $63, %r8 +@@ -614,9 +767,13 @@ L(large_memcpy_2x): + /* Adjust length. */ + addq %r8, %rdx + +- /* Test if source and destination addresses will alias. If they do +- the larger pipeline in large_memcpy_4x alleviated the ++ /* Test if source and destination addresses will alias. If they ++ do the larger pipeline in large_memcpy_4x alleviated the + performance drop. */ ++ ++ /* ecx contains -(dst - src). not ecx will return dst - src - 1 ++ which works for testing aliasing. */ ++ notl %ecx + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx + jz L(large_memcpy_4x) + +@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer): + /* ecx stores inner loop counter. */ + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx + L(loop_large_memcpy_4x_inner): +- /* Only one prefetch set per page as doing 4 pages give more time +- for prefetcher to keep up. */ ++ /* Only one prefetch set per page as doing 4 pages give more ++ time for prefetcher to keep up. */ + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) diff --git a/glibc-upstream-2.34-182.patch b/glibc-upstream-2.34-182.patch new file mode 100644 index 0000000..563ff9d --- /dev/null +++ b/glibc-upstream-2.34-182.patch @@ -0,0 +1,131 @@ +commit cecbac52123456e2fbcff062a4165bf7b9174797 +Author: Noah Goldstein +Date: Mon Nov 1 00:49:52 2021 -0500 + + x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h + + No bug. + + This patch doubles the rep_movsb_threshold when using ERMS. Based on + benchmarks the vector copy loop, especially now that it handles 4k + aliasing, is better for these medium ranged. + + On Skylake with ERMS: + + Size, Align1, Align2, dst>src,(rep movsb) / (vec copy) + 4096, 0, 0, 0, 0.975 + 4096, 0, 0, 1, 0.953 + 4096, 12, 0, 0, 0.969 + 4096, 12, 0, 1, 0.872 + 4096, 44, 0, 0, 0.979 + 4096, 44, 0, 1, 0.83 + 4096, 0, 12, 0, 1.006 + 4096, 0, 12, 1, 0.989 + 4096, 0, 44, 0, 0.739 + 4096, 0, 44, 1, 0.942 + 4096, 12, 12, 0, 1.009 + 4096, 12, 12, 1, 0.973 + 4096, 44, 44, 0, 0.791 + 4096, 44, 44, 1, 0.961 + 4096, 2048, 0, 0, 0.978 + 4096, 2048, 0, 1, 0.951 + 4096, 2060, 0, 0, 0.986 + 4096, 2060, 0, 1, 0.963 + 4096, 2048, 12, 0, 0.971 + 4096, 2048, 12, 1, 0.941 + 4096, 2060, 12, 0, 0.977 + 4096, 2060, 12, 1, 0.949 + 8192, 0, 0, 0, 0.85 + 8192, 0, 0, 1, 0.845 + 8192, 13, 0, 0, 0.937 + 8192, 13, 0, 1, 0.939 + 8192, 45, 0, 0, 0.932 + 8192, 45, 0, 1, 0.927 + 8192, 0, 13, 0, 0.621 + 8192, 0, 13, 1, 0.62 + 8192, 0, 45, 0, 0.53 + 8192, 0, 45, 1, 0.516 + 8192, 13, 13, 0, 0.664 + 8192, 13, 13, 1, 0.659 + 8192, 45, 45, 0, 0.593 + 8192, 45, 45, 1, 0.575 + 8192, 2048, 0, 0, 0.854 + 8192, 2048, 0, 1, 0.834 + 8192, 2061, 0, 0, 0.863 + 8192, 2061, 0, 1, 0.857 + 8192, 2048, 13, 0, 0.63 + 8192, 2048, 13, 1, 0.629 + 8192, 2061, 13, 0, 0.627 + 8192, 2061, 13, 1, 0.62 + + Signed-off-by: Noah Goldstein + Reviewed-by: H.J. 
Lu + (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index e6c94dfd023a25dc..2e43e67e4f4037d3 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ + unsigned int minimum_rep_movsb_threshold; + #endif +- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ ++ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for ++ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB ++ threshold is 2048 * (VEC_SIZE / 16). */ + unsigned int rep_movsb_threshold; + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) + { +- rep_movsb_threshold = 2048 * (64 / 16); ++ rep_movsb_threshold = 4096 * (64 / 16); + #if HAVE_TUNABLES + minimum_rep_movsb_threshold = 64 * 8; + #endif +@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + else if (CPU_FEATURE_PREFERRED_P (cpu_features, + AVX_Fast_Unaligned_Load)) + { +- rep_movsb_threshold = 2048 * (32 / 16); ++ rep_movsb_threshold = 4096 * (32 / 16); + #if HAVE_TUNABLES + minimum_rep_movsb_threshold = 32 * 8; + #endif +diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list +index dd6e1d65c9490d4f..419313804d49cf65 100644 +--- a/sysdeps/x86/dl-tunables.list ++++ b/sysdeps/x86/dl-tunables.list +@@ -32,17 +32,21 @@ glibc { + } + x86_rep_movsb_threshold { + type: SIZE_T +- # Since there is overhead to set up REP MOVSB operation, REP MOVSB +- # isn't faster on short data. The memcpy micro benchmark in glibc +- # shows that 2KB is the approximate value above which REP MOVSB +- # becomes faster than SSE2 optimization on processors with Enhanced +- # REP MOVSB. Since larger register size can move more data with a +- # single load and store, the threshold is higher with larger register +- # size. Note: Since the REP MOVSB threshold must be greater than 8 +- # times of vector size and the default value is 2048 * (vector size +- # / 16), the default value and the minimum value must be updated at +- # run-time. NB: Don't set the default value since we can't tell if +- # the tunable value is set by user or not [BZ #27069]. ++ # Since there is overhead to set up REP MOVSB operation, REP ++ # MOVSB isn't faster on short data. The memcpy micro benchmark ++ # in glibc shows that 2KB is the approximate value above which ++ # REP MOVSB becomes faster than SSE2 optimization on processors ++ # with Enhanced REP MOVSB. Since larger register size can move ++ # more data with a single load and store, the threshold is ++ # higher with larger register size. Micro benchmarks show AVX ++ # REP MOVSB becomes faster apprximately at 8KB. The AVX512 ++ # threshold is extrapolated to 16KB. For machines with FSRM the ++ # threshold is universally set at 2112 bytes. Note: Since the ++ # REP MOVSB threshold must be greater than 8 times of vector ++ # size and the default value is 4096 * (vector size / 16), the ++ # default value and the minimum value must be updated at ++ # run-time. NB: Don't set the default value since we can't tell ++ # if the tunable value is set by user or not [BZ #27069]. 
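Put together, the defaults described above reduce to a small formula. A hedged C restatement (default_rep_movsb_threshold is an illustrative name; the real code in dl-cacheinfo.h also consults CPU feature bits and the tunables framework):

/* Model of the new REP MOVSB threshold defaults: 2048 * (16 / 16)
   for SSE2, 4096 * (32 / 16) for AVX, 4096 * (64 / 16) for AVX-512,
   never below 8 times the vector size.  */
static unsigned int
default_rep_movsb_threshold (unsigned int vec_size)
{
  unsigned int base = (vec_size >= 32) ? 4096 : 2048;
  unsigned int threshold = base * (vec_size / 16);
  unsigned int minimum = vec_size * 8;
  return (threshold < minimum) ? minimum : threshold;
}

That yields 2 KB, 8 KB, and 16 KB for 16-, 32-, and 64-byte vectors, matching the commit message's benchmark-derived crossover points.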
+ minval: 1
+ }
+ x86_rep_stosb_threshold {
diff --git a/glibc-upstream-2.34-183.patch b/glibc-upstream-2.34-183.patch
new file mode 100644
index 0000000..a1a7285
--- /dev/null
+++ b/glibc-upstream-2.34-183.patch
@@ -0,0 +1,2423 @@
+commit 7cb126e7e7febf9dc3e369cc3e4885e34fb9433b
+Author: Noah Goldstein
+Date: Wed Nov 10 16:18:56 2021 -0600
+
+ x86: Shrink memcmp-sse4.S code size
+
+ No bug.
+
+ This implementation refactors memcmp-sse4.S primarily with minimizing
+ code size in mind. It does this by removing the lookup table logic and
+ removing the unrolled check from (256, 512] bytes.
+
+ memcmp-sse4 code size reduction : -3487 bytes
+ wmemcmp-sse4 code size reduction: -1472 bytes
+
+ The current memcmp-sse4.S implementation has a large code size
+ cost. This has serious adverse effects on the ICache / ITLB. While
+ in micro-benchmarks the implementation appears fast, traces of
+ real-world code have shown that the speed in micro benchmarks does not
+ translate when the ICache/ITLB are not primed, and that the cost
+ of the code size has measurable negative effects on overall
+ application performance.
+
+ See https://research.google/pubs/pub48320/ for more details.
+
+ Signed-off-by: Noah Goldstein
+ Reviewed-by: H.J. Lu
+ (cherry picked from commit 2f9062d7171850451e6044ef78d91ff8c017b9c0)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+index b7ac034569ec6178..97c102a9c5ab2b91 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+@@ -25,14 +25,14 @@
+ # define MEMCMP __memcmp_sse4_1
+ # endif
+
+-# define JMPTBL(I, B) (I - B)
++#ifdef USE_AS_WMEMCMP
++# define CMPEQ pcmpeqd
++# define CHAR_SIZE 4
++#else
++# define CMPEQ pcmpeqb
++# define CHAR_SIZE 1
++#endif
+
+-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+- lea TABLE(%rip), %r11; \
+- movslq (%r11, INDEX, SCALE), %rcx; \
+- add %r11, %rcx; \
+- _CET_NOTRACK jmp *%rcx; \
+- ud2
+
+ /* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+@@ -47,33 +47,253 @@ ENTRY (MEMCMP)
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
+ # endif
+- pxor %xmm0, %xmm0
+ cmp $79, %RDX_LP
+ ja L(79bytesormore)
++
++ cmp $CHAR_SIZE, %RDX_LP
++ jbe L(firstbyte)
++
++ /* N in (CHAR_SIZE, 79) bytes.
*/ ++ cmpl $32, %edx ++ ja L(more_32_bytes) ++ ++ cmpl $16, %edx ++ jae L(16_to_32_bytes) ++ + # ifndef USE_AS_WMEMCMP +- cmp $1, %RDX_LP +- je L(firstbyte) ++ cmpl $8, %edx ++ jae L(8_to_16_bytes) ++ ++ cmpl $4, %edx ++ jb L(2_to_3_bytes) ++ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ ++ bswap %eax ++ bswap %ecx ++ ++ shlq $32, %rax ++ shlq $32, %rcx ++ ++ movl -4(%rdi, %rdx), %edi ++ movl -4(%rsi, %rdx), %esi ++ ++ bswap %edi ++ bswap %esi ++ ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(2_to_3_bytes): ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ subl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(8_to_16_bytes): ++ movq (%rdi), %rax ++ movq (%rsi), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ jne L(8_to_16_bytes_done) ++ ++ movq -8(%rdi, %rdx), %rax ++ movq -8(%rsi, %rdx), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ ++L(8_to_16_bytes_done): ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++# else ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl 4(%rdi), %ecx ++ cmpl 4(%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl -4(%rdi, %rdx), %ecx ++ cmpl -4(%rsi, %rdx), %ecx ++ jne L(8_to_16_bytes_done) ++ ret + # endif +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-# ifndef USE_AS_WMEMCMP +- .p2align 4 ++ .p2align 4,, 3 ++L(ret_zero): ++ xorl %eax, %eax ++L(zero): ++ ret ++ ++ .p2align 4,, 8 + L(firstbyte): ++ jb L(ret_zero) ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ je L(zero) ++L(8_to_16_bytes_done): ++ setg %al ++ leal -1(%rax, %rax), %eax ++# else + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax ++# endif + ret ++ ++ .p2align 4 ++L(vec_return_begin_48): ++ addq $16, %rdi ++ addq $16, %rsi ++L(vec_return_begin_32): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl 32(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl 32(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl 32(%rsi, %rax), %ecx ++ movzbl 32(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_begin_16): ++ addq $16, %rdi ++ addq $16, %rsi ++L(vec_return_begin): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_end_16): ++ subl $16, %edx ++L(vec_return_end): ++ bsfl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -16(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -16(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -16(%rsi, %rax), %ecx ++ movzbl -16(%rdi, %rax), %eax ++ subl %ecx, %eax + # endif ++ ret ++ ++ .p2align 4,, 8 ++L(more_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ cmpl $64, %edx ++ jbe L(32_to_64_bytes) ++ movdqu 32(%rdi), %xmm0 ++ movdqu 32(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ 
pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ .p2align 4,, 6 ++L(32_to_64_bytes): ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ ++ .p2align 4 ++L(16_to_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ + + .p2align 4 + L(79bytesormore): ++ movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm1 +- movdqu (%rdi), %xmm2 +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi +@@ -86,1694 +306,499 @@ L(79bytesormore): + + cmp $128, %rdx + ja L(128bytesormore) +-L(less128bytes): +- sub $64, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) + +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(less128bytes): ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(last_64_bytes): ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + ++ .p2align 4 + L(128bytesormore): +- cmp $512, %rdx +- ja L(512bytesormore) + cmp $256, %rdx +- ja L(less512bytes) ++ ja L(unaligned_loop) + L(less256bytes): +- sub $128, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc 
L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi +- +- cmp $64, %rdx +- jae L(less128bytes) +- +- cmp $32, %rdx +- jb L(less32bytesin128) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +-L(less512bytes): +- sub $256, %rdx +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqu 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqu 144(%rdi), %xmm2 +- pxor 144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqu 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqu 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqu 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqu 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqu 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqu 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- cmp $128, %rdx +- jae L(less256bytes) ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ 
movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytes) ++ ja L(less128bytes) + + cmp $32, %rdx +- jb L(less32bytesin256) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormore): ++L(unaligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add %r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_unaglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_unaligned) + sub $64, %rdx + .p2align 4 + L(64bytesormore_loop): +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loop) ++ ja L(64bytesormore_loop) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(loop_tail): ++ addq %rdx, %rdi ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ addq %rdx, %rsi ++ movdqu (%rsi), %xmm4 ++ movdqu 16(%rsi), %xmm5 ++ movdqu 32(%rsi), %xmm6 ++ movdqu 48(%rsi), %xmm7 ++ ++ CMPEQ %xmm4, %xmm0 ++ CMPEQ %xmm5, %xmm1 ++ CMPEQ %xmm6, %xmm2 ++ CMPEQ %xmm7, %xmm3 ++ ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 ++ ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) ++ ret + +-L(L2_L3_cache_unaglined): +- sub $64, %rdx ++L(L2_L3_cache_unaligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand 
%xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(L2_L3_unaligned_128bytes_loop) ++ ja L(L2_L3_unaligned_128bytes_loop) ++ jmp L(loop_tail) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-/* +- * This case is for machines which are sensitive for unaligned instructions. +- */ ++ /* This case is for machines which are sensitive for unaligned ++ * instructions. */ + .p2align 4 + L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) + L(less128bytesin2aligned): +- sub $64, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64in2alinged) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64in2alinged): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(aligned_last_64_bytes): ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 + L(128bytesormorein2aligned): +- cmp $512, %rdx +- ja L(512bytesormorein2aligned) + cmp $256, %rdx +- ja L(256bytesormorein2aligned) ++ ja L(aligned_loop) + L(less256bytesin2alinged): +- sub $128, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- 
pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytesin2aligned) ++ ja L(less128bytesin2aligned) + + cmp $32, %rdx +- jb L(less32bytesin128in2aligned) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128in2aligned): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +- .p2align 4 +-L(256bytesormorein2aligned): +- +- sub $256, %rdx +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqa 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqa 144(%rdi), %xmm2 +- pxor 144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqa 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqa 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqa 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqa 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqa 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqa 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- 
cmp $128, %rdx +- jae L(less256bytesin2alinged) +- +- cmp $64, %rdx +- jae L(less128bytesin2aligned) +- +- cmp $32, %rdx +- jb L(less32bytesin256in2alinged) +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256in2alinged): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(aligned_last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormorein2aligned): ++L(aligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add %r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_aglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_aligned) + + sub $64, %rdx + .p2align 4 + L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 ++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loopin2aligned) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +-L(L2_L3_cache_aglined): +- sub $64, %rdx ++ ja L(64bytesormore_loopin2aligned) ++ jmp L(loop_tail) + ++L(L2_L3_cache_aligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 ++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- jae L(L2_L3_aligned_128bytes_loop) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + ++ addq $64, %rsi ++ addq $64, %rdi ++ subq $64, %rdx ++ ja L(L2_L3_aligned_128bytes_loop) ++ jmp L(loop_tail) + + .p2align 4 + L(64bytesormore_loop_end): +- add $16, %rdi +- add $16, %rsi +- ptest %xmm2, %xmm0 +- jnc L(16bytes) +- +- add 
$16, %rdi +- add $16, %rsi +- ptest %xmm3, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- ptest %xmm4, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- jmp L(16bytes) +- +-L(256bytesin256): +- add $256, %rdi +- add $256, %rsi +- jmp L(16bytes) +-L(240bytesin256): +- add $240, %rdi +- add $240, %rsi +- jmp L(16bytes) +-L(224bytesin256): +- add $224, %rdi +- add $224, %rsi +- jmp L(16bytes) +-L(208bytesin256): +- add $208, %rdi +- add $208, %rsi +- jmp L(16bytes) +-L(192bytesin256): +- add $192, %rdi +- add $192, %rsi +- jmp L(16bytes) +-L(176bytesin256): +- add $176, %rdi +- add $176, %rsi +- jmp L(16bytes) +-L(160bytesin256): +- add $160, %rdi +- add $160, %rsi +- jmp L(16bytes) +-L(144bytesin256): +- add $144, %rdi +- add $144, %rsi +- jmp L(16bytes) +-L(128bytesin256): +- add $128, %rdi +- add $128, %rsi +- jmp L(16bytes) +-L(112bytesin256): +- add $112, %rdi +- add $112, %rsi +- jmp L(16bytes) +-L(96bytesin256): +- add $96, %rdi +- add $96, %rsi +- jmp L(16bytes) +-L(80bytesin256): +- add $80, %rdi +- add $80, %rsi +- jmp L(16bytes) +-L(64bytesin256): +- add $64, %rdi +- add $64, %rsi +- jmp L(16bytes) +-L(48bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(32bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytes): +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(8bytes): +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(12bytes): +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(4bytes): +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +-L(0bytes): +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal case for wmemcmp */ +- .p2align 4 +-L(65bytes): +- movdqu -65(%rdi), %xmm1 +- movdqu -65(%rsi), %xmm2 +- mov $-65, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(49bytes): +- movdqu -49(%rdi), %xmm1 +- movdqu -49(%rsi), %xmm2 +- mov $-49, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(33bytes): +- movdqu -33(%rdi), %xmm1 +- movdqu -33(%rsi), %xmm2 +- mov $-33, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(17bytes): +- mov -17(%rdi), %rax +- mov -17(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(9bytes): +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- ret +- +- .p2align 4 +-L(13bytes): +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(5bytes): +- mov -5(%rdi), %eax +- mov -5(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- ret +- +- .p2align 4 +-L(66bytes): +- movdqu -66(%rdi), %xmm1 +- movdqu -66(%rsi), %xmm2 +- mov $-66, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(50bytes): +- movdqu -50(%rdi), %xmm1 +- movdqu -50(%rsi), %xmm2 +- mov $-50, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(34bytes): +- movdqu -34(%rdi), %xmm1 +- movdqu -34(%rsi), %xmm2 +- mov $-34, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) 
+-L(18bytes): +- mov -18(%rdi), %rax +- mov -18(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(10bytes): +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(14bytes): +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(6bytes): +- mov -6(%rdi), %eax +- mov -6(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +-L(2bytes): +- movzwl -2(%rsi), %ecx +- movzwl -2(%rdi), %eax +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(67bytes): +- movdqu -67(%rdi), %xmm2 +- movdqu -67(%rsi), %xmm1 +- mov $-67, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(51bytes): +- movdqu -51(%rdi), %xmm2 +- movdqu -51(%rsi), %xmm1 +- mov $-51, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(35bytes): +- movdqu -35(%rsi), %xmm1 +- movdqu -35(%rdi), %xmm2 +- mov $-35, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(19bytes): +- mov -19(%rdi), %rax +- mov -19(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(11bytes): +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(15bytes): +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(7bytes): +- mov -7(%rdi), %eax +- mov -7(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(3bytes): +- movzwl -3(%rdi), %eax +- movzwl -3(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin2bytes) +-L(1bytes): +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +-# endif +- +- .p2align 4 +-L(68bytes): +- movdqu -68(%rdi), %xmm2 +- movdqu -68(%rsi), %xmm1 +- mov $-68, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(52bytes): +- movdqu -52(%rdi), %xmm2 +- movdqu -52(%rsi), %xmm1 +- mov $-52, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(36bytes): +- movdqu -36(%rdi), %xmm2 +- movdqu -36(%rsi), %xmm1 +- mov $-36, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(20bytes): +- movdqu -20(%rdi), %xmm2 +- movdqu -20(%rsi), %xmm1 +- mov $-20, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -4(%rsi), %ecx +- +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(69bytes): +- movdqu -69(%rsi), %xmm1 +- movdqu -69(%rdi), %xmm2 +- mov $-69, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(53bytes): +- movdqu -53(%rsi), %xmm1 +- movdqu -53(%rdi), %xmm2 +- mov $-53, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(37bytes): +- movdqu -37(%rsi), %xmm1 +- movdqu -37(%rdi), %xmm2 +- mov 
$-37, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(21bytes): +- movdqu -21(%rsi), %xmm1 +- movdqu -21(%rdi), %xmm2 +- mov $-21, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(70bytes): +- movdqu -70(%rsi), %xmm1 +- movdqu -70(%rdi), %xmm2 +- mov $-70, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(54bytes): +- movdqu -54(%rsi), %xmm1 +- movdqu -54(%rdi), %xmm2 +- mov $-54, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(38bytes): +- movdqu -38(%rsi), %xmm1 +- movdqu -38(%rdi), %xmm2 +- mov $-38, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(22bytes): +- movdqu -22(%rsi), %xmm1 +- movdqu -22(%rdi), %xmm2 +- mov $-22, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(71bytes): +- movdqu -71(%rsi), %xmm1 +- movdqu -71(%rdi), %xmm2 +- mov $-71, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(55bytes): +- movdqu -55(%rdi), %xmm2 +- movdqu -55(%rsi), %xmm1 +- mov $-55, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(39bytes): +- movdqu -39(%rdi), %xmm2 +- movdqu -39(%rsi), %xmm1 +- mov $-39, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(23bytes): +- movdqu -23(%rdi), %xmm2 +- movdqu -23(%rsi), %xmm1 +- mov $-23, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- +- .p2align 4 +-L(72bytes): +- movdqu -72(%rsi), %xmm1 +- movdqu -72(%rdi), %xmm2 +- mov $-72, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(56bytes): +- movdqu -56(%rdi), %xmm2 +- movdqu -56(%rsi), %xmm1 +- mov $-56, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(40bytes): +- movdqu -40(%rdi), %xmm2 +- movdqu -40(%rsi), %xmm1 +- mov $-40, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(24bytes): +- movdqu -24(%rdi), %xmm2 +- movdqu -24(%rsi), %xmm1 +- mov $-24, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -8(%rsi), %rcx +- mov -8(%rdi), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(73bytes): +- movdqu -73(%rsi), %xmm1 +- movdqu -73(%rdi), %xmm2 +- mov $-73, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(57bytes): +- movdqu -57(%rdi), %xmm2 +- movdqu -57(%rsi), %xmm1 +- mov $-57, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(41bytes): +- movdqu -41(%rdi), %xmm2 +- movdqu -41(%rsi), %xmm1 +- mov $-41, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(25bytes): +- movdqu -25(%rdi), %xmm2 +- movdqu -25(%rsi), %xmm1 +- mov $-25, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(74bytes): +- movdqu -74(%rsi), %xmm1 +- movdqu -74(%rdi), %xmm2 +- mov $-74, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(58bytes): +- 
movdqu -58(%rdi), %xmm2 +- movdqu -58(%rsi), %xmm1 +- mov $-58, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(42bytes): +- movdqu -42(%rdi), %xmm2 +- movdqu -42(%rsi), %xmm1 +- mov $-42, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(26bytes): +- movdqu -26(%rdi), %xmm2 +- movdqu -26(%rsi), %xmm1 +- mov $-26, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- jmp L(diffin2bytes) +- +- .p2align 4 +-L(75bytes): +- movdqu -75(%rsi), %xmm1 +- movdqu -75(%rdi), %xmm2 +- mov $-75, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(59bytes): +- movdqu -59(%rdi), %xmm2 +- movdqu -59(%rsi), %xmm1 +- mov $-59, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(43bytes): +- movdqu -43(%rdi), %xmm2 +- movdqu -43(%rsi), %xmm1 +- mov $-43, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(27bytes): +- movdqu -27(%rdi), %xmm2 +- movdqu -27(%rsi), %xmm1 +- mov $-27, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(76bytes): +- movdqu -76(%rsi), %xmm1 +- movdqu -76(%rdi), %xmm2 +- mov $-76, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(60bytes): +- movdqu -60(%rdi), %xmm2 +- movdqu -60(%rsi), %xmm1 +- mov $-60, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(44bytes): +- movdqu -44(%rdi), %xmm2 +- movdqu -44(%rsi), %xmm1 +- mov $-44, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(28bytes): +- movdqu -28(%rdi), %xmm2 +- movdqu -28(%rsi), %xmm1 +- mov $-28, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(77bytes): +- movdqu -77(%rsi), %xmm1 +- movdqu -77(%rdi), %xmm2 +- mov $-77, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(61bytes): +- movdqu -61(%rdi), %xmm2 +- movdqu -61(%rsi), %xmm1 +- mov $-61, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(45bytes): +- movdqu -45(%rdi), %xmm2 +- movdqu -45(%rsi), %xmm1 +- mov $-45, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(29bytes): +- movdqu -29(%rdi), %xmm2 +- movdqu -29(%rsi), %xmm1 +- mov $-29, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(78bytes): +- movdqu -78(%rsi), %xmm1 +- movdqu -78(%rdi), %xmm2 +- mov $-78, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(62bytes): +- movdqu -62(%rdi), %xmm2 +- movdqu -62(%rsi), %xmm1 +- mov $-62, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(46bytes): +- movdqu -46(%rdi), %xmm2 +- movdqu -46(%rsi), 
%xmm1 +- mov $-46, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(30bytes): +- movdqu -30(%rdi), %xmm2 +- movdqu -30(%rsi), %xmm1 +- mov $-30, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(79bytes): +- movdqu -79(%rsi), %xmm1 +- movdqu -79(%rdi), %xmm2 +- mov $-79, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(63bytes): +- movdqu -63(%rdi), %xmm2 +- movdqu -63(%rsi), %xmm1 +- mov $-63, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(47bytes): +- movdqu -47(%rdi), %xmm2 +- movdqu -47(%rsi), %xmm1 +- mov $-47, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(31bytes): +- movdqu -31(%rdi), %xmm2 +- movdqu -31(%rsi), %xmm1 +- mov $-31, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(64bytes): +- movdqu -64(%rdi), %xmm2 +- movdqu -64(%rsi), %xmm1 +- mov $-64, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(48bytes): +- movdqu -48(%rdi), %xmm2 +- movdqu -48(%rsi), %xmm1 +- mov $-48, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(32bytes): +- movdqu -32(%rdi), %xmm2 +- movdqu -32(%rsi), %xmm1 +- mov $-32, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-/* +- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. 
+- */ +- .p2align 3 +-L(less16bytes): +- movsbq %dl, %rdx +- mov (%rsi, %rdx), %rcx +- mov (%rdi, %rdx), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov 8(%rsi, %rdx), %rcx +- mov 8(%rdi, %rdx), %rax +-L(diffin8bytes): +- cmp %eax, %ecx +- jne L(diffin4bytes) +- shr $32, %rcx +- shr $32, %rax +- ++ pmovmskb %xmm0, %ecx ++ incw %cx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm1, %ecx ++ notw %cx ++ sall $16, %ecx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm2, %ecx ++ notw %cx ++ shlq $32, %rcx ++ jnz L(loop_end_ret) ++ ++ addq $48, %rdi ++ addq $48, %rsi ++ movq %rax, %rcx ++ ++ .p2align 4,, 6 ++L(loop_end_ret): ++ bsfq %rcx, %rcx + # ifdef USE_AS_WMEMCMP +-/* for wmemcmp */ +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- +-L(diffin4bytes): +-# ifndef USE_AS_WMEMCMP +- cmp %cx, %ax +- jne L(diffin2bytes) +- shr $16, %ecx +- shr $16, %eax +-L(diffin2bytes): +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(end): +- and $0xff, %eax +- and $0xff, %ecx +- sub %ecx, %eax +- ret ++ movl (%rdi, %rcx), %eax ++ xorl %edx, %edx ++ cmpl (%rsi, %rcx), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- +-/* for wmemcmp */ +- mov $1, %eax +- jl L(nequal_bigger) +- neg %eax +- ret +- +- .p2align 4 +-L(nequal_bigger): +- ret +- +-L(unreal_case): +- xor %eax, %eax +- ret ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +- ++ ret + END (MEMCMP) +- +- .section .rodata.sse4.1,"a",@progbits +- .p2align 3 +-# ifndef USE_AS_WMEMCMP +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(1bytes), L(table_64bytes)) +- .int JMPTBL (L(2bytes), L(table_64bytes)) +- .int JMPTBL (L(3bytes), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(5bytes), L(table_64bytes)) +- .int JMPTBL (L(6bytes), L(table_64bytes)) +- .int JMPTBL (L(7bytes), L(table_64bytes)) +- .int JMPTBL (L(8bytes), L(table_64bytes)) +- .int JMPTBL (L(9bytes), L(table_64bytes)) +- .int JMPTBL (L(10bytes), L(table_64bytes)) +- .int JMPTBL (L(11bytes), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(13bytes), L(table_64bytes)) +- .int JMPTBL (L(14bytes), L(table_64bytes)) +- .int JMPTBL (L(15bytes), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(17bytes), L(table_64bytes)) +- .int JMPTBL (L(18bytes), L(table_64bytes)) +- .int JMPTBL (L(19bytes), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(21bytes), L(table_64bytes)) +- .int JMPTBL (L(22bytes), L(table_64bytes)) +- .int JMPTBL (L(23bytes), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(25bytes), L(table_64bytes)) +- .int JMPTBL (L(26bytes), L(table_64bytes)) +- .int JMPTBL (L(27bytes), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(29bytes), L(table_64bytes)) +- .int JMPTBL (L(30bytes), L(table_64bytes)) +- .int JMPTBL (L(31bytes), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(33bytes), L(table_64bytes)) +- .int JMPTBL (L(34bytes), L(table_64bytes)) +- .int JMPTBL (L(35bytes), L(table_64bytes)) +- .int JMPTBL (L(36bytes), L(table_64bytes)) +- .int JMPTBL (L(37bytes), L(table_64bytes)) +- .int JMPTBL (L(38bytes), L(table_64bytes)) +- .int JMPTBL (L(39bytes), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(41bytes), L(table_64bytes)) +- .int JMPTBL (L(42bytes), 
L(table_64bytes)) +- .int JMPTBL (L(43bytes), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(45bytes), L(table_64bytes)) +- .int JMPTBL (L(46bytes), L(table_64bytes)) +- .int JMPTBL (L(47bytes), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(49bytes), L(table_64bytes)) +- .int JMPTBL (L(50bytes), L(table_64bytes)) +- .int JMPTBL (L(51bytes), L(table_64bytes)) +- .int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(53bytes), L(table_64bytes)) +- .int JMPTBL (L(54bytes), L(table_64bytes)) +- .int JMPTBL (L(55bytes), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(57bytes), L(table_64bytes)) +- .int JMPTBL (L(58bytes), L(table_64bytes)) +- .int JMPTBL (L(59bytes), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(61bytes), L(table_64bytes)) +- .int JMPTBL (L(62bytes), L(table_64bytes)) +- .int JMPTBL (L(63bytes), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(65bytes), L(table_64bytes)) +- .int JMPTBL (L(66bytes), L(table_64bytes)) +- .int JMPTBL (L(67bytes), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(69bytes), L(table_64bytes)) +- .int JMPTBL (L(70bytes), L(table_64bytes)) +- .int JMPTBL (L(71bytes), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(73bytes), L(table_64bytes)) +- .int JMPTBL (L(74bytes), L(table_64bytes)) +- .int JMPTBL (L(75bytes), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(77bytes), L(table_64bytes)) +- .int JMPTBL (L(78bytes), L(table_64bytes)) +- .int JMPTBL (L(79bytes), L(table_64bytes)) +-# else +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(8bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(36bytes), 
L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +-# endif + #endif diff --git a/glibc-upstream-2.34-184.patch b/glibc-upstream-2.34-184.patch new file mode 100644 index 0000000..805f91e --- /dev/null +++ b/glibc-upstream-2.34-184.patch @@ -0,0 +1,104 @@ +commit 4bbd0f866ad0ff197f72346f776ebee9b7e1a706 +Author: Noah Goldstein +Date: Fri Dec 3 15:29:25 2021 -0800 + + x86-64: Use notl in EVEX strcmp [BZ #28646] + + Must use notl %edi here as lower bits are for CHAR comparisons + potentially out of range thus can be 0 without indicating mismatch. + This fixes BZ #28646. + + Co-Authored-By: H.J. 
Lu
+ (cherry picked from commit 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02)
+
+diff --git a/string/test-strcmp.c b/string/test-strcmp.c
+index 7feababf4ddc5603..a0255b9625fbcedd 100644
+--- a/string/test-strcmp.c
++++ b/string/test-strcmp.c
+@@ -25,6 +25,7 @@
+ # define TEST_NAME "strcmp"
+ #endif
+ #include "test-string.h"
++#include
+
+ #ifdef WIDE
+ # include
+@@ -392,6 +393,32 @@ check2 (void)
+ }
+ }
+
++static void
++check3 (void)
++{
++ size_t size = 0xd000 + 0x4000;
++ CHAR *s1, *s2;
++ CHAR *buffer1 = mmap (NULL, size, PROT_READ | PROT_WRITE,
++ MAP_PRIVATE | MAP_ANON, -1, 0);
++ CHAR *buffer2 = mmap (NULL, size, PROT_READ | PROT_WRITE,
++ MAP_PRIVATE | MAP_ANON, -1, 0);
++ if (buffer1 == MAP_FAILED || buffer2 == MAP_FAILED)
++ error (EXIT_UNSUPPORTED, errno, "mmap failed");
++
++ s1 = (CHAR *) (buffer1 + 0x8f8 / sizeof (CHAR));
++ s2 = (CHAR *) (buffer2 + 0xcff3 / sizeof (CHAR));
++
++ STRCPY(s1, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/util/PathDocFileFactory.java"));
++ STRCPY(s2, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/taglets/ThrowsTaglet.java"));
++
++ int exp_result = SIMPLE_STRCMP (s1, s2);
++ FOR_EACH_IMPL (impl, 0)
++ check_result (impl, s1, s2, exp_result);
++
++ munmap ((void *) buffer1, size);
++ munmap ((void *) buffer2, size);
++}
++
+ int
+ test_main (void)
+ {
+@@ -400,6 +427,7 @@ test_main (void)
+ test_init ();
+ check();
+ check2 ();
++ check3 ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 82f12ac89bcae20b..6f5c4bf984da2b80 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -656,12 +656,13 @@ L(loop_cross_page):
+ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
+ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+ kmovd %k3, %edi
++ /* Must use notl %edi here as lower bits are for CHAR
++ comparisons potentially out of range thus can be 0 without
++ indicating mismatch. */
++ notl %edi
+ # ifdef USE_AS_WCSCMP
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
+- notl %edi
+ andl $0xff, %edi
+-# else
+- incl %edi
+ # endif
+
+ # ifdef USE_AS_WCSCMP
+@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
+ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
+ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+ kmovd %k3, %edi
++ /* Must use notl %edi here as lower bits are for CHAR
++ comparisons potentially out of range thus can be 0 without
++ indicating mismatch. */
++ notl %edi
+ # ifdef USE_AS_WCSCMP
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
+- notl %edi
+ andl $0xff, %edi
+-# else
+- incl %edi
+ # endif
+
+ # ifdef USE_AS_WCSCMP
diff --git a/glibc-upstream-2.34-185.patch b/glibc-upstream-2.34-185.patch
new file mode 100644
index 0000000..f06f86f
--- /dev/null
+++ b/glibc-upstream-2.34-185.patch
@@ -0,0 +1,30 @@
+commit f3a99b2216114f89b20329ae7664b764248b4bbd
+Author: H.J. Lu
+Date: Mon Dec 6 07:14:12 2021 -0800
+
+ x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
+
+ Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
+ they won't lower CPU frequency when ZMM load and store instructions are
+ used.
+
+ (cherry picked from commit ceeffe968c01b1202e482f4855cb6baf5c6cb713)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index f4d4049e391cbabd..09590d8794b1c6fb 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -566,8 +566,11 @@ disable_tsx:
+ |= bit_arch_Prefer_No_VZEROUPPER;
+ else
+ {
+- cpu_features->preferred[index_arch_Prefer_No_AVX512]
+- |= bit_arch_Prefer_No_AVX512;
++ /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
++ when ZMM load and store instructions are used. */
++ if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
++ cpu_features->preferred[index_arch_Prefer_No_AVX512]
++ |= bit_arch_Prefer_No_AVX512;
+
+ /* Avoid RTM abort triggered by VZEROUPPER inside a
+ transactionally executing RTM region. */
diff --git a/glibc-upstream-2.34-186.patch b/glibc-upstream-2.34-186.patch
new file mode 100644
index 0000000..a046844
--- /dev/null
+++ b/glibc-upstream-2.34-186.patch
@@ -0,0 +1,384 @@
+commit c796418d00f65c8c5fbed477f3ba6da2bee64ece
+Author: Noah Goldstein
+Date: Fri Dec 24 18:54:41 2021 -0600
+
+ x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+
+ No bug.
+ Optimizations are twofold.
+
+ 1) Replace page cross and 0/1 checks with masked load instructions in
+ L(less_vec). In applications this reduces branch-misses in the
+ hot [0, 32] case.
+ 2) Change control flow so that the L(less_vec) case gets the fall through.
+
+ Change 2) helps copies in the [0, 32] size range but comes at the cost
+ of copies in the [33, 64] size range. From profiles of GCC and
+ Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
+ appears to be the right tradeoff.
+
+ Signed-off-by: Noah Goldstein
+ Reviewed-by: H.J. Lu
+ (cherry picked from commit abddd61de090ae84e380aff68a98bd94ef704667)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 640f6757fac8a356..d2899e7c7078cd41 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -62,15 +62,18 @@ Latency:
+ # define VMOVU vmovdqu64
+
+ # ifdef USE_AS_WMEMCMP
++# define VMOVU_MASK vmovdqu32
+ # define CHAR_SIZE 4
+ # define VPCMP vpcmpd
+ # define VPTEST vptestmd
+ # else
++# define VMOVU_MASK vmovdqu8
+ # define CHAR_SIZE 1
+ # define VPCMP vpcmpub
+ # define VPTEST vptestmb
+ # endif
+
++
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
+ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ movl %edx, %edx
+ # endif
+ cmp $CHAR_PER_VEC, %RDX_LP
+- jb L(less_vec)
++ /* Fall through for [0, VEC_SIZE] as it's the hottest. */
++ ja L(more_1x_vec)
++
++ /* Create mask for CHAR's we want to compare. This allows us to
++ avoid having to include page cross logic. */
++ movl $-1, %ecx
++ bzhil %edx, %ecx, %ecx
++ kmovd %ecx, %k2
++
++ /* Safe to load full ymm with mask. */
++ VMOVU_MASK (%rsi), %YMM2{%k2}
++ VPCMP $4,(%rdi), %YMM2, %k1{%k2}
++ kmovd %k1, %eax
++ testl %eax, %eax
++ jnz L(return_vec_0)
++ ret
+
++ .p2align 4
++L(return_vec_0):
++ tzcntl %eax, %eax
++# ifdef USE_AS_WMEMCMP
++ movl (%rdi, %rax, CHAR_SIZE), %ecx
++ xorl %edx, %edx
++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
++ /* NB: no partial register stall here because xorl zero idiom
++ above. */
++ setg %dl
++ leal -1(%rdx, %rdx), %eax
++# else
++ movzbl (%rsi, %rax), %ecx
++ movzbl (%rdi, %rax), %eax
++ subl %ecx, %eax
++# endif
++ ret
++
++
++ .p2align 4
++L(more_1x_vec):
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE.
*/ + VMOVU (%rsi), %YMM1 + /* Use compare not equals to directly check for mismatch. */ +- VPCMP $4, (%rdi), %YMM1, %k1 ++ VPCMP $4,(%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to + L(return_vec_[0,2]). For L(return_vec_3) destination register +@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6) + + /* Check third and fourth VEC no matter what. */ + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_3) +@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6) + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while + oring with YMM1. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + + /* Or together YMM2, YMM3, and YMM4 into YMM4. */ + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 +@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6) + /* NB: eax must be zero to reach here. */ + ret + +- .p2align 4 ++ ++ .p2align 4,, 8 + L(8x_end_return_vec_0_1_2_3): + movq %rdx, %rdi + L(8x_return_vec_0_1_2_3): +@@ -222,23 +262,6 @@ L(return_vec_3): + # endif + ret + +- .p2align 4 +-L(return_vec_0): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. */ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret + + .p2align 4 + L(return_vec_1): +@@ -297,7 +320,7 @@ L(loop_4x_vec): + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -324,7 +347,7 @@ L(loop_4x_vec): + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -336,14 +359,14 @@ L(loop_4x_vec): + /* Only entry is from L(more_8x_vec). */ + .p2align 4,, 10 + L(8x_last_2x_vec): +- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_2) + /* Naturally aligned to 16 bytes. */ + L(8x_last_1x_vec): + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 +- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_3) +@@ -392,7 +415,9 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4,, 10 ++ ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. */ + L(return_vec_1_end): + /* Use bsf to save code size. This is necessary to have + L(one_or_less) fit in aligning bytes between. 
*/ +@@ -411,31 +436,8 @@ L(return_vec_1_end): + # endif + ret + +- /* NB: L(one_or_less) fits in alignment padding between +- L(return_vec_1_end) and L(return_vec_0_end). */ +-# ifdef USE_AS_WMEMCMP +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax +- ret +-# else +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +- ret +-# endif +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. */ + L(return_vec_0_end): + tzcntl %eax, %eax + addl %edx, %eax +@@ -451,146 +453,7 @@ L(return_vec_0_end): + subl %ecx, %eax + # endif + ret ++ /* 1-byte until next cache line. */ + +- .p2align 4 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size == 0 +- but is also faster for size == CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) +- +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. */ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) +- +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Check if any matches where in bounds. Intentionally not +- storing result in eax to limit dependency chain if it goes to +- L(return_vec_0_lv). */ +- bzhil %edx, %eax, %edx +- jnz L(return_vec_0_lv) +- xorl %eax, %eax +- ret +- +- /* Essentially duplicate of L(return_vec_0). Ends up not costing +- any code as shrinks L(less_vec) by allowing 2-byte encoding of +- the jump and ends up fitting in aligning bytes. As well fits on +- same cache line as L(less_vec) so also saves a line from having +- to be fetched on cold calls to memcmp. */ +- .p2align 4,, 4 +-L(return_vec_0_lv): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. */ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(page_cross_less_vec): +- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +- bytes. */ +- cmpl $(16 / CHAR_SIZE), %edx +- jae L(between_16_31) +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(between_8_15) +- cmpl $4, %edx +- jb L(between_2_3) +- +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- /* edx is guranteed to be positive int32 in range [4, 7]. */ +- cmovne %edx, %eax +- /* ecx is -1 if rcx > rax. Otherwise 0. */ +- sbbl %ecx, %ecx +- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == +- rax then eax and ecx are zero. If rax < rax then ecx is -1 so +- eax doesn't matter. */ +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(between_8_15): +-# endif +- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ +- vmovq (%rdi), %xmm1 +- vmovq (%rsi), %xmm2 +- VPCMP $4, %xmm1, %xmm2, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_lv) +- /* Use overlapping loads to avoid branches. 
*/
+- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
+- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
+- VPCMP $4, %xmm1, %xmm2, %k1
+- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+- kmovd %k1, %eax
+- testl %eax, %eax
+- jnz L(return_vec_0_end)
+- ret
+-
+- .p2align 4,, 8
+-L(between_16_31):
+- /* From 16 to 31 bytes. No branch when size == 16. */
+-
+- /* Use movups to save code size. */
+- vmovdqu (%rsi), %xmm2
+- VPCMP $4, (%rdi), %xmm2, %k1
+- kmovd %k1, %eax
+- testl %eax, %eax
+- jnz L(return_vec_0_lv)
+- /* Use overlapping loads to avoid branches. */
+- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
+- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+- kmovd %k1, %eax
+- testl %eax, %eax
+- jnz L(return_vec_0_end)
+- ret
+-
+-# ifndef USE_AS_WMEMCMP
+-L(between_2_3):
+- /* Load as big endian to avoid branches. */
+- movzwl (%rdi), %eax
+- movzwl (%rsi), %ecx
+- shll $8, %eax
+- shll $8, %ecx
+- bswap %eax
+- bswap %ecx
+- movzbl -1(%rdi, %rdx), %edi
+- movzbl -1(%rsi, %rdx), %esi
+- orl %edi, %eax
+- orl %esi, %ecx
+- /* Subtraction is okay because the upper 8 bits are zero. */
+- subl %ecx, %eax
+- ret
+-# endif
+ END (MEMCMP)
+ #endif
diff --git a/glibc-upstream-2.34-187.patch b/glibc-upstream-2.34-187.patch
new file mode 100644
index 0000000..6186aeb
--- /dev/null
+++ b/glibc-upstream-2.34-187.patch
@@ -0,0 +1,42 @@
+commit 9681691402052b727e01ae3375c73e0f76566593
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date: Wed Apr 27 13:59:26 2022 -0300
+
+ linux: Fix missing internal 64 bit time_t stat usage
+
+ These are two spots missed by the initial change in 52a5fe70a2c77935.
+
+ Checked on i686-linux-gnu.
+
+ (cherry picked from commit 834ddd0432f68d6dc85b6aac95065721af0d86e9)
+
+diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c
+index 13160d32499c4e58..00e4ce7f80ee2dfe 100644
+--- a/sysdeps/unix/sysv/linux/faccessat.c
++++ b/sysdeps/unix/sysv/linux/faccessat.c
+@@ -39,8 +39,8 @@ __faccessat (int fd, const char *file, int mode, int flag)
+ if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
+ return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
+
+- struct stat64 stats;
+- if (__fstatat64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
++ struct __stat64_t64 stats;
++ if (__fstatat64_time64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
+ return -1;
+
+ mode &= (X_OK | W_OK | R_OK); /* Clear any bogus bits.
*/
+diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c
+index b599a66c930cad4d..f79930303118ebcd 100644
+--- a/sysdeps/unix/sysv/linux/pathconf.c
++++ b/sysdeps/unix/sysv/linux/pathconf.c
+@@ -110,8 +110,8 @@ distinguish_extX (const struct statfs *fsbuf, const char *file, int fd)
+ && strcmp (mntbuf.mnt_type, "ext4") != 0)
+ continue;
+
+- struct stat64 fsst;
+- if (__stat64 (mntbuf.mnt_dir, &fsst) >= 0
++ struct __stat64_t64 fsst;
++ if (__stat64_time64 (mntbuf.mnt_dir, &fsst) >= 0
+ && st.st_dev == fsst.st_dev)
+ {
+ if (strcmp (mntbuf.mnt_type, "ext4") == 0)
diff --git a/glibc-upstream-2.34-188.patch b/glibc-upstream-2.34-188.patch
new file mode 100644
index 0000000..8b49369
--- /dev/null
+++ b/glibc-upstream-2.34-188.patch
@@ -0,0 +1,39 @@
+commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe
+Author: Carlos O'Donell <carlos@redhat.com>
+Date: Tue Apr 26 10:52:41 2022 -0400
+
+ i386: Regenerate ulps
+
+ These failures were caught while building glibc master for Fedora
+ Rawhide, which is built with '-mtune=generic -msse2 -mfpmath=sse'
+ using gcc 11.3 (gcc-11.3.1-2.fc35) on a Cascadelake Intel Xeon
+ processor.
+
+ (cherry picked from commit e465d97653311c3687aee49de782177353acfe86)
+
+diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
+index 7601049110789201..84e6686eba5fe79a 100644
+--- a/sysdeps/i386/fpu/libm-test-ulps
++++ b/sysdeps/i386/fpu/libm-test-ulps
+@@ -668,7 +668,7 @@ ldouble: 4
+
+ Function: Imaginary part of "clog10":
+ double: 2
+-float: 1
++float: 2
+ float128: 2
+ ldouble: 2
+
+diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+index a39c89cec1141935..cc21e6907fe8b6a3 100644
+--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
++++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+@@ -668,7 +668,7 @@ ldouble: 4
+
+ Function: Imaginary part of "clog10":
+ double: 2
+-float: 1
++float: 2
+ float128: 2
+ ldouble: 2
+
diff --git a/glibc.spec b/glibc.spec
index b0f53f7..4df755d 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -148,7 +148,7 @@ end \
 Summary: The GNU libc libraries
 Name: glibc
 Version: %{glibcversion}
-Release: 30%{?dist}
+Release: 31%{?dist}
 
 # In general, GPLv2+ is used by programs, LGPLv2+ is used for
 # libraries.
@@ -379,17 +379,17 @@ Patch175: glibc-rh2058224-2.patch
 Patch176: glibc-rh2058230.patch
 Patch177: glibc-rh2054789.patch
 Patch178: glibc-upstream-2.34-108.patch
-Patch179: glibc-upstream-2.34-110.patch
 # glibc-2.34-109-gd64b08d5ba only changes NEWS.
+Patch179: glibc-upstream-2.34-110.patch
 Patch180: glibc-upstream-2.34-111.patch
 Patch181: glibc-upstream-2.34-112.patch
 Patch182: glibc-upstream-2.34-113.patch
 Patch183: glibc-upstream-2.34-114.patch
+# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
+# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
 Patch184: glibc-upstream-2.34-117.patch
 Patch185: glibc-upstream-2.34-118.patch
 Patch186: glibc-upstream-2.34-119.patch
-# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
-# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
 Patch187: glibc-upstream-2.34-120.patch
 Patch188: glibc-upstream-2.34-121.patch
 Patch189: glibc-upstream-2.34-122.patch
@@ -437,6 +437,28 @@ Patch229: glibc-upstream-2.34-163.patch
 Patch230: glibc-upstream-2.34-164.patch
 Patch231: glibc-upstream-2.34-165.patch
 Patch232: glibc-upstream-2.34-166.patch
+Patch233: glibc-upstream-2.34-167.patch
+Patch234: glibc-upstream-2.34-168.patch
+Patch235: glibc-upstream-2.34-169.patch
+Patch236: glibc-upstream-2.34-170.patch
+Patch237: glibc-upstream-2.34-171.patch
+Patch238: glibc-upstream-2.34-172.patch
+Patch239: glibc-upstream-2.34-173.patch
+Patch240: glibc-upstream-2.34-174.patch
+Patch241: glibc-upstream-2.34-175.patch
+Patch242: glibc-upstream-2.34-176.patch
+Patch243: glibc-upstream-2.34-177.patch
+Patch244: glibc-upstream-2.34-178.patch
+Patch245: glibc-upstream-2.34-179.patch
+Patch246: glibc-upstream-2.34-180.patch
+Patch247: glibc-upstream-2.34-181.patch
+Patch248: glibc-upstream-2.34-182.patch
+Patch249: glibc-upstream-2.34-183.patch
+Patch250: glibc-upstream-2.34-184.patch
+Patch251: glibc-upstream-2.34-185.patch
+Patch252: glibc-upstream-2.34-186.patch
+Patch253: glibc-upstream-2.34-187.patch
+Patch254: glibc-upstream-2.34-188.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2493,6 +2515,32 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 
 %changelog
+* Wed Apr 27 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-31
+- Sync with upstream branch release/2.34/master,
+  commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe:
+- i386: Regenerate ulps
+- linux: Fix missing internal 64 bit time_t stat usage
+- x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+- x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
+- x86-64: Use notl in EVEX strcmp [BZ #28646]
+- x86: Shrink memcmp-sse4.S code size
+- x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
+- x86: Optimize memmove-vec-unaligned-erms.S
+- x86-64: Replace movzx with movzbl
+- x86-64: Remove Prefer_AVX2_STRCMP
+- x86-64: Improve EVEX strcmp with masked load
+- x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
+- x86: Optimize memset-vec-unaligned-erms.S
+- x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
+- x86: Modify ENTRY in sysdep.h so that p2align can be specified
+- x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
+- scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
+- dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
+- INSTALL: Rephrase -with-default-link documentation
+- misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
+- Default to --with-default-link=no (bug 25812)
+- scripts: Add glibcelf.py module
+
 * Thu Apr 21 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-30
 - Sync with upstream branch release/2.34/master,
   commit 71326f1f2fd09dafb9c34404765fb88129e94237:
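The masked-load technique behind "x86: Optimize L(less_vec) case in memcmp-evex-movbe.S" in the changelog above can be re-expressed with AVX-512 intrinsics. The sketch below is illustrative only, not glibc's implementation: it assumes AVX512BW, AVX512VL and BMI2 are available, and the function name memcmp_le32 is invented here. A bzhi-generated lane mask (the movl $-1; bzhil; kmovd sequence in the patch) lets a single masked load handle every size in [0, 32] without a page-cross check, because masked-out bytes are never architecturally accessed and therefore cannot fault.

/* Build with: gcc -O2 -mavx512bw -mavx512vl -mbmi2 */
#include <immintrin.h>
#include <stddef.h>

static int
memcmp_le32 (const void *s1, const void *s2, size_t len)
{
  /* len must be in [0, 32]: set the low `len' bits of the k-mask,
     mirroring the bzhil instruction in the patch.  */
  __mmask32 k = (__mmask32) _bzhi_u32 (~0U, (unsigned int) len);

  /* Zero-masked loads: bytes not selected by `k' are not accessed,
     so reading a full 32-byte vector close to a page end is safe.  */
  __m256i a = _mm256_maskz_loadu_epi8 (k, s1);
  __m256i b = _mm256_maskz_loadu_epi8 (k, s2);

  /* Compare-not-equal restricted to the valid lanes, like
     VPCMP $4,(%rdi), %YMM2, %k1{%k2} in the patch.  */
  __mmask32 neq = _mm256_mask_cmpneq_epu8_mask (k, a, b);
  if (neq == 0)
    return 0;

  /* The first differing byte decides the result, as in
     L(return_vec_0).  */
  unsigned int i = _tzcnt_u32 (neq);
  return ((const unsigned char *) s1)[i] - ((const unsigned char *) s2)[i];
}

The branch-free mask creation is the design point: the pre-change code had to test for a possible page cross (and for sizes 0 and 1) before it could safely touch a full vector, and those checks are exactly the branch misses the commit message reports as dominating the hot [0, 32] case.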