diff --git a/glibc-upstream-2.34-167.patch b/glibc-upstream-2.34-167.patch new file mode 100644 index 0000000..e00042d --- /dev/null +++ b/glibc-upstream-2.34-167.patch @@ -0,0 +1,1446 @@ +commit 3e0a91b79b409a6a4113a0fdb08221c0bb29cfce +Author: Florian Weimer +Date: Mon Apr 11 11:28:08 2022 +0200 + + scripts: Add glibcelf.py module + + Hopefully, this will lead to tests that are easier to maintain. The + current approach of parsing readelf -W output using regular expressions + is not necessarily easier than parsing the ELF data directly. + + This module is still somewhat incomplete (e.g., coverage of relocation + types and versioning information is missing), but it is sufficient to + perform basic symbol analysis or program header analysis. + + The EM_* mapping for architecture-specific constant classes (e.g., + SttX86_64) is not yet implemented. The classes are defined for the + benefit of elf/tst-glibcelf.py. + + Reviewed-by: Siddhesh Poyarekar + (cherry picked from commit 30035d67728a846fa39749cd162afd278ac654c4) + +diff --git a/elf/Makefile b/elf/Makefile +index 8e2dd91c583f9a62..8afbe3f6ab259331 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -1053,6 +1053,13 @@ CFLAGS-tst-prelink.c += -fno-pie + tst-prelink-no-pie = yes + endif + ++tests-special += $(objpfx)tst-glibcelf.out ++$(objpfx)tst-glibcelf.out: tst-glibcelf.py elf.h $(..)/scripts/glibcelf.py \ ++ $(..)/scripts/glibcextract.py ++ PYTHONPATH=$(..)scripts $(PYTHON) tst-glibcelf.py \ ++ --cc="$(CC) $(patsubst -DMODULE_NAME=%,-DMODULE_NAME=testsuite,$(CPPFLAGS))" \ ++ < /dev/null > $@ 2>&1; $(evaluate-test) ++ + # The test requires shared _and_ PIE because the executable + # unit test driver must be able to link with the shared object + # that is going to eventually go into an installed DSO. +diff --git a/elf/tst-glibcelf.py b/elf/tst-glibcelf.py +new file mode 100644 +index 0000000000000000..bf15a3bad4479e08 +--- /dev/null ++++ b/elf/tst-glibcelf.py +@@ -0,0 +1,260 @@ ++#!/usr/bin/python3 ++# Verify scripts/glibcelf.py contents against elf/elf.h. ++# Copyright (C) 2022 Free Software Foundation, Inc. ++# This file is part of the GNU C Library. ++# ++# The GNU C Library is free software; you can redistribute it and/or ++# modify it under the terms of the GNU Lesser General Public ++# License as published by the Free Software Foundation; either ++# version 2.1 of the License, or (at your option) any later version. ++# ++# The GNU C Library is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# Lesser General Public License for more details. ++# ++# You should have received a copy of the GNU Lesser General Public ++# License along with the GNU C Library; if not, see ++# . ++ ++import argparse ++import enum ++import sys ++ ++import glibcelf ++import glibcextract ++ ++errors_encountered = 0 ++ ++def error(message): ++ global errors_encountered ++ sys.stdout.write('error: {}\n'.format(message)) ++ errors_encountered += 1 ++ ++# The enum constants in glibcelf are expected to have exactly these ++# prefixes. 
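++#
++# For example, 'PT_GNU_RELRO' carries the PT_ prefix and 'SHF_ALLOC'
++# the SHF_ prefix; relocation constants such as 'R_X86_64_RELATIVE'
++# have no expected prefix because glibcelf does not cover them yet.
++# A sketch of the lookup that find_constant_prefix (below) performs:
++#
++#   >>> find_constant_prefix('PT_GNU_RELRO')
++#   'PT_'
++#   >>> find_constant_prefix('R_X86_64_RELATIVE') is None
++#   True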
++expected_constant_prefixes = tuple( ++ 'ELFCLASS ELFDATA EM_ ET_ DT_ PF_ PT_ SHF_ SHN_ SHT_ STB_ STT_'.split()) ++ ++def find_constant_prefix(name): ++ """Returns a matching prefix from expected_constant_prefixes or None.""" ++ for prefix in expected_constant_prefixes: ++ if name.startswith(prefix): ++ return prefix ++ return None ++ ++def find_enum_types(): ++ """A generator for OpenIntEnum and IntFlag classes in glibcelf.""" ++ for obj in vars(glibcelf).values(): ++ if isinstance(obj, type) and obj.__bases__[0] in ( ++ glibcelf._OpenIntEnum, enum.Enum, enum.IntFlag): ++ yield obj ++ ++def check_duplicates(): ++ """Verifies that enum types do not have duplicate values. ++ ++ Different types must have different member names, too. ++ ++ """ ++ global_seen = {} ++ for typ in find_enum_types(): ++ seen = {} ++ last = None ++ for (name, e) in typ.__members__.items(): ++ if e.value in seen: ++ error('{} has {}={} and {}={}'.format( ++ typ, seen[e.value], e.value, name, e.value)) ++ last = e ++ else: ++ seen[e.value] = name ++ if last is not None and last.value > e.value: ++ error('{} has {}={} after {}={}'.format( ++ typ, name, e.value, last.name, last.value)) ++ if name in global_seen: ++ error('{} used in {} and {}'.format( ++ name, global_seen[name], typ)) ++ else: ++ global_seen[name] = typ ++ ++def check_constant_prefixes(): ++ """Check that the constant prefixes match expected_constant_prefixes.""" ++ seen = set() ++ for typ in find_enum_types(): ++ typ_prefix = None ++ for val in typ: ++ prefix = find_constant_prefix(val.name) ++ if prefix is None: ++ error('constant {!r} for {} has unknown prefix'.format( ++ val, typ)) ++ break ++ elif typ_prefix is None: ++ typ_prefix = prefix ++ seen.add(typ_prefix) ++ elif prefix != typ_prefix: ++ error('prefix {!r} for constant {!r}, expected {!r}'.format( ++ prefix, val, typ_prefix)) ++ if typ_prefix is None: ++ error('empty enum type {}'.format(typ)) ++ ++ for prefix in sorted(set(expected_constant_prefixes) - seen): ++ error('missing constant prefix {!r}'.format(prefix)) ++ # Reverse difference is already covered inside the loop. ++ ++def find_elf_h_constants(cc): ++ """Returns a dictionary of relevant constants from .""" ++ return glibcextract.compute_macro_consts( ++ source_text='#include ', ++ cc=cc, ++ macro_re='|'.join( ++ prefix + '.*' for prefix in expected_constant_prefixes)) ++ ++# The first part of the pair is a name of an constant that is ++# dropped from glibcelf. The second part is the constant as it is ++# used in . ++glibcelf_skipped_aliases = ( ++ ('EM_ARC_A5', 'EM_ARC_COMPACT'), ++ ('PF_PARISC_SBP', 'PF_HP_SBP') ++) ++ ++# Constants that provide little value and are not included in ++# glibcelf: *LO*/*HI* range constants, *NUM constants counting the ++# number of constants. Also includes the alias names from ++# glibcelf_skipped_aliases. 
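++#
++# A rough sketch of the set construction below: the alias names from
++# the table above are merged with the literal skip list, so both
++# 'EM_ARC_A5' and e.g. 'DT_NUM' end up being skipped:
++#
++#   >>> sorted(frozenset(['EM_ARC_A5', 'PF_PARISC_SBP']) | {'DT_NUM'})
++#   ['DT_NUM', 'EM_ARC_A5', 'PF_PARISC_SBP']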
++glibcelf_skipped_constants = frozenset(
++    [e[0] for e in glibcelf_skipped_aliases]) | frozenset("""
++DT_AARCH64_NUM
++DT_ADDRNUM
++DT_ADDRRNGHI
++DT_ADDRRNGLO
++DT_ALPHA_NUM
++DT_ENCODING
++DT_EXTRANUM
++DT_HIOS
++DT_HIPROC
++DT_IA_64_NUM
++DT_LOOS
++DT_LOPROC
++DT_MIPS_NUM
++DT_NUM
++DT_PPC64_NUM
++DT_PPC_NUM
++DT_PROCNUM
++DT_SPARC_NUM
++DT_VALNUM
++DT_VALRNGHI
++DT_VALRNGLO
++DT_VERSIONTAGNUM
++ELFCLASSNUM
++ELFDATANUM
++ET_HIOS
++ET_HIPROC
++ET_LOOS
++ET_LOPROC
++ET_NUM
++PF_MASKOS
++PF_MASKPROC
++PT_HIOS
++PT_HIPROC
++PT_HISUNW
++PT_LOOS
++PT_LOPROC
++PT_LOSUNW
++SHF_MASKOS
++SHF_MASKPROC
++SHN_HIOS
++SHN_HIPROC
++SHN_HIRESERVE
++SHN_LOOS
++SHN_LOPROC
++SHN_LORESERVE
++SHT_HIOS
++SHT_HIPROC
++SHT_HISUNW
++SHT_HIUSER
++SHT_LOOS
++SHT_LOPROC
++SHT_LOSUNW
++SHT_LOUSER
++SHT_NUM
++STB_HIOS
++STB_HIPROC
++STB_LOOS
++STB_LOPROC
++STB_NUM
++STT_HIOS
++STT_HIPROC
++STT_LOOS
++STT_LOPROC
++STT_NUM
++""".strip().split())
++
++def check_constant_values(cc):
++    """Checks the values of constants against glibcelf."""
++
++    glibcelf_constants = {
++        e.name: e for typ in find_enum_types() for e in typ}
++    elf_h_constants = find_elf_h_constants(cc=cc)
++
++    missing_in_glibcelf = (set(elf_h_constants) - set(glibcelf_constants)
++                           - glibcelf_skipped_constants)
++    for name in sorted(missing_in_glibcelf):
++        error('constant {} is missing from glibcelf'.format(name))
++
++    unexpected_in_glibcelf = \
++        set(glibcelf_constants) & glibcelf_skipped_constants
++    for name in sorted(unexpected_in_glibcelf):
++        error('constant {} is supposed to be filtered from glibcelf'.format(
++            name))
++
++    missing_in_elf_h = set(glibcelf_constants) - set(elf_h_constants)
++    for name in sorted(missing_in_elf_h):
++        error('constant {} is missing from <elf.h>'.format(name))
++
++    expected_in_elf_h = glibcelf_skipped_constants - set(elf_h_constants)
++    for name in expected_in_elf_h:
++        error('filtered constant {} is missing from <elf.h>'.format(name))
++
++    for alias_name, name_in_glibcelf in glibcelf_skipped_aliases:
++        if name_in_glibcelf not in glibcelf_constants:
++            error('alias value {} for {} not in glibcelf'.format(
++                name_in_glibcelf, alias_name))
++        elif (int(elf_h_constants[alias_name])
++              != glibcelf_constants[name_in_glibcelf].value):
++            error('<elf.h> has {}={}, glibcelf has {}={}'.format(
++                alias_name, elf_h_constants[alias_name],
++                name_in_glibcelf, glibcelf_constants[name_in_glibcelf]))
++
++    # Check for value mismatches:
++    for name in sorted(set(glibcelf_constants) & set(elf_h_constants)):
++        glibcelf_value = glibcelf_constants[name].value
++        elf_h_value = int(elf_h_constants[name])
++        # On 32-bit architectures <elf.h> has some constants that are
++        # parsed as signed, while they are unsigned in glibcelf.  So
++        # far, this only affects some flag constants, so special-case
++        # them here.
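++        # Concretely, SHF_EXCLUDE is 1 << 31: glibcelf stores
++        # 2147483648, while a 32-bit <elf.h> may yield -2147483648.
++        # The special case below accepts exactly that pairing:
++        #
++        #   >>> (1 << 31, -(1 << 31))
++        #   (2147483648, -2147483648)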
++ if (glibcelf_value != elf_h_value ++ and not (isinstance(glibcelf_constants[name], enum.IntFlag) ++ and glibcelf_value == 1 << 31 ++ and elf_h_value == -(1 << 31))): ++ error('{}: glibcelf has {!r}, has {!r}'.format( ++ name, glibcelf_value, elf_h_value)) ++ ++def main(): ++ """The main entry point.""" ++ parser = argparse.ArgumentParser( ++ description="Check glibcelf.py and elf.h against each other.") ++ parser.add_argument('--cc', metavar='CC', ++ help='C compiler (including options) to use') ++ args = parser.parse_args() ++ ++ check_duplicates() ++ check_constant_prefixes() ++ check_constant_values(cc=args.cc) ++ ++ if errors_encountered > 0: ++ print("note: errors encountered:", errors_encountered) ++ sys.exit(1) ++ ++if __name__ == '__main__': ++ main() +diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py +new file mode 100644 +index 0000000000000000..8f7d0ca184845714 +--- /dev/null ++++ b/scripts/glibcelf.py +@@ -0,0 +1,1135 @@ ++#!/usr/bin/python3 ++# ELF support functionality for Python. ++# Copyright (C) 2022 Free Software Foundation, Inc. ++# This file is part of the GNU C Library. ++# ++# The GNU C Library is free software; you can redistribute it and/or ++# modify it under the terms of the GNU Lesser General Public ++# License as published by the Free Software Foundation; either ++# version 2.1 of the License, or (at your option) any later version. ++# ++# The GNU C Library is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# Lesser General Public License for more details. ++# ++# You should have received a copy of the GNU Lesser General Public ++# License along with the GNU C Library; if not, see ++# . ++ ++"""Basic ELF parser. ++ ++Use Image.readfile(path) to read an ELF file into memory and begin ++parsing it. ++ ++""" ++ ++import collections ++import enum ++import struct ++ ++class _OpenIntEnum(enum.IntEnum): ++ """Integer enumeration that supports arbitrary int values.""" ++ @classmethod ++ def _missing_(cls, value): ++ # See enum.IntFlag._create_pseudo_member_. This allows ++ # creating of enum constants with arbitrary integer values. ++ pseudo_member = int.__new__(cls, value) ++ pseudo_member._name_ = None ++ pseudo_member._value_ = value ++ return pseudo_member ++ ++ def __repr__(self): ++ name = self._name_ ++ if name is not None: ++ # The names have prefixes like SHT_, implying their type. ++ return name ++ return '{}({})'.format(self.__class__.__name__, self._value_) ++ ++ def __str__(self): ++ name = self._name_ ++ if name is not None: ++ return name ++ return str(self._value_) ++ ++class ElfClass(_OpenIntEnum): ++ """ELF word size. Type of EI_CLASS values.""" ++ ELFCLASSNONE = 0 ++ ELFCLASS32 = 1 ++ ELFCLASS64 = 2 ++ ++class ElfData(_OpenIntEnum): ++ """ELF endianess. Type of EI_DATA values.""" ++ ELFDATANONE = 0 ++ ELFDATA2LSB = 1 ++ ELFDATA2MSB = 2 ++ ++class Machine(_OpenIntEnum): ++ """ELF machine type. 
Type of values in Ehdr.e_machine field.""" ++ EM_NONE = 0 ++ EM_M32 = 1 ++ EM_SPARC = 2 ++ EM_386 = 3 ++ EM_68K = 4 ++ EM_88K = 5 ++ EM_IAMCU = 6 ++ EM_860 = 7 ++ EM_MIPS = 8 ++ EM_S370 = 9 ++ EM_MIPS_RS3_LE = 10 ++ EM_PARISC = 15 ++ EM_VPP500 = 17 ++ EM_SPARC32PLUS = 18 ++ EM_960 = 19 ++ EM_PPC = 20 ++ EM_PPC64 = 21 ++ EM_S390 = 22 ++ EM_SPU = 23 ++ EM_V800 = 36 ++ EM_FR20 = 37 ++ EM_RH32 = 38 ++ EM_RCE = 39 ++ EM_ARM = 40 ++ EM_FAKE_ALPHA = 41 ++ EM_SH = 42 ++ EM_SPARCV9 = 43 ++ EM_TRICORE = 44 ++ EM_ARC = 45 ++ EM_H8_300 = 46 ++ EM_H8_300H = 47 ++ EM_H8S = 48 ++ EM_H8_500 = 49 ++ EM_IA_64 = 50 ++ EM_MIPS_X = 51 ++ EM_COLDFIRE = 52 ++ EM_68HC12 = 53 ++ EM_MMA = 54 ++ EM_PCP = 55 ++ EM_NCPU = 56 ++ EM_NDR1 = 57 ++ EM_STARCORE = 58 ++ EM_ME16 = 59 ++ EM_ST100 = 60 ++ EM_TINYJ = 61 ++ EM_X86_64 = 62 ++ EM_PDSP = 63 ++ EM_PDP10 = 64 ++ EM_PDP11 = 65 ++ EM_FX66 = 66 ++ EM_ST9PLUS = 67 ++ EM_ST7 = 68 ++ EM_68HC16 = 69 ++ EM_68HC11 = 70 ++ EM_68HC08 = 71 ++ EM_68HC05 = 72 ++ EM_SVX = 73 ++ EM_ST19 = 74 ++ EM_VAX = 75 ++ EM_CRIS = 76 ++ EM_JAVELIN = 77 ++ EM_FIREPATH = 78 ++ EM_ZSP = 79 ++ EM_MMIX = 80 ++ EM_HUANY = 81 ++ EM_PRISM = 82 ++ EM_AVR = 83 ++ EM_FR30 = 84 ++ EM_D10V = 85 ++ EM_D30V = 86 ++ EM_V850 = 87 ++ EM_M32R = 88 ++ EM_MN10300 = 89 ++ EM_MN10200 = 90 ++ EM_PJ = 91 ++ EM_OPENRISC = 92 ++ EM_ARC_COMPACT = 93 ++ EM_XTENSA = 94 ++ EM_VIDEOCORE = 95 ++ EM_TMM_GPP = 96 ++ EM_NS32K = 97 ++ EM_TPC = 98 ++ EM_SNP1K = 99 ++ EM_ST200 = 100 ++ EM_IP2K = 101 ++ EM_MAX = 102 ++ EM_CR = 103 ++ EM_F2MC16 = 104 ++ EM_MSP430 = 105 ++ EM_BLACKFIN = 106 ++ EM_SE_C33 = 107 ++ EM_SEP = 108 ++ EM_ARCA = 109 ++ EM_UNICORE = 110 ++ EM_EXCESS = 111 ++ EM_DXP = 112 ++ EM_ALTERA_NIOS2 = 113 ++ EM_CRX = 114 ++ EM_XGATE = 115 ++ EM_C166 = 116 ++ EM_M16C = 117 ++ EM_DSPIC30F = 118 ++ EM_CE = 119 ++ EM_M32C = 120 ++ EM_TSK3000 = 131 ++ EM_RS08 = 132 ++ EM_SHARC = 133 ++ EM_ECOG2 = 134 ++ EM_SCORE7 = 135 ++ EM_DSP24 = 136 ++ EM_VIDEOCORE3 = 137 ++ EM_LATTICEMICO32 = 138 ++ EM_SE_C17 = 139 ++ EM_TI_C6000 = 140 ++ EM_TI_C2000 = 141 ++ EM_TI_C5500 = 142 ++ EM_TI_ARP32 = 143 ++ EM_TI_PRU = 144 ++ EM_MMDSP_PLUS = 160 ++ EM_CYPRESS_M8C = 161 ++ EM_R32C = 162 ++ EM_TRIMEDIA = 163 ++ EM_QDSP6 = 164 ++ EM_8051 = 165 ++ EM_STXP7X = 166 ++ EM_NDS32 = 167 ++ EM_ECOG1X = 168 ++ EM_MAXQ30 = 169 ++ EM_XIMO16 = 170 ++ EM_MANIK = 171 ++ EM_CRAYNV2 = 172 ++ EM_RX = 173 ++ EM_METAG = 174 ++ EM_MCST_ELBRUS = 175 ++ EM_ECOG16 = 176 ++ EM_CR16 = 177 ++ EM_ETPU = 178 ++ EM_SLE9X = 179 ++ EM_L10M = 180 ++ EM_K10M = 181 ++ EM_AARCH64 = 183 ++ EM_AVR32 = 185 ++ EM_STM8 = 186 ++ EM_TILE64 = 187 ++ EM_TILEPRO = 188 ++ EM_MICROBLAZE = 189 ++ EM_CUDA = 190 ++ EM_TILEGX = 191 ++ EM_CLOUDSHIELD = 192 ++ EM_COREA_1ST = 193 ++ EM_COREA_2ND = 194 ++ EM_ARCV2 = 195 ++ EM_OPEN8 = 196 ++ EM_RL78 = 197 ++ EM_VIDEOCORE5 = 198 ++ EM_78KOR = 199 ++ EM_56800EX = 200 ++ EM_BA1 = 201 ++ EM_BA2 = 202 ++ EM_XCORE = 203 ++ EM_MCHP_PIC = 204 ++ EM_INTELGT = 205 ++ EM_KM32 = 210 ++ EM_KMX32 = 211 ++ EM_EMX16 = 212 ++ EM_EMX8 = 213 ++ EM_KVARC = 214 ++ EM_CDP = 215 ++ EM_COGE = 216 ++ EM_COOL = 217 ++ EM_NORC = 218 ++ EM_CSR_KALIMBA = 219 ++ EM_Z80 = 220 ++ EM_VISIUM = 221 ++ EM_FT32 = 222 ++ EM_MOXIE = 223 ++ EM_AMDGPU = 224 ++ EM_RISCV = 243 ++ EM_BPF = 247 ++ EM_CSKY = 252 ++ EM_NUM = 253 ++ EM_ALPHA = 0x9026 ++ ++class Et(_OpenIntEnum): ++ """ELF file type. 
Type of ET_* values and the Ehdr.e_type field.""" ++ ET_NONE = 0 ++ ET_REL = 1 ++ ET_EXEC = 2 ++ ET_DYN = 3 ++ ET_CORE = 4 ++ ++class Shn(_OpenIntEnum): ++ """ELF reserved section indices.""" ++ SHN_UNDEF = 0 ++ SHN_BEFORE = 0xff00 ++ SHN_AFTER = 0xff01 ++ SHN_ABS = 0xfff1 ++ SHN_COMMON = 0xfff2 ++ SHN_XINDEX = 0xffff ++ ++class ShnMIPS(enum.Enum): ++ """Supplemental SHN_* constants for EM_MIPS.""" ++ SHN_MIPS_ACOMMON = 0xff00 ++ SHN_MIPS_TEXT = 0xff01 ++ SHN_MIPS_DATA = 0xff02 ++ SHN_MIPS_SCOMMON = 0xff03 ++ SHN_MIPS_SUNDEFINED = 0xff04 ++ ++class ShnPARISC(enum.Enum): ++ """Supplemental SHN_* constants for EM_PARISC.""" ++ SHN_PARISC_ANSI_COMMON = 0xff00 ++ SHN_PARISC_HUGE_COMMON = 0xff01 ++ ++class Sht(_OpenIntEnum): ++ """ELF section types. Type of SHT_* values.""" ++ SHT_NULL = 0 ++ SHT_PROGBITS = 1 ++ SHT_SYMTAB = 2 ++ SHT_STRTAB = 3 ++ SHT_RELA = 4 ++ SHT_HASH = 5 ++ SHT_DYNAMIC = 6 ++ SHT_NOTE = 7 ++ SHT_NOBITS = 8 ++ SHT_REL = 9 ++ SHT_SHLIB = 10 ++ SHT_DYNSYM = 11 ++ SHT_INIT_ARRAY = 14 ++ SHT_FINI_ARRAY = 15 ++ SHT_PREINIT_ARRAY = 16 ++ SHT_GROUP = 17 ++ SHT_SYMTAB_SHNDX = 18 ++ SHT_GNU_ATTRIBUTES = 0x6ffffff5 ++ SHT_GNU_HASH = 0x6ffffff6 ++ SHT_GNU_LIBLIST = 0x6ffffff7 ++ SHT_CHECKSUM = 0x6ffffff8 ++ SHT_SUNW_move = 0x6ffffffa ++ SHT_SUNW_COMDAT = 0x6ffffffb ++ SHT_SUNW_syminfo = 0x6ffffffc ++ SHT_GNU_verdef = 0x6ffffffd ++ SHT_GNU_verneed = 0x6ffffffe ++ SHT_GNU_versym = 0x6fffffff ++ ++class ShtALPHA(enum.Enum): ++ """Supplemental SHT_* constants for EM_ALPHA.""" ++ SHT_ALPHA_DEBUG = 0x70000001 ++ SHT_ALPHA_REGINFO = 0x70000002 ++ ++class ShtARM(enum.Enum): ++ """Supplemental SHT_* constants for EM_ARM.""" ++ SHT_ARM_EXIDX = 0x70000001 ++ SHT_ARM_PREEMPTMAP = 0x70000002 ++ SHT_ARM_ATTRIBUTES = 0x70000003 ++ ++class ShtCSKY(enum.Enum): ++ """Supplemental SHT_* constants for EM_CSKY.""" ++ SHT_CSKY_ATTRIBUTES = 0x70000001 ++ ++class ShtIA_64(enum.Enum): ++ """Supplemental SHT_* constants for EM_IA_64.""" ++ SHT_IA_64_EXT = 0x70000000 ++ SHT_IA_64_UNWIND = 0x70000001 ++ ++class ShtMIPS(enum.Enum): ++ """Supplemental SHT_* constants for EM_MIPS.""" ++ SHT_MIPS_LIBLIST = 0x70000000 ++ SHT_MIPS_MSYM = 0x70000001 ++ SHT_MIPS_CONFLICT = 0x70000002 ++ SHT_MIPS_GPTAB = 0x70000003 ++ SHT_MIPS_UCODE = 0x70000004 ++ SHT_MIPS_DEBUG = 0x70000005 ++ SHT_MIPS_REGINFO = 0x70000006 ++ SHT_MIPS_PACKAGE = 0x70000007 ++ SHT_MIPS_PACKSYM = 0x70000008 ++ SHT_MIPS_RELD = 0x70000009 ++ SHT_MIPS_IFACE = 0x7000000b ++ SHT_MIPS_CONTENT = 0x7000000c ++ SHT_MIPS_OPTIONS = 0x7000000d ++ SHT_MIPS_SHDR = 0x70000010 ++ SHT_MIPS_FDESC = 0x70000011 ++ SHT_MIPS_EXTSYM = 0x70000012 ++ SHT_MIPS_DENSE = 0x70000013 ++ SHT_MIPS_PDESC = 0x70000014 ++ SHT_MIPS_LOCSYM = 0x70000015 ++ SHT_MIPS_AUXSYM = 0x70000016 ++ SHT_MIPS_OPTSYM = 0x70000017 ++ SHT_MIPS_LOCSTR = 0x70000018 ++ SHT_MIPS_LINE = 0x70000019 ++ SHT_MIPS_RFDESC = 0x7000001a ++ SHT_MIPS_DELTASYM = 0x7000001b ++ SHT_MIPS_DELTAINST = 0x7000001c ++ SHT_MIPS_DELTACLASS = 0x7000001d ++ SHT_MIPS_DWARF = 0x7000001e ++ SHT_MIPS_DELTADECL = 0x7000001f ++ SHT_MIPS_SYMBOL_LIB = 0x70000020 ++ SHT_MIPS_EVENTS = 0x70000021 ++ SHT_MIPS_TRANSLATE = 0x70000022 ++ SHT_MIPS_PIXIE = 0x70000023 ++ SHT_MIPS_XLATE = 0x70000024 ++ SHT_MIPS_XLATE_DEBUG = 0x70000025 ++ SHT_MIPS_WHIRL = 0x70000026 ++ SHT_MIPS_EH_REGION = 0x70000027 ++ SHT_MIPS_XLATE_OLD = 0x70000028 ++ SHT_MIPS_PDR_EXCEPTION = 0x70000029 ++ SHT_MIPS_XHASH = 0x7000002b ++ ++class ShtPARISC(enum.Enum): ++ """Supplemental SHT_* constants for EM_PARISC.""" ++ SHT_PARISC_EXT = 0x70000000 ++ SHT_PARISC_UNWIND = 0x70000001 ++ 
SHT_PARISC_DOC = 0x70000002 ++ ++class Pf(enum.IntFlag): ++ """Program header flags. Type of Phdr.p_flags values.""" ++ PF_X = 1 ++ PF_W = 2 ++ PF_R = 4 ++ ++class PfARM(enum.IntFlag): ++ """Supplemental PF_* flags for EM_ARM.""" ++ PF_ARM_SB = 0x10000000 ++ PF_ARM_PI = 0x20000000 ++ PF_ARM_ABS = 0x40000000 ++ ++class PfPARISC(enum.IntFlag): ++ """Supplemental PF_* flags for EM_PARISC.""" ++ PF_HP_PAGE_SIZE = 0x00100000 ++ PF_HP_FAR_SHARED = 0x00200000 ++ PF_HP_NEAR_SHARED = 0x00400000 ++ PF_HP_CODE = 0x01000000 ++ PF_HP_MODIFY = 0x02000000 ++ PF_HP_LAZYSWAP = 0x04000000 ++ PF_HP_SBP = 0x08000000 ++ ++class PfIA_64(enum.IntFlag): ++ """Supplemental PF_* flags for EM_IA_64.""" ++ PF_IA_64_NORECOV = 0x80000000 ++ ++class PfMIPS(enum.IntFlag): ++ """Supplemental PF_* flags for EM_MIPS.""" ++ PF_MIPS_LOCAL = 0x10000000 ++ ++class Shf(enum.IntFlag): ++ """Section flags. Type of Shdr.sh_type values.""" ++ SHF_WRITE = 1 << 0 ++ SHF_ALLOC = 1 << 1 ++ SHF_EXECINSTR = 1 << 2 ++ SHF_MERGE = 1 << 4 ++ SHF_STRINGS = 1 << 5 ++ SHF_INFO_LINK = 1 << 6 ++ SHF_LINK_ORDER = 1 << 7 ++ SHF_OS_NONCONFORMING = 256 ++ SHF_GROUP = 1 << 9 ++ SHF_TLS = 1 << 10 ++ SHF_COMPRESSED = 1 << 11 ++ SHF_GNU_RETAIN = 1 << 21 ++ SHF_ORDERED = 1 << 30 ++ SHF_EXCLUDE = 1 << 31 ++ ++class ShfALPHA(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_ALPHA.""" ++ SHF_ALPHA_GPREL = 0x10000000 ++ ++class ShfARM(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_ARM.""" ++ SHF_ARM_ENTRYSECT = 0x10000000 ++ SHF_ARM_COMDEF = 0x80000000 ++ ++class ShfIA_64(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_IA_64.""" ++ SHF_IA_64_SHORT = 0x10000000 ++ SHF_IA_64_NORECOV = 0x20000000 ++ ++class ShfMIPS(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_MIPS.""" ++ SHF_MIPS_GPREL = 0x10000000 ++ SHF_MIPS_MERGE = 0x20000000 ++ SHF_MIPS_ADDR = 0x40000000 ++ SHF_MIPS_STRINGS = 0x80000000 ++ SHF_MIPS_NOSTRIP = 0x08000000 ++ SHF_MIPS_LOCAL = 0x04000000 ++ SHF_MIPS_NAMES = 0x02000000 ++ SHF_MIPS_NODUPE = 0x01000000 ++ ++class ShfPARISC(enum.IntFlag): ++ """Supplemental SHF_* constants for EM_PARISC.""" ++ SHF_PARISC_SHORT = 0x20000000 ++ SHF_PARISC_HUGE = 0x40000000 ++ SHF_PARISC_SBP = 0x80000000 ++ ++class Stb(_OpenIntEnum): ++ """ELF symbol binding type.""" ++ STB_LOCAL = 0 ++ STB_GLOBAL = 1 ++ STB_WEAK = 2 ++ STB_GNU_UNIQUE = 10 ++ STB_MIPS_SPLIT_COMMON = 13 ++ ++class Stt(_OpenIntEnum): ++ """ELF symbol type.""" ++ STT_NOTYPE = 0 ++ STT_OBJECT = 1 ++ STT_FUNC = 2 ++ STT_SECTION = 3 ++ STT_FILE = 4 ++ STT_COMMON = 5 ++ STT_TLS = 6 ++ STT_GNU_IFUNC = 10 ++ ++class SttARM(enum.Enum): ++ """Supplemental STT_* constants for EM_ARM.""" ++ STT_ARM_TFUNC = 13 ++ STT_ARM_16BIT = 15 ++ ++class SttPARISC(enum.Enum): ++ """Supplemental STT_* constants for EM_PARISC.""" ++ STT_HP_OPAQUE = 11 ++ STT_HP_STUB = 12 ++ STT_PARISC_MILLICODE = 13 ++ ++class SttSPARC(enum.Enum): ++ """Supplemental STT_* constants for EM_SPARC.""" ++ STT_SPARC_REGISTER = 13 ++ ++class SttX86_64(enum.Enum): ++ """Supplemental STT_* constants for EM_X86_64.""" ++ SHT_X86_64_UNWIND = 0x70000001 ++ ++class Pt(_OpenIntEnum): ++ """ELF program header types. 
Type of Phdr.p_type.""" ++ PT_NULL = 0 ++ PT_LOAD = 1 ++ PT_DYNAMIC = 2 ++ PT_INTERP = 3 ++ PT_NOTE = 4 ++ PT_SHLIB = 5 ++ PT_PHDR = 6 ++ PT_TLS = 7 ++ PT_NUM = 8 ++ PT_GNU_EH_FRAME = 0x6474e550 ++ PT_GNU_STACK = 0x6474e551 ++ PT_GNU_RELRO = 0x6474e552 ++ PT_GNU_PROPERTY = 0x6474e553 ++ PT_SUNWBSS = 0x6ffffffa ++ PT_SUNWSTACK = 0x6ffffffb ++ ++class PtARM(enum.Enum): ++ """Supplemental PT_* constants for EM_ARM.""" ++ PT_ARM_EXIDX = 0x70000001 ++ ++class PtIA_64(enum.Enum): ++ """Supplemental PT_* constants for EM_IA_64.""" ++ PT_IA_64_HP_OPT_ANOT = 0x60000012 ++ PT_IA_64_HP_HSL_ANOT = 0x60000013 ++ PT_IA_64_HP_STACK = 0x60000014 ++ PT_IA_64_ARCHEXT = 0x70000000 ++ PT_IA_64_UNWIND = 0x70000001 ++ ++class PtMIPS(enum.Enum): ++ """Supplemental PT_* constants for EM_MIPS.""" ++ PT_MIPS_REGINFO = 0x70000000 ++ PT_MIPS_RTPROC = 0x70000001 ++ PT_MIPS_OPTIONS = 0x70000002 ++ PT_MIPS_ABIFLAGS = 0x70000003 ++ ++class PtPARISC(enum.Enum): ++ """Supplemental PT_* constants for EM_PARISC.""" ++ PT_HP_TLS = 0x60000000 ++ PT_HP_CORE_NONE = 0x60000001 ++ PT_HP_CORE_VERSION = 0x60000002 ++ PT_HP_CORE_KERNEL = 0x60000003 ++ PT_HP_CORE_COMM = 0x60000004 ++ PT_HP_CORE_PROC = 0x60000005 ++ PT_HP_CORE_LOADABLE = 0x60000006 ++ PT_HP_CORE_STACK = 0x60000007 ++ PT_HP_CORE_SHM = 0x60000008 ++ PT_HP_CORE_MMF = 0x60000009 ++ PT_HP_PARALLEL = 0x60000010 ++ PT_HP_FASTBIND = 0x60000011 ++ PT_HP_OPT_ANNOT = 0x60000012 ++ PT_HP_HSL_ANNOT = 0x60000013 ++ PT_HP_STACK = 0x60000014 ++ PT_PARISC_ARCHEXT = 0x70000000 ++ PT_PARISC_UNWIND = 0x70000001 ++ ++class Dt(_OpenIntEnum): ++ """ELF dynamic segment tags. Type of Dyn.d_val.""" ++ DT_NULL = 0 ++ DT_NEEDED = 1 ++ DT_PLTRELSZ = 2 ++ DT_PLTGOT = 3 ++ DT_HASH = 4 ++ DT_STRTAB = 5 ++ DT_SYMTAB = 6 ++ DT_RELA = 7 ++ DT_RELASZ = 8 ++ DT_RELAENT = 9 ++ DT_STRSZ = 10 ++ DT_SYMENT = 11 ++ DT_INIT = 12 ++ DT_FINI = 13 ++ DT_SONAME = 14 ++ DT_RPATH = 15 ++ DT_SYMBOLIC = 16 ++ DT_REL = 17 ++ DT_RELSZ = 18 ++ DT_RELENT = 19 ++ DT_PLTREL = 20 ++ DT_DEBUG = 21 ++ DT_TEXTREL = 22 ++ DT_JMPREL = 23 ++ DT_BIND_NOW = 24 ++ DT_INIT_ARRAY = 25 ++ DT_FINI_ARRAY = 26 ++ DT_INIT_ARRAYSZ = 27 ++ DT_FINI_ARRAYSZ = 28 ++ DT_RUNPATH = 29 ++ DT_FLAGS = 30 ++ DT_PREINIT_ARRAY = 32 ++ DT_PREINIT_ARRAYSZ = 33 ++ DT_SYMTAB_SHNDX = 34 ++ DT_GNU_PRELINKED = 0x6ffffdf5 ++ DT_GNU_CONFLICTSZ = 0x6ffffdf6 ++ DT_GNU_LIBLISTSZ = 0x6ffffdf7 ++ DT_CHECKSUM = 0x6ffffdf8 ++ DT_PLTPADSZ = 0x6ffffdf9 ++ DT_MOVEENT = 0x6ffffdfa ++ DT_MOVESZ = 0x6ffffdfb ++ DT_FEATURE_1 = 0x6ffffdfc ++ DT_POSFLAG_1 = 0x6ffffdfd ++ DT_SYMINSZ = 0x6ffffdfe ++ DT_SYMINENT = 0x6ffffdff ++ DT_GNU_HASH = 0x6ffffef5 ++ DT_TLSDESC_PLT = 0x6ffffef6 ++ DT_TLSDESC_GOT = 0x6ffffef7 ++ DT_GNU_CONFLICT = 0x6ffffef8 ++ DT_GNU_LIBLIST = 0x6ffffef9 ++ DT_CONFIG = 0x6ffffefa ++ DT_DEPAUDIT = 0x6ffffefb ++ DT_AUDIT = 0x6ffffefc ++ DT_PLTPAD = 0x6ffffefd ++ DT_MOVETAB = 0x6ffffefe ++ DT_SYMINFO = 0x6ffffeff ++ DT_VERSYM = 0x6ffffff0 ++ DT_RELACOUNT = 0x6ffffff9 ++ DT_RELCOUNT = 0x6ffffffa ++ DT_FLAGS_1 = 0x6ffffffb ++ DT_VERDEF = 0x6ffffffc ++ DT_VERDEFNUM = 0x6ffffffd ++ DT_VERNEED = 0x6ffffffe ++ DT_VERNEEDNUM = 0x6fffffff ++ DT_AUXILIARY = 0x7ffffffd ++ DT_FILTER = 0x7fffffff ++ ++class DtAARCH64(enum.Enum): ++ """Supplemental DT_* constants for EM_AARCH64.""" ++ DT_AARCH64_BTI_PLT = 0x70000001 ++ DT_AARCH64_PAC_PLT = 0x70000003 ++ DT_AARCH64_VARIANT_PCS = 0x70000005 ++ ++class DtALPHA(enum.Enum): ++ """Supplemental DT_* constants for EM_ALPHA.""" ++ DT_ALPHA_PLTRO = 0x70000000 ++ ++class DtALTERA_NIOS2(enum.Enum): ++ """Supplemental DT_* constants for 
EM_ALTERA_NIOS2.""" ++ DT_NIOS2_GP = 0x70000002 ++ ++class DtIA_64(enum.Enum): ++ """Supplemental DT_* constants for EM_IA_64.""" ++ DT_IA_64_PLT_RESERVE = 0x70000000 ++ ++class DtMIPS(enum.Enum): ++ """Supplemental DT_* constants for EM_MIPS.""" ++ DT_MIPS_RLD_VERSION = 0x70000001 ++ DT_MIPS_TIME_STAMP = 0x70000002 ++ DT_MIPS_ICHECKSUM = 0x70000003 ++ DT_MIPS_IVERSION = 0x70000004 ++ DT_MIPS_FLAGS = 0x70000005 ++ DT_MIPS_BASE_ADDRESS = 0x70000006 ++ DT_MIPS_MSYM = 0x70000007 ++ DT_MIPS_CONFLICT = 0x70000008 ++ DT_MIPS_LIBLIST = 0x70000009 ++ DT_MIPS_LOCAL_GOTNO = 0x7000000a ++ DT_MIPS_CONFLICTNO = 0x7000000b ++ DT_MIPS_LIBLISTNO = 0x70000010 ++ DT_MIPS_SYMTABNO = 0x70000011 ++ DT_MIPS_UNREFEXTNO = 0x70000012 ++ DT_MIPS_GOTSYM = 0x70000013 ++ DT_MIPS_HIPAGENO = 0x70000014 ++ DT_MIPS_RLD_MAP = 0x70000016 ++ DT_MIPS_DELTA_CLASS = 0x70000017 ++ DT_MIPS_DELTA_CLASS_NO = 0x70000018 ++ DT_MIPS_DELTA_INSTANCE = 0x70000019 ++ DT_MIPS_DELTA_INSTANCE_NO = 0x7000001a ++ DT_MIPS_DELTA_RELOC = 0x7000001b ++ DT_MIPS_DELTA_RELOC_NO = 0x7000001c ++ DT_MIPS_DELTA_SYM = 0x7000001d ++ DT_MIPS_DELTA_SYM_NO = 0x7000001e ++ DT_MIPS_DELTA_CLASSSYM = 0x70000020 ++ DT_MIPS_DELTA_CLASSSYM_NO = 0x70000021 ++ DT_MIPS_CXX_FLAGS = 0x70000022 ++ DT_MIPS_PIXIE_INIT = 0x70000023 ++ DT_MIPS_SYMBOL_LIB = 0x70000024 ++ DT_MIPS_LOCALPAGE_GOTIDX = 0x70000025 ++ DT_MIPS_LOCAL_GOTIDX = 0x70000026 ++ DT_MIPS_HIDDEN_GOTIDX = 0x70000027 ++ DT_MIPS_PROTECTED_GOTIDX = 0x70000028 ++ DT_MIPS_OPTIONS = 0x70000029 ++ DT_MIPS_INTERFACE = 0x7000002a ++ DT_MIPS_DYNSTR_ALIGN = 0x7000002b ++ DT_MIPS_INTERFACE_SIZE = 0x7000002c ++ DT_MIPS_RLD_TEXT_RESOLVE_ADDR = 0x7000002d ++ DT_MIPS_PERF_SUFFIX = 0x7000002e ++ DT_MIPS_COMPACT_SIZE = 0x7000002f ++ DT_MIPS_GP_VALUE = 0x70000030 ++ DT_MIPS_AUX_DYNAMIC = 0x70000031 ++ DT_MIPS_PLTGOT = 0x70000032 ++ DT_MIPS_RWPLT = 0x70000034 ++ DT_MIPS_RLD_MAP_REL = 0x70000035 ++ DT_MIPS_XHASH = 0x70000036 ++ ++class DtPPC(enum.Enum): ++ """Supplemental DT_* constants for EM_PPC.""" ++ DT_PPC_GOT = 0x70000000 ++ DT_PPC_OPT = 0x70000001 ++ ++class DtPPC64(enum.Enum): ++ """Supplemental DT_* constants for EM_PPC64.""" ++ DT_PPC64_GLINK = 0x70000000 ++ DT_PPC64_OPD = 0x70000001 ++ DT_PPC64_OPDSZ = 0x70000002 ++ DT_PPC64_OPT = 0x70000003 ++ ++class DtSPARC(enum.Enum): ++ """Supplemental DT_* constants for EM_SPARC.""" ++ DT_SPARC_REGISTER = 0x70000001 ++ ++class StInfo: ++ """ELF symbol binding and type. Type of the Sym.st_info field.""" ++ def __init__(self, arg0, arg1=None): ++ if isinstance(arg0, int) and arg1 is None: ++ self.bind = Stb(arg0 >> 4) ++ self.type = Stt(arg0 & 15) ++ else: ++ self.bind = Stb(arg0) ++ self.type = Stt(arg1) ++ ++ def value(self): ++ """Returns the raw value for the bind/type combination.""" ++ return (self.bind.value() << 4) | (self.type.value()) ++ ++# Type in an ELF file. Used for deserialization. ++_Layout = collections.namedtuple('_Layout', 'unpack size') ++ ++def _define_layouts(baseclass: type, layout32: str, layout64: str, ++ types=None, fields32=None): ++ """Assign variants dict to baseclass. ++ ++ The variants dict is indexed by (ElfClass, ElfData) pairs, and its ++ values are _Layout instances. ++ ++ """ ++ struct32 = struct.Struct(layout32) ++ struct64 = struct.Struct(layout64) ++ ++ # Check that the struct formats yield the right number of components. 
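++    # For example, the 32-bit Dyn layout '2i' is 8 bytes and unpacks
++    # to two components, matching the two Dyn fields (d_tag, d_val):
++    #
++    #   >>> len(struct.Struct('<2i').unpack(b' ' * 8))
++    #   2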
++ for s in (struct32, struct64): ++ example = s.unpack(b' ' * s.size) ++ if len(example) != len(baseclass._fields): ++ raise ValueError('{!r} yields wrong field count: {} != {}'.format( ++ s.format, len(example), len(baseclass._fields))) ++ ++ # Check that field names in types are correct. ++ if types is None: ++ types = () ++ for n in types: ++ if n not in baseclass._fields: ++ raise ValueError('{} does not have field {!r}'.format( ++ baseclass.__name__, n)) ++ ++ if fields32 is not None \ ++ and set(fields32) != set(baseclass._fields): ++ raise ValueError('{!r} is not a permutation of the fields {!r}'.format( ++ fields32, baseclass._fields)) ++ ++ def unique_name(name, used_names = (set((baseclass.__name__,)) ++ | set(baseclass._fields) ++ | {n.__name__ ++ for n in (types or {}).values()})): ++ """Find a name that is not used for a class or field name.""" ++ candidate = name ++ n = 0 ++ while candidate in used_names: ++ n += 1 ++ candidate = '{}{}'.format(name, n) ++ used_names.add(candidate) ++ return candidate ++ ++ blob_name = unique_name('blob') ++ struct_unpack_name = unique_name('struct_unpack') ++ comps_name = unique_name('comps') ++ ++ layouts = {} ++ for (bits, elfclass, layout, fields) in ( ++ (32, ElfClass.ELFCLASS32, layout32, fields32), ++ (64, ElfClass.ELFCLASS64, layout64, None), ++ ): ++ for (elfdata, structprefix, funcsuffix) in ( ++ (ElfData.ELFDATA2LSB, '<', 'LE'), ++ (ElfData.ELFDATA2MSB, '>', 'BE'), ++ ): ++ env = { ++ baseclass.__name__: baseclass, ++ struct_unpack_name: struct.unpack, ++ } ++ ++ # Add the type converters. ++ if types: ++ for cls in types.values(): ++ env[cls.__name__] = cls ++ ++ funcname = ''.join( ++ ('unpack_', baseclass.__name__, str(bits), funcsuffix)) ++ ++ code = ''' ++def {funcname}({blob_name}): ++'''.format(funcname=funcname, blob_name=blob_name) ++ ++ indent = ' ' * 4 ++ unpack_call = '{}({!r}, {})'.format( ++ struct_unpack_name, structprefix + layout, blob_name) ++ field_names = ', '.join(baseclass._fields) ++ if types is None and fields is None: ++ code += '{}return {}({})\n'.format( ++ indent, baseclass.__name__, unpack_call) ++ else: ++ # Destructuring tuple assignment. ++ if fields is None: ++ code += '{}{} = {}\n'.format( ++ indent, field_names, unpack_call) ++ else: ++ # Use custom field order. ++ code += '{}{} = {}\n'.format( ++ indent, ', '.join(fields), unpack_call) ++ ++ # Perform the type conversions. ++ for n in baseclass._fields: ++ if n in types: ++ code += '{}{} = {}({})\n'.format( ++ indent, n, types[n].__name__, n) ++ # Create the named tuple. ++ code += '{}return {}({})\n'.format( ++ indent, baseclass.__name__, field_names) ++ ++ exec(code, env) ++ layouts[(elfclass, elfdata)] = _Layout( ++ env[funcname], struct.calcsize(layout)) ++ baseclass.layouts = layouts ++ ++ ++# Corresponds to EI_* indices into Elf*_Ehdr.e_indent. ++class Ident(collections.namedtuple('Ident', ++ 'ei_mag ei_class ei_data ei_version ei_osabi ei_abiversion ei_pad')): ++ ++ def __new__(cls, *args): ++ """Construct an object from a blob or its constituent fields.""" ++ if len(args) == 1: ++ return cls.unpack(args[0]) ++ return cls.__base__.__new__(cls, *args) ++ ++ @staticmethod ++ def unpack(blob: memoryview) -> 'Ident': ++ """Parse raws data into a tuple.""" ++ ei_mag, ei_class, ei_data, ei_version, ei_osabi, ei_abiversion, \ ++ ei_pad = struct.unpack('4s5B7s', blob) ++ return Ident(ei_mag, ElfClass(ei_class), ElfData(ei_data), ++ ei_version, ei_osabi, ei_abiversion, ei_pad) ++ size = 16 ++ ++# Corresponds to Elf32_Ehdr and Elf64_Ehdr. 
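++# A hedged usage sketch for the layouts attached below (assuming a
++# little-endian ELFCLASS64 header blob in 'data'):
++#
++#   layout = Ehdr.layouts[(ElfClass.ELFCLASS64, ElfData.ELFDATA2LSB)]
++#   ehdr = layout.unpack(data[:layout.size])   # a 64-byte Elf64_Ehdr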
++Ehdr = collections.namedtuple('Ehdr', ++ 'e_ident e_type e_machine e_version e_entry e_phoff e_shoff e_flags' ++ + ' e_ehsize e_phentsize e_phnum e_shentsize e_shnum e_shstrndx') ++_define_layouts(Ehdr, ++ layout32='16s2H5I6H', ++ layout64='16s2HI3QI6H', ++ types=dict(e_ident=Ident, ++ e_machine=Machine, ++ e_type=Et, ++ e_shstrndx=Shn)) ++ ++# Corresponds to Elf32_Phdr and Elf64_Pdhr. Order follows the latter. ++Phdr = collections.namedtuple('Phdr', ++ 'p_type p_flags p_offset p_vaddr p_paddr p_filesz p_memsz p_align') ++_define_layouts(Phdr, ++ layout32='8I', ++ fields32=('p_type', 'p_offset', 'p_vaddr', 'p_paddr', ++ 'p_filesz', 'p_memsz', 'p_flags', 'p_align'), ++ layout64='2I6Q', ++ types=dict(p_type=Pt, p_flags=Pf)) ++ ++ ++# Corresponds to Elf32_Shdr and Elf64_Shdr. ++class Shdr(collections.namedtuple('Shdr', ++ 'sh_name sh_type sh_flags sh_addr sh_offset sh_size sh_link sh_info' ++ + ' sh_addralign sh_entsize')): ++ def resolve(self, strtab: 'StringTable') -> 'Shdr': ++ """Resolve sh_name using a string table.""" ++ return self.__class__(strtab.get(self[0]), *self[1:]) ++_define_layouts(Shdr, ++ layout32='10I', ++ layout64='2I4Q2I2Q', ++ types=dict(sh_type=Sht, ++ sh_flags=Shf, ++ sh_link=Shn)) ++ ++# Corresponds to Elf32_Dyn and Elf64_Dyn. The nesting through the ++# d_un union is skipped, and d_ptr is missing (its representation in ++# Python would be identical to d_val). ++Dyn = collections.namedtuple('Dyn', 'd_tag d_val') ++_define_layouts(Dyn, ++ layout32='2i', ++ layout64='2q', ++ types=dict(d_tag=Dt)) ++ ++# Corresponds to Elf32_Sym and Elf64_Sym. ++class Sym(collections.namedtuple('Sym', ++ 'st_name st_info st_other st_shndx st_value st_size')): ++ def resolve(self, strtab: 'StringTable') -> 'Sym': ++ """Resolve st_name using a string table.""" ++ return self.__class__(strtab.get(self[0]), *self[1:]) ++_define_layouts(Sym, ++ layout32='3I2BH', ++ layout64='I2BH2Q', ++ fields32=('st_name', 'st_value', 'st_size', 'st_info', ++ 'st_other', 'st_shndx'), ++ types=dict(st_shndx=Shn, ++ st_info=StInfo)) ++ ++# Corresponds to Elf32_Rel and Elf64_Rel. ++Rel = collections.namedtuple('Rel', 'r_offset r_info') ++_define_layouts(Rel, ++ layout32='2I', ++ layout64='2Q') ++ ++# Corresponds to Elf32_Rel and Elf64_Rel. ++Rela = collections.namedtuple('Rela', 'r_offset r_info r_addend') ++_define_layouts(Rela, ++ layout32='3I', ++ layout64='3Q') ++ ++class StringTable: ++ """ELF string table.""" ++ def __init__(self, blob): ++ """Create a new string table backed by the data in the blob. ++ ++ blob: a memoryview-like object ++ ++ """ ++ self.blob = blob ++ ++ def get(self, index) -> bytes: ++ """Returns the null-terminated byte string at the index.""" ++ blob = self.blob ++ endindex = index ++ while True: ++ if blob[endindex] == 0: ++ return bytes(blob[index:endindex]) ++ endindex += 1 ++ ++class Image: ++ """ELF image parser.""" ++ def __init__(self, image): ++ """Create an ELF image from binary image data. ++ ++ image: a memoryview-like object that supports efficient range ++ subscripting. ++ ++ """ ++ self.image = image ++ ident = self.read(Ident, 0) ++ classdata = (ident.ei_class, ident.ei_data) ++ # Set self.Ehdr etc. to the subtypes with the right parsers. 
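++        # With the parsers in place, self.read(self.Ehdr, 0) below
++        # decodes the header in the file's own word size and byte
++        # order.  A sketch of typical use (the path is illustrative):
++        #
++        #   img = Image.readfile('/lib64/libc.so.6')
++        #   img.ehdr.e_machine == Machine.EM_X86_64   # on x86-64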
++ for typ in (Ehdr, Phdr, Shdr, Dyn, Sym, Rel, Rela): ++ setattr(self, typ.__name__, typ.layouts.get(classdata, None)) ++ ++ if self.Ehdr is not None: ++ self.ehdr = self.read(self.Ehdr, 0) ++ self._shdr_num = self._compute_shdr_num() ++ else: ++ self.ehdr = None ++ self._shdr_num = 0 ++ ++ self._section = {} ++ self._stringtab = {} ++ ++ if self._shdr_num > 0: ++ self._shdr_strtab = self._find_shdr_strtab() ++ else: ++ self._shdr_strtab = None ++ ++ @staticmethod ++ def readfile(path: str) -> 'Image': ++ """Reads the ELF file at the specified path.""" ++ with open(path, 'rb') as inp: ++ return Image(memoryview(inp.read())) ++ ++ def _compute_shdr_num(self) -> int: ++ """Computes the actual number of section headers.""" ++ shnum = self.ehdr.e_shnum ++ if shnum == 0: ++ if self.ehdr.e_shoff == 0 or self.ehdr.e_shentsize == 0: ++ # No section headers. ++ return 0 ++ # Otherwise the extension mechanism is used (which may be ++ # needed because e_shnum is just 16 bits). ++ return self.read(self.Shdr, self.ehdr.e_shoff).sh_size ++ return shnum ++ ++ def _find_shdr_strtab(self) -> StringTable: ++ """Finds the section header string table (maybe via extensions).""" ++ shstrndx = self.ehdr.e_shstrndx ++ if shstrndx == Shn.SHN_XINDEX: ++ shstrndx = self.read(self.Shdr, self.ehdr.e_shoff).sh_link ++ return self._find_stringtab(shstrndx) ++ ++ def read(self, typ: type, offset:int ): ++ """Reads an object at a specific offset. ++ ++ The type must have been enhanced using _define_variants. ++ ++ """ ++ return typ.unpack(self.image[offset: offset + typ.size]) ++ ++ def phdrs(self) -> Phdr: ++ """Generator iterating over the program headers.""" ++ if self.ehdr is None: ++ return ++ size = self.ehdr.e_phentsize ++ if size != self.Phdr.size: ++ raise ValueError('Unexpected Phdr size in ELF header: {} != {}' ++ .format(size, self.Phdr.size)) ++ ++ offset = self.ehdr.e_phoff ++ for _ in range(self.ehdr.e_phnum): ++ yield self.read(self.Phdr, offset) ++ offset += size ++ ++ def shdrs(self, resolve: bool=True) -> Shdr: ++ """Generator iterating over the section headers. ++ ++ If resolve, section names are automatically translated ++ using the section header string table. ++ ++ """ ++ if self._shdr_num == 0: ++ return ++ ++ size = self.ehdr.e_shentsize ++ if size != self.Shdr.size: ++ raise ValueError('Unexpected Shdr size in ELF header: {} != {}' ++ .format(size, self.Shdr.size)) ++ ++ offset = self.ehdr.e_shoff ++ for _ in range(self._shdr_num): ++ shdr = self.read(self.Shdr, offset) ++ if resolve: ++ shdr = shdr.resolve(self._shdr_strtab) ++ yield shdr ++ offset += size ++ ++ def dynamic(self) -> Dyn: ++ """Generator iterating over the dynamic segment.""" ++ for phdr in self.phdrs(): ++ if phdr.p_type == Pt.PT_DYNAMIC: ++ # Pick the first dynamic segment, like the loader. ++ if phdr.p_filesz == 0: ++ # Probably separated debuginfo. ++ return ++ offset = phdr.p_offset ++ end = offset + phdr.p_memsz ++ size = self.Dyn.size ++ while True: ++ next_offset = offset + size ++ if next_offset > end: ++ raise ValueError( ++ 'Dynamic segment size {} is not a multiple of Dyn size {}'.format( ++ phdr.p_memsz, size)) ++ yield self.read(self.Dyn, offset) ++ if next_offset == end: ++ return ++ offset = next_offset ++ ++ def syms(self, shdr: Shdr, resolve: bool=True) -> Sym: ++ """A generator iterating over a symbol table. ++ ++ If resolve, symbol names are automatically translated using ++ the string table for the symbol table. 
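++
++        A sketch of typical use, assuming img is an Image instance:
++
++            for shdr in img.shdrs():
++                if shdr.sh_type == Sht.SHT_SYMTAB:
++                    for sym in img.syms(shdr):
++                        print(sym.st_name)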
++ ++ """ ++ assert shdr.sh_type == Sht.SHT_SYMTAB ++ size = shdr.sh_entsize ++ if size != self.Sym.size: ++ raise ValueError('Invalid symbol table entry size {}'.format(size)) ++ offset = shdr.sh_offset ++ end = shdr.sh_offset + shdr.sh_size ++ if resolve: ++ strtab = self._find_stringtab(shdr.sh_link) ++ while offset < end: ++ sym = self.read(self.Sym, offset) ++ if resolve: ++ sym = sym.resolve(strtab) ++ yield sym ++ offset += size ++ if offset != end: ++ raise ValueError('Symbol table is not a multiple of entry size') ++ ++ def lookup_string(self, strtab_index: int, strtab_offset: int) -> bytes: ++ """Looks up a string in a string table identified by its link index.""" ++ try: ++ strtab = self._stringtab[strtab_index] ++ except KeyError: ++ strtab = self._find_stringtab(strtab_index) ++ return strtab.get(strtab_offset) ++ ++ def find_section(self, shndx: Shn) -> Shdr: ++ """Returns the section header for the indexed section. ++ ++ The section name is not resolved. ++ """ ++ try: ++ return self._section[shndx] ++ except KeyError: ++ pass ++ if shndx in Shn: ++ raise ValueError('Reserved section index {}'.format(shndx)) ++ idx = shndx.value ++ if idx < 0 or idx > self._shdr_num: ++ raise ValueError('Section index {} out of range [0, {})'.format( ++ idx, self._shdr_num)) ++ shdr = self.read( ++ self.Shdr, self.ehdr.e_shoff + idx * self.Shdr.size) ++ self._section[shndx] = shdr ++ return shdr ++ ++ def _find_stringtab(self, sh_link: int) -> StringTable: ++ if sh_link in self._stringtab: ++ return self._stringtab ++ if sh_link < 0 or sh_link >= self._shdr_num: ++ raise ValueError('Section index {} out of range [0, {})'.format( ++ sh_link, self._shdr_num)) ++ shdr = self.read( ++ self.Shdr, self.ehdr.e_shoff + sh_link * self.Shdr.size) ++ if shdr.sh_type != Sht.SHT_STRTAB: ++ raise ValueError( ++ 'Section {} is not a string table: {}'.format( ++ sh_link, shdr.sh_type)) ++ strtab = StringTable( ++ self.image[shdr.sh_offset:shdr.sh_offset + shdr.sh_size]) ++ # This could retrain essentially arbitrary amounts of data, ++ # but caching string tables seems important for performance. ++ self._stringtab[sh_link] = strtab ++ return strtab ++ ++ ++__all__ = [name for name in dir() if name[0].isupper()] diff --git a/glibc-upstream-2.34-168.patch b/glibc-upstream-2.34-168.patch new file mode 100644 index 0000000..49e07b7 --- /dev/null +++ b/glibc-upstream-2.34-168.patch @@ -0,0 +1,407 @@ +commit f0c71b34f96c816292c49122d50da3a511b67bf2 +Author: Florian Weimer +Date: Mon Apr 11 11:30:31 2022 +0200 + + Default to --with-default-link=no (bug 25812) + + This is necessary to place the libio vtables into the RELRO segment. + New tests elf/tst-relro-ldso and elf/tst-relro-libc are added to + verify that this is what actually happens. + + The new tests fail on ia64 due to lack of (default) RELRO support + inbutils, so they are XFAILed there. + + (cherry picked from commit 198abcbb94618730dae1b3f4393efaa49e0ec8c7) + +diff --git a/INSTALL b/INSTALL +index d8d4e9f155f56616..60d01568d77645c7 100644 +--- a/INSTALL ++++ b/INSTALL +@@ -90,6 +90,12 @@ if 'CFLAGS' is specified it must enable optimization. For example: + library will still be usable, but functionality may be lost--for + example, you can't build a shared libc with old binutils. + ++'--with-default-link=FLAG' ++ With '--with-default-link=yes', the build system does not use a ++ custom linker script for linking shared objects. The default for ++ FLAG is the opposite, 'no', because the custom linker script is ++ needed for full RELRO protection. 
++ + '--with-nonshared-cflags=CFLAGS' + Use additional compiler flags CFLAGS to build the parts of the + library which are always statically linked into applications and +diff --git a/configure b/configure +index 03f4e59e754b5463..34c64f8de44e3086 100755 +--- a/configure ++++ b/configure +@@ -3373,7 +3373,7 @@ fi + if test "${with_default_link+set}" = set; then : + withval=$with_default_link; use_default_link=$withval + else +- use_default_link=default ++ use_default_link=no + fi + + +@@ -6085,69 +6085,6 @@ fi + $as_echo "$libc_cv_hashstyle" >&6; } + + +-# The linker's default -shared behavior is good enough if it +-# does these things that our custom linker scripts ensure that +-# all allocated NOTE sections come first. +-if test "$use_default_link" = default; then +- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sufficient default -shared layout" >&5 +-$as_echo_n "checking for sufficient default -shared layout... " >&6; } +-if ${libc_cv_use_default_link+:} false; then : +- $as_echo_n "(cached) " >&6 +-else +- libc_cv_use_default_link=no +- cat > conftest.s <<\EOF +- .section .note.a,"a",%note +- .balign 4 +- .long 4,4,9 +- .string "GNU" +- .string "foo" +- .section .note.b,"a",%note +- .balign 4 +- .long 4,4,9 +- .string "GNU" +- .string "bar" +-EOF +- if { ac_try=' ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&5' +- { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 +- (eval $ac_try) 2>&5 +- ac_status=$? +- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 +- test $ac_status = 0; }; } && +- ac_try=`$READELF -S conftest.so | sed -n \ +- '${x;p;} +- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/ +- t a +- b +- : a +- H'` +- then +- libc_seen_a=no libc_seen_b=no +- set -- $ac_try +- while test $# -ge 2 -a "$1" = NOTE; do +- case "$2" in +- .note.a) libc_seen_a=yes ;; +- .note.b) libc_seen_b=yes ;; +- esac +- shift 2 +- done +- case "$libc_seen_a$libc_seen_b" in +- yesyes) +- libc_cv_use_default_link=yes +- ;; +- *) +- echo >&5 "\ +-$libc_seen_a$libc_seen_b from: +-$ac_try" +- ;; +- esac +- fi +- rm -f conftest* +-fi +-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_use_default_link" >&5 +-$as_echo "$libc_cv_use_default_link" >&6; } +- use_default_link=$libc_cv_use_default_link +-fi +- + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GLOB_DAT reloc" >&5 + $as_echo_n "checking for GLOB_DAT reloc... " >&6; } + if ${libc_cv_has_glob_dat+:} false; then : +diff --git a/configure.ac b/configure.ac +index eb9431875fae1b0e..2c69af0807266e7e 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -153,7 +153,7 @@ AC_ARG_WITH([default-link], + AS_HELP_STRING([--with-default-link], + [do not use explicit linker scripts]), + [use_default_link=$withval], +- [use_default_link=default]) ++ [use_default_link=no]) + + dnl Additional build flags injection. + AC_ARG_WITH([nonshared-cflags], +@@ -1378,59 +1378,6 @@ fi + rm -f conftest*]) + AC_SUBST(libc_cv_hashstyle) + +-# The linker's default -shared behavior is good enough if it +-# does these things that our custom linker scripts ensure that +-# all allocated NOTE sections come first. 
+-if test "$use_default_link" = default; then +- AC_CACHE_CHECK([for sufficient default -shared layout], +- libc_cv_use_default_link, [dnl +- libc_cv_use_default_link=no +- cat > conftest.s <<\EOF +- .section .note.a,"a",%note +- .balign 4 +- .long 4,4,9 +- .string "GNU" +- .string "foo" +- .section .note.b,"a",%note +- .balign 4 +- .long 4,4,9 +- .string "GNU" +- .string "bar" +-EOF +- if AC_TRY_COMMAND([dnl +- ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD]) && +- ac_try=`$READELF -S conftest.so | sed -n \ +- ['${x;p;} +- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/ +- t a +- b +- : a +- H']` +- then +- libc_seen_a=no libc_seen_b=no +- set -- $ac_try +- while test $# -ge 2 -a "$1" = NOTE; do +- case "$2" in +- .note.a) libc_seen_a=yes ;; +- .note.b) libc_seen_b=yes ;; +- esac +- shift 2 +- done +- case "$libc_seen_a$libc_seen_b" in +- yesyes) +- libc_cv_use_default_link=yes +- ;; +- *) +- echo >&AS_MESSAGE_LOG_FD "\ +-$libc_seen_a$libc_seen_b from: +-$ac_try" +- ;; +- esac +- fi +- rm -f conftest*]) +- use_default_link=$libc_cv_use_default_link +-fi +- + AC_CACHE_CHECK(for GLOB_DAT reloc, + libc_cv_has_glob_dat, [dnl + cat > conftest.c < $@ 2>&1; $(evaluate-test) ++# The optional symbols are present in libc only if the architecture has ++# the GLIBC_2.0 symbol set in libc. ++$(objpfx)tst-relro-libc.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \ ++ $(common-objpfx)libc.so ++ $(PYTHON) tst-relro-symbols.py $(common-objpfx)libc.so \ ++ --required=_IO_cookie_jumps \ ++ --required=_IO_file_jumps \ ++ --required=_IO_file_jumps_maybe_mmap \ ++ --required=_IO_file_jumps_mmap \ ++ --required=_IO_helper_jumps \ ++ --required=_IO_mem_jumps \ ++ --required=_IO_obstack_jumps \ ++ --required=_IO_proc_jumps \ ++ --required=_IO_str_chk_jumps \ ++ --required=_IO_str_jumps \ ++ --required=_IO_strn_jumps \ ++ --required=_IO_wfile_jumps \ ++ --required=_IO_wfile_jumps_maybe_mmap \ ++ --required=_IO_wfile_jumps_mmap \ ++ --required=_IO_wmem_jumps \ ++ --required=_IO_wstr_jumps \ ++ --required=_IO_wstrn_jumps \ ++ --optional=_IO_old_cookie_jumps \ ++ --optional=_IO_old_file_jumps \ ++ --optional=_IO_old_proc_jumps \ ++ > $@ 2>&1; $(evaluate-test) ++ + tests += $(tests-execstack-$(have-z-execstack)) + ifeq ($(run-built-tests),yes) + tests-special += \ +diff --git a/elf/tst-relro-symbols.py b/elf/tst-relro-symbols.py +new file mode 100644 +index 0000000000000000..368ea3349f86bd81 +--- /dev/null ++++ b/elf/tst-relro-symbols.py +@@ -0,0 +1,137 @@ ++#!/usr/bin/python3 ++# Verify that certain symbols are covered by RELRO. ++# Copyright (C) 2022 Free Software Foundation, Inc. ++# This file is part of the GNU C Library. ++# ++# The GNU C Library is free software; you can redistribute it and/or ++# modify it under the terms of the GNU Lesser General Public ++# License as published by the Free Software Foundation; either ++# version 2.1 of the License, or (at your option) any later version. ++# ++# The GNU C Library is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# Lesser General Public License for more details. ++# ++# You should have received a copy of the GNU Lesser General Public ++# License along with the GNU C Library; if not, see ++# . ++ ++"""Analyze a (shared) object to verify that certain symbols are ++present and covered by the PT_GNU_RELRO segment. 
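++
++A typical invocation, mirroring how elf/Makefile runs this script:
++
++    python3 tst-relro-symbols.py libc.so --required=_IO_file_jumps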
++ ++""" ++ ++import argparse ++import os.path ++import sys ++ ++# Make available glibc Python modules. ++sys.path.append(os.path.join( ++ os.path.dirname(os.path.realpath(__file__)), os.path.pardir, 'scripts')) ++ ++import glibcelf ++ ++def find_relro(path: str, img: glibcelf.Image) -> (int, int): ++ """Discover the address range of the PT_GNU_RELRO segment.""" ++ for phdr in img.phdrs(): ++ if phdr.p_type == glibcelf.Pt.PT_GNU_RELRO: ++ # The computation is not entirely accurate because ++ # _dl_protect_relro in elf/dl-reloc.c rounds both the ++ # start end and downwards using the run-time page size. ++ return phdr.p_vaddr, phdr.p_vaddr + phdr.p_memsz ++ sys.stdout.write('{}: error: no PT_GNU_RELRO segment\n'.format(path)) ++ sys.exit(1) ++ ++def check_in_relro(kind, relro_begin, relro_end, name, start, size, error): ++ """Check if a section or symbol falls within in the RELRO segment.""" ++ end = start + size - 1 ++ if not (relro_begin <= start < end < relro_end): ++ error( ++ '{} {!r} of size {} at 0x{:x} is not in RELRO range [0x{:x}, 0x{:x})'.format( ++ kind, name.decode('UTF-8'), start, size, ++ relro_begin, relro_end)) ++ ++def get_parser(): ++ """Return an argument parser for this script.""" ++ parser = argparse.ArgumentParser(description=__doc__) ++ parser.add_argument('object', help='path to object file to check') ++ parser.add_argument('--required', metavar='NAME', default=(), ++ help='required symbol names', nargs='*') ++ parser.add_argument('--optional', metavar='NAME', default=(), ++ help='required symbol names', nargs='*') ++ return parser ++ ++def main(argv): ++ """The main entry point.""" ++ parser = get_parser() ++ opts = parser.parse_args(argv) ++ img = glibcelf.Image.readfile(opts.object) ++ ++ required_symbols = frozenset([sym.encode('UTF-8') ++ for sym in opts.required]) ++ optional_symbols = frozenset([sym.encode('UTF-8') ++ for sym in opts.optional]) ++ check_symbols = required_symbols | optional_symbols ++ ++ # Tracks the symbols in check_symbols that have been found. ++ symbols_found = set() ++ ++ # Discover the extent of the RELRO segment. ++ relro_begin, relro_end = find_relro(opts.object, img) ++ symbol_table_found = False ++ ++ errors = False ++ def error(msg: str) -> None: ++ """Record an error condition and write a message to standard output.""" ++ nonlocal errors ++ errors = True ++ sys.stdout.write('{}: error: {}\n'.format(opts.object, msg)) ++ ++ # Iterate over section headers to find the symbol table. ++ for shdr in img.shdrs(): ++ if shdr.sh_type == glibcelf.Sht.SHT_SYMTAB: ++ symbol_table_found = True ++ for sym in img.syms(shdr): ++ if sym.st_name in check_symbols: ++ symbols_found.add(sym.st_name) ++ ++ # Validate symbol type, section, and size. 
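++                    # st_info packs binding and type into a single
++                    # byte; glibcelf.StInfo splits it, e.g.:
++                    #
++                    #   >>> glibcelf.StInfo(0x11).type
++                    #   STT_OBJECT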
++ if sym.st_info.type != glibcelf.Stt.STT_OBJECT: ++ error('symbol {!r} has wrong type {}'.format( ++ sym.st_name.decode('UTF-8'), sym.st_info.type)) ++ if sym.st_shndx in glibcelf.Shn: ++ error('symbol {!r} has reserved section {}'.format( ++ sym.st_name.decode('UTF-8'), sym.st_shndx)) ++ continue ++ if sym.st_size == 0: ++ error('symbol {!r} has size zero'.format( ++ sym.st_name.decode('UTF-8'))) ++ continue ++ ++ check_in_relro('symbol', relro_begin, relro_end, ++ sym.st_name, sym.st_value, sym.st_size, ++ error) ++ continue # SHT_SYMTAB ++ if shdr.sh_name == b'.data.rel.ro' \ ++ or shdr.sh_name.startswith(b'.data.rel.ro.'): ++ check_in_relro('section', relro_begin, relro_end, ++ shdr.sh_name, shdr.sh_addr, shdr.sh_size, ++ error) ++ continue ++ ++ if required_symbols - symbols_found: ++ for sym in sorted(required_symbols - symbols_found): ++ error('symbol {!r} not found'.format(sym.decode('UTF-8'))) ++ ++ if errors: ++ sys.exit(1) ++ ++ if not symbol_table_found: ++ sys.stdout.write( ++ '{}: warning: no symbol table found (stripped object)\n'.format( ++ opts.object)) ++ sys.exit(77) ++ ++if __name__ == '__main__': ++ main(sys.argv[1:]) +diff --git a/manual/install.texi b/manual/install.texi +index 816b77a0a25a88a7..36a5af62bc5722b0 100644 +--- a/manual/install.texi ++++ b/manual/install.texi +@@ -117,6 +117,12 @@ problem and suppress these constructs, so that the library will still be + usable, but functionality may be lost---for example, you can't build a + shared libc with old binutils. + ++@item --with-default-link=@var{FLAG} ++With @code{--with-default-link=yes}, the build system does not use a ++custom linker script for linking shared objects. The default for ++@var{FLAG} is the opposite, @samp{no}, because the custom linker script ++is needed for full RELRO protection. ++ + @item --with-nonshared-cflags=@var{cflags} + Use additional compiler flags @var{cflags} to build the parts of the + library which are always statically linked into applications and +diff --git a/sysdeps/unix/sysv/linux/ia64/Makefile b/sysdeps/unix/sysv/linux/ia64/Makefile +index da85ba43e2d0ddef..c5cc41b3677d4a2a 100644 +--- a/sysdeps/unix/sysv/linux/ia64/Makefile ++++ b/sysdeps/unix/sysv/linux/ia64/Makefile +@@ -1,3 +1,9 @@ ++ifeq ($(subdir),elf) ++# ia64 does not support PT_GNU_RELRO. ++test-xfail-tst-relro-ldso = yes ++test-xfail-tst-relro-libc = yes ++endif ++ + ifeq ($(subdir),misc) + sysdep_headers += sys/rse.h + endif diff --git a/glibc-upstream-2.34-169.patch b/glibc-upstream-2.34-169.patch new file mode 100644 index 0000000..63cb452 --- /dev/null +++ b/glibc-upstream-2.34-169.patch @@ -0,0 +1,87 @@ +commit ca0faa140ff8cebe4c041d935f0f5eb480873d99 +Author: Joan Bruguera +Date: Mon Apr 11 19:49:56 2022 +0200 + + misc: Fix rare fortify crash on wchar funcs. [BZ 29030] + + If `__glibc_objsize (__o) == (size_t) -1` (i.e. `__o` is unknown size), fortify + checks should pass, and `__whatever_alias` should be called. + + Previously, `__glibc_objsize (__o) == (size_t) -1` was explicitly checked, but + on commit a643f60c53876b, this was moved into `__glibc_safe_or_unknown_len`. + + A comment says the -1 case should work as: "The -1 check is redundant because + since it implies that __glibc_safe_len_cond is true.". But this fails when: + * `__s > 1` + * `__osz == -1` (i.e. 
unknown size at compile time)
+    * `__l` is big enough
+    * `__l * __s <= __osz` can be folded to a constant
+      (I only found this to be true for `mbsrtowcs` and other functions in wchar2.h)
+
+    In this case `__l * __s <= __osz` is false, and `__whatever_chk_warn` will be
+    called by `__glibc_fortify` or `__glibc_fortify_n` and crash the program.
+
+    This commit adds the explicit `__osz == -1` check again.
+    moc crashes on startup due to this, see: https://bugs.archlinux.org/task/74041
+
+    Minimal test case (test.c):
+    #include <wchar.h>
+
+    int main (void)
+    {
+        const char *hw = "HelloWorld";
+        mbsrtowcs (NULL, &hw, (size_t)-1, NULL);
+        return 0;
+    }
+
+    Build with:
+    gcc -O2 -Wp,-D_FORTIFY_SOURCE=2 test.c -o test && ./test
+
+    Output:
+    *** buffer overflow detected ***: terminated
+
+    Fixes: BZ #29030
+    Signed-off-by: Joan Bruguera
+    Signed-off-by: Siddhesh Poyarekar
+    (cherry picked from commit 33e03f9cd2be4f2cd62f93fda539cc07d9c8130e)
+
+diff --git a/debug/tst-fortify.c b/debug/tst-fortify.c
+index 8b5902423cf0ad88..fb02452f5993c594 100644
+--- a/debug/tst-fortify.c
++++ b/debug/tst-fortify.c
+@@ -1505,6 +1505,11 @@ do_test (void)
+       CHK_FAIL_END
+ #endif
+
++  /* Bug 29030 regresion check */
++  cp = "HelloWorld";
++  if (mbsrtowcs (NULL, &cp, (size_t)-1, &s) != 10)
++    FAIL ();
++
+   cp = "A";
+   if (mbstowcs (wenough, cp, 10) != 1
+       || wcscmp (wenough, L"A") != 0)
+diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
+index 515fb681a0547217..b36013b9a6b4d9c3 100644
+--- a/misc/sys/cdefs.h
++++ b/misc/sys/cdefs.h
+@@ -161,13 +161,13 @@
+      || (__builtin_constant_p (__l) && (__l) > 0))
+
+ /* Length is known to be safe at compile time if the __L * __S <= __OBJSZ
+-   condition can be folded to a constant and if it is true.  The -1 check is
+-   redundant because since it implies that __glibc_safe_len_cond is true.  */
++   condition can be folded to a constant and if it is true, or unknown (-1) */
+ #define __glibc_safe_or_unknown_len(__l, __s, __osz) \
+-  (__glibc_unsigned_or_positive (__l) \
+-   && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
+-                                                   __s, __osz)) \
+-   && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), __s, __osz))
++  ((__osz) == (__SIZE_TYPE__) -1 \
++   || (__glibc_unsigned_or_positive (__l) \
++       && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
++                                                       (__s), (__osz))) \
++       && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), (__s), (__osz))))
+
+ /* Conversely, we know at compile time that the length is unsafe if the
+    __L * __S <= __OBJSZ condition can be folded to a constant and if it is
diff --git a/glibc-upstream-2.34-170.patch b/glibc-upstream-2.34-170.patch
new file mode 100644
index 0000000..11aa68c
--- /dev/null
+++ b/glibc-upstream-2.34-170.patch
@@ -0,0 +1,49 @@
+commit 0d477e92c49db2906b32e44135b98746ccc73c7b
+Author: Florian Weimer
+Date:   Tue Apr 26 14:22:10 2022 +0200
+
+    INSTALL: Rephrase -with-default-link documentation
+
+    Reviewed-by: Carlos O'Donell
+    (cherry picked from commit c935789bdf40ba22b5698da869d3a4789797e09f)
+
+diff --git a/INSTALL b/INSTALL
+index 60d01568d77645c7..10a3dcdc0a8db665 100644
+--- a/INSTALL
++++ b/INSTALL
+@@ -90,10 +90,10 @@ if 'CFLAGS' is specified it must enable optimization.  For example:
+      library will still be usable, but functionality may be lost--for
+      example, you can't build a shared libc with old binutils.
+
+-'--with-default-link=FLAG'
+-     With '--with-default-link=yes', the build system does not use a
+-     custom linker script for linking shared objects.
The default for +- FLAG is the opposite, 'no', because the custom linker script is ++'--with-default-link' ++ With '--with-default-link', the build system does not use a custom ++ linker script for linking shared objects. The default is ++ '--without-default-link', because the custom linker script is + needed for full RELRO protection. + + '--with-nonshared-cflags=CFLAGS' +diff --git a/manual/install.texi b/manual/install.texi +index 36a5af62bc5722b0..8e34ff7e1847f3ae 100644 +--- a/manual/install.texi ++++ b/manual/install.texi +@@ -117,11 +117,11 @@ problem and suppress these constructs, so that the library will still be + usable, but functionality may be lost---for example, you can't build a + shared libc with old binutils. + +-@item --with-default-link=@var{FLAG} +-With @code{--with-default-link=yes}, the build system does not use a +-custom linker script for linking shared objects. The default for +-@var{FLAG} is the opposite, @samp{no}, because the custom linker script +-is needed for full RELRO protection. ++@item --with-default-link ++With @code{--with-default-link}, the build system does not use a custom ++linker script for linking shared objects. The default is ++@code{--without-default-link}, because the custom linker script is ++needed for full RELRO protection. + + @item --with-nonshared-cflags=@var{cflags} + Use additional compiler flags @var{cflags} to build the parts of the diff --git a/glibc-upstream-2.34-171.patch b/glibc-upstream-2.34-171.patch new file mode 100644 index 0000000..04e6898 --- /dev/null +++ b/glibc-upstream-2.34-171.patch @@ -0,0 +1,377 @@ +commit bc56ab1f4aa937665034373d3e320d0779a839aa +Author: Florian Weimer +Date: Tue Apr 26 14:23:02 2022 +0200 + + dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078) + + When audit modules are loaded, ld.so initialization is not yet + complete, and rtld_active () returns false even though ld.so is + mostly working. Instead, the static dlopen hook is used, but that + does not work at all because this is not a static dlopen situation. + + Commit 466c1ea15f461edb8e3ffaf5d86d708876343bbf ("dlfcn: Rework + static dlopen hooks") moved the hook pointer into _rtld_global_ro, + which means that separate protection is not needed anymore and the + hook pointer can be checked directly. + + The guard for disabling libio vtable hardening in _IO_vtable_check + should stay for now. + + Fixes commit 8e1472d2c1e25e6eabc2059170731365f6d5b3d1 ("ld.so: + Examine GLRO to detect inactive loader [BZ #20204]"). 
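
    To make the new dispatch concrete, here is a minimal C sketch of the pattern this patch switches to; the struct layout and the names `dl_dlfcn_hook`, `sketch_dlopen`, and `dlopen_implementation` are simplified illustrative stand-ins, not the real glibc internals:

```c
/* Minimal sketch of the new dispatch pattern; names and types are
   illustrative stand-ins, not actual glibc code.  */
#include <stddef.h>

struct dlfcn_hook
{
  void *(*dlopen) (const char *file, int mode, void *dl_caller);
};

/* Stand-in for GLRO (dl_dlfcn_hook): non-null only in the static
   dlopen case, which is exactly when the hook must be used.  */
static struct dlfcn_hook *dl_dlfcn_hook;

static void *
dlopen_implementation (const char *file, int mode, void *dl_caller)
{
  (void) file; (void) mode; (void) dl_caller;
  return NULL;	/* Placeholder for the real implementation.  */
}

void *
sketch_dlopen (const char *file, int mode)
{
  /* Before the fix: "if (!rtld_active ())", which is also true while
     audit modules are being loaded, so the hook path was taken even
     though no static dlopen hook was installed.  Checking the hook
     pointer directly only fires in the genuine static dlopen case.  */
  if (dl_dlfcn_hook != NULL)
    return dl_dlfcn_hook->dlopen (file, mode,
                                  __builtin_return_address (0));
  return dlopen_implementation (file, mode,
                                __builtin_return_address (0));
}
```

    The point of the change is visible in the condition: `rtld_active ()` is also false during audit-module loading, so the old test wrongly selected the hook path there, whereas the hook pointer itself is only non-null in a genuine static dlopen.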
+ + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 8dcb6d0af07fda3607b541857e4f3970a74ed55b) + +diff --git a/dlfcn/dladdr.c b/dlfcn/dladdr.c +index 1cc305f0c46e7c3b..0d07ae1cd4dbb7a2 100644 +--- a/dlfcn/dladdr.c ++++ b/dlfcn/dladdr.c +@@ -24,7 +24,7 @@ int + __dladdr (const void *address, Dl_info *info) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dladdr (address, info); + #endif + return _dl_addr (address, info, NULL, NULL); +diff --git a/dlfcn/dladdr1.c b/dlfcn/dladdr1.c +index 78560dbac208c316..93ce68c1d6067fe2 100644 +--- a/dlfcn/dladdr1.c ++++ b/dlfcn/dladdr1.c +@@ -24,7 +24,7 @@ int + __dladdr1 (const void *address, Dl_info *info, void **extra, int flags) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dladdr1 (address, info, extra, flags); + #endif + +diff --git a/dlfcn/dlclose.c b/dlfcn/dlclose.c +index 6a013a81bb648191..07ecb21bf7d43be4 100644 +--- a/dlfcn/dlclose.c ++++ b/dlfcn/dlclose.c +@@ -24,7 +24,7 @@ int + __dlclose (void *handle) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlclose (handle); + #endif + +diff --git a/dlfcn/dlerror.c b/dlfcn/dlerror.c +index 5047b140662bc33e..63da79c63000eef0 100644 +--- a/dlfcn/dlerror.c ++++ b/dlfcn/dlerror.c +@@ -32,7 +32,7 @@ char * + __dlerror (void) + { + # ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlerror (); + # endif + +diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c +index c6f9a1da09ff8622..47d2daa96fa5986f 100644 +--- a/dlfcn/dlinfo.c ++++ b/dlfcn/dlinfo.c +@@ -89,7 +89,7 @@ dlinfo_implementation (void *handle, int request, void *arg) + int + ___dlinfo (void *handle, int request, void *arg) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlinfo (handle, request, arg); + else + return dlinfo_implementation (handle, request, arg); +diff --git a/dlfcn/dlmopen.c b/dlfcn/dlmopen.c +index c171c8953da20fc7..2309224eb8484b1a 100644 +--- a/dlfcn/dlmopen.c ++++ b/dlfcn/dlmopen.c +@@ -80,7 +80,7 @@ dlmopen_implementation (Lmid_t nsid, const char *file, int mode, + void * + ___dlmopen (Lmid_t nsid, const char *file, int mode) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlmopen (nsid, file, mode, RETURN_ADDRESS (0)); + else + return dlmopen_implementation (nsid, file, mode, RETURN_ADDRESS (0)); +diff --git a/dlfcn/dlopen.c b/dlfcn/dlopen.c +index e04b374b82b04337..9c59c751c4eaf7a7 100644 +--- a/dlfcn/dlopen.c ++++ b/dlfcn/dlopen.c +@@ -75,7 +75,7 @@ dlopen_implementation (const char *file, int mode, void *dl_caller) + void * + ___dlopen (const char *file, int mode) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0)); + else + return dlopen_implementation (file, mode, RETURN_ADDRESS (0)); +diff --git a/dlfcn/dlopenold.c b/dlfcn/dlopenold.c +index 9115501ac121eeca..c2f2a42194d50953 100644 +--- a/dlfcn/dlopenold.c ++++ b/dlfcn/dlopenold.c +@@ -70,7 +70,7 @@ __dlopen_nocheck (const char *file, int mode) + mode |= RTLD_LAZY; + args.mode = mode; + +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0)); + + return _dlerror_run (dlopen_doit, &args) ? 
NULL : args.new; +diff --git a/dlfcn/dlsym.c b/dlfcn/dlsym.c +index 43044cf7bb95801e..d3861170a7631d01 100644 +--- a/dlfcn/dlsym.c ++++ b/dlfcn/dlsym.c +@@ -62,7 +62,7 @@ dlsym_implementation (void *handle, const char *name, void *dl_caller) + void * + ___dlsym (void *handle, const char *name) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlsym (handle, name, RETURN_ADDRESS (0)); + else + return dlsym_implementation (handle, name, RETURN_ADDRESS (0)); +diff --git a/dlfcn/dlvsym.c b/dlfcn/dlvsym.c +index 9b76f9afa513e11f..3af02109c306b800 100644 +--- a/dlfcn/dlvsym.c ++++ b/dlfcn/dlvsym.c +@@ -65,7 +65,7 @@ dlvsym_implementation (void *handle, const char *name, const char *version, + void * + ___dlvsym (void *handle, const char *name, const char *version) + { +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->dlvsym (handle, name, version, + RETURN_ADDRESS (0)); + else +diff --git a/elf/Makefile b/elf/Makefile +index fec6e23b5b625e3b..c89a6a58690646ee 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -376,6 +376,7 @@ tests += \ + tst-audit24d \ + tst-audit25a \ + tst-audit25b \ ++ tst-audit26 \ + tst-auditmany \ + tst-auxobj \ + tst-auxobj-dlopen \ +@@ -721,6 +722,7 @@ modules-names = \ + tst-auditmod24c \ + tst-auditmod24d \ + tst-auditmod25 \ ++ tst-auditmod26 \ + tst-auxvalmod \ + tst-big-note-lib \ + tst-deep1mod1 \ +@@ -2194,6 +2196,10 @@ $(objpfx)tst-audit25b: $(objpfx)tst-audit25mod1.so \ + LDFLAGS-tst-audit25b = -Wl,-z,now + tst-audit25b-ARGS = -- $(host-test-program-cmd) + ++$(objpfx)tst-audit26.out: $(objpfx)tst-auditmod26.so ++$(objpfx)tst-auditmod26.so: $(libsupport) ++tst-audit26-ENV = LD_AUDIT=$(objpfx)tst-auditmod26.so ++ + # tst-sonamemove links against an older implementation of the library. + LDFLAGS-tst-sonamemove-linkmod1.so = \ + -Wl,--version-script=tst-sonamemove-linkmod1.map \ +diff --git a/elf/dl-libc.c b/elf/dl-libc.c +index d5bc4a277f4c6ef3..db4342a3256921f0 100644 +--- a/elf/dl-libc.c ++++ b/elf/dl-libc.c +@@ -157,7 +157,7 @@ __libc_dlopen_mode (const char *name, int mode) + args.caller_dlopen = RETURN_ADDRESS (0); + + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->libc_dlopen_mode (name, mode); + #endif + return dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map; +@@ -185,7 +185,7 @@ __libc_dlsym (void *map, const char *name) + args.name = name; + + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->libc_dlsym (map, name); + #endif + return (dlerror_run (do_dlsym, &args) ? NULL +@@ -199,7 +199,7 @@ void * + __libc_dlvsym (void *map, const char *name, const char *version) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->libc_dlvsym (map, name, version); + #endif + +@@ -222,7 +222,7 @@ int + __libc_dlclose (void *map) + { + #ifdef SHARED +- if (!rtld_active ()) ++ if (GLRO (dl_dlfcn_hook) != NULL) + return GLRO (dl_dlfcn_hook)->libc_dlclose (map); + #endif + return dlerror_run (do_dlclose, map); +diff --git a/elf/tst-audit26.c b/elf/tst-audit26.c +new file mode 100644 +index 0000000000000000..3f920e83bac247a5 +--- /dev/null ++++ b/elf/tst-audit26.c +@@ -0,0 +1,35 @@ ++/* Check the usability of functions in audit modules. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ /* Check that the audit module has been loaded. */ ++ void *handle = xdlopen ("mapped to libc", RTLD_LOCAL | RTLD_NOW); ++ TEST_VERIFY (handle ++ == xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD)); ++ ++ return 0; ++} ++ ++#include +diff --git a/elf/tst-auditmod26.c b/elf/tst-auditmod26.c +new file mode 100644 +index 0000000000000000..db7ba95abec20f53 +--- /dev/null ++++ b/elf/tst-auditmod26.c +@@ -0,0 +1,104 @@ ++/* Check the usability of functions in audit modules. Audit module. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++unsigned int ++la_version (unsigned int current) ++{ ++ /* Exercise various functions. */ ++ ++ /* Check dlopen, dlsym, dlclose. */ ++ void *handle = xdlopen (LIBM_SO, RTLD_LOCAL | RTLD_NOW); ++ void *ptr = xdlsym (handle, "sincos"); ++ TEST_VERIFY (ptr != NULL); ++ ptr = dlsym (handle, "SINCOS"); ++ TEST_VERIFY (ptr == NULL); ++ const char *message = dlerror (); ++ TEST_VERIFY (strstr (message, ": undefined symbol: SINCOS") != NULL); ++ ptr = dlsym (handle, "SINCOS"); ++ TEST_VERIFY (ptr == NULL); ++ xdlclose (handle); ++ TEST_COMPARE_STRING (dlerror (), NULL); ++ ++ handle = xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD); ++ ++ /* Check dlvsym. _exit is unlikely to gain another symbol ++ version. */ ++ TEST_VERIFY (xdlsym (handle, "_exit") ++ == xdlvsym (handle, "_exit", FIRST_VERSION_libc__exit_STRING)); ++ ++ /* Check dlinfo. */ ++ { ++ void *handle2 = NULL; ++ TEST_COMPARE (dlinfo (handle, RTLD_DI_LINKMAP, &handle2), 0); ++ TEST_VERIFY (handle2 == handle); ++ } ++ ++ /* Check dladdr and dladdr1. */ ++ Dl_info info = { }; ++ TEST_VERIFY (dladdr (&_exit, &info) != 0); ++ if (strcmp (info.dli_sname, "_Exit") != 0) /* _Exit is an alias. 
*/ ++ TEST_COMPARE_STRING (info.dli_sname, "_exit"); ++ TEST_VERIFY (info.dli_saddr == &_exit); ++ TEST_VERIFY (strstr (info.dli_fname, LIBC_SO)); ++ void *extra_info; ++ memset (&info, 0, sizeof (info)); ++ TEST_VERIFY (dladdr1 (&_exit, &info, &extra_info, RTLD_DL_LINKMAP) != 0); ++ TEST_VERIFY (extra_info == handle); ++ ++ /* Verify that dlmopen creates a new namespace. */ ++ void *dlmopen_handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW); ++ TEST_VERIFY (dlmopen_handle != handle); ++ memset (&info, 0, sizeof (info)); ++ extra_info = NULL; ++ ptr = xdlsym (dlmopen_handle, "_exit"); ++ TEST_VERIFY (dladdr1 (ptr, &info, &extra_info, RTLD_DL_LINKMAP) != 0); ++ TEST_VERIFY (extra_info == dlmopen_handle); ++ xdlclose (dlmopen_handle); ++ ++ /* Terminate the process with an error state. This does not happen ++ automatically because the audit module state is not shared with ++ the main program. */ ++ if (support_record_failure_is_failed ()) ++ { ++ fflush (stdout); ++ fflush (stderr); ++ _exit (1); ++ } ++ ++ return LAV_CURRENT; ++} ++ ++char * ++la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) ++{ ++ if (strcmp (name, "mapped to libc") == 0) ++ return (char *) LIBC_SO; ++ else ++ return (char *) name; ++} diff --git a/glibc-upstream-2.34-172.patch b/glibc-upstream-2.34-172.patch new file mode 100644 index 0000000..06dc695 --- /dev/null +++ b/glibc-upstream-2.34-172.patch @@ -0,0 +1,28 @@ +commit 83cc145830bdbefdabe03787ed884d548bea9c99 +Author: Florian Weimer +Date: Fri Apr 22 19:34:52 2022 +0200 + + scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier + + enum.IntFlag and enum.EnumMeta._missing_ support are not part of + earlier Python versions. + + (cherry picked from commit b571f3adffdcbed23f35ea39b0ca43809dbb4f5b) + +diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py +index 8f7d0ca184845714..da0d5380f33a195e 100644 +--- a/scripts/glibcelf.py ++++ b/scripts/glibcelf.py +@@ -28,6 +28,12 @@ import collections + import enum + import struct + ++if not hasattr(enum, 'IntFlag'): ++ import sys ++ sys.stdout.write( ++ 'warning: glibcelf.py needs Python 3.6 for enum support\n') ++ sys.exit(77) ++ + class _OpenIntEnum(enum.IntEnum): + """Integer enumeration that supports arbitrary int values.""" + @classmethod diff --git a/glibc-upstream-2.34-173.patch b/glibc-upstream-2.34-173.patch new file mode 100644 index 0000000..69a92b8 --- /dev/null +++ b/glibc-upstream-2.34-173.patch @@ -0,0 +1,254 @@ +commit 16245986fb9bfe396113fc7dfd1929f69a9e748e +Author: H.J. Lu +Date: Fri Aug 20 06:42:24 2021 -0700 + + x86-64: Optimize load of all bits set into ZMM register [BZ #28252] + + Optimize loads of all bits set into ZMM register in AVX512 SVML codes + by replacing + + vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX + + and + + vmovups .L_2il0floatpacket.13(%rip), %zmmX + + with + vpternlogd $0xff, %zmmX, %zmmX, %zmmX + + This fixes BZ #28252. 
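
    For readers cross-checking the idiom, here is an illustrative sketch (user code, not part of the patch) of the same all-ones trick using the AVX512F intrinsic that maps to `vpternlogd`: immediate `0xff` selects the three-input truth table that is 1 for every input combination, so the destination register becomes all-ones without loading a `.L_2il0floatpacket.*` constant from `.rodata`.

```c
/* Illustrative sketch, assuming GCC with AVX512F support; not part
   of the patch itself.  */
#include <immintrin.h>
#include <stdio.h>

__attribute__ ((target ("avx512f")))
static void
show_all_ones (void)
{
  __m512i x = _mm512_undefined_epi32 ();
  /* Equivalent of: vpternlogd $0xff, %zmmX, %zmmX, %zmmX.  The
     result is all-ones regardless of the register's prior contents,
     so no memory load is needed.  */
  __m512i ones = _mm512_ternarylogic_epi32 (x, x, x, 0xff);
  unsigned long long out[8];
  _mm512_storeu_si512 (out, ones);
  printf ("%llx\n", out[0]);	/* Prints ffffffffffffffff.  */
}

int
main (void)
{
  if (__builtin_cpu_supports ("avx512f"))
    show_all_ones ();
  return 0;
}
```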
+ + (cherry picked from commit 78c9ec9000f873abe7a15a91b87080a2e4308260) + +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +index e68fcdbb16a79f36..58e588a3d42a8bc9 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + vmovaps %zmm0, %zmm8 + + /* Check for large arguments path */ +- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 ++ vpternlogd $0xff, %zmm2, %zmm2, %zmm2 + + /* + ARGUMENT RANGE REDUCTION: +@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_cos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.16: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.16,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +index dfa2acafc486b56b..f5f117d474f66176 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + + /* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 +- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm1 + vpsrlq $32, %zmm4, %zmm6 + + /* reciprocal approximation good to at least 11 bits */ +@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_log_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.12: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.12,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +index be8ab7c6e0e33819..48d251db16ccab9d 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax +- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm14 + vmovups __dAbsMask(%rax), %zmm7 + vmovups __dInvPI(%rax), %zmm2 + vmovups __dRShifter(%rax), %zmm1 +@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_sin_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.14: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.14,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +index 611887082a545854..a4944a4feef6aa98 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos + + /* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm5, %zmm5, %zmm4 +- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + + /* Update Cos result's sign */ + vxorpd %zmm2, %zmm1, %zmm1 +@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl) + ENTRY (_ZGVeN8vvv_sincos_skx) + WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx + END (_ZGVeN8vvv_sincos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.15: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.15,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S 
b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +index f671d60d5dab5a0e..fe8474fed943e8ad 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 +- vmovups .L_2il0floatpacket.13(%rip), %zmm12 ++ vpternlogd $0xff, %zmm12, %zmm12, %zmm12 + vmovups __sRShifter(%rax), %zmm3 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sA9_FMA(%rax), %zmm9 +@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_cosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +index 637bfe3c06ab9ad4..229b7828cde04db2 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + vmovaps %zmm0, %zmm7 + + /* compare against threshold */ +- vmovups .L_2il0floatpacket.13(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + vmovups __sInvLn2(%rax), %zmm4 + vmovups __sShifter(%rax), %zmm1 + vmovups __sLn2hi(%rax), %zmm6 +@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + + #endif + END (_ZGVeN16v_expf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +index 9d790fbf0ad6c8ec..fa2aae986f543582 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax +- vmovups .L_2il0floatpacket.7(%rip), %zmm6 ++ vpternlogd $0xff, %zmm6, %zmm6, %zmm6 + vmovups _iBrkValue(%rax), %zmm4 + vmovups _sPoly_7(%rax), %zmm8 + +@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + + #endif + END (_ZGVeN16v_logf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.7: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.7,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +index c5c43c46ff7af5a3..6aea2a4f11d1f85f 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + vpsrlq $32, %zmm3, %zmm2 + vpmovqd %zmm2, %ymm11 + vcvtps2pd %ymm14, %zmm13 +- vmovups .L_2il0floatpacket.23(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovaps %zmm14, %zmm26 + vpandd _ABSMASK(%rax), %zmm1, %zmm8 + vpcmpd $1, _INF(%rax), %zmm8, %k2 +@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff 
_ZGVdN8vv_powf + vpmovqd %zmm11, %ymm5 + vpxord %zmm10, %zmm10, %zmm10 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} +- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 ++ vpternlogd $0xff, %zmm4, %zmm4, %zmm4 + vpxord %zmm11, %zmm11, %zmm11 + vcvtdq2pd %ymm7, %zmm7 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} +@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + jmp .LBL_2_7 + #endif + END (_ZGVeN16vv_powf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.23: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.23,@object +-.L_2il0floatpacket.24: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.24,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +index 9cf359c86ff9bd70..a446c504f63c9399 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf + + /* Result sign calculations */ + vpternlogd $150, %zmm0, %zmm14, %zmm1 +- vmovups .L_2il0floatpacket.13(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + + /* Add correction term 0.5 for cos() part */ + vaddps %zmm8, %zmm5, %zmm15 +@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl) + ENTRY (_ZGVeN16vvv_sincosf_skx) + WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx + END (_ZGVeN16vvv_sincosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +index bd05109a62181f22..c1b352d0ad1992cd 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + + /* Check for large and special values */ +- vmovups .L_2il0floatpacket.11(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovups __sAbsMask(%rax), %zmm5 + vmovups __sInvPI(%rax), %zmm1 + vmovups __sRShifter(%rax), %zmm2 +@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_sinf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.11: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.11,@object diff --git a/glibc-upstream-2.34-174.patch b/glibc-upstream-2.34-174.patch new file mode 100644 index 0000000..3bf44a8 --- /dev/null +++ b/glibc-upstream-2.34-174.patch @@ -0,0 +1,42 @@ +commit b5a44a6a471aafd3677659a610f32468c40a666b +Author: Noah Goldstein +Date: Tue Sep 21 18:31:49 2021 -0500 + + x86: Modify ENTRY in sysdep.h so that p2align can be specified + + No bug. + + This change adds a new macro ENTRY_P2ALIGN which takes a second + argument, log2 of the desired function alignment. + + The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this + doesn't affect any existing functionality. 
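
    As a quick illustration (plain user code, not glibc internals): the second argument follows the assembler's `.p2align` convention, i.e. it is the log2 of the byte alignment, so `ENTRY_P2ALIGN (name, 4)` keeps the historic 16-byte entry alignment while passing 6 cache-line-aligns an entry to 64 bytes, as the later memcmp patch does with `ENTRY_P2ALIGN (MEMCMP, 6)`.

```c
/* Illustration only: ENTRY_P2ALIGN's second argument is the log2 of
   the byte alignment, matching the assembler's .p2align directive.  */
#include <stdio.h>

static unsigned int
p2align_bytes (unsigned int log2_align)
{
  return 1u << log2_align;
}

int
main (void)
{
  /* ENTRY (name) == ENTRY_P2ALIGN (name, 4): 16-byte alignment.  */
  printf ("p2align 4 -> %u bytes\n", p2align_bytes (4));
  /* Later patches pass 6 to cache-line-align hot entry points.  */
  printf ("p2align 6 -> %u bytes\n", p2align_bytes (6));
  return 0;
}
```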
+ + Signed-off-by: Noah Goldstein + (cherry picked from commit fc5bd179ef3a953dff8d1655bd530d0e230ffe71) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index cac1d762fb3f99d0..937180c1bd791570 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -78,15 +78,18 @@ enum cf_protection_level + #define ASM_SIZE_DIRECTIVE(name) .size name,.-name; + + /* Define an entry point visible from C. */ +-#define ENTRY(name) \ ++#define ENTRY_P2ALIGN(name, alignment) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ +- .align ALIGNARG(4); \ ++ .align ALIGNARG(alignment); \ + C_LABEL(name) \ + cfi_startproc; \ + _CET_ENDBR; \ + CALL_MCOUNT + ++/* Common entry 16 byte aligns. */ ++#define ENTRY(name) ENTRY_P2ALIGN (name, 4) ++ + #undef END + #define END(name) \ + cfi_endproc; \ diff --git a/glibc-upstream-2.34-175.patch b/glibc-upstream-2.34-175.patch new file mode 100644 index 0000000..5ebf0b7 --- /dev/null +++ b/glibc-upstream-2.34-175.patch @@ -0,0 +1,653 @@ +commit 5ec3416853c4150c4d13312e05f93a053586d528 +Author: Noah Goldstein +Date: Tue Sep 21 18:45:03 2021 -0500 + + x86: Optimize memcmp-evex-movbe.S for frontend behavior and size + + No bug. + + The frontend optimizations are to: + 1. Reorganize logically connected basic blocks so they are either in + the same cache line or adjacent cache lines. + 2. Avoid cases when basic blocks unnecissarily cross cache lines. + 3. Try and 32 byte align any basic blocks possible without sacrificing + code size. Smaller / Less hot basic blocks are used for this. + + Overall code size shrunk by 168 bytes. This should make up for any + extra costs due to aligning to 64 bytes. + + In general performance before deviated a great deal dependending on + whether entry alignment % 64 was 0, 16, 32, or 48. These changes + essentially make it so that the current implementation is at least + equal to the best alignment of the original for any arguments. + + The only additional optimization is in the page cross case. Branch on + equals case was removed from the size == [4, 7] case. As well the [4, + 7] and [2, 3] case where swapped as [4, 7] is likely a more hot + argument size. + + test-memcmp and test-wmemcmp are both passing. + + (cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -34,7 +34,24 @@ + area. + 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. + 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. +- 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */ ++ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. ++ ++When possible the implementation tries to optimize for frontend in the ++following ways: ++Throughput: ++ 1. All code sections that fit are able to run optimally out of the ++ LSD. ++ 2. All code sections that fit are able to run optimally out of the ++ DSB ++ 3. Basic blocks are contained in minimum number of fetch blocks ++ necessary. ++ ++Latency: ++ 1. Logically connected basic blocks are put in the same ++ cache-line. ++ 2. Logically connected basic blocks that do not fit in the same ++ cache-line are put in adjacent lines. This can get beneficial ++ L2 spatial prefetching and L1 next-line prefetching. 
*/ + + # include + +@@ -47,9 +64,11 @@ + # ifdef USE_AS_WMEMCMP + # define CHAR_SIZE 4 + # define VPCMP vpcmpd ++# define VPTEST vptestmd + # else + # define CHAR_SIZE 1 + # define VPCMP vpcmpub ++# define VPTEST vptestmb + # endif + + # define VEC_SIZE 32 +@@ -75,7 +94,9 @@ + */ + + .section .text.evex,"ax",@progbits +-ENTRY (MEMCMP) ++/* Cache align memcmp entry. This allows for much more thorough ++ frontend optimization. */ ++ENTRY_P2ALIGN (MEMCMP, 6) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -89,7 +110,7 @@ ENTRY (MEMCMP) + VPCMP $4, (%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to +- L(return_vec_[0,2]). For L(return_vec_3 destination register ++ L(return_vec_[0,2]). For L(return_vec_3) destination register + must be ecx. */ + testl %eax, %eax + jnz L(return_vec_0) +@@ -121,10 +142,6 @@ ENTRY (MEMCMP) + testl %ecx, %ecx + jnz L(return_vec_3) + +- /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so +- compare with zero to get a mask is needed. */ +- vpxorq %XMM0, %XMM0, %XMM0 +- + /* Go to 4x VEC loop. */ + cmpq $(CHAR_PER_VEC * 8), %rdx + ja L(more_8x_vec) +@@ -148,47 +165,61 @@ ENTRY (MEMCMP) + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- /* Or together YMM1, YMM2, and YMM3 into YMM3. */ +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while +- oring with YMM3. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ +- VPCMP $4, %YMM4, %YMM0, %k1 ++ oring with YMM1. Result is stored in YMM4. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ ++ /* Or together YMM2, YMM3, and YMM4 into YMM4. */ ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ ++ /* Test YMM4 against itself. Store any CHAR mismatches in k1. ++ */ ++ VPTEST %YMM4, %YMM4, %k1 ++ /* k1 must go to ecx for L(return_vec_0_1_2_3). */ + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + +- /* NB: aligning 32 here allows for the rest of the jump targets +- to be tuned for 32 byte alignment. Most important this ensures +- the L(more_8x_vec) loop is 32 byte aligned. */ +- .p2align 5 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size = 0 but +- is also faster for size = CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) ++ .p2align 4 ++L(8x_end_return_vec_0_1_2_3): ++ movq %rdx, %rdi ++L(8x_return_vec_0_1_2_3): ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ VPTEST %YMM1, %YMM1, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. */ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) ++ VPTEST %YMM2, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) + +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Create mask in ecx for potentially in bound matches. */ +- bzhil %edx, %eax, %eax +- jnz L(return_vec_0) ++ VPTEST %YMM3, %YMM3, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ /* bsf saves 1 byte from tzcnt. 
This keep L(return_vec_3) in one ++ fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache ++ line. */ ++ bsfl %ecx, %ecx ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif + ret + + .p2align 4 +@@ -209,10 +240,11 @@ L(return_vec_0): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 1 +- which is good enough for a target not in a loop. */ ++ .p2align 4 + L(return_vec_1): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -226,10 +258,11 @@ L(return_vec_1): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 2 +- which is good enough for a target not in a loop. */ ++ .p2align 4,, 10 + L(return_vec_2): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -243,40 +276,6 @@ L(return_vec_2): + # endif + ret + +- .p2align 4 +-L(8x_return_vec_0_1_2_3): +- /* Returning from L(more_8x_vec) requires restoring rsi. */ +- addq %rdi, %rsi +-L(return_vec_0_1_2_3): +- VPCMP $4, %YMM1, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_0) +- +- VPCMP $4, %YMM2, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_1) +- +- VPCMP $4, %YMM3, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_2) +-L(return_vec_3): +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WMEMCMP +- movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax +- xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +- + .p2align 4 + L(more_8x_vec): + /* Set end of s1 in rdx. */ +@@ -288,21 +287,19 @@ L(more_8x_vec): + andq $-VEC_SIZE, %rdi + /* Adjust because first 4x vec where check already. 
*/ + subq $-(VEC_SIZE * 4), %rdi ++ + .p2align 4 + L(loop_4x_vec): + VMOVU (%rsi, %rdi), %YMM1 + vpxorq (%rdi), %YMM1, %YMM1 +- + VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 + vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 +- + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(8x_return_vec_0_1_2_3) +@@ -319,28 +316,25 @@ L(loop_4x_vec): + cmpl $(VEC_SIZE * 2), %edi + jae L(8x_last_2x_vec) + ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 ++ + VMOVU (%rsi, %rdx), %YMM1 + vpxorq (%rdx), %YMM1, %YMM1 + + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 +- +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 +- VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +- /* Restore s1 pointer to rdi. */ +- movq %rdx, %rdi + testl %ecx, %ecx +- jnz L(8x_return_vec_0_1_2_3) ++ jnz L(8x_end_return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + + /* Only entry is from L(more_8x_vec). */ +- .p2align 4 ++ .p2align 4,, 10 + L(8x_last_2x_vec): + VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax +@@ -355,7 +349,31 @@ L(8x_last_1x_vec): + jnz L(8x_return_vec_3) + ret + +- .p2align 4 ++ /* Not ideally aligned (at offset +9 bytes in fetch block) but ++ not aligning keeps it in the same cache line as ++ L(8x_last_1x/2x_vec) so likely worth it. As well, saves code ++ size. */ ++ .p2align 4,, 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ leaq (%rdx, %rax, CHAR_SIZE), %rax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ addq %rdx, %rax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4,, 10 + L(last_2x_vec): + /* Check second to last VEC. */ + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 +@@ -374,26 +392,49 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4 +-L(8x_return_vec_2): +- subq $VEC_SIZE, %rdx +-L(8x_return_vec_3): +- tzcntl %eax, %eax ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ /* Use bsf to save code size. This is necessary to have ++ L(one_or_less) fit in aligning bytes between. */ ++ bsfl %eax, %eax ++ addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- leaq (%rdx, %rax, CHAR_SIZE), %rax +- movl (VEC_SIZE * 3)(%rax), %ecx ++ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- addq %rdx, %rax +- movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx +- movzbl (VEC_SIZE * 3)(%rax), %eax ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + ++ /* NB: L(one_or_less) fits in alignment padding between ++ L(return_vec_1_end) and L(return_vec_0_end). 
*/ ++# ifdef USE_AS_WMEMCMP ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++ ret ++# else ++L(one_or_less): ++ jb L(zero) ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++ ret ++# endif ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(return_vec_0_end): + tzcntl %eax, %eax +@@ -412,23 +453,56 @@ L(return_vec_0_end): + ret + + .p2align 4 +-L(return_vec_1_end): ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size == 0 ++ but is also faster for size == CHAR_SIZE. */ ++ cmpl $1, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMP $4, (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check if any matches where in bounds. Intentionally not ++ storing result in eax to limit dependency chain if it goes to ++ L(return_vec_0_lv). */ ++ bzhil %edx, %eax, %edx ++ jnz L(return_vec_0_lv) ++ xorl %eax, %eax ++ ret ++ ++ /* Essentially duplicate of L(return_vec_0). Ends up not costing ++ any code as shrinks L(less_vec) by allowing 2-byte encoding of ++ the jump and ends up fitting in aligning bytes. As well fits on ++ same cache line as L(less_vec) so also saves a line from having ++ to be fetched on cold calls to memcmp. */ ++ .p2align 4,, 4 ++L(return_vec_0_lv): + tzcntl %eax, %eax +- addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ movl (%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx +- movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + +- + .p2align 4 + L(page_cross_less_vec): + /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +@@ -439,108 +513,84 @@ L(page_cross_less_vec): + cmpl $8, %edx + jae L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) +-L(between_2_3): +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax +- ret +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax ++ jb L(between_2_3) ++ ++ /* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* edx is guranteed to be positive int32 in range [4, 7]. */ ++ cmovne %edx, %eax ++ /* ecx is -1 if rcx > rax. Otherwise 0. */ ++ sbbl %ecx, %ecx ++ /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == ++ rax then eax and ecx are zero. If rax < rax then ecx is -1 so ++ eax doesn't matter. 
*/ ++ orl %ecx, %eax + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_8_15): + # endif + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq (%rdi), %xmm1 ++ vmovq (%rsi), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1 ++ vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax ++ jnz L(return_vec_0_end) + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ +- VMOVU (%rsi), %XMM2 +- VPCMP $4, (%rdi), %XMM2, %k1 ++ ++ /* Use movups to save code size. */ ++ movups (%rsi), %xmm2 ++ VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- +- VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 +- leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi +- VPCMP $4, (%rdi), %XMM2, %k1 ++ movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +-# ifdef USE_AS_WMEMCMP +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax ++ jnz L(return_vec_0_end) + ret +-# else + +- .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++# ifndef USE_AS_WMEMCMP ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax + ret + # endif +- + END (MEMCMP) + #endif diff --git a/glibc-upstream-2.34-176.patch b/glibc-upstream-2.34-176.patch new file mode 100644 index 0000000..74b18ab --- /dev/null +++ b/glibc-upstream-2.34-176.patch @@ -0,0 +1,497 @@ +commit 6d18a93dbbde2958001d65dff3080beed7ae675a +Author: Noah Goldstein +Date: Mon Sep 20 16:20:15 2021 -0500 + + x86: Optimize memset-vec-unaligned-erms.S + + No bug. + + Optimization are + + 1. change control flow for L(more_2x_vec) to fall through to loop and + jump for L(less_4x_vec) and L(less_8x_vec). This uses less code + size and saves jumps for length > 4x VEC_SIZE. + + 2. For EVEX/AVX512 move L(less_vec) closer to entry. + + 3. Avoid complex address mode for length > 2x VEC_SIZE + + 4. Slightly better aligning code for the loop from the perspective of + code size and uops. + + 5. 
Align targets so they make full use of their fetch block and if + possible cache line. + + 6. Try and reduce total number of icache lines that will need to be + pulled in for a given length. + + 7. Include "local" version of stosb target. For AVX2/EVEX/AVX512 + jumping to the stosb target in the sse2 code section will almost + certainly be to a new page. The new version does increase code size + marginally by duplicating the target but should get better iTLB + behavior as a result. + + test-memset, test-wmemset, and test-bzero are all passing. + + Signed-off-by: Noah Goldstein + Reviewed-by: H.J. Lu + (cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 7d4a327eba29ecb4..0137eba4cdd9f830 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -18,13 +18,15 @@ + . */ + + #include ++#define USE_WITH_SSE2 1 + + #define VEC_SIZE 16 ++#define MOV_SIZE 3 ++#define RET_SIZE 1 ++ + #define VEC(i) xmm##i +-/* Don't use movups and movaps since it will get larger nop paddings for +- alignment. */ +-#define VMOVU movdqu +-#define VMOVA movdqa ++#define VMOVU movups ++#define VMOVA movaps + + #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index ae0860f36a47d594..1af668af0aeda59e 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -1,8 +1,14 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX2 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 4 ++# define RET_SIZE 4 ++ + # define VEC(i) ymm##i +-# define VMOVU vmovdqu +-# define VMOVA vmovdqa ++ ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 8ad842fc2f140527..f14d6f8493c21a36 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX512 1 ++ + # define VEC_SIZE 64 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 zmm16 + # define VEC(i) VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 640f092903302ad0..64b09e77cc20cc42 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_EVEX 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 ymm16 + # define VEC(i) VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index ff196844a093dc3b..e723413a664c088f 100644 +--- 
a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -63,8 +63,27 @@ + # endif + #endif + ++#if VEC_SIZE == 64 ++# define LOOP_4X_OFFSET (VEC_SIZE * 4) ++#else ++# define LOOP_4X_OFFSET (0) ++#endif ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++# define END_REG rcx ++# define LOOP_REG rdi ++#else ++# define END_REG rdi ++# define LOOP_REG rdx ++#endif ++ + #define PAGE_SIZE 4096 + ++/* Macro to calculate size of small memset block for aligning ++ purposes. */ ++#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1) ++ ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -74,6 +93,7 @@ + ENTRY (__bzero) + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. */ ++ xorl %esi, %esi + pxor %XMM0, %XMM0 + jmp L(entry_from_bzero) + END (__bzero) +@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + +-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) ++ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(0), (%rdi) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. ++ */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + VZEROUPPER_RETURN +- +- .p2align 4 +-L(stosb_more_2x_vec): +- cmp __x86_rep_stosb_threshold(%rip), %RDX_LP +- ja L(stosb) +-#else +- .p2align 4 + #endif +-L(more_2x_vec): +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(0), VEC_SIZE(%rdi) +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_start) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +-L(return): +-#if VEC_SIZE > 16 +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ .p2align 4,, 10 ++L(last_2x_vec): ++#ifdef USE_LESS_VEC_MASK_STORE ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) + #else +- ret ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) + #endif ++ VZEROUPPER_RETURN + +-L(loop_start): +- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) +- cmpq $(VEC_SIZE * 8), %rdx +- jbe L(loop_end) +- andq $-(VEC_SIZE * 2), %rdi +- subq $-(VEC_SIZE * 4), %rdi +- leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx +- .p2align 4 +-L(loop): +- VMOVA %VEC(0), (%rdi) +- VMOVA %VEC(0), VEC_SIZE(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi) +- subq $-(VEC_SIZE * 4), %rdi +- cmpq %rcx, %rdi +- jb L(loop) +-L(loop_end): +- /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. +- rdx as length is also unchanged. */ +- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) +- VZEROUPPER_SHORT_RETURN +- +- .p2align 4 ++ /* If have AVX512 mask instructions put L(less_vec) close to ++ entry as it doesn't take much space and is likely a hot target. ++ */ ++#ifdef USE_LESS_VEC_MASK_STORE ++ .p2align 4,, 10 + L(less_vec): + /* Less than 1 VEC. 
*/ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! + # endif +-# ifdef USE_LESS_VEC_MASK_STORE + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. Note that we are using rax which is set in +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. +- */ ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ + andl $(PAGE_SIZE - 1), %edi +- /* Check if VEC_SIZE store cross page. Mask stores suffer serious +- performance degradation when it has to fault supress. */ ++ /* Check if VEC_SIZE store cross page. Mask stores suffer ++ serious performance degradation when it has to fault supress. ++ */ + cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ /* This is generally considered a cold target. */ + ja L(cross_page) + # if VEC_SIZE > 32 + movq $-1, %rcx +@@ -247,58 +235,185 @@ L(less_vec): + bzhil %edx, %ecx, %ecx + kmovd %ecx, %k1 + # endif +- vmovdqu8 %VEC(0), (%rax) {%k1} ++ vmovdqu8 %VEC(0), (%rax){%k1} + VZEROUPPER_RETURN + ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* Include L(stosb_local) here if including L(less_vec) between ++ L(stosb_more_2x_vec) and ENTRY. This is to cache align the ++ L(stosb_more_2x_vec) target. */ ++ .p2align 4,, 10 ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN ++# endif ++#endif ++ ++#if defined USE_MULTIARCH && IS_IN (libc) + .p2align 4 +-L(cross_page): ++L(stosb_more_2x_vec): ++ cmp __x86_rep_stosb_threshold(%rip), %RDX_LP ++ ja L(stosb_local) ++#endif ++ /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] ++ and (4x, 8x] jump to target. */ ++L(more_2x_vec): ++ ++ /* Two different methods of setting up pointers / compare. The ++ two methods are based on the fact that EVEX/AVX512 mov ++ instructions take more bytes then AVX2/SSE2 mov instructions. As ++ well that EVEX/AVX512 machines also have fast LEA_BID. Both ++ setup and END_REG to avoid complex address mode. For EVEX/AVX512 ++ this saves code size and keeps a few targets in one fetch block. ++ For AVX2/SSE2 this helps prevent AGU bottlenecks. */ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + ++ LOOP_4X_OFFSET) with LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ ++ /* Stores to first 2x VEC before cmp as any path forward will ++ require it. */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), VEC_SIZE(%rax) ++ ++ ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ ++ addq %rdx, %END_REG ++#endif ++ ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_2x_vec) ++ ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) ++ VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) ++ ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add ++ extra offset to addresses in loop. Used for AVX512 to save space ++ as no way to get (VEC_SIZE * 4) in imm8. */ ++# if LOOP_4X_OFFSET == 0 ++ subq $-(VEC_SIZE * 4), %LOOP_REG + # endif +-# if VEC_SIZE > 32 +- cmpb $32, %dl +- jae L(between_32_63) ++ /* Avoid imm32 compare here to save code size. */ ++ cmpq %rdi, %rcx ++#else ++ addq $-(VEC_SIZE * 4), %END_REG ++ cmpq $(VEC_SIZE * 8), %rdx ++#endif ++ jbe L(last_4x_vec) ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* Set LOOP_REG (rdx). 
*/ ++ leaq (VEC_SIZE * 4)(%rax), %LOOP_REG ++#endif ++ /* Align dst for loop. */ ++ andq $(VEC_SIZE * -2), %LOOP_REG ++ .p2align 4 ++L(loop): ++ VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG) ++ subq $-(VEC_SIZE * 4), %LOOP_REG ++ cmpq %END_REG, %LOOP_REG ++ jb L(loop) ++ .p2align 4,, MOV_SIZE ++L(last_4x_vec): ++ VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG) ++L(return): ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else ++ ret ++#endif ++ ++ .p2align 4,, 10 ++#ifndef USE_LESS_VEC_MASK_STORE ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in ++ range for 2-byte jump encoding. */ ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN + # endif +-# if VEC_SIZE > 16 +- cmpb $16, %dl ++ /* Define L(less_vec) only if not otherwise defined. */ ++ .p2align 4 ++L(less_vec): ++#endif ++L(cross_page): ++#if VEC_SIZE > 32 ++ cmpl $32, %edx ++ jae L(between_32_63) ++#endif ++#if VEC_SIZE > 16 ++ cmpl $16, %edx + jae L(between_16_31) +-# endif +- MOVQ %XMM0, %rcx +- cmpb $8, %dl ++#endif ++ MOVQ %XMM0, %rdi ++ cmpl $8, %edx + jae L(between_8_15) +- cmpb $4, %dl ++ cmpl $4, %edx + jae L(between_4_7) +- cmpb $1, %dl ++ cmpl $1, %edx + ja L(between_2_3) +- jb 1f +- movb %cl, (%rax) +-1: ++ jb L(return) ++ movb %sil, (%rax) + VZEROUPPER_RETURN +-# if VEC_SIZE > 32 ++ ++ /* Align small targets only if not doing so would cross a fetch ++ line. */ ++#if VEC_SIZE > 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, -32(%rax,%rdx) + VMOVU %YMM0, (%rax) ++ VMOVU %YMM0, -32(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +-# if VEC_SIZE > 16 +- /* From 16 to 31. No branch when size == 16. */ ++#endif ++ ++#if VEC_SIZE >= 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + L(between_16_31): +- VMOVU %XMM0, -16(%rax,%rdx) ++ /* From 16 to 31. No branch when size == 16. */ + VMOVU %XMM0, (%rax) ++ VMOVU %XMM0, -16(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +- /* From 8 to 15. No branch when size == 8. */ ++#endif ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_8_15): +- movq %rcx, -8(%rax,%rdx) +- movq %rcx, (%rax) ++ /* From 8 to 15. No branch when size == 8. */ ++ movq %rdi, (%rax) ++ movq %rdi, -8(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %ecx, -4(%rax,%rdx) +- movl %ecx, (%rax) ++ movl %edi, (%rax) ++ movl %edi, -4(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ +- movw %cx, -2(%rax,%rdx) +- movw %cx, (%rax) ++ movw %di, (%rax) ++ movb %dil, -1(%rax, %rdx) + VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/glibc-upstream-2.34-177.patch b/glibc-upstream-2.34-177.patch new file mode 100644 index 0000000..112bcad --- /dev/null +++ b/glibc-upstream-2.34-177.patch @@ -0,0 +1,40 @@ +commit baf3ece63453adac59c5688930324a78ced5b2e4 +Author: Noah Goldstein +Date: Sat Oct 23 01:26:47 2021 -0400 + + x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S + + This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'. + + it could potentially be dangerous to use SSE2 if this function is ever + called without using 'vzeroupper' beforehand. While compilers appear + to use 'vzeroupper' before function calls if AVX2 has been used, using + SSE2 here is more brittle. Since it is not absolutely necessary it + should be avoided. + + It costs 2-extra bytes but the extra bytes should only eat into + alignment padding. + Reviewed-by: H.J. Lu + + (cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 2761b54f2e7dea9f..640f6757fac8a356 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -561,13 +561,13 @@ L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + + /* Use movups to save code size. */ +- movups (%rsi), %xmm2 ++ vmovdqu (%rsi), %xmm2 + VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2 + VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 + addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax diff --git a/glibc-upstream-2.34-178.patch b/glibc-upstream-2.34-178.patch new file mode 100644 index 0000000..1540e2f --- /dev/null +++ b/glibc-upstream-2.34-178.patch @@ -0,0 +1,690 @@ +commit f35ad30da4880a1574996df0674986ecf82fa7ae +Author: H.J. Lu +Date: Fri Oct 29 12:40:20 2021 -0700 + + x86-64: Improve EVEX strcmp with masked load + + In strcmp-evex.S, to compare 2 32-byte strings, replace + + VMOVU (%rdi, %rdx), %YMM0 + VMOVU (%rsi, %rdx), %YMM1 + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + + with + + VMOVU (%rdi, %rdx), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi, %rdx). */ + VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx + incl %ecx + jne L(last_vector) + + It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake + and Ice Lake. + + Co-Authored-By: Noah Goldstein + (cherry picked from commit c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index d5aa6daa46c7ed25..82f12ac89bcae20b 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -41,6 +41,8 @@ + # ifdef USE_AS_WCSCMP + /* Compare packed dwords. 
*/ + # define VPCMP vpcmpd ++# define VPMINU vpminud ++# define VPTESTM vptestmd + # define SHIFT_REG32 r8d + # define SHIFT_REG64 r8 + /* 1 dword char == 4 bytes. */ +@@ -48,6 +50,8 @@ + # else + /* Compare packed bytes. */ + # define VPCMP vpcmpb ++# define VPMINU vpminub ++# define VPTESTM vptestmb + # define SHIFT_REG32 ecx + # define SHIFT_REG64 rcx + /* 1 byte char == 1 byte. */ +@@ -67,6 +71,9 @@ + # define YMM5 ymm22 + # define YMM6 ymm23 + # define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -76,7 +83,7 @@ + /* The main idea of the string comparison (byte or dword) using 256-bit + EVEX instructions consists of comparing (VPCMP) two ymm vectors. The + latter can be on either packed bytes or dwords depending on +- USE_AS_WCSCMP. In order to check the null char, algorithm keeps the ++ USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the + matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 + KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) + are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd +@@ -123,27 +130,21 @@ ENTRY (STRCMP) + jg L(cross_page) + /* Start comparing 4 vectors. */ + VMOVU (%rdi), %YMM0 +- VMOVU (%rsi), %YMM1 + +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 + +- /* Check for NULL in YMM0. */ +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- /* Check for NULL in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi). */ ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} + +- /* Each bit in K1 represents: +- 1. A mismatch in YMM0 and YMM1. Or +- 2. A NULL in YMM0 or YMM1. +- */ +- kord %k0, %k1, %k1 +- +- ktestd %k1, %k1 +- je L(next_3_vectors) + kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(next_3_vectors) + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -172,9 +173,7 @@ L(return): + # endif + ret + +- .p2align 4 + L(return_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -210,9 +209,7 @@ L(return_vec_size): + # endif + ret + +- .p2align 4 + L(return_2_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -248,9 +245,7 @@ L(return_2_vec_size): + # endif + ret + +- .p2align 4 + L(return_3_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -289,43 +284,45 @@ L(return_3_vec_size): + .p2align 4 + L(next_3_vectors): + VMOVU VEC_SIZE(%rdi), %YMM0 +- VMOVU VEC_SIZE(%rsi), %YMM1 +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. 
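The pattern introduced above, one VPTESTM plus one zero-masked VPCMP, is the heart of this patch: the resulting mask has a bit set exactly for lanes that are non-null in the first string and equal in the second, so 'incl %ecx' (or 'subl $0xff, %ecx' in the 8-lane wchar_t case) becomes zero precisely when the whole vector can be skipped. A hedged scalar C model of the byte case (block_has_diff_or_nul is an illustrative name):

#include <stdint.h>

/* Scalar model of VPTESTM + masked VPCMP over 32 byte lanes.  k1
   gets a bit per lane that is both non-null in s1 and equal in s2;
   the all-ones value 0xffffffff (and only that value) wraps to zero
   when incremented, which is what 'incl %ecx; jne ...' tests.  */
static int
block_has_diff_or_nul (const unsigned char *s1, const unsigned char *s2)
{
  uint32_t k1 = 0;
  for (int i = 0; i < 32; i++)
    if (s1[i] != 0 && s1[i] == s2[i])
      k1 |= (uint32_t) 1 << i;
  return (uint32_t) (k1 + 1) != 0;   /* non-zero: inspect this vector */
}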
*/ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_vec_size) + +- VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 +- VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 +- +- /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ +- VPCMP $4, %YMM2, %YMM4, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ ++ VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_2_vec_size) + +- /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ +- VPCMP $4, %YMM3, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM3, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_3_vec_size) + L(main_loop_header): + leaq (VEC_SIZE * 4)(%rdi), %rdx +@@ -375,56 +372,51 @@ L(back_to_loop): + VMOVA VEC_SIZE(%rax), %YMM2 + VMOVA (VEC_SIZE * 2)(%rax), %YMM4 + VMOVA (VEC_SIZE * 3)(%rax), %YMM6 +- VMOVU (%rdx), %YMM1 +- VMOVU VEC_SIZE(%rdx), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 +- VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 +- +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K4 represents a NULL or a mismatch in YMM0 and +- YMM1. */ +- kord %k0, %k1, %k4 +- +- VPCMP $4, %YMM2, %YMM3, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K5 represents a NULL or a mismatch in YMM2 and +- YMM3. */ +- kord %k0, %k1, %k5 +- +- VPCMP $4, %YMM4, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K6 represents a NULL or a mismatch in YMM4 and +- YMM5. */ +- kord %k0, %k1, %k6 +- +- VPCMP $4, %YMM6, %YMM7, %k0 +- VPCMP $0, %YMMZERO, %YMM6, %k1 +- VPCMP $0, %YMMZERO, %YMM7, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K7 represents a NULL or a mismatch in YMM6 and +- YMM7. */ +- kord %k0, %k1, %k7 +- +- kord %k4, %k5, %k0 +- kord %k6, %k7, %k1 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. 
*/ +- kortestd %k0, %k1 +- je L(loop) +- ktestd %k4, %k4 ++ ++ VPMINU %YMM0, %YMM2, %YMM8 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ ++ /* A zero CHAR in YMM8 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM8 ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ VPTESTM %YMM8, %YMM8, %k1 ++ ++ /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */ ++ vpxorq (%rdx), %YMM0, %YMM1 ++ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 ++ vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ ++ vporq %YMM1, %YMM3, %YMM9 ++ vporq %YMM5, %YMM7, %YMM10 ++ ++ /* A non-zero CHAR in YMM9 represents a mismatch. */ ++ vporq %YMM9, %YMM10, %YMM9 ++ ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ ++ VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(loop) ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM0 and (%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_vec) +- kmovd %k4, %edi +- tzcntl %edi, %ecx ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +@@ -466,9 +458,18 @@ L(test_vec): + cmpq $VEC_SIZE, %r11 + jbe L(zero) + # endif +- ktestd %k5, %k5 ++ /* Each bit set in K1 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM2 and VEC_SIZE(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_2_vec) +- kmovd %k5, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -512,9 +513,18 @@ L(test_2_vec): + cmpq $(VEC_SIZE * 2), %r11 + jbe L(zero) + # endif +- ktestd %k6, %k6 ++ /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ VPTESTM %YMM4, %YMM4, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM4 and (VEC_SIZE * 2)(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_3_vec) +- kmovd %k6, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -558,8 +568,18 @@ L(test_3_vec): + cmpq $(VEC_SIZE * 3), %r11 + jbe L(zero) + # endif +- kmovd %k7, %esi +- tzcntl %esi, %ecx ++ /* Each bit set in K1 represents a non-null CHAR in YMM6. */ ++ VPTESTM %YMM6, %YMM6, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM6 and (VEC_SIZE * 3)(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %ecx +@@ -615,39 +635,51 @@ L(loop_cross_page): + + VMOVU (%rax, %r10), %YMM2 + VMOVU VEC_SIZE(%rax, %r10), %YMM3 +- VMOVU (%rdx, %r10), %YMM4 +- VMOVU VEC_SIZE(%rdx, %r10), %YMM5 +- +- VPCMP $4, %YMM4, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM2 and +- YMM4. 
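The rewritten main loop above folds four vector compares into two reduction trees: unsigned min keeps any zero (NUL) lane alive, and xor-then-or keeps any mismatching lane alive, so one final masked compare answers both questions at once. A scalar C model of the reduction, assuming byte lanes (quad_block_clean is an illustrative name):

#include <stddef.h>

/* Scalar model of the VPMINU and VPXORQ/VPORQ trees over the
   4 * VEC_SIZE bytes handled per loop iteration.  The loop's back
   edge may be taken only when no lane is NUL and no lane differs.  */
static int
quad_block_clean (const unsigned char *a, const unsigned char *b,
                  size_t vec_size)
{
  unsigned char min_all = 0xff, any_diff = 0;
  for (size_t i = 0; i < 4 * vec_size; i++)
    {
      if (a[i] < min_all)        /* VPMINU tree: a zero survives */
        min_all = a[i];
      any_diff |= a[i] ^ b[i];   /* VPXORQ/VPORQ tree: a diff survives */
    }
  return min_all != 0 && any_diff == 0;   /* 1: take 'je L(loop)' */
}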
*/ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM5, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM3, %k4 +- VPCMP $0, %YMMZERO, %YMM5, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM3 and +- YMM5. */ +- kord %k3, %k4, %k3 ++ ++ /* Each bit set in K2 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM2 and 32 bytes at (%rdx, %r10). */ ++ VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. */ ++ notl %r9d ++# ifdef USE_AS_WCSCMP ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ /* Each bit set in K4 represents a non-null CHAR in YMM3. */ ++ VPTESTM %YMM3, %YMM3, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ ++ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. */ ++ notl %edi ++ andl $0xff, %edi ++# else ++ incl %edi ++# endif + + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG32 + sarl $2, %SHIFT_REG32 ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. */ ++ orl %r9d, %edi + # else +- kshiftlq $32, %k3, %k2 +-# endif ++ salq $32, %rdi + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ + shrxq %SHIFT_REG64, %rdi, %rdi +@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec): + /* The first VEC_SIZE * 2 bytes match or are ignored. */ + VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 + VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 +- VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 +- +- VPCMP $4, %YMM0, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM2, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM0 and +- YMM2. */ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM1, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM1, %k4 +- VPCMP $0, %YMMZERO, %YMM3, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM1 and +- YMM3. */ +- kord %k3, %k4, %k3 + ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. */ ++ notl %r9d + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ VPTESTM %YMM1, %YMM1, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. 
*/ ++ notl %edi ++ andl $0xff, %edi + # else +- kshiftlq $32, %k3, %k2 ++ incl %edi + # endif + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. */ ++ orl %r9d, %edi ++# else ++ salq $32, %rdi ++ ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + xorl %r8d, %r8d + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec): + /* R8 has number of bytes skipped. */ + movl %ecx, %r8d + # ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ /* NB: Divide shift count by 4 since each bit in RDI represent 4 + bytes. */ + sarl $2, %ecx +-# endif ++ /* Skip ECX bytes. */ ++ shrl %cl, %edi ++# else + /* Skip ECX bytes. */ + shrq %cl, %rdi ++# endif + 1: + /* Before jumping back to the loop, set ESI to the number of + VEC_SIZE * 4 blocks before page crossing. */ +@@ -818,7 +863,7 @@ L(cross_page_loop): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + # endif +- /* Check null char. */ ++ /* Check null CHAR. */ + testl %eax, %eax + jne L(cross_page_loop) + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +@@ -901,18 +946,17 @@ L(cross_page): + jg L(cross_page_1_vector) + L(loop_1_vector): + VMOVU (%rdi, %rdx), %YMM0 +- VMOVU (%rsi, %rdx), %YMM1 +- +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi, %rdx). */ ++ VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx +- testl %ecx, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(last_vector) + + addl $VEC_SIZE, %edx +@@ -931,18 +975,17 @@ L(cross_page_1_vector): + cmpl $(PAGE_SIZE - 16), %eax + jg L(cross_page_1_xmm) + VMOVU (%rdi, %rdx), %XMM0 +- VMOVU (%rsi, %rdx), %XMM1 +- +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- korw %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korw %k0, %k1, %k1 +- kmovw %k1, %ecx +- testl %ecx, %ecx ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and 16 bytes at (%rsi, %rdx). */ ++ VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xf, %ecx ++# else ++ subl $0xffff, %ecx ++# endif + jne L(last_vector) + + addl $16, %edx +@@ -965,25 +1008,16 @@ L(cross_page_1_xmm): + vmovq (%rdi, %rdx), %XMM0 + vmovq (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. 
*/ +- kord %k0, %k1, %k1 +- kmovd %k1, %ecx +- ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ kmovb %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* Only last 2 bits are valid. */ +- andl $0x3, %ecx ++ subl $0x3, %ecx + # else +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx ++ subl $0xff, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $8, %edx +@@ -1002,25 +1036,16 @@ L(cross_page_8bytes): + vmovd (%rdi, %rdx), %XMM0 + vmovd (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} + kmovd %k1, %ecx +- + # ifdef USE_AS_WCSCMP +- /* Only the last bit is valid. */ +- andl $0x1, %ecx ++ subl $0x1, %ecx + # else +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx ++ subl $0xf, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $4, %edx diff --git a/glibc-upstream-2.34-179.patch b/glibc-upstream-2.34-179.patch new file mode 100644 index 0000000..e9a4329 --- /dev/null +++ b/glibc-upstream-2.34-179.patch @@ -0,0 +1,85 @@ +commit a182bb7a3922404f79def09d79ef89678b4049f0 +Author: H.J. Lu +Date: Fri Oct 29 12:56:53 2021 -0700 + + x86-64: Remove Prefer_AVX2_STRCMP + + Remove Prefer_AVX2_STRCMP to enable EVEX strcmp. When comparing 2 32-byte + strings, EVEX strcmp has been improved to require 1 load, 1 VPTESTM, 1 + VPCMP, 1 KMOVD and 1 INCL instead of 2 loads, 3 VPCMPs, 2 KORDs, 1 KMOVD + and 1 TESTL while AVX2 strcmp requires 1 load, 2 VPCMPEQs, 1 VPMINU, 1 + VPMOVMSKB and 1 TESTL. EVEX strcmp is now faster than AVX2 strcmp by up + to 40% on Tiger Lake and Ice Lake. + + (cherry picked from commit 14dbbf46a007ae5df36646b51ad0c9e5f5259f30) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index de4e3c3b7258120d..f4d4049e391cbabd 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -574,14 +574,6 @@ disable_tsx: + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] + |= bit_arch_Prefer_No_VZEROUPPER; +- +- /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp +- requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp +- requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB, +- AVX2 strcmp is faster than EVEX strcmp. */ +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) +- cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP] +- |= bit_arch_Prefer_AVX2_STRCMP; + } + + /* Avoid avoid short distance REP MOVSB on processor with FSRM. 
*/ +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 58f2fad4323d5d91..957db3ad229ba39f 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -239,8 +239,6 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Fast_Copy_Backward, + disable, 18); +- CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH +- (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18); + } + break; + case 19: +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index 3bdc76cf71007948..8250bfcbecd29a9f 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -31,5 +31,4 @@ BIT (Prefer_ERMS) + BIT (Prefer_No_AVX512) + BIT (MathVec_Prefer_No_AVX512) + BIT (Prefer_FSRM) +-BIT (Prefer_AVX2_STRCMP) + BIT (Avoid_Short_Distance_REP_MOVSB) +diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c +index 62b7abeeee646ab4..7c2901bf44456259 100644 +--- a/sysdeps/x86_64/multiarch/strcmp.c ++++ b/sysdeps/x86_64/multiarch/strcmp.c +@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c +index 60ba0fe356b31779..f94a421784bfe923 100644 +--- a/sysdeps/x86_64/multiarch/strncmp.c ++++ b/sysdeps/x86_64/multiarch/strncmp.c +@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) diff --git a/glibc-upstream-2.34-180.patch b/glibc-upstream-2.34-180.patch new file mode 100644 index 0000000..9707cf2 --- /dev/null +++ b/glibc-upstream-2.34-180.patch @@ -0,0 +1,48 @@ +commit 2e64237a8744dd50f9222293275fa52e7248ff76 +Author: Fangrui Song +Date: Tue Nov 2 20:59:52 2021 -0700 + + x86-64: Replace movzx with movzbl + + Clang cannot assemble movzx in the AT&T dialect mode. + + ../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction + movzx (%rsi), %ecx + ^~~~ + + Change movzx to movzbl, which follows the AT&T dialect and is used + elsewhere in the file. + + Reviewed-by: H.J. 
Lu + (cherry picked from commit 6720d36b6623c5e48c070d86acf61198b33e144e) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index bc19547b09639071..6197a723b9e0606e 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz): + .p2align 4 + // XXX Same as code above + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index 824e648230a15739..7f8a1bc756f86aee 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz): + + .p2align 4 + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx diff --git a/glibc-upstream-2.34-181.patch b/glibc-upstream-2.34-181.patch new file mode 100644 index 0000000..36a401f --- /dev/null +++ b/glibc-upstream-2.34-181.patch @@ -0,0 +1,843 @@ +commit a7392db2ff2b9dd906500941ac6361dbe2211b0d +Author: Noah Goldstein +Date: Mon Nov 1 00:49:51 2021 -0500 + + x86: Optimize memmove-vec-unaligned-erms.S + + No bug. + + The optimizations are as follows: + + 1) Always align entry to 64 bytes. This makes behavior more + predictable and makes other frontend optimizations easier. + + 2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have + significant benefits in the case that: + 0 < (dst - src) < [256, 512] + + 3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%] + improvement and for FSRM [-10%, 25%]. + + In addition to these primary changes there is general cleanup + throughout to optimize the aligning routines and control flow logic. + + Signed-off-by: Noah Goldstein + Reviewed-by: H.J. Lu + (cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6) + +diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S +index db106a7a1f23f268..b2b318084823dceb 100644 +--- a/sysdeps/x86_64/memmove.S ++++ b/sysdeps/x86_64/memmove.S +@@ -25,7 +25,7 @@ + /* Use movups and movaps for smaller code sizes. 
*/ + #define VMOVU movups + #define VMOVA movaps +- ++#define MOV_SIZE 3 + #define SECTION(p) p + + #ifdef USE_MULTIARCH +diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +index 1ec1962e861dbf63..67a55f0c85af841c 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +@@ -4,7 +4,7 @@ + # define VMOVNT vmovntdq + # define VMOVU vmovdqu + # define VMOVA vmovdqa +- ++# define MOV_SIZE 4 + # define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S +index e195e93f153c9512..975ae6c0515b83cb 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S +@@ -4,7 +4,7 @@ + # define VMOVNT vmovntdq + # define VMOVU vmovdqu + # define VMOVA vmovdqa +- ++# define MOV_SIZE 4 + # define SECTION(p) p##.avx + # define MEMMOVE_SYMBOL(p,s) p##_avx_##s + +diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +index 848848ab39ff9326..0fa7126830af7acb 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +@@ -25,7 +25,7 @@ + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 + # define VZEROUPPER +- ++# define MOV_SIZE 6 + # define SECTION(p) p##.evex512 + # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + +diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +index 0cbce8f944da51a0..88715441feaaccf5 100644 +--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +@@ -25,7 +25,7 @@ + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 + # define VZEROUPPER +- ++# define MOV_SIZE 6 + # define SECTION(p) p##.evex + # define MEMMOVE_SYMBOL(p,s) p##_evex_##s + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index abde8438d41f2320..7b27cbdda5fb99f7 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -76,6 +76,25 @@ + # endif + #endif + ++/* Whether to align before movsb. Ultimately we want 64 byte ++ align and not worth it to load 4x VEC for VEC_SIZE == 16. */ ++#define ALIGN_MOVSB (VEC_SIZE > 16) ++/* Number of bytes to align movsb to. */ ++#define MOVSB_ALIGN_TO 64 ++ ++#define SMALL_MOV_SIZE (MOV_SIZE <= 4) ++#define LARGE_MOV_SIZE (MOV_SIZE > 4) ++ ++#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1 ++# error MOV_SIZE Unknown ++#endif ++ ++#if LARGE_MOV_SIZE ++# define SMALL_SIZE_OFFSET (4) ++#else ++# define SMALL_SIZE_OFFSET (0) ++#endif ++ + #ifndef PAGE_SIZE + # define PAGE_SIZE 4096 + #endif +@@ -199,25 +218,21 @@ L(start): + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ /* Load regardless. */ ++ VMOVU (%rsi), %VEC(0) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) +-#if !defined USE_MULTIARCH || !IS_IN (libc) +-L(last_2x_vec): +-#endif + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ +- VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +-#if !defined USE_MULTIARCH || !IS_IN (libc) +-L(nop): +- ret ++#if !(defined USE_MULTIARCH && IS_IN (libc)) ++ ZERO_UPPER_VEC_REGISTERS_RETURN + #else + VZEROUPPER_RETURN + #endif + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMMOVE_SYMBOL (__memmove, unaligned)) +- + # if VEC_SIZE == 16 + ENTRY (__mempcpy_chk_erms) + cmp %RDX_LP, %RCX_LP +@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + # endif + +-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) ++ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6) + movq %rdi, %rax + L(start_erms): + # ifdef __ILP32__ +@@ -298,310 +313,448 @@ L(start_erms): + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ /* Load regardless. */ ++ VMOVU (%rsi), %VEC(0) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(movsb_more_2x_vec) +-L(last_2x_vec): +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU (%rsi), %VEC(0) +- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. ++ */ ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) ++ VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx) + L(return): +-#if VEC_SIZE > 16 ++# if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +-#else ++# else + ret ++# endif + #endif + +-L(movsb): +- cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP +- jae L(more_8x_vec) +- cmpq %rsi, %rdi +- jb 1f +- /* Source == destination is less common. */ +- je L(nop) +- leaq (%rsi,%rdx), %r9 +- cmpq %r9, %rdi +- /* Avoid slow backward REP MOVSB. */ +- jb L(more_8x_vec_backward) +-# if AVOID_SHORT_DISTANCE_REP_MOVSB +- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) +- jz 3f +- movq %rdi, %rcx +- subq %rsi, %rcx +- jmp 2f +-# endif +-1: +-# if AVOID_SHORT_DISTANCE_REP_MOVSB +- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) +- jz 3f +- movq %rsi, %rcx +- subq %rdi, %rcx +-2: +-/* Avoid "rep movsb" if RCX, the distance between source and destination, +- is N*4GB + [1..63] with N >= 0. */ +- cmpl $63, %ecx +- jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ +-3: +-# endif +- mov %RDX_LP, %RCX_LP +- rep movsb +-L(nop): ++#if LARGE_MOV_SIZE ++ /* If LARGE_MOV_SIZE this fits in the aligning bytes between the ++ ENTRY block and L(less_vec). */ ++ .p2align 4,, 8 ++L(between_4_7): ++ /* From 4 to 7. No branch when size == 4. */ ++ movl (%rsi), %ecx ++ movl (%rsi, %rdx), %esi ++ movl %ecx, (%rdi) ++ movl %esi, (%rdi, %rdx) + ret + #endif + ++ .p2align 4 + L(less_vec): + /* Less than 1 VEC. */ + #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! 
+ #endif + #if VEC_SIZE > 32 +- cmpb $32, %dl ++ cmpl $32, %edx + jae L(between_32_63) + #endif + #if VEC_SIZE > 16 +- cmpb $16, %dl ++ cmpl $16, %edx + jae L(between_16_31) + #endif +- cmpb $8, %dl ++ cmpl $8, %edx + jae L(between_8_15) +- cmpb $4, %dl ++#if SMALL_MOV_SIZE ++ cmpl $4, %edx ++#else ++ subq $4, %rdx ++#endif + jae L(between_4_7) +- cmpb $1, %dl +- ja L(between_2_3) +- jb 1f +- movzbl (%rsi), %ecx ++ cmpl $(1 - SMALL_SIZE_OFFSET), %edx ++ jl L(copy_0) ++ movb (%rsi), %cl ++ je L(copy_1) ++ movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi ++ movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx) ++L(copy_1): + movb %cl, (%rdi) +-1: ++L(copy_0): + ret ++ ++#if SMALL_MOV_SIZE ++ .p2align 4,, 8 ++L(between_4_7): ++ /* From 4 to 7. No branch when size == 4. */ ++ movl -4(%rsi, %rdx), %ecx ++ movl (%rsi), %esi ++ movl %ecx, -4(%rdi, %rdx) ++ movl %esi, (%rdi) ++ ret ++#endif ++ ++#if VEC_SIZE > 16 ++ /* From 16 to 31. No branch when size == 16. */ ++ .p2align 4,, 8 ++L(between_16_31): ++ vmovdqu (%rsi), %xmm0 ++ vmovdqu -16(%rsi, %rdx), %xmm1 ++ vmovdqu %xmm0, (%rdi) ++ vmovdqu %xmm1, -16(%rdi, %rdx) ++ /* No ymm registers have been touched. */ ++ ret ++#endif ++ + #if VEC_SIZE > 32 ++ .p2align 4,, 10 + L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ + VMOVU (%rsi), %YMM0 +- VMOVU -32(%rsi,%rdx), %YMM1 ++ VMOVU -32(%rsi, %rdx), %YMM1 + VMOVU %YMM0, (%rdi) +- VMOVU %YMM1, -32(%rdi,%rdx) +- VZEROUPPER_RETURN +-#endif +-#if VEC_SIZE > 16 +- /* From 16 to 31. No branch when size == 16. */ +-L(between_16_31): +- VMOVU (%rsi), %XMM0 +- VMOVU -16(%rsi,%rdx), %XMM1 +- VMOVU %XMM0, (%rdi) +- VMOVU %XMM1, -16(%rdi,%rdx) ++ VMOVU %YMM1, -32(%rdi, %rdx) + VZEROUPPER_RETURN + #endif ++ ++ .p2align 4,, 10 + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +- movq -8(%rsi,%rdx), %rcx ++ movq -8(%rsi, %rdx), %rcx + movq (%rsi), %rsi +- movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) ++ movq %rcx, -8(%rdi, %rdx) + ret +-L(between_4_7): +- /* From 4 to 7. No branch when size == 4. */ +- movl -4(%rsi,%rdx), %ecx +- movl (%rsi), %esi +- movl %ecx, -4(%rdi,%rdx) +- movl %esi, (%rdi) +- ret +-L(between_2_3): +- /* From 2 to 3. No branch when size == 2. */ +- movzwl -2(%rsi,%rdx), %ecx +- movzwl (%rsi), %esi +- movw %cx, -2(%rdi,%rdx) +- movw %si, (%rdi) +- ret + ++ .p2align 4,, 10 ++L(last_4x_vec): ++ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ ++ ++ /* VEC(0) and VEC(1) have already been loaded. */ ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3) ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(1), VEC_SIZE(%rdi) ++ VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx) ++ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++ ++ .p2align 4 + #if defined USE_MULTIARCH && IS_IN (libc) + L(movsb_more_2x_vec): + cmp __x86_rep_movsb_threshold(%rip), %RDX_LP + ja L(movsb) + #endif + L(more_2x_vec): +- /* More than 2 * VEC and there may be overlap between destination +- and source. */ ++ /* More than 2 * VEC and there may be overlap between ++ destination and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) ++ /* Load VEC(1) regardless. VEC(0) has already been loaded. */ ++ VMOVU VEC_SIZE(%rsi), %VEC(1) + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) +- /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ +- VMOVU (%rsi), %VEC(0) +- VMOVU VEC_SIZE(%rsi), %VEC(1) ++ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. 
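The reworked L(copy_0)/L(copy_1) tail above handles 0 to 3 bytes with a single compare chain: load the head byte first, store an overlapping 2-byte tail for lengths 2 and 3, then store the head. A hedged C restatement for the SMALL_SIZE_OFFSET == 0 case (copy_0_to_3 is an illustrative name):

#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Model of the 0..3 byte memmove tail.  Loading the head before the
   tail store keeps it correct for overlapping src/dst.  */
static void
copy_0_to_3 (unsigned char *dst, const unsigned char *src, size_t n)
{
  if (n == 0)
    return;                           /* jl L(copy_0) */
  unsigned char head = src[0];        /* movb (%rsi), %cl */
  if (n > 1)                          /* je L(copy_1) when n == 1 */
    {
      uint16_t tail;
      memcpy (&tail, src + n - 2, 2); /* movzwl -2(%rsi, %rdx), %esi */
      memcpy (dst + n - 2, &tail, 2); /* movw %si, -2(%rdi, %rdx) */
    }
  dst[0] = head;                      /* movb %cl, (%rdi) */
}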
*/ + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) +- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) +- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) +- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) +- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5) ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6) ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) +- VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) +- VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) +- VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) +- VZEROUPPER_RETURN +-L(last_4x_vec): +- /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ +- VMOVU (%rsi), %VEC(0) +- VMOVU VEC_SIZE(%rsi), %VEC(1) +- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) +- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) +- VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(1), VEC_SIZE(%rdi) +- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) ++ VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx) ++ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx) ++ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx) + VZEROUPPER_RETURN + ++ .p2align 4,, 4 + L(more_8x_vec): ++ movq %rdi, %rcx ++ subq %rsi, %rcx ++ /* Go to backwards temporal copy if overlap no matter what as ++ backward REP MOVSB is slow and we don't want to use NT stores if ++ there is overlap. */ ++ cmpq %rdx, %rcx ++ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ ++ jb L(more_8x_vec_backward_check_nop) + /* Check if non-temporal move candidate. */ + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ +- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_memcpy_2x) + #endif +- /* Entry if rdx is greater than non-temporal threshold but there +- is overlap. */ ++ /* To reach this point there cannot be overlap and dst > src. So ++ check for overlap and src > dst in which case correctness ++ requires forward copy. Otherwise decide between backward/forward ++ copy depending on address aliasing. */ ++ ++ /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold ++ but less than __x86_shared_non_temporal_threshold. */ + L(more_8x_vec_check): +- cmpq %rsi, %rdi +- ja L(more_8x_vec_backward) +- /* Source == destination is less common. */ +- je L(nop) +- /* Load the first VEC and last 4 * VEC to support overlapping +- addresses. */ +- VMOVU (%rsi), %VEC(4) ++ /* rcx contains dst - src. Add back length (rdx). */ ++ leaq (%rcx, %rdx), %r8 ++ /* If r8 has different sign than rcx then there is overlap so we ++ must do forward copy. */ ++ xorq %rcx, %r8 ++ /* Isolate just sign bit of r8. */ ++ shrq $63, %r8 ++ /* Get 4k difference dst - src. */ ++ andl $(PAGE_SIZE - 256), %ecx ++ /* If r8 is non-zero must do foward for correctness. Otherwise ++ if ecx is non-zero there is 4k False Alaising so do backward ++ copy. */ ++ addl %r8d, %ecx ++ jz L(more_8x_vec_backward) ++ ++ /* if rdx is greater than __x86_shared_non_temporal_threshold ++ but there is overlap, or from short distance movsb. */ ++L(more_8x_vec_forward): ++ /* Load first and last 4 * VEC to support overlapping addresses. ++ */ ++ ++ /* First vec was already loaded into VEC(0). 
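The branch logic above is compact enough to be easy to misread, so here is a hedged C restatement of the direction choice (use_forward_copy is an illustrative name). By this point the dst > src overlapping case has already been routed backward by the earlier 'cmpq %rdx, %rcx; jb', so only two questions remain: does an src > dst overlap force a forward copy, and if not, does dst - src land in the 4 KiB false-aliasing window where a backward copy is cheaper?

#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096

/* Model of L(more_8x_vec_check).  diff == dst - src may be negative
   here.  */
static int
use_forward_copy (uintptr_t dst, uintptr_t src, size_t len)
{
  intptr_t diff = (intptr_t) (dst - src);        /* %rcx */
  intptr_t end = diff + (intptr_t) len;          /* %r8 = %rcx + %rdx */
  /* Differing signs of diff and diff + len mean dst < src < dst + len,
     so forward order is required for correctness ('shrq $63, %r8').  */
  int must_forward = (int) (((uintptr_t) (diff ^ end)) >> 63);
  /* Bits 8..11 of the distance all zero means src and dst sit within
     256 bytes of the same 4 KiB offset: 4k false aliasing, which the
     backward copy avoids ('andl $(PAGE_SIZE - 256), %ecx').  */
  int aliased_4k = (diff & (PAGE_SIZE - 256)) == 0;
  /* 'addl %r8d, %ecx; jz L(more_8x_vec_backward)'.  */
  return must_forward || !aliased_4k;
}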
*/ + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) ++ /* Save begining of dst. */ ++ movq %rdi, %rcx ++ /* Align dst to VEC_SIZE - 1. */ ++ orq $(VEC_SIZE - 1), %rdi + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) +- /* Save start and stop of the destination buffer. */ +- movq %rdi, %r11 +- leaq -VEC_SIZE(%rdi, %rdx), %rcx +- /* Align destination for aligned stores in the loop. Compute +- how much destination is misaligned. */ +- movq %rdi, %r8 +- andq $(VEC_SIZE - 1), %r8 +- /* Get the negative of offset for alignment. */ +- subq $VEC_SIZE, %r8 +- /* Adjust source. */ +- subq %r8, %rsi +- /* Adjust destination which should be aligned now. */ +- subq %r8, %rdi +- /* Adjust length. */ +- addq %r8, %rdx + +- .p2align 4 ++ /* Subtract dst from src. Add back after dst aligned. */ ++ subq %rcx, %rsi ++ /* Finish aligning dst. */ ++ incq %rdi ++ /* Restore src adjusted with new value for aligned dst. */ ++ addq %rdi, %rsi ++ /* Store end of buffer minus tail in rdx. */ ++ leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx ++ ++ /* Dont use multi-byte nop to align. */ ++ .p2align 4,, 11 + L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. */ +- VMOVU (%rsi), %VEC(0) +- VMOVU VEC_SIZE(%rsi), %VEC(1) +- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) +- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) ++ VMOVU (%rsi), %VEC(1) ++ VMOVU VEC_SIZE(%rsi), %VEC(2) ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4) + subq $-(VEC_SIZE * 4), %rsi +- addq $-(VEC_SIZE * 4), %rdx +- VMOVA %VEC(0), (%rdi) +- VMOVA %VEC(1), VEC_SIZE(%rdi) +- VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) +- VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) ++ VMOVA %VEC(1), (%rdi) ++ VMOVA %VEC(2), VEC_SIZE(%rdi) ++ VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi) + subq $-(VEC_SIZE * 4), %rdi +- cmpq $(VEC_SIZE * 4), %rdx ++ cmpq %rdi, %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ +- VMOVU %VEC(5), (%rcx) +- VMOVU %VEC(6), -VEC_SIZE(%rcx) +- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) +- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) ++ VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx) ++ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx) ++ VMOVU %VEC(7), VEC_SIZE(%rdx) ++ VMOVU %VEC(8), (%rdx) + /* Store the first VEC. */ +- VMOVU %VEC(4), (%r11) ++ VMOVU %VEC(0), (%rcx) ++ /* Keep L(nop_backward) target close to jmp for 2-byte encoding. ++ */ ++L(nop_backward): + VZEROUPPER_RETURN + ++ .p2align 4,, 8 ++L(more_8x_vec_backward_check_nop): ++ /* rcx contains dst - src. Test for dst == src to skip all of ++ memmove. */ ++ testq %rcx, %rcx ++ jz L(nop_backward) + L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ +- VMOVU (%rsi), %VEC(4) ++ ++ /* First vec was also loaded into VEC(0). */ + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) ++ /* Begining of region for 4x backward copy stored in rcx. */ ++ leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) +- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) +- /* Save stop of the destination buffer. */ +- leaq -VEC_SIZE(%rdi, %rdx), %r11 +- /* Align destination end for aligned stores in the loop. Compute +- how much destination end is misaligned. */ +- leaq -VEC_SIZE(%rsi, %rdx), %rcx +- movq %r11, %r9 +- movq %r11, %r8 +- andq $(VEC_SIZE - 1), %r8 +- /* Adjust source. */ +- subq %r8, %rcx +- /* Adjust the end of destination which should be aligned now. */ +- subq %r8, %r9 +- /* Adjust length. 
*/ +- subq %r8, %rdx +- +- .p2align 4 ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8) ++ /* Subtract dst from src. Add back after dst aligned. */ ++ subq %rdi, %rsi ++ /* Align dst. */ ++ andq $-(VEC_SIZE), %rcx ++ /* Restore src. */ ++ addq %rcx, %rsi ++ ++ /* Don't use multi-byte nop to align. */ ++ .p2align 4,, 11 + L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ +- VMOVU (%rcx), %VEC(0) +- VMOVU -VEC_SIZE(%rcx), %VEC(1) +- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) +- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) +- addq $-(VEC_SIZE * 4), %rcx +- addq $-(VEC_SIZE * 4), %rdx +- VMOVA %VEC(0), (%r9) +- VMOVA %VEC(1), -VEC_SIZE(%r9) +- VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) +- VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) +- addq $-(VEC_SIZE * 4), %r9 +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_4x_vec_backward) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1) ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) ++ VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3) ++ VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4) ++ addq $(VEC_SIZE * -4), %rsi ++ VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx) ++ VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx) ++ VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx) ++ VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx) ++ addq $(VEC_SIZE * -4), %rcx ++ cmpq %rcx, %rdi ++ jb L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ +- VMOVU %VEC(4), (%rdi) ++ VMOVU %VEC(0), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ +- VMOVU %VEC(8), (%r11) ++ VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi) ++ VZEROUPPER_RETURN ++ ++#if defined USE_MULTIARCH && IS_IN (libc) ++ /* L(skip_short_movsb_check) is only used with ERMS. Not for ++ FSRM. */ ++ .p2align 5,, 16 ++# if ALIGN_MOVSB ++L(skip_short_movsb_check): ++# if MOVSB_ALIGN_TO > VEC_SIZE ++ VMOVU VEC_SIZE(%rsi), %VEC(1) ++# endif ++# if MOVSB_ALIGN_TO > (VEC_SIZE * 2) ++# error Unsupported MOVSB_ALIGN_TO ++# endif ++ /* If CPU does not have FSRM two options for aligning. Align src ++ if dst and src 4k alias. Otherwise align dst. */ ++ testl $(PAGE_SIZE - 512), %ecx ++ jnz L(movsb_align_dst) ++ /* Fall through. dst and src 4k alias. It's better to align src ++ here because the bottleneck will be loads dues to the false ++ dependency on dst. */ ++ ++ /* rcx already has dst - src. */ ++ movq %rcx, %r9 ++ /* Add src to len. Subtract back after src aligned. -1 because ++ src is initially aligned to MOVSB_ALIGN_TO - 1. */ ++ leaq -1(%rsi, %rdx), %rcx ++ /* Inclusively align src to MOVSB_ALIGN_TO - 1. */ ++ orq $(MOVSB_ALIGN_TO - 1), %rsi ++ /* Restore dst and len adjusted with new values for aligned dst. ++ */ ++ leaq 1(%rsi, %r9), %rdi ++ subq %rsi, %rcx ++ /* Finish aligning src. */ ++ incq %rsi ++ ++ rep movsb ++ ++ VMOVU %VEC(0), (%r8) ++# if MOVSB_ALIGN_TO > VEC_SIZE ++ VMOVU %VEC(1), VEC_SIZE(%r8) ++# endif + VZEROUPPER_RETURN ++# endif ++ ++ .p2align 4,, 12 ++L(movsb): ++ movq %rdi, %rcx ++ subq %rsi, %rcx ++ /* Go to backwards temporal copy if overlap no matter what as ++ backward REP MOVSB is slow and we don't want to use NT stores if ++ there is overlap. */ ++ cmpq %rdx, %rcx ++ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ ++ jb L(more_8x_vec_backward_check_nop) ++# if ALIGN_MOVSB ++ /* Save dest for storing aligning VECs later. */ ++ movq %rdi, %r8 ++# endif ++ /* If above __x86_rep_movsb_stop_threshold most likely is ++ candidate for NT moves aswell. 
*/ ++ cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP ++ jae L(large_memcpy_2x_check) ++# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB ++ /* Only avoid short movsb if CPU has FSRM. */ ++ testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) ++ jz L(skip_short_movsb_check) ++# if AVOID_SHORT_DISTANCE_REP_MOVSB ++ /* Avoid "rep movsb" if RCX, the distance between source and ++ destination, is N*4GB + [1..63] with N >= 0. */ ++ ++ /* ecx contains dst - src. Early check for backward copy ++ conditions means only case of slow movsb with src = dst + [0, ++ 63] is ecx in [-63, 0]. Use unsigned comparison with -64 check ++ for that case. */ ++ cmpl $-64, %ecx ++ ja L(more_8x_vec_forward) ++# endif ++# endif ++# if ALIGN_MOVSB ++# if MOVSB_ALIGN_TO > VEC_SIZE ++ VMOVU VEC_SIZE(%rsi), %VEC(1) ++# endif ++# if MOVSB_ALIGN_TO > (VEC_SIZE * 2) ++# error Unsupported MOVSB_ALIGN_TO ++# endif ++ /* Fall through means cpu has FSRM. In that case exclusively ++ align destination. */ ++L(movsb_align_dst): ++ /* Subtract dst from src. Add back after dst aligned. */ ++ subq %rdi, %rsi ++ /* Exclusively align dst to MOVSB_ALIGN_TO (64). */ ++ addq $(MOVSB_ALIGN_TO - 1), %rdi ++ /* Add dst to len. Subtract back after dst aligned. */ ++ leaq (%r8, %rdx), %rcx ++ /* Finish aligning dst. */ ++ andq $-(MOVSB_ALIGN_TO), %rdi ++ /* Restore src and len adjusted with new values for aligned dst. ++ */ ++ addq %rdi, %rsi ++ subq %rdi, %rcx ++ ++ rep movsb ++ ++ /* Store VECs loaded for aligning. */ ++ VMOVU %VEC(0), (%r8) ++# if MOVSB_ALIGN_TO > VEC_SIZE ++ VMOVU %VEC(1), VEC_SIZE(%r8) ++# endif ++ VZEROUPPER_RETURN ++# else /* !ALIGN_MOVSB. */ ++L(skip_short_movsb_check): ++ mov %RDX_LP, %RCX_LP ++ rep movsb ++ ret ++# endif ++#endif + ++ .p2align 4,, 10 + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +- .p2align 4 ++L(large_memcpy_2x_check): ++ cmp __x86_rep_movsb_threshold(%rip), %RDX_LP ++ jb L(more_8x_vec_check) + L(large_memcpy_2x): +- /* Compute absolute value of difference between source and +- destination. */ +- movq %rdi, %r9 +- subq %rsi, %r9 +- movq %r9, %r8 +- leaq -1(%r9), %rcx +- sarq $63, %r8 +- xorq %r8, %r9 +- subq %r8, %r9 +- /* Don't use non-temporal store if there is overlap between +- destination and source since destination may be in cache when +- source is loaded. */ +- cmpq %r9, %rdx +- ja L(more_8x_vec_check) ++ /* To reach this point it is impossible for dst > src and ++ overlap. Remaining to check is src > dst and overlap. rcx ++ already contains dst - src. Negate rcx to get src - dst. If ++ length > rcx then there is overlap and forward copy is best. */ ++ negq %rcx ++ cmpq %rcx, %rdx ++ ja L(more_8x_vec_forward) + + /* Cache align destination. First store the first 64 bytes then + adjust alignments. */ +- VMOVU (%rsi), %VEC(8) +-#if VEC_SIZE < 64 +- VMOVU VEC_SIZE(%rsi), %VEC(9) +-#if VEC_SIZE < 32 +- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) +- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) +-#endif +-#endif +- VMOVU %VEC(8), (%rdi) +-#if VEC_SIZE < 64 +- VMOVU %VEC(9), VEC_SIZE(%rdi) +-#if VEC_SIZE < 32 +- VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) +-#endif +-#endif ++ ++ /* First vec was also loaded into VEC(0). 
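The destination-alignment dance above (subtract src, round dst, add src back) is easier to follow in C. This is a hedged outline only, assuming no overlap, a length well above the movsb threshold, and MOVSB_ALIGN_TO == 64; movsb_align_dst_model and rep_movsb stand in for the real labels and the 'rep movsb' instruction:

#include <string.h>
#include <stdint.h>
#include <stddef.h>

#define MOVSB_ALIGN_TO 64

static void
rep_movsb (char *dst, const char *src, size_t n)   /* stand-in */
{
  memcpy (dst, src, n);
}

/* Model of L(movsb_align_dst): save the unaligned head in vector
   registers (a buffer here), round dst up to 64 bytes, run rep movsb
   from the aligned point, then write the saved head last.  */
static void
movsb_align_dst_model (char *dst, const char *src, size_t len)
{
  char head[MOVSB_ALIGN_TO];
  memcpy (head, src, MOVSB_ALIGN_TO);          /* VMOVU %VEC(0)/%VEC(1) */
  char *adst = (char *) (((uintptr_t) dst + MOVSB_ALIGN_TO - 1)
                         & -(uintptr_t) MOVSB_ALIGN_TO);
  size_t skip = (size_t) (adst - dst);
  rep_movsb (adst, src + skip, len - skip);
  memcpy (dst, head, MOVSB_ALIGN_TO);          /* VMOVU %VEC(0), (%r8) */
}

Rewriting the head with the same bytes after rep movsb is harmless here because this path is only reached when the buffers do not overlap.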
*/ ++# if VEC_SIZE < 64 ++ VMOVU VEC_SIZE(%rsi), %VEC(1) ++# if VEC_SIZE < 32 ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) ++# endif ++# endif ++ VMOVU %VEC(0), (%rdi) ++# if VEC_SIZE < 64 ++ VMOVU %VEC(1), VEC_SIZE(%rdi) ++# if VEC_SIZE < 32 ++ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) ++ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) ++# endif ++# endif ++ + /* Adjust source, destination, and size. */ + movq %rdi, %r8 + andq $63, %r8 +@@ -614,9 +767,13 @@ L(large_memcpy_2x): + /* Adjust length. */ + addq %r8, %rdx + +- /* Test if source and destination addresses will alias. If they do +- the larger pipeline in large_memcpy_4x alleviated the ++ /* Test if source and destination addresses will alias. If they ++ do the larger pipeline in large_memcpy_4x alleviated the + performance drop. */ ++ ++ /* ecx contains -(dst - src). not ecx will return dst - src - 1 ++ which works for testing aliasing. */ ++ notl %ecx + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx + jz L(large_memcpy_4x) + +@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer): + /* ecx stores inner loop counter. */ + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx + L(loop_large_memcpy_4x_inner): +- /* Only one prefetch set per page as doing 4 pages give more time +- for prefetcher to keep up. */ ++ /* Only one prefetch set per page as doing 4 pages give more ++ time for prefetcher to keep up. */ + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) diff --git a/glibc-upstream-2.34-182.patch b/glibc-upstream-2.34-182.patch new file mode 100644 index 0000000..563ff9d --- /dev/null +++ b/glibc-upstream-2.34-182.patch @@ -0,0 +1,131 @@ +commit cecbac52123456e2fbcff062a4165bf7b9174797 +Author: Noah Goldstein +Date: Mon Nov 1 00:49:52 2021 -0500 + + x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h + + No bug. + + This patch doubles the rep_movsb_threshold when using ERMS. Based on + benchmarks the vector copy loop, especially now that it handles 4k + aliasing, is better for these medium ranged. + + On Skylake with ERMS: + + Size, Align1, Align2, dst>src,(rep movsb) / (vec copy) + 4096, 0, 0, 0, 0.975 + 4096, 0, 0, 1, 0.953 + 4096, 12, 0, 0, 0.969 + 4096, 12, 0, 1, 0.872 + 4096, 44, 0, 0, 0.979 + 4096, 44, 0, 1, 0.83 + 4096, 0, 12, 0, 1.006 + 4096, 0, 12, 1, 0.989 + 4096, 0, 44, 0, 0.739 + 4096, 0, 44, 1, 0.942 + 4096, 12, 12, 0, 1.009 + 4096, 12, 12, 1, 0.973 + 4096, 44, 44, 0, 0.791 + 4096, 44, 44, 1, 0.961 + 4096, 2048, 0, 0, 0.978 + 4096, 2048, 0, 1, 0.951 + 4096, 2060, 0, 0, 0.986 + 4096, 2060, 0, 1, 0.963 + 4096, 2048, 12, 0, 0.971 + 4096, 2048, 12, 1, 0.941 + 4096, 2060, 12, 0, 0.977 + 4096, 2060, 12, 1, 0.949 + 8192, 0, 0, 0, 0.85 + 8192, 0, 0, 1, 0.845 + 8192, 13, 0, 0, 0.937 + 8192, 13, 0, 1, 0.939 + 8192, 45, 0, 0, 0.932 + 8192, 45, 0, 1, 0.927 + 8192, 0, 13, 0, 0.621 + 8192, 0, 13, 1, 0.62 + 8192, 0, 45, 0, 0.53 + 8192, 0, 45, 1, 0.516 + 8192, 13, 13, 0, 0.664 + 8192, 13, 13, 1, 0.659 + 8192, 45, 45, 0, 0.593 + 8192, 45, 45, 1, 0.575 + 8192, 2048, 0, 0, 0.854 + 8192, 2048, 0, 1, 0.834 + 8192, 2061, 0, 0, 0.863 + 8192, 2061, 0, 1, 0.857 + 8192, 2048, 13, 0, 0.63 + 8192, 2048, 13, 1, 0.629 + 8192, 2061, 13, 0, 0.627 + 8192, 2061, 13, 1, 0.62 + + Signed-off-by: Noah Goldstein + Reviewed-by: H.J. 
Lu + (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index e6c94dfd023a25dc..2e43e67e4f4037d3 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ + unsigned int minimum_rep_movsb_threshold; + #endif +- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ ++ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for ++ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB ++ threshold is 2048 * (VEC_SIZE / 16). */ + unsigned int rep_movsb_threshold; + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) + { +- rep_movsb_threshold = 2048 * (64 / 16); ++ rep_movsb_threshold = 4096 * (64 / 16); + #if HAVE_TUNABLES + minimum_rep_movsb_threshold = 64 * 8; + #endif +@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + else if (CPU_FEATURE_PREFERRED_P (cpu_features, + AVX_Fast_Unaligned_Load)) + { +- rep_movsb_threshold = 2048 * (32 / 16); ++ rep_movsb_threshold = 4096 * (32 / 16); + #if HAVE_TUNABLES + minimum_rep_movsb_threshold = 32 * 8; + #endif +diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list +index dd6e1d65c9490d4f..419313804d49cf65 100644 +--- a/sysdeps/x86/dl-tunables.list ++++ b/sysdeps/x86/dl-tunables.list +@@ -32,17 +32,21 @@ glibc { + } + x86_rep_movsb_threshold { + type: SIZE_T +- # Since there is overhead to set up REP MOVSB operation, REP MOVSB +- # isn't faster on short data. The memcpy micro benchmark in glibc +- # shows that 2KB is the approximate value above which REP MOVSB +- # becomes faster than SSE2 optimization on processors with Enhanced +- # REP MOVSB. Since larger register size can move more data with a +- # single load and store, the threshold is higher with larger register +- # size. Note: Since the REP MOVSB threshold must be greater than 8 +- # times of vector size and the default value is 2048 * (vector size +- # / 16), the default value and the minimum value must be updated at +- # run-time. NB: Don't set the default value since we can't tell if +- # the tunable value is set by user or not [BZ #27069]. ++ # Since there is overhead to set up REP MOVSB operation, REP ++ # MOVSB isn't faster on short data. The memcpy micro benchmark ++ # in glibc shows that 2KB is the approximate value above which ++ # REP MOVSB becomes faster than SSE2 optimization on processors ++ # with Enhanced REP MOVSB. Since larger register size can move ++ # more data with a single load and store, the threshold is ++ # higher with larger register size. Micro benchmarks show AVX ++ # REP MOVSB becomes faster apprximately at 8KB. The AVX512 ++ # threshold is extrapolated to 16KB. For machines with FSRM the ++ # threshold is universally set at 2112 bytes. Note: Since the ++ # REP MOVSB threshold must be greater than 8 times of vector ++ # size and the default value is 4096 * (vector size / 16), the ++ # default value and the minimum value must be updated at ++ # run-time. NB: Don't set the default value since we can't tell ++ # if the tunable value is set by user or not [BZ #27069]. 
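Put together, the defaults described above reduce to a small formula. A hedged C restatement (default_rep_movsb_threshold is an illustrative name; the real code in dl-cacheinfo.h also consults CPU feature bits and the tunables framework):

/* Model of the new REP MOVSB threshold defaults: 2048 * (16 / 16)
   for SSE2, 4096 * (32 / 16) for AVX, 4096 * (64 / 16) for AVX-512,
   never below 8 times the vector size.  */
static unsigned int
default_rep_movsb_threshold (unsigned int vec_size)
{
  unsigned int base = (vec_size >= 32) ? 4096 : 2048;
  unsigned int threshold = base * (vec_size / 16);
  unsigned int minimum = vec_size * 8;
  return (threshold < minimum) ? minimum : threshold;
}

That yields 2 KB, 8 KB, and 16 KB for 16-, 32-, and 64-byte vectors, matching the commit message's benchmark-derived crossover points.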
+ minval: 1
+ }
+ x86_rep_stosb_threshold {
diff --git a/glibc-upstream-2.34-183.patch b/glibc-upstream-2.34-183.patch
new file mode 100644
index 0000000..a1a7285
--- /dev/null
+++ b/glibc-upstream-2.34-183.patch
@@ -0,0 +1,2423 @@
+commit 7cb126e7e7febf9dc3e369cc3e4885e34fb9433b
+Author: Noah Goldstein
+Date: Wed Nov 10 16:18:56 2021 -0600
+
+ x86: Shrink memcmp-sse4.S code size
+
+ No bug.
+
+ This implementation refactors memcmp-sse4.S primarily with minimizing
+ code size in mind. It does this by removing the lookup table logic and
+ removing the unrolled check from (256, 512] bytes.
+
+ memcmp-sse4 code size reduction : -3487 bytes
+ wmemcmp-sse4 code size reduction: -1472 bytes
+
+ The current memcmp-sse4.S implementation has a large code size
+ cost. This has serious adverse effects on the ICache / ITLB. While
+ in micro-benchmarks the implementation appears fast, traces of
+ real-world code have shown that the speed in micro benchmarks does not
+ translate when the ICache/ITLB are not primed, and that the cost
+ of the code size has measurable negative effects on overall
+ application performance.
+
+ See https://research.google/pubs/pub48320/ for more details.
+
+ Signed-off-by: Noah Goldstein
+ Reviewed-by: H.J. Lu
+ (cherry picked from commit 2f9062d7171850451e6044ef78d91ff8c017b9c0)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+index b7ac034569ec6178..97c102a9c5ab2b91 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+@@ -25,14 +25,14 @@
+ # define MEMCMP __memcmp_sse4_1
+ # endif
+
+-# define JMPTBL(I, B) (I - B)
++#ifdef USE_AS_WMEMCMP
++# define CMPEQ pcmpeqd
++# define CHAR_SIZE 4
++#else
++# define CMPEQ pcmpeqb
++# define CHAR_SIZE 1
++#endif
+
+-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+- lea TABLE(%rip), %r11; \
+- movslq (%r11, INDEX, SCALE), %rcx; \
+- add %r11, %rcx; \
+- _CET_NOTRACK jmp *%rcx; \
+- ud2
+
+ /* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+@@ -47,33 +47,253 @@ ENTRY (MEMCMP)
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
+ # endif
+- pxor %xmm0, %xmm0
+ cmp $79, %RDX_LP
+ ja L(79bytesormore)
++
++ cmp $CHAR_SIZE, %RDX_LP
++ jbe L(firstbyte)
++
++ /* N in (CHAR_SIZE, 79) bytes.
*/ ++ cmpl $32, %edx ++ ja L(more_32_bytes) ++ ++ cmpl $16, %edx ++ jae L(16_to_32_bytes) ++ + # ifndef USE_AS_WMEMCMP +- cmp $1, %RDX_LP +- je L(firstbyte) ++ cmpl $8, %edx ++ jae L(8_to_16_bytes) ++ ++ cmpl $4, %edx ++ jb L(2_to_3_bytes) ++ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ ++ bswap %eax ++ bswap %ecx ++ ++ shlq $32, %rax ++ shlq $32, %rcx ++ ++ movl -4(%rdi, %rdx), %edi ++ movl -4(%rsi, %rdx), %esi ++ ++ bswap %edi ++ bswap %esi ++ ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(2_to_3_bytes): ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ subl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(8_to_16_bytes): ++ movq (%rdi), %rax ++ movq (%rsi), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ jne L(8_to_16_bytes_done) ++ ++ movq -8(%rdi, %rdx), %rax ++ movq -8(%rsi, %rdx), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ ++L(8_to_16_bytes_done): ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++# else ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl 4(%rdi), %ecx ++ cmpl 4(%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl -4(%rdi, %rdx), %ecx ++ cmpl -4(%rsi, %rdx), %ecx ++ jne L(8_to_16_bytes_done) ++ ret + # endif +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-# ifndef USE_AS_WMEMCMP +- .p2align 4 ++ .p2align 4,, 3 ++L(ret_zero): ++ xorl %eax, %eax ++L(zero): ++ ret ++ ++ .p2align 4,, 8 + L(firstbyte): ++ jb L(ret_zero) ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ je L(zero) ++L(8_to_16_bytes_done): ++ setg %al ++ leal -1(%rax, %rax), %eax ++# else + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax ++# endif + ret ++ ++ .p2align 4 ++L(vec_return_begin_48): ++ addq $16, %rdi ++ addq $16, %rsi ++L(vec_return_begin_32): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl 32(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl 32(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl 32(%rsi, %rax), %ecx ++ movzbl 32(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_begin_16): ++ addq $16, %rdi ++ addq $16, %rsi ++L(vec_return_begin): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_end_16): ++ subl $16, %edx ++L(vec_return_end): ++ bsfl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -16(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -16(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -16(%rsi, %rax), %ecx ++ movzbl -16(%rdi, %rax), %eax ++ subl %ecx, %eax + # endif ++ ret ++ ++ .p2align 4,, 8 ++L(more_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ cmpl $64, %edx ++ jbe L(32_to_64_bytes) ++ movdqu 32(%rdi), %xmm0 ++ movdqu 32(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ 
pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ .p2align 4,, 6 ++L(32_to_64_bytes): ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ ++ .p2align 4 ++L(16_to_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ + + .p2align 4 + L(79bytesormore): ++ movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm1 +- movdqu (%rdi), %xmm2 +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi +@@ -86,1694 +306,499 @@ L(79bytesormore): + + cmp $128, %rdx + ja L(128bytesormore) +-L(less128bytes): +- sub $64, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) + +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(less128bytes): ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(last_64_bytes): ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + ++ .p2align 4 + L(128bytesormore): +- cmp $512, %rdx +- ja L(512bytesormore) + cmp $256, %rdx +- ja L(less512bytes) ++ ja L(unaligned_loop) + L(less256bytes): +- sub $128, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc 
L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi +- +- cmp $64, %rdx +- jae L(less128bytes) +- +- cmp $32, %rdx +- jb L(less32bytesin128) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +-L(less512bytes): +- sub $256, %rdx +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqu 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqu 144(%rdi), %xmm2 +- pxor 144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqu 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqu 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqu 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqu 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqu 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqu 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- cmp $128, %rdx +- jae L(less256bytes) ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ 
movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytes) ++ ja L(less128bytes) + + cmp $32, %rdx +- jb L(less32bytesin256) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormore): ++L(unaligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add %r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_unaglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_unaligned) + sub $64, %rdx + .p2align 4 + L(64bytesormore_loop): +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loop) ++ ja L(64bytesormore_loop) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(loop_tail): ++ addq %rdx, %rdi ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ addq %rdx, %rsi ++ movdqu (%rsi), %xmm4 ++ movdqu 16(%rsi), %xmm5 ++ movdqu 32(%rsi), %xmm6 ++ movdqu 48(%rsi), %xmm7 ++ ++ CMPEQ %xmm4, %xmm0 ++ CMPEQ %xmm5, %xmm1 ++ CMPEQ %xmm6, %xmm2 ++ CMPEQ %xmm7, %xmm3 ++ ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 ++ ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) ++ ret + +-L(L2_L3_cache_unaglined): +- sub $64, %rdx ++L(L2_L3_cache_unaligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand 
%xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(L2_L3_unaligned_128bytes_loop) ++ ja L(L2_L3_unaligned_128bytes_loop) ++ jmp L(loop_tail) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-/* +- * This case is for machines which are sensitive for unaligned instructions. +- */ ++ /* This case is for machines which are sensitive for unaligned ++ * instructions. */ + .p2align 4 + L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) + L(less128bytesin2aligned): +- sub $64, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64in2alinged) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64in2alinged): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(aligned_last_64_bytes): ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 + L(128bytesormorein2aligned): +- cmp $512, %rdx +- ja L(512bytesormorein2aligned) + cmp $256, %rdx +- ja L(256bytesormorein2aligned) ++ ja L(aligned_loop) + L(less256bytesin2alinged): +- sub $128, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- 
pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytesin2aligned) ++ ja L(less128bytesin2aligned) + + cmp $32, %rdx +- jb L(less32bytesin128in2aligned) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128in2aligned): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +- .p2align 4 +-L(256bytesormorein2aligned): +- +- sub $256, %rdx +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqa 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqa 144(%rdi), %xmm2 +- pxor 144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqa 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqa 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqa 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqa 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqa 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqa 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- 
cmp $128, %rdx +- jae L(less256bytesin2alinged) +- +- cmp $64, %rdx +- jae L(less128bytesin2aligned) +- +- cmp $32, %rdx +- jb L(less32bytesin256in2alinged) +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256in2alinged): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(aligned_last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormorein2aligned): ++L(aligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add %r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_aglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_aligned) + + sub $64, %rdx + .p2align 4 + L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 ++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loopin2aligned) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +-L(L2_L3_cache_aglined): +- sub $64, %rdx ++ ja L(64bytesormore_loopin2aligned) ++ jmp L(loop_tail) + ++L(L2_L3_cache_aligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 ++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- jae L(L2_L3_aligned_128bytes_loop) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + ++ addq $64, %rsi ++ addq $64, %rdi ++ subq $64, %rdx ++ ja L(L2_L3_aligned_128bytes_loop) ++ jmp L(loop_tail) + + .p2align 4 + L(64bytesormore_loop_end): +- add $16, %rdi +- add $16, %rsi +- ptest %xmm2, %xmm0 +- jnc L(16bytes) +- +- add 
$16, %rdi +- add $16, %rsi +- ptest %xmm3, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- ptest %xmm4, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- jmp L(16bytes) +- +-L(256bytesin256): +- add $256, %rdi +- add $256, %rsi +- jmp L(16bytes) +-L(240bytesin256): +- add $240, %rdi +- add $240, %rsi +- jmp L(16bytes) +-L(224bytesin256): +- add $224, %rdi +- add $224, %rsi +- jmp L(16bytes) +-L(208bytesin256): +- add $208, %rdi +- add $208, %rsi +- jmp L(16bytes) +-L(192bytesin256): +- add $192, %rdi +- add $192, %rsi +- jmp L(16bytes) +-L(176bytesin256): +- add $176, %rdi +- add $176, %rsi +- jmp L(16bytes) +-L(160bytesin256): +- add $160, %rdi +- add $160, %rsi +- jmp L(16bytes) +-L(144bytesin256): +- add $144, %rdi +- add $144, %rsi +- jmp L(16bytes) +-L(128bytesin256): +- add $128, %rdi +- add $128, %rsi +- jmp L(16bytes) +-L(112bytesin256): +- add $112, %rdi +- add $112, %rsi +- jmp L(16bytes) +-L(96bytesin256): +- add $96, %rdi +- add $96, %rsi +- jmp L(16bytes) +-L(80bytesin256): +- add $80, %rdi +- add $80, %rsi +- jmp L(16bytes) +-L(64bytesin256): +- add $64, %rdi +- add $64, %rsi +- jmp L(16bytes) +-L(48bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(32bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytes): +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(8bytes): +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(12bytes): +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(4bytes): +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +-L(0bytes): +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal case for wmemcmp */ +- .p2align 4 +-L(65bytes): +- movdqu -65(%rdi), %xmm1 +- movdqu -65(%rsi), %xmm2 +- mov $-65, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(49bytes): +- movdqu -49(%rdi), %xmm1 +- movdqu -49(%rsi), %xmm2 +- mov $-49, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(33bytes): +- movdqu -33(%rdi), %xmm1 +- movdqu -33(%rsi), %xmm2 +- mov $-33, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(17bytes): +- mov -17(%rdi), %rax +- mov -17(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(9bytes): +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- ret +- +- .p2align 4 +-L(13bytes): +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(5bytes): +- mov -5(%rdi), %eax +- mov -5(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- ret +- +- .p2align 4 +-L(66bytes): +- movdqu -66(%rdi), %xmm1 +- movdqu -66(%rsi), %xmm2 +- mov $-66, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(50bytes): +- movdqu -50(%rdi), %xmm1 +- movdqu -50(%rsi), %xmm2 +- mov $-50, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(34bytes): +- movdqu -34(%rdi), %xmm1 +- movdqu -34(%rsi), %xmm2 +- mov $-34, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) 
+-L(18bytes): +- mov -18(%rdi), %rax +- mov -18(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(10bytes): +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(14bytes): +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(6bytes): +- mov -6(%rdi), %eax +- mov -6(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +-L(2bytes): +- movzwl -2(%rsi), %ecx +- movzwl -2(%rdi), %eax +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(67bytes): +- movdqu -67(%rdi), %xmm2 +- movdqu -67(%rsi), %xmm1 +- mov $-67, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(51bytes): +- movdqu -51(%rdi), %xmm2 +- movdqu -51(%rsi), %xmm1 +- mov $-51, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(35bytes): +- movdqu -35(%rsi), %xmm1 +- movdqu -35(%rdi), %xmm2 +- mov $-35, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(19bytes): +- mov -19(%rdi), %rax +- mov -19(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(11bytes): +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(15bytes): +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(7bytes): +- mov -7(%rdi), %eax +- mov -7(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(3bytes): +- movzwl -3(%rdi), %eax +- movzwl -3(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin2bytes) +-L(1bytes): +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +-# endif +- +- .p2align 4 +-L(68bytes): +- movdqu -68(%rdi), %xmm2 +- movdqu -68(%rsi), %xmm1 +- mov $-68, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(52bytes): +- movdqu -52(%rdi), %xmm2 +- movdqu -52(%rsi), %xmm1 +- mov $-52, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(36bytes): +- movdqu -36(%rdi), %xmm2 +- movdqu -36(%rsi), %xmm1 +- mov $-36, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(20bytes): +- movdqu -20(%rdi), %xmm2 +- movdqu -20(%rsi), %xmm1 +- mov $-20, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -4(%rsi), %ecx +- +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(69bytes): +- movdqu -69(%rsi), %xmm1 +- movdqu -69(%rdi), %xmm2 +- mov $-69, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(53bytes): +- movdqu -53(%rsi), %xmm1 +- movdqu -53(%rdi), %xmm2 +- mov $-53, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(37bytes): +- movdqu -37(%rsi), %xmm1 +- movdqu -37(%rdi), %xmm2 +- mov 
$-37, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(21bytes): +- movdqu -21(%rsi), %xmm1 +- movdqu -21(%rdi), %xmm2 +- mov $-21, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(70bytes): +- movdqu -70(%rsi), %xmm1 +- movdqu -70(%rdi), %xmm2 +- mov $-70, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(54bytes): +- movdqu -54(%rsi), %xmm1 +- movdqu -54(%rdi), %xmm2 +- mov $-54, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(38bytes): +- movdqu -38(%rsi), %xmm1 +- movdqu -38(%rdi), %xmm2 +- mov $-38, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(22bytes): +- movdqu -22(%rsi), %xmm1 +- movdqu -22(%rdi), %xmm2 +- mov $-22, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(71bytes): +- movdqu -71(%rsi), %xmm1 +- movdqu -71(%rdi), %xmm2 +- mov $-71, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(55bytes): +- movdqu -55(%rdi), %xmm2 +- movdqu -55(%rsi), %xmm1 +- mov $-55, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(39bytes): +- movdqu -39(%rdi), %xmm2 +- movdqu -39(%rsi), %xmm1 +- mov $-39, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(23bytes): +- movdqu -23(%rdi), %xmm2 +- movdqu -23(%rsi), %xmm1 +- mov $-23, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- +- .p2align 4 +-L(72bytes): +- movdqu -72(%rsi), %xmm1 +- movdqu -72(%rdi), %xmm2 +- mov $-72, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(56bytes): +- movdqu -56(%rdi), %xmm2 +- movdqu -56(%rsi), %xmm1 +- mov $-56, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(40bytes): +- movdqu -40(%rdi), %xmm2 +- movdqu -40(%rsi), %xmm1 +- mov $-40, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(24bytes): +- movdqu -24(%rdi), %xmm2 +- movdqu -24(%rsi), %xmm1 +- mov $-24, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -8(%rsi), %rcx +- mov -8(%rdi), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(73bytes): +- movdqu -73(%rsi), %xmm1 +- movdqu -73(%rdi), %xmm2 +- mov $-73, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(57bytes): +- movdqu -57(%rdi), %xmm2 +- movdqu -57(%rsi), %xmm1 +- mov $-57, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(41bytes): +- movdqu -41(%rdi), %xmm2 +- movdqu -41(%rsi), %xmm1 +- mov $-41, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(25bytes): +- movdqu -25(%rdi), %xmm2 +- movdqu -25(%rsi), %xmm1 +- mov $-25, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(74bytes): +- movdqu -74(%rsi), %xmm1 +- movdqu -74(%rdi), %xmm2 +- mov $-74, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(58bytes): +- 
movdqu -58(%rdi), %xmm2 +- movdqu -58(%rsi), %xmm1 +- mov $-58, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(42bytes): +- movdqu -42(%rdi), %xmm2 +- movdqu -42(%rsi), %xmm1 +- mov $-42, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(26bytes): +- movdqu -26(%rdi), %xmm2 +- movdqu -26(%rsi), %xmm1 +- mov $-26, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- jmp L(diffin2bytes) +- +- .p2align 4 +-L(75bytes): +- movdqu -75(%rsi), %xmm1 +- movdqu -75(%rdi), %xmm2 +- mov $-75, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(59bytes): +- movdqu -59(%rdi), %xmm2 +- movdqu -59(%rsi), %xmm1 +- mov $-59, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(43bytes): +- movdqu -43(%rdi), %xmm2 +- movdqu -43(%rsi), %xmm1 +- mov $-43, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(27bytes): +- movdqu -27(%rdi), %xmm2 +- movdqu -27(%rsi), %xmm1 +- mov $-27, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(76bytes): +- movdqu -76(%rsi), %xmm1 +- movdqu -76(%rdi), %xmm2 +- mov $-76, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(60bytes): +- movdqu -60(%rdi), %xmm2 +- movdqu -60(%rsi), %xmm1 +- mov $-60, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(44bytes): +- movdqu -44(%rdi), %xmm2 +- movdqu -44(%rsi), %xmm1 +- mov $-44, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(28bytes): +- movdqu -28(%rdi), %xmm2 +- movdqu -28(%rsi), %xmm1 +- mov $-28, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(77bytes): +- movdqu -77(%rsi), %xmm1 +- movdqu -77(%rdi), %xmm2 +- mov $-77, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(61bytes): +- movdqu -61(%rdi), %xmm2 +- movdqu -61(%rsi), %xmm1 +- mov $-61, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(45bytes): +- movdqu -45(%rdi), %xmm2 +- movdqu -45(%rsi), %xmm1 +- mov $-45, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(29bytes): +- movdqu -29(%rdi), %xmm2 +- movdqu -29(%rsi), %xmm1 +- mov $-29, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(78bytes): +- movdqu -78(%rsi), %xmm1 +- movdqu -78(%rdi), %xmm2 +- mov $-78, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(62bytes): +- movdqu -62(%rdi), %xmm2 +- movdqu -62(%rsi), %xmm1 +- mov $-62, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(46bytes): +- movdqu -46(%rdi), %xmm2 +- movdqu -46(%rsi), 
%xmm1 +- mov $-46, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(30bytes): +- movdqu -30(%rdi), %xmm2 +- movdqu -30(%rsi), %xmm1 +- mov $-30, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(79bytes): +- movdqu -79(%rsi), %xmm1 +- movdqu -79(%rdi), %xmm2 +- mov $-79, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(63bytes): +- movdqu -63(%rdi), %xmm2 +- movdqu -63(%rsi), %xmm1 +- mov $-63, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(47bytes): +- movdqu -47(%rdi), %xmm2 +- movdqu -47(%rsi), %xmm1 +- mov $-47, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(31bytes): +- movdqu -31(%rdi), %xmm2 +- movdqu -31(%rsi), %xmm1 +- mov $-31, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(64bytes): +- movdqu -64(%rdi), %xmm2 +- movdqu -64(%rsi), %xmm1 +- mov $-64, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(48bytes): +- movdqu -48(%rdi), %xmm2 +- movdqu -48(%rsi), %xmm1 +- mov $-48, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(32bytes): +- movdqu -32(%rdi), %xmm2 +- movdqu -32(%rsi), %xmm1 +- mov $-32, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-/* +- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. 
+- */ +- .p2align 3 +-L(less16bytes): +- movsbq %dl, %rdx +- mov (%rsi, %rdx), %rcx +- mov (%rdi, %rdx), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov 8(%rsi, %rdx), %rcx +- mov 8(%rdi, %rdx), %rax +-L(diffin8bytes): +- cmp %eax, %ecx +- jne L(diffin4bytes) +- shr $32, %rcx +- shr $32, %rax +- ++ pmovmskb %xmm0, %ecx ++ incw %cx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm1, %ecx ++ notw %cx ++ sall $16, %ecx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm2, %ecx ++ notw %cx ++ shlq $32, %rcx ++ jnz L(loop_end_ret) ++ ++ addq $48, %rdi ++ addq $48, %rsi ++ movq %rax, %rcx ++ ++ .p2align 4,, 6 ++L(loop_end_ret): ++ bsfq %rcx, %rcx + # ifdef USE_AS_WMEMCMP +-/* for wmemcmp */ +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- +-L(diffin4bytes): +-# ifndef USE_AS_WMEMCMP +- cmp %cx, %ax +- jne L(diffin2bytes) +- shr $16, %ecx +- shr $16, %eax +-L(diffin2bytes): +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(end): +- and $0xff, %eax +- and $0xff, %ecx +- sub %ecx, %eax +- ret ++ movl (%rdi, %rcx), %eax ++ xorl %edx, %edx ++ cmpl (%rsi, %rcx), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- +-/* for wmemcmp */ +- mov $1, %eax +- jl L(nequal_bigger) +- neg %eax +- ret +- +- .p2align 4 +-L(nequal_bigger): +- ret +- +-L(unreal_case): +- xor %eax, %eax +- ret ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +- ++ ret + END (MEMCMP) +- +- .section .rodata.sse4.1,"a",@progbits +- .p2align 3 +-# ifndef USE_AS_WMEMCMP +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(1bytes), L(table_64bytes)) +- .int JMPTBL (L(2bytes), L(table_64bytes)) +- .int JMPTBL (L(3bytes), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(5bytes), L(table_64bytes)) +- .int JMPTBL (L(6bytes), L(table_64bytes)) +- .int JMPTBL (L(7bytes), L(table_64bytes)) +- .int JMPTBL (L(8bytes), L(table_64bytes)) +- .int JMPTBL (L(9bytes), L(table_64bytes)) +- .int JMPTBL (L(10bytes), L(table_64bytes)) +- .int JMPTBL (L(11bytes), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(13bytes), L(table_64bytes)) +- .int JMPTBL (L(14bytes), L(table_64bytes)) +- .int JMPTBL (L(15bytes), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(17bytes), L(table_64bytes)) +- .int JMPTBL (L(18bytes), L(table_64bytes)) +- .int JMPTBL (L(19bytes), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(21bytes), L(table_64bytes)) +- .int JMPTBL (L(22bytes), L(table_64bytes)) +- .int JMPTBL (L(23bytes), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(25bytes), L(table_64bytes)) +- .int JMPTBL (L(26bytes), L(table_64bytes)) +- .int JMPTBL (L(27bytes), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(29bytes), L(table_64bytes)) +- .int JMPTBL (L(30bytes), L(table_64bytes)) +- .int JMPTBL (L(31bytes), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(33bytes), L(table_64bytes)) +- .int JMPTBL (L(34bytes), L(table_64bytes)) +- .int JMPTBL (L(35bytes), L(table_64bytes)) +- .int JMPTBL (L(36bytes), L(table_64bytes)) +- .int JMPTBL (L(37bytes), L(table_64bytes)) +- .int JMPTBL (L(38bytes), L(table_64bytes)) +- .int JMPTBL (L(39bytes), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(41bytes), L(table_64bytes)) +- .int JMPTBL (L(42bytes), 
L(table_64bytes)) +- .int JMPTBL (L(43bytes), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(45bytes), L(table_64bytes)) +- .int JMPTBL (L(46bytes), L(table_64bytes)) +- .int JMPTBL (L(47bytes), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(49bytes), L(table_64bytes)) +- .int JMPTBL (L(50bytes), L(table_64bytes)) +- .int JMPTBL (L(51bytes), L(table_64bytes)) +- .int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(53bytes), L(table_64bytes)) +- .int JMPTBL (L(54bytes), L(table_64bytes)) +- .int JMPTBL (L(55bytes), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(57bytes), L(table_64bytes)) +- .int JMPTBL (L(58bytes), L(table_64bytes)) +- .int JMPTBL (L(59bytes), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(61bytes), L(table_64bytes)) +- .int JMPTBL (L(62bytes), L(table_64bytes)) +- .int JMPTBL (L(63bytes), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(65bytes), L(table_64bytes)) +- .int JMPTBL (L(66bytes), L(table_64bytes)) +- .int JMPTBL (L(67bytes), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(69bytes), L(table_64bytes)) +- .int JMPTBL (L(70bytes), L(table_64bytes)) +- .int JMPTBL (L(71bytes), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(73bytes), L(table_64bytes)) +- .int JMPTBL (L(74bytes), L(table_64bytes)) +- .int JMPTBL (L(75bytes), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(77bytes), L(table_64bytes)) +- .int JMPTBL (L(78bytes), L(table_64bytes)) +- .int JMPTBL (L(79bytes), L(table_64bytes)) +-# else +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(8bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(36bytes), 
L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +-# endif + #endif diff --git a/glibc-upstream-2.34-184.patch b/glibc-upstream-2.34-184.patch new file mode 100644 index 0000000..805f91e --- /dev/null +++ b/glibc-upstream-2.34-184.patch @@ -0,0 +1,104 @@ +commit 4bbd0f866ad0ff197f72346f776ebee9b7e1a706 +Author: Noah Goldstein +Date: Fri Dec 3 15:29:25 2021 -0800 + + x86-64: Use notl in EVEX strcmp [BZ #28646] + + Must use notl %edi here as lower bits are for CHAR comparisons + potentially out of range thus can be 0 without indicating mismatch. + This fixes BZ #28646. + + Co-Authored-By: H.J. 
Lu
+ (cherry picked from commit 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02)
+
+diff --git a/string/test-strcmp.c b/string/test-strcmp.c
+index 7feababf4ddc5603..a0255b9625fbcedd 100644
+--- a/string/test-strcmp.c
++++ b/string/test-strcmp.c
+@@ -25,6 +25,7 @@
+ # define TEST_NAME "strcmp"
+ #endif
+ #include "test-string.h"
++#include
+
+ #ifdef WIDE
+ # include
+@@ -392,6 +393,32 @@ check2 (void)
+ }
+ }
+
++static void
++check3 (void)
++{
++ size_t size = 0xd000 + 0x4000;
++ CHAR *s1, *s2;
++ CHAR *buffer1 = mmap (NULL, size, PROT_READ | PROT_WRITE,
++ MAP_PRIVATE | MAP_ANON, -1, 0);
++ CHAR *buffer2 = mmap (NULL, size, PROT_READ | PROT_WRITE,
++ MAP_PRIVATE | MAP_ANON, -1, 0);
++ if (buffer1 == MAP_FAILED || buffer2 == MAP_FAILED)
++ error (EXIT_UNSUPPORTED, errno, "mmap failed");
++
++ s1 = (CHAR *) (buffer1 + 0x8f8 / sizeof (CHAR));
++ s2 = (CHAR *) (buffer2 + 0xcff3 / sizeof (CHAR));
++
++ STRCPY(s1, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/util/PathDocFileFactory.java"));
++ STRCPY(s2, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/taglets/ThrowsTaglet.java"));
++
++ int exp_result = SIMPLE_STRCMP (s1, s2);
++ FOR_EACH_IMPL (impl, 0)
++ check_result (impl, s1, s2, exp_result);
++
++ munmap ((void *) buffer1, size);
++ munmap ((void *) buffer2, size);
++}
++
+ int
+ test_main (void)
+ {
+@@ -400,6 +427,7 @@ test_main (void)
+ test_init ();
+ check();
+ check2 ();
++ check3 ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 82f12ac89bcae20b..6f5c4bf984da2b80 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -656,12 +656,13 @@ L(loop_cross_page):
+ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
+ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+ kmovd %k3, %edi
++ /* Must use notl %edi here as lower bits are for CHAR
++ comparisons potentially out of range thus can be 0 without
++ indicating mismatch. */
++ notl %edi
+ # ifdef USE_AS_WCSCMP
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
+- notl %edi
+ andl $0xff, %edi
+-# else
+- incl %edi
+ # endif
+
+ # ifdef USE_AS_WCSCMP
+@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
+ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
+ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+ kmovd %k3, %edi
++ /* Must use notl %edi here as lower bits are for CHAR
++ comparisons potentially out of range thus can be 0 without
++ indicating mismatch. */
++ notl %edi
+ # ifdef USE_AS_WCSCMP
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
+- notl %edi
+ andl $0xff, %edi
+-# else
+- incl %edi
+ # endif
+
+ # ifdef USE_AS_WCSCMP
diff --git a/glibc-upstream-2.34-185.patch b/glibc-upstream-2.34-185.patch
new file mode 100644
index 0000000..f06f86f
--- /dev/null
+++ b/glibc-upstream-2.34-185.patch
@@ -0,0 +1,30 @@
+commit f3a99b2216114f89b20329ae7664b764248b4bbd
+Author: H.J. Lu
+Date: Mon Dec 6 07:14:12 2021 -0800
+
+ x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
+
+ Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
+ they won't lower CPU frequency when ZMM load and store instructions are
+ used.
+
+ (cherry picked from commit ceeffe968c01b1202e482f4855cb6baf5c6cb713)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index f4d4049e391cbabd..09590d8794b1c6fb 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -566,8 +566,11 @@ disable_tsx:
+ |= bit_arch_Prefer_No_VZEROUPPER;
+ else
+ {
+- cpu_features->preferred[index_arch_Prefer_No_AVX512]
+- |= bit_arch_Prefer_No_AVX512;
++ /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
++ when ZMM load and store instructions are used. */
++ if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
++ cpu_features->preferred[index_arch_Prefer_No_AVX512]
++ |= bit_arch_Prefer_No_AVX512;
+
+ /* Avoid RTM abort triggered by VZEROUPPER inside a
+ transactionally executing RTM region. */
diff --git a/glibc-upstream-2.34-186.patch b/glibc-upstream-2.34-186.patch
new file mode 100644
index 0000000..a046844
--- /dev/null
+++ b/glibc-upstream-2.34-186.patch
@@ -0,0 +1,384 @@
+commit c796418d00f65c8c5fbed477f3ba6da2bee64ece
+Author: Noah Goldstein
+Date: Fri Dec 24 18:54:41 2021 -0600
+
+ x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+
+ No bug.
+ Optimizations are twofold.
+
+ 1) Replace page cross and 0/1 checks with masked load instructions in
+ L(less_vec). In applications this reduces branch-misses in the
+ hot [0, 32] case.
+ 2) Change control flow so that the L(less_vec) case gets the fall through.
+
+ Change 2) helps copies in the [0, 32] size range but comes at the cost
+ of copies in the [33, 64] size range. From profiles of GCC and
+ Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
+ appears to be the right tradeoff.
+
+ Signed-off-by: Noah Goldstein
+ Reviewed-by: H.J. Lu
+ (cherry picked from commit abddd61de090ae84e380aff68a98bd94ef704667)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 640f6757fac8a356..d2899e7c7078cd41 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -62,15 +62,18 @@ Latency:
+ # define VMOVU vmovdqu64
+
+ # ifdef USE_AS_WMEMCMP
++# define VMOVU_MASK vmovdqu32
+ # define CHAR_SIZE 4
+ # define VPCMP vpcmpd
+ # define VPTEST vptestmd
+ # else
++# define VMOVU_MASK vmovdqu8
+ # define CHAR_SIZE 1
+ # define VPCMP vpcmpub
+ # define VPTEST vptestmb
+ # endif
+
++
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
+ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ movl %edx, %edx
+ # endif
+ cmp $CHAR_PER_VEC, %RDX_LP
+- jb L(less_vec)
++ /* Fall through for [0, VEC_SIZE] as it's the hottest. */
++ ja L(more_1x_vec)
++
++ /* Create mask for CHAR's we want to compare. This allows us to
++ avoid having to include page cross logic. */
++ movl $-1, %ecx
++ bzhil %edx, %ecx, %ecx
++ kmovd %ecx, %k2
++
++ /* Safe to load full ymm with mask. */
++ VMOVU_MASK (%rsi), %YMM2{%k2}
++ VPCMP $4,(%rdi), %YMM2, %k1{%k2}
++ kmovd %k1, %eax
++ testl %eax, %eax
++ jnz L(return_vec_0)
++ ret
+
++ .p2align 4
++L(return_vec_0):
++ tzcntl %eax, %eax
++# ifdef USE_AS_WMEMCMP
++ movl (%rdi, %rax, CHAR_SIZE), %ecx
++ xorl %edx, %edx
++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
++ /* NB: no partial register stall here because xorl zero idiom
++ above. */
++ setg %dl
++ leal -1(%rdx, %rdx), %eax
++# else
++ movzbl (%rsi, %rax), %ecx
++ movzbl (%rdi, %rax), %eax
++ subl %ecx, %eax
++# endif
++ ret
++
++
++ .p2align 4
++L(more_1x_vec):
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE.
*/ + VMOVU (%rsi), %YMM1 + /* Use compare not equals to directly check for mismatch. */ +- VPCMP $4, (%rdi), %YMM1, %k1 ++ VPCMP $4,(%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to + L(return_vec_[0,2]). For L(return_vec_3) destination register +@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6) + + /* Check third and fourth VEC no matter what. */ + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_3) +@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6) + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while + oring with YMM1. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + + /* Or together YMM2, YMM3, and YMM4 into YMM4. */ + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 +@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6) + /* NB: eax must be zero to reach here. */ + ret + +- .p2align 4 ++ ++ .p2align 4,, 8 + L(8x_end_return_vec_0_1_2_3): + movq %rdx, %rdi + L(8x_return_vec_0_1_2_3): +@@ -222,23 +262,6 @@ L(return_vec_3): + # endif + ret + +- .p2align 4 +-L(return_vec_0): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. */ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret + + .p2align 4 + L(return_vec_1): +@@ -297,7 +320,7 @@ L(loop_4x_vec): + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -324,7 +347,7 @@ L(loop_4x_vec): + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -336,14 +359,14 @@ L(loop_4x_vec): + /* Only entry is from L(more_8x_vec). */ + .p2align 4,, 10 + L(8x_last_2x_vec): +- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_2) + /* Naturally aligned to 16 bytes. */ + L(8x_last_1x_vec): + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 +- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_3) +@@ -392,7 +415,9 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4,, 10 ++ ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. */ + L(return_vec_1_end): + /* Use bsf to save code size. This is necessary to have + L(one_or_less) fit in aligning bytes between. 
*/ +@@ -411,31 +436,8 @@ L(return_vec_1_end): + # endif + ret + +- /* NB: L(one_or_less) fits in alignment padding between +- L(return_vec_1_end) and L(return_vec_0_end). */ +-# ifdef USE_AS_WMEMCMP +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax +- ret +-# else +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +- ret +-# endif +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. */ + L(return_vec_0_end): + tzcntl %eax, %eax + addl %edx, %eax +@@ -451,146 +453,7 @@ L(return_vec_0_end): + subl %ecx, %eax + # endif + ret ++ /* 1-byte until next cache line. */ + +- .p2align 4 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size == 0 +- but is also faster for size == CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) +- +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. */ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) +- +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Check if any matches where in bounds. Intentionally not +- storing result in eax to limit dependency chain if it goes to +- L(return_vec_0_lv). */ +- bzhil %edx, %eax, %edx +- jnz L(return_vec_0_lv) +- xorl %eax, %eax +- ret +- +- /* Essentially duplicate of L(return_vec_0). Ends up not costing +- any code as shrinks L(less_vec) by allowing 2-byte encoding of +- the jump and ends up fitting in aligning bytes. As well fits on +- same cache line as L(less_vec) so also saves a line from having +- to be fetched on cold calls to memcmp. */ +- .p2align 4,, 4 +-L(return_vec_0_lv): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. */ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(page_cross_less_vec): +- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +- bytes. */ +- cmpl $(16 / CHAR_SIZE), %edx +- jae L(between_16_31) +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(between_8_15) +- cmpl $4, %edx +- jb L(between_2_3) +- +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- /* edx is guranteed to be positive int32 in range [4, 7]. */ +- cmovne %edx, %eax +- /* ecx is -1 if rcx > rax. Otherwise 0. */ +- sbbl %ecx, %ecx +- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == +- rax then eax and ecx are zero. If rax < rax then ecx is -1 so +- eax doesn't matter. */ +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(between_8_15): +-# endif +- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ +- vmovq (%rdi), %xmm1 +- vmovq (%rsi), %xmm2 +- VPCMP $4, %xmm1, %xmm2, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_lv) +- /* Use overlapping loads to avoid branches. 
*/
+- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
+- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
+- VPCMP $4, %xmm1, %xmm2, %k1
+- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+- kmovd %k1, %eax
+- testl %eax, %eax
+- jnz L(return_vec_0_end)
+- ret
+-
+- .p2align 4,, 8
+-L(between_16_31):
+- /* From 16 to 31 bytes. No branch when size == 16. */
+-
+- /* Use movups to save code size. */
+- vmovdqu (%rsi), %xmm2
+- VPCMP $4, (%rdi), %xmm2, %k1
+- kmovd %k1, %eax
+- testl %eax, %eax
+- jnz L(return_vec_0_lv)
+- /* Use overlapping loads to avoid branches. */
+- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
+- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+- kmovd %k1, %eax
+- testl %eax, %eax
+- jnz L(return_vec_0_end)
+- ret
+-
+-# ifndef USE_AS_WMEMCMP
+-L(between_2_3):
+- /* Load as big endian to avoid branches. */
+- movzwl (%rdi), %eax
+- movzwl (%rsi), %ecx
+- shll $8, %eax
+- shll $8, %ecx
+- bswap %eax
+- bswap %ecx
+- movzbl -1(%rdi, %rdx), %edi
+- movzbl -1(%rsi, %rdx), %esi
+- orl %edi, %eax
+- orl %esi, %ecx
+- /* Subtraction is okay because the upper 8 bits are zero. */
+- subl %ecx, %eax
+- ret
+-# endif
+ END (MEMCMP)
+ #endif
diff --git a/glibc-upstream-2.34-187.patch b/glibc-upstream-2.34-187.patch
new file mode 100644
index 0000000..6186aeb
--- /dev/null
+++ b/glibc-upstream-2.34-187.patch
@@ -0,0 +1,42 @@
+commit 9681691402052b727e01ae3375c73e0f76566593
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date: Wed Apr 27 13:59:26 2022 -0300
+
+ linux: Fix missing internal 64 bit time_t stat usage
+
+ These are two spots missed by the initial change in 52a5fe70a2c77935.
+
+ Checked on i686-linux-gnu.
+
+ (cherry picked from commit 834ddd0432f68d6dc85b6aac95065721af0d86e9)
+
+diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c
+index 13160d32499c4e58..00e4ce7f80ee2dfe 100644
+--- a/sysdeps/unix/sysv/linux/faccessat.c
++++ b/sysdeps/unix/sysv/linux/faccessat.c
+@@ -39,8 +39,8 @@ __faccessat (int fd, const char *file, int mode, int flag)
+ if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
+ return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
+
+- struct stat64 stats;
+- if (__fstatat64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
++ struct __stat64_t64 stats;
++ if (__fstatat64_time64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
+ return -1;
+
+ mode &= (X_OK | W_OK | R_OK); /* Clear any bogus bits.
*/
+diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c
+index b599a66c930cad4d..f79930303118ebcd 100644
+--- a/sysdeps/unix/sysv/linux/pathconf.c
++++ b/sysdeps/unix/sysv/linux/pathconf.c
+@@ -110,8 +110,8 @@ distinguish_extX (const struct statfs *fsbuf, const char *file, int fd)
+ && strcmp (mntbuf.mnt_type, "ext4") != 0)
+ continue;
+
+- struct stat64 fsst;
+- if (__stat64 (mntbuf.mnt_dir, &fsst) >= 0
++ struct __stat64_t64 fsst;
++ if (__stat64_time64 (mntbuf.mnt_dir, &fsst) >= 0
+ && st.st_dev == fsst.st_dev)
+ {
+ if (strcmp (mntbuf.mnt_type, "ext4") == 0)
diff --git a/glibc-upstream-2.34-188.patch b/glibc-upstream-2.34-188.patch
new file mode 100644
index 0000000..8b49369
--- /dev/null
+++ b/glibc-upstream-2.34-188.patch
@@ -0,0 +1,39 @@
+commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe
+Author: Carlos O'Donell <carlos@redhat.com>
+Date: Tue Apr 26 10:52:41 2022 -0400
+
+ i386: Regenerate ulps
+
+ These failures were caught while building glibc master for Fedora
+ Rawhide, which is built with '-mtune=generic -msse2 -mfpmath=sse'
+ using gcc 11.3 (gcc-11.3.1-2.fc35) on a Cascadelake Intel Xeon
+ processor.
+
+ (cherry picked from commit e465d97653311c3687aee49de782177353acfe86)
+
+diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
+index 7601049110789201..84e6686eba5fe79a 100644
+--- a/sysdeps/i386/fpu/libm-test-ulps
++++ b/sysdeps/i386/fpu/libm-test-ulps
+@@ -668,7 +668,7 @@ ldouble: 4
+
+ Function: Imaginary part of "clog10":
+ double: 2
+-float: 1
++float: 2
+ float128: 2
+ ldouble: 2
+
+diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+index a39c89cec1141935..cc21e6907fe8b6a3 100644
+--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
++++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+@@ -668,7 +668,7 @@ ldouble: 4
+
+ Function: Imaginary part of "clog10":
+ double: 2
+-float: 1
++float: 2
+ float128: 2
+ ldouble: 2
+
diff --git a/glibc.spec b/glibc.spec
index b0f53f7..4df755d 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -148,7 +148,7 @@ end \
 Summary: The GNU libc libraries
 Name: glibc
 Version: %{glibcversion}
-Release: 30%{?dist}
+Release: 31%{?dist}
 
 # In general, GPLv2+ is used by programs, LGPLv2+ is used for
 # libraries.
@@ -379,17 +379,17 @@ Patch175: glibc-rh2058224-2.patch
 Patch176: glibc-rh2058230.patch
 Patch177: glibc-rh2054789.patch
 Patch178: glibc-upstream-2.34-108.patch
-Patch179: glibc-upstream-2.34-110.patch
 # glibc-2.34-109-gd64b08d5ba only changes NEWS.
+Patch179: glibc-upstream-2.34-110.patch
 Patch180: glibc-upstream-2.34-111.patch
 Patch181: glibc-upstream-2.34-112.patch
 Patch182: glibc-upstream-2.34-113.patch
 Patch183: glibc-upstream-2.34-114.patch
+# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
+# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
 Patch184: glibc-upstream-2.34-117.patch
 Patch185: glibc-upstream-2.34-118.patch
 Patch186: glibc-upstream-2.34-119.patch
-# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
-# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
 Patch187: glibc-upstream-2.34-120.patch
 Patch188: glibc-upstream-2.34-121.patch
 Patch189: glibc-upstream-2.34-122.patch
@@ -437,6 +437,28 @@ Patch229: glibc-upstream-2.34-163.patch
 Patch230: glibc-upstream-2.34-164.patch
 Patch231: glibc-upstream-2.34-165.patch
 Patch232: glibc-upstream-2.34-166.patch
+Patch233: glibc-upstream-2.34-167.patch
+Patch234: glibc-upstream-2.34-168.patch
+Patch235: glibc-upstream-2.34-169.patch
+Patch236: glibc-upstream-2.34-170.patch
+Patch237: glibc-upstream-2.34-171.patch
+Patch238: glibc-upstream-2.34-172.patch
+Patch239: glibc-upstream-2.34-173.patch
+Patch240: glibc-upstream-2.34-174.patch
+Patch241: glibc-upstream-2.34-175.patch
+Patch242: glibc-upstream-2.34-176.patch
+Patch243: glibc-upstream-2.34-177.patch
+Patch244: glibc-upstream-2.34-178.patch
+Patch245: glibc-upstream-2.34-179.patch
+Patch246: glibc-upstream-2.34-180.patch
+Patch247: glibc-upstream-2.34-181.patch
+Patch248: glibc-upstream-2.34-182.patch
+Patch249: glibc-upstream-2.34-183.patch
+Patch250: glibc-upstream-2.34-184.patch
+Patch251: glibc-upstream-2.34-185.patch
+Patch252: glibc-upstream-2.34-186.patch
+Patch253: glibc-upstream-2.34-187.patch
+Patch254: glibc-upstream-2.34-188.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2493,6 +2515,32 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 
 %changelog
+* Wed Apr 27 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-31
+- Sync with upstream branch release/2.34/master,
+  commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe:
+- i386: Regenerate ulps
+- linux: Fix missing internal 64 bit time_t stat usage
+- x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+- x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
+- x86-64: Use notl in EVEX strcmp [BZ #28646]
+- x86: Shrink memcmp-sse4.S code size
+- x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
+- x86: Optimize memmove-vec-unaligned-erms.S
+- x86-64: Replace movzx with movzbl
+- x86-64: Remove Prefer_AVX2_STRCMP
+- x86-64: Improve EVEX strcmp with masked load
+- x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
+- x86: Optimize memset-vec-unaligned-erms.S
+- x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
+- x86: Modify ENTRY in sysdep.h so that p2align can be specified
+- x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
+- scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
+- dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
+- INSTALL: Rephrase -with-default-link documentation
+- misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
+- Default to --with-default-link=no (bug 25812)
+- scripts: Add glibcelf.py module
+
 * Thu Apr 21 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-30
 - Sync with upstream branch release/2.34/master,
   commit 71326f1f2fd09dafb9c34404765fb88129e94237:
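The masked-load technique behind "x86: Optimize L(less_vec) case in memcmp-evex-movbe.S" in the changelog above can be re-expressed with AVX-512 intrinsics. The sketch below is illustrative only, not glibc's implementation: it assumes AVX512BW, AVX512VL and BMI2 are available, and the function name memcmp_le32 is invented here. A bzhi-generated lane mask (the movl $-1; bzhil; kmovd sequence in the patch) lets a single masked load handle every size in [0, 32] without a page-cross check, because masked-out bytes are never architecturally accessed and therefore cannot fault.

/* Build with: gcc -O2 -mavx512bw -mavx512vl -mbmi2 */
#include <immintrin.h>
#include <stddef.h>

static int
memcmp_le32 (const void *s1, const void *s2, size_t len)
{
  /* len must be in [0, 32]: set the low `len' bits of the k-mask,
     mirroring the bzhil instruction in the patch.  */
  __mmask32 k = (__mmask32) _bzhi_u32 (~0U, (unsigned int) len);

  /* Zero-masked loads: bytes not selected by `k' are not accessed,
     so reading a full 32-byte vector close to a page end is safe.  */
  __m256i a = _mm256_maskz_loadu_epi8 (k, s1);
  __m256i b = _mm256_maskz_loadu_epi8 (k, s2);

  /* Compare-not-equal restricted to the valid lanes, like
     VPCMP $4,(%rdi), %YMM2, %k1{%k2} in the patch.  */
  __mmask32 neq = _mm256_mask_cmpneq_epu8_mask (k, a, b);
  if (neq == 0)
    return 0;

  /* The first differing byte decides the result, as in
     L(return_vec_0).  */
  unsigned int i = _tzcnt_u32 (neq);
  return ((const unsigned char *) s1)[i] - ((const unsigned char *) s2)[i];
}

The branch-free mask creation is the design point: the pre-change code had to test for a possible page cross (and for sizes 0 and 1) before it could safely touch a full vector, and those checks are exactly the branch misses the commit message reports as dominating the hot [0, 32] case.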