From 329e925ee910a8efdb405d67b5405384d6573670 Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Thu, 12 May 2022 20:17:16 +0200 Subject: [PATCH] Sync with upstream branch release/2.34/master Upstream commit: 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23 - dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo - manual: Document the dlinfo function - x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] - x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] - x86: Set .text section in memset-vec-unaligned-erms - x86-64: Optimize bzero - x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) - x86: Improve vec generation in memset-vec-unaligned-erms.S - x86-64: Fix strcmp-evex.S - x86-64: Fix strcmp-avx2.S - x86: Optimize strcmp-evex.S - x86: Optimize strcmp-avx2.S - manual: Clarify that abbreviations of long options are allowed - Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h - aarch64: Add HWCAP2_ECV from Linux 5.16 - Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h - Update kernel version to 5.17 in tst-mman-consts.py - Update kernel version to 5.16 in tst-mman-consts.py - Update syscall lists for Linux 5.17 - Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h - Update kernel version to 5.15 in tst-mman-consts.py - Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h --- glibc-upstream-2.34-191.patch | 35 + glibc-upstream-2.34-192.patch | 27 + glibc-upstream-2.34-193.patch | 28 + glibc-upstream-2.34-194.patch | 337 ++++++ glibc-upstream-2.34-195.patch | 27 + glibc-upstream-2.34-196.patch | 27 + glibc-upstream-2.34-197.patch | 26 + glibc-upstream-2.34-198.patch | 21 + glibc-upstream-2.34-199.patch | 21 + glibc-upstream-2.34-200.patch | 29 + glibc-upstream-2.34-201.patch | 1789 +++++++++++++++++++++++++++++ glibc-upstream-2.34-202.patch | 1987 +++++++++++++++++++++++++++++++++ glibc-upstream-2.34-203.patch | 29 + glibc-upstream-2.34-204.patch | 29 + glibc-upstream-2.34-205.patch | 451 ++++++++ glibc-upstream-2.34-206.patch | 35 + glibc-upstream-2.34-207.patch | 719 ++++++++++++ glibc-upstream-2.34-208.patch | 29 + glibc-upstream-2.34-209.patch | 76 ++ glibc-upstream-2.34-210.patch | 71 ++ glibc-upstream-2.34-211.patch | 170 +++ glibc-upstream-2.34-212.patch | 256 +++++ glibc.spec | 50 +- 23 files changed, 6268 insertions(+), 1 deletion(-) create mode 100644 glibc-upstream-2.34-191.patch create mode 100644 glibc-upstream-2.34-192.patch create mode 100644 glibc-upstream-2.34-193.patch create mode 100644 glibc-upstream-2.34-194.patch create mode 100644 glibc-upstream-2.34-195.patch create mode 100644 glibc-upstream-2.34-196.patch create mode 100644 glibc-upstream-2.34-197.patch create mode 100644 glibc-upstream-2.34-198.patch create mode 100644 glibc-upstream-2.34-199.patch create mode 100644 glibc-upstream-2.34-200.patch create mode 100644 glibc-upstream-2.34-201.patch create mode 100644 glibc-upstream-2.34-202.patch create mode 100644 glibc-upstream-2.34-203.patch create mode 100644 glibc-upstream-2.34-204.patch create mode 100644 glibc-upstream-2.34-205.patch create mode 100644 glibc-upstream-2.34-206.patch create mode 100644 glibc-upstream-2.34-207.patch create mode 100644 glibc-upstream-2.34-208.patch create mode 100644 glibc-upstream-2.34-209.patch create mode 100644 glibc-upstream-2.34-210.patch create mode 100644 glibc-upstream-2.34-211.patch create mode 100644 glibc-upstream-2.34-212.patch diff --git a/glibc-upstream-2.34-191.patch b/glibc-upstream-2.34-191.patch new file mode 100644 index 0000000..55b6a81 --- /dev/null +++ b/glibc-upstream-2.34-191.patch @@ -0,0 +1,35 @@ +commit bc6fba3c8048b11c9f73db03339c97a2fec3f0cf +Author: Joseph Myers +Date: Wed Nov 17 14:25:16 2021 +0000 + + Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h + + Linux 5.15 adds a new address / protocol family PF_MCTP / AF_MCTP; add + these constants to bits/socket.h. + + Tested for x86_64. + + (cherry picked from commit bdeb7a8fa9989d18dab6310753d04d908125dc1d) + +diff --git a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h +index a011a8c0959b9970..7bb9e863d7329da9 100644 +--- a/sysdeps/unix/sysv/linux/bits/socket.h ++++ b/sysdeps/unix/sysv/linux/bits/socket.h +@@ -86,7 +86,8 @@ typedef __socklen_t socklen_t; + #define PF_QIPCRTR 42 /* Qualcomm IPC Router. */ + #define PF_SMC 43 /* SMC sockets. */ + #define PF_XDP 44 /* XDP sockets. */ +-#define PF_MAX 45 /* For now.. */ ++#define PF_MCTP 45 /* Management component transport protocol. */ ++#define PF_MAX 46 /* For now.. */ + + /* Address families. */ + #define AF_UNSPEC PF_UNSPEC +@@ -137,6 +138,7 @@ typedef __socklen_t socklen_t; + #define AF_QIPCRTR PF_QIPCRTR + #define AF_SMC PF_SMC + #define AF_XDP PF_XDP ++#define AF_MCTP PF_MCTP + #define AF_MAX PF_MAX + + /* Socket level values. Others are defined in the appropriate headers. diff --git a/glibc-upstream-2.34-192.patch b/glibc-upstream-2.34-192.patch new file mode 100644 index 0000000..5a89460 --- /dev/null +++ b/glibc-upstream-2.34-192.patch @@ -0,0 +1,27 @@ +commit fd5dbfd1cd98cb2f12f9e9f7004a4d25ab0c977f +Author: Joseph Myers +Date: Mon Nov 22 15:30:12 2021 +0000 + + Update kernel version to 5.15 in tst-mman-consts.py + + This patch updates the kernel version in the test tst-mman-consts.py + to 5.15. (There are no new MAP_* constants covered by this test in + 5.15 that need any other header changes.) + + Tested with build-many-glibcs.py. + + (cherry picked from commit 5c3ece451d46a7d8721311609bfcb6faafacb39e) + +diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py +index 810433c238f31c25..eeccdfd04dae57ab 100644 +--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py ++++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py +@@ -33,7 +33,7 @@ def main(): + help='C compiler (including options) to use') + args = parser.parse_args() + linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc) +- linux_version_glibc = (5, 14) ++ linux_version_glibc = (5, 15) + sys.exit(glibcextract.compare_macro_consts( + '#define _GNU_SOURCE 1\n' + '#include \n', diff --git a/glibc-upstream-2.34-193.patch b/glibc-upstream-2.34-193.patch new file mode 100644 index 0000000..d056d36 --- /dev/null +++ b/glibc-upstream-2.34-193.patch @@ -0,0 +1,28 @@ +commit 5146b73d72ced9bab125e986aa99ef5fe2f88475 +Author: Joseph Myers +Date: Mon Dec 20 15:38:32 2021 +0000 + + Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h + + Add the constant ARPHRD_MCTP, from Linux 5.15, to net/if_arp.h, along + with ARPHRD_CAN which was added to Linux in version 2.6.25 (commit + cd05acfe65ed2cf2db683fa9a6adb8d35635263b, "[CAN]: Allocate protocol + numbers for PF_CAN") but apparently missed for glibc at the time. + + Tested for x86_64. + + (cherry picked from commit a94d9659cd69dbc70d3494b1cbbbb5a1551675c5) + +diff --git a/sysdeps/unix/sysv/linux/net/if_arp.h b/sysdeps/unix/sysv/linux/net/if_arp.h +index 2a8933cde7cf236d..42910b776660def1 100644 +--- a/sysdeps/unix/sysv/linux/net/if_arp.h ++++ b/sysdeps/unix/sysv/linux/net/if_arp.h +@@ -95,6 +95,8 @@ struct arphdr + #define ARPHRD_ROSE 270 + #define ARPHRD_X25 271 /* CCITT X.25. */ + #define ARPHRD_HWX25 272 /* Boards with X.25 in firmware. */ ++#define ARPHRD_CAN 280 /* Controller Area Network. */ ++#define ARPHRD_MCTP 290 + #define ARPHRD_PPP 512 + #define ARPHRD_CISCO 513 /* Cisco HDLC. */ + #define ARPHRD_HDLC ARPHRD_CISCO diff --git a/glibc-upstream-2.34-194.patch b/glibc-upstream-2.34-194.patch new file mode 100644 index 0000000..0437f53 --- /dev/null +++ b/glibc-upstream-2.34-194.patch @@ -0,0 +1,337 @@ +commit 6af165658d0999ac2c4e9ce88bee020fbc2ee49f +Author: Joseph Myers +Date: Wed Mar 23 17:11:56 2022 +0000 + + Update syscall lists for Linux 5.17 + + Linux 5.17 has one new syscall, set_mempolicy_home_node. Update + syscall-names.list and regenerate the arch-syscall.h headers with + build-many-glibcs.py update-syscalls. + + Tested with build-many-glibcs.py. + + (cherry picked from commit 8ef9196b26793830515402ea95aca2629f7721ec) + +diff --git a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h +index 9905ebedf298954c..4fcb6da80af37e9e 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h +@@ -236,6 +236,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h +index ee8085be69958b25..0cf74c1a96bb1235 100644 +--- a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h +@@ -391,6 +391,7 @@ + #define __NR_sendmsg 114 + #define __NR_sendto 133 + #define __NR_set_mempolicy 431 ++#define __NR_set_mempolicy_home_node 560 + #define __NR_set_robust_list 466 + #define __NR_set_tid_address 411 + #define __NR_setdomainname 166 +diff --git a/sysdeps/unix/sysv/linux/arc/arch-syscall.h b/sysdeps/unix/sysv/linux/arc/arch-syscall.h +index 1b626d97705d545a..c1207aaa12be6a51 100644 +--- a/sysdeps/unix/sysv/linux/arc/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/arc/arch-syscall.h +@@ -238,6 +238,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/arm/arch-syscall.h b/sysdeps/unix/sysv/linux/arm/arch-syscall.h +index 96ef8db9368e7de4..e7ba04c106d8af7d 100644 +--- a/sysdeps/unix/sysv/linux/arm/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/arm/arch-syscall.h +@@ -302,6 +302,7 @@ + #define __NR_sendmsg 296 + #define __NR_sendto 290 + #define __NR_set_mempolicy 321 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 338 + #define __NR_set_tid_address 256 + #define __NR_set_tls 983045 +diff --git a/sysdeps/unix/sysv/linux/csky/arch-syscall.h b/sysdeps/unix/sysv/linux/csky/arch-syscall.h +index 96910154ed6a5c1b..dc9383758ebc641b 100644 +--- a/sysdeps/unix/sysv/linux/csky/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/csky/arch-syscall.h +@@ -250,6 +250,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_thread_area 244 + #define __NR_set_tid_address 96 +diff --git a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h +index 36675fd48e6f50c5..767f1287a30b473e 100644 +--- a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h +@@ -289,6 +289,7 @@ + #define __NR_sendmsg 183 + #define __NR_sendto 82 + #define __NR_set_mempolicy 262 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 289 + #define __NR_set_tid_address 237 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/i386/arch-syscall.h b/sysdeps/unix/sysv/linux/i386/arch-syscall.h +index c86ccbda4681066c..1998f0d76a444cac 100644 +--- a/sysdeps/unix/sysv/linux/i386/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/i386/arch-syscall.h +@@ -323,6 +323,7 @@ + #define __NR_sendmsg 370 + #define __NR_sendto 369 + #define __NR_set_mempolicy 276 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 311 + #define __NR_set_thread_area 243 + #define __NR_set_tid_address 258 +diff --git a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h +index d898bce404955ef0..b2eab1b93d70b9de 100644 +--- a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h +@@ -272,6 +272,7 @@ + #define __NR_sendmsg 1205 + #define __NR_sendto 1199 + #define __NR_set_mempolicy 1261 ++#define __NR_set_mempolicy_home_node 1474 + #define __NR_set_robust_list 1298 + #define __NR_set_tid_address 1233 + #define __NR_setdomainname 1129 +diff --git a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h +index fe721b809076abeb..5fc3723772f92516 100644 +--- a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h +@@ -310,6 +310,7 @@ + #define __NR_sendmsg 367 + #define __NR_sendto 366 + #define __NR_set_mempolicy 270 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 304 + #define __NR_set_thread_area 334 + #define __NR_set_tid_address 253 +diff --git a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h +index 6e10c3661db96a1e..b6e9b007e496cd80 100644 +--- a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h +@@ -326,6 +326,7 @@ + #define __NR_sendmsg 360 + #define __NR_sendto 353 + #define __NR_set_mempolicy 276 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 311 + #define __NR_set_thread_area 243 + #define __NR_set_tid_address 258 +diff --git a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h +index 26a6d594a2222f15..b3a3871f8ab8a23e 100644 +--- a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h +@@ -308,6 +308,7 @@ + #define __NR_sendmsg 4179 + #define __NR_sendto 4180 + #define __NR_set_mempolicy 4270 ++#define __NR_set_mempolicy_home_node 4450 + #define __NR_set_robust_list 4309 + #define __NR_set_thread_area 4283 + #define __NR_set_tid_address 4252 +diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h +index 83e0d49c5e3ca1bc..b462182723aff286 100644 +--- a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h +@@ -288,6 +288,7 @@ + #define __NR_sendmsg 6045 + #define __NR_sendto 6043 + #define __NR_set_mempolicy 6233 ++#define __NR_set_mempolicy_home_node 6450 + #define __NR_set_robust_list 6272 + #define __NR_set_thread_area 6246 + #define __NR_set_tid_address 6213 +diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h +index d6747c542f63202b..a9d6b94572e93001 100644 +--- a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h +@@ -270,6 +270,7 @@ + #define __NR_sendmsg 5045 + #define __NR_sendto 5043 + #define __NR_set_mempolicy 5229 ++#define __NR_set_mempolicy_home_node 5450 + #define __NR_set_robust_list 5268 + #define __NR_set_thread_area 5242 + #define __NR_set_tid_address 5212 +diff --git a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h +index 4ee209bc4475ea7d..809a219ef32a45ef 100644 +--- a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h +@@ -250,6 +250,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h +index 497299fbc47a708c..627831ebae1b9e90 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h +@@ -319,6 +319,7 @@ + #define __NR_sendmsg 341 + #define __NR_sendto 335 + #define __NR_set_mempolicy 261 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 300 + #define __NR_set_tid_address 232 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h +index e840279f171b10b9..bae597199d79eaad 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h +@@ -298,6 +298,7 @@ + #define __NR_sendmsg 341 + #define __NR_sendto 335 + #define __NR_set_mempolicy 261 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 300 + #define __NR_set_tid_address 232 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h +index 73ef74c005e5a2bb..bf4be80f8d380963 100644 +--- a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h +@@ -228,6 +228,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h +index 919a79ee91177459..d656aedcc2be6009 100644 +--- a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h +@@ -235,6 +235,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h +index 005c0ada7aab85a1..57025107e82c9439 100644 +--- a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h +@@ -311,6 +311,7 @@ + #define __NR_sendmsg 370 + #define __NR_sendto 369 + #define __NR_set_mempolicy 270 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 304 + #define __NR_set_tid_address 252 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h +index 9131fddcc16116e4..72e19c6d569fbf9b 100644 +--- a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h +@@ -278,6 +278,7 @@ + #define __NR_sendmsg 370 + #define __NR_sendto 369 + #define __NR_set_mempolicy 270 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 304 + #define __NR_set_tid_address 252 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/sh/arch-syscall.h b/sysdeps/unix/sysv/linux/sh/arch-syscall.h +index d8fb041568ecb4da..d52b522d9cac87ef 100644 +--- a/sysdeps/unix/sysv/linux/sh/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/sh/arch-syscall.h +@@ -303,6 +303,7 @@ + #define __NR_sendmsg 355 + #define __NR_sendto 349 + #define __NR_set_mempolicy 276 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 311 + #define __NR_set_tid_address 258 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h +index 2bc014fe6a1a1f4a..d3f4d8aa3edb4795 100644 +--- a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h +@@ -310,6 +310,7 @@ + #define __NR_sendmsg 114 + #define __NR_sendto 133 + #define __NR_set_mempolicy 305 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 300 + #define __NR_set_tid_address 166 + #define __NR_setdomainname 163 +diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h +index 76dbbe595ffe868f..2cc03d7a24453335 100644 +--- a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h +@@ -286,6 +286,7 @@ + #define __NR_sendmsg 114 + #define __NR_sendto 133 + #define __NR_set_mempolicy 305 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 300 + #define __NR_set_tid_address 166 + #define __NR_setdomainname 163 +diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list +index 0bc2af37dfa1eeb5..e2743c649586d97a 100644 +--- a/sysdeps/unix/sysv/linux/syscall-names.list ++++ b/sysdeps/unix/sysv/linux/syscall-names.list +@@ -21,8 +21,8 @@ + # This file can list all potential system calls. The names are only + # used if the installed kernel headers also provide them. + +-# The list of system calls is current as of Linux 5.16. +-kernel 5.16 ++# The list of system calls is current as of Linux 5.17. ++kernel 5.17 + + FAST_atomic_update + FAST_cmpxchg +@@ -523,6 +523,7 @@ sendmmsg + sendmsg + sendto + set_mempolicy ++set_mempolicy_home_node + set_robust_list + set_thread_area + set_tid_address +diff --git a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h +index 28558279b48a1ef4..b4ab892ec183e32d 100644 +--- a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h +@@ -278,6 +278,7 @@ + #define __NR_sendmsg 46 + #define __NR_sendto 44 + #define __NR_set_mempolicy 238 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 273 + #define __NR_set_thread_area 205 + #define __NR_set_tid_address 218 +diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h +index c1ab8ec45e8b8fd3..772559c87b3625b8 100644 +--- a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h +@@ -270,6 +270,7 @@ + #define __NR_sendmsg 1073742342 + #define __NR_sendto 1073741868 + #define __NR_set_mempolicy 1073742062 ++#define __NR_set_mempolicy_home_node 1073742274 + #define __NR_set_robust_list 1073742354 + #define __NR_set_thread_area 1073742029 + #define __NR_set_tid_address 1073742042 diff --git a/glibc-upstream-2.34-195.patch b/glibc-upstream-2.34-195.patch new file mode 100644 index 0000000..d2b7afb --- /dev/null +++ b/glibc-upstream-2.34-195.patch @@ -0,0 +1,27 @@ +commit 81181ba5d916fc49bd737f603e28a3c2dc8430b4 +Author: Joseph Myers +Date: Wed Feb 16 14:19:24 2022 +0000 + + Update kernel version to 5.16 in tst-mman-consts.py + + This patch updates the kernel version in the test tst-mman-consts.py + to 5.16. (There are no new MAP_* constants covered by this test in + 5.16 that need any other header changes.) + + Tested with build-many-glibcs.py. + + (cherry picked from commit 790a607e234aa10d4b977a1b80aebe8a2acac970) + +diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py +index eeccdfd04dae57ab..8102d80b6660e523 100644 +--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py ++++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py +@@ -33,7 +33,7 @@ def main(): + help='C compiler (including options) to use') + args = parser.parse_args() + linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc) +- linux_version_glibc = (5, 15) ++ linux_version_glibc = (5, 16) + sys.exit(glibcextract.compare_macro_consts( + '#define _GNU_SOURCE 1\n' + '#include \n', diff --git a/glibc-upstream-2.34-196.patch b/glibc-upstream-2.34-196.patch new file mode 100644 index 0000000..5294eea --- /dev/null +++ b/glibc-upstream-2.34-196.patch @@ -0,0 +1,27 @@ +commit 0499c3a95fb864284fef36d3e9c5a54f6646b2db +Author: Joseph Myers +Date: Thu Mar 24 15:35:27 2022 +0000 + + Update kernel version to 5.17 in tst-mman-consts.py + + This patch updates the kernel version in the test tst-mman-consts.py + to 5.17. (There are no new MAP_* constants covered by this test in + 5.17 that need any other header changes.) + + Tested with build-many-glibcs.py. + + (cherry picked from commit 23808a422e6036accaba7236fd3b9a0d7ab7e8ee) + +diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py +index 8102d80b6660e523..724c7375c3a1623b 100644 +--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py ++++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py +@@ -33,7 +33,7 @@ def main(): + help='C compiler (including options) to use') + args = parser.parse_args() + linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc) +- linux_version_glibc = (5, 16) ++ linux_version_glibc = (5, 17) + sys.exit(glibcextract.compare_macro_consts( + '#define _GNU_SOURCE 1\n' + '#include \n', diff --git a/glibc-upstream-2.34-197.patch b/glibc-upstream-2.34-197.patch new file mode 100644 index 0000000..afe47ec --- /dev/null +++ b/glibc-upstream-2.34-197.patch @@ -0,0 +1,26 @@ +commit f858bc309315a03ff6b1a048f59405c159d23430 +Author: Joseph Myers +Date: Mon Feb 21 22:49:36 2022 +0000 + + Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h + + Linux 5.16 adds constants SOL_MPTCP and SOL_MCTP to the getsockopt / + setsockopt levels; add these constants to bits/socket.h. + + Tested for x86_64. + + (cherry picked from commit fdc1ae67fef27eea1445bab4bdfe2f0fb3bc7aa1) + +diff --git a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h +index 7bb9e863d7329da9..c81fab840918924e 100644 +--- a/sysdeps/unix/sysv/linux/bits/socket.h ++++ b/sysdeps/unix/sysv/linux/bits/socket.h +@@ -169,6 +169,8 @@ typedef __socklen_t socklen_t; + #define SOL_KCM 281 + #define SOL_TLS 282 + #define SOL_XDP 283 ++#define SOL_MPTCP 284 ++#define SOL_MCTP 285 + + /* Maximum queue length specifiable by listen. */ + #define SOMAXCONN 4096 diff --git a/glibc-upstream-2.34-198.patch b/glibc-upstream-2.34-198.patch new file mode 100644 index 0000000..67ab10c --- /dev/null +++ b/glibc-upstream-2.34-198.patch @@ -0,0 +1,21 @@ +commit c108e87026d61d6744e3e55704e0bea937243f5a +Author: Szabolcs Nagy +Date: Tue Dec 14 11:15:07 2021 +0000 + + aarch64: Add HWCAP2_ECV from Linux 5.16 + + Indicates the availability of enhanced counter virtualization extension + of armv8.6-a with self-synchronized virtual counter CNTVCTSS_EL0 usable + in userspace. + + (cherry picked from commit 5a1be8ebdf6f02d4efec6e5f12ad06db17511f90) + +diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +index 30fda0a4a347695e..04cc762015a7230a 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h ++++ b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +@@ -74,3 +74,4 @@ + #define HWCAP2_RNG (1 << 16) + #define HWCAP2_BTI (1 << 17) + #define HWCAP2_MTE (1 << 18) ++#define HWCAP2_ECV (1 << 19) diff --git a/glibc-upstream-2.34-199.patch b/glibc-upstream-2.34-199.patch new file mode 100644 index 0000000..02675fc --- /dev/null +++ b/glibc-upstream-2.34-199.patch @@ -0,0 +1,21 @@ +commit 97cb8227b864b8ea0d99a4a50e4163baad3e1c72 +Author: Joseph Myers +Date: Mon Mar 28 13:16:48 2022 +0000 + + Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h + + Add the new HWCAP2_AFP and HWCAP2_RPRES constants from Linux 5.17. + Tested with build-many-glibcs.py for aarch64-linux-gnu. + + (cherry picked from commit 866c599182e87f116440b5d854f9e99533c48eb3) + +diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +index 04cc762015a7230a..9a5c4116b3fe9903 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h ++++ b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +@@ -75,3 +75,5 @@ + #define HWCAP2_BTI (1 << 17) + #define HWCAP2_MTE (1 << 18) + #define HWCAP2_ECV (1 << 19) ++#define HWCAP2_AFP (1 << 20) ++#define HWCAP2_RPRES (1 << 21) diff --git a/glibc-upstream-2.34-200.patch b/glibc-upstream-2.34-200.patch new file mode 100644 index 0000000..7ad14c9 --- /dev/null +++ b/glibc-upstream-2.34-200.patch @@ -0,0 +1,29 @@ +commit 31af92b9c8cf753992d45c801a855a02060afc08 +Author: Siddhesh Poyarekar +Date: Wed May 4 15:56:47 2022 +0530 + + manual: Clarify that abbreviations of long options are allowed + + The man page and code comments clearly state that abbreviations of long + option names are recognized correctly as long as they are unique. + Document this fact in the glibc manual as well. + + Signed-off-by: Siddhesh Poyarekar + Reviewed-by: Florian Weimer + Reviewed-by: Andreas Schwab + (cherry picked from commit db1efe02c9f15affc3908d6ae73875b82898a489) + +diff --git a/manual/getopt.texi b/manual/getopt.texi +index 5485fc46946631f7..b4c0b15ac2060560 100644 +--- a/manual/getopt.texi ++++ b/manual/getopt.texi +@@ -250,7 +250,8 @@ option, and stores the option's argument (if it has one) in @code{optarg}. + + When @code{getopt_long} encounters a long option, it takes actions based + on the @code{flag} and @code{val} fields of the definition of that +-option. ++option. The option name may be abbreviated as long as the abbreviation is ++unique. + + If @code{flag} is a null pointer, then @code{getopt_long} returns the + contents of @code{val} to indicate which option it found. You should diff --git a/glibc-upstream-2.34-201.patch b/glibc-upstream-2.34-201.patch new file mode 100644 index 0000000..68ca969 --- /dev/null +++ b/glibc-upstream-2.34-201.patch @@ -0,0 +1,1789 @@ +commit 0d5b36c8cc15f064e302d29692853f8a760e1547 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:38 2022 -0600 + + x86: Optimize strcmp-avx2.S + + Optimization are primarily to the loop logic and how the page cross + logic interacts with the loop. + + The page cross logic is at times more expensive for short strings near + the end of a page but not crossing the page. This is done to retest + the page cross conditions with a non-faulty check and to improve the + logic for entering the loop afterwards. This is only particular cases, + however, and is general made up for by more than 10x improvements on + the transition from the page cross -> loop case. + + The non-page cross cases are improved most for smaller sizes [0, 128] + and go about even for (128, 4096]. The loop page cross logic is + improved so some more significant speedup is seen there as well. + + test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + + Signed-off-by: Noah Goldstein + (cherry picked from commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index fa70c994fc25dfd8..a0d1c65db11028bc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -26,35 +26,57 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ ++ /* Compare packed dwords. */ + # define VPCMPEQ vpcmpeqd +-/* Compare packed dwords and store minimum. */ ++ /* Compare packed dwords and store minimum. */ + # define VPMINU vpminud +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ ++ /* Compare packed bytes. */ + # define VPCMPEQ vpcmpeqb +-/* Compare packed bytes and store minimum. */ ++ /* Compare packed bytes and store minimum. */ + # define VPMINU vpminub +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# if defined USE_AS_STRNCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ ++# define xmmZERO xmm15 ++# define ymmZERO ymm15 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif +@@ -79,783 +101,1049 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRCMP) ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx ++# endif + cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ + movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ ++ ++ /* Multiplying length by sizeof(wchar_t) can result in overflow. ++ Check if that is possible. All cases where overflow are possible ++ are cases where length is large enough that it can never be a ++ bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz OVERFLOW_STRCMP +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++ jnz __wcscmp_avx2 ++ ++ leaq (, %rdx, 4), %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP + # endif ++ vpxor %xmmZERO, %xmmZERO, %xmmZERO + movl %edi, %eax +- xorl %edx, %edx +- /* Make %xmm7 (%ymm7) all zeros in this function. */ +- vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ +- vmovdqu (%rdi), %ymm1 +- VPCMPEQ (%rsi), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- je L(next_3_vectors) +- tzcntl %ecx, %edx ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (%rdi), %ymm0 ++ /* 1s where s1 and s2 equal. */ ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s at null CHAR. */ ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ /* 1s where s1 and s2 equal AND not null CHAR. */ ++ vpandn %ymm1, %ymm2, %ymm1 ++ ++ /* All 1s -> keep going, any 0s -> return. */ ++ vpmovmskb %ymm1, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $VEC_SIZE, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* All 1s represents all equals. incl will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ incl %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +- .p2align 4 +-L(return_vec_size): +- tzcntl %ecx, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 8 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ VZEROUPPER_RETURN ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_avx2 ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ ++ jnbe __strcmp_avx2 ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif ++L(ret1): ++ ret + # endif +- VZEROUPPER_RETURN + +- .p2align 4 +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of ++ overflow. */ ++ addq $-VEC_SIZE, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_3_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++L(return_vec_3): ++ salq $32, %rcx ++# endif ++ ++L(return_vec_2): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + VZEROUPPER_RETURN ++# endif ++ ++ .p2align 4,, 10 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. */ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_2) ++ ++ VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- .p2align 4 +-L(next_3_vectors): +- vmovdqu VEC_SIZE(%rdi), %ymm6 +- VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 +- VPMINU %ymm6, %ymm3, %ymm3 +- VPCMPEQ %ymm7, %ymm3, %ymm3 +- vpmovmskb %ymm3, %ecx +- testl %ecx, %ecx +- jne L(return_vec_size) +- vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 +- vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 +- VPMINU %ymm5, %ymm2, %ymm2 +- VPCMPEQ %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm2, %ymm2 +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jne L(return_2_vec_size) +- VPMINU %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ /* any non-zero positive value that doesn't inference with 0x1. + */ +- subq %rdx, %r11 +- jbe L(zero) +-# endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) ++ movl $2, %r8d + ++# else ++ xorl %r8d, %r8d ++# endif ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ ++# ifdef USE_AS_STRNCMP ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++# endif ++L(prepare_loop_no_len): ++ ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++ addq %rdi, %rsi ++ ++# ifdef USE_AS_STRNCMP ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) +-# endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- vmovdqa (%rax), %ymm0 +- vmovdqa VEC_SIZE(%rax), %ymm3 +- VPCMPEQ (%rdx), %ymm0, %ymm4 +- VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 +- VPMINU %ymm0, %ymm4, %ymm4 +- VPMINU %ymm3, %ymm1, %ymm1 +- vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 +- VPMINU %ymm1, %ymm4, %ymm0 +- vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPMINU %ymm5, %ymm0, %ymm0 +- VPMINU %ymm6, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- vpmovmskb %ymm0, %ecx ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ ++ VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 ++ ++ VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ ++ ++ /* If any mismatches or null CHAR then 0 CHAR, otherwise non- ++ zero. */ ++ vpand %ymm0, %ymm1, %ymm1 ++ ++ ++ vpand %ymm2, %ymm3, %ymm3 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ ++ VPMINU %ymm1, %ymm3, %ymm3 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ ++ /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ ++ VPMINU %ymm3, %ymm7, %ymm7 ++ ++ /* If any 0 CHAR then done. */ ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jz L(loop) ++ ++ /* Find which VEC has the mismatch of end of string. */ ++ VPCMPEQ %ymm1, %ymmZERO, %ymm1 ++ vpmovmskb %ymm1, %ecx + testl %ecx, %ecx +- je L(loop) +- VPCMPEQ %ymm7, %ymm4, %ymm0 +- vpmovmskb %ymm0, %edi +- testl %edi, %edi +- je L(test_vec) +- tzcntl %edi, %ecx ++ jnz L(return_vec_0_end) ++ ++ ++ VPCMPEQ %ymm3, %ymmZERO, %ymm3 ++ vpmovmskb %ymm3, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_1_end) ++ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++ VPCMPEQ %ymm5, %ymmZERO, %ymm5 ++ vpmovmskb %ymm5, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_2_end) ++ ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. */ ++ tzcntl %LOOP_REG, %LOOP_REG ++ ++# ifdef USE_AS_STRNCMP ++ subl $-(VEC_SIZE), %LOOP_REG ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_vec): + # ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) ++ .p2align 4,, 2 ++L(ret_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- vpmovmskb %ymm1, %ecx +- testl %ecx, %ecx +- je L(test_2_vec) +- tzcntl %ecx, %edi ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. */ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++L(return_vec_1_end): ++ salq $32, %rcx ++# endif ++L(return_vec_0_end): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (%rsi, %rcx), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax ++# endif ++L(ret6): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(test_2_vec): ++ .p2align 4,, 10 ++L(return_vec_2_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- vpmovmskb %ymm5, %ecx +- testl %ecx, %ecx +- je L(test_3_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret11) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret11): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_3_vec): ++ ++ /* Page cross in rsi in next 4x VEC. */ ++ ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ ++ ++ /* Optimistically rsi and rdi and both aligned inwhich case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) ++ ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) ++ ++ VMOVA (%rdi), %ymm0 ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 ++ VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ ++ movl $-1, %r10d ++ shlxl %esi, %r10d, %r10d ++ notl %ecx ++ + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) +-# endif +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- vpmovmskb %ymm6, %esi +- tzcntl %esi, %ecx ++ cmpq %rax, %rdx ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + VZEROUPPER_RETURN + +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 +-# endif +- +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) +- +- vmovdqu (%rax, %r10), %ymm2 +- vmovdqu VEC_SIZE(%rax, %r10), %ymm3 +- VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 +- VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 +- VPMINU %ymm2, %ymm0, %ymm0 +- VPMINU %ymm3, %ymm1, %ymm1 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- +- vpmovmskb %ymm0, %edi +- vpmovmskb %ymm1, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi +- +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrq %cl, %rdi +- +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 +- vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- +- vpmovmskb %ymm5, %edi +- vpmovmskb %ymm6, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* Skip ECX bytes. */ +- shrq %cl, %rdi +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi +- +- testq %rdi, %rdi ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1_end) ++ + # ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) +-# else +- je L(back_to_loop) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif +- tzcntq %rdi, %rcx +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx ++ ++ subl $-(VEC_SIZE * 4), %eax ++ ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_1) ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ /* Must check length here as length might proclude reading next ++ page. */ ++ cmpq %rax, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# endif ++ ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ VZEROUPPER_RETURN + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ jmp L(loop_skip_page_cross_check) + # endif +- VZEROUPPER_RETURN + ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# else ++ addl %eax, %ecx + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ movl VEC_OFFSET(%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx + subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++L(ret9): ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif ++ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ xorl %r8d, %r8d + # endif +- /* Check null char. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. */ ++ ++ .p2align 4,, 10 ++L(page_cross_loop): ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ ++ jnz L(check_ret_vec_page_cross) ++ addl $VEC_SIZE, %OFFSET_REG ++# ifdef USE_AS_STRNCMP ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VZEROUPPER_RETURN ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++# ifdef USE_AS_STRNCMP ++ leal VEC_SIZE(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++ addq %rdi, %rdx ++# endif ++ incl %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- VZEROUPPER_RETURN ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ incl %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- VZEROUPPER_RETURN ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $16, %eax ++ ja L(less_16_till_page) ++ ++ VMOVU (%rdi), %xmm0 ++ VPCMPEQ (%rsi), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ movl $16, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif +- tzcntl %ecx, %edx ++ ++ VMOVU (%rdi, %OFFSET_REG64), %xmm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ addl $16, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi + # endif +-# ifdef USE_AS_WCSCMP ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ ret + # endif +- VZEROUPPER_RETURN + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- vmovdqu (%rdi, %rdx), %ymm1 +- VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) + +- addl $VEC_SIZE, %edx ++ .p2align 4,, 10 ++L(less_16_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $24, %eax ++ ja L(less_8_till_page) + +- addl $VEC_SIZE, %eax +-# ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- vmovdqu (%rdi, %rdx), %xmm1 +- VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $8, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif ++ movl $24, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ ++ ++ ++ vmovq (%rdi, %OFFSET_REG64), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %xmm1 +- vmovq (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ addl $8, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx + +- addl $8, %edx +- addl $8, %eax ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): ++# ifdef USE_AS_WCSCMP ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addq %rdi, %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ + # endif ++ testl %eax, %eax ++ jnz L(prepare_loop_no_len) ++ ret + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %xmm1 +- vmovd (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ ++# else ++ ++ /* Find largest load size we can use. */ ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $28, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ ++ ++ vmovd (%rdi, %OFFSET_REG64), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) ++ ++# ifdef USE_AS_STRNCMP ++ addl $4, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax +- VZEROUPPER_RETURN +-END (STRCMP) ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(VEC_SIZE * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax ++ ret ++# endif ++END(STRCMP) + #endif diff --git a/glibc-upstream-2.34-202.patch b/glibc-upstream-2.34-202.patch new file mode 100644 index 0000000..9357b6f --- /dev/null +++ b/glibc-upstream-2.34-202.patch @@ -0,0 +1,1987 @@ +commit c41a66767d23b7f219fb943be6fab5ddf822d7da +Author: Noah Goldstein +Date: Mon Jan 10 15:35:39 2022 -0600 + + x86: Optimize strcmp-evex.S + + Optimization are primarily to the loop logic and how the page cross + logic interacts with the loop. + + The page cross logic is at times more expensive for short strings near + the end of a page but not crossing the page. This is done to retest + the page cross conditions with a non-faulty check and to improve the + logic for entering the loop afterwards. This is only particular cases, + however, and is general made up for by more than 10x improvements on + the transition from the page cross -> loop case. + + The non-page cross cases as well are nearly universally improved. + + test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + + Signed-off-by: Noah Goldstein + (cherry picked from commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 6f5c4bf984da2b80..99d8409af27327ad 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -26,54 +26,69 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR) + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif +- +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ +-# define VPCMP vpcmpd ++# define TESTEQ subl $0xff, ++ /* Compare packed dwords. */ ++# define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd +-# define SHIFT_REG32 r8d +-# define SHIFT_REG64 r8 +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ +-# define VPCMP vpcmpb ++# define TESTEQ incl ++ /* Compare packed bytes. */ ++# define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb +-# define SHIFT_REG32 ecx +-# define SHIFT_REG64 rcx +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ + # define XMMZERO xmm16 +-# define XMM0 xmm17 +-# define XMM1 xmm18 ++# define XMM0 xmm17 ++# define XMM1 xmm18 + + # define YMMZERO ymm16 +-# define YMM0 ymm17 +-# define YMM1 ymm18 +-# define YMM2 ymm19 +-# define YMM3 ymm20 +-# define YMM4 ymm21 +-# define YMM5 ymm22 +-# define YMM6 ymm23 +-# define YMM7 ymm24 +-# define YMM8 ymm25 +-# define YMM9 ymm26 +-# define YMM10 ymm27 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -96,985 +111,1096 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.evex,"ax",@progbits +-ENTRY (STRCMP) ++ .section .text.evex, "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) +-# ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ +- movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ +- shrq $56, %rcx +- jnz __wcscmp_evex +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP ++ cmp $1, %RDX_LP ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # endif + movl %edi, %eax +- xorl %edx, %edx +- /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ ++ /* Shift out the bits irrelivant to page boundary ([63:12]). */ ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %YMM0 +- +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ + VPCMP $0, (%rsi), %YMM0, %k1{%k2} +- + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(next_3_vectors) +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for ++ wcscmp/wcsncmp. */ ++ ++ /* All 1s represents all equals. TESTEQ will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ TESTEQ %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + ret + +-L(return_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 4 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ ret ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_evex ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __strcmp_evex ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret1): + ret ++# endif + +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_STRNCMP ++ /* rdx must be > CHAR_PER_VEC so its safe to subtract without ++ worrying about underflow. */ ++ addq $-CHAR_PER_VEC, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): ++ ret ++ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++L(return_vec_3): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_2): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +-L(return_3_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + ret ++# endif + +- .p2align 4 +-L(next_3_vectors): +- VMOVU VEC_SIZE(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ /* 32 byte align here ensures the main loop is ideally aligned ++ for DSB. */ ++ .p2align 5 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) + # endif +- jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- jne L(return_2_vec_size) ++ TESTEQ %ecx ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_3) ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d ++ + # else +- incl %ecx ++ xorl %r8d, %r8d + # endif +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. +- */ +- subq %rdx, %r11 +- jbe L(zero) ++# ifdef USE_AS_WCSCMP ++L(prepare_loop_no_len): ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ shrl $2, %ecx ++ leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx ++# else ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++L(prepare_loop_no_len): ++# endif ++# else ++L(prepare_loop_no_len): + # endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) + ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++L(prepare_loop_readj): ++ addq %rdi, %rsi ++# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP) ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ vpxorq %YMMZERO, %YMMZERO, %YMMZERO ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(ret_zero) + # endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- VMOVA (%rax), %YMM0 +- VMOVA VEC_SIZE(%rax), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rax), %YMM4 +- VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %YMM0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + + VPMINU %YMM0, %YMM2, %YMM8 + VPMINU %YMM4, %YMM6, %YMM9 + +- /* A zero CHAR in YMM8 means that there is a null CHAR. */ +- VPMINU %YMM8, %YMM9, %YMM8 ++ /* A zero CHAR in YMM9 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM9 + + /* Each bit set in K1 represents a non-null CHAR in YMM8. */ +- VPTESTM %YMM8, %YMM8, %k1 ++ VPTESTM %YMM9, %YMM9, %k1 + +- /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */ +- vpxorq (%rdx), %YMM0, %YMM1 +- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 +- vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 ++ vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while ++ oring with YMM1. Result is stored in YMM6. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 + +- vporq %YMM1, %YMM3, %YMM9 +- vporq %YMM5, %YMM7, %YMM10 ++ /* Or together YMM3, YMM5, and YMM6. */ ++ vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + +- /* A non-zero CHAR in YMM9 represents a mismatch. */ +- vporq %YMM9, %YMM10, %YMM9 + +- /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ +- VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} +- kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(loop) ++ /* A non-zero CHAR in YMM6 represents a mismatch. */ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG + +- /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ TESTEQ %LOOP_REG ++ jz L(loop) ++ ++ ++ /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM0 and (%rdx). */ + VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_vec) +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) + +- .p2align 4 +-L(test_vec): +-# ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) +-# endif +- /* Each bit set in K1 represents a non-null CHAR in YMM2. */ + VPTESTM %YMM2, %YMM2, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM2 and VEC_SIZE(%rdx). */ + VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_2_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edi +-# endif +-# ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- .p2align 4 +-L(test_2_vec): ++ ++ /* Handle VEC 2 and 3 without branches. */ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ + VPTESTM %YMM4, %YMM4, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM4 and (VEC_SIZE * 2)(%rdx). */ + VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ TESTEQ %ecx ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %LOOP_REG ++ orl %ecx, %LOOP_REG + # else +- incl %ecx ++ salq $CHAR_PER_VEC, %LOOP_REG64 ++ orq %rcx, %LOOP_REG64 ++# endif ++L(return_vec_3_end): ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. */ ++# if CHAR_PER_VEC <= 16 ++ tzcntl %LOOP_REG, %LOOP_REG ++# else ++ tzcntq %LOOP_REG64, %LOOP_REG64 ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) + # endif +- je L(test_3_vec) +- tzcntl %ecx, %edi ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edi ++ movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ xorl %eax, %eax ++ cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): ++ ret ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 2 ++L(ret_zero_end): + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ ret ++# endif ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. */ ++ .p2align 4,, 10 ++# ifdef USE_AS_STRNCMP ++L(return_vec_1_end): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_0_end): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +- .p2align 4 +-L(test_3_vec): + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM6. */ +- VPTESTM %YMM6, %YMM6, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM6 and (VEC_SIZE * 3)(%rdx). */ +- VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} +- kmovd %k0, %ecx ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ /* This is the non-zero case for `eax` so just xorl with `r8d` ++ flip is `rdi` and `rsi` where swapped. */ ++ xorl %r8d, %eax + # else +- incl %ecx ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ /* Flip `eax` if `rdi` and `rsi` where swapped in page cross ++ logic. Subtract `r8d` after xor for zero case. */ ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret6): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): + tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + ret +- +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 + # endif + +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) + +- VMOVU (%rax, %r10), %YMM2 +- VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ /* Page cross in rsi in next 4x VEC. */ + +- /* Each bit set in K2 represents a non-null CHAR in YMM2. */ +- VPTESTM %YMM2, %YMM2, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM2 and 32 bytes at (%rdx, %r10). */ +- VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ + +- /* Each bit set in K4 represents a non-null CHAR in YMM3. */ +- VPTESTM %YMM3, %YMM3, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ +- VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi +-# endif ++ /* Optimistically rsi and rdi and both aligned in which case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG32 +- sarl $2, %SHIFT_REG32 +- +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi +-# endif ++ VMOVA (%rdi), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ + +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrxq %SHIFT_REG64, %rdi, %rdi +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx ++ movl $-1, %r10d ++ movl %esi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ shrl $2, %ecx ++ shlxl %ecx, %r10d, %ecx ++ movzbl %cl, %r10d ++# else ++ movl $-1, %ecx ++ shlxl %esi, %ecx, %r10d + # endif ++ ++ kmovd %k1, %ecx ++ notl %ecx ++ ++ + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx + # else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax ++ cmpq %rax, %rdx + # endif ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ ++ /* Readjust eax before potentially returning to the loop. */ ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ ++# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + ret + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 +- VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_WCSCMP ++ sall $2, %edx ++# endif ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) ++ xorl %eax, %eax ++ ret ++# endif ++ + ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- VPTESTM %YMM1, %YMM1, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi ++ subl $-(VEC_SIZE * 4), %eax + +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_1) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi ++# ifdef USE_AS_STRNCMP ++ /* Must check length here as length might proclude reading next ++ page. */ ++# ifdef USE_AS_WCSCMP ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx ++# else ++ cmpq %rax, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-# ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in RDI represent 4 +- bytes. */ +- sarl $2, %ecx +- /* Skip ECX bytes. */ +- shrl %cl, %edi ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ VPTESTM %YMM9, %YMM9, %k1 ++ ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 ++ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG ++ TESTEQ %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): ++ xorl %eax, %eax ++ ret + # else +- /* Skip ECX bytes. */ +- shrq %cl, %rdi ++ jmp L(loop_skip_page_cross_check) + # endif +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + +- testq %rdi, %rdi +-# ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_STRNCMP ++# ifdef USE_AS_WCSCMP ++ /* Must divide ecx instead of multiply rdx due to overflow. */ ++ movl %ecx, %eax ++ shrl $2, %eax ++ cmpq %rax, %rdx ++# else ++ cmpq %rcx, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) ++# endif + # else +- je L(back_to_loop) ++ addl %eax, %ecx + # endif +- tzcntq %rdi, %rcx ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret9): + ret + +-# ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- subl %ecx, %eax ++ xorl %r8d, %r8d + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. */ ++ .p2align 4,, 8 ++L(page_cross_loop): ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(check_ret_vec_page_cross) ++ addl $CHAR_PER_VEC, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ shrl $2, %eax + # endif +- /* Check null CHAR. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ ++ kmovd %k1, %ecx ++# ifdef USE_AS_STRNCMP ++ leal CHAR_PER_VEC(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++# ifdef USE_AS_WCSCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++# else ++ addq %rdi, %rdx ++# endif + # endif +- ret ++ TESTEQ %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- ret ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax ++ movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ ret ++ + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ TESTEQ %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + ret ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- ret ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi +-# ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++# ifdef USE_AS_WCSCMP ++ shrl $2, %eax + # endif +- tzcntl %ecx, %edx ++ /* Find largest load size we can use. */ ++ cmpl $(16 / SIZE_OF_CHAR), %eax ++ ja L(less_16_till_page) ++ ++ /* Use 16 byte comparison. */ ++ vmovdqu (%rdi), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ subl $0xf, %ecx ++# else ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++ movl $(16 / SIZE_OF_CHAR), %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif ++ vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ subl $0xf, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(16 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): ++ xorl %eax, %eax + ret ++# endif + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- VMOVU (%rdi, %rdx), %YMM0 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} ++ .p2align 4,, 10 ++L(less_16_till_page): ++ cmpl $(24 / SIZE_OF_CHAR), %eax ++ ja L(less_8_till_page) ++ ++ /* Use 8 byte comparison. */ ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ subl $0x3, %ecx + # else +- incl %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) + +- addl $VEC_SIZE, %edx + +- addl $VEC_SIZE, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $(8 / SIZE_OF_CHAR), %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- VMOVU (%rdi, %rdx), %XMM0 ++ movl $(24 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and 16 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xf, %ecx ++ subl $0x3, %ecx + # else +- subl $0xffff, %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++ + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addl $(8 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi + # endif ++ jmp L(prepare_loop_aligned) + +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %XMM0 +- vmovq (%rsi, %rdx), %XMM1 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. */ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} +- kmovb %k1, %ecx ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): + # ifdef USE_AS_WCSCMP +- subl $0x3, %ecx ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) ++# ifdef USE_AS_STRNCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ ++# endif ++ testl %eax, %eax ++ jnz L(prepare_loop) ++ ret ++ ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ + # else +- subl $0xff, %ecx +-# endif +- jne L(last_vector) ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ kmovd %k1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $8, %edx +- addl $8, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $(28 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %XMM0 +- vmovd (%rsi, %rdx), %XMM1 +- +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. */ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0x1, %ecx +-# else + subl $0xf, %ecx +-# endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(4 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret + # endif + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(CHAR_PER_VEC * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax + ret +-END (STRCMP) ++# endif ++END(STRCMP) + #endif diff --git a/glibc-upstream-2.34-203.patch b/glibc-upstream-2.34-203.patch new file mode 100644 index 0000000..e45b588 --- /dev/null +++ b/glibc-upstream-2.34-203.patch @@ -0,0 +1,29 @@ +commit d299032743e05571ef326c838a5ecf6ef5b3e9c3 +Author: H.J. Lu +Date: Fri Feb 4 11:09:10 2022 -0800 + + x86-64: Fix strcmp-avx2.S + + Change "movl %edx, %rdx" to "movl %edx, %edx" in: + + commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45 + Author: Noah Goldstein + Date: Mon Jan 10 15:35:38 2022 -0600 + + x86: Optimize strcmp-avx2.S + + (cherry picked from commit c15efd011cea3d8f0494269eb539583215a1feed) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index a0d1c65db11028bc..cdded412a70bad10 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -106,7 +106,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also diff --git a/glibc-upstream-2.34-204.patch b/glibc-upstream-2.34-204.patch new file mode 100644 index 0000000..4250493 --- /dev/null +++ b/glibc-upstream-2.34-204.patch @@ -0,0 +1,29 @@ +commit 53ddafe917a8af17b16beb794c29e5b09b86d534 +Author: H.J. Lu +Date: Fri Feb 4 11:11:08 2022 -0800 + + x86-64: Fix strcmp-evex.S + + Change "movl %edx, %rdx" to "movl %edx, %edx" in: + + commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 + Author: Noah Goldstein + Date: Mon Jan 10 15:35:39 2022 -0600 + + x86: Optimize strcmp-evex.S + + (cherry picked from commit 0e0199a9e02ebe42e2b36958964d63f03573c382) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 99d8409af27327ad..ed56af8ecdad48b2 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -116,7 +116,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also diff --git a/glibc-upstream-2.34-205.patch b/glibc-upstream-2.34-205.patch new file mode 100644 index 0000000..6cf18b8 --- /dev/null +++ b/glibc-upstream-2.34-205.patch @@ -0,0 +1,451 @@ +commit ea19c490a3f5628d55ded271cbb753e66b2f05e8 +Author: Noah Goldstein +Date: Sun Feb 6 00:54:18 2022 -0600 + + x86: Improve vec generation in memset-vec-unaligned-erms.S + + No bug. + + Split vec generation into multiple steps. This allows the + broadcast in AVX2 to use 'xmm' registers for the L(less_vec) + case. This saves an expensive lane-cross instruction and removes + the need for 'vzeroupper'. + + For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for + byte broadcast. + + Results for memset-avx2 small (geomean of N = 20 benchset runs). + + size, New Time, Old Time, New / Old + 0, 4.100, 3.831, 0.934 + 1, 5.074, 4.399, 0.867 + 2, 4.433, 4.411, 0.995 + 4, 4.487, 4.415, 0.984 + 8, 4.454, 4.396, 0.987 + 16, 4.502, 4.443, 0.987 + + All relevant string/wcsmbs tests are passing. + Reviewed-by: H.J. Lu + + (cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 0137eba4cdd9f830..34ee0bfdcb81fb39 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -28,17 +28,22 @@ + #define VMOVU movups + #define VMOVA movaps + +-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- punpcklbw %xmm0, %xmm0; \ +- punpcklwd %xmm0, %xmm0; \ +- pshufd $0, %xmm0, %xmm0 ++ pxor %xmm1, %xmm1; \ ++ pshufb %xmm1, %xmm0; \ ++ movq r, %rax + +-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- pshufd $0, %xmm0, %xmm0 ++ pshufd $0, %xmm0, %xmm0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + #define SECTION(p) p + +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 1af668af0aeda59e..c0bf2875d03d51ab 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -10,15 +10,18 @@ + # define VMOVU vmovdqu + # define VMOVA vmovdqa + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastb %xmm0, %ymm0 ++ movq r, %rax; + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ MEMSET_SET_VEC0_AND_SET_RETURN(d, r) ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 ++# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 + + # ifndef SECTION + # define SECTION(p) p##.avx +@@ -30,5 +33,6 @@ + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif + ++# define USE_XMM_LESS_VEC + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index f14d6f8493c21a36..5241216a77bf72b7 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 64b09e77cc20cc42..637002150659123c 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index e723413a664c088f..c8db87dcbf69f0d8 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -58,8 +58,10 @@ + #ifndef MOVQ + # if VEC_SIZE > 16 + # define MOVQ vmovq ++# define MOVD vmovd + # else + # define MOVQ movq ++# define MOVD movd + # endif + #endif + +@@ -72,9 +74,17 @@ + #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 + # define END_REG rcx + # define LOOP_REG rdi ++# define LESS_VEC_REG rax + #else + # define END_REG rdi + # define LOOP_REG rdx ++# define LESS_VEC_REG rdi ++#endif ++ ++#ifdef USE_XMM_LESS_VEC ++# define XMM_SMALL 1 ++#else ++# define XMM_SMALL 0 + #endif + + #define PAGE_SIZE 4096 +@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shl $2, %RDX_LP +- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- jmp L(entry_from_bzero) ++ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) ++ WMEMSET_VDUP_TO_VEC0_LOW() ++ cmpq $VEC_SIZE, %rdx ++ jb L(less_vec_no_vdup) ++ WMEMSET_VDUP_TO_VEC0_HIGH() ++ jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + +@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH() ++L(entry_from_wmemset): + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH () + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. +- */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + +- .p2align 4,, 10 ++ .p2align 4,, 4 + L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE +- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) +- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + #else + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) +@@ -212,6 +228,7 @@ L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE + .p2align 4,, 10 + L(less_vec): ++L(less_vec_no_vdup): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -262,28 +279,18 @@ L(stosb_more_2x_vec): + /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] + and (4x, 8x] jump to target. */ + L(more_2x_vec): +- +- /* Two different methods of setting up pointers / compare. The +- two methods are based on the fact that EVEX/AVX512 mov +- instructions take more bytes then AVX2/SSE2 mov instructions. As +- well that EVEX/AVX512 machines also have fast LEA_BID. Both +- setup and END_REG to avoid complex address mode. For EVEX/AVX512 +- this saves code size and keeps a few targets in one fetch block. +- For AVX2/SSE2 this helps prevent AGU bottlenecks. */ +-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 +- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + +- LOOP_4X_OFFSET) with LEA_BID. */ +- +- /* END_REG is rcx for EVEX/AVX512. */ +- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG +-#endif +- +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), VEC_SIZE(%rax) ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + + ++ /* Two different methods of setting up pointers / compare. The two ++ methods are based on the fact that EVEX/AVX512 mov instructions take ++ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512 ++ machines also have fast LEA_BID. Both setup and END_REG to avoid complex ++ address mode. For EVEX/AVX512 this saves code size and keeps a few ++ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU ++ bottlenecks. */ + #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) + /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ + addq %rdx, %END_REG +@@ -292,6 +299,15 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with ++ LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ + /* Store next 2x vec regardless. */ + VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) +@@ -355,65 +371,93 @@ L(stosb_local): + /* Define L(less_vec) only if not otherwise defined. */ + .p2align 4 + L(less_vec): ++ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to ++ xmm). This is only does anything for AVX2. */ ++ MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_no_vdup): + #endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +- jae L(between_32_63) ++ jge L(between_32_63) + #endif + #if VEC_SIZE > 16 + cmpl $16, %edx +- jae L(between_16_31) ++ jge L(between_16_31) ++#endif ++#ifndef USE_XMM_LESS_VEC ++ MOVQ %XMM0, %rcx + #endif +- MOVQ %XMM0, %rdi + cmpl $8, %edx +- jae L(between_8_15) ++ jge L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) ++ jge L(between_4_7) + cmpl $1, %edx +- ja L(between_2_3) +- jb L(return) +- movb %sil, (%rax) +- VZEROUPPER_RETURN ++ jg L(between_2_3) ++ jl L(between_0_0) ++ movb %sil, (%LESS_VEC_REG) ++L(between_0_0): ++ ret + +- /* Align small targets only if not doing so would cross a fetch +- line. */ ++ /* Align small targets only if not doing so would cross a fetch line. ++ */ + #if VEC_SIZE > 32 + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, (%rax) +- VMOVU %YMM0, -32(%rax, %rdx) ++ VMOVU %YMM0, (%LESS_VEC_REG) ++ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) + VZEROUPPER_RETURN + #endif + + #if VEC_SIZE >= 32 +- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) + L(between_16_31): + /* From 16 to 31. No branch when size == 16. */ +- VMOVU %XMM0, (%rax) +- VMOVU %XMM0, -16(%rax, %rdx) +- VZEROUPPER_RETURN ++ VMOVU %XMM0, (%LESS_VEC_REG) ++ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) ++ ret + #endif + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +- movq %rdi, (%rax) +- movq %rdi, -8(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVQ %XMM0, (%rdi) ++ MOVQ %XMM0, -8(%rdi, %rdx) ++#else ++ movq %rcx, (%LESS_VEC_REG) ++ movq %rcx, -8(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) ++ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %edi, (%rax) +- movl %edi, -4(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVD %XMM0, (%rdi) ++ MOVD %XMM0, -4(%rdi, %rdx) ++#else ++ movl %ecx, (%LESS_VEC_REG) ++ movl %ecx, -4(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* 4 * XMM_SMALL for the third mov for AVX2. */ ++ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ +- movw %di, (%rax) +- movb %dil, -1(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ movb %sil, (%rdi) ++ movb %sil, 1(%rdi) ++ movb %sil, -1(%rdi, %rdx) ++#else ++ movw %cx, (%LESS_VEC_REG) ++ movb %sil, -1(%LESS_VEC_REG, %rdx) ++#endif ++ ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/glibc-upstream-2.34-206.patch b/glibc-upstream-2.34-206.patch new file mode 100644 index 0000000..ed9f37b --- /dev/null +++ b/glibc-upstream-2.34-206.patch @@ -0,0 +1,35 @@ +commit 190ea5f7e4e7e98b9b6e3f29835ae8b1f6a5442e +Author: Noah Goldstein +Date: Mon Feb 7 00:32:23 2022 -0600 + + x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) + + commit b62ace2740a106222e124cc86956448fa07abf4d + Author: Noah Goldstein + Date: Sun Feb 6 00:54:18 2022 -0600 + + x86: Improve vec generation in memset-vec-unaligned-erms.S + + Revert usage of 'pshufb' in broadcast logic as it is an SSSE3 + instruction and memset.S is restricted to only SSE2 instructions. + + (cherry picked from commit 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 34ee0bfdcb81fb39..954471e5a5bf225b 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -30,9 +30,10 @@ + + # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- pxor %xmm1, %xmm1; \ +- pshufb %xmm1, %xmm0; \ +- movq r, %rax ++ movq r, %rax; \ ++ punpcklbw %xmm0, %xmm0; \ ++ punpcklwd %xmm0, %xmm0; \ ++ pshufd $0, %xmm0, %xmm0 + + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ diff --git a/glibc-upstream-2.34-207.patch b/glibc-upstream-2.34-207.patch new file mode 100644 index 0000000..9818f5d --- /dev/null +++ b/glibc-upstream-2.34-207.patch @@ -0,0 +1,719 @@ +commit 5cb6329652696e79d6d576165ea87e332c9de106 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + + memset with zero as the value to set is by far the majority value (99%+ + for Python3 and GCC). + + bzero can be slightly more optimized for this case by using a zero-idiom + xor for broadcasting the set value to a register (vector or GPR). + + Co-developed-by: Noah Goldstein + (cherry picked from commit 3d9f171bfb5325bd5f427e9fc386453358c6e840) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 954471e5a5bf225b..0358210c7ff3a976 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -35,6 +35,9 @@ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + ++# define BZERO_ZERO_VEC0() \ ++ pxor %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + pshufd $0, %xmm0, %xmm0; \ +@@ -53,6 +56,10 @@ + # define MEMSET_SYMBOL(p,s) memset + #endif + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) __bzero ++#endif ++ + #ifndef WMEMSET_SYMBOL + # define WMEMSET_CHK_SYMBOL(p,s) p + # define WMEMSET_SYMBOL(p,s) __wmemset +@@ -63,6 +70,7 @@ + libc_hidden_builtin_def (memset) + + #if IS_IN (libc) ++weak_alias (__bzero, bzero) + libc_hidden_def (__wmemset) + weak_alias (__wmemset, wmemset) + libc_hidden_weak (wmemset) +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 26be40959ce62895..37d8d6f0bd2d10cc 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -1,85 +1,130 @@ + ifeq ($(subdir),string) + +-sysdep_routines += strncat-c stpncpy-c strncpy-c \ +- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \ +- strcmp-sse4_2 strcmp-avx2 \ +- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \ +- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ +- memrchr-sse2 memrchr-avx2 \ +- memcmp-sse2 \ +- memcmp-avx2-movbe \ +- memcmp-sse4 memcpy-ssse3 \ +- memmove-ssse3 \ +- memcpy-ssse3-back \ +- memmove-ssse3-back \ +- memmove-avx512-no-vzeroupper \ +- strcasecmp_l-sse2 strcasecmp_l-ssse3 \ +- strcasecmp_l-sse4_2 strcasecmp_l-avx \ +- strncase_l-sse2 strncase_l-ssse3 \ +- strncase_l-sse4_2 strncase_l-avx \ +- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \ +- strrchr-sse2 strrchr-avx2 \ +- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \ +- strcat-avx2 strncat-avx2 \ +- strcat-ssse3 strncat-ssse3\ +- strcpy-avx2 strncpy-avx2 \ +- strcpy-sse2 stpcpy-sse2 \ +- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ +- strcpy-sse2-unaligned strncpy-sse2-unaligned \ +- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ +- stpcpy-avx2 stpncpy-avx2 \ +- strcat-sse2 \ +- strcat-sse2-unaligned strncat-sse2-unaligned \ +- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ +- strcspn-sse2 strpbrk-sse2 strspn-sse2 \ +- strcspn-c strpbrk-c strspn-c varshift \ +- memset-avx512-no-vzeroupper \ +- memmove-sse2-unaligned-erms \ +- memmove-avx-unaligned-erms \ +- memmove-avx512-unaligned-erms \ +- memset-sse2-unaligned-erms \ +- memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms \ +- memchr-avx2-rtm \ +- memcmp-avx2-movbe-rtm \ +- memmove-avx-unaligned-erms-rtm \ +- memrchr-avx2-rtm \ +- memset-avx2-unaligned-erms-rtm \ +- rawmemchr-avx2-rtm \ +- strchr-avx2-rtm \ +- strcmp-avx2-rtm \ +- strchrnul-avx2-rtm \ +- stpcpy-avx2-rtm \ +- stpncpy-avx2-rtm \ +- strcat-avx2-rtm \ +- strcpy-avx2-rtm \ +- strlen-avx2-rtm \ +- strncat-avx2-rtm \ +- strncmp-avx2-rtm \ +- strncpy-avx2-rtm \ +- strnlen-avx2-rtm \ +- strrchr-avx2-rtm \ +- memchr-evex \ +- memcmp-evex-movbe \ +- memmove-evex-unaligned-erms \ +- memrchr-evex \ +- memset-evex-unaligned-erms \ +- rawmemchr-evex \ +- stpcpy-evex \ +- stpncpy-evex \ +- strcat-evex \ +- strchr-evex \ +- strchrnul-evex \ +- strcmp-evex \ +- strcpy-evex \ +- strlen-evex \ +- strncat-evex \ +- strncmp-evex \ +- strncpy-evex \ +- strnlen-evex \ +- strrchr-evex \ +- memchr-evex-rtm \ +- rawmemchr-evex-rtm ++sysdep_routines += \ ++ bzero \ ++ memchr-avx2 \ ++ memchr-avx2-rtm \ ++ memchr-evex \ ++ memchr-evex-rtm \ ++ memchr-sse2 \ ++ memcmp-avx2-movbe \ ++ memcmp-avx2-movbe-rtm \ ++ memcmp-evex-movbe \ ++ memcmp-sse2 \ ++ memcmp-sse4 \ ++ memcmp-ssse3 \ ++ memcpy-ssse3 \ ++ memcpy-ssse3-back \ ++ memmove-avx-unaligned-erms \ ++ memmove-avx-unaligned-erms-rtm \ ++ memmove-avx512-no-vzeroupper \ ++ memmove-avx512-unaligned-erms \ ++ memmove-evex-unaligned-erms \ ++ memmove-sse2-unaligned-erms \ ++ memmove-ssse3 \ ++ memmove-ssse3-back \ ++ memrchr-avx2 \ ++ memrchr-avx2-rtm \ ++ memrchr-evex \ ++ memrchr-sse2 \ ++ memset-avx2-unaligned-erms \ ++ memset-avx2-unaligned-erms-rtm \ ++ memset-avx512-no-vzeroupper \ ++ memset-avx512-unaligned-erms \ ++ memset-evex-unaligned-erms \ ++ memset-sse2-unaligned-erms \ ++ rawmemchr-avx2 \ ++ rawmemchr-avx2-rtm \ ++ rawmemchr-evex \ ++ rawmemchr-evex-rtm \ ++ rawmemchr-sse2 \ ++ stpcpy-avx2 \ ++ stpcpy-avx2-rtm \ ++ stpcpy-evex \ ++ stpcpy-sse2 \ ++ stpcpy-sse2-unaligned \ ++ stpcpy-ssse3 \ ++ stpncpy-avx2 \ ++ stpncpy-avx2-rtm \ ++ stpncpy-c \ ++ stpncpy-evex \ ++ stpncpy-sse2-unaligned \ ++ stpncpy-ssse3 \ ++ strcasecmp_l-avx \ ++ strcasecmp_l-sse2 \ ++ strcasecmp_l-sse4_2 \ ++ strcasecmp_l-ssse3 \ ++ strcat-avx2 \ ++ strcat-avx2-rtm \ ++ strcat-evex \ ++ strcat-sse2 \ ++ strcat-sse2-unaligned \ ++ strcat-ssse3 \ ++ strchr-avx2 \ ++ strchr-avx2-rtm \ ++ strchr-evex \ ++ strchr-sse2 \ ++ strchr-sse2-no-bsf \ ++ strchrnul-avx2 \ ++ strchrnul-avx2-rtm \ ++ strchrnul-evex \ ++ strchrnul-sse2 \ ++ strcmp-avx2 \ ++ strcmp-avx2-rtm \ ++ strcmp-evex \ ++ strcmp-sse2 \ ++ strcmp-sse2-unaligned \ ++ strcmp-sse4_2 \ ++ strcmp-ssse3 \ ++ strcpy-avx2 \ ++ strcpy-avx2-rtm \ ++ strcpy-evex \ ++ strcpy-sse2 \ ++ strcpy-sse2-unaligned \ ++ strcpy-ssse3 \ ++ strcspn-c \ ++ strcspn-sse2 \ ++ strlen-avx2 \ ++ strlen-avx2-rtm \ ++ strlen-evex \ ++ strlen-sse2 \ ++ strncase_l-avx \ ++ strncase_l-sse2 \ ++ strncase_l-sse4_2 \ ++ strncase_l-ssse3 \ ++ strncat-avx2 \ ++ strncat-avx2-rtm \ ++ strncat-c \ ++ strncat-evex \ ++ strncat-sse2-unaligned \ ++ strncat-ssse3 \ ++ strncmp-avx2 \ ++ strncmp-avx2-rtm \ ++ strncmp-evex \ ++ strncmp-sse2 \ ++ strncmp-sse4_2 \ ++ strncmp-ssse3 \ ++ strncpy-avx2 \ ++ strncpy-avx2-rtm \ ++ strncpy-c \ ++ strncpy-evex \ ++ strncpy-sse2-unaligned \ ++ strncpy-ssse3 \ ++ strnlen-avx2 \ ++ strnlen-avx2-rtm \ ++ strnlen-evex \ ++ strnlen-sse2 \ ++ strpbrk-c \ ++ strpbrk-sse2 \ ++ strrchr-avx2 \ ++ strrchr-avx2-rtm \ ++ strrchr-evex \ ++ strrchr-sse2 \ ++ strspn-c \ ++ strspn-sse2 \ ++ strstr-sse2-unaligned \ ++ varshift \ ++# sysdep_routines + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c +new file mode 100644 +index 0000000000000000..13e399a9a1fbdeb2 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/bzero.c +@@ -0,0 +1,108 @@ ++/* Multiple versions of bzero. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Define multiple versions only for the definition in libc. */ ++#if IS_IN (libc) ++# define __bzero __redirect___bzero ++# include ++# undef __bzero ++ ++/* OPTIMIZE1 definition required for bzero patch. */ ++# define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name) ++# define SYMBOL_NAME __bzero ++# include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) ++ attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx512_unaligned_erms); ++ ++ return OPTIMIZE1 (avx512_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (evex_unaligned_erms); ++ ++ return OPTIMIZE1 (evex_unaligned); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE1 (avx2_unaligned_rtm); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms); ++ ++ return OPTIMIZE1 (avx2_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (sse2_unaligned_erms); ++ ++ return OPTIMIZE1 (sse2_unaligned); ++} ++ ++libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); ++ ++weak_alias (__bzero, bzero) ++#endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 39ab10613bb0ffea..4992d7bd3206a7c0 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx512_no_vzeroupper) + ) + ++ /* Support sysdeps/x86_64/multiarch/bzero.c. */ ++ IFUNC_IMPL (i, name, bzero, ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_erms_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned) ++ ) ++ + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +index 8ac3e479bba488be..5a5ee6f67299400b 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -5,6 +5,7 @@ + + #define SECTION(p) p##.avx.rtm + #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm + #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + + #include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index c0bf2875d03d51ab..a093a2831f3dfa0d 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,6 +14,9 @@ + vmovd d, %xmm0; \ + movq r, %rax; + ++# define BZERO_ZERO_VEC0() \ ++ vpxor %xmm0, %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +@@ -29,6 +32,9 @@ + # ifndef MEMSET_SYMBOL + # define MEMSET_SYMBOL(p,s) p##_avx2_##s + # endif ++# ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) p##_avx2_##s ++# endif + # ifndef WMEMSET_SYMBOL + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 5241216a77bf72b7..727c92133a15900f 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 637002150659123c..5d8fa78f05476b10 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index e4e95fc19fe48d2d..bac74ac37fd3c144 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -22,6 +22,7 @@ + + #if IS_IN (libc) + # define MEMSET_SYMBOL(p,s) p##_sse2_##s ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) + # define WMEMSET_SYMBOL(p,s) p##_sse2_##s + + # ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index c8db87dcbf69f0d8..39a096a594ccb5b6 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -26,6 +26,10 @@ + + #include + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) ++#endif ++ + #ifndef MEMSET_CHK_SYMBOL + # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) + #endif +@@ -87,6 +91,18 @@ + # define XMM_SMALL 0 + #endif + ++#ifdef USE_LESS_VEC_MASK_STORE ++# define SET_REG64 rcx ++# define SET_REG32 ecx ++# define SET_REG16 cx ++# define SET_REG8 cl ++#else ++# define SET_REG64 rsi ++# define SET_REG32 esi ++# define SET_REG16 si ++# define SET_REG8 sil ++#endif ++ + #define PAGE_SIZE 4096 + + /* Macro to calculate size of small memset block for aligning +@@ -96,18 +112,6 @@ + + #ifndef SECTION + # error SECTION is not defined! +-#endif +- +- .section SECTION(.text),"ax",@progbits +-#if VEC_SIZE == 16 && IS_IN (libc) +-ENTRY (__bzero) +- mov %RDI_LP, %RAX_LP /* Set return value. */ +- mov %RSI_LP, %RDX_LP /* Set n. */ +- xorl %esi, %esi +- pxor %XMM0, %XMM0 +- jmp L(entry_from_bzero) +-END (__bzero) +-weak_alias (__bzero, bzero) + #endif + + #if IS_IN (libc) +@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + WMEMSET_VDUP_TO_VEC0_LOW() + cmpq $VEC_SIZE, %rdx +- jb L(less_vec_no_vdup) ++ jb L(less_vec_from_wmemset) + WMEMSET_VDUP_TO_VEC0_HIGH() + jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + ++ENTRY (BZERO_SYMBOL(__bzero, unaligned)) ++#if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++#ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++#ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++#if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned)) ++ + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmp %RDX_LP, %RCX_LP +@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif +-L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH() +@@ -187,6 +215,31 @@ END (__memset_erms) + END (MEMSET_SYMBOL (__memset, erms)) + # endif + ++ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) ++# if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++# ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++# ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++# if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(stosb_more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned_erms)) ++ + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP +@@ -229,6 +282,7 @@ L(last_2x_vec): + .p2align 4,, 10 + L(less_vec): + L(less_vec_no_vdup): ++L(less_vec_from_wmemset): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -374,8 +428,11 @@ L(less_vec): + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to + xmm). This is only does anything for AVX2. */ + MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_from_wmemset): ++#if VEC_SIZE > 16 + L(less_vec_no_vdup): + #endif ++#endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +@@ -386,7 +443,10 @@ L(cross_page): + jge L(between_16_31) + #endif + #ifndef USE_XMM_LESS_VEC +- MOVQ %XMM0, %rcx ++ MOVQ %XMM0, %SET_REG64 ++#endif ++#if VEC_SIZE <= 16 ++L(less_vec_no_vdup): + #endif + cmpl $8, %edx + jge L(between_8_15) +@@ -395,7 +455,7 @@ L(cross_page): + cmpl $1, %edx + jg L(between_2_3) + jl L(between_0_0) +- movb %sil, (%LESS_VEC_REG) ++ movb %SET_REG8, (%LESS_VEC_REG) + L(between_0_0): + ret + +@@ -428,8 +488,8 @@ L(between_8_15): + MOVQ %XMM0, (%rdi) + MOVQ %XMM0, -8(%rdi, %rdx) + #else +- movq %rcx, (%LESS_VEC_REG) +- movq %rcx, -8(%LESS_VEC_REG, %rdx) ++ movq %SET_REG64, (%LESS_VEC_REG) ++ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -442,8 +502,8 @@ L(between_4_7): + MOVD %XMM0, (%rdi) + MOVD %XMM0, -4(%rdi, %rdx) + #else +- movl %ecx, (%LESS_VEC_REG) +- movl %ecx, -4(%LESS_VEC_REG, %rdx) ++ movl %SET_REG32, (%LESS_VEC_REG) ++ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -452,12 +512,12 @@ L(between_4_7): + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + #ifdef USE_XMM_LESS_VEC +- movb %sil, (%rdi) +- movb %sil, 1(%rdi) +- movb %sil, -1(%rdi, %rdx) ++ movb %SET_REG8, (%rdi) ++ movb %SET_REG8, 1(%rdi) ++ movb %SET_REG8, -1(%rdi, %rdx) + #else +- movw %cx, (%LESS_VEC_REG) +- movb %sil, -1(%LESS_VEC_REG, %rdx) ++ movw %SET_REG16, (%LESS_VEC_REG) ++ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) + #endif + ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/glibc-upstream-2.34-208.patch b/glibc-upstream-2.34-208.patch new file mode 100644 index 0000000..d4d9b52 --- /dev/null +++ b/glibc-upstream-2.34-208.patch @@ -0,0 +1,29 @@ +commit 70509f9b4807295b2b4b43bffe110580fc0381ef +Author: Noah Goldstein +Date: Sat Feb 12 00:45:00 2022 -0600 + + x86: Set .text section in memset-vec-unaligned-erms + + commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 + Author: H.J. Lu + Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + + Remove setting the .text section for the code. This commit + adds that back. + + (cherry picked from commit 7912236f4a597deb092650ca79f33504ddb4af28) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 39a096a594ccb5b6..d9c577fb5ff9700f 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -114,6 +114,7 @@ + # error SECTION is not defined! + #endif + ++ .section SECTION(.text), "ax", @progbits + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) diff --git a/glibc-upstream-2.34-209.patch b/glibc-upstream-2.34-209.patch new file mode 100644 index 0000000..4874143 --- /dev/null +++ b/glibc-upstream-2.34-209.patch @@ -0,0 +1,76 @@ +commit 5373c90f2ea3c3fa9931a684c9b81c648dfbe8d7 +Author: Noah Goldstein +Date: Tue Feb 15 20:27:21 2022 -0600 + + x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] + + Logic can read before the start of `s1` / `s2` if both `s1` and `s2` + are near the start of a page. To avoid having the result contimated by + these comparisons the `strcmp` variants would mask off these + comparisons. This was missing in the `strncmp` variants causing + the bug. This commit adds the masking to `strncmp` so that out of + range comparisons don't affect the result. + + test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as + well a full xcheck on x86_64 linux. + Reviewed-by: H.J. Lu + + (cherry picked from commit e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf) + +diff --git a/string/test-strncmp.c b/string/test-strncmp.c +index 97e831d88fd24316..56e23670ae7f90e4 100644 +--- a/string/test-strncmp.c ++++ b/string/test-strncmp.c +@@ -438,13 +438,23 @@ check3 (void) + static void + check4 (void) + { +- const CHAR *s1 = L ("abc"); +- CHAR *s2 = STRDUP (s1); ++ /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of ++ the end of the page. 2) For there to be no mismatch/null byte before the ++ first page cross. 3) For length (`n`) to be large enough for one string to ++ cross the page. And 4) for there to be either mismatch/null bytes before ++ the start of the strings. */ ++ ++ size_t size = 10; ++ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1); ++ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa)); ++ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed)); ++ int exp_result; + ++ STRCPY (s1, L ("tst-tlsmod%")); ++ STRCPY (s2, L ("tst-tls-manydynamic73mod")); ++ exp_result = SIMPLE_STRNCMP (s1, s2, size); + FOR_EACH_IMPL (impl, 0) +- check_result (impl, s1, s2, SIZE_MAX, 0); +- +- free (s2); ++ check_result (impl, s1, s2, size, exp_result); + } + + int +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index cdded412a70bad10..f9bdc5ccd03aa1f9 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -661,6 +661,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx), %ecx + cmpl %ecx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index ed56af8ecdad48b2..0dfa62bd149c02b4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -689,6 +689,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx + # ifdef USE_AS_WCSCMP diff --git a/glibc-upstream-2.34-210.patch b/glibc-upstream-2.34-210.patch new file mode 100644 index 0000000..4898d45 --- /dev/null +++ b/glibc-upstream-2.34-210.patch @@ -0,0 +1,71 @@ +commit e123f08ad5ea4691bc37430ce536988c221332d6 +Author: Noah Goldstein +Date: Thu Mar 24 15:50:33 2022 -0500 + + x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] + + Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not + __wcscmp_avx2. + + commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87 + Author: Noah Goldstein + Date: Sun Jan 9 16:02:21 2022 -0600 + + x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] + + Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set + to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which + can cause spurious aborts. + + This change will need to be backported. + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 9fef7039a7d04947bc89296ee0d187bc8d89b772) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index aef9866cf2fbe774..ba6543be8ce13927 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -70,6 +70,16 @@ function_overflow (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow2 (void) ++{ ++ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +@@ -77,5 +87,10 @@ do_test (void) + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2); ++ if (status != EXIT_SUCCESS) ++ return status; + return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index f9bdc5ccd03aa1f9..09a73942086f9c9f 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -122,7 +122,7 @@ ENTRY(STRCMP) + are cases where length is large enough that it can never be a + bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + + leaq (, %rdx, 4), %rdx + # endif diff --git a/glibc-upstream-2.34-211.patch b/glibc-upstream-2.34-211.patch new file mode 100644 index 0000000..1221458 --- /dev/null +++ b/glibc-upstream-2.34-211.patch @@ -0,0 +1,170 @@ +commit e4a2fb76efb45210c541ee3f8ef32f317783c3a8 +Author: Florian Weimer +Date: Wed May 11 20:30:49 2022 +0200 + + manual: Document the dlinfo function + + Reviewed-by: Carlos O'Donell + Tested-by: Carlos O'Donell + (cherry picked from commit 93804a1ee084d4bdc620b2b9f91615c7da0fabe1) + + Also includes partial backport of commit 5d28a8962dcb6ec056b81d730e + (the addition of manual/dynlink.texi). + +diff --git a/manual/Makefile b/manual/Makefile +index e83444341e282916..31678681ef059e0f 100644 +--- a/manual/Makefile ++++ b/manual/Makefile +@@ -39,7 +39,7 @@ chapters = $(addsuffix .texi, \ + pipe socket terminal syslog math arith time \ + resource setjmp signal startup process ipc job \ + nss users sysinfo conf crypt debug threads \ +- probes tunables) ++ dynlink probes tunables) + appendices = lang.texi header.texi install.texi maint.texi platform.texi \ + contrib.texi + licenses = freemanuals.texi lgpl-2.1.texi fdl-1.3.texi +diff --git a/manual/dynlink.texi b/manual/dynlink.texi +new file mode 100644 +index 0000000000000000..dbf3de11769d8e57 +--- /dev/null ++++ b/manual/dynlink.texi +@@ -0,0 +1,100 @@ ++@node Dynamic Linker ++@c @node Dynamic Linker, Internal Probes, Threads, Top ++@c %MENU% Loading programs and shared objects. ++@chapter Dynamic Linker ++@cindex dynamic linker ++@cindex dynamic loader ++ ++The @dfn{dynamic linker} is responsible for loading dynamically linked ++programs and their dependencies (in the form of shared objects). The ++dynamic linker in @theglibc{} also supports loading shared objects (such ++as plugins) later at run time. ++ ++Dynamic linkers are sometimes called @dfn{dynamic loaders}. ++ ++@menu ++* Dynamic Linker Introspection:: Interfaces for querying mapping information. ++@end menu ++ ++@node Dynamic Linker Introspection ++@section Dynamic Linker Introspection ++ ++@Theglibc{} provides various functions for querying information from the ++dynamic linker. ++ ++@deftypefun {int} dlinfo (void *@var{handle}, int @var{request}, void *@var{arg}) ++@safety{@mtsafe{}@asunsafe{@asucorrupt{}}@acunsafe{@acucorrupt{}}} ++@standards{GNU, dlfcn.h} ++This function returns information about @var{handle} in the memory ++location @var{arg}, based on @var{request}. The @var{handle} argument ++must be a pointer returned by @code{dlopen} or @code{dlmopen}; it must ++not have been closed by @code{dlclose}. ++ ++On success, @code{dlinfo} returns 0. If there is an error, the function ++returns @math{-1}, and @code{dlerror} can be used to obtain a ++corresponding error message. ++ ++The following operations are defined for use with @var{request}: ++ ++@vtable @code ++@item RTLD_DI_LINKMAP ++The corresponding @code{struct link_map} pointer for @var{handle} is ++written to @code{*@var{arg}}. The @var{arg} argument must be the ++address of an object of type @code{struct link_map *}. ++ ++@item RTLD_DI_LMID ++The namespace identifier of @var{handle} is written to ++@code{*@var{arg}}. The @var{arg} argument must be the address of an ++object of type @code{Lmid_t}. ++ ++@item RTLD_DI_ORIGIN ++The value of the @code{$ORIGIN} dynamic string token for @var{handle} is ++written to the character array starting at @var{arg} as a ++null-terminated string. ++ ++This request type should not be used because it is prone to buffer ++overflows. ++ ++@item RTLD_DI_SERINFO ++@itemx RTLD_DI_SERINFOSIZE ++These requests can be used to obtain search path information for ++@var{handle}. For both requests, @var{arg} must point to a ++@code{Dl_serinfo} object. The @code{RTLD_DI_SERINFOSIZE} request must ++be made first; it updates the @code{dls_size} and @code{dls_cnt} members ++of the @code{Dl_serinfo} object. The caller should then allocate memory ++to store at least @code{dls_size} bytes and pass that buffer to a ++@code{RTLD_DI_SERINFO} request. This second request fills the ++@code{dls_serpath} array. The number of array elements was returned in ++the @code{dls_cnt} member in the initial @code{RTLD_DI_SERINFOSIZE} ++request. The caller is responsible for freeing the allocated buffer. ++ ++This interface is prone to buffer overflows in multi-threaded processes ++because the required size can change between the ++@code{RTLD_DI_SERINFOSIZE} and @code{RTLD_DI_SERINFO} requests. ++ ++@item RTLD_DI_TLS_DATA ++This request writes the address of the TLS block (in the current thread) ++for the shared object identified by @var{handle} to @code{*@var{arg}}. ++The argument @var{arg} must be the address of an object of type ++@code{void *}. A null pointer is written if the object does not have ++any associated TLS block. ++ ++@item RTLD_DI_TLS_MODID ++This request writes the TLS module ID for the shared object @var{handle} ++to @code{*@var{arg}}. The argument @var{arg} must be the address of an ++object of type @code{size_t}. The module ID is zero if the object ++does not have an associated TLS block. ++@end vtable ++ ++The @code{dlinfo} function is a GNU extension. ++@end deftypefun ++ ++@c FIXME these are undocumented: ++@c dladdr ++@c dladdr1 ++@c dlclose ++@c dlerror ++@c dlmopen ++@c dlopen ++@c dlsym ++@c dlvsym +diff --git a/manual/libdl.texi b/manual/libdl.texi +deleted file mode 100644 +index e3fe0452d9f41d47..0000000000000000 +--- a/manual/libdl.texi ++++ /dev/null +@@ -1,10 +0,0 @@ +-@c FIXME these are undocumented: +-@c dladdr +-@c dladdr1 +-@c dlclose +-@c dlerror +-@c dlinfo +-@c dlmopen +-@c dlopen +-@c dlsym +-@c dlvsym +diff --git a/manual/probes.texi b/manual/probes.texi +index 4aae76b81921f347..ee019e651706f492 100644 +--- a/manual/probes.texi ++++ b/manual/probes.texi +@@ -1,5 +1,5 @@ + @node Internal Probes +-@c @node Internal Probes, Tunables, Threads, Top ++@c @node Internal Probes, Tunables, Dynamic Linker, Top + @c %MENU% Probes to monitor libc internal behavior + @chapter Internal probes + +diff --git a/manual/threads.texi b/manual/threads.texi +index 06b6b277a1228af1..7f166bfa87e88c36 100644 +--- a/manual/threads.texi ++++ b/manual/threads.texi +@@ -1,5 +1,5 @@ + @node Threads +-@c @node Threads, Internal Probes, Debugging Support, Top ++@c @node Threads, Dynamic Linker, Debugging Support, Top + @c %MENU% Functions, constants, and data types for working with threads + @chapter Threads + @cindex threads diff --git a/glibc-upstream-2.34-212.patch b/glibc-upstream-2.34-212.patch new file mode 100644 index 0000000..000023f --- /dev/null +++ b/glibc-upstream-2.34-212.patch @@ -0,0 +1,256 @@ +commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23 +Author: Florian Weimer +Date: Fri Apr 29 17:00:53 2022 +0200 + + dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo + + The information is theoretically available via dl_iterate_phdr as + well, but that approach is very slow if there are many shared + objects. + + Reviewed-by: Carlos O'Donell + Tested-by: Carlos O'Donell + (cherry picked from commit d056c212130280c0a54d9a4f72170ec621b70ce5) + +diff --git a/dlfcn/Makefile b/dlfcn/Makefile +index 6bbfbb8344da05cb..d3965427dabed898 100644 +--- a/dlfcn/Makefile ++++ b/dlfcn/Makefile +@@ -73,6 +73,10 @@ tststatic3-ENV = $(tststatic-ENV) + tststatic4-ENV = $(tststatic-ENV) + tststatic5-ENV = $(tststatic-ENV) + ++tests-internal += \ ++ tst-dlinfo-phdr \ ++ # tests-internal ++ + ifneq (,$(CXX)) + modules-names += bug-atexit3-lib + else +diff --git a/dlfcn/dlfcn.h b/dlfcn/dlfcn.h +index 4a3b870a487ea789..24388cfedae4dd67 100644 +--- a/dlfcn/dlfcn.h ++++ b/dlfcn/dlfcn.h +@@ -162,7 +162,12 @@ enum + segment, or if the calling thread has not allocated a block for it. */ + RTLD_DI_TLS_DATA = 10, + +- RTLD_DI_MAX = 10 ++ /* Treat ARG as const ElfW(Phdr) **, and store the address of the ++ program header array at that location. The dlinfo call returns ++ the number of program headers in the array. */ ++ RTLD_DI_PHDR = 11, ++ ++ RTLD_DI_MAX = 11 + }; + + +diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c +index 47d2daa96fa5986f..1842925fb7c594dd 100644 +--- a/dlfcn/dlinfo.c ++++ b/dlfcn/dlinfo.c +@@ -28,6 +28,10 @@ struct dlinfo_args + void *handle; + int request; + void *arg; ++ ++ /* This is the value that is returned from dlinfo if no error is ++ signaled. */ ++ int result; + }; + + static void +@@ -40,6 +44,7 @@ dlinfo_doit (void *argsblock) + { + case RTLD_DI_CONFIGADDR: + default: ++ args->result = -1; + _dl_signal_error (0, NULL, NULL, N_("unsupported dlinfo request")); + break; + +@@ -75,6 +80,11 @@ dlinfo_doit (void *argsblock) + *(void **) args->arg = data; + break; + } ++ ++ case RTLD_DI_PHDR: ++ *(const ElfW(Phdr) **) args->arg = l->l_phdr; ++ args->result = l->l_phnum; ++ break; + } + } + +@@ -82,7 +92,8 @@ static int + dlinfo_implementation (void *handle, int request, void *arg) + { + struct dlinfo_args args = { handle, request, arg }; +- return _dlerror_run (&dlinfo_doit, &args) ? -1 : 0; ++ _dlerror_run (&dlinfo_doit, &args); ++ return args.result; + } + + #ifdef SHARED +diff --git a/dlfcn/tst-dlinfo-phdr.c b/dlfcn/tst-dlinfo-phdr.c +new file mode 100644 +index 0000000000000000..a15a7d48ebd3b976 +--- /dev/null ++++ b/dlfcn/tst-dlinfo-phdr.c +@@ -0,0 +1,125 @@ ++/* Test for dlinfo (RTLD_DI_PHDR). ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++/* Used to verify that the program header array appears as expected ++ among the dl_iterate_phdr callback invocations. */ ++ ++struct dlip_callback_args ++{ ++ struct link_map *l; /* l->l_addr is used to find the object. */ ++ const ElfW(Phdr) *phdr; /* Expected program header pointed. */ ++ int phnum; /* Expected program header count. */ ++ bool found; /* True if l->l_addr has been found. */ ++}; ++ ++static int ++dlip_callback (struct dl_phdr_info *dlpi, size_t size, void *closure) ++{ ++ TEST_COMPARE (sizeof (*dlpi), size); ++ struct dlip_callback_args *args = closure; ++ ++ if (dlpi->dlpi_addr == args->l->l_addr) ++ { ++ TEST_VERIFY (!args->found); ++ args->found = true; ++ TEST_VERIFY (args->phdr == dlpi->dlpi_phdr); ++ TEST_COMPARE (args->phnum, dlpi->dlpi_phnum); ++ } ++ ++ return 0; ++} ++ ++static int ++do_test (void) ++{ ++ /* Avoid a copy relocation. */ ++ struct r_debug *debug = xdlsym (RTLD_DEFAULT, "_r_debug"); ++ struct link_map *l = (struct link_map *) debug->r_map; ++ TEST_VERIFY_EXIT (l != NULL); ++ ++ do ++ { ++ printf ("info: checking link map %p (%p) for \"%s\"\n", ++ l, l->l_phdr, l->l_name); ++ ++ /* Cause dlerror () to return an error message. */ ++ dlsym (RTLD_DEFAULT, "does-not-exist"); ++ ++ /* Use the extension that link maps are valid dlopen handles. */ ++ const ElfW(Phdr) *phdr; ++ int phnum = dlinfo (l, RTLD_DI_PHDR, &phdr); ++ TEST_VERIFY (phnum >= 0); ++ /* Verify that the error message has been cleared. */ ++ TEST_COMPARE_STRING (dlerror (), NULL); ++ ++ TEST_VERIFY (phdr == l->l_phdr); ++ TEST_COMPARE (phnum, l->l_phnum); ++ ++ /* Check that we can find PT_DYNAMIC among the array. */ ++ { ++ bool dynamic_found = false; ++ for (int i = 0; i < phnum; ++i) ++ if (phdr[i].p_type == PT_DYNAMIC) ++ { ++ dynamic_found = true; ++ TEST_COMPARE ((ElfW(Addr)) l->l_ld, l->l_addr + phdr[i].p_vaddr); ++ } ++ TEST_VERIFY (dynamic_found); ++ } ++ ++ /* Check that dl_iterate_phdr finds the link map with the same ++ program headers. */ ++ { ++ struct dlip_callback_args args = ++ { ++ .l = l, ++ .phdr = phdr, ++ .phnum = phnum, ++ .found = false, ++ }; ++ TEST_COMPARE (dl_iterate_phdr (dlip_callback, &args), 0); ++ TEST_VERIFY (args.found); ++ } ++ ++ if (l->l_prev == NULL) ++ { ++ /* This is the executable, so the information is also ++ available via getauxval. */ ++ TEST_COMPARE_STRING (l->l_name, ""); ++ TEST_VERIFY (phdr == (const ElfW(Phdr) *) getauxval (AT_PHDR)); ++ TEST_COMPARE (phnum, getauxval (AT_PHNUM)); ++ } ++ ++ l = l->l_next; ++ } ++ while (l != NULL); ++ ++ return 0; ++} ++ ++#include +diff --git a/manual/dynlink.texi b/manual/dynlink.texi +index dbf3de11769d8e57..7dcac64889e389fd 100644 +--- a/manual/dynlink.texi ++++ b/manual/dynlink.texi +@@ -30,9 +30,9 @@ location @var{arg}, based on @var{request}. The @var{handle} argument + must be a pointer returned by @code{dlopen} or @code{dlmopen}; it must + not have been closed by @code{dlclose}. + +-On success, @code{dlinfo} returns 0. If there is an error, the function +-returns @math{-1}, and @code{dlerror} can be used to obtain a +-corresponding error message. ++On success, @code{dlinfo} returns 0 for most request types; exceptions ++are noted below. If there is an error, the function returns @math{-1}, ++and @code{dlerror} can be used to obtain a corresponding error message. + + The following operations are defined for use with @var{request}: + +@@ -84,6 +84,15 @@ This request writes the TLS module ID for the shared object @var{handle} + to @code{*@var{arg}}. The argument @var{arg} must be the address of an + object of type @code{size_t}. The module ID is zero if the object + does not have an associated TLS block. ++ ++@item RTLD_DI_PHDR ++This request writes the address of the program header array to ++@code{*@var{arg}}. The argument @var{arg} must be the address of an ++object of type @code{const ElfW(Phdr) *} (that is, ++@code{const Elf32_Phdr *} or @code{const Elf64_Phdr *}, as appropriate ++for the current architecture). For this request, the value returned by ++@code{dlinfo} is the number of program headers in the program header ++array. + @end vtable + + The @code{dlinfo} function is a GNU extension. diff --git a/glibc.spec b/glibc.spec index 61f2ecc..691cc83 100644 --- a/glibc.spec +++ b/glibc.spec @@ -148,7 +148,7 @@ end \ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: 32%{?dist} +Release: 33%{?dist} # In general, GPLv2+ is used by programs, LGPLv2+ is used for # libraries. @@ -461,6 +461,28 @@ Patch253: glibc-upstream-2.34-187.patch Patch254: glibc-upstream-2.34-188.patch Patch255: glibc-upstream-2.34-189.patch Patch256: glibc-upstream-2.34-190.patch +Patch257: glibc-upstream-2.34-191.patch +Patch258: glibc-upstream-2.34-192.patch +Patch259: glibc-upstream-2.34-193.patch +Patch260: glibc-upstream-2.34-194.patch +Patch261: glibc-upstream-2.34-195.patch +Patch262: glibc-upstream-2.34-196.patch +Patch263: glibc-upstream-2.34-197.patch +Patch264: glibc-upstream-2.34-198.patch +Patch265: glibc-upstream-2.34-199.patch +Patch266: glibc-upstream-2.34-200.patch +Patch267: glibc-upstream-2.34-201.patch +Patch268: glibc-upstream-2.34-202.patch +Patch269: glibc-upstream-2.34-203.patch +Patch270: glibc-upstream-2.34-204.patch +Patch271: glibc-upstream-2.34-205.patch +Patch272: glibc-upstream-2.34-206.patch +Patch273: glibc-upstream-2.34-207.patch +Patch274: glibc-upstream-2.34-208.patch +Patch275: glibc-upstream-2.34-209.patch +Patch276: glibc-upstream-2.34-210.patch +Patch277: glibc-upstream-2.34-211.patch +Patch278: glibc-upstream-2.34-212.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2517,6 +2539,32 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog +* Thu May 12 2022 Florian Weimer - 2.34-33 +- Sync with upstream branch release/2.34/master, + commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23: +- dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo +- manual: Document the dlinfo function +- x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] +- x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] +- x86: Set .text section in memset-vec-unaligned-erms +- x86-64: Optimize bzero +- x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) +- x86: Improve vec generation in memset-vec-unaligned-erms.S +- x86-64: Fix strcmp-evex.S +- x86-64: Fix strcmp-avx2.S +- x86: Optimize strcmp-evex.S +- x86: Optimize strcmp-avx2.S +- manual: Clarify that abbreviations of long options are allowed +- Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h +- aarch64: Add HWCAP2_ECV from Linux 5.16 +- Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h +- Update kernel version to 5.17 in tst-mman-consts.py +- Update kernel version to 5.16 in tst-mman-consts.py +- Update syscall lists for Linux 5.17 +- Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h +- Update kernel version to 5.15 in tst-mman-consts.py +- Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h + * Thu Apr 28 2022 Carlos O'Donell - 2.34-32 - Sync with upstream branch release/2.34/master, commit c66c92181ddbd82306537a608e8c0282587131de: