From 329e925ee910a8efdb405d67b5405384d6573670 Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Thu, 12 May 2022 20:17:16 +0200
Subject: [PATCH] Sync with upstream branch release/2.34/master

Upstream commit: 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23

- dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo
- manual: Document the dlinfo function
- x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
- x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895]
- x86: Set .text section in memset-vec-unaligned-erms
- x86-64: Optimize bzero
- x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only)
- x86: Improve vec generation in memset-vec-unaligned-erms.S
- x86-64: Fix strcmp-evex.S
- x86-64: Fix strcmp-avx2.S
- x86: Optimize strcmp-evex.S
- x86: Optimize strcmp-avx2.S
- manual: Clarify that abbreviations of long options are allowed
- Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h
- aarch64: Add HWCAP2_ECV from Linux 5.16
- Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h
- Update kernel version to 5.17 in tst-mman-consts.py
- Update kernel version to 5.16 in tst-mman-consts.py
- Update syscall lists for Linux 5.17
- Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h
- Update kernel version to 5.15 in tst-mman-consts.py
- Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h
---
 glibc-upstream-2.34-191.patch |   35 +
 glibc-upstream-2.34-192.patch |   27 +
 glibc-upstream-2.34-193.patch |   28 +
 glibc-upstream-2.34-194.patch |  337 ++++++
 glibc-upstream-2.34-195.patch |   27 +
 glibc-upstream-2.34-196.patch |   27 +
 glibc-upstream-2.34-197.patch |   26 +
 glibc-upstream-2.34-198.patch |   21 +
 glibc-upstream-2.34-199.patch |   21 +
 glibc-upstream-2.34-200.patch |   29 +
 glibc-upstream-2.34-201.patch | 1789 +++++++++++++++++++++++++++++
 glibc-upstream-2.34-202.patch | 1987 +++++++++++++++++++++++++++++++++
 glibc-upstream-2.34-203.patch |   29 +
 glibc-upstream-2.34-204.patch |   29 +
 glibc-upstream-2.34-205.patch |  451 ++++++++
 glibc-upstream-2.34-206.patch |   35 +
 glibc-upstream-2.34-207.patch |  719 ++++++++++++
 glibc-upstream-2.34-208.patch |   29 +
 glibc-upstream-2.34-209.patch |   76 ++
 glibc-upstream-2.34-210.patch |   71 ++
 glibc-upstream-2.34-211.patch |  170 +++
 glibc-upstream-2.34-212.patch |  256 +++++
 glibc.spec                    |   50 +-
 23 files changed, 6268 insertions(+), 1 deletion(-)
 create mode 100644 glibc-upstream-2.34-191.patch
 create mode 100644 glibc-upstream-2.34-192.patch
 create mode 100644 glibc-upstream-2.34-193.patch
 create mode 100644 glibc-upstream-2.34-194.patch
 create mode 100644 glibc-upstream-2.34-195.patch
 create mode 100644 glibc-upstream-2.34-196.patch
 create mode 100644 glibc-upstream-2.34-197.patch
 create mode 100644 glibc-upstream-2.34-198.patch
 create mode 100644 glibc-upstream-2.34-199.patch
 create mode 100644 glibc-upstream-2.34-200.patch
 create mode 100644 glibc-upstream-2.34-201.patch
 create mode 100644 glibc-upstream-2.34-202.patch
 create mode 100644 glibc-upstream-2.34-203.patch
 create mode 100644 glibc-upstream-2.34-204.patch
 create mode 100644 glibc-upstream-2.34-205.patch
 create mode 100644 glibc-upstream-2.34-206.patch
 create mode 100644 glibc-upstream-2.34-207.patch
 create mode 100644 glibc-upstream-2.34-208.patch
 create mode 100644 glibc-upstream-2.34-209.patch
 create mode 100644 glibc-upstream-2.34-210.patch
 create mode 100644 glibc-upstream-2.34-211.patch
 create mode 100644 glibc-upstream-2.34-212.patch

diff --git a/glibc-upstream-2.34-191.patch b/glibc-upstream-2.34-191.patch
new file mode 100644
index 0000000..55b6a81
--- /dev/null
+++ b/glibc-upstream-2.34-191.patch
@@ -0,0 +1,35 @@
+commit bc6fba3c8048b11c9f73db03339c97a2fec3f0cf
+Author: Joseph Myers <joseph@codesourcery.com>
+Date:   Wed Nov 17 14:25:16 2021 +0000
+
+    Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h
+    
+    Linux 5.15 adds a new address / protocol family PF_MCTP / AF_MCTP; add
+    these constants to bits/socket.h.
+    
+    Tested for x86_64.
+    
+    (cherry picked from commit bdeb7a8fa9989d18dab6310753d04d908125dc1d)
+
+diff --git a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h
+index a011a8c0959b9970..7bb9e863d7329da9 100644
+--- a/sysdeps/unix/sysv/linux/bits/socket.h
++++ b/sysdeps/unix/sysv/linux/bits/socket.h
+@@ -86,7 +86,8 @@ typedef __socklen_t socklen_t;
+ #define PF_QIPCRTR	42	/* Qualcomm IPC Router.  */
+ #define PF_SMC		43	/* SMC sockets.  */
+ #define PF_XDP		44	/* XDP sockets.  */
+-#define PF_MAX		45	/* For now..  */
++#define PF_MCTP		45	/* Management component transport protocol.  */
++#define PF_MAX		46	/* For now..  */
+ 
+ /* Address families.  */
+ #define AF_UNSPEC	PF_UNSPEC
+@@ -137,6 +138,7 @@ typedef __socklen_t socklen_t;
+ #define AF_QIPCRTR	PF_QIPCRTR
+ #define AF_SMC		PF_SMC
+ #define AF_XDP		PF_XDP
++#define AF_MCTP		PF_MCTP
+ #define AF_MAX		PF_MAX
+ 
+ /* Socket level values.  Others are defined in the appropriate headers.
diff --git a/glibc-upstream-2.34-192.patch b/glibc-upstream-2.34-192.patch
new file mode 100644
index 0000000..5a89460
--- /dev/null
+++ b/glibc-upstream-2.34-192.patch
@@ -0,0 +1,27 @@
+commit fd5dbfd1cd98cb2f12f9e9f7004a4d25ab0c977f
+Author: Joseph Myers <joseph@codesourcery.com>
+Date:   Mon Nov 22 15:30:12 2021 +0000
+
+    Update kernel version to 5.15 in tst-mman-consts.py
+    
+    This patch updates the kernel version in the test tst-mman-consts.py
+    to 5.15.  (There are no new MAP_* constants covered by this test in
+    5.15 that need any other header changes.)
+    
+    Tested with build-many-glibcs.py.
+    
+    (cherry picked from commit 5c3ece451d46a7d8721311609bfcb6faafacb39e)
+
+diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py
+index 810433c238f31c25..eeccdfd04dae57ab 100644
+--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py
++++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py
+@@ -33,7 +33,7 @@ def main():
+                         help='C compiler (including options) to use')
+     args = parser.parse_args()
+     linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc)
+-    linux_version_glibc = (5, 14)
++    linux_version_glibc = (5, 15)
+     sys.exit(glibcextract.compare_macro_consts(
+         '#define _GNU_SOURCE 1\n'
+         '#include <sys/mman.h>\n',
diff --git a/glibc-upstream-2.34-193.patch b/glibc-upstream-2.34-193.patch
new file mode 100644
index 0000000..d056d36
--- /dev/null
+++ b/glibc-upstream-2.34-193.patch
@@ -0,0 +1,28 @@
+commit 5146b73d72ced9bab125e986aa99ef5fe2f88475
+Author: Joseph Myers <joseph@codesourcery.com>
+Date:   Mon Dec 20 15:38:32 2021 +0000
+
+    Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h
+    
+    Add the constant ARPHRD_MCTP, from Linux 5.15, to net/if_arp.h, along
+    with ARPHRD_CAN which was added to Linux in version 2.6.25 (commit
+    cd05acfe65ed2cf2db683fa9a6adb8d35635263b, "[CAN]: Allocate protocol
+    numbers for PF_CAN") but apparently missed for glibc at the time.
+    
+    Tested for x86_64.
+    
+    (cherry picked from commit a94d9659cd69dbc70d3494b1cbbbb5a1551675c5)
+
+diff --git a/sysdeps/unix/sysv/linux/net/if_arp.h b/sysdeps/unix/sysv/linux/net/if_arp.h
+index 2a8933cde7cf236d..42910b776660def1 100644
+--- a/sysdeps/unix/sysv/linux/net/if_arp.h
++++ b/sysdeps/unix/sysv/linux/net/if_arp.h
+@@ -95,6 +95,8 @@ struct arphdr
+ #define ARPHRD_ROSE	270
+ #define ARPHRD_X25	271		/* CCITT X.25.  */
+ #define ARPHRD_HWX25	272		/* Boards with X.25 in firmware.  */
++#define ARPHRD_CAN	280		/* Controller Area Network.  */
++#define ARPHRD_MCTP	290
+ #define ARPHRD_PPP	512
+ #define ARPHRD_CISCO	513		/* Cisco HDLC.  */
+ #define ARPHRD_HDLC	ARPHRD_CISCO
diff --git a/glibc-upstream-2.34-194.patch b/glibc-upstream-2.34-194.patch
new file mode 100644
index 0000000..0437f53
--- /dev/null
+++ b/glibc-upstream-2.34-194.patch
@@ -0,0 +1,337 @@
+commit 6af165658d0999ac2c4e9ce88bee020fbc2ee49f
+Author: Joseph Myers <joseph@codesourcery.com>
+Date:   Wed Mar 23 17:11:56 2022 +0000
+
+    Update syscall lists for Linux 5.17
+    
+    Linux 5.17 has one new syscall, set_mempolicy_home_node.  Update
+    syscall-names.list and regenerate the arch-syscall.h headers with
+    build-many-glibcs.py update-syscalls.
+    
+    Tested with build-many-glibcs.py.
+    
+    (cherry picked from commit 8ef9196b26793830515402ea95aca2629f7721ec)
+
+diff --git a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
+index 9905ebedf298954c..4fcb6da80af37e9e 100644
+--- a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
+@@ -236,6 +236,7 @@
+ #define __NR_sendmsg 211
+ #define __NR_sendto 206
+ #define __NR_set_mempolicy 237
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 99
+ #define __NR_set_tid_address 96
+ #define __NR_setdomainname 162
+diff --git a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
+index ee8085be69958b25..0cf74c1a96bb1235 100644
+--- a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
+@@ -391,6 +391,7 @@
+ #define __NR_sendmsg 114
+ #define __NR_sendto 133
+ #define __NR_set_mempolicy 431
++#define __NR_set_mempolicy_home_node 560
+ #define __NR_set_robust_list 466
+ #define __NR_set_tid_address 411
+ #define __NR_setdomainname 166
+diff --git a/sysdeps/unix/sysv/linux/arc/arch-syscall.h b/sysdeps/unix/sysv/linux/arc/arch-syscall.h
+index 1b626d97705d545a..c1207aaa12be6a51 100644
+--- a/sysdeps/unix/sysv/linux/arc/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/arc/arch-syscall.h
+@@ -238,6 +238,7 @@
+ #define __NR_sendmsg 211
+ #define __NR_sendto 206
+ #define __NR_set_mempolicy 237
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 99
+ #define __NR_set_tid_address 96
+ #define __NR_setdomainname 162
+diff --git a/sysdeps/unix/sysv/linux/arm/arch-syscall.h b/sysdeps/unix/sysv/linux/arm/arch-syscall.h
+index 96ef8db9368e7de4..e7ba04c106d8af7d 100644
+--- a/sysdeps/unix/sysv/linux/arm/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/arm/arch-syscall.h
+@@ -302,6 +302,7 @@
+ #define __NR_sendmsg 296
+ #define __NR_sendto 290
+ #define __NR_set_mempolicy 321
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 338
+ #define __NR_set_tid_address 256
+ #define __NR_set_tls 983045
+diff --git a/sysdeps/unix/sysv/linux/csky/arch-syscall.h b/sysdeps/unix/sysv/linux/csky/arch-syscall.h
+index 96910154ed6a5c1b..dc9383758ebc641b 100644
+--- a/sysdeps/unix/sysv/linux/csky/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/csky/arch-syscall.h
+@@ -250,6 +250,7 @@
+ #define __NR_sendmsg 211
+ #define __NR_sendto 206
+ #define __NR_set_mempolicy 237
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 99
+ #define __NR_set_thread_area 244
+ #define __NR_set_tid_address 96
+diff --git a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
+index 36675fd48e6f50c5..767f1287a30b473e 100644
+--- a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
+@@ -289,6 +289,7 @@
+ #define __NR_sendmsg 183
+ #define __NR_sendto 82
+ #define __NR_set_mempolicy 262
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 289
+ #define __NR_set_tid_address 237
+ #define __NR_setdomainname 121
+diff --git a/sysdeps/unix/sysv/linux/i386/arch-syscall.h b/sysdeps/unix/sysv/linux/i386/arch-syscall.h
+index c86ccbda4681066c..1998f0d76a444cac 100644
+--- a/sysdeps/unix/sysv/linux/i386/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/i386/arch-syscall.h
+@@ -323,6 +323,7 @@
+ #define __NR_sendmsg 370
+ #define __NR_sendto 369
+ #define __NR_set_mempolicy 276
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 311
+ #define __NR_set_thread_area 243
+ #define __NR_set_tid_address 258
+diff --git a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
+index d898bce404955ef0..b2eab1b93d70b9de 100644
+--- a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
+@@ -272,6 +272,7 @@
+ #define __NR_sendmsg 1205
+ #define __NR_sendto 1199
+ #define __NR_set_mempolicy 1261
++#define __NR_set_mempolicy_home_node 1474
+ #define __NR_set_robust_list 1298
+ #define __NR_set_tid_address 1233
+ #define __NR_setdomainname 1129
+diff --git a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
+index fe721b809076abeb..5fc3723772f92516 100644
+--- a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
+@@ -310,6 +310,7 @@
+ #define __NR_sendmsg 367
+ #define __NR_sendto 366
+ #define __NR_set_mempolicy 270
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 304
+ #define __NR_set_thread_area 334
+ #define __NR_set_tid_address 253
+diff --git a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
+index 6e10c3661db96a1e..b6e9b007e496cd80 100644
+--- a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
+@@ -326,6 +326,7 @@
+ #define __NR_sendmsg 360
+ #define __NR_sendto 353
+ #define __NR_set_mempolicy 276
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 311
+ #define __NR_set_thread_area 243
+ #define __NR_set_tid_address 258
+diff --git a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
+index 26a6d594a2222f15..b3a3871f8ab8a23e 100644
+--- a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
+@@ -308,6 +308,7 @@
+ #define __NR_sendmsg 4179
+ #define __NR_sendto 4180
+ #define __NR_set_mempolicy 4270
++#define __NR_set_mempolicy_home_node 4450
+ #define __NR_set_robust_list 4309
+ #define __NR_set_thread_area 4283
+ #define __NR_set_tid_address 4252
+diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
+index 83e0d49c5e3ca1bc..b462182723aff286 100644
+--- a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
+@@ -288,6 +288,7 @@
+ #define __NR_sendmsg 6045
+ #define __NR_sendto 6043
+ #define __NR_set_mempolicy 6233
++#define __NR_set_mempolicy_home_node 6450
+ #define __NR_set_robust_list 6272
+ #define __NR_set_thread_area 6246
+ #define __NR_set_tid_address 6213
+diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
+index d6747c542f63202b..a9d6b94572e93001 100644
+--- a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
+@@ -270,6 +270,7 @@
+ #define __NR_sendmsg 5045
+ #define __NR_sendto 5043
+ #define __NR_set_mempolicy 5229
++#define __NR_set_mempolicy_home_node 5450
+ #define __NR_set_robust_list 5268
+ #define __NR_set_thread_area 5242
+ #define __NR_set_tid_address 5212
+diff --git a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
+index 4ee209bc4475ea7d..809a219ef32a45ef 100644
+--- a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
+@@ -250,6 +250,7 @@
+ #define __NR_sendmsg 211
+ #define __NR_sendto 206
+ #define __NR_set_mempolicy 237
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 99
+ #define __NR_set_tid_address 96
+ #define __NR_setdomainname 162
+diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
+index 497299fbc47a708c..627831ebae1b9e90 100644
+--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
+@@ -319,6 +319,7 @@
+ #define __NR_sendmsg 341
+ #define __NR_sendto 335
+ #define __NR_set_mempolicy 261
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 300
+ #define __NR_set_tid_address 232
+ #define __NR_setdomainname 121
+diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
+index e840279f171b10b9..bae597199d79eaad 100644
+--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
+@@ -298,6 +298,7 @@
+ #define __NR_sendmsg 341
+ #define __NR_sendto 335
+ #define __NR_set_mempolicy 261
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 300
+ #define __NR_set_tid_address 232
+ #define __NR_setdomainname 121
+diff --git a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
+index 73ef74c005e5a2bb..bf4be80f8d380963 100644
+--- a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
+@@ -228,6 +228,7 @@
+ #define __NR_sendmsg 211
+ #define __NR_sendto 206
+ #define __NR_set_mempolicy 237
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 99
+ #define __NR_set_tid_address 96
+ #define __NR_setdomainname 162
+diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
+index 919a79ee91177459..d656aedcc2be6009 100644
+--- a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
+@@ -235,6 +235,7 @@
+ #define __NR_sendmsg 211
+ #define __NR_sendto 206
+ #define __NR_set_mempolicy 237
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 99
+ #define __NR_set_tid_address 96
+ #define __NR_setdomainname 162
+diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
+index 005c0ada7aab85a1..57025107e82c9439 100644
+--- a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
+@@ -311,6 +311,7 @@
+ #define __NR_sendmsg 370
+ #define __NR_sendto 369
+ #define __NR_set_mempolicy 270
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 304
+ #define __NR_set_tid_address 252
+ #define __NR_setdomainname 121
+diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
+index 9131fddcc16116e4..72e19c6d569fbf9b 100644
+--- a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
+@@ -278,6 +278,7 @@
+ #define __NR_sendmsg 370
+ #define __NR_sendto 369
+ #define __NR_set_mempolicy 270
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 304
+ #define __NR_set_tid_address 252
+ #define __NR_setdomainname 121
+diff --git a/sysdeps/unix/sysv/linux/sh/arch-syscall.h b/sysdeps/unix/sysv/linux/sh/arch-syscall.h
+index d8fb041568ecb4da..d52b522d9cac87ef 100644
+--- a/sysdeps/unix/sysv/linux/sh/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/sh/arch-syscall.h
+@@ -303,6 +303,7 @@
+ #define __NR_sendmsg 355
+ #define __NR_sendto 349
+ #define __NR_set_mempolicy 276
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 311
+ #define __NR_set_tid_address 258
+ #define __NR_setdomainname 121
+diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
+index 2bc014fe6a1a1f4a..d3f4d8aa3edb4795 100644
+--- a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
+@@ -310,6 +310,7 @@
+ #define __NR_sendmsg 114
+ #define __NR_sendto 133
+ #define __NR_set_mempolicy 305
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 300
+ #define __NR_set_tid_address 166
+ #define __NR_setdomainname 163
+diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
+index 76dbbe595ffe868f..2cc03d7a24453335 100644
+--- a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
+@@ -286,6 +286,7 @@
+ #define __NR_sendmsg 114
+ #define __NR_sendto 133
+ #define __NR_set_mempolicy 305
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 300
+ #define __NR_set_tid_address 166
+ #define __NR_setdomainname 163
+diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
+index 0bc2af37dfa1eeb5..e2743c649586d97a 100644
+--- a/sysdeps/unix/sysv/linux/syscall-names.list
++++ b/sysdeps/unix/sysv/linux/syscall-names.list
+@@ -21,8 +21,8 @@
+ # This file can list all potential system calls.  The names are only
+ # used if the installed kernel headers also provide them.
+ 
+-# The list of system calls is current as of Linux 5.16.
+-kernel 5.16
++# The list of system calls is current as of Linux 5.17.
++kernel 5.17
+ 
+ FAST_atomic_update
+ FAST_cmpxchg
+@@ -523,6 +523,7 @@ sendmmsg
+ sendmsg
+ sendto
+ set_mempolicy
++set_mempolicy_home_node
+ set_robust_list
+ set_thread_area
+ set_tid_address
+diff --git a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
+index 28558279b48a1ef4..b4ab892ec183e32d 100644
+--- a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
+@@ -278,6 +278,7 @@
+ #define __NR_sendmsg 46
+ #define __NR_sendto 44
+ #define __NR_set_mempolicy 238
++#define __NR_set_mempolicy_home_node 450
+ #define __NR_set_robust_list 273
+ #define __NR_set_thread_area 205
+ #define __NR_set_tid_address 218
+diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
+index c1ab8ec45e8b8fd3..772559c87b3625b8 100644
+--- a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
++++ b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
+@@ -270,6 +270,7 @@
+ #define __NR_sendmsg 1073742342
+ #define __NR_sendto 1073741868
+ #define __NR_set_mempolicy 1073742062
++#define __NR_set_mempolicy_home_node 1073742274
+ #define __NR_set_robust_list 1073742354
+ #define __NR_set_thread_area 1073742029
+ #define __NR_set_tid_address 1073742042
diff --git a/glibc-upstream-2.34-195.patch b/glibc-upstream-2.34-195.patch
new file mode 100644
index 0000000..d2b7afb
--- /dev/null
+++ b/glibc-upstream-2.34-195.patch
@@ -0,0 +1,27 @@
+commit 81181ba5d916fc49bd737f603e28a3c2dc8430b4
+Author: Joseph Myers <joseph@codesourcery.com>
+Date:   Wed Feb 16 14:19:24 2022 +0000
+
+    Update kernel version to 5.16 in tst-mman-consts.py
+    
+    This patch updates the kernel version in the test tst-mman-consts.py
+    to 5.16.  (There are no new MAP_* constants covered by this test in
+    5.16 that need any other header changes.)
+    
+    Tested with build-many-glibcs.py.
+    
+    (cherry picked from commit 790a607e234aa10d4b977a1b80aebe8a2acac970)
+
+diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py
+index eeccdfd04dae57ab..8102d80b6660e523 100644
+--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py
++++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py
+@@ -33,7 +33,7 @@ def main():
+                         help='C compiler (including options) to use')
+     args = parser.parse_args()
+     linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc)
+-    linux_version_glibc = (5, 15)
++    linux_version_glibc = (5, 16)
+     sys.exit(glibcextract.compare_macro_consts(
+         '#define _GNU_SOURCE 1\n'
+         '#include <sys/mman.h>\n',
diff --git a/glibc-upstream-2.34-196.patch b/glibc-upstream-2.34-196.patch
new file mode 100644
index 0000000..5294eea
--- /dev/null
+++ b/glibc-upstream-2.34-196.patch
@@ -0,0 +1,27 @@
+commit 0499c3a95fb864284fef36d3e9c5a54f6646b2db
+Author: Joseph Myers <joseph@codesourcery.com>
+Date:   Thu Mar 24 15:35:27 2022 +0000
+
+    Update kernel version to 5.17 in tst-mman-consts.py
+    
+    This patch updates the kernel version in the test tst-mman-consts.py
+    to 5.17.  (There are no new MAP_* constants covered by this test in
+    5.17 that need any other header changes.)
+    
+    Tested with build-many-glibcs.py.
+    
+    (cherry picked from commit 23808a422e6036accaba7236fd3b9a0d7ab7e8ee)
+
+diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py
+index 8102d80b6660e523..724c7375c3a1623b 100644
+--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py
++++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py
+@@ -33,7 +33,7 @@ def main():
+                         help='C compiler (including options) to use')
+     args = parser.parse_args()
+     linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc)
+-    linux_version_glibc = (5, 16)
++    linux_version_glibc = (5, 17)
+     sys.exit(glibcextract.compare_macro_consts(
+         '#define _GNU_SOURCE 1\n'
+         '#include <sys/mman.h>\n',
diff --git a/glibc-upstream-2.34-197.patch b/glibc-upstream-2.34-197.patch
new file mode 100644
index 0000000..afe47ec
--- /dev/null
+++ b/glibc-upstream-2.34-197.patch
@@ -0,0 +1,26 @@
+commit f858bc309315a03ff6b1a048f59405c159d23430
+Author: Joseph Myers <joseph@codesourcery.com>
+Date:   Mon Feb 21 22:49:36 2022 +0000
+
+    Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h
+    
+    Linux 5.16 adds constants SOL_MPTCP and SOL_MCTP to the getsockopt /
+    setsockopt levels; add these constants to bits/socket.h.
+    
+    Tested for x86_64.
+    
+    (cherry picked from commit fdc1ae67fef27eea1445bab4bdfe2f0fb3bc7aa1)
+
+diff --git a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h
+index 7bb9e863d7329da9..c81fab840918924e 100644
+--- a/sysdeps/unix/sysv/linux/bits/socket.h
++++ b/sysdeps/unix/sysv/linux/bits/socket.h
+@@ -169,6 +169,8 @@ typedef __socklen_t socklen_t;
+ #define SOL_KCM		281
+ #define SOL_TLS		282
+ #define SOL_XDP		283
++#define SOL_MPTCP	284
++#define SOL_MCTP	285
+ 
+ /* Maximum queue length specifiable by listen.  */
+ #define SOMAXCONN	4096
diff --git a/glibc-upstream-2.34-198.patch b/glibc-upstream-2.34-198.patch
new file mode 100644
index 0000000..67ab10c
--- /dev/null
+++ b/glibc-upstream-2.34-198.patch
@@ -0,0 +1,21 @@
+commit c108e87026d61d6744e3e55704e0bea937243f5a
+Author: Szabolcs Nagy <szabolcs.nagy@arm.com>
+Date:   Tue Dec 14 11:15:07 2021 +0000
+
+    aarch64: Add HWCAP2_ECV from Linux 5.16
+    
+    Indicates the availability of enhanced counter virtualization extension
+    of armv8.6-a with self-synchronized virtual counter CNTVCTSS_EL0 usable
+    in userspace.
+    
+    (cherry picked from commit 5a1be8ebdf6f02d4efec6e5f12ad06db17511f90)
+
+diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h
+index 30fda0a4a347695e..04cc762015a7230a 100644
+--- a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h
++++ b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h
+@@ -74,3 +74,4 @@
+ #define HWCAP2_RNG		(1 << 16)
+ #define HWCAP2_BTI		(1 << 17)
+ #define HWCAP2_MTE		(1 << 18)
++#define HWCAP2_ECV		(1 << 19)
diff --git a/glibc-upstream-2.34-199.patch b/glibc-upstream-2.34-199.patch
new file mode 100644
index 0000000..02675fc
--- /dev/null
+++ b/glibc-upstream-2.34-199.patch
@@ -0,0 +1,21 @@
+commit 97cb8227b864b8ea0d99a4a50e4163baad3e1c72
+Author: Joseph Myers <joseph@codesourcery.com>
+Date:   Mon Mar 28 13:16:48 2022 +0000
+
+    Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h
+    
+    Add the new HWCAP2_AFP and HWCAP2_RPRES constants from Linux 5.17.
+    Tested with build-many-glibcs.py for aarch64-linux-gnu.
+    
+    (cherry picked from commit 866c599182e87f116440b5d854f9e99533c48eb3)
+
+diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h
+index 04cc762015a7230a..9a5c4116b3fe9903 100644
+--- a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h
++++ b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h
+@@ -75,3 +75,5 @@
+ #define HWCAP2_BTI		(1 << 17)
+ #define HWCAP2_MTE		(1 << 18)
+ #define HWCAP2_ECV		(1 << 19)
++#define HWCAP2_AFP		(1 << 20)
++#define HWCAP2_RPRES		(1 << 21)
diff --git a/glibc-upstream-2.34-200.patch b/glibc-upstream-2.34-200.patch
new file mode 100644
index 0000000..7ad14c9
--- /dev/null
+++ b/glibc-upstream-2.34-200.patch
@@ -0,0 +1,29 @@
+commit 31af92b9c8cf753992d45c801a855a02060afc08
+Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
+Date:   Wed May 4 15:56:47 2022 +0530
+
+    manual: Clarify that abbreviations of long options are allowed
+    
+    The man page and code comments clearly state that abbreviations of long
+    option names are recognized correctly as long as they are unique.
+    Document this fact in the glibc manual as well.
+    
+    Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
+    Reviewed-by: Florian Weimer <fweimer@redhat.com>
+    Reviewed-by: Andreas Schwab <schwab@linux-m68k.org>
+    (cherry picked from commit db1efe02c9f15affc3908d6ae73875b82898a489)
+
+diff --git a/manual/getopt.texi b/manual/getopt.texi
+index 5485fc46946631f7..b4c0b15ac2060560 100644
+--- a/manual/getopt.texi
++++ b/manual/getopt.texi
+@@ -250,7 +250,8 @@ option, and stores the option's argument (if it has one) in @code{optarg}.
+ 
+ When @code{getopt_long} encounters a long option, it takes actions based
+ on the @code{flag} and @code{val} fields of the definition of that
+-option.
++option.  The option name may be abbreviated as long as the abbreviation is
++unique.
+ 
+ If @code{flag} is a null pointer, then @code{getopt_long} returns the
+ contents of @code{val} to indicate which option it found.  You should
diff --git a/glibc-upstream-2.34-201.patch b/glibc-upstream-2.34-201.patch
new file mode 100644
index 0000000..68ca969
--- /dev/null
+++ b/glibc-upstream-2.34-201.patch
@@ -0,0 +1,1789 @@
+commit 0d5b36c8cc15f064e302d29692853f8a760e1547
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Jan 10 15:35:38 2022 -0600
+
+    x86: Optimize strcmp-avx2.S
+    
+    Optimizations are primarily to the loop logic and how the page cross
+    logic interacts with the loop.
+    
+    The page cross logic is at times more expensive for short strings near
+    the end of a page but not crossing the page. This is done to retest
+    the page cross conditions with a non-faulty check and to improve the
+    logic for entering the loop afterwards. This is only in particular cases,
+    however, and is generally made up for by more than 10x improvements on
+    the transition from the page cross -> loop case.
+    
+    The non-page cross cases are improved most for smaller sizes [0, 128]
+    and come out about even for (128, 4096]. The loop page cross logic is
+    improved so some more significant speedup is seen there as well.
+    
+    test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+    
+    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    (cherry picked from commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index fa70c994fc25dfd8..a0d1c65db11028bc 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -26,35 +26,57 @@
+ 
+ # define PAGE_SIZE	4096
+ 
+-/* VEC_SIZE = Number of bytes in a ymm register */
++	/* VEC_SIZE = Number of bytes in a ymm register.  */
+ # define VEC_SIZE	32
+ 
+-/* Shift for dividing by (VEC_SIZE * 4).  */
+-# define DIVIDE_BY_VEC_4_SHIFT	7
+-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+-# endif
++# define VMOVU	vmovdqu
++# define VMOVA	vmovdqa
+ 
+ # ifdef USE_AS_WCSCMP
+-/* Compare packed dwords.  */
++	/* Compare packed dwords.  */
+ #  define VPCMPEQ	vpcmpeqd
+-/* Compare packed dwords and store minimum.  */
++	/* Compare packed dwords and store minimum.  */
+ #  define VPMINU	vpminud
+-/* 1 dword char == 4 bytes.  */
++	/* 1 dword char == 4 bytes.  */
+ #  define SIZE_OF_CHAR	4
+ # else
+-/* Compare packed bytes.  */
++	/* Compare packed bytes.  */
+ #  define VPCMPEQ	vpcmpeqb
+-/* Compare packed bytes and store minimum.  */
++	/* Compare packed bytes and store minimum.  */
+ #  define VPMINU	vpminub
+-/* 1 byte char == 1 byte.  */
++	/* 1 byte char == 1 byte.  */
+ #  define SIZE_OF_CHAR	1
+ # endif
+ 
++# ifdef USE_AS_STRNCMP
++#  define LOOP_REG	r9d
++#  define LOOP_REG64	r9
++
++#  define OFFSET_REG8	r9b
++#  define OFFSET_REG	r9d
++#  define OFFSET_REG64	r9
++# else
++#  define LOOP_REG	edx
++#  define LOOP_REG64	rdx
++
++#  define OFFSET_REG8	dl
++#  define OFFSET_REG	edx
++#  define OFFSET_REG64	rdx
++# endif
++
+ # ifndef VZEROUPPER
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# if defined USE_AS_STRNCMP
++#  define VEC_OFFSET	0
++# else
++#  define VEC_OFFSET	(-VEC_SIZE)
++# endif
++
++# define xmmZERO	xmm15
++# define ymmZERO	ymm15
++
+ # ifndef SECTION
+ #  define SECTION(p)	p##.avx
+ # endif
+@@ -79,783 +101,1049 @@
+    the maximum offset is reached before a difference is found, zero is
+    returned.  */
+ 
+-	.section SECTION(.text),"ax",@progbits
+-ENTRY (STRCMP)
++	.section SECTION(.text), "ax", @progbits
++ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+-	/* Check for simple cases (0 or 1) in offset.  */
++#  ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %rdx
++#  endif
+ 	cmp	$1, %RDX_LP
+-	je	L(char0)
+-	jb	L(zero)
++	/* Signed comparison intentional. We use this branch to also
++	   test cases where length >= 2^63. These very large sizes can be
++	   handled with strcmp as there is no way for that length to
++	   actually bound the buffer.  */
++	jle	L(one_or_less)
+ #  ifdef USE_AS_WCSCMP
+-#  ifndef __ILP32__
+ 	movq	%rdx, %rcx
+-	/* Check if length could overflow when multiplied by
+-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
+-	   overflow cases as well as redirect cases where its impossible to
+-	   length to bound a valid memory region. In these cases just use
+-	   'wcscmp'.  */
++
++	/* Multiplying length by sizeof(wchar_t) can result in overflow.
++	   Check if that is possible. All cases where overflow are possible
++	   are cases where length is large enough that it can never be a
++	   bound on valid memory so just use wcscmp.  */
+ 	shrq	$56, %rcx
+-	jnz	OVERFLOW_STRCMP
+-#  endif
+-	/* Convert units: from wide to byte char.  */
+-	shl	$2, %RDX_LP
++	jnz	__wcscmp_avx2
++
++	leaq	(, %rdx, 4), %rdx
+ #  endif
+-	/* Register %r11 tracks the maximum offset.  */
+-	mov	%RDX_LP, %R11_LP
+ # endif
++	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+ 	movl	%edi, %eax
+-	xorl	%edx, %edx
+-	/* Make %xmm7 (%ymm7) all zeros in this function.  */
+-	vpxor	%xmm7, %xmm7, %xmm7
+ 	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+-	jg	L(cross_page)
+-	/* Start comparing 4 vectors.  */
+-	vmovdqu	(%rdi), %ymm1
+-	VPCMPEQ	(%rsi), %ymm1, %ymm0
+-	VPMINU	%ymm1, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-	vpmovmskb %ymm0, %ecx
+-	testl	%ecx, %ecx
+-	je	L(next_3_vectors)
+-	tzcntl	%ecx, %edx
++	sall	$20, %eax
++	/* Check if s1 or s2 may cross a page  in next 4x VEC loads.  */
++	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
++	ja	L(page_cross)
++
++L(no_page_cross):
++	/* Safe to compare 4x vectors.  */
++	VMOVU	(%rdi), %ymm0
++	/* 1s where s1 and s2 equal.  */
++	VPCMPEQ	(%rsi), %ymm0, %ymm1
++	/* 1s at null CHAR.  */
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	/* 1s where s1 and s2 equal AND not null CHAR.  */
++	vpandn	%ymm1, %ymm2, %ymm1
++
++	/* All 1s -> keep going, any 0s -> return.  */
++	vpmovmskb %ymm1, %ecx
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx) is after the maximum
+-	   offset (%r11).   */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$VEC_SIZE, %rdx
++	jbe	L(vec_0_test_len)
+ # endif
++
++	/* All 1s represents all equals. incl will overflow to zero in
++	   all equals case. Otherwise 1s will carry until position of first
++	   mismatch.  */
++	incl	%ecx
++	jz	L(more_3x_vec)
++
++	.p2align 4,, 4
++L(return_vec_0):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	je	L(return)
+-L(wcscmp_return):
++	cmpl	(%rsi, %rcx), %edx
++	je	L(ret0)
+ 	setl	%al
+ 	negl	%eax
+ 	orl	$1, %eax
+-L(return):
+ # else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
++L(ret0):
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+-	.p2align 4
+-L(return_vec_size):
+-	tzcntl	%ecx, %edx
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
+-	   the maximum offset (%r11).  */
+-	addq	$VEC_SIZE, %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++	.p2align 4,, 8
++L(vec_0_test_len):
++	notl	%ecx
++	bzhil	%edx, %ecx, %eax
++	jnz	L(return_vec_0)
++	/* Align if will cross fetch block.  */
++	.p2align 4,, 2
++L(ret_zero):
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
++	VZEROUPPER_RETURN
++
++	.p2align 4,, 5
++L(one_or_less):
++	jb	L(ret_zero)
+ #  ifdef USE_AS_WCSCMP
++	/* 'nbe' covers the case where length is negative (large
++	   unsigned).  */
++	jnbe	__wcscmp_avx2
++	movl	(%rdi), %edx
+ 	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rdx), %ecx
+-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	(%rsi), %edx
++	je	L(ret1)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ #  else
+-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	/* 'nbe' covers the case where length is negative (large
++	   unsigned).  */
++
++	jnbe	__strcmp_avx2
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	subl	%ecx, %eax
+ #  endif
++L(ret1):
++	ret
+ # endif
+-	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(return_2_vec_size):
+-	tzcntl	%ecx, %edx
++	.p2align 4,, 10
++L(return_vec_1):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
+-	   after the maximum offset (%r11).  */
+-	addq	$(VEC_SIZE * 2), %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++	/* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of
++	   overflow.  */
++	addq	$-VEC_SIZE, %rdx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	VEC_SIZE(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	VEC_SIZE(%rsi, %rcx), %edx
++	je	L(ret2)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
++L(ret2):
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(return_3_vec_size):
+-	tzcntl	%ecx, %edx
++	.p2align 4,, 10
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
+-	   after the maximum offset (%r11).  */
+-	addq	$(VEC_SIZE * 3), %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++L(return_vec_3):
++	salq	$32, %rcx
++# endif
++
++L(return_vec_2):
++# ifndef USE_AS_STRNCMP
++	tzcntl	%ecx, %ecx
++# else
++	tzcntq	%rcx, %rcx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
++	je	L(ret3)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ # else
++	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
++L(ret3):
++	VZEROUPPER_RETURN
++
++# ifndef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_vec_3):
++	tzcntl	%ecx, %ecx
+ #  ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
++	je	L(ret4)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ #  else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ #  endif
+-# endif
++L(ret4):
+ 	VZEROUPPER_RETURN
++# endif
++
++	.p2align 4,, 10
++L(more_3x_vec):
++	/* Safe to compare 4x vectors.  */
++	VMOVU	VEC_SIZE(%rdi), %ymm0
++	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_1)
++
++# ifdef USE_AS_STRNCMP
++	subq	$(VEC_SIZE * 2), %rdx
++	jbe	L(ret_zero)
++# endif
++
++	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
++	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_2)
++
++	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
++	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_3)
+ 
+-	.p2align 4
+-L(next_3_vectors):
+-	vmovdqu	VEC_SIZE(%rdi), %ymm6
+-	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
+-	VPMINU	%ymm6, %ymm3, %ymm3
+-	VPCMPEQ	%ymm7, %ymm3, %ymm3
+-	vpmovmskb %ymm3, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(return_vec_size)
+-	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
+-	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
+-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
+-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
+-	VPMINU	%ymm5, %ymm2, %ymm2
+-	VPCMPEQ	%ymm4, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm2, %ymm2
+-	vpmovmskb %ymm2, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(return_2_vec_size)
+-	VPMINU	%ymm4, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-	vpmovmskb %ymm0, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(return_3_vec_size)
+-L(main_loop_header):
+-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+-	movl	$PAGE_SIZE, %ecx
+-	/* Align load via RAX.  */
+-	andq	$-(VEC_SIZE * 4), %rdx
+-	subq	%rdi, %rdx
+-	leaq	(%rdi, %rdx), %rax
+ # ifdef USE_AS_STRNCMP
+-	/* Starting from this point, the maximum offset, or simply the
+-	   'offset', DECREASES by the same amount when base pointers are
+-	   moved forward.  Return 0 when:
+-	     1) On match: offset <= the matched vector index.
+-	     2) On mistmach, offset is before the mistmatched index.
++	cmpq	$(VEC_SIZE * 2), %rdx
++	jbe	L(ret_zero)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	/* any non-zero positive value that doesn't inference with 0x1.
+ 	 */
+-	subq	%rdx, %r11
+-	jbe	L(zero)
+-# endif
+-	addq	%rsi, %rdx
+-	movq	%rdx, %rsi
+-	andl	$(PAGE_SIZE - 1), %esi
+-	/* Number of bytes before page crossing.  */
+-	subq	%rsi, %rcx
+-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
+-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
+-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
+-	movl	%ecx, %esi
+-	jmp	L(loop_start)
++	movl	$2, %r8d
+ 
++# else
++	xorl	%r8d, %r8d
++# endif
++
++	/* The prepare labels are various entry points from the page
++	   cross logic.  */
++L(prepare_loop):
++
++# ifdef USE_AS_STRNCMP
++	/* Store N + (VEC_SIZE * 4) and place check at the begining of
++	   the loop.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
++# endif
++L(prepare_loop_no_len):
++
++	/* Align s1 and adjust s2 accordingly.  */
++	subq	%rdi, %rsi
++	andq	$-(VEC_SIZE * 4), %rdi
++	addq	%rdi, %rsi
++
++# ifdef USE_AS_STRNCMP
++	subq	%rdi, %rdx
++# endif
++
++L(prepare_loop_aligned):
++	/* eax stores distance from rsi to next page cross. These cases
++	   need to be handled specially as the 4x loop could potentially
++	   read memory past the length of s1 or s2 and across a page
++	   boundary.  */
++	movl	$-(VEC_SIZE * 4), %eax
++	subl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++
++	/* Loop 4x comparisons at a time.  */
+ 	.p2align 4
+ L(loop):
++
++	/* End condition for strncmp.  */
+ # ifdef USE_AS_STRNCMP
+-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
+-	   the maximum offset (%r11) by the same amount.  */
+-	subq	$(VEC_SIZE * 4), %r11
+-	jbe	L(zero)
+-# endif
+-	addq	$(VEC_SIZE * 4), %rax
+-	addq	$(VEC_SIZE * 4), %rdx
+-L(loop_start):
+-	testl	%esi, %esi
+-	leal	-1(%esi), %esi
+-	je	L(loop_cross_page)
+-L(back_to_loop):
+-	/* Main loop, comparing 4 vectors are a time.  */
+-	vmovdqa	(%rax), %ymm0
+-	vmovdqa	VEC_SIZE(%rax), %ymm3
+-	VPCMPEQ	(%rdx), %ymm0, %ymm4
+-	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
+-	VPMINU	%ymm0, %ymm4, %ymm4
+-	VPMINU	%ymm3, %ymm1, %ymm1
+-	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
+-	VPMINU	%ymm1, %ymm4, %ymm0
+-	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
+-	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
+-	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
+-	VPMINU	%ymm2, %ymm5, %ymm5
+-	VPMINU	%ymm3, %ymm6, %ymm6
+-	VPMINU	%ymm5, %ymm0, %ymm0
+-	VPMINU	%ymm6, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-
+-	/* Test each mask (32 bits) individually because for VEC_SIZE
+-	   == 32 is not possible to OR the four masks and keep all bits
+-	   in a 64-bit integer register, differing from SSE2 strcmp
+-	   where ORing is possible.  */
+-	vpmovmskb %ymm0, %ecx
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(ret_zero)
++# endif
++
++	subq	$-(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rsi
++
++	/* Check if rsi loads will cross a page boundary.  */
++	addl	$-(VEC_SIZE * 4), %eax
++	jnb	L(page_cross_during_loop)
++
++	/* Loop entry after handling page cross during loop.  */
++L(loop_skip_page_cross_check):
++	VMOVA	(VEC_SIZE * 0)(%rdi), %ymm0
++	VMOVA	(VEC_SIZE * 1)(%rdi), %ymm2
++	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
++	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
++
++	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
++	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
++
++	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
++	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
++	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
++
++
++	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
++	   zero.  */
++	vpand	%ymm0, %ymm1, %ymm1
++
++
++	vpand	%ymm2, %ymm3, %ymm3
++	vpand	%ymm4, %ymm5, %ymm5
++	vpand	%ymm6, %ymm7, %ymm7
++
++	VPMINU	%ymm1, %ymm3, %ymm3
++	VPMINU	%ymm5, %ymm7, %ymm7
++
++	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
++	VPMINU	%ymm3, %ymm7, %ymm7
++
++	/* If any 0 CHAR then done.  */
++	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
++	vpmovmskb %ymm7, %LOOP_REG
++	testl	%LOOP_REG, %LOOP_REG
++	jz	L(loop)
++
++	/* Find which VEC has the mismatch of end of string.  */
++	VPCMPEQ	%ymm1, %ymmZERO, %ymm1
++	vpmovmskb %ymm1, %ecx
+ 	testl	%ecx, %ecx
+-	je	L(loop)
+-	VPCMPEQ	%ymm7, %ymm4, %ymm0
+-	vpmovmskb %ymm0, %edi
+-	testl	%edi, %edi
+-	je	L(test_vec)
+-	tzcntl	%edi, %ecx
++	jnz	L(return_vec_0_end)
++
++
++	VPCMPEQ	%ymm3, %ymmZERO, %ymm3
++	vpmovmskb %ymm3, %ecx
++	testl	%ecx, %ecx
++	jnz	L(return_vec_1_end)
++
++L(return_vec_2_3_end):
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	subq	$(VEC_SIZE * 2), %rdx
++	jbe	L(ret_zero_end)
++# endif
++
++	VPCMPEQ	%ymm5, %ymmZERO, %ymm5
++	vpmovmskb %ymm5, %ecx
++	testl	%ecx, %ecx
++	jnz	L(return_vec_2_end)
++
++	/* LOOP_REG contains matches for null/mismatch from the loop. If
++	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
++	   must entirely be from VEC 3 which is fully represented by
++	   LOOP_REG.  */
++	tzcntl	%LOOP_REG, %LOOP_REG
++
++# ifdef USE_AS_STRNCMP
++	subl	$-(VEC_SIZE), %LOOP_REG
++	cmpq	%LOOP_REG64, %rdx
++	jbe	L(ret_zero_end)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
++	je	L(ret5)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
++	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret5):
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(test_vec):
+ # ifdef USE_AS_STRNCMP
+-	/* The first vector matched.  Return 0 if the maximum offset
+-	   (%r11) <= VEC_SIZE.  */
+-	cmpq	$VEC_SIZE, %r11
+-	jbe	L(zero)
++	.p2align 4,, 2
++L(ret_zero_end):
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
+ # endif
+-	VPCMPEQ	%ymm7, %ymm1, %ymm1
+-	vpmovmskb %ymm1, %ecx
+-	testl	%ecx, %ecx
+-	je	L(test_2_vec)
+-	tzcntl	%ecx, %edi
++
++
++	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
++	   they use the value of `r8` to negate the return value. This is
++	   because the page cross logic can swap `rdi` and `rsi`.  */
++	.p2align 4,, 10
+ # ifdef USE_AS_STRNCMP
+-	addq	$VEC_SIZE, %rdi
+-	cmpq	%rdi, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++L(return_vec_1_end):
++	salq	$32, %rcx
++# endif
++L(return_vec_0_end):
++# ifndef USE_AS_STRNCMP
++	tzcntl	%ecx, %ecx
++# else
++	tzcntq	%rcx, %rcx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero_end)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rdi), %ecx
+-	cmpl	(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rdi), %eax
+-	movzbl	(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(%rsi, %rcx), %edx
++	je	L(ret6)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
++# endif
++L(ret6):
++	VZEROUPPER_RETURN
++
++# ifndef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_vec_1_end):
++	tzcntl	%ecx, %ecx
+ #  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	movl	VEC_SIZE(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rsi, %rdi), %ecx
+-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	VEC_SIZE(%rsi, %rcx), %edx
++	je	L(ret7)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ #  else
+-	movzbl	VEC_SIZE(%rax, %rdi), %eax
+-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
+-	subl	%edx, %eax
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ #  endif
+-# endif
++L(ret7):
+ 	VZEROUPPER_RETURN
++# endif
+ 
+-	.p2align 4
+-L(test_2_vec):
++	.p2align 4,, 10
++L(return_vec_2_end):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_STRNCMP
+-	/* The first 2 vectors matched.  Return 0 if the maximum offset
+-	   (%r11) <= 2 * VEC_SIZE.  */
+-	cmpq	$(VEC_SIZE * 2), %r11
+-	jbe	L(zero)
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero_page_cross)
+ # endif
+-	VPCMPEQ	%ymm7, %ymm5, %ymm5
+-	vpmovmskb %ymm5, %ecx
+-	testl	%ecx, %ecx
+-	je	L(test_3_vec)
+-	tzcntl	%ecx, %edi
+-# ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 2), %rdi
+-	cmpq	%rdi, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++# ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rdi), %ecx
+-	cmpl	(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rdi), %eax
+-	movzbl	(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
++	je	L(ret11)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
+-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
+-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
++	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret11):
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(test_3_vec):
++
++	/* Page cross in rsi in next 4x VEC.  */
++
++	/* TODO: Improve logic here.  */
++	.p2align 4,, 10
++L(page_cross_during_loop):
++	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
++
++	/* Optimistically rsi and rdi and both aligned inwhich case we
++	   don't need any logic here.  */
++	cmpl	$-(VEC_SIZE * 4), %eax
++	/* Don't adjust eax before jumping back to loop and we will
++	   never hit page cross case again.  */
++	je	L(loop_skip_page_cross_check)
++
++	/* Check if we can safely load a VEC.  */
++	cmpl	$-(VEC_SIZE * 3), %eax
++	jle	L(less_1x_vec_till_page_cross)
++
++	VMOVA	(%rdi), %ymm0
++	VPCMPEQ	(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_0_end)
++
++	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
++	cmpl	$-(VEC_SIZE * 2), %eax
++	jg	L(more_2x_vec_till_page_cross)
++
++	.p2align 4,, 4
++L(less_1x_vec_till_page_cross):
++	subl	$-(VEC_SIZE * 4), %eax
++	/* Guranteed safe to read from rdi - VEC_SIZE here. The only
++	   concerning case is first iteration if incoming s1 was near start
++	   of a page and s2 near end. If s1 was near the start of the page
++	   we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
++	   to read back -VEC_SIZE. If rdi is truly at the start of a page
++	   here, it means the previous page (rdi - VEC_SIZE) has already
++	   been loaded earlier so must be valid.  */
++	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
++	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++
++	/* Mask of potentially valid bits. The lower bits can be out of
++	   range comparisons (but safe regarding page crosses).  */
++	movl	$-1, %r10d
++	shlxl	%esi, %r10d, %r10d
++	notl	%ecx
++
+ # ifdef USE_AS_STRNCMP
+-	/* The first 3 vectors matched.  Return 0 if the maximum offset
+-	   (%r11) <= 3 * VEC_SIZE.  */
+-	cmpq	$(VEC_SIZE * 3), %r11
+-	jbe	L(zero)
+-# endif
+-	VPCMPEQ	%ymm7, %ymm6, %ymm6
+-	vpmovmskb %ymm6, %esi
+-	tzcntl	%esi, %ecx
++	cmpq	%rax, %rdx
++	jbe	L(return_page_cross_end_check)
++# endif
++	movl	%eax, %OFFSET_REG
++	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
++
++	andl	%r10d, %ecx
++	jz	L(loop_skip_page_cross_check)
++
++	.p2align 4,, 3
++L(return_page_cross_end):
++	tzcntl	%ecx, %ecx
++
+ # ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 3), %rcx
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %esi
+-	cmpl	(%rdx, %rcx), %esi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	leal	-VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
++L(return_page_cross_cmp_mem):
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	addl	%OFFSET_REG, %ecx
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	VEC_OFFSET(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
+-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
++	je	L(ret8)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
++# else
++	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
++	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret8):
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(loop_cross_page):
+-	xorl	%r10d, %r10d
+-	movq	%rdx, %rcx
+-	/* Align load via RDX.  We load the extra ECX bytes which should
+-	   be ignored.  */
+-	andl	$((VEC_SIZE * 4) - 1), %ecx
+-	/* R10 is -RCX.  */
+-	subq	%rcx, %r10
+-
+-	/* This works only if VEC_SIZE * 2 == 64. */
+-# if (VEC_SIZE * 2) != 64
+-#  error (VEC_SIZE * 2) != 64
+-# endif
+-
+-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
+-	cmpl	$(VEC_SIZE * 2), %ecx
+-	jge	L(loop_cross_page_2_vec)
+-
+-	vmovdqu	(%rax, %r10), %ymm2
+-	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
+-	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
+-	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
+-	VPMINU	%ymm2, %ymm0, %ymm0
+-	VPMINU	%ymm3, %ymm1, %ymm1
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm1, %ymm1
+-
+-	vpmovmskb %ymm0, %edi
+-	vpmovmskb %ymm1, %esi
+-
+-	salq	$32, %rsi
+-	xorq	%rsi, %rdi
+-
+-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
+-	shrq	%cl, %rdi
+-
+-	testq	%rdi, %rdi
+-	je	L(loop_cross_page_2_vec)
+-	tzcntq	%rdi, %rcx
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	.p2align 4,, 10
++L(return_page_cross_end_check):
++	tzcntl	%ecx, %ecx
++	leal	-VEC_SIZE(%rax, %rcx), %ecx
++	cmpl	%ecx, %edx
++	ja	L(return_page_cross_cmp_mem)
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# endif
+ 	VZEROUPPER_RETURN
++# endif
+ 
+-	.p2align 4
+-L(loop_cross_page_2_vec):
+-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
+-	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
+-	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
+-	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
+-	VPMINU	%ymm2, %ymm5, %ymm5
+-	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
+-	VPCMPEQ	%ymm7, %ymm5, %ymm5
+-	VPMINU	%ymm3, %ymm6, %ymm6
+-	VPCMPEQ	%ymm7, %ymm6, %ymm6
+-
+-	vpmovmskb %ymm5, %edi
+-	vpmovmskb %ymm6, %esi
+-
+-	salq	$32, %rsi
+-	xorq	%rsi, %rdi
+ 
+-	xorl	%r8d, %r8d
+-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
+-	subl	$(VEC_SIZE * 2), %ecx
+-	jle	1f
+-	/* Skip ECX bytes.  */
+-	shrq	%cl, %rdi
+-	/* R8 has number of bytes skipped.  */
+-	movl	%ecx, %r8d
+-1:
+-	/* Before jumping back to the loop, set ESI to the number of
+-	   VEC_SIZE * 4 blocks before page crossing.  */
+-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
+-
+-	testq	%rdi, %rdi
++	.p2align 4,, 10
++L(more_2x_vec_till_page_cross):
++	/* If more 2x vec till cross we will complete a full loop
++	   iteration here.  */
++
++	VMOVU	VEC_SIZE(%rdi), %ymm0
++	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_1_end)
++
+ # ifdef USE_AS_STRNCMP
+-	/* At this point, if %rdi value is 0, it already tested
+-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
+-	   checks whether strncmp maximum offset reached or not.  */
+-	je	L(string_nbyte_offset_check)
+-# else
+-	je	L(back_to_loop)
++	cmpq	$(VEC_SIZE * 2), %rdx
++	jbe	L(ret_zero_in_loop_page_cross)
+ # endif
+-	tzcntq	%rdi, %rcx
+-	addq	%r10, %rcx
+-	/* Adjust for number of bytes skipped.  */
+-	addq	%r8, %rcx
++
++	subl	$-(VEC_SIZE * 4), %eax
++
++	/* Safe to include comparisons from lower bytes.  */
++	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
++	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_page_cross_0)
++
++	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
++	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_page_cross_1)
++
+ # ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 2), %rcx
+-	subq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	/* Must check length here as length might proclude reading next
++	   page.  */
++	cmpq	%rax, %rdx
++	jbe	L(ret_zero_in_loop_page_cross)
++# endif
++
++	/* Finish the loop.  */
++	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
++	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
++
++	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
++	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
++	vpand	%ymm4, %ymm5, %ymm5
++	vpand	%ymm6, %ymm7, %ymm7
++	VPMINU	%ymm5, %ymm7, %ymm7
++	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
++	vpmovmskb %ymm7, %LOOP_REG
++	testl	%LOOP_REG, %LOOP_REG
++	jnz	L(return_vec_2_3_end)
++
++	/* Best for code size to include ucond-jmp here. Would be faster
++	   if this case is hot to duplicate the L(return_vec_2_3_end) code
++	   as fall-through and have jump back to loop on mismatch
++	   comparison.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rsi
++	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
++# ifdef USE_AS_STRNCMP
++	subq	$(VEC_SIZE * 4), %rdx
++	ja	L(loop_skip_page_cross_check)
++L(ret_zero_in_loop_page_cross):
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	VZEROUPPER_RETURN
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
+-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	jmp	L(loop_skip_page_cross_check)
+ # endif
+-	VZEROUPPER_RETURN
+ 
++
++	.p2align 4,, 10
++L(return_vec_page_cross_0):
++	addl	$-VEC_SIZE, %eax
++L(return_vec_page_cross_1):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_STRNCMP
+-L(string_nbyte_offset_check):
+-	leaq	(VEC_SIZE * 4)(%r10), %r10
+-	cmpq	%r10, %r11
+-	jbe	L(zero)
+-	jmp	L(back_to_loop)
++	leal	-VEC_SIZE(%rax, %rcx), %ecx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero_in_loop_page_cross)
++# else
++	addl	%eax, %ecx
+ # endif
+ 
+-	.p2align 4
+-L(cross_page_loop):
+-	/* Check one byte/dword at a time.  */
+ # ifdef USE_AS_WCSCMP
+-	cmpl	%ecx, %eax
++	movl	VEC_OFFSET(%rdi, %rcx), %edx
++	xorl	%eax, %eax
++	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
++	je	L(ret9)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
++	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
++	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+ 	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
+-	jne	L(different)
+-	addl	$SIZE_OF_CHAR, %edx
+-	cmpl	$(VEC_SIZE * 4), %edx
+-	je	L(main_loop_header)
+-# ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++L(ret9):
++	VZEROUPPER_RETURN
++
++
++	.p2align 4,, 10
++L(page_cross):
++# ifndef USE_AS_STRNCMP
++	/* If both are VEC aligned we don't need any special logic here.
++	   Only valid for strcmp where stop condition is guaranteed to be
++	   reachable by just reading memory.  */
++	testl	$((VEC_SIZE - 1) << 20), %eax
++	jz	L(no_page_cross)
+ # endif
++
++	movl	%edi, %eax
++	movl	%esi, %ecx
++	andl	$(PAGE_SIZE - 1), %eax
++	andl	$(PAGE_SIZE - 1), %ecx
++
++	xorl	%OFFSET_REG, %OFFSET_REG
++
++	/* Check which is closer to page cross, s1 or s2.  */
++	cmpl	%eax, %ecx
++	jg	L(page_cross_s2)
++
++	/* The previous page cross check has false positives. Check for
++	   true positive as page cross logic is very expensive.  */
++	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
++	jbe	L(no_page_cross)
++
++	/* Set r8 to not interfere with normal return value (rdi and rsi
++	   did not swap).  */
+ # ifdef USE_AS_WCSCMP
+-	movl	(%rdi, %rdx), %eax
+-	movl	(%rsi, %rdx), %ecx
++	/* any non-zero positive value that doesn't interfere with 0x1.
++	 */
++	movl	$2, %r8d
+ # else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %ecx
++	xorl	%r8d, %r8d
+ # endif
+-	/* Check null char.  */
+-	testl	%eax, %eax
+-	jne	L(cross_page_loop)
+-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+-	   comparisons.  */
+-	subl	%ecx, %eax
+-# ifndef USE_AS_WCSCMP
+-L(different):
++
++	/* Check if less than 1x VEC till page cross.  */
++	subl	$(VEC_SIZE * 3), %eax
++	jg	L(less_1x_vec_till_page)
++
++	/* If more than 1x VEC till page cross, loop through safely
++	   loadable memory until within 1x VEC of page cross.  */
++
++	.p2align 4,, 10
++L(page_cross_loop):
++
++	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
++	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++
++	jnz	L(check_ret_vec_page_cross)
++	addl	$VEC_SIZE, %OFFSET_REG
++# ifdef USE_AS_STRNCMP
++	cmpq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross)
+ # endif
+-	VZEROUPPER_RETURN
++	addl	$VEC_SIZE, %eax
++	jl	L(page_cross_loop)
++
++	subl	%eax, %OFFSET_REG
++	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
++	   to not cross page so is safe to load. Since we have already
++	   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
++	 */
++
++	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
++	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++
++# ifdef USE_AS_STRNCMP
++	leal	VEC_SIZE(%OFFSET_REG64), %eax
++	cmpq	%rax, %rdx
++	jbe	L(check_ret_vec_page_cross2)
++	addq	%rdi, %rdx
++# endif
++	incl	%ecx
++	jz	L(prepare_loop_no_len)
+ 
++	.p2align 4,, 4
++L(ret_vec_page_cross):
++# ifndef USE_AS_STRNCMP
++L(check_ret_vec_page_cross):
++# endif
++	tzcntl	%ecx, %ecx
++	addl	%OFFSET_REG, %ecx
++L(ret_vec_page_cross_cont):
+ # ifdef USE_AS_WCSCMP
+-	.p2align 4
+-L(different):
+-	/* Use movl to avoid modifying EFLAGS.  */
+-	movl	$0, %eax
++	movl	(%rdi, %rcx), %edx
++	xorl	%eax, %eax
++	cmpl	(%rsi, %rcx), %edx
++	je	L(ret12)
+ 	setl	%al
+ 	negl	%eax
+-	orl	$1, %eax
+-	VZEROUPPER_RETURN
++	xorl	%r8d, %eax
++# else
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret12):
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNCMP
+-	.p2align 4
+-L(zero):
++	.p2align 4,, 10
++L(check_ret_vec_page_cross2):
++	incl	%ecx
++L(check_ret_vec_page_cross):
++	tzcntl	%ecx, %ecx
++	addl	%OFFSET_REG, %ecx
++	cmpq	%rcx, %rdx
++	ja	L(ret_vec_page_cross_cont)
++	.p2align 4,, 2
++L(ret_zero_page_cross):
+ 	xorl	%eax, %eax
+ 	VZEROUPPER_RETURN
++# endif
+ 
+-	.p2align 4
+-L(char0):
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi), %ecx
+-	cmpl	(%rsi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+-#  endif
+-	VZEROUPPER_RETURN
++	.p2align 4,, 4
++L(page_cross_s2):
++	/* Ensure this is a true page cross.  */
++	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
++	jbe	L(no_page_cross)
++
++
++	movl	%ecx, %eax
++	movq	%rdi, %rcx
++	movq	%rsi, %rdi
++	movq	%rcx, %rsi
++
++	/* set r8 to negate return value as rdi and rsi swapped.  */
++# ifdef USE_AS_WCSCMP
++	movl	$-4, %r8d
++# else
++	movl	$-1, %r8d
+ # endif
++	xorl	%OFFSET_REG, %OFFSET_REG
+ 
+-	.p2align 4
+-L(last_vector):
+-	addq	%rdx, %rdi
+-	addq	%rdx, %rsi
++	/* Check if more than 1x VEC till page cross.  */
++	subl	$(VEC_SIZE * 3), %eax
++	jle	L(page_cross_loop)
++
++	.p2align 4,, 6
++L(less_1x_vec_till_page):
++	/* Find largest load size we can use.  */
++	cmpl	$16, %eax
++	ja	L(less_16_till_page)
++
++	VMOVU	(%rdi), %xmm0
++	VPCMPEQ	(%rsi), %xmm0, %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	incw	%cx
++	jnz	L(check_ret_vec_page_cross)
++	movl	$16, %OFFSET_REG
+ # ifdef USE_AS_STRNCMP
+-	subq	%rdx, %r11
++	cmpq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subl	%eax, %OFFSET_REG
++# else
++	/* Explicit check for 16 byte alignment.  */
++	subl	%eax, %OFFSET_REG
++	jz	L(prepare_loop)
+ # endif
+-	tzcntl	%ecx, %edx
++
++	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
++	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	incw	%cx
++	jnz	L(check_ret_vec_page_cross)
++
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	addl	$16, %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subq	$-(VEC_SIZE * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++# else
++	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+ # endif
+-# ifdef USE_AS_WCSCMP
++	jmp	L(prepare_loop_aligned)
++
++# ifdef USE_AS_STRNCMP
++	.p2align 4,, 2
++L(ret_zero_page_cross_slow_case0):
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-# else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	ret
+ # endif
+-	VZEROUPPER_RETURN
+ 
+-	/* Comparing on page boundary region requires special treatment:
+-	   It must done one vector at the time, starting with the wider
+-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
+-	   (xmm) still passes the boundary, byte comparison must be done.
+-	 */
+-	.p2align 4
+-L(cross_page):
+-	/* Try one ymm vector at a time.  */
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(cross_page_1_vector)
+-L(loop_1_vector):
+-	vmovdqu	(%rdi, %rdx), %ymm1
+-	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
+-	VPMINU	%ymm1, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-	vpmovmskb %ymm0, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(last_vector)
+ 
+-	addl	$VEC_SIZE, %edx
++	.p2align 4,, 10
++L(less_16_till_page):
++	/* Find largest load size we can use.  */
++	cmpl	$24, %eax
++	ja	L(less_8_till_page)
+ 
+-	addl	$VEC_SIZE, %eax
+-# ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-# endif
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jle	L(loop_1_vector)
+-L(cross_page_1_vector):
+-	/* Less than 32 bytes to check, try one xmm vector.  */
+-	cmpl	$(PAGE_SIZE - 16), %eax
+-	jg	L(cross_page_1_xmm)
+-	vmovdqu	(%rdi, %rdx), %xmm1
+-	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
+-	VPMINU	%xmm1, %xmm0, %xmm0
+-	VPCMPEQ	%xmm7, %xmm0, %xmm0
+-	vpmovmskb %xmm0, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(last_vector)
++	vmovq	(%rdi), %xmm0
++	vmovq	(%rsi), %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	incb	%cl
++	jnz	L(check_ret_vec_page_cross)
+ 
+-	addl	$16, %edx
+-# ifndef USE_AS_WCSCMP
+-	addl	$16, %eax
++
++# ifdef USE_AS_STRNCMP
++	cmpq	$8, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
+ # endif
++	movl	$24, %OFFSET_REG
++	/* Explicit check for 16 byte alignment.  */
++	subl	%eax, %OFFSET_REG
++
++
++
++	vmovq	(%rdi, %OFFSET_REG64), %xmm0
++	vmovq	(%rsi, %OFFSET_REG64), %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	incb	%cl
++	jnz	L(check_ret_vec_page_cross)
++
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-# endif
+-
+-L(cross_page_1_xmm):
+-# ifndef USE_AS_WCSCMP
+-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
+-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
+-	cmpl	$(PAGE_SIZE - 8), %eax
+-	jg	L(cross_page_8bytes)
+-	vmovq	(%rdi, %rdx), %xmm1
+-	vmovq	(%rsi, %rdx), %xmm0
+-	VPCMPEQ	%xmm0, %xmm1, %xmm0
+-	VPMINU	%xmm1, %xmm0, %xmm0
+-	VPCMPEQ	%xmm7, %xmm0, %xmm0
+-	vpmovmskb %xmm0, %ecx
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(last_vector)
++	addl	$8, %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subq	$-(VEC_SIZE * 4), %rdx
+ 
+-	addl	$8, %edx
+-	addl	$8, %eax
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++# else
++	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++# endif
++	jmp	L(prepare_loop_aligned)
++
++
++	.p2align 4,, 10
++L(less_8_till_page):
++# ifdef USE_AS_WCSCMP
++	/* If using wchar then this is the only check before we reach
++	   the page boundary.  */
++	movl	(%rdi), %eax
++	movl	(%rsi), %ecx
++	cmpl	%ecx, %eax
++	jnz	L(ret_less_8_wcs)
+ #  ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	addq	%rdi, %rdx
++	/* We already checked for len <= 1 so cannot hit that case here.
++	 */
+ #  endif
++	testl	%eax, %eax
++	jnz	L(prepare_loop_no_len)
++	ret
+ 
+-L(cross_page_8bytes):
+-	/* Less than 8 bytes to check, try 4 byte vector.  */
+-	cmpl	$(PAGE_SIZE - 4), %eax
+-	jg	L(cross_page_4bytes)
+-	vmovd	(%rdi, %rdx), %xmm1
+-	vmovd	(%rsi, %rdx), %xmm0
+-	VPCMPEQ	%xmm0, %xmm1, %xmm0
+-	VPMINU	%xmm1, %xmm0, %xmm0
+-	VPCMPEQ	%xmm7, %xmm0, %xmm0
+-	vpmovmskb %xmm0, %ecx
+-	/* Only last 4 bits are valid.  */
+-	andl	$0xf, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(last_vector)
++	.p2align 4,, 8
++L(ret_less_8_wcs):
++	setl	%OFFSET_REG8
++	negl	%OFFSET_REG
++	movl	%OFFSET_REG, %eax
++	xorl	%r8d, %eax
++	ret
++
++# else
++
++	/* Find largest load size we can use.  */
++	cmpl	$28, %eax
++	ja	L(less_4_till_page)
++
++	vmovd	(%rdi), %xmm0
++	vmovd	(%rsi), %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	subl	$0xf, %ecx
++	jnz	L(check_ret_vec_page_cross)
+ 
+-	addl	$4, %edx
+ #  ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$4, %rdx
++	jbe	L(ret_zero_page_cross_slow_case1)
+ #  endif
++	movl	$28, %OFFSET_REG
++	/* Explicit check for 16 byte alignment.  */
++	subl	%eax, %OFFSET_REG
+ 
+-L(cross_page_4bytes):
+-# endif
+-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
+-# ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-# endif
+-# ifdef USE_AS_WCSCMP
+-	movl	(%rdi, %rdx), %eax
+-	movl	(%rsi, %rdx), %ecx
+-# else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %ecx
+-# endif
+-	testl	%eax, %eax
+-	jne	L(cross_page_loop)
++
++
++	vmovd	(%rdi, %OFFSET_REG64), %xmm0
++	vmovd	(%rsi, %OFFSET_REG64), %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	subl	$0xf, %ecx
++	jnz	L(check_ret_vec_page_cross)
++
++#  ifdef USE_AS_STRNCMP
++	addl	$4, %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case1)
++	subq	$-(VEC_SIZE * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++#  else
++	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++#  endif
++	jmp	L(prepare_loop_aligned)
++
++#  ifdef USE_AS_STRNCMP
++	.p2align 4,, 2
++L(ret_zero_page_cross_slow_case1):
++	xorl	%eax, %eax
++	ret
++#  endif
++
++	.p2align 4,, 10
++L(less_4_till_page):
++	subq	%rdi, %rsi
++	/* Extremely slow byte comparison loop.  */
++L(less_4_loop):
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi, %rdi), %ecx
+ 	subl	%ecx, %eax
+-	VZEROUPPER_RETURN
+-END (STRCMP)
++	jnz	L(ret_less_4_loop)
++	testl	%ecx, %ecx
++	jz	L(ret_zero_4_loop)
++#  ifdef USE_AS_STRNCMP
++	decq	%rdx
++	jz	L(ret_zero_4_loop)
++#  endif
++	incq	%rdi
++	/* end condition is reach page boundary (rdi is aligned).  */
++	testl	$31, %edi
++	jnz	L(less_4_loop)
++	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
++	addq	$-(VEC_SIZE * 4), %rdi
++#  ifdef USE_AS_STRNCMP
++	subq	$-(VEC_SIZE * 4), %rdx
++#  endif
++	jmp	L(prepare_loop_aligned)
++
++L(ret_zero_4_loop):
++	xorl	%eax, %eax
++	ret
++L(ret_less_4_loop):
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
++	ret
++# endif
++END(STRCMP)
+ #endif
diff --git a/glibc-upstream-2.34-202.patch b/glibc-upstream-2.34-202.patch
new file mode 100644
index 0000000..9357b6f
--- /dev/null
+++ b/glibc-upstream-2.34-202.patch
@@ -0,0 +1,1987 @@
+commit c41a66767d23b7f219fb943be6fab5ddf822d7da
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Jan 10 15:35:39 2022 -0600
+
+    x86: Optimize strcmp-evex.S
+    
+    Optimizations are primarily to the loop logic and how the page cross
+    logic interacts with the loop.
+    
+    The page cross logic is at times more expensive for short strings near
+    the end of a page but not crossing the page. This is done to retest
+    the page cross conditions with a non-faulty check and to improve the
+    logic for entering the loop afterwards. This is only in particular cases,
+    however, and is generally made up for by more than 10x improvements on
+    the transition from the page cross -> loop case.
+    
+    The non-page cross cases as well are nearly universally improved.
+    
+    test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+    
+    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    (cherry picked from commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 6f5c4bf984da2b80..99d8409af27327ad 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -26,54 +26,69 @@
+ 
+ # define PAGE_SIZE	4096
+ 
+-/* VEC_SIZE = Number of bytes in a ymm register */
++	/* VEC_SIZE = Number of bytes in a ymm register.  */
+ # define VEC_SIZE	32
++# define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
+ 
+-/* Shift for dividing by (VEC_SIZE * 4).  */
+-# define DIVIDE_BY_VEC_4_SHIFT	7
+-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+-# endif
+-
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
++# define VMOVU	vmovdqu64
++# define VMOVA	vmovdqa64
+ 
+ # ifdef USE_AS_WCSCMP
+-/* Compare packed dwords.  */
+-#  define VPCMP		vpcmpd
++#  define TESTEQ	subl	$0xff,
++	/* Compare packed dwords.  */
++#  define VPCMP	vpcmpd
+ #  define VPMINU	vpminud
+ #  define VPTESTM	vptestmd
+-#  define SHIFT_REG32	r8d
+-#  define SHIFT_REG64	r8
+-/* 1 dword char == 4 bytes.  */
++	/* 1 dword char == 4 bytes.  */
+ #  define SIZE_OF_CHAR	4
+ # else
+-/* Compare packed bytes.  */
+-#  define VPCMP		vpcmpb
++#  define TESTEQ	incl
++	/* Compare packed bytes.  */
++#  define VPCMP	vpcmpb
+ #  define VPMINU	vpminub
+ #  define VPTESTM	vptestmb
+-#  define SHIFT_REG32	ecx
+-#  define SHIFT_REG64	rcx
+-/* 1 byte char == 1 byte.  */
++	/* 1 byte char == 1 byte.  */
+ #  define SIZE_OF_CHAR	1
+ # endif
+ 
++# ifdef USE_AS_STRNCMP
++#  define LOOP_REG	r9d
++#  define LOOP_REG64	r9
++
++#  define OFFSET_REG8	r9b
++#  define OFFSET_REG	r9d
++#  define OFFSET_REG64	r9
++# else
++#  define LOOP_REG	edx
++#  define LOOP_REG64	rdx
++
++#  define OFFSET_REG8	dl
++#  define OFFSET_REG	edx
++#  define OFFSET_REG64	rdx
++# endif
++
++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
++#  define VEC_OFFSET	0
++# else
++#  define VEC_OFFSET	(-VEC_SIZE)
++# endif
++
+ # define XMMZERO	xmm16
+-# define XMM0		xmm17
+-# define XMM1		xmm18
++# define XMM0	xmm17
++# define XMM1	xmm18
+ 
+ # define YMMZERO	ymm16
+-# define YMM0		ymm17
+-# define YMM1		ymm18
+-# define YMM2		ymm19
+-# define YMM3		ymm20
+-# define YMM4		ymm21
+-# define YMM5		ymm22
+-# define YMM6		ymm23
+-# define YMM7		ymm24
+-# define YMM8		ymm25
+-# define YMM9		ymm26
+-# define YMM10		ymm27
++# define YMM0	ymm17
++# define YMM1	ymm18
++# define YMM2	ymm19
++# define YMM3	ymm20
++# define YMM4	ymm21
++# define YMM5	ymm22
++# define YMM6	ymm23
++# define YMM7	ymm24
++# define YMM8	ymm25
++# define YMM9	ymm26
++# define YMM10	ymm27
+ 
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+@@ -96,985 +111,1096 @@
+    the maximum offset is reached before a difference is found, zero is
+    returned.  */
+ 
+-	.section .text.evex,"ax",@progbits
+-ENTRY (STRCMP)
++	.section .text.evex, "ax", @progbits
++ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+-	/* Check for simple cases (0 or 1) in offset.  */
+-	cmp	$1, %RDX_LP
+-	je	L(char0)
+-	jb	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-#  ifndef __ILP32__
+-	movq	%rdx, %rcx
+-	/* Check if length could overflow when multiplied by
+-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
+-	   overflow cases as well as redirect cases where its impossible to
+-	   length to bound a valid memory region. In these cases just use
+-	   'wcscmp'.  */
+-	shrq	$56, %rcx
+-	jnz	__wcscmp_evex
+-#  endif
+-	/* Convert units: from wide to byte char.  */
+-	shl	$2, %RDX_LP
++#  ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %rdx
+ #  endif
+-	/* Register %r11 tracks the maximum offset.  */
+-	mov	%RDX_LP, %R11_LP
++	cmp	$1, %RDX_LP
++	/* Signed comparison intentional. We use this branch to also
++	   test cases where length >= 2^63. These very large sizes can be
++	   handled with strcmp as there is no way for that length to
++	   actually bound the buffer.  */
++	jle	L(one_or_less)
+ # endif
+ 	movl	%edi, %eax
+-	xorl	%edx, %edx
+-	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
+-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+ 	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+-	jg	L(cross_page)
+-	/* Start comparing 4 vectors.  */
++	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
++	sall	$20, %eax
++	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
++	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
++	ja	L(page_cross)
++
++L(no_page_cross):
++	/* Safe to compare 4x vectors.  */
+ 	VMOVU	(%rdi), %YMM0
+-
+-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-
+ 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+ 	   in YMM0 and 32 bytes at (%rsi).  */
+ 	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+-
+ 	kmovd	%k1, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	je	L(next_3_vectors)
+-	tzcntl	%ecx, %edx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
+-# endif
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx) is after the maximum
+-	   offset (%r11).   */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$CHAR_PER_VEC, %rdx
++	jbe	L(vec_0_test_len)
+ # endif
++
++	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
++	   wcscmp/wcsncmp.  */
++
++	/* All 1s represents all equals. TESTEQ will overflow to zero in
++	   all equals case. Otherwise 1s will carry until position of first
++	   mismatch.  */
++	TESTEQ	%ecx
++	jz	L(more_3x_vec)
++
++	.p2align 4,, 4
++L(return_vec_0):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	je	L(return)
+-L(wcscmp_return):
++	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret0)
+ 	setl	%al
+ 	negl	%eax
+ 	orl	$1, %eax
+-L(return):
+ # else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
++L(ret0):
+ 	ret
+ 
+-L(return_vec_size):
+-	tzcntl	%ecx, %edx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
+-# endif
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
+-	   the maximum offset (%r11).  */
+-	addq	$VEC_SIZE, %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++	.p2align 4,, 4
++L(vec_0_test_len):
++	notl	%ecx
++	bzhil	%edx, %ecx, %eax
++	jnz	L(return_vec_0)
++	/* Align if will cross fetch block.  */
++	.p2align 4,, 2
++L(ret_zero):
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
++	ret
++
++	.p2align 4,, 5
++L(one_or_less):
++	jb	L(ret_zero)
+ #  ifdef USE_AS_WCSCMP
++	/* 'nbe' covers the case where length is negative (large
++	   unsigned).  */
++	jnbe	__wcscmp_evex
++	movl	(%rdi), %edx
+ 	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rdx), %ecx
+-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	(%rsi), %edx
++	je	L(ret1)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ #  else
+-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	/* 'nbe' covers the case where length is negative (large
++	   unsigned).  */
++	jnbe	__strcmp_evex
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	subl	%ecx, %eax
+ #  endif
+-# endif
++L(ret1):
+ 	ret
++# endif
+ 
+-L(return_2_vec_size):
+-	tzcntl	%ecx, %edx
++	.p2align 4,, 10
++L(return_vec_1):
++	tzcntl	%ecx, %ecx
++# ifdef USE_AS_STRNCMP
++	/* rdx must be > CHAR_PER_VEC so its safe to subtract without
++	   worrying about underflow.  */
++	addq	$-CHAR_PER_VEC, %rdx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero)
++# endif
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
++	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
++	xorl	%eax, %eax
++	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret2)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
++# else
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
++L(ret2):
++	ret
++
++	.p2align 4,, 10
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
+-	   after the maximum offset (%r11).  */
+-	addq	$(VEC_SIZE * 2), %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++L(return_vec_3):
++#  if CHAR_PER_VEC <= 16
++	sall	$CHAR_PER_VEC, %ecx
+ #  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	salq	$CHAR_PER_VEC, %rcx
+ #  endif
++# endif
++L(return_vec_2):
++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
++	tzcntl	%ecx, %ecx
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	tzcntq	%rcx, %rcx
+ # endif
+-	ret
+ 
+-L(return_3_vec_size):
+-	tzcntl	%ecx, %edx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
+-# endif
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
+-	   after the maximum offset (%r11).  */
+-	addq	$(VEC_SIZE * 3), %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret3)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ # else
++	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
++L(ret3):
++	ret
++
++# ifndef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_vec_3):
++	tzcntl	%ecx, %ecx
+ #  ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ 	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret4)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ #  else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ #  endif
+-# endif
++L(ret4):
+ 	ret
++# endif
+ 
+-	.p2align 4
+-L(next_3_vectors):
+-	VMOVU	VEC_SIZE(%rdi), %YMM0
+-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	/* 32 byte align here ensures the main loop is ideally aligned
++	   for DSB.  */
++	.p2align 5
++L(more_3x_vec):
++	/* Safe to compare 4x vectors.  */
++	VMOVU	(VEC_SIZE)(%rdi), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
+-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
++	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_1)
++
++# ifdef USE_AS_STRNCMP
++	subq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(ret_zero)
+ # endif
+-	jne	L(return_vec_size)
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
+-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	jne	L(return_2_vec_size)
++	TESTEQ	%ecx
++	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
+-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_3)
++
++# ifdef USE_AS_STRNCMP
++	cmpq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(ret_zero)
++# endif
++
++
+ # ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
++	/* any non-zero positive value that doesn't interfere with 0x1.
++	 */
++	movl	$2, %r8d
++
+ # else
+-	incl	%ecx
++	xorl	%r8d, %r8d
+ # endif
+-	jne	L(return_3_vec_size)
+-L(main_loop_header):
+-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+-	movl	$PAGE_SIZE, %ecx
+-	/* Align load via RAX.  */
+-	andq	$-(VEC_SIZE * 4), %rdx
+-	subq	%rdi, %rdx
+-	leaq	(%rdi, %rdx), %rax
++
++	/* The prepare labels are various entry points from the page
++	   cross logic.  */
++L(prepare_loop):
++
+ # ifdef USE_AS_STRNCMP
+-	/* Starting from this point, the maximum offset, or simply the
+-	   'offset', DECREASES by the same amount when base pointers are
+-	   moved forward.  Return 0 when:
+-	     1) On match: offset <= the matched vector index.
+-	     2) On mistmach, offset is before the mistmatched index.
+-	 */
+-	subq	%rdx, %r11
+-	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++L(prepare_loop_no_len):
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++	shrl	$2, %ecx
++	leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
++#  else
++	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
++	   the loop.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
++L(prepare_loop_no_len):
++#  endif
++# else
++L(prepare_loop_no_len):
+ # endif
+-	addq	%rsi, %rdx
+-	movq	%rdx, %rsi
+-	andl	$(PAGE_SIZE - 1), %esi
+-	/* Number of bytes before page crossing.  */
+-	subq	%rsi, %rcx
+-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
+-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
+-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
+-	movl	%ecx, %esi
+-	jmp	L(loop_start)
+ 
++	/* Align s1 and adjust s2 accordingly.  */
++	subq	%rdi, %rsi
++	andq	$-(VEC_SIZE * 4), %rdi
++L(prepare_loop_readj):
++	addq	%rdi, %rsi
++# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
++	subq	%rdi, %rdx
++# endif
++
++L(prepare_loop_aligned):
++	/* eax stores distance from rsi to next page cross. These cases
++	   need to be handled specially as the 4x loop could potentially
++	   read memory past the length of s1 or s2 and across a page
++	   boundary.  */
++	movl	$-(VEC_SIZE * 4), %eax
++	subl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++
++	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
++
++	/* Loop 4x comparisons at a time.  */
+ 	.p2align 4
+ L(loop):
++
++	/* End condition for strncmp.  */
+ # ifdef USE_AS_STRNCMP
+-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
+-	   the maximum offset (%r11) by the same amount.  */
+-	subq	$(VEC_SIZE * 4), %r11
+-	jbe	L(zero)
++	subq	$(CHAR_PER_VEC * 4), %rdx
++	jbe	L(ret_zero)
+ # endif
+-	addq	$(VEC_SIZE * 4), %rax
+-	addq	$(VEC_SIZE * 4), %rdx
+-L(loop_start):
+-	testl	%esi, %esi
+-	leal	-1(%esi), %esi
+-	je	L(loop_cross_page)
+-L(back_to_loop):
+-	/* Main loop, comparing 4 vectors are a time.  */
+-	VMOVA	(%rax), %YMM0
+-	VMOVA	VEC_SIZE(%rax), %YMM2
+-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
+-	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
++
++	subq	$-(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rsi
++
++	/* Check if rsi loads will cross a page boundary.  */
++	addl	$-(VEC_SIZE * 4), %eax
++	jnb	L(page_cross_during_loop)
++
++	/* Loop entry after handling page cross during loop.  */
++L(loop_skip_page_cross_check):
++	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
++	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+ 
+ 	VPMINU	%YMM0, %YMM2, %YMM8
+ 	VPMINU	%YMM4, %YMM6, %YMM9
+ 
+-	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
+-	VPMINU	%YMM8, %YMM9, %YMM8
++	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
++	VPMINU	%YMM8, %YMM9, %YMM9
+ 
+ 	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
+-	VPTESTM	%YMM8, %YMM8, %k1
++	VPTESTM	%YMM9, %YMM9, %k1
+ 
+-	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
+-	vpxorq	(%rdx), %YMM0, %YMM1
+-	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
+-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
+-	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
++	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
++	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
++	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
++	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
++	   oring with YMM1. Result is stored in YMM6.  */
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+ 
+-	vporq	%YMM1, %YMM3, %YMM9
+-	vporq	%YMM5, %YMM7, %YMM10
++	/* Or together YMM3, YMM5, and YMM6.  */
++	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
+ 
+-	/* A non-zero CHAR in YMM9 represents a mismatch.  */
+-	vporq	%YMM9, %YMM10, %YMM9
+ 
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
+-	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
+-	kmovd   %k0, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	je	 L(loop)
++	/* A non-zero CHAR in YMM6 represents a mismatch.  */
++	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
++	kmovd	%k0, %LOOP_REG
+ 
+-	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
++	TESTEQ	%LOOP_REG
++	jz	L(loop)
++
++
++	/* Find which VEC has the mismatch or end of string.  */
+ 	VPTESTM	%YMM0, %YMM0, %k1
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+-	   in YMM0 and (%rdx).  */
+ 	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+ 	kmovd	%k0, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	je	L(test_vec)
+-	tzcntl	%ecx, %ecx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %ecx
+-# endif
+-# ifdef USE_AS_STRNCMP
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# endif
+-	ret
++	TESTEQ	%ecx
++	jnz	L(return_vec_0_end)
+ 
+-	.p2align 4
+-L(test_vec):
+-# ifdef USE_AS_STRNCMP
+-	/* The first vector matched.  Return 0 if the maximum offset
+-	   (%r11) <= VEC_SIZE.  */
+-	cmpq	$VEC_SIZE, %r11
+-	jbe	L(zero)
+-# endif
+-	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
+ 	VPTESTM	%YMM2, %YMM2, %k1
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+-	   in YMM2 and VEC_SIZE(%rdx).  */
+ 	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
+ 	kmovd	%k0, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	je	L(test_2_vec)
+-	tzcntl	%ecx, %edi
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edi
+-# endif
+-# ifdef USE_AS_STRNCMP
+-	addq	$VEC_SIZE, %rdi
+-	cmpq	%rdi, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rdi), %ecx
+-	cmpl	(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rdi), %eax
+-	movzbl	(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rsi, %rdi), %ecx
+-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	VEC_SIZE(%rax, %rdi), %eax
+-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
+-# endif
+-	ret
++	TESTEQ	%ecx
++	jnz	L(return_vec_1_end)
+ 
+-	.p2align 4
+-L(test_2_vec):
++
++	/* Handle VEC 2 and 3 without branches.  */
++L(return_vec_2_3_end):
+ # ifdef USE_AS_STRNCMP
+-	/* The first 2 vectors matched.  Return 0 if the maximum offset
+-	   (%r11) <= 2 * VEC_SIZE.  */
+-	cmpq	$(VEC_SIZE * 2), %r11
+-	jbe	L(zero)
++	subq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(ret_zero_end)
+ # endif
+-	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
++
+ 	VPTESTM	%YMM4, %YMM4, %k1
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+-	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
+ 	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
+ 	kmovd	%k0, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
++	TESTEQ	%ecx
++# if CHAR_PER_VEC <= 16
++	sall	$CHAR_PER_VEC, %LOOP_REG
++	orl	%ecx, %LOOP_REG
+ # else
+-	incl	%ecx
++	salq	$CHAR_PER_VEC, %LOOP_REG64
++	orq	%rcx, %LOOP_REG64
++# endif
++L(return_vec_3_end):
++	/* LOOP_REG contains matches for null/mismatch from the loop. If
++	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
++	   must entirely be from VEC 3 which is fully represented by
++	   LOOP_REG.  */
++# if CHAR_PER_VEC <= 16
++	tzcntl	%LOOP_REG, %LOOP_REG
++# else
++	tzcntq	%LOOP_REG64, %LOOP_REG64
++# endif
++# ifdef USE_AS_STRNCMP
++	cmpq	%LOOP_REG64, %rdx
++	jbe	L(ret_zero_end)
+ # endif
+-	je	L(test_3_vec)
+-	tzcntl	%ecx, %edi
++
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edi
++	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
++	xorl	%eax, %eax
++	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
++	je	L(ret5)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
++# else
++	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret5):
++	ret
++
+ # ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 2), %rdi
+-	cmpq	%rdi, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	.p2align 4,, 2
++L(ret_zero_end):
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rdi), %ecx
+-	cmpl	(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
++	ret
++# endif
++
++
++	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
++	   they use the value of `r8` to negate the return value. This is
++	   because the page cross logic can swap `rdi` and `rsi`.  */
++	.p2align 4,, 10
++# ifdef USE_AS_STRNCMP
++L(return_vec_1_end):
++#  if CHAR_PER_VEC <= 16
++	sall	$CHAR_PER_VEC, %ecx
+ #  else
+-	movzbl	(%rax, %rdi), %eax
+-	movzbl	(%rdx, %rdi), %edx
+-	subl	%edx, %eax
++	salq	$CHAR_PER_VEC, %rcx
+ #  endif
++# endif
++L(return_vec_0_end):
++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
++	tzcntl	%ecx, %ecx
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
+-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
+-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
++	tzcntq	%rcx, %rcx
+ # endif
+-	ret
+ 
+-	.p2align 4
+-L(test_3_vec):
+ # ifdef USE_AS_STRNCMP
+-	/* The first 3 vectors matched.  Return 0 if the maximum offset
+-	   (%r11) <= 3 * VEC_SIZE.  */
+-	cmpq	$(VEC_SIZE * 3), %r11
+-	jbe	L(zero)
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero_end)
+ # endif
+-	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
+-	VPTESTM	%YMM6, %YMM6, %k1
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+-	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
+-	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
+-	kmovd	%k0, %ecx
++
+ # ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
++	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
++	xorl	%eax, %eax
++	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret6)
++	setl	%al
++	negl	%eax
++	/* This is the non-zero case for `eax` so just xorl with `r8d`
++	   to flip it if `rdi` and `rsi` were swapped.  */
++	xorl	%r8d, %eax
+ # else
+-	incl	%ecx
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
++	   logic. Subtract `r8d` after xor for zero case.  */
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret6):
++	ret
++
++# ifndef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_vec_1_end):
+ 	tzcntl	%ecx, %ecx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %ecx
+-# endif
+-# ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 3), %rcx
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+ #  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %esi
+-	cmpl	(%rdx, %rcx), %esi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
+-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
+-	jne	L(wcscmp_return)
++	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret7)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ #  else
+-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
+-	subl	%edx, %eax
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ #  endif
+-# endif
++L(ret7):
+ 	ret
+-
+-	.p2align 4
+-L(loop_cross_page):
+-	xorl	%r10d, %r10d
+-	movq	%rdx, %rcx
+-	/* Align load via RDX.  We load the extra ECX bytes which should
+-	   be ignored.  */
+-	andl	$((VEC_SIZE * 4) - 1), %ecx
+-	/* R10 is -RCX.  */
+-	subq	%rcx, %r10
+-
+-	/* This works only if VEC_SIZE * 2 == 64. */
+-# if (VEC_SIZE * 2) != 64
+-#  error (VEC_SIZE * 2) != 64
+ # endif
+ 
+-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
+-	cmpl	$(VEC_SIZE * 2), %ecx
+-	jge	L(loop_cross_page_2_vec)
+ 
+-	VMOVU	(%rax, %r10), %YMM2
+-	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
++	/* Page cross in rsi in next 4x VEC.  */
+ 
+-	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
+-	VPTESTM	%YMM2, %YMM2, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM2 and 32 bytes at (%rdx, %r10).  */
+-	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
+-	kmovd	%k1, %r9d
+-	/* Don't use subl since it is the lower 16/32 bits of RDI
+-	   below.  */
+-	notl	%r9d
+-# ifdef USE_AS_WCSCMP
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %r9d
+-# endif
++	/* TODO: Improve logic here.  */
++	.p2align 4,, 10
++L(page_cross_during_loop):
++	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
+ 
+-	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
+-	VPTESTM	%YMM3, %YMM3, %k4
+-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
+-	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
+-	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+-	kmovd	%k3, %edi
+-    /* Must use notl %edi here as lower bits are for CHAR
+-	   comparisons potentially out of range thus can be 0 without
+-	   indicating mismatch.  */
+-	notl	%edi
+-# ifdef USE_AS_WCSCMP
+-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	andl	$0xff, %edi
+-# endif
++	/* Optimistically rsi and rdi are both aligned in which case we
++	   don't need any logic here.  */
++	cmpl	$-(VEC_SIZE * 4), %eax
++	/* Don't adjust eax before jumping back to loop and we will
++	   never hit page cross case again.  */
++	je	L(loop_skip_page_cross_check)
+ 
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
+-	sall	$8, %edi
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+-	   bytes.  */
+-	movl	%ecx, %SHIFT_REG32
+-	sarl	$2, %SHIFT_REG32
+-
+-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
+-	orl	%r9d, %edi
+-# else
+-	salq	$32, %rdi
++	/* Check if we can safely load a VEC.  */
++	cmpl	$-(VEC_SIZE * 3), %eax
++	jle	L(less_1x_vec_till_page_cross)
+ 
+-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
+-	orq	%r9, %rdi
+-# endif
++	VMOVA	(%rdi), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_0_end)
++
++	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
++	cmpl	$-(VEC_SIZE * 2), %eax
++	jg	L(more_2x_vec_till_page_cross)
++
++	.p2align 4,, 4
++L(less_1x_vec_till_page_cross):
++	subl	$-(VEC_SIZE * 4), %eax
++	/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
++	   concerning case is first iteration if incoming s1 was near start
++	   of a page and s2 near end. If s1 was near the start of the page
++	   we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
++	   to read back -VEC_SIZE. If rdi is truly at the start of a page
++	   here, it means the previous page (rdi - VEC_SIZE) has already
++	   been loaded earlier so must be valid.  */
++	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
++
++	/* Mask of potentially valid bits. The lower bits can be out of
++	   range comparisons (but safe regarding page crosses).  */
+ 
+-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
+-	shrxq	%SHIFT_REG64, %rdi, %rdi
+-	testq	%rdi, %rdi
+-	je	L(loop_cross_page_2_vec)
+-	tzcntq	%rdi, %rcx
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %ecx
++	movl	$-1, %r10d
++	movl	%esi, %ecx
++	andl	$(VEC_SIZE - 1), %ecx
++	shrl	$2, %ecx
++	shlxl	%ecx, %r10d, %ecx
++	movzbl	%cl, %r10d
++# else
++	movl	$-1, %ecx
++	shlxl	%esi, %ecx, %r10d
+ # endif
++
++	kmovd	%k1, %ecx
++	notl	%ecx
++
++
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+ #  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
++	movl	%eax, %r11d
++	shrl	$2, %r11d
++	cmpq	%r11, %rdx
+ #  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
++	cmpq	%rax, %rdx
+ #  endif
++	jbe	L(return_page_cross_end_check)
++# endif
++	movl	%eax, %OFFSET_REG
++
++	/* Readjust eax before potentially returning to the loop.  */
++	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
++
++	andl	%r10d, %ecx
++	jz	L(loop_skip_page_cross_check)
++
++	.p2align 4,, 3
++L(return_page_cross_end):
++	tzcntl	%ecx, %ecx
++
++# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
++	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
++L(return_page_cross_cmp_mem):
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	addl	%OFFSET_REG, %ecx
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	VEC_OFFSET(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
++	je	L(ret8)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
++# else
++	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
++	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret8):
+ 	ret
+ 
+-	.p2align 4
+-L(loop_cross_page_2_vec):
+-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
+-	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
+-	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
++# ifdef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_page_cross_end_check):
++	tzcntl	%ecx, %ecx
++	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
++#  ifdef USE_AS_WCSCMP
++	sall	$2, %edx
++#  endif
++	cmpl	%ecx, %edx
++	ja	L(return_page_cross_cmp_mem)
++	xorl	%eax, %eax
++	ret
++# endif
++
+ 
++	.p2align 4,, 10
++L(more_2x_vec_till_page_cross):
++	/* If more than 2x VEC till page cross we will complete a full
++	   loop iteration here.  */
++
++	VMOVA	VEC_SIZE(%rdi), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
+-	kmovd	%k1, %r9d
+-	/* Don't use subl since it is the lower 16/32 bits of RDI
+-	   below.  */
+-	notl	%r9d
+-# ifdef USE_AS_WCSCMP
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %r9d
+-# endif
++	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_1_end)
+ 
+-	VPTESTM	%YMM1, %YMM1, %k4
+-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
+-	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+-	kmovd	%k3, %edi
+-	/* Must use notl %edi here as lower bits are for CHAR
+-	   comparisons potentially out of range thus can be 0 without
+-	   indicating mismatch.  */
+-	notl	%edi
+-# ifdef USE_AS_WCSCMP
+-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	andl	$0xff, %edi
++# ifdef USE_AS_STRNCMP
++	cmpq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(ret_zero_in_loop_page_cross)
+ # endif
+ 
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
+-	sall	$8, %edi
++	subl	$-(VEC_SIZE * 4), %eax
+ 
+-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
+-	orl	%r9d, %edi
+-# else
+-	salq	$32, %rdi
++	/* Safe to include comparisons from lower bytes.  */
++	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_page_cross_0)
++
++	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_page_cross_1)
+ 
+-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
+-	orq	%r9, %rdi
++# ifdef USE_AS_STRNCMP
++	/* Must check length here as length might preclude reading next
++	   page.  */
++#  ifdef USE_AS_WCSCMP
++	movl	%eax, %r11d
++	shrl	$2, %r11d
++	cmpq	%r11, %rdx
++#  else
++	cmpq	%rax, %rdx
++#  endif
++	jbe	L(ret_zero_in_loop_page_cross)
+ # endif
+ 
+-	xorl	%r8d, %r8d
+-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
+-	subl	$(VEC_SIZE * 2), %ecx
+-	jle	1f
+-	/* R8 has number of bytes skipped.  */
+-	movl	%ecx, %r8d
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Divide shift count by 4 since each bit in RDI represent 4
+-	   bytes.  */
+-	sarl	$2, %ecx
+-	/* Skip ECX bytes.  */
+-	shrl	%cl, %edi
++	/* Finish the loop.  */
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
++	VPMINU	%YMM4, %YMM6, %YMM9
++	VPTESTM	%YMM9, %YMM9, %k1
++
++	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
++	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
++
++	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
++	kmovd	%k0, %LOOP_REG
++	TESTEQ	%LOOP_REG
++	jnz	L(return_vec_2_3_end)
++
++	/* Best for code size to include ucond-jmp here. Would be faster
++	   if this case is hot to duplicate the L(return_vec_2_3_end) code
++	   as fall-through and have jump back to loop on mismatch
++	   comparison.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rsi
++	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
++# ifdef USE_AS_STRNCMP
++	subq	$(CHAR_PER_VEC * 4), %rdx
++	ja	L(loop_skip_page_cross_check)
++L(ret_zero_in_loop_page_cross):
++	xorl	%eax, %eax
++	ret
+ # else
+-	/* Skip ECX bytes.  */
+-	shrq	%cl, %rdi
++	jmp	L(loop_skip_page_cross_check)
+ # endif
+-1:
+-	/* Before jumping back to the loop, set ESI to the number of
+-	   VEC_SIZE * 4 blocks before page crossing.  */
+-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
+ 
+-	testq	%rdi, %rdi
+-# ifdef USE_AS_STRNCMP
+-	/* At this point, if %rdi value is 0, it already tested
+-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
+-	   checks whether strncmp maximum offset reached or not.  */
+-	je	L(string_nbyte_offset_check)
++
++	.p2align 4,, 10
++L(return_vec_page_cross_0):
++	addl	$-VEC_SIZE, %eax
++L(return_vec_page_cross_1):
++	tzcntl	%ecx, %ecx
++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
++	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
++#  ifdef USE_AS_STRNCMP
++#   ifdef USE_AS_WCSCMP
++	/* Must divide ecx instead of multiply rdx due to overflow.  */
++	movl	%ecx, %eax
++	shrl	$2, %eax
++	cmpq	%rax, %rdx
++#   else
++	cmpq	%rcx, %rdx
++#   endif
++	jbe	L(ret_zero_in_loop_page_cross)
++#  endif
+ # else
+-	je	L(back_to_loop)
++	addl	%eax, %ecx
+ # endif
+-	tzcntq	%rdi, %rcx
++
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %ecx
+-# endif
+-	addq	%r10, %rcx
+-	/* Adjust for number of bytes skipped.  */
+-	addq	%r8, %rcx
+-# ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 2), %rcx
+-	subq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	movl	VEC_OFFSET(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
++	je	L(ret9)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
+-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
++	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret9):
+ 	ret
+ 
+-# ifdef USE_AS_STRNCMP
+-L(string_nbyte_offset_check):
+-	leaq	(VEC_SIZE * 4)(%r10), %r10
+-	cmpq	%r10, %r11
+-	jbe	L(zero)
+-	jmp	L(back_to_loop)
++
++	.p2align 4,, 10
++L(page_cross):
++# ifndef USE_AS_STRNCMP
++	/* If both are VEC aligned we don't need any special logic here.
++	   Only valid for strcmp where stop condition is guaranteed to be
++	   reachable by just reading memory.  */
++	testl	$((VEC_SIZE - 1) << 20), %eax
++	jz	L(no_page_cross)
+ # endif
+ 
+-	.p2align 4
+-L(cross_page_loop):
+-	/* Check one byte/dword at a time.  */
++	movl	%edi, %eax
++	movl	%esi, %ecx
++	andl	$(PAGE_SIZE - 1), %eax
++	andl	$(PAGE_SIZE - 1), %ecx
++
++	xorl	%OFFSET_REG, %OFFSET_REG
++
++	/* Check which is closer to page cross, s1 or s2.  */
++	cmpl	%eax, %ecx
++	jg	L(page_cross_s2)
++
++	/* The previous page cross check has false positives. Check for
++	   true positive as page cross logic is very expensive.  */
++	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
++	jbe	L(no_page_cross)
++
++
++	/* Set r8 to not interfere with normal return value (rdi and rsi
++	   did not swap).  */
+ # ifdef USE_AS_WCSCMP
+-	cmpl	%ecx, %eax
++	/* Any non-zero positive value that doesn't interfere with 0x1.
++	 */
++	movl	$2, %r8d
+ # else
+-	subl	%ecx, %eax
++	xorl	%r8d, %r8d
+ # endif
+-	jne	L(different)
+-	addl	$SIZE_OF_CHAR, %edx
+-	cmpl	$(VEC_SIZE * 4), %edx
+-	je	L(main_loop_header)
++
++	/* Check if less than 1x VEC till page cross.  */
++	subl	$(VEC_SIZE * 3), %eax
++	jg	L(less_1x_vec_till_page)
++
++
++	/* If more than 1x VEC till page cross, loop through safely
++	   loadable memory until within 1x VEC of page cross.  */
++	.p2align 4,, 8
++L(page_cross_loop):
++	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(check_ret_vec_page_cross)
++	addl	$CHAR_PER_VEC, %OFFSET_REG
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross)
+ # endif
++	addl	$VEC_SIZE, %eax
++	jl	L(page_cross_loop)
++
+ # ifdef USE_AS_WCSCMP
+-	movl	(%rdi, %rdx), %eax
+-	movl	(%rsi, %rdx), %ecx
+-# else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %ecx
++	shrl	$2, %eax
+ # endif
+-	/* Check null CHAR.  */
+-	testl	%eax, %eax
+-	jne	L(cross_page_loop)
+-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+-	   comparisons.  */
+-	subl	%ecx, %eax
+-# ifndef USE_AS_WCSCMP
+-L(different):
++
++
++	subl	%eax, %OFFSET_REG
++	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
++	   to not cross page so is safe to load. Since we have already
++	   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
++	 */
++	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
++
++	kmovd	%k1, %ecx
++# ifdef USE_AS_STRNCMP
++	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
++	cmpq	%rax, %rdx
++	jbe	L(check_ret_vec_page_cross2)
++#  ifdef USE_AS_WCSCMP
++	addq	$-(CHAR_PER_VEC * 2), %rdx
++#  else
++	addq	%rdi, %rdx
++#  endif
+ # endif
+-	ret
++	TESTEQ	%ecx
++	jz	L(prepare_loop_no_len)
+ 
++	.p2align 4,, 4
++L(ret_vec_page_cross):
++# ifndef USE_AS_STRNCMP
++L(check_ret_vec_page_cross):
++# endif
++	tzcntl	%ecx, %ecx
++	addl	%OFFSET_REG, %ecx
++L(ret_vec_page_cross_cont):
+ # ifdef USE_AS_WCSCMP
+-	.p2align 4
+-L(different):
+-	/* Use movl to avoid modifying EFLAGS.  */
+-	movl	$0, %eax
++	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
++	xorl	%eax, %eax
++	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret12)
+ 	setl	%al
+ 	negl	%eax
+-	orl	$1, %eax
+-	ret
++	xorl	%r8d, %eax
++# else
++	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
++	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret12):
++	ret
++
+ 
+ # ifdef USE_AS_STRNCMP
+-	.p2align 4
+-L(zero):
++	.p2align 4,, 10
++L(check_ret_vec_page_cross2):
++	TESTEQ	%ecx
++L(check_ret_vec_page_cross):
++	tzcntl	%ecx, %ecx
++	addl	%OFFSET_REG, %ecx
++	cmpq	%rcx, %rdx
++	ja	L(ret_vec_page_cross_cont)
++	.p2align 4,, 2
++L(ret_zero_page_cross):
+ 	xorl	%eax, %eax
+ 	ret
++# endif
+ 
+-	.p2align 4
+-L(char0):
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi), %ecx
+-	cmpl	(%rsi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+-#  endif
+-	ret
++	.p2align 4,, 4
++L(page_cross_s2):
++	/* Ensure this is a true page cross.  */
++	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
++	jbe	L(no_page_cross)
++
++
++	movl	%ecx, %eax
++	movq	%rdi, %rcx
++	movq	%rsi, %rdi
++	movq	%rcx, %rsi
++
++	/* set r8 to negate return value as rdi and rsi swapped.  */
++# ifdef USE_AS_WCSCMP
++	movl	$-4, %r8d
++# else
++	movl	$-1, %r8d
+ # endif
++	xorl	%OFFSET_REG, %OFFSET_REG
+ 
+-	.p2align 4
+-L(last_vector):
+-	addq	%rdx, %rdi
+-	addq	%rdx, %rsi
+-# ifdef USE_AS_STRNCMP
+-	subq	%rdx, %r11
++	/* Check if more than 1x VEC till page cross.  */
++	subl	$(VEC_SIZE * 3), %eax
++	jle	L(page_cross_loop)
++
++	.p2align 4,, 6
++L(less_1x_vec_till_page):
++# ifdef USE_AS_WCSCMP
++	shrl	$2, %eax
+ # endif
+-	tzcntl	%ecx, %edx
++	/* Find largest load size we can use.  */
++	cmpl	$(16 / SIZE_OF_CHAR), %eax
++	ja	L(less_16_till_page)
++
++	/* Use 16 byte comparison.  */
++	vmovdqu	(%rdi), %xmm0
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
++	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
++	subl	$0xf, %ecx
++# else
++	incw	%cx
+ # endif
++	jnz	L(check_ret_vec_page_cross)
++	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subl	%eax, %OFFSET_REG
++# else
++	/* Explicit check for 16 byte alignment.  */
++	subl	%eax, %OFFSET_REG
++	jz	L(prepare_loop)
+ # endif
++	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
++	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	subl	$0xf, %ecx
+ # else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	incw	%cx
+ # endif
++	jnz	L(check_ret_vec_page_cross)
++# ifdef USE_AS_STRNCMP
++	addl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subq	$-(CHAR_PER_VEC * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++# else
++	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++# endif
++	jmp	L(prepare_loop_aligned)
++
++# ifdef USE_AS_STRNCMP
++	.p2align 4,, 2
++L(ret_zero_page_cross_slow_case0):
++	xorl	%eax, %eax
+ 	ret
++# endif
+ 
+-	/* Comparing on page boundary region requires special treatment:
+-	   It must done one vector at the time, starting with the wider
+-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
+-	   (xmm) still passes the boundary, byte comparison must be done.
+-	 */
+-	.p2align 4
+-L(cross_page):
+-	/* Try one ymm vector at a time.  */
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(cross_page_1_vector)
+-L(loop_1_vector):
+-	VMOVU	(%rdi, %rdx), %YMM0
+ 
+-	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
+-	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
++	.p2align 4,, 10
++L(less_16_till_page):
++	cmpl	$(24 / SIZE_OF_CHAR), %eax
++	ja	L(less_8_till_page)
++
++	/* Use 8 byte comparison.  */
++	vmovq	(%rdi), %xmm0
++	vmovq	(%rsi), %xmm1
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
++	subl	$0x3, %ecx
+ # else
+-	incl	%ecx
++	incb	%cl
+ # endif
+-	jne	L(last_vector)
++	jnz	L(check_ret_vec_page_cross)
+ 
+-	addl	$VEC_SIZE, %edx
+ 
+-	addl	$VEC_SIZE, %eax
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$(8 / SIZE_OF_CHAR), %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
+ # endif
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jle	L(loop_1_vector)
+-L(cross_page_1_vector):
+-	/* Less than 32 bytes to check, try one xmm vector.  */
+-	cmpl	$(PAGE_SIZE - 16), %eax
+-	jg	L(cross_page_1_xmm)
+-	VMOVU	(%rdi, %rdx), %XMM0
++	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
++	subl	%eax, %OFFSET_REG
+ 
+-	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
+-	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
++	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
++	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	subl	$0xf, %ecx
++	subl	$0x3, %ecx
+ # else
+-	subl	$0xffff, %ecx
++	incb	%cl
+ # endif
+-	jne	L(last_vector)
++	jnz	L(check_ret_vec_page_cross)
++
+ 
+-	addl	$16, %edx
+-# ifndef USE_AS_WCSCMP
+-	addl	$16, %eax
+-# endif
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	addl	$(8 / SIZE_OF_CHAR), %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subq	$-(CHAR_PER_VEC * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++# else
++	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+ # endif
++	jmp	L(prepare_loop_aligned)
+ 
+-L(cross_page_1_xmm):
+-# ifndef USE_AS_WCSCMP
+-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
+-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
+-	cmpl	$(PAGE_SIZE - 8), %eax
+-	jg	L(cross_page_8bytes)
+-	vmovq	(%rdi, %rdx), %XMM0
+-	vmovq	(%rsi, %rdx), %XMM1
+ 
+-	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in XMM0 and XMM1.  */
+-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+-	kmovb	%k1, %ecx
++
++
++	.p2align 4,, 10
++L(less_8_till_page):
+ # ifdef USE_AS_WCSCMP
+-	subl	$0x3, %ecx
++	/* If using wchar then this is the only check before we reach
++	   the page boundary.  */
++	movl	(%rdi), %eax
++	movl	(%rsi), %ecx
++	cmpl	%ecx, %eax
++	jnz	L(ret_less_8_wcs)
++#  ifdef USE_AS_STRNCMP
++	addq	$-(CHAR_PER_VEC * 2), %rdx
++	/* We already checked for len <= 1 so cannot hit that case here.
++	 */
++#  endif
++	testl	%eax, %eax
++	jnz	L(prepare_loop)
++	ret
++
++	.p2align 4,, 8
++L(ret_less_8_wcs):
++	setl	%OFFSET_REG8
++	negl	%OFFSET_REG
++	movl	%OFFSET_REG, %eax
++	xorl	%r8d, %eax
++	ret
++
+ # else
+-	subl	$0xff, %ecx
+-# endif
+-	jne	L(last_vector)
++	cmpl	$28, %eax
++	ja	L(less_4_till_page)
++
++	vmovd	(%rdi), %xmm0
++	vmovd	(%rsi), %xmm1
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
++	kmovd	%k1, %ecx
++	subl	$0xf, %ecx
++	jnz	L(check_ret_vec_page_cross)
+ 
+-	addl	$8, %edx
+-	addl	$8, %eax
+ #  ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$4, %rdx
++	jbe	L(ret_zero_page_cross_slow_case1)
+ #  endif
++	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
++	subl	%eax, %OFFSET_REG
+ 
+-L(cross_page_8bytes):
+-	/* Less than 8 bytes to check, try 4 byte vector.  */
+-	cmpl	$(PAGE_SIZE - 4), %eax
+-	jg	L(cross_page_4bytes)
+-	vmovd	(%rdi, %rdx), %XMM0
+-	vmovd	(%rsi, %rdx), %XMM1
+-
+-	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in XMM0 and XMM1.  */
+-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
++	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
++	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0x1, %ecx
+-# else
+ 	subl	$0xf, %ecx
+-# endif
+-	jne	L(last_vector)
++	jnz	L(check_ret_vec_page_cross)
++#  ifdef USE_AS_STRNCMP
++	addl	$(4 / SIZE_OF_CHAR), %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case1)
++	subq	$-(CHAR_PER_VEC * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++#  else
++	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++#  endif
++	jmp	L(prepare_loop_aligned)
++
+ 
+-	addl	$4, %edx
+ #  ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	.p2align 4,, 2
++L(ret_zero_page_cross_slow_case1):
++	xorl	%eax, %eax
++	ret
+ #  endif
+ 
+-L(cross_page_4bytes):
+-# endif
+-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
+-# ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-# endif
+-# ifdef USE_AS_WCSCMP
+-	movl	(%rdi, %rdx), %eax
+-	movl	(%rsi, %rdx), %ecx
+-# else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %ecx
+-# endif
+-	testl	%eax, %eax
+-	jne	L(cross_page_loop)
++	.p2align 4,, 10
++L(less_4_till_page):
++	subq	%rdi, %rsi
++	/* Extremely slow byte comparison loop.  */
++L(less_4_loop):
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi, %rdi), %ecx
+ 	subl	%ecx, %eax
++	jnz	L(ret_less_4_loop)
++	testl	%ecx, %ecx
++	jz	L(ret_zero_4_loop)
++#  ifdef USE_AS_STRNCMP
++	decq	%rdx
++	jz	L(ret_zero_4_loop)
++#  endif
++	incq	%rdi
++	/* End condition is reaching the page boundary (rdi is aligned).  */
++	testl	$31, %edi
++	jnz	L(less_4_loop)
++	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
++	addq	$-(VEC_SIZE * 4), %rdi
++#  ifdef USE_AS_STRNCMP
++	subq	$-(CHAR_PER_VEC * 4), %rdx
++#  endif
++	jmp	L(prepare_loop_aligned)
++
++L(ret_zero_4_loop):
++	xorl	%eax, %eax
++	ret
++L(ret_less_4_loop):
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ 	ret
+-END (STRCMP)
++# endif
++END(STRCMP)
+ #endif
diff --git a/glibc-upstream-2.34-203.patch b/glibc-upstream-2.34-203.patch
new file mode 100644
index 0000000..e45b588
--- /dev/null
+++ b/glibc-upstream-2.34-203.patch
@@ -0,0 +1,29 @@
+commit d299032743e05571ef326c838a5ecf6ef5b3e9c3
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Feb 4 11:09:10 2022 -0800
+
+    x86-64: Fix strcmp-avx2.S
+    
+    Change "movl %edx, %rdx" to "movl %edx, %edx" in:
+    
+    commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
+    Author: Noah Goldstein <goldstein.w.n@gmail.com>
+    Date:   Mon Jan 10 15:35:38 2022 -0600
+    
+        x86: Optimize strcmp-avx2.S
+    
+    (cherry picked from commit c15efd011cea3d8f0494269eb539583215a1feed)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index a0d1c65db11028bc..cdded412a70bad10 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -106,7 +106,7 @@ ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+-	movl	%edx, %rdx
++	movl	%edx, %edx
+ #  endif
+ 	cmp	$1, %RDX_LP
+ 	/* Signed comparison intentional. We use this branch to also
diff --git a/glibc-upstream-2.34-204.patch b/glibc-upstream-2.34-204.patch
new file mode 100644
index 0000000..4250493
--- /dev/null
+++ b/glibc-upstream-2.34-204.patch
@@ -0,0 +1,29 @@
+commit 53ddafe917a8af17b16beb794c29e5b09b86d534
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Feb 4 11:11:08 2022 -0800
+
+    x86-64: Fix strcmp-evex.S
+    
+    Change "movl %edx, %rdx" to "movl %edx, %edx" in:
+    
+    commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9
+    Author: Noah Goldstein <goldstein.w.n@gmail.com>
+    Date:   Mon Jan 10 15:35:39 2022 -0600
+    
+        x86: Optimize strcmp-evex.S
+    
+    (cherry picked from commit 0e0199a9e02ebe42e2b36958964d63f03573c382)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 99d8409af27327ad..ed56af8ecdad48b2 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -116,7 +116,7 @@ ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+-	movl	%edx, %rdx
++	movl	%edx, %edx
+ #  endif
+ 	cmp	$1, %RDX_LP
+ 	/* Signed comparison intentional. We use this branch to also
diff --git a/glibc-upstream-2.34-205.patch b/glibc-upstream-2.34-205.patch
new file mode 100644
index 0000000..6cf18b8
--- /dev/null
+++ b/glibc-upstream-2.34-205.patch
@@ -0,0 +1,451 @@
+commit ea19c490a3f5628d55ded271cbb753e66b2f05e8
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Sun Feb 6 00:54:18 2022 -0600
+
+    x86: Improve vec generation in memset-vec-unaligned-erms.S
+    
+    No bug.
+    
+    Split vec generation into multiple steps. This allows the
+    broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
+    case. This saves an expensive lane-cross instruction and removes
+    the need for 'vzeroupper'.
+    
+    For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
+    byte broadcast.
+    
+    Results for memset-avx2 small (geomean of N = 20 benchset runs).
+    
+    size, New Time, Old Time, New / Old
+       0,    4.100,    3.831,     0.934
+       1,    5.074,    4.399,     0.867
+       2,    4.433,    4.411,     0.995
+       4,    4.487,    4.415,     0.984
+       8,    4.454,    4.396,     0.987
+      16,    4.502,    4.443,     0.987
+    
+    All relevant string/wcsmbs tests are passing.
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    
+    (cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 0137eba4cdd9f830..34ee0bfdcb81fb39 100644
+--- a/sysdeps/x86_64/memset.S
++++ b/sysdeps/x86_64/memset.S
+@@ -28,17 +28,22 @@
+ #define VMOVU     movups
+ #define VMOVA     movaps
+ 
+-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-  movq r, %rax; \
+-  punpcklbw %xmm0, %xmm0; \
+-  punpcklwd %xmm0, %xmm0; \
+-  pshufd $0, %xmm0, %xmm0
++  pxor %xmm1, %xmm1; \
++  pshufb %xmm1, %xmm0; \
++  movq r, %rax
+ 
+-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-  movq r, %rax; \
+-  pshufd $0, %xmm0, %xmm0
++  pshufd $0, %xmm0, %xmm0; \
++  movq r, %rax
++
++# define MEMSET_VDUP_TO_VEC0_HIGH()
++# define MEMSET_VDUP_TO_VEC0_LOW()
++
++# define WMEMSET_VDUP_TO_VEC0_HIGH()
++# define WMEMSET_VDUP_TO_VEC0_LOW()
+ 
+ #define SECTION(p)		p
+ 
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index 1af668af0aeda59e..c0bf2875d03d51ab 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -10,15 +10,18 @@
+ # define VMOVU     vmovdqu
+ # define VMOVA     vmovdqa
+ 
+-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   vmovd d, %xmm0; \
+-  movq r, %rax; \
+-  vpbroadcastb %xmm0, %ymm0
++  movq r, %rax;
+ 
+-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+-  movq r, %rax; \
+-  vpbroadcastd %xmm0, %ymm0
++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
++
++# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
++# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
++
++# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
++# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
+ 
+ # ifndef SECTION
+ #  define SECTION(p)		p##.avx
+@@ -30,5 +33,6 @@
+ #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+ # endif
+ 
++# define USE_XMM_LESS_VEC
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index f14d6f8493c21a36..5241216a77bf72b7 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -15,13 +15,19 @@
+ 
+ # define VZEROUPPER
+ 
+-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastb d, %VEC0
++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  vpbroadcastb d, %VEC0; \
++  movq r, %rax
+ 
+-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastd d, %VEC0
++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  vpbroadcastd d, %VEC0; \
++  movq r, %rax
++
++# define MEMSET_VDUP_TO_VEC0_HIGH()
++# define MEMSET_VDUP_TO_VEC0_LOW()
++
++# define WMEMSET_VDUP_TO_VEC0_HIGH()
++# define WMEMSET_VDUP_TO_VEC0_LOW()
+ 
+ # define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 64b09e77cc20cc42..637002150659123c 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -15,13 +15,19 @@
+ 
+ # define VZEROUPPER
+ 
+-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastb d, %VEC0
++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  vpbroadcastb d, %VEC0; \
++  movq r, %rax
+ 
+-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastd d, %VEC0
++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  vpbroadcastd d, %VEC0; \
++  movq r, %rax
++
++# define MEMSET_VDUP_TO_VEC0_HIGH()
++# define MEMSET_VDUP_TO_VEC0_LOW()
++
++# define WMEMSET_VDUP_TO_VEC0_HIGH()
++# define WMEMSET_VDUP_TO_VEC0_LOW()
+ 
+ # define SECTION(p)		p##.evex
+ # define MEMSET_SYMBOL(p,s)	p##_evex_##s
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index e723413a664c088f..c8db87dcbf69f0d8 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -58,8 +58,10 @@
+ #ifndef MOVQ
+ # if VEC_SIZE > 16
+ #  define MOVQ				vmovq
++#  define MOVD				vmovd
+ # else
+ #  define MOVQ				movq
++#  define MOVD				movd
+ # endif
+ #endif
+ 
+@@ -72,9 +74,17 @@
+ #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ # define END_REG	rcx
+ # define LOOP_REG	rdi
++# define LESS_VEC_REG	rax
+ #else
+ # define END_REG	rdi
+ # define LOOP_REG	rdx
++# define LESS_VEC_REG	rdi
++#endif
++
++#ifdef USE_XMM_LESS_VEC
++# define XMM_SMALL	1
++#else
++# define XMM_SMALL	0
+ #endif
+ 
+ #define PAGE_SIZE 4096
+@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ 
+ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 	shl	$2, %RDX_LP
+-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+-	jmp	L(entry_from_bzero)
++	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
++	WMEMSET_VDUP_TO_VEC0_LOW()
++	cmpq	$VEC_SIZE, %rdx
++	jb	L(less_vec_no_vdup)
++	WMEMSET_VDUP_TO_VEC0_HIGH()
++	jmp	L(entry_from_wmemset)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ #endif
+ 
+@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
++	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
++	MEMSET_VDUP_TO_VEC0_HIGH()
++L(entry_from_wmemset):
+ 	cmpq	$(VEC_SIZE * 2), %rdx
+ 	ja	L(more_2x_vec)
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
+-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
++	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+ # endif
+ 	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
++	MEMSET_VDUP_TO_VEC0_HIGH ()
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+-	 */
+-	VMOVU	%VEC(0), (%rax)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
++	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	%VEC(0), (%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ 
+-	.p2align 4,, 10
++	.p2align 4,, 4
+ L(last_2x_vec):
+ #ifdef USE_LESS_VEC_MASK_STORE
+-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
++	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+ #else
+ 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+ 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+@@ -212,6 +228,7 @@ L(last_2x_vec):
+ #ifdef USE_LESS_VEC_MASK_STORE
+ 	.p2align 4,, 10
+ L(less_vec):
++L(less_vec_no_vdup):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
+ 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+ 	   and (4x, 8x] jump to target.  */
+ L(more_2x_vec):
+-
+-	/* Two different methods of setting up pointers / compare. The
+-	   two methods are based on the fact that EVEX/AVX512 mov
+-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
+-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
+-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
+-	   this saves code size and keeps a few targets in one fetch block.
+-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
+-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+-	   LOOP_4X_OFFSET) with LEA_BID.  */
+-
+-	/* END_REG is rcx for EVEX/AVX512.  */
+-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+-#endif
+-
+-	/* Stores to first 2x VEC before cmp as any path forward will
+-	   require it.  */
+-	VMOVU	%VEC(0), (%rax)
+-	VMOVU	%VEC(0), VEC_SIZE(%rax)
++	/* Store next 2x vec regardless.  */
++	VMOVU	%VEC(0), (%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
+ 
+ 
++	/* Two different methods of setting up pointers / compare. The two
++	   methods are based on the fact that EVEX/AVX512 mov instructions take
++	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
++	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
++	   address mode. For EVEX/AVX512 this saves code size and keeps a few
++	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
++	   bottlenecks.  */
+ #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+ 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+ 	addq	%rdx, %END_REG
+@@ -292,6 +299,15 @@ L(more_2x_vec):
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	jbe	L(last_2x_vec)
+ 
++
++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
++	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
++	   LEA_BID.  */
++
++	/* END_REG is rcx for EVEX/AVX512.  */
++	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
++#endif
++
+ 	/* Store next 2x vec regardless.  */
+ 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+@@ -355,65 +371,93 @@ L(stosb_local):
+ 	/* Define L(less_vec) only if not otherwise defined.  */
+ 	.p2align 4
+ L(less_vec):
++	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
++	   xmm). This is only does anything for AVX2.  */
++	MEMSET_VDUP_TO_VEC0_LOW ()
++L(less_vec_no_vdup):
+ #endif
+ L(cross_page):
+ #if VEC_SIZE > 32
+ 	cmpl	$32, %edx
+-	jae	L(between_32_63)
++	jge	L(between_32_63)
+ #endif
+ #if VEC_SIZE > 16
+ 	cmpl	$16, %edx
+-	jae	L(between_16_31)
++	jge	L(between_16_31)
++#endif
++#ifndef USE_XMM_LESS_VEC
++	MOVQ	%XMM0, %rcx
+ #endif
+-	MOVQ	%XMM0, %rdi
+ 	cmpl	$8, %edx
+-	jae	L(between_8_15)
++	jge	L(between_8_15)
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
++	jge	L(between_4_7)
+ 	cmpl	$1, %edx
+-	ja	L(between_2_3)
+-	jb	L(return)
+-	movb	%sil, (%rax)
+-	VZEROUPPER_RETURN
++	jg	L(between_2_3)
++	jl	L(between_0_0)
++	movb	%sil, (%LESS_VEC_REG)
++L(between_0_0):
++	ret
+ 
+-	/* Align small targets only if not doing so would cross a fetch
+-	   line.  */
++	/* Align small targets only if not doing so would cross a fetch line.
++	 */
+ #if VEC_SIZE > 32
+ 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, (%rax)
+-	VMOVU	%YMM0, -32(%rax, %rdx)
++	VMOVU	%YMM0, (%LESS_VEC_REG)
++	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ 
+ #if VEC_SIZE >= 32
+-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
++	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
+ L(between_16_31):
+ 	/* From 16 to 31.  No branch when size == 16.  */
+-	VMOVU	%XMM0, (%rax)
+-	VMOVU	%XMM0, -16(%rax, %rdx)
+-	VZEROUPPER_RETURN
++	VMOVU	%XMM0, (%LESS_VEC_REG)
++	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
++	ret
+ #endif
+ 
+-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
++	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
++	 */
++	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
+ L(between_8_15):
+ 	/* From 8 to 15.  No branch when size == 8.  */
+-	movq	%rdi, (%rax)
+-	movq	%rdi, -8(%rax, %rdx)
+-	VZEROUPPER_RETURN
++#ifdef USE_XMM_LESS_VEC
++	MOVQ	%XMM0, (%rdi)
++	MOVQ	%XMM0, -8(%rdi, %rdx)
++#else
++	movq	%rcx, (%LESS_VEC_REG)
++	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
++#endif
++	ret
+ 
+-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
++	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
++	 */
++	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%edi, (%rax)
+-	movl	%edi, -4(%rax, %rdx)
+-	VZEROUPPER_RETURN
++#ifdef USE_XMM_LESS_VEC
++	MOVD	%XMM0, (%rdi)
++	MOVD	%XMM0, -4(%rdi, %rdx)
++#else
++	movl	%ecx, (%LESS_VEC_REG)
++	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
++#endif
++	ret
+ 
+-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
++	/* 4 * XMM_SMALL for the third mov for AVX2.  */
++	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%di, (%rax)
+-	movb	%dil, -1(%rax, %rdx)
+-	VZEROUPPER_RETURN
++#ifdef USE_XMM_LESS_VEC
++	movb	%sil, (%rdi)
++	movb	%sil, 1(%rdi)
++	movb	%sil, -1(%rdi, %rdx)
++#else
++	movw	%cx, (%LESS_VEC_REG)
++	movb	%sil, -1(%LESS_VEC_REG, %rdx)
++#endif
++	ret
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/glibc-upstream-2.34-206.patch b/glibc-upstream-2.34-206.patch
new file mode 100644
index 0000000..ed9f37b
--- /dev/null
+++ b/glibc-upstream-2.34-206.patch
@@ -0,0 +1,35 @@
+commit 190ea5f7e4e7e98b9b6e3f29835ae8b1f6a5442e
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Feb 7 00:32:23 2022 -0600
+
+    x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only)
+    
+    commit b62ace2740a106222e124cc86956448fa07abf4d
+    Author: Noah Goldstein <goldstein.w.n@gmail.com>
+    Date:   Sun Feb 6 00:54:18 2022 -0600
+    
+        x86: Improve vec generation in memset-vec-unaligned-erms.S
+    
+    Revert usage of 'pshufb' in broadcast logic as it is an SSSE3
+    instruction and memset.S is restricted to only SSE2 instructions.
+    
+    (cherry picked from commit 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 34ee0bfdcb81fb39..954471e5a5bf225b 100644
+--- a/sysdeps/x86_64/memset.S
++++ b/sysdeps/x86_64/memset.S
+@@ -30,9 +30,10 @@
+ 
+ # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-  pxor %xmm1, %xmm1; \
+-  pshufb %xmm1, %xmm0; \
+-  movq r, %rax
++  movq r, %rax; \
++  punpcklbw %xmm0, %xmm0; \
++  punpcklwd %xmm0, %xmm0; \
++  pshufd $0, %xmm0, %xmm0
+ 
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
diff --git a/glibc-upstream-2.34-207.patch b/glibc-upstream-2.34-207.patch
new file mode 100644
index 0000000..9818f5d
--- /dev/null
+++ b/glibc-upstream-2.34-207.patch
@@ -0,0 +1,719 @@
+commit 5cb6329652696e79d6d576165ea87e332c9de106
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Mon Feb 7 05:55:15 2022 -0800
+
+    x86-64: Optimize bzero
+    
+    memset with zero as the value to set is by far the majority value (99%+
+    for Python3 and GCC).
+    
+    bzero can be slightly more optimized for this case by using a zero-idiom
+    xor for broadcasting the set value to a register (vector or GPR).
+    
+    Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    (cherry picked from commit 3d9f171bfb5325bd5f427e9fc386453358c6e840)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 954471e5a5bf225b..0358210c7ff3a976 100644
+--- a/sysdeps/x86_64/memset.S
++++ b/sysdeps/x86_64/memset.S
+@@ -35,6 +35,9 @@
+   punpcklwd %xmm0, %xmm0; \
+   pshufd $0, %xmm0, %xmm0
+ 
++# define BZERO_ZERO_VEC0() \
++  pxor %xmm0, %xmm0
++
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+   pshufd $0, %xmm0, %xmm0; \
+@@ -53,6 +56,10 @@
+ # define MEMSET_SYMBOL(p,s)	memset
+ #endif
+ 
++#ifndef BZERO_SYMBOL
++# define BZERO_SYMBOL(p,s)	__bzero
++#endif
++
+ #ifndef WMEMSET_SYMBOL
+ # define WMEMSET_CHK_SYMBOL(p,s) p
+ # define WMEMSET_SYMBOL(p,s)	__wmemset
+@@ -63,6 +70,7 @@
+ libc_hidden_builtin_def (memset)
+ 
+ #if IS_IN (libc)
++weak_alias (__bzero, bzero)
+ libc_hidden_def (__wmemset)
+ weak_alias (__wmemset, wmemset)
+ libc_hidden_weak (wmemset)
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 26be40959ce62895..37d8d6f0bd2d10cc 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -1,85 +1,130 @@
+ ifeq ($(subdir),string)
+ 
+-sysdep_routines += strncat-c stpncpy-c strncpy-c \
+-		   strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3  \
+-		   strcmp-sse4_2 strcmp-avx2 \
+-		   strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \
+-		   memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
+-		   memrchr-sse2 memrchr-avx2 \
+-		   memcmp-sse2 \
+-		   memcmp-avx2-movbe \
+-		   memcmp-sse4 memcpy-ssse3 \
+-		   memmove-ssse3 \
+-		   memcpy-ssse3-back \
+-		   memmove-ssse3-back \
+-		   memmove-avx512-no-vzeroupper \
+-		   strcasecmp_l-sse2 strcasecmp_l-ssse3 \
+-		   strcasecmp_l-sse4_2 strcasecmp_l-avx \
+-		   strncase_l-sse2 strncase_l-ssse3 \
+-		   strncase_l-sse4_2 strncase_l-avx \
+-		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
+-		   strrchr-sse2 strrchr-avx2 \
+-		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+-		   strcat-avx2 strncat-avx2 \
+-		   strcat-ssse3 strncat-ssse3\
+-		   strcpy-avx2 strncpy-avx2 \
+-		   strcpy-sse2 stpcpy-sse2 \
+-		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
+-		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
+-		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+-		   stpcpy-avx2 stpncpy-avx2 \
+-		   strcat-sse2 \
+-		   strcat-sse2-unaligned strncat-sse2-unaligned \
+-		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
+-		   strcspn-sse2 strpbrk-sse2 strspn-sse2 \
+-		   strcspn-c strpbrk-c strspn-c varshift \
+-		   memset-avx512-no-vzeroupper \
+-		   memmove-sse2-unaligned-erms \
+-		   memmove-avx-unaligned-erms \
+-		   memmove-avx512-unaligned-erms \
+-		   memset-sse2-unaligned-erms \
+-		   memset-avx2-unaligned-erms \
+-		   memset-avx512-unaligned-erms \
+-		   memchr-avx2-rtm \
+-		   memcmp-avx2-movbe-rtm \
+-		   memmove-avx-unaligned-erms-rtm \
+-		   memrchr-avx2-rtm \
+-		   memset-avx2-unaligned-erms-rtm \
+-		   rawmemchr-avx2-rtm \
+-		   strchr-avx2-rtm \
+-		   strcmp-avx2-rtm \
+-		   strchrnul-avx2-rtm \
+-		   stpcpy-avx2-rtm \
+-		   stpncpy-avx2-rtm \
+-		   strcat-avx2-rtm \
+-		   strcpy-avx2-rtm \
+-		   strlen-avx2-rtm \
+-		   strncat-avx2-rtm \
+-		   strncmp-avx2-rtm \
+-		   strncpy-avx2-rtm \
+-		   strnlen-avx2-rtm \
+-		   strrchr-avx2-rtm \
+-		   memchr-evex \
+-		   memcmp-evex-movbe \
+-		   memmove-evex-unaligned-erms \
+-		   memrchr-evex \
+-		   memset-evex-unaligned-erms \
+-		   rawmemchr-evex \
+-		   stpcpy-evex \
+-		   stpncpy-evex \
+-		   strcat-evex \
+-		   strchr-evex \
+-		   strchrnul-evex \
+-		   strcmp-evex \
+-		   strcpy-evex \
+-		   strlen-evex \
+-		   strncat-evex \
+-		   strncmp-evex \
+-		   strncpy-evex \
+-		   strnlen-evex \
+-		   strrchr-evex \
+-		   memchr-evex-rtm \
+-		   rawmemchr-evex-rtm
++sysdep_routines += \
++  bzero \
++  memchr-avx2 \
++  memchr-avx2-rtm \
++  memchr-evex \
++  memchr-evex-rtm \
++  memchr-sse2 \
++  memcmp-avx2-movbe \
++  memcmp-avx2-movbe-rtm \
++  memcmp-evex-movbe \
++  memcmp-sse2 \
++  memcmp-sse4 \
++  memcmp-ssse3 \
++  memcpy-ssse3 \
++  memcpy-ssse3-back \
++  memmove-avx-unaligned-erms \
++  memmove-avx-unaligned-erms-rtm \
++  memmove-avx512-no-vzeroupper \
++  memmove-avx512-unaligned-erms \
++  memmove-evex-unaligned-erms \
++  memmove-sse2-unaligned-erms \
++  memmove-ssse3 \
++  memmove-ssse3-back \
++  memrchr-avx2 \
++  memrchr-avx2-rtm \
++  memrchr-evex \
++  memrchr-sse2 \
++  memset-avx2-unaligned-erms \
++  memset-avx2-unaligned-erms-rtm \
++  memset-avx512-no-vzeroupper \
++  memset-avx512-unaligned-erms \
++  memset-evex-unaligned-erms \
++  memset-sse2-unaligned-erms \
++  rawmemchr-avx2 \
++  rawmemchr-avx2-rtm \
++  rawmemchr-evex \
++  rawmemchr-evex-rtm \
++  rawmemchr-sse2 \
++  stpcpy-avx2 \
++  stpcpy-avx2-rtm \
++  stpcpy-evex \
++  stpcpy-sse2 \
++  stpcpy-sse2-unaligned \
++  stpcpy-ssse3 \
++  stpncpy-avx2 \
++  stpncpy-avx2-rtm \
++  stpncpy-c \
++  stpncpy-evex \
++  stpncpy-sse2-unaligned \
++  stpncpy-ssse3 \
++  strcasecmp_l-avx \
++  strcasecmp_l-sse2 \
++  strcasecmp_l-sse4_2 \
++  strcasecmp_l-ssse3 \
++  strcat-avx2 \
++  strcat-avx2-rtm \
++  strcat-evex \
++  strcat-sse2 \
++  strcat-sse2-unaligned \
++  strcat-ssse3 \
++  strchr-avx2 \
++  strchr-avx2-rtm \
++  strchr-evex \
++  strchr-sse2 \
++  strchr-sse2-no-bsf \
++  strchrnul-avx2 \
++  strchrnul-avx2-rtm \
++  strchrnul-evex \
++  strchrnul-sse2 \
++  strcmp-avx2 \
++  strcmp-avx2-rtm \
++  strcmp-evex \
++  strcmp-sse2 \
++  strcmp-sse2-unaligned \
++  strcmp-sse4_2 \
++  strcmp-ssse3 \
++  strcpy-avx2 \
++  strcpy-avx2-rtm \
++  strcpy-evex \
++  strcpy-sse2 \
++  strcpy-sse2-unaligned \
++  strcpy-ssse3 \
++  strcspn-c \
++  strcspn-sse2 \
++  strlen-avx2 \
++  strlen-avx2-rtm \
++  strlen-evex \
++  strlen-sse2 \
++  strncase_l-avx \
++  strncase_l-sse2 \
++  strncase_l-sse4_2 \
++  strncase_l-ssse3 \
++  strncat-avx2 \
++  strncat-avx2-rtm \
++  strncat-c \
++  strncat-evex \
++  strncat-sse2-unaligned \
++  strncat-ssse3 \
++  strncmp-avx2 \
++  strncmp-avx2-rtm \
++  strncmp-evex \
++  strncmp-sse2 \
++  strncmp-sse4_2 \
++  strncmp-ssse3 \
++  strncpy-avx2 \
++  strncpy-avx2-rtm \
++  strncpy-c \
++  strncpy-evex \
++  strncpy-sse2-unaligned \
++  strncpy-ssse3 \
++  strnlen-avx2 \
++  strnlen-avx2-rtm \
++  strnlen-evex \
++  strnlen-sse2 \
++  strpbrk-c \
++  strpbrk-sse2 \
++  strrchr-avx2 \
++  strrchr-avx2-rtm \
++  strrchr-evex \
++  strrchr-sse2 \
++  strspn-c \
++  strspn-sse2 \
++  strstr-sse2-unaligned \
++  varshift \
++# sysdep_routines
+ CFLAGS-varshift.c += -msse4
+ CFLAGS-strcspn-c.c += -msse4
+ CFLAGS-strpbrk-c.c += -msse4
+diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
+new file mode 100644
+index 0000000000000000..13e399a9a1fbdeb2
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/bzero.c
+@@ -0,0 +1,108 @@
++/* Multiple versions of bzero.
++   All versions must be listed in ifunc-impl-list.c.
++   Copyright (C) 2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++/* Define multiple versions only for the definition in libc.  */
++#if IS_IN (libc)
++# define __bzero __redirect___bzero
++# include <string.h>
++# undef __bzero
++
++/* OPTIMIZE1 definition required for bzero patch.  */
++# define OPTIMIZE1(name)	EVALUATOR1 (SYMBOL_NAME, name)
++# define SYMBOL_NAME __bzero
++# include <init-arch.h>
++
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
++  attribute_hidden;
++
++static inline void *
++IFUNC_SELECTOR (void)
++{
++  const struct cpu_features* cpu_features = __get_cpu_features ();
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
++      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE1 (avx512_unaligned_erms);
++
++	  return OPTIMIZE1 (avx512_unaligned);
++	}
++    }
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE1 (evex_unaligned_erms);
++
++	  return OPTIMIZE1 (evex_unaligned);
++	}
++
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE1 (avx2_unaligned_erms_rtm);
++
++	  return OPTIMIZE1 (avx2_unaligned_rtm);
++	}
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE1 (avx2_unaligned_erms);
++
++	  return OPTIMIZE1 (avx2_unaligned);
++	}
++    }
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++    return OPTIMIZE1 (sse2_unaligned_erms);
++
++  return OPTIMIZE1 (sse2_unaligned);
++}
++
++libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
++
++weak_alias (__bzero, bzero)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 39ab10613bb0ffea..4992d7bd3206a7c0 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_avx512_no_vzeroupper)
+ 	     )
+ 
++  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
++  IFUNC_IMPL (i, name, bzero,
++	      IFUNC_IMPL_ADD (array, i, bzero, 1,
++			      __bzero_sse2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, bzero, 1,
++			      __bzero_sse2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      CPU_FEATURE_USABLE (AVX2),
++			      __bzero_avx2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      CPU_FEATURE_USABLE (AVX2),
++			      __bzero_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __bzero_avx2_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __bzero_avx2_unaligned_erms_rtm)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __bzero_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __bzero_evex_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __bzero_avx512_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __bzero_avx512_unaligned)
++	     )
++
+   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
+   IFUNC_IMPL (i, name, rawmemchr,
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+index 8ac3e479bba488be..5a5ee6f67299400b 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+@@ -5,6 +5,7 @@
+ 
+ #define SECTION(p) p##.avx.rtm
+ #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
++#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
+ #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+ 
+ #include "memset-avx2-unaligned-erms.S"
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index c0bf2875d03d51ab..a093a2831f3dfa0d 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -14,6 +14,9 @@
+   vmovd d, %xmm0; \
+   movq r, %rax;
+ 
++# define BZERO_ZERO_VEC0() \
++  vpxor %xmm0, %xmm0, %xmm0
++
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+ 
+@@ -29,6 +32,9 @@
+ # ifndef MEMSET_SYMBOL
+ #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+ # endif
++# ifndef BZERO_SYMBOL
++#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
++# endif
+ # ifndef WMEMSET_SYMBOL
+ #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+ # endif
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 5241216a77bf72b7..727c92133a15900f 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -19,6 +19,9 @@
+   vpbroadcastb d, %VEC0; \
+   movq r, %rax
+ 
++# define BZERO_ZERO_VEC0() \
++  vpxorq %XMM0, %XMM0, %XMM0
++
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   vpbroadcastd d, %VEC0; \
+   movq r, %rax
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 637002150659123c..5d8fa78f05476b10 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -19,6 +19,9 @@
+   vpbroadcastb d, %VEC0; \
+   movq r, %rax
+ 
++# define BZERO_ZERO_VEC0() \
++  vpxorq %XMM0, %XMM0, %XMM0
++
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   vpbroadcastd d, %VEC0; \
+   movq r, %rax
+diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+index e4e95fc19fe48d2d..bac74ac37fd3c144 100644
+--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+@@ -22,6 +22,7 @@
+ 
+ #if IS_IN (libc)
+ # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
++# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
+ # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
+ 
+ # ifdef SHARED
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index c8db87dcbf69f0d8..39a096a594ccb5b6 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -26,6 +26,10 @@
+ 
+ #include <sysdep.h>
+ 
++#ifndef BZERO_SYMBOL
++# define BZERO_SYMBOL(p,s)		MEMSET_SYMBOL (p, s)
++#endif
++
+ #ifndef MEMSET_CHK_SYMBOL
+ # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
+ #endif
+@@ -87,6 +91,18 @@
+ # define XMM_SMALL	0
+ #endif
+ 
++#ifdef USE_LESS_VEC_MASK_STORE
++# define SET_REG64	rcx
++# define SET_REG32	ecx
++# define SET_REG16	cx
++# define SET_REG8	cl
++#else
++# define SET_REG64	rsi
++# define SET_REG32	esi
++# define SET_REG16	si
++# define SET_REG8	sil
++#endif
++
+ #define PAGE_SIZE 4096
+ 
+ /* Macro to calculate size of small memset block for aligning
+@@ -96,18 +112,6 @@
+ 
+ #ifndef SECTION
+ # error SECTION is not defined!
+-#endif
+-
+-	.section SECTION(.text),"ax",@progbits
+-#if VEC_SIZE == 16 && IS_IN (libc)
+-ENTRY (__bzero)
+-	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+-	mov	%RSI_LP, %RDX_LP /* Set n.  */
+-	xorl	%esi, %esi
+-	pxor	%XMM0, %XMM0
+-	jmp	L(entry_from_bzero)
+-END (__bzero)
+-weak_alias (__bzero, bzero)
+ #endif
+ 
+ #if IS_IN (libc)
+@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ 	WMEMSET_VDUP_TO_VEC0_LOW()
+ 	cmpq	$VEC_SIZE, %rdx
+-	jb	L(less_vec_no_vdup)
++	jb	L(less_vec_from_wmemset)
+ 	WMEMSET_VDUP_TO_VEC0_HIGH()
+ 	jmp	L(entry_from_wmemset)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ #endif
+ 
++ENTRY (BZERO_SYMBOL(__bzero, unaligned))
++#if VEC_SIZE > 16
++	BZERO_ZERO_VEC0 ()
++#endif
++	mov	%RDI_LP, %RAX_LP
++	mov	%RSI_LP, %RDX_LP
++#ifndef USE_LESS_VEC_MASK_STORE
++	xorl	%esi, %esi
++#endif
++	cmp	$VEC_SIZE, %RDX_LP
++	jb	L(less_vec_no_vdup)
++#ifdef USE_LESS_VEC_MASK_STORE
++	xorl	%esi, %esi
++#endif
++#if VEC_SIZE <= 16
++	BZERO_ZERO_VEC0 ()
++#endif
++	cmp	$(VEC_SIZE * 2), %RDX_LP
++	ja	L(more_2x_vec)
++	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	%VEC(0), (%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
++	VZEROUPPER_RETURN
++END (BZERO_SYMBOL(__bzero, unaligned))
++
+ #if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ 	cmp	%RDX_LP, %RCX_LP
+@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+ # endif
+-L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
+ 	MEMSET_VDUP_TO_VEC0_HIGH()
+@@ -187,6 +215,31 @@ END (__memset_erms)
+ END (MEMSET_SYMBOL (__memset, erms))
+ # endif
+ 
++ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
++# if VEC_SIZE > 16
++	BZERO_ZERO_VEC0 ()
++# endif
++	mov	%RDI_LP, %RAX_LP
++	mov	%RSI_LP, %RDX_LP
++# ifndef USE_LESS_VEC_MASK_STORE
++	xorl	%esi, %esi
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
++	jb	L(less_vec_no_vdup)
++# ifdef USE_LESS_VEC_MASK_STORE
++	xorl	%esi, %esi
++# endif
++# if VEC_SIZE <= 16
++	BZERO_ZERO_VEC0 ()
++# endif
++	cmp	$(VEC_SIZE * 2), %RDX_LP
++	ja	L(stosb_more_2x_vec)
++	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	%VEC(0), (%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
++	VZEROUPPER_RETURN
++END (BZERO_SYMBOL(__bzero, unaligned_erms))
++
+ # if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ 	cmp	%RDX_LP, %RCX_LP
+@@ -229,6 +282,7 @@ L(last_2x_vec):
+ 	.p2align 4,, 10
+ L(less_vec):
+ L(less_vec_no_vdup):
++L(less_vec_from_wmemset):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+@@ -374,8 +428,11 @@ L(less_vec):
+ 	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+ 	   xmm). This is only does anything for AVX2.  */
+ 	MEMSET_VDUP_TO_VEC0_LOW ()
++L(less_vec_from_wmemset):
++#if VEC_SIZE > 16
+ L(less_vec_no_vdup):
+ #endif
++#endif
+ L(cross_page):
+ #if VEC_SIZE > 32
+ 	cmpl	$32, %edx
+@@ -386,7 +443,10 @@ L(cross_page):
+ 	jge	L(between_16_31)
+ #endif
+ #ifndef USE_XMM_LESS_VEC
+-	MOVQ	%XMM0, %rcx
++	MOVQ	%XMM0, %SET_REG64
++#endif
++#if VEC_SIZE <= 16
++L(less_vec_no_vdup):
+ #endif
+ 	cmpl	$8, %edx
+ 	jge	L(between_8_15)
+@@ -395,7 +455,7 @@ L(cross_page):
+ 	cmpl	$1, %edx
+ 	jg	L(between_2_3)
+ 	jl	L(between_0_0)
+-	movb	%sil, (%LESS_VEC_REG)
++	movb	%SET_REG8, (%LESS_VEC_REG)
+ L(between_0_0):
+ 	ret
+ 
+@@ -428,8 +488,8 @@ L(between_8_15):
+ 	MOVQ	%XMM0, (%rdi)
+ 	MOVQ	%XMM0, -8(%rdi, %rdx)
+ #else
+-	movq	%rcx, (%LESS_VEC_REG)
+-	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
++	movq	%SET_REG64, (%LESS_VEC_REG)
++	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
+ #endif
+ 	ret
+ 
+@@ -442,8 +502,8 @@ L(between_4_7):
+ 	MOVD	%XMM0, (%rdi)
+ 	MOVD	%XMM0, -4(%rdi, %rdx)
+ #else
+-	movl	%ecx, (%LESS_VEC_REG)
+-	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
++	movl	%SET_REG32, (%LESS_VEC_REG)
++	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
+ #endif
+ 	ret
+ 
+@@ -452,12 +512,12 @@ L(between_4_7):
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+ #ifdef USE_XMM_LESS_VEC
+-	movb	%sil, (%rdi)
+-	movb	%sil, 1(%rdi)
+-	movb	%sil, -1(%rdi, %rdx)
++	movb	%SET_REG8, (%rdi)
++	movb	%SET_REG8, 1(%rdi)
++	movb	%SET_REG8, -1(%rdi, %rdx)
+ #else
+-	movw	%cx, (%LESS_VEC_REG)
+-	movb	%sil, -1(%LESS_VEC_REG, %rdx)
++	movw	%SET_REG16, (%LESS_VEC_REG)
++	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
+ #endif
+ 	ret
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/glibc-upstream-2.34-208.patch b/glibc-upstream-2.34-208.patch
new file mode 100644
index 0000000..d4d9b52
--- /dev/null
+++ b/glibc-upstream-2.34-208.patch
@@ -0,0 +1,29 @@
+commit 70509f9b4807295b2b4b43bffe110580fc0381ef
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Sat Feb 12 00:45:00 2022 -0600
+
+    x86: Set .text section in memset-vec-unaligned-erms
+    
+    commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
+    Author: H.J. Lu <hjl.tools@gmail.com>
+    Date:   Mon Feb 7 05:55:15 2022 -0800
+    
+        x86-64: Optimize bzero
+    
+    Remove setting the .text section for the code. This commit
+    adds that back.
+    
+    (cherry picked from commit 7912236f4a597deb092650ca79f33504ddb4af28)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 39a096a594ccb5b6..d9c577fb5ff9700f 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -114,6 +114,7 @@
+ # error SECTION is not defined!
+ #endif
+ 
++	.section SECTION(.text), "ax", @progbits
+ #if IS_IN (libc)
+ # if defined SHARED
+ ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
diff --git a/glibc-upstream-2.34-209.patch b/glibc-upstream-2.34-209.patch
new file mode 100644
index 0000000..4874143
--- /dev/null
+++ b/glibc-upstream-2.34-209.patch
@@ -0,0 +1,76 @@
+commit 5373c90f2ea3c3fa9931a684c9b81c648dfbe8d7
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Tue Feb 15 20:27:21 2022 -0600
+
+    x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895]
+    
+    Logic can read before the start of `s1` / `s2` if both `s1` and `s2`
+    are near the start of a page. To avoid having the result contaminated by
+    these comparisons the `strcmp` variants would mask off these
+    comparisons. This was missing in the `strncmp` variants causing
+    the bug. This commit adds the masking to `strncmp` so that out of
+    range comparisons don't affect the result.
+    
+    test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as
+    well as a full xcheck on x86_64 linux.
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    
+    (cherry picked from commit e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf)
+
+diff --git a/string/test-strncmp.c b/string/test-strncmp.c
+index 97e831d88fd24316..56e23670ae7f90e4 100644
+--- a/string/test-strncmp.c
++++ b/string/test-strncmp.c
+@@ -438,13 +438,23 @@ check3 (void)
+ static void
+ check4 (void)
+ {
+-  const CHAR *s1 = L ("abc");
+-  CHAR *s2 = STRDUP (s1);
++  /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of
++     the end of the page. 2) For there to be no mismatch/null byte before the
++     first page cross. 3) For length (`n`) to be large enough for one string to
++     cross the page. And 4) for there to be either mismatch/null bytes before
++     the start of the strings.  */
++
++  size_t size = 10;
++  size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1);
++  CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa));
++  CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed));
++  int exp_result;
+ 
++  STRCPY (s1, L ("tst-tlsmod%"));
++  STRCPY (s2, L ("tst-tls-manydynamic73mod"));
++  exp_result = SIMPLE_STRNCMP (s1, s2, size);
+   FOR_EACH_IMPL (impl, 0)
+-    check_result (impl, s1, s2, SIZE_MAX, 0);
+-
+-  free (s2);
++  check_result (impl, s1, s2, size, exp_result);
+ }
+ 
+ int
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index cdded412a70bad10..f9bdc5ccd03aa1f9 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -661,6 +661,7 @@ L(ret8):
+ # ifdef USE_AS_STRNCMP
+ 	.p2align 4,, 10
+ L(return_page_cross_end_check):
++	andl	%r10d, %ecx
+ 	tzcntl	%ecx, %ecx
+ 	leal	-VEC_SIZE(%rax, %rcx), %ecx
+ 	cmpl	%ecx, %edx
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index ed56af8ecdad48b2..0dfa62bd149c02b4 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -689,6 +689,7 @@ L(ret8):
+ # ifdef USE_AS_STRNCMP
+ 	.p2align 4,, 10
+ L(return_page_cross_end_check):
++	andl	%r10d, %ecx
+ 	tzcntl	%ecx, %ecx
+ 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
+ #  ifdef USE_AS_WCSCMP
diff --git a/glibc-upstream-2.34-210.patch b/glibc-upstream-2.34-210.patch
new file mode 100644
index 0000000..4898d45
--- /dev/null
+++ b/glibc-upstream-2.34-210.patch
@@ -0,0 +1,71 @@
+commit e123f08ad5ea4691bc37430ce536988c221332d6
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Thu Mar 24 15:50:33 2022 -0500
+
+    x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
+    
+    Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
+    __wcscmp_avx2.
+    
+    commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
+    Author: Noah Goldstein <goldstein.w.n@gmail.com>
+    Date:   Sun Jan 9 16:02:21 2022 -0600
+    
+        x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
+    
+    Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set
+    to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which
+    can cause spurious aborts.
+    
+    This change will need to be backported.
+    
+    All string/memory tests pass.
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    
+    (cherry picked from commit 9fef7039a7d04947bc89296ee0d187bc8d89b772)
+
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index aef9866cf2fbe774..ba6543be8ce13927 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
++++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -70,6 +70,16 @@ function_overflow (void)
+     return 1;
+ }
+ 
++__attribute__ ((noinline, noclone))
++static int
++function_overflow2 (void)
++{
++  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
++    return 0;
++  else
++    return 1;
++}
++
+ static int
+ do_test (void)
+ {
+@@ -77,5 +87,10 @@ do_test (void)
+   if (status != EXIT_SUCCESS)
+     return status;
+   status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
++  if (status != EXIT_SUCCESS)
++    return status;
++  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
++  if (status != EXIT_SUCCESS)
++    return status;
+   return status;
+ }
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index f9bdc5ccd03aa1f9..09a73942086f9c9f 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -122,7 +122,7 @@ ENTRY(STRCMP)
+ 	   are cases where length is large enough that it can never be a
+ 	   bound on valid memory so just use wcscmp.  */
+ 	shrq	$56, %rcx
+-	jnz	__wcscmp_avx2
++	jnz	OVERFLOW_STRCMP
+ 
+ 	leaq	(, %rdx, 4), %rdx
+ #  endif
diff --git a/glibc-upstream-2.34-211.patch b/glibc-upstream-2.34-211.patch
new file mode 100644
index 0000000..1221458
--- /dev/null
+++ b/glibc-upstream-2.34-211.patch
@@ -0,0 +1,170 @@
+commit e4a2fb76efb45210c541ee3f8ef32f317783c3a8
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Wed May 11 20:30:49 2022 +0200
+
+    manual: Document the dlinfo function
+    
+    Reviewed-by: Carlos O'Donell <carlos@redhat.com>
+    Tested-by: Carlos O'Donell <carlos@redhat.com>
+    (cherry picked from commit 93804a1ee084d4bdc620b2b9f91615c7da0fabe1)
+    
+    Also includes partial backport of commit 5d28a8962dcb6ec056b81d730e
+    (the addition of manual/dynlink.texi).
+
+diff --git a/manual/Makefile b/manual/Makefile
+index e83444341e282916..31678681ef059e0f 100644
+--- a/manual/Makefile
++++ b/manual/Makefile
+@@ -39,7 +39,7 @@ chapters = $(addsuffix .texi, \
+ 		       pipe socket terminal syslog math arith time	\
+ 		       resource setjmp signal startup process ipc job	\
+ 		       nss users sysinfo conf crypt debug threads	\
+-		       probes tunables)
++		       dynlink probes tunables)
+ appendices = lang.texi header.texi install.texi maint.texi platform.texi \
+ 	     contrib.texi
+ licenses = freemanuals.texi lgpl-2.1.texi fdl-1.3.texi
+diff --git a/manual/dynlink.texi b/manual/dynlink.texi
+new file mode 100644
+index 0000000000000000..dbf3de11769d8e57
+--- /dev/null
++++ b/manual/dynlink.texi
+@@ -0,0 +1,100 @@
++@node Dynamic Linker
++@c @node Dynamic Linker, Internal Probes, Threads, Top
++@c %MENU% Loading programs and shared objects.
++@chapter Dynamic Linker
++@cindex dynamic linker
++@cindex dynamic loader
++
++The @dfn{dynamic linker} is responsible for loading dynamically linked
++programs and their dependencies (in the form of shared objects).  The
++dynamic linker in @theglibc{} also supports loading shared objects (such
++as plugins) later at run time.
++
++Dynamic linkers are sometimes called @dfn{dynamic loaders}.
++
++@menu
++* Dynamic Linker Introspection::    Interfaces for querying mapping information.
++@end menu
++
++@node Dynamic Linker Introspection
++@section Dynamic Linker Introspection
++
++@Theglibc{} provides various functions for querying information from the
++dynamic linker.
++
++@deftypefun {int} dlinfo (void *@var{handle}, int @var{request}, void *@var{arg})
++@safety{@mtsafe{}@asunsafe{@asucorrupt{}}@acunsafe{@acucorrupt{}}}
++@standards{GNU, dlfcn.h}
++This function returns information about @var{handle} in the memory
++location @var{arg}, based on @var{request}.  The @var{handle} argument
++must be a pointer returned by @code{dlopen} or @code{dlmopen}; it must
++not have been closed by @code{dlclose}.
++
++On success, @code{dlinfo} returns 0.  If there is an error, the function
++returns @math{-1}, and @code{dlerror} can be used to obtain a
++corresponding error message.
++
++The following operations are defined for use with @var{request}:
++
++@vtable @code
++@item RTLD_DI_LINKMAP
++The corresponding @code{struct link_map} pointer for @var{handle} is
++written to @code{*@var{arg}}.  The @var{arg} argument must be the
++address of an object of type @code{struct link_map *}.
++
++@item RTLD_DI_LMID
++The namespace identifier of @var{handle} is written to
++@code{*@var{arg}}.  The @var{arg} argument must be the address of an
++object of type @code{Lmid_t}.
++
++@item RTLD_DI_ORIGIN
++The value of the @code{$ORIGIN} dynamic string token for @var{handle} is
++written to the character array starting at @var{arg} as a
++null-terminated string.
++
++This request type should not be used because it is prone to buffer
++overflows.
++
++@item RTLD_DI_SERINFO
++@itemx RTLD_DI_SERINFOSIZE
++These requests can be used to obtain search path information for
++@var{handle}.  For both requests, @var{arg} must point to a
++@code{Dl_serinfo} object.  The @code{RTLD_DI_SERINFOSIZE} request must
++be made first; it updates the @code{dls_size} and @code{dls_cnt} members
++of the @code{Dl_serinfo} object.  The caller should then allocate memory
++to store at least @code{dls_size} bytes and pass that buffer to a
++@code{RTLD_DI_SERINFO} request.  This second request fills the
++@code{dls_serpath} array.  The number of array elements was returned in
++the @code{dls_cnt} member in the initial @code{RTLD_DI_SERINFOSIZE}
++request.  The caller is responsible for freeing the allocated buffer.
++
++This interface is prone to buffer overflows in multi-threaded processes
++because the required size can change between the
++@code{RTLD_DI_SERINFOSIZE} and @code{RTLD_DI_SERINFO} requests.
++
++@item RTLD_DI_TLS_DATA
++This request writes the address of the TLS block (in the current thread)
++for the shared object identified by @var{handle} to @code{*@var{arg}}.
++The argument @var{arg} must be the address of an object of type
++@code{void *}.  A null pointer is written if the object does not have
++any associated TLS block.
++
++@item RTLD_DI_TLS_MODID
++This request writes the TLS module ID for the shared object @var{handle}
++to @code{*@var{arg}}.  The argument @var{arg} must be the address of an
++object of type @code{size_t}.  The module ID is zero if the object
++does not have an associated TLS block.
++@end vtable
++
++The @code{dlinfo} function is a GNU extension.
++@end deftypefun
++
++@c FIXME these are undocumented:
++@c dladdr
++@c dladdr1
++@c dlclose
++@c dlerror
++@c dlmopen
++@c dlopen
++@c dlsym
++@c dlvsym
+diff --git a/manual/libdl.texi b/manual/libdl.texi
+deleted file mode 100644
+index e3fe0452d9f41d47..0000000000000000
+--- a/manual/libdl.texi
++++ /dev/null
+@@ -1,10 +0,0 @@
+-@c FIXME these are undocumented:
+-@c dladdr
+-@c dladdr1
+-@c dlclose
+-@c dlerror
+-@c dlinfo
+-@c dlmopen
+-@c dlopen
+-@c dlsym
+-@c dlvsym
+diff --git a/manual/probes.texi b/manual/probes.texi
+index 4aae76b81921f347..ee019e651706f492 100644
+--- a/manual/probes.texi
++++ b/manual/probes.texi
+@@ -1,5 +1,5 @@
+ @node Internal Probes
+-@c @node Internal Probes, Tunables, Threads, Top
++@c @node Internal Probes, Tunables, Dynamic Linker, Top
+ @c %MENU% Probes to monitor libc internal behavior
+ @chapter Internal probes
+ 
+diff --git a/manual/threads.texi b/manual/threads.texi
+index 06b6b277a1228af1..7f166bfa87e88c36 100644
+--- a/manual/threads.texi
++++ b/manual/threads.texi
+@@ -1,5 +1,5 @@
+ @node Threads
+-@c @node Threads, Internal Probes, Debugging Support, Top
++@c @node Threads, Dynamic Linker, Debugging Support, Top
+ @c %MENU% Functions, constants, and data types for working with threads
+ @chapter Threads
+ @cindex threads
diff --git a/glibc-upstream-2.34-212.patch b/glibc-upstream-2.34-212.patch
new file mode 100644
index 0000000..000023f
--- /dev/null
+++ b/glibc-upstream-2.34-212.patch
@@ -0,0 +1,256 @@
+commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Fri Apr 29 17:00:53 2022 +0200
+
+    dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo
+    
+    The information is theoretically available via dl_iterate_phdr as
+    well, but that approach is very slow if there are many shared
+    objects.
+    
+    Reviewed-by: Carlos O'Donell <carlos@redhat.com>
+    Tested-by: Carlos O'Donell <carlos@redhat.com>
+    (cherry picked from commit d056c212130280c0a54d9a4f72170ec621b70ce5)
+
+diff --git a/dlfcn/Makefile b/dlfcn/Makefile
+index 6bbfbb8344da05cb..d3965427dabed898 100644
+--- a/dlfcn/Makefile
++++ b/dlfcn/Makefile
+@@ -73,6 +73,10 @@ tststatic3-ENV = $(tststatic-ENV)
+ tststatic4-ENV = $(tststatic-ENV)
+ tststatic5-ENV = $(tststatic-ENV)
+ 
++tests-internal += \
++  tst-dlinfo-phdr \
++  # tests-internal
++
+ ifneq (,$(CXX))
+ modules-names += bug-atexit3-lib
+ else
+diff --git a/dlfcn/dlfcn.h b/dlfcn/dlfcn.h
+index 4a3b870a487ea789..24388cfedae4dd67 100644
+--- a/dlfcn/dlfcn.h
++++ b/dlfcn/dlfcn.h
+@@ -162,7 +162,12 @@ enum
+        segment, or if the calling thread has not allocated a block for it.  */
+     RTLD_DI_TLS_DATA = 10,
+ 
+-    RTLD_DI_MAX = 10
++    /* Treat ARG as const ElfW(Phdr) **, and store the address of the
++       program header array at that location.  The dlinfo call returns
++       the number of program headers in the array.  */
++    RTLD_DI_PHDR = 11,
++
++    RTLD_DI_MAX = 11
+   };
+ 
+ 
+diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c
+index 47d2daa96fa5986f..1842925fb7c594dd 100644
+--- a/dlfcn/dlinfo.c
++++ b/dlfcn/dlinfo.c
+@@ -28,6 +28,10 @@ struct dlinfo_args
+   void *handle;
+   int request;
+   void *arg;
++
++  /* This is the value that is returned from dlinfo if no error is
++     signaled.  */
++  int result;
+ };
+ 
+ static void
+@@ -40,6 +44,7 @@ dlinfo_doit (void *argsblock)
+     {
+     case RTLD_DI_CONFIGADDR:
+     default:
++      args->result = -1;
+       _dl_signal_error (0, NULL, NULL, N_("unsupported dlinfo request"));
+       break;
+ 
+@@ -75,6 +80,11 @@ dlinfo_doit (void *argsblock)
+ 	*(void **) args->arg = data;
+ 	break;
+       }
++
++    case RTLD_DI_PHDR:
++      *(const ElfW(Phdr) **) args->arg = l->l_phdr;
++      args->result = l->l_phnum;
++      break;
+     }
+ }
+ 
+@@ -82,7 +92,8 @@ static int
+ dlinfo_implementation (void *handle, int request, void *arg)
+ {
+   struct dlinfo_args args = { handle, request, arg };
+-  return _dlerror_run (&dlinfo_doit, &args) ? -1 : 0;
++  _dlerror_run (&dlinfo_doit, &args);
++  return args.result;
+ }
+ 
+ #ifdef SHARED
+diff --git a/dlfcn/tst-dlinfo-phdr.c b/dlfcn/tst-dlinfo-phdr.c
+new file mode 100644
+index 0000000000000000..a15a7d48ebd3b976
+--- /dev/null
++++ b/dlfcn/tst-dlinfo-phdr.c
+@@ -0,0 +1,125 @@
++/* Test for dlinfo (RTLD_DI_PHDR).
++   Copyright (C) 2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <dlfcn.h>
++#include <link.h>
++#include <stdbool.h>
++#include <stdio.h>
++#include <string.h>
++#include <sys/auxv.h>
++
++#include <support/check.h>
++#include <support/xdlfcn.h>
++
++/* Used to verify that the program header array appears as expected
++   among the dl_iterate_phdr callback invocations.  */
++
++struct dlip_callback_args
++{
++  struct link_map *l;           /* l->l_addr is used to find the object.  */
++  const ElfW(Phdr) *phdr;       /* Expected program header pointer.  */
++  int phnum;                    /* Expected program header count.  */
++  bool found;                   /* True if l->l_addr has been found.  */
++};
++
++static int
++dlip_callback (struct dl_phdr_info *dlpi, size_t size, void *closure)
++{
++  TEST_COMPARE (sizeof (*dlpi), size);
++  struct dlip_callback_args *args = closure;
++
++  if (dlpi->dlpi_addr == args->l->l_addr)
++    {
++      TEST_VERIFY (!args->found);
++      args->found = true;
++      TEST_VERIFY (args->phdr == dlpi->dlpi_phdr);
++      TEST_COMPARE (args->phnum, dlpi->dlpi_phnum);
++    }
++
++  return 0;
++}
++
++static int
++do_test (void)
++{
++  /* Avoid a copy relocation.  */
++  struct r_debug *debug = xdlsym (RTLD_DEFAULT, "_r_debug");
++  struct link_map *l = (struct link_map *) debug->r_map;
++  TEST_VERIFY_EXIT (l != NULL);
++
++  do
++    {
++      printf ("info: checking link map %p (%p) for \"%s\"\n",
++              l, l->l_phdr, l->l_name);
++
++      /* Cause dlerror () to return an error message.  */
++      dlsym (RTLD_DEFAULT, "does-not-exist");
++
++      /* Use the extension that link maps are valid dlopen handles.  */
++      const ElfW(Phdr) *phdr;
++      int phnum = dlinfo (l, RTLD_DI_PHDR, &phdr);
++      TEST_VERIFY (phnum >= 0);
++      /* Verify that the error message has been cleared.  */
++      TEST_COMPARE_STRING (dlerror (), NULL);
++
++      TEST_VERIFY (phdr == l->l_phdr);
++      TEST_COMPARE (phnum, l->l_phnum);
++
++      /* Check that we can find PT_DYNAMIC among the array.  */
++      {
++        bool dynamic_found = false;
++        for (int i = 0; i < phnum; ++i)
++          if (phdr[i].p_type == PT_DYNAMIC)
++            {
++              dynamic_found = true;
++              TEST_COMPARE ((ElfW(Addr)) l->l_ld, l->l_addr + phdr[i].p_vaddr);
++            }
++        TEST_VERIFY (dynamic_found);
++      }
++
++      /* Check that dl_iterate_phdr finds the link map with the same
++         program headers.  */
++      {
++        struct dlip_callback_args args =
++          {
++            .l =  l,
++            .phdr = phdr,
++            .phnum = phnum,
++            .found = false,
++          };
++        TEST_COMPARE (dl_iterate_phdr (dlip_callback, &args), 0);
++        TEST_VERIFY (args.found);
++      }
++
++      if (l->l_prev == NULL)
++        {
++          /* This is the executable, so the information is also
++             available via getauxval.  */
++          TEST_COMPARE_STRING (l->l_name, "");
++          TEST_VERIFY (phdr == (const ElfW(Phdr) *) getauxval (AT_PHDR));
++          TEST_COMPARE (phnum, getauxval (AT_PHNUM));
++        }
++
++      l = l->l_next;
++    }
++  while (l != NULL);
++
++  return 0;
++}
++
++#include <support/test-driver.c>
+diff --git a/manual/dynlink.texi b/manual/dynlink.texi
+index dbf3de11769d8e57..7dcac64889e389fd 100644
+--- a/manual/dynlink.texi
++++ b/manual/dynlink.texi
+@@ -30,9 +30,9 @@ location @var{arg}, based on @var{request}.  The @var{handle} argument
+ must be a pointer returned by @code{dlopen} or @code{dlmopen}; it must
+ not have been closed by @code{dlclose}.
+ 
+-On success, @code{dlinfo} returns 0.  If there is an error, the function
+-returns @math{-1}, and @code{dlerror} can be used to obtain a
+-corresponding error message.
++On success, @code{dlinfo} returns 0 for most request types; exceptions
++are noted below.  If there is an error, the function returns @math{-1},
++and @code{dlerror} can be used to obtain a corresponding error message.
+ 
+ The following operations are defined for use with @var{request}:
+ 
+@@ -84,6 +84,15 @@ This request writes the TLS module ID for the shared object @var{handle}
+ to @code{*@var{arg}}.  The argument @var{arg} must be the address of an
+ object of type @code{size_t}.  The module ID is zero if the object
+ does not have an associated TLS block.
++
++@item RTLD_DI_PHDR
++This request writes the address of the program header array to
++@code{*@var{arg}}.  The argument @var{arg} must be the address of an
++object of type @code{const ElfW(Phdr) *} (that is,
++@code{const Elf32_Phdr *} or @code{const Elf64_Phdr *}, as appropriate
++for the current architecture).  For this request, the value returned by
++@code{dlinfo} is the number of program headers in the program header
++array.
+ @end vtable
+ 
+ The @code{dlinfo} function is a GNU extension.
diff --git a/glibc.spec b/glibc.spec
index 61f2ecc..691cc83 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -148,7 +148,7 @@ end \
 Summary: The GNU libc libraries
 Name: glibc
 Version: %{glibcversion}
-Release: 32%{?dist}
+Release: 33%{?dist}
 
 # In general, GPLv2+ is used by programs, LGPLv2+ is used for
 # libraries.
@@ -461,6 +461,28 @@ Patch253: glibc-upstream-2.34-187.patch
 Patch254: glibc-upstream-2.34-188.patch
 Patch255: glibc-upstream-2.34-189.patch
 Patch256: glibc-upstream-2.34-190.patch
+Patch257: glibc-upstream-2.34-191.patch
+Patch258: glibc-upstream-2.34-192.patch
+Patch259: glibc-upstream-2.34-193.patch
+Patch260: glibc-upstream-2.34-194.patch
+Patch261: glibc-upstream-2.34-195.patch
+Patch262: glibc-upstream-2.34-196.patch
+Patch263: glibc-upstream-2.34-197.patch
+Patch264: glibc-upstream-2.34-198.patch
+Patch265: glibc-upstream-2.34-199.patch
+Patch266: glibc-upstream-2.34-200.patch
+Patch267: glibc-upstream-2.34-201.patch
+Patch268: glibc-upstream-2.34-202.patch
+Patch269: glibc-upstream-2.34-203.patch
+Patch270: glibc-upstream-2.34-204.patch
+Patch271: glibc-upstream-2.34-205.patch
+Patch272: glibc-upstream-2.34-206.patch
+Patch273: glibc-upstream-2.34-207.patch
+Patch274: glibc-upstream-2.34-208.patch
+Patch275: glibc-upstream-2.34-209.patch
+Patch276: glibc-upstream-2.34-210.patch
+Patch277: glibc-upstream-2.34-211.patch
+Patch278: glibc-upstream-2.34-212.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2517,6 +2539,32 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 
 %changelog
+* Thu May 12 2022 Florian Weimer <fweimer@redhat.com> - 2.34-33
+- Sync with upstream branch release/2.34/master,
+  commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23:
+- dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo
+- manual: Document the dlinfo function
+- x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
+- x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895]
+- x86: Set .text section in memset-vec-unaligned-erms
+- x86-64: Optimize bzero
+- x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only)
+- x86: Improve vec generation in memset-vec-unaligned-erms.S
+- x86-64: Fix strcmp-evex.S
+- x86-64: Fix strcmp-avx2.S
+- x86: Optimize strcmp-evex.S
+- x86: Optimize strcmp-avx2.S
+- manual: Clarify that abbreviations of long options are allowed
+- Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h
+- aarch64: Add HWCAP2_ECV from Linux 5.16
+- Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h
+- Update kernel version to 5.17 in tst-mman-consts.py
+- Update kernel version to 5.16 in tst-mman-consts.py
+- Update syscall lists for Linux 5.17
+- Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h
+- Update kernel version to 5.15 in tst-mman-consts.py
+- Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h
+
 * Thu Apr 28 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-32
 - Sync with upstream branch release/2.34/master,
   commit c66c92181ddbd82306537a608e8c0282587131de: