From 88d869c1c2e2474b7f6610836237ec6c281ef07e Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Thu, 21 Nov 2024 20:09:48 +0100
Subject: [PATCH] Revert aarch64 memset changes (cec3aef32412779e) (#2327564)

---
 glibc-rh2327564-1.patch | 158 +++++++++++++++++++++++
 glibc-rh2327564-2.patch | 276 ++++++++++++++++++++++++++++++++++++++++
 glibc.spec              |   7 +-
 3 files changed, 440 insertions(+), 1 deletion(-)
 create mode 100644 glibc-rh2327564-1.patch
 create mode 100644 glibc-rh2327564-2.patch

diff --git a/glibc-rh2327564-1.patch b/glibc-rh2327564-1.patch
new file mode 100644
index 0000000..698bc6b
--- /dev/null
+++ b/glibc-rh2327564-1.patch
@@ -0,0 +1,158 @@
+commit d115e98ad627fae62679bc18e3bf062a898860cb
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Wed Nov 20 19:21:45 2024 +0100
+
+    Revert "AArch64: Remove memset-reg.h"
+
+    This reverts commit 8ecb477ea16a387a44ace5bf59d39a7e270b238b.
+
+diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
+new file mode 100644
+index 0000000000000000..6c7f60b37edf3b11
+--- /dev/null
++++ b/sysdeps/aarch64/memset-reg.h
+@@ -0,0 +1,30 @@
++/* Register aliases for memset to be used across implementations.
++   Copyright (C) 2017-2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#define dstin	x0
++#define val	x1
++#define valw	w1
++#define count	x2
++#define dst	x3
++#define dstend	x4
++#define tmp1	x5
++#define tmp1w	w5
++#define tmp2	x6
++#define tmp2w	w6
++#define zva_len x7
++#define zva_lenw w7
+diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
+index b76dde1557ed8fb1..caafb019e2b6217b 100644
+--- a/sysdeps/aarch64/memset.S
++++ b/sysdeps/aarch64/memset.S
+@@ -30,6 +30,7 @@
+  */
+ 
+ #define dstin	x0
++#define val	x1
+ #define valw	w1
+ #define count	x2
+ #define dst	x3
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index f665b5a891433c1c..2e6d882fc931a882 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include <sysdeps/aarch64/memset-reg.h>
+ 
+ /* Assumptions:
+  *
+@@ -35,14 +36,6 @@
+ 
+ 	.arch armv8.2-a+sve
+ 
+-#define dstin   x0
+-#define valw    w1
+-#define count   x2
+-#define dst     x3
+-#define dstend  x4
+-#define tmp1    x5
+-#define tmp2    x6
+-
+ 	.macro st1b_unroll first=0, last=7
+ 	st1b	z0.b, p0, [dst, \first, mul vl]
+ 	.if \last-\first
+diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
+index cf1b25f2edf64900..6d714ed0e1b396ef 100644
+--- a/sysdeps/aarch64/multiarch/memset_emag.S
++++ b/sysdeps/aarch64/multiarch/memset_emag.S
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include "memset-reg.h"
+ 
+ /* Assumptions:
+  *
+@@ -25,13 +26,6 @@
+  *
+  */
+ 
+-#define dstin	x0
+-#define val	x1
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-
+ ENTRY (__memset_emag)
+ 
+ 	PTR_ARG (0)
+diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
+index f815c20b0383f057..7b215501376cbe03 100644
+--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
++++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include <sysdeps/aarch64/memset-reg.h>
+ 
+ /* Assumptions:
+  *
+@@ -25,12 +26,6 @@
+  *
+  */
+ 
+-#define dstin	x0
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-
+ ENTRY (__memset_kunpeng)
+ 
+ 	PTR_ARG (0)
+diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
+index 6fa28a9bd030a705..b43a43b54e1b3439 100644
+--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
++++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
+@@ -19,18 +19,12 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include "memset-reg.h"
+ 
+ /* Assumptions:
+    ARMv8-a, AArch64, unaligned accesses
+  */
+ 
+-#define dstin	x0
+-#define val	x1
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-
+ ENTRY (__memset_oryon1)
+ 
+ 	PTR_ARG (0)
diff --git a/glibc-rh2327564-2.patch b/glibc-rh2327564-2.patch
new file mode 100644
index 0000000..4afd43b
--- /dev/null
+++ b/glibc-rh2327564-2.patch
@@ -0,0 +1,276 @@
+commit b26c53ecc4dd3bc48b11e09f6ddc7c1441e126c2
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Wed Nov 20 19:21:48 2024 +0100
+
+    Revert "AArch64: Optimize memset"
+
+    This reverts commit cec3aef32412779e207f825db0d057ebb4628ae8.
+
+diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
+index caafb019e2b6217b..7ef77ee8c926de21 100644
+--- a/sysdeps/aarch64/memset.S
++++ b/sysdeps/aarch64/memset.S
+@@ -1,5 +1,4 @@
+-/* Generic optimized memset using SIMD.
+-   Copyright (C) 2012-2024 Free Software Foundation, Inc.
++/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+ 
+    This file is part of the GNU C Library.
+ 
+@@ -18,6 +17,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include "memset-reg.h"
+ 
+ #ifndef MEMSET
+ # define MEMSET memset
+@@ -25,132 +25,130 @@
+ 
+ /* Assumptions:
+  *
+- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
++ * ARMv8-a, AArch64, unaligned accesses
+  *
+  */
+ 
+-#define dstin	x0
+-#define val	x1
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-#define zva_val	x5
+-#define off	x3
+-#define dstend2	x5
+-
+ ENTRY (MEMSET)
++
+ 	PTR_ARG (0)
+ 	SIZE_ARG (2)
+ 
+ 	dup	v0.16B, valw
+-	cmp	count, 16
+-	b.lo	L(set_small)
+-
+ 	add	dstend, dstin, count
+-	cmp	count, 64
+-	b.hs	L(set_128)
+ 
+-	/* Set 16..63 bytes.  */
+-	mov	off, 16
+-	and	off, off, count, lsr 1
+-	sub	dstend2, dstend, off
+-	str	q0, [dstin]
+-	str	q0, [dstin, off]
+-	str	q0, [dstend2, -16]
+-	str	q0, [dstend, -16]
+-	ret
++	cmp	count, 96
++	b.hi	L(set_long)
++	cmp	count, 16
++	b.hs	L(set_medium)
++	mov	val, v0.D[0]
+ 
+-	.p2align 4
+ 	/* Set 0..15 bytes.  */
+-L(set_small):
+-	add	dstend, dstin, count
+-	cmp	count, 4
+-	b.lo	2f
+-	lsr	off, count, 3
+-	sub	dstend2, dstend, off, lsl 2
+-	str	s0, [dstin]
+-	str	s0, [dstin, off, lsl 2]
+-	str	s0, [dstend2, -4]
+-	str	s0, [dstend, -4]
++	tbz	count, 3, 1f
++	str	val, [dstin]
++	str	val, [dstend, -8]
++	ret
++	nop
++1:	tbz	count, 2, 2f
++	str	valw, [dstin]
++	str	valw, [dstend, -4]
+ 	ret
+-
+-	/* Set 0..3 bytes.  */
+ 2:	cbz	count, 3f
+-	lsr	off, count, 1
+ 	strb	valw, [dstin]
+-	strb	valw, [dstin, off]
+-	strb	valw, [dstend, -1]
++	tbz	count, 1, 3f
++	strh	valw, [dstend, -2]
+ 3:	ret
+ 
++	/* Set 17..96 bytes.  */
++L(set_medium):
++	str	q0, [dstin]
++	tbnz	count, 6, L(set96)
++	str	q0, [dstend, -16]
++	tbz	count, 5, 1f
++	str	q0, [dstin, 16]
++	str	q0, [dstend, -32]
++1:	ret
++
+ 	.p2align 4
+-L(set_128):
+-	bic	dst, dstin, 15
+-	cmp	count, 128
+-	b.hi	L(set_long)
+-	stp	q0, q0, [dstin]
++	/* Set 64..96 bytes.  Write 64 bytes from the start and
++	   32 bytes from the end.  */
++L(set96):
++	str	q0, [dstin, 16]
+ 	stp	q0, q0, [dstin, 32]
+-	stp	q0, q0, [dstend, -64]
+ 	stp	q0, q0, [dstend, -32]
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
++	nop
+ L(set_long):
++	and	valw, valw, 255
++	bic	dst, dstin, 15
+ 	str	q0, [dstin]
+-	str	q0, [dst, 16]
+-	tst	valw, 255
+-	b.ne	L(no_zva)
+-#ifndef ZVA64_ONLY
+-	mrs	zva_val, dczid_el0
+-	and	zva_val, zva_val, 31
+-	cmp	zva_val, 4	/* ZVA size is 64 bytes.  */
+-	b.ne	L(zva_128)
+-#endif
+-	stp	q0, q0, [dst, 32]
+-	bic	dst, dstin, 63
+-	sub	count, dstend, dst	/* Count is now 64 too large.  */
+-	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
+-
+-	/* Write last bytes before ZVA loop.  */
+-	stp	q0, q0, [dstend, -64]
+-	stp	q0, q0, [dstend, -32]
+-
+-	.p2align 4
+-L(zva64_loop):
+-	add	dst, dst, 64
+-	dc	zva, dst
++	cmp	count, 256
++	ccmp	valw, 0, 0, cs
++	b.eq	L(try_zva)
++L(no_zva):
++	sub	count, dstend, dst	/* Count is 16 too large.  */
++	sub	dst, dst, 16		/* Dst is biased by -32.  */
++	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
++1:	stp	q0, q0, [dst, 32]
++	stp	q0, q0, [dst, 64]!
++L(tail64):
+ 	subs	count, count, 64
+-	b.hi	L(zva64_loop)
++	b.hi	1b
++2:	stp	q0, q0, [dstend, -64]
++	stp	q0, q0, [dstend, -32]
+ 	ret
+ 
++L(try_zva):
++#ifndef ZVA64_ONLY
+ 	.p2align 3
+-L(no_zva):
+-	sub	count, dstend, dst	/* Count is 32 too large.  */
+-	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
+-L(no_zva_loop):
++	mrs	tmp1, dczid_el0
++	tbnz	tmp1w, 4, L(no_zva)
++	and	tmp1w, tmp1w, 15
++	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
++	b.ne	L(zva_128)
++	nop
++#endif
++	/* Write the first and last 64 byte aligned block using stp rather
++	   than using DC ZVA.  This is faster on some cores.
++	 */
++	.p2align 4
++L(zva_64):
++	str	q0, [dst, 16]
+ 	stp	q0, q0, [dst, 32]
++	bic	dst, dst, 63
+ 	stp	q0, q0, [dst, 64]
++	stp	q0, q0, [dst, 96]
++	sub	count, dstend, dst	/* Count is now 128 too large.  */
++	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
++	add	dst, dst, 128
++1:	dc	zva, dst
+ 	add	dst, dst, 64
+ 	subs	count, count, 64
+-	b.hi	L(no_zva_loop)
++	b.hi	1b
++	stp	q0, q0, [dst, 0]
++	stp	q0, q0, [dst, 32]
+ 	stp	q0, q0, [dstend, -64]
+ 	stp	q0, q0, [dstend, -32]
+ 	ret
+ 
+ #ifndef ZVA64_ONLY
+-	.p2align 4
++	.p2align 3
+ L(zva_128):
+-	cmp	zva_val, 5	/* ZVA size is 128 bytes.  */
+-	b.ne	L(no_zva)
++	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
++	b.ne	L(zva_other)
+ 
++	str	q0, [dst, 16]
+ 	stp	q0, q0, [dst, 32]
+ 	stp	q0, q0, [dst, 64]
+ 	stp	q0, q0, [dst, 96]
+ 	bic	dst, dst, 127
+ 	sub	count, dstend, dst	/* Count is now 128 too large.  */
+-	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
+-1:	add	dst, dst, 128
+-	dc	zva, dst
++	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
++	add	dst, dst, 128
++1:	dc	zva, dst
++	add	dst, dst, 128
+ 	subs	count, count, 128
+ 	b.hi	1b
+ 	stp	q0, q0, [dstend, -128]
+@@ -158,6 +156,35 @@ L(zva_128):
+ 	stp	q0, q0, [dstend, -64]
+ 	stp	q0, q0, [dstend, -32]
+ 	ret
++
++L(zva_other):
++	mov	tmp2w, 4
++	lsl	zva_lenw, tmp2w, tmp1w
++	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
++	cmp	count, tmp1
++	blo	L(no_zva)
++
++	sub	tmp2, zva_len, 1
++	add	tmp1, dst, zva_len
++	add	dst, dst, 16
++	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
++	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
++	beq	2f
++1:	stp	q0, q0, [dst], 64
++	stp	q0, q0, [dst, -32]
++	subs	count, count, 64
++	b.hi	1b
++2:	mov	dst, tmp1
++	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
++	subs	count, count, zva_len
++	b.lo	4f
++3:	dc	zva, dst
++	add	dst, dst, zva_len
++	subs	count, count, zva_len
++	b.hs	3b
++4:	add	count, count, zva_len
++	sub	dst, dst, 32	/* Bias dst for tail loop.  */
++	b	L(tail64)
+ #endif
+ 
+ END (MEMSET)
diff --git a/glibc.spec b/glibc.spec
index 1f3b2a8..a881339 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -152,7 +152,7 @@ Version: %{glibcversion}
 # - It allows using the Release number without the %%dist tag in the dependency
 #   generator to make the generated requires interchangeable between Rawhide
 #   and ELN (.elnYY < .fcXX).
-%global baserelease 19
+%global baserelease 20
 Release: %{baserelease}%{?dist}
 
 # Licenses:
@@ -342,6 +342,8 @@ Patch13: glibc-fedora-localedata-rh61908.patch
 Patch17: glibc-cs-path.patch
 Patch23: glibc-python3.patch
 Patch24: glibc-nolink-libc.patch
+Patch25: glibc-rh2327564-1.patch
+Patch26: glibc-rh2327564-2.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2358,6 +2360,9 @@ update_gconv_modules_cache ()
 %endif
 
 %changelog
+* Thu Nov 21 2024 Florian Weimer <fweimer@redhat.com> - 2.40.9000-20
+- Revert aarch64 memset changes (cec3aef32412779e) (#2327564)
+
 * Wed Nov 20 2024 Florian Weimer <fweimer@redhat.com> - 2.40.9000-19
 - Auto-sync with upstream branch master,
   commit 47311cca31e685fa7bfe19bb8cef17d2d3d7fff9: