Revert aarch64 memset changes (cec3aef32412779e) (#2327564)

2024-11-21 20:09:48 +01:00 · 2024-11-21 20:09:48 +01:00 · 88d869c1c2
commit 88d869c1c2
parent 0e9c278a7e
3 changed files with 440 additions and 1 deletions
--- a/glibc-rh2327564-1.patch
+++ b/glibc-rh2327564-1.patch
@ -0,0 +1,158 @@
+commit d115e98ad627fae62679bc18e3bf062a898860cb
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Wed Nov 20 19:21:45 2024 +0100
+
+    Revert "AArch64: Remove memset-reg.h"
+    
+    This reverts commit 8ecb477ea16a387a44ace5bf59d39a7e270b238b.
+
+diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
+new file mode 100644
+index 0000000000000000..6c7f60b37edf3b11
+--- /dev/null
+++ b/sysdeps/aarch64/memset-reg.h
+@@ -0,0 +1,30 @@
+/* Register aliases for memset to be used across implementations.
+   Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define tmp1	x5
+#define tmp1w	w5
+#define tmp2	x6
+#define tmp2w	w6
+#define zva_len x7
+#define zva_lenw w7
+diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
+index b76dde1557ed8fb1..caafb019e2b6217b 100644
+--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
+@@ -30,6 +30,7 @@
+  */
+ 
+ #define dstin	x0
+#define val	x1
+ #define valw	w1
+ #define count	x2
+ #define dst	x3
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index f665b5a891433c1c..2e6d882fc931a882 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
+#include <sysdeps/aarch64/memset-reg.h>
+ 
+ /* Assumptions:
+  *
+@@ -35,14 +36,6 @@
+ 
+ 	.arch armv8.2-a+sve
+ 
+-#define dstin   x0
+-#define valw    w1
+-#define count   x2
+-#define dst     x3
+-#define dstend  x4
+-#define tmp1    x5
+-#define tmp2    x6
+-
+ 	.macro st1b_unroll first=0, last=7
+ 	st1b	z0.b, p0, [dst, \first, mul vl]
+ 	.if \last-\first
+diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
+index cf1b25f2edf64900..6d714ed0e1b396ef 100644
+--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
+#include "memset-reg.h"
+ 
+ /* Assumptions:
+  *
+@@ -25,13 +26,6 @@
+  *
+  */
+ 
+-#define dstin	x0
+-#define val	x1
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-
+ ENTRY (__memset_emag)
+ 
+ 	PTR_ARG (0)
+diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
+index f815c20b0383f057..7b215501376cbe03 100644
+--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
+#include <sysdeps/aarch64/memset-reg.h>
+ 
+ /* Assumptions:
+  *
+@@ -25,12 +26,6 @@
+  *
+  */
+ 
+-#define dstin	x0
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-
+ ENTRY (__memset_kunpeng)
+ 
+ 	PTR_ARG (0)
+diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
+index 6fa28a9bd030a705..b43a43b54e1b3439 100644
+--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
+@@ -19,18 +19,12 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
+#include "memset-reg.h"
+ 
+ /* Assumptions:
+    ARMv8-a, AArch64, unaligned accesses
+  */
+ 
+-#define dstin	x0
+-#define val	x1
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-
+ ENTRY (__memset_oryon1)
+ 
+ 	PTR_ARG (0)
--- a/glibc-rh2327564-2.patch
+++ b/glibc-rh2327564-2.patch
@ -0,0 +1,276 @@
+commit b26c53ecc4dd3bc48b11e09f6ddc7c1441e126c2
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Wed Nov 20 19:21:48 2024 +0100
+
+    Revert "AArch64: Optimize memset"
+    
+    This reverts commit cec3aef32412779e207f825db0d057ebb4628ae8.
+
+diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
+index caafb019e2b6217b..7ef77ee8c926de21 100644
+--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
+@@ -1,5 +1,4 @@
+-/* Generic optimized memset using SIMD.
+-   Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+ 
+    This file is part of the GNU C Library.
+ 
+@@ -18,6 +17,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
+#include "memset-reg.h"
+ 
+ #ifndef MEMSET
+ # define MEMSET memset
+@@ -25,132 +25,130 @@
+ 
+ /* Assumptions:
+  *
+- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * ARMv8-a, AArch64, unaligned accesses
+  *
+  */
+ 
+-#define dstin	x0
+-#define val	x1
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-#define zva_val	x5
+-#define off	x3
+-#define dstend2	x5
+-
+ ENTRY (MEMSET)
+
+ 	PTR_ARG (0)
+ 	SIZE_ARG (2)
+ 
+ 	dup	v0.16B, valw
+-	cmp	count, 16
+-	b.lo	L(set_small)
+-
+ 	add	dstend, dstin, count
+-	cmp	count, 64
+-	b.hs	L(set_128)
+ 
+-	/* Set 16..63 bytes.  */
+-	mov	off, 16
+-	and	off, off, count, lsr 1
+-	sub	dstend2, dstend, off
+-	str	q0, [dstin]
+-	str	q0, [dstin, off]
+-	str	q0, [dstend2, -16]
+-	str	q0, [dstend, -16]
+-	ret
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+	mov	val, v0.D[0]
+ 
+-	.p2align 4
+ 	/* Set 0..15 bytes.  */
+-L(set_small):
+-	add	dstend, dstin, count
+-	cmp	count, 4
+-	b.lo	2f
+-	lsr	off, count, 3
+-	sub	dstend2, dstend, off, lsl 2
+-	str	s0, [dstin]
+-	str	s0, [dstin, off, lsl 2]
+-	str	s0, [dstend2, -4]
+-	str	s0, [dstend, -4]
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	nop
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+ 	ret
+-
+-	/* Set 0..3 bytes.  */
+ 2:	cbz	count, 3f
+-	lsr	off, count, 1
+ 	strb	valw, [dstin]
+-	strb	valw, [dstin, off]
+-	strb	valw, [dstend, -1]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+ 3:	ret
+ 
+	/* Set 17..96 bytes.  */
+L(set_medium):
+	str	q0, [dstin]
+	tbnz	count, 6, L(set96)
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
+ 	.p2align 4
+-L(set_128):
+-	bic	dst, dstin, 15
+-	cmp	count, 128
+-	b.hi	L(set_long)
+-	stp	q0, q0, [dstin]
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	str	q0, [dstin, 16]
+ 	stp	q0, q0, [dstin, 32]
+-	stp	q0, q0, [dstend, -64]
+ 	stp	q0, q0, [dstend, -32]
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+	nop
+ L(set_long):
+	and	valw, valw, 255
+	bic	dst, dstin, 15
+ 	str	q0, [dstin]
+-	str	q0, [dst, 16]
+-	tst	valw, 255
+-	b.ne	L(no_zva)
+-#ifndef ZVA64_ONLY
+-	mrs	zva_val, dczid_el0
+-	and	zva_val, zva_val, 31
+-	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+-	b.ne	L(zva_128)
+-#endif
+-	stp	q0, q0, [dst, 32]
+-	bic	dst, dstin, 63
+-	sub	count, dstend, dst	/* Count is now 64 too large.  */
+-	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
+-
+-	/* Write last bytes before ZVA loop.  */
+-	stp	q0, q0, [dstend, -64]
+-	stp	q0, q0, [dstend, -32]
+-
+-	.p2align 4
+-L(zva64_loop):
+-	add	dst, dst, 64
+-	dc	zva, dst
+	cmp	count, 256
+	ccmp	valw, 0, 0, cs
+	b.eq	L(try_zva)
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+L(tail64):
+ 	subs	count, count, 64
+-	b.hi	L(zva64_loop)
+	b.hi	1b
+2:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+ 	ret
+ 
+L(try_zva):
+#ifndef ZVA64_ONLY
+ 	.p2align 3
+-L(no_zva):
+-	sub	count, dstend, dst	/* Count is 32 too large.  */
+-	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
+-L(no_zva_loop):
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1w, 4, L(no_zva)
+	and	tmp1w, tmp1w, 15
+	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
+	b.ne	 L(zva_128)
+	nop
+#endif
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.
+	 */
+	.p2align 4
+L(zva_64):
+	str	q0, [dst, 16]
+ 	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+ 	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+ 	add	dst, dst, 64
+ 	subs	count, count, 64
+-	b.hi	L(no_zva_loop)
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+ 	stp	q0, q0, [dstend, -64]
+ 	stp	q0, q0, [dstend, -32]
+ 	ret
+ 
+ #ifndef ZVA64_ONLY
+-	.p2align 4
+	.p2align 3
+ L(zva_128):
+-	cmp	zva_val, 5		/* ZVA size is 128 bytes.  */
+-	b.ne	L(no_zva)
+	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
+	b.ne	L(zva_other)
+ 
+	str	q0, [dst, 16]
+ 	stp	q0, q0, [dst, 32]
+ 	stp	q0, q0, [dst, 64]
+ 	stp	q0, q0, [dst, 96]
+ 	bic	dst, dst, 127
+ 	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+-	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
+-1:	add	dst, dst, 128
+-	dc	zva, dst
+	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 128
+ 	subs	count, count, 128
+ 	b.hi	1b
+ 	stp	q0, q0, [dstend, -128]
+@@ -158,6 +156,35 @@ L(zva_128):
+ 	stp	q0, q0, [dstend, -64]
+ 	stp	q0, q0, [dstend, -32]
+ 	ret
+
+L(zva_other):
+	mov	tmp2w, 4
+	lsl	zva_lenw, tmp2w, tmp1w
+	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
+	cmp	count, tmp1
+	blo	L(no_zva)
+
+	sub	tmp2, zva_len, 1
+	add	tmp1, dst, zva_len
+	add	dst, dst, 16
+	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
+	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
+	beq	2f
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+2:	mov	dst, tmp1
+	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
+	subs	count, count, zva_len
+	b.lo	4f
+3:	dc	zva, dst
+	add	dst, dst, zva_len
+	subs	count, count, zva_len
+	b.hs	3b
+4:	add	count, count, zva_len
+	sub	dst, dst, 32		/* Bias dst for tail loop.  */
+	b	L(tail64)
+ #endif
+ 
+ END (MEMSET)
--- a/glibc.spec
+++ b/glibc.spec
@ -152,7 +152,7 @@ Version: %{glibcversion}
 # - It allows using the Release number without the %%dist tag in the dependency
 #   generator to make the generated requires interchangeable between Rawhide
 #   and ELN (.elnYY < .fcXX).
-%global baserelease 19
+%global baserelease 20
 Release: %{baserelease}%{?dist}

 # Licenses:
@ -342,6 +342,8 @@ Patch13: glibc-fedora-localedata-rh61908.patch
 Patch17: glibc-cs-path.patch
 Patch23: glibc-python3.patch
 Patch24: glibc-nolink-libc.patch
+Patch25: glibc-rh2327564-1.patch
+Patch26: glibc-rh2327564-2.patch

 ##############################################################################
 # Continued list of core "glibc" package information:
@ -2358,6 +2360,9 @@ update_gconv_modules_cache ()
 %endif

 %changelog
+* Thu Nov 21 2024 Florian Weimer <fweimer@redhat.com> - 2.40.9000-20
+- Revert aarch64 memset changes (cec3aef32412779e) (#2327564)
+
 * Wed Nov 20 2024 Florian Weimer <fweimer@redhat.com> - 2.40.9000-19
 - Auto-sync with upstream branch master,
  commit 47311cca31e685fa7bfe19bb8cef17d2d3d7fff9: