From 88d869c1c2e2474b7f6610836237ec6c281ef07e Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Thu, 21 Nov 2024 20:09:48 +0100
Subject: [PATCH] Revert aarch64 memset changes (cec3aef32412779e) (#2327564)

---
 glibc-rh2327564-1.patch | 158 +++++++++++++++++++++++
 glibc-rh2327564-2.patch | 276 ++++++++++++++++++++++++++++++++++++++++
 glibc.spec              |   7 +-
 3 files changed, 440 insertions(+), 1 deletion(-)
 create mode 100644 glibc-rh2327564-1.patch
 create mode 100644 glibc-rh2327564-2.patch

diff --git a/glibc-rh2327564-1.patch b/glibc-rh2327564-1.patch
new file mode 100644
index 0000000..698bc6b
--- /dev/null
+++ b/glibc-rh2327564-1.patch
@@ -0,0 +1,158 @@
+commit d115e98ad627fae62679bc18e3bf062a898860cb
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Wed Nov 20 19:21:45 2024 +0100
+
+    Revert "AArch64: Remove memset-reg.h"
+
+    This reverts commit 8ecb477ea16a387a44ace5bf59d39a7e270b238b.
+
+diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
+new file mode 100644
+index 0000000000000000..6c7f60b37edf3b11
+--- /dev/null
++++ b/sysdeps/aarch64/memset-reg.h
+@@ -0,0 +1,30 @@
++/* Register aliases for memset to be used across implementations.
++   Copyright (C) 2017-2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#define dstin	x0
++#define val	x1
++#define valw	w1
++#define count	x2
++#define dst	x3
++#define dstend	x4
++#define tmp1	x5
++#define tmp1w	w5
++#define tmp2	x6
++#define tmp2w	w6
++#define zva_len x7
++#define zva_lenw w7
+diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
+index b76dde1557ed8fb1..caafb019e2b6217b 100644
+--- a/sysdeps/aarch64/memset.S
++++ b/sysdeps/aarch64/memset.S
+@@ -30,6 +30,7 @@
+  */
+ 
+ #define dstin	x0
++#define val	x1
+ #define valw	w1
+ #define count	x2
+ #define dst	x3
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index f665b5a891433c1c..2e6d882fc931a882 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include <sysdeps/aarch64/memset-reg.h>
+ 
+ /* Assumptions:
+  *
+@@ -35,14 +36,6 @@
+ 
+ 	.arch armv8.2-a+sve
+ 
+-#define dstin   x0
+-#define valw    w1
+-#define count   x2
+-#define dst     x3
+-#define dstend  x4
+-#define tmp1    x5
+-#define tmp2    x6
+-
+ 	.macro st1b_unroll first=0, last=7
+ 	st1b	z0.b, p0, [dst, \first, mul vl]
+ 	.if \last-\first
+diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
+index cf1b25f2edf64900..6d714ed0e1b396ef 100644
+--- a/sysdeps/aarch64/multiarch/memset_emag.S
++++ b/sysdeps/aarch64/multiarch/memset_emag.S
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include "memset-reg.h"
+ 
+ /* Assumptions:
+  *
+@@ -25,13 +26,6 @@
+  *
+  */
+ 
+-#define dstin	x0
+-#define val	x1
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-
+ ENTRY (__memset_emag)
+ 
+ 	PTR_ARG (0)
+diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
+index f815c20b0383f057..7b215501376cbe03 100644
+--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
++++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include <sysdeps/aarch64/memset-reg.h>
+ 
+ /* Assumptions:
+  *
+@@ -25,12 +26,6 @@
+  *
+  */
+ 
+-#define dstin	x0
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-
+ ENTRY (__memset_kunpeng)
+ 
+ 	PTR_ARG (0)
+diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
+index 6fa28a9bd030a705..b43a43b54e1b3439 100644
+--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
++++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
+@@ -19,18 +19,12 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include "memset-reg.h"
+ 
+ /* Assumptions:
+    ARMv8-a, AArch64, unaligned accesses
+  */
+ 
+-#define dstin	x0
+-#define val	x1
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-
+ ENTRY (__memset_oryon1)
+ 
+ 	PTR_ARG (0)
diff --git a/glibc-rh2327564-2.patch b/glibc-rh2327564-2.patch
new file mode 100644
index 0000000..4afd43b
--- /dev/null
+++ b/glibc-rh2327564-2.patch
@@ -0,0 +1,276 @@
+commit b26c53ecc4dd3bc48b11e09f6ddc7c1441e126c2
+Author: Florian Weimer <fweimer@redhat.com>
+Date:   Wed Nov 20 19:21:48 2024 +0100
+
+    Revert "AArch64: Optimize memset"
+
+    This reverts commit cec3aef32412779e207f825db0d057ebb4628ae8.
+
+diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
+index caafb019e2b6217b..7ef77ee8c926de21 100644
+--- a/sysdeps/aarch64/memset.S
++++ b/sysdeps/aarch64/memset.S
+@@ -1,5 +1,4 @@
+-/* Generic optimized memset using SIMD.
+-   Copyright (C) 2012-2024 Free Software Foundation, Inc.
++/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+ 
+    This file is part of the GNU C Library.
+ 
+@@ -18,6 +17,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#include "memset-reg.h"
+ 
+ #ifndef MEMSET
+ # define MEMSET memset
+@@ -25,132 +25,130 @@
+ 
+ /* Assumptions:
+  *
+- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
++ * ARMv8-a, AArch64, unaligned accesses
+  *
+  */
+ 
+-#define dstin	x0
+-#define val	x1
+-#define valw	w1
+-#define count	x2
+-#define dst	x3
+-#define dstend	x4
+-#define zva_val	x5
+-#define off	x3
+-#define dstend2	x5
+-
+ ENTRY (MEMSET)
++
+ 	PTR_ARG (0)
+ 	SIZE_ARG (2)
+ 
+ 	dup	v0.16B, valw
+-	cmp	count, 16
+-	b.lo	L(set_small)
+-
+ 	add	dstend, dstin, count
+-	cmp	count, 64
+-	b.hs	L(set_128)
+ 
+-	/* Set 16..63 bytes.  */
+-	mov	off, 16
+-	and	off, off, count, lsr 1
+-	sub	dstend2, dstend, off
+-	str	q0, [dstin]
+-	str	q0, [dstin, off]
+-	str	q0, [dstend2, -16]
+-	str	q0, [dstend, -16]
+-	ret
++	cmp	count, 96
++	b.hi	L(set_long)
++	cmp	count, 16
++	b.hs	L(set_medium)
++	mov	val, v0.D[0]
+ 
+-	.p2align 4
+ 	/* Set 0..15 bytes.  */
+-L(set_small):
+-	add	dstend, dstin, count
+-	cmp	count, 4
+-	b.lo	2f
+-	lsr	off, count, 3
+-	sub	dstend2, dstend, off, lsl 2
+-	str	s0, [dstin]
+-	str	s0, [dstin, off, lsl 2]
+-	str	s0, [dstend2, -4]
+-	str	s0, [dstend, -4]
++	tbz	count, 3, 1f
++	str	val, [dstin]
++	str	val, [dstend, -8]
++	ret
++	nop
++1:	tbz	count, 2, 2f
++	str	valw, [dstin]
++	str	valw, [dstend, -4]
+ 	ret
+-
+-	/* Set 0..3 bytes.  */
+ 2:	cbz	count, 3f
+-	lsr	off, count, 1
+ 	strb	valw, [dstin]
+-	strb	valw, [dstin, off]
+-	strb	valw, [dstend, -1]
++	tbz	count, 1, 3f
++	strh	valw, [dstend, -2]
+ 3:	ret
+ 
++	/* Set 17..96 bytes.  */
++L(set_medium):
++	str	q0, [dstin]
++	tbnz	count, 6, L(set96)
++	str	q0, [dstend, -16]
++	tbz	count, 5, 1f
++	str	q0, [dstin, 16]
++	str	q0, [dstend, -32]
++1:	ret
++
+ 	.p2align 4
+-L(set_128):
+-	bic	dst, dstin, 15
+-	cmp	count, 128
+-	b.hi	L(set_long)
+-	stp	q0, q0, [dstin]
++	/* Set 64..96 bytes.  Write 64 bytes from the start and
++	   32 bytes from the end.  */
++L(set96):
++	str	q0, [dstin, 16]
+ 	stp	q0, q0, [dstin, 32]
+-	stp	q0, q0, [dstend, -64]
+ 	stp	q0, q0, [dstend, -32]
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
++	nop
+ L(set_long):
++	and	valw, valw, 255
++	bic	dst, dstin, 15
+ 	str	q0, [dstin]
+-	str	q0, [dst, 16]
+-	tst	valw, 255
+-	b.ne	L(no_zva)
+-#ifndef ZVA64_ONLY
+-	mrs	zva_val, dczid_el0
+-	and	zva_val, zva_val, 31
+-	cmp	zva_val, 4	/* ZVA size is 64 bytes.  */
+-	b.ne	L(zva_128)
+-#endif
+-	stp	q0, q0, [dst, 32]
+-	bic	dst, dstin, 63
+-	sub	count, dstend, dst	/* Count is now 64 too large.  */
+-	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
+-
+-	/* Write last bytes before ZVA loop.  */
+-	stp	q0, q0, [dstend, -64]
+-	stp	q0, q0, [dstend, -32]
+-
+-	.p2align 4
+-L(zva64_loop):
+-	add	dst, dst, 64
+-	dc	zva, dst
++	cmp	count, 256
++	ccmp	valw, 0, 0, cs
++	b.eq	L(try_zva)
++L(no_zva):
++	sub	count, dstend, dst	/* Count is 16 too large.  */
++	sub	dst, dst, 16		/* Dst is biased by -32.  */
++	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
++1:	stp	q0, q0, [dst, 32]
++	stp	q0, q0, [dst, 64]!
++L(tail64):
+ 	subs	count, count, 64
+-	b.hi	L(zva64_loop)
++	b.hi	1b
++2:	stp	q0, q0, [dstend, -64]
++	stp	q0, q0, [dstend, -32]
+ 	ret
+ 
++L(try_zva):
++#ifndef ZVA64_ONLY
+ 	.p2align 3
+-L(no_zva):
+-	sub	count, dstend, dst	/* Count is 32 too large.  */
+-	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
+-L(no_zva_loop):
++	mrs	tmp1, dczid_el0
++	tbnz	tmp1w, 4, L(no_zva)
++	and	tmp1w, tmp1w, 15
++	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
++	b.ne	L(zva_128)
++	nop
++#endif
++	/* Write the first and last 64 byte aligned block using stp rather
++	   than using DC ZVA.  This is faster on some cores.
++	 */
++	.p2align 4
++L(zva_64):
++	str	q0, [dst, 16]
+ 	stp	q0, q0, [dst, 32]
++	bic	dst, dst, 63
+ 	stp	q0, q0, [dst, 64]
++	stp	q0, q0, [dst, 96]
++	sub	count, dstend, dst	/* Count is now 128 too large.  */
++	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
++	add	dst, dst, 128
++1:	dc	zva, dst
+ 	add	dst, dst, 64
+ 	subs	count, count, 64
+-	b.hi	L(no_zva_loop)
++	b.hi	1b
++	stp	q0, q0, [dst, 0]
++	stp	q0, q0, [dst, 32]
+ 	stp	q0, q0, [dstend, -64]
+ 	stp	q0, q0, [dstend, -32]
+ 	ret
+ 
+ #ifndef ZVA64_ONLY
+-	.p2align 4
++	.p2align 3
+ L(zva_128):
+-	cmp	zva_val, 5	/* ZVA size is 128 bytes.  */
+-	b.ne	L(no_zva)
++	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
++	b.ne	L(zva_other)
+ 
++	str	q0, [dst, 16]
+ 	stp	q0, q0, [dst, 32]
+ 	stp	q0, q0, [dst, 64]
+ 	stp	q0, q0, [dst, 96]
+ 	bic	dst, dst, 127
+ 	sub	count, dstend, dst	/* Count is now 128 too large.  */
+-	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
+-1:	add	dst, dst, 128
+-	dc	zva, dst
++	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
++	add	dst, dst, 128
++1:	dc	zva, dst
++	add	dst, dst, 128
+ 	subs	count, count, 128
+ 	b.hi	1b
+ 	stp	q0, q0, [dstend, -128]
+@@ -158,6 +156,35 @@ L(zva_128):
+ 	stp	q0, q0, [dstend, -64]
+ 	stp	q0, q0, [dstend, -32]
+ 	ret
++
++L(zva_other):
++	mov	tmp2w, 4
++	lsl	zva_lenw, tmp2w, tmp1w
++	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
++	cmp	count, tmp1
++	blo	L(no_zva)
++
++	sub	tmp2, zva_len, 1
++	add	tmp1, dst, zva_len
++	add	dst, dst, 16
++	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
++	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
++	beq	2f
++1:	stp	q0, q0, [dst], 64
++	stp	q0, q0, [dst, -32]
++	subs	count, count, 64
++	b.hi	1b
++2:	mov	dst, tmp1
++	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
++	subs	count, count, zva_len
++	b.lo	4f
++3:	dc	zva, dst
++	add	dst, dst, zva_len
++	subs	count, count, zva_len
++	b.hs	3b
++4:	add	count, count, zva_len
++	sub	dst, dst, 32	/* Bias dst for tail loop.  */
++	b	L(tail64)
+ #endif
+ 
+ END (MEMSET)
diff --git a/glibc.spec b/glibc.spec
index 1f3b2a8..a881339 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -152,7 +152,7 @@ Version: %{glibcversion}
 # - It allows using the Release number without the %%dist tag in the dependency
 #   generator to make the generated requires interchangeable between Rawhide
 #   and ELN (.elnYY < .fcXX).
-%global baserelease 19
+%global baserelease 20
 Release: %{baserelease}%{?dist}
 
 # Licenses:
@@ -342,6 +342,8 @@ Patch13: glibc-fedora-localedata-rh61908.patch
 Patch17: glibc-cs-path.patch
 Patch23: glibc-python3.patch
 Patch24: glibc-nolink-libc.patch
+Patch25: glibc-rh2327564-1.patch
+Patch26: glibc-rh2327564-2.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2358,6 +2360,9 @@ update_gconv_modules_cache ()
 %endif
 
 %changelog
+* Thu Nov 21 2024 Florian Weimer <fweimer@redhat.com> - 2.40.9000-20
+- Revert aarch64 memset changes (cec3aef32412779e) (#2327564)
+
 * Wed Nov 20 2024 Florian Weimer <fweimer@redhat.com> - 2.40.9000-19
 - Auto-sync with upstream branch master,
   commit 47311cca31e685fa7bfe19bb8cef17d2d3d7fff9: