commit b26c53ecc4dd3bc48b11e09f6ddc7c1441e126c2
Author: Florian Weimer
Date:   Wed Nov 20 19:21:48 2024 +0100

    Revert "AArch64: Optimize memset"

    This reverts commit cec3aef32412779e207f825db0d057ebb4628ae8.

diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index caafb019e2b6217b..7ef77ee8c926de21 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,5 +1,4 @@
-/* Generic optimized memset using SIMD.
-   Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
@@ -18,6 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include "memset-reg.h"
 
 #ifndef MEMSET
 # define MEMSET memset
@@ -25,132 +25,130 @@
 /* Assumptions:
  *
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * ARMv8-a, AArch64, unaligned accesses
  *
  */
 
-#define dstin	x0
-#define val	x1
-#define valw	w1
-#define count	x2
-#define dst	x3
-#define dstend	x4
-#define zva_val	x5
-#define off	x3
-#define dstend2	x5
-
 ENTRY (MEMSET)
+	PTR_ARG (0)
 	SIZE_ARG (2)
 
 	dup	v0.16B, valw
-	cmp	count, 16
-	b.lo	L(set_small)
 	add	dstend, dstin, count
-	cmp	count, 64
-	b.hs	L(set_128)
-	/* Set 16..63 bytes.  */
-	mov	off, 16
-	and	off, off, count, lsr 1
-	sub	dstend2, dstend, off
-	str	q0, [dstin]
-	str	q0, [dstin, off]
-	str	q0, [dstend2, -16]
-	str	q0, [dstend, -16]
-	ret
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+	mov	val, v0.D[0]
-	.p2align 4
 	/* Set 0..15 bytes.  */
-L(set_small):
-	add	dstend, dstin, count
-	cmp	count, 4
-	b.lo	2f
-	lsr	off, count, 3
-	sub	dstend2, dstend, off, lsl 2
-	str	s0, [dstin]
-	str	s0, [dstin, off, lsl 2]
-	str	s0, [dstend2, -4]
-	str	s0, [dstend, -4]
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	nop
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
 	ret
-
-	/* Set 0..3 bytes.  */
 2:	cbz	count, 3f
-	lsr	off, count, 1
 	strb	valw, [dstin]
-	strb	valw, [dstin, off]
-	strb	valw, [dstend, -1]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
 3:	ret
 
+	/* Set 17..96 bytes.  */
+L(set_medium):
+	str	q0, [dstin]
+	tbnz	count, 6, L(set96)
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
 	.p2align 4
-L(set_128):
-	bic	dst, dstin, 15
-	cmp	count, 128
-	b.hi	L(set_long)
-	stp	q0, q0, [dstin]
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	str	q0, [dstin, 16]
 	stp	q0, q0, [dstin, 32]
-	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
-	.p2align 4
+	.p2align 3
+	nop
 L(set_long):
+	and	valw, valw, 255
+	bic	dst, dstin, 15
 	str	q0, [dstin]
-	str	q0, [dst, 16]
-	tst	valw, 255
-	b.ne	L(no_zva)
-#ifndef ZVA64_ONLY
-	mrs	zva_val, dczid_el0
-	and	zva_val, zva_val, 31
-	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
-	b.ne	L(zva_128)
-#endif
-	stp	q0, q0, [dst, 32]
-	bic	dst, dstin, 63
-	sub	count, dstend, dst	/* Count is now 64 too large.  */
-	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
-
-	/* Write last bytes before ZVA loop.  */
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-
-	.p2align 4
-L(zva64_loop):
-	add	dst, dst, 64
-	dc	zva, dst
+	cmp	count, 256
+	ccmp	valw, 0, 0, cs
+	b.eq	L(try_zva)
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+L(tail64):
 	subs	count, count, 64
-	b.hi	L(zva64_loop)
+	b.hi	1b
+2:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 	ret
+L(try_zva):
+#ifndef ZVA64_ONLY
 	.p2align 3
-L(no_zva):
-	sub	count, dstend, dst	/* Count is 32 too large.  */
-	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
-L(no_zva_loop):
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1w, 4, L(no_zva)
+	and	tmp1w, tmp1w, 15
+	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
+	b.ne	L(zva_128)
+	nop
+#endif
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.
+	 */
+	.p2align 4
+L(zva_64):
+	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
 	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.  */
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
 	add	dst, dst, 64
 	subs	count, count, 64
-	b.hi	L(no_zva_loop)
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
 #ifndef ZVA64_ONLY
-	.p2align 4
+	.p2align 3
 L(zva_128):
-	cmp	zva_val, 5		/* ZVA size is 128 bytes.  */
-	b.ne	L(no_zva)
+	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
+	b.ne	L(zva_other)
+	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
 	stp	q0, q0, [dst, 64]
 	stp	q0, q0, [dst, 96]
 	bic	dst, dst, 127
 	sub	count, dstend, dst	/* Count is now 128 too large.  */
-	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
-1:	add	dst, dst, 128
-	dc	zva, dst
+	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 128
 	subs	count, count, 128
 	b.hi	1b
 	stp	q0, q0, [dstend, -128]
@@ -158,6 +156,35 @@ L(zva_128):
 	stp	q0, q0, [dstend, -96]
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
+
+L(zva_other):
+	mov	tmp2w, 4
+	lsl	zva_lenw, tmp2w, tmp1w
+	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
+	cmp	count, tmp1
+	blo	L(no_zva)
+
+	sub	tmp2, zva_len, 1
+	add	tmp1, dst, zva_len
+	add	dst, dst, 16
+	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
+	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
+	beq	2f
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+2:	mov	dst, tmp1
+	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
+	subs	count, count, zva_len
+	b.lo	4f
+3:	dc	zva, dst
+	add	dst, dst, zva_len
+	subs	count, count, zva_len
+	b.hs	3b
+4:	add	count, count, zva_len
+	sub	dst, dst, 32		/* Bias dst for tail loop.  */
+	b	L(tail64)
 #endif
 END (MEMSET)
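
Note (not part of the commit): the restored implementation dispatches on the
requested length (0..15 bytes, 16..96 bytes, a bulk path above 96 bytes, and a
DC ZVA path for zero fills of 256 bytes or more).  A quick way to exercise
those size classes from C is a guard-byte harness such as the sketch below;
the buffer sizes, guard value, and fill byte are illustrative choices, not
taken from glibc.

/* Minimal boundary check for a memset implementation: for every length up to
   MAX_LEN, verify that exactly that many bytes carry the fill value and that
   guard bytes on either side of the target region are never written.  */

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define GUARD      64		/* Guard bytes before and after the target.  */
#define MAX_LEN    1024		/* Crosses the 16/96/256 byte thresholds.  */
#define GUARD_BYTE 0xA5
#define FILL_BYTE  0x17

int
main (void)
{
  static unsigned char buf[GUARD + MAX_LEN + GUARD];

  for (size_t len = 0; len <= MAX_LEN; len++)
    {
      /* Reset the whole buffer to the guard pattern, then fill LEN bytes.  */
      memset (buf, GUARD_BYTE, sizeof (buf));
      memset (buf + GUARD, FILL_BYTE, len);

      /* The guard regions must be untouched...  */
      for (size_t i = 0; i < GUARD; i++)
	{
	  assert (buf[i] == GUARD_BYTE);
	  assert (buf[GUARD + len + i] == GUARD_BYTE);
	}
      /* ...and exactly LEN bytes must carry the fill value.  */
      for (size_t i = 0; i < len; i++)
	assert (buf[GUARD + i] == FILL_BYTE);
    }
  puts ("memset boundary check passed");
  return 0;
}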