b68cfb3330
Upstream commit: b2f32e746492615a6eb3e66fac1e766e32e8deb1 - malloc: Simplify implementation of __malloc_assert - Update syscall-names.list for Linux 5.18 - x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S - x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file - x86: Move and slightly improve memset_erms - x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list - x86: Put wcs{n}len-sse4.1 in the sse4.1 text section - x86: Align entry for memrchr to 64-bytes. - x86: Add BMI1/BMI2 checks for ISA_V3 check - x86: Cleanup bounds checking in large memcpy case - x86: Add bounds `x86_non_temporal_threshold` - x86: Add sse42 implementation to strcmp's ifunc - x86: Fix misordered logic for setting `rep_movsb_stop_threshold` - x86: Align varshift table to 32-bytes - x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions - x86: Shrink code size of memchr-evex.S - x86: Shrink code size of memchr-avx2.S - x86: Optimize memrchr-avx2.S - x86: Optimize memrchr-evex.S - x86: Optimize memrchr-sse2.S - x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` - x86: Create header for VEC classes in x86 strings library - x86_64: Add strstr function with 512-bit EVEX - x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT - x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen - x86_64: Remove bzero optimization - x86_64: Remove end of line trailing spaces - nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore - linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304)
697 lines
14 KiB
Diff
697 lines
14 KiB
Diff
commit 4901009dad8b3ab141ac6e0caebe99e03a67f5eb
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Mon Jun 6 21:11:30 2022 -0700
|
|
|
|
x86: Optimize memrchr-sse2.S
|
|
|
|
The new code:
|
|
1. prioritizes smaller lengths more.
|
|
2. optimizes target placement more carefully.
|
|
3. reuses logic more.
|
|
4. fixes up various inefficiencies in the logic.
|
|
|
|
The total code size saving is: 394 bytes
|
|
Geometric Mean of all benchmarks New / Old: 0.874
|
|
|
|
Regressions:
|
|
1. The page cross case is now colder, especially re-entry from the
|
|
page cross case if a match is not found in the first VEC
|
|
(roughly 50%). My general opinion with this patch is this is
|
|
acceptable given the "coldness" of this case (less than 4%) and
|
|
general performance improvement in the other far more common
|
|
cases.
|
|
|
|
2. There are some regressions 5-15% for medium/large user-arg
|
|
lengths that have a match in the first VEC. This is because the
|
|
logic was rewritten to optimize finds in the first VEC if the
|
|
user-arg length is shorter (where we see roughly 20-50%
|
|
performance improvements). It is not always the case this is a
|
|
regression. My intuition is some frontend quirk is partially
|
|
explaining the data although I haven't been able to find the
|
|
root cause.
|
|
|
|
Full xcheck passes on x86_64.
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
|
(cherry picked from commit 731feee3869550e93177e604604c1765d81de571)
|
|
|
|
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
|
|
index cc2001167d77c83c..c2a5902bf9385c67 100644
|
|
--- a/sysdeps/x86_64/memrchr.S
|
|
+++ b/sysdeps/x86_64/memrchr.S
|
|
@@ -19,362 +19,333 @@
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#include <sysdep.h>
|
|
+#define VEC_SIZE 16
|
|
+#define PAGE_SIZE 4096
|
|
|
|
.text
|
|
-ENTRY (__memrchr)
|
|
- movd %esi, %xmm1
|
|
-
|
|
- sub $16, %RDX_LP
|
|
- jbe L(length_less16)
|
|
-
|
|
- punpcklbw %xmm1, %xmm1
|
|
- punpcklbw %xmm1, %xmm1
|
|
-
|
|
- add %RDX_LP, %RDI_LP
|
|
- pshufd $0, %xmm1, %xmm1
|
|
-
|
|
- movdqu (%rdi), %xmm0
|
|
- pcmpeqb %xmm1, %xmm0
|
|
-
|
|
-/* Check if there is a match. */
|
|
- pmovmskb %xmm0, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches0)
|
|
-
|
|
- sub $64, %rdi
|
|
- mov %edi, %ecx
|
|
- and $15, %ecx
|
|
- jz L(loop_prolog)
|
|
-
|
|
- add $16, %rdi
|
|
- add $16, %rdx
|
|
- and $-16, %rdi
|
|
- sub %rcx, %rdx
|
|
-
|
|
- .p2align 4
|
|
-L(loop_prolog):
|
|
- sub $64, %rdx
|
|
- jbe L(exit_loop)
|
|
-
|
|
- movdqa 48(%rdi), %xmm0
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- pmovmskb %xmm0, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches48)
|
|
-
|
|
- movdqa 32(%rdi), %xmm2
|
|
- pcmpeqb %xmm1, %xmm2
|
|
- pmovmskb %xmm2, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches32)
|
|
-
|
|
- movdqa 16(%rdi), %xmm3
|
|
- pcmpeqb %xmm1, %xmm3
|
|
- pmovmskb %xmm3, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches16)
|
|
-
|
|
- movdqa (%rdi), %xmm4
|
|
- pcmpeqb %xmm1, %xmm4
|
|
- pmovmskb %xmm4, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches0)
|
|
-
|
|
- sub $64, %rdi
|
|
- sub $64, %rdx
|
|
- jbe L(exit_loop)
|
|
-
|
|
- movdqa 48(%rdi), %xmm0
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- pmovmskb %xmm0, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches48)
|
|
-
|
|
- movdqa 32(%rdi), %xmm2
|
|
- pcmpeqb %xmm1, %xmm2
|
|
- pmovmskb %xmm2, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches32)
|
|
-
|
|
- movdqa 16(%rdi), %xmm3
|
|
- pcmpeqb %xmm1, %xmm3
|
|
- pmovmskb %xmm3, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches16)
|
|
-
|
|
- movdqa (%rdi), %xmm3
|
|
- pcmpeqb %xmm1, %xmm3
|
|
- pmovmskb %xmm3, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches0)
|
|
-
|
|
- mov %edi, %ecx
|
|
- and $63, %ecx
|
|
- jz L(align64_loop)
|
|
-
|
|
- add $64, %rdi
|
|
- add $64, %rdx
|
|
- and $-64, %rdi
|
|
- sub %rcx, %rdx
|
|
-
|
|
- .p2align 4
|
|
-L(align64_loop):
|
|
- sub $64, %rdi
|
|
- sub $64, %rdx
|
|
- jbe L(exit_loop)
|
|
-
|
|
- movdqa (%rdi), %xmm0
|
|
- movdqa 16(%rdi), %xmm2
|
|
- movdqa 32(%rdi), %xmm3
|
|
- movdqa 48(%rdi), %xmm4
|
|
-
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- pcmpeqb %xmm1, %xmm2
|
|
- pcmpeqb %xmm1, %xmm3
|
|
- pcmpeqb %xmm1, %xmm4
|
|
-
|
|
- pmaxub %xmm3, %xmm0
|
|
- pmaxub %xmm4, %xmm2
|
|
- pmaxub %xmm0, %xmm2
|
|
- pmovmskb %xmm2, %eax
|
|
-
|
|
- test %eax, %eax
|
|
- jz L(align64_loop)
|
|
-
|
|
- pmovmskb %xmm4, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches48)
|
|
-
|
|
- pmovmskb %xmm3, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches32)
|
|
-
|
|
- movdqa 16(%rdi), %xmm2
|
|
-
|
|
- pcmpeqb %xmm1, %xmm2
|
|
- pcmpeqb (%rdi), %xmm1
|
|
-
|
|
- pmovmskb %xmm2, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches16)
|
|
-
|
|
- pmovmskb %xmm1, %eax
|
|
- bsr %eax, %eax
|
|
-
|
|
- add %rdi, %rax
|
|
+ENTRY_P2ALIGN(__memrchr, 6)
|
|
+#ifdef __ILP32__
|
|
+ /* Clear upper bits. */
|
|
+ mov %RDX_LP, %RDX_LP
|
|
+#endif
|
|
+ movd %esi, %xmm0
|
|
+
|
|
+ /* Get end pointer. */
|
|
+ leaq (%rdx, %rdi), %rcx
|
|
+
|
|
+ punpcklbw %xmm0, %xmm0
|
|
+ punpcklwd %xmm0, %xmm0
|
|
+ pshufd $0, %xmm0, %xmm0
|
|
+
|
|
+ /* Check if we can load 1x VEC without crossing a page. */
|
|
+ testl $(PAGE_SIZE - VEC_SIZE), %ecx
|
|
+ jz L(page_cross)
|
|
+
|
|
+ /* NB: This load happens regardless of whether rdx (len) is zero. Since
|
|
+ it doesn't cross a page and the standard guarantees any pointer has
|
|
+ at least one-valid byte this load must be safe. For the entire
|
|
+ history of the x86 memrchr implementation this has been possible so
|
|
+ no code "should" be relying on a zero-length check before this load.
|
|
+ The zero-length check is moved to the page cross case because it is
|
|
+ 1) pretty cold and including it pushes the hot case len <= VEC_SIZE
|
|
+ into 2-cache lines. */
|
|
+ movups -(VEC_SIZE)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+
|
|
+ subq $VEC_SIZE, %rdx
|
|
+ ja L(more_1x_vec)
|
|
+L(ret_vec_x0_test):
|
|
+ /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
|
|
+ zero. */
|
|
+ bsrl %eax, %eax
|
|
+ jz L(ret_0)
|
|
+ /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
|
|
+ if out of bounds. */
|
|
+ addl %edx, %eax
|
|
+ jl L(zero_0)
|
|
+ /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
|
|
+ ptr. */
|
|
+ addq %rdi, %rax
|
|
+L(ret_0):
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(exit_loop):
|
|
- add $64, %edx
|
|
- cmp $32, %edx
|
|
- jbe L(exit_loop_32)
|
|
-
|
|
- movdqa 48(%rdi), %xmm0
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- pmovmskb %xmm0, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches48)
|
|
-
|
|
- movdqa 32(%rdi), %xmm2
|
|
- pcmpeqb %xmm1, %xmm2
|
|
- pmovmskb %xmm2, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches32)
|
|
-
|
|
- movdqa 16(%rdi), %xmm3
|
|
- pcmpeqb %xmm1, %xmm3
|
|
- pmovmskb %xmm3, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches16_1)
|
|
- cmp $48, %edx
|
|
- jbe L(return_null)
|
|
-
|
|
- pcmpeqb (%rdi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches0_1)
|
|
- xor %eax, %eax
|
|
+ .p2align 4,, 5
|
|
+L(ret_vec_x0):
|
|
+ bsrl %eax, %eax
|
|
+ leaq -(VEC_SIZE)(%rcx, %rax), %rax
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(exit_loop_32):
|
|
- movdqa 48(%rdi), %xmm0
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- pmovmskb %xmm0, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches48_1)
|
|
- cmp $16, %edx
|
|
- jbe L(return_null)
|
|
-
|
|
- pcmpeqb 32(%rdi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- test %eax, %eax
|
|
- jnz L(matches32_1)
|
|
- xor %eax, %eax
|
|
+ .p2align 4,, 2
|
|
+L(zero_0):
|
|
+ xorl %eax, %eax
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(matches0):
|
|
- bsr %eax, %eax
|
|
- add %rdi, %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(matches16):
|
|
- bsr %eax, %eax
|
|
- lea 16(%rax, %rdi), %rax
|
|
- ret
|
|
|
|
- .p2align 4
|
|
-L(matches32):
|
|
- bsr %eax, %eax
|
|
- lea 32(%rax, %rdi), %rax
|
|
+ .p2align 4,, 8
|
|
+L(more_1x_vec):
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_x0)
|
|
+
|
|
+ /* Align rcx (pointer to string). */
|
|
+ decq %rcx
|
|
+ andq $-VEC_SIZE, %rcx
|
|
+
|
|
+ movq %rcx, %rdx
|
|
+ /* NB: We could consistently save 1-byte in this pattern with `movaps
|
|
+ %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
|
|
+ it adds more frontend uops (even if the moves can be eliminated) and
|
|
+ some percentage of the time actual backend uops. */
|
|
+ movaps -(VEC_SIZE)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ subq %rdi, %rdx
|
|
+ pmovmskb %xmm1, %eax
|
|
+
|
|
+ cmpq $(VEC_SIZE * 2), %rdx
|
|
+ ja L(more_2x_vec)
|
|
+L(last_2x_vec):
|
|
+ subl $VEC_SIZE, %edx
|
|
+ jbe L(ret_vec_x0_test)
|
|
+
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_x0)
|
|
+
|
|
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+
|
|
+ subl $VEC_SIZE, %edx
|
|
+ bsrl %eax, %eax
|
|
+ jz L(ret_1)
|
|
+ addl %edx, %eax
|
|
+ jl L(zero_0)
|
|
+ addq %rdi, %rax
|
|
+L(ret_1):
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(matches48):
|
|
- bsr %eax, %eax
|
|
- lea 48(%rax, %rdi), %rax
|
|
+ /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
|
|
+ causes the hot path (length <= VEC_SIZE) to span multiple cache
|
|
+ lines. Naturally aligned % 16 to 8-bytes. */
|
|
+L(page_cross):
|
|
+ /* Zero length check. */
|
|
+ testq %rdx, %rdx
|
|
+ jz L(zero_0)
|
|
+
|
|
+ leaq -1(%rcx), %r8
|
|
+ andq $-(VEC_SIZE), %r8
|
|
+
|
|
+ movaps (%r8), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %esi
|
|
+ /* Shift out negative alignment (because we are starting from endptr and
|
|
+ working backwards). */
|
|
+ negl %ecx
|
|
+ /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
|
|
+ explicitly. */
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+ shl %cl, %esi
|
|
+ movzwl %si, %eax
|
|
+ leaq (%rdi, %rdx), %rcx
|
|
+ cmpq %rdi, %r8
|
|
+ ja L(more_1x_vec)
|
|
+ subl $VEC_SIZE, %edx
|
|
+ bsrl %eax, %eax
|
|
+ jz L(ret_2)
|
|
+ addl %edx, %eax
|
|
+ jl L(zero_1)
|
|
+ addq %rdi, %rax
|
|
+L(ret_2):
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(matches0_1):
|
|
- bsr %eax, %eax
|
|
- sub $64, %rdx
|
|
- add %rax, %rdx
|
|
- jl L(return_null)
|
|
- add %rdi, %rax
|
|
+ /* Fits in aligning bytes. */
|
|
+L(zero_1):
|
|
+ xorl %eax, %eax
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(matches16_1):
|
|
- bsr %eax, %eax
|
|
- sub $48, %rdx
|
|
- add %rax, %rdx
|
|
- jl L(return_null)
|
|
- lea 16(%rdi, %rax), %rax
|
|
+ .p2align 4,, 5
|
|
+L(ret_vec_x1):
|
|
+ bsrl %eax, %eax
|
|
+ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(matches32_1):
|
|
- bsr %eax, %eax
|
|
- sub $32, %rdx
|
|
- add %rax, %rdx
|
|
- jl L(return_null)
|
|
- lea 32(%rdi, %rax), %rax
|
|
- ret
|
|
+ .p2align 4,, 8
|
|
+L(more_2x_vec):
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_x0)
|
|
|
|
- .p2align 4
|
|
-L(matches48_1):
|
|
- bsr %eax, %eax
|
|
- sub $16, %rdx
|
|
- add %rax, %rdx
|
|
- jl L(return_null)
|
|
- lea 48(%rdi, %rax), %rax
|
|
- ret
|
|
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_x1)
|
|
|
|
- .p2align 4
|
|
-L(return_null):
|
|
- xor %eax, %eax
|
|
- ret
|
|
|
|
- .p2align 4
|
|
-L(length_less16_offset0):
|
|
- test %edx, %edx
|
|
- jz L(return_null)
|
|
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
|
|
- mov %dl, %cl
|
|
- pcmpeqb (%rdi), %xmm1
|
|
+ subq $(VEC_SIZE * 4), %rdx
|
|
+ ja L(more_4x_vec)
|
|
|
|
- mov $1, %edx
|
|
- sal %cl, %edx
|
|
- sub $1, %edx
|
|
+ addl $(VEC_SIZE), %edx
|
|
+ jle L(ret_vec_x2_test)
|
|
|
|
- pmovmskb %xmm1, %eax
|
|
+L(last_vec):
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_x2)
|
|
|
|
- and %edx, %eax
|
|
- test %eax, %eax
|
|
- jz L(return_null)
|
|
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
|
|
- bsr %eax, %eax
|
|
- add %rdi, %rax
|
|
+ subl $(VEC_SIZE), %edx
|
|
+ bsrl %eax, %eax
|
|
+ jz L(ret_3)
|
|
+ addl %edx, %eax
|
|
+ jl L(zero_2)
|
|
+ addq %rdi, %rax
|
|
+L(ret_3):
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(length_less16):
|
|
- punpcklbw %xmm1, %xmm1
|
|
- punpcklbw %xmm1, %xmm1
|
|
-
|
|
- add $16, %edx
|
|
-
|
|
- pshufd $0, %xmm1, %xmm1
|
|
-
|
|
- mov %edi, %ecx
|
|
- and $15, %ecx
|
|
- jz L(length_less16_offset0)
|
|
-
|
|
- mov %cl, %dh
|
|
- mov %ecx, %esi
|
|
- add %dl, %dh
|
|
- and $-16, %rdi
|
|
-
|
|
- sub $16, %dh
|
|
- ja L(length_less16_part2)
|
|
-
|
|
- pcmpeqb (%rdi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
-
|
|
- sar %cl, %eax
|
|
- mov %dl, %cl
|
|
-
|
|
- mov $1, %edx
|
|
- sal %cl, %edx
|
|
- sub $1, %edx
|
|
-
|
|
- and %edx, %eax
|
|
- test %eax, %eax
|
|
- jz L(return_null)
|
|
-
|
|
- bsr %eax, %eax
|
|
- add %rdi, %rax
|
|
- add %rsi, %rax
|
|
+ .p2align 4,, 6
|
|
+L(ret_vec_x2_test):
|
|
+ bsrl %eax, %eax
|
|
+ jz L(zero_2)
|
|
+ addl %edx, %eax
|
|
+ jl L(zero_2)
|
|
+ addq %rdi, %rax
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(length_less16_part2):
|
|
- movdqa 16(%rdi), %xmm2
|
|
- pcmpeqb %xmm1, %xmm2
|
|
- pmovmskb %xmm2, %eax
|
|
-
|
|
- mov %dh, %cl
|
|
- mov $1, %edx
|
|
- sal %cl, %edx
|
|
- sub $1, %edx
|
|
-
|
|
- and %edx, %eax
|
|
+L(zero_2):
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
|
|
- test %eax, %eax
|
|
- jnz L(length_less16_part2_return)
|
|
|
|
- pcmpeqb (%rdi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
+ .p2align 4,, 5
|
|
+L(ret_vec_x2):
|
|
+ bsrl %eax, %eax
|
|
+ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
|
|
+ ret
|
|
|
|
- mov %esi, %ecx
|
|
- sar %cl, %eax
|
|
- test %eax, %eax
|
|
- jz L(return_null)
|
|
+ .p2align 4,, 5
|
|
+L(ret_vec_x3):
|
|
+ bsrl %eax, %eax
|
|
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
|
|
+ ret
|
|
|
|
- bsr %eax, %eax
|
|
- add %rdi, %rax
|
|
- add %rsi, %rax
|
|
+ .p2align 4,, 8
|
|
+L(more_4x_vec):
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_x2)
|
|
+
|
|
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_x3)
|
|
+
|
|
+ addq $-(VEC_SIZE * 4), %rcx
|
|
+ cmpq $(VEC_SIZE * 4), %rdx
|
|
+ jbe L(last_4x_vec)
|
|
+
|
|
+ /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
|
|
+ keeping the code from spilling to the next cache line. */
|
|
+ addq $(VEC_SIZE * 4 - 1), %rcx
|
|
+ andq $-(VEC_SIZE * 4), %rcx
|
|
+ leaq (VEC_SIZE * 4)(%rdi), %rdx
|
|
+ andq $-(VEC_SIZE * 4), %rdx
|
|
+
|
|
+ .p2align 4,, 11
|
|
+L(loop_4x_vec):
|
|
+ movaps (VEC_SIZE * -1)(%rcx), %xmm1
|
|
+ movaps (VEC_SIZE * -2)(%rcx), %xmm2
|
|
+ movaps (VEC_SIZE * -3)(%rcx), %xmm3
|
|
+ movaps (VEC_SIZE * -4)(%rcx), %xmm4
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pcmpeqb %xmm0, %xmm2
|
|
+ pcmpeqb %xmm0, %xmm3
|
|
+ pcmpeqb %xmm0, %xmm4
|
|
+
|
|
+ por %xmm1, %xmm2
|
|
+ por %xmm3, %xmm4
|
|
+ por %xmm2, %xmm4
|
|
+
|
|
+ pmovmskb %xmm4, %esi
|
|
+ testl %esi, %esi
|
|
+ jnz L(loop_end)
|
|
+
|
|
+ addq $-(VEC_SIZE * 4), %rcx
|
|
+ cmpq %rdx, %rcx
|
|
+ jne L(loop_4x_vec)
|
|
+
|
|
+ subl %edi, %edx
|
|
+
|
|
+ /* Ends up being 1-byte nop. */
|
|
+ .p2align 4,, 2
|
|
+L(last_4x_vec):
|
|
+ movaps -(VEC_SIZE)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+
|
|
+ cmpl $(VEC_SIZE * 2), %edx
|
|
+ jbe L(last_2x_vec)
|
|
+
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_x0)
|
|
+
|
|
+
|
|
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_end)
|
|
+
|
|
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
|
|
+ pcmpeqb %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+
|
|
+ subl $(VEC_SIZE * 3), %edx
|
|
+ ja L(last_vec)
|
|
+ bsrl %eax, %eax
|
|
+ jz L(ret_4)
|
|
+ addl %edx, %eax
|
|
+ jl L(zero_3)
|
|
+ addq %rdi, %rax
|
|
+L(ret_4):
|
|
ret
|
|
|
|
- .p2align 4
|
|
-L(length_less16_part2_return):
|
|
- bsr %eax, %eax
|
|
- lea 16(%rax, %rdi), %rax
|
|
+ /* Ends up being 1-byte nop. */
|
|
+ .p2align 4,, 3
|
|
+L(loop_end):
|
|
+ pmovmskb %xmm1, %eax
|
|
+ sall $16, %eax
|
|
+ jnz L(ret_vec_end)
|
|
+
|
|
+ pmovmskb %xmm2, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(ret_vec_end)
|
|
+
|
|
+ pmovmskb %xmm3, %eax
|
|
+ /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
|
|
+ then it won't affect the result in esi (VEC4). If eax is non-zero
|
|
+ then CHAR in VEC3 and bsrl will use that position. */
|
|
+ sall $16, %eax
|
|
+ orl %esi, %eax
|
|
+ bsrl %eax, %eax
|
|
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
|
|
ret
|
|
|
|
-END (__memrchr)
|
|
+L(ret_vec_end):
|
|
+ bsrl %eax, %eax
|
|
+ leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
|
|
+ ret
|
|
+ /* Used in L(last_4x_vec). In the same cache line. These are just spare
|
|
+ aligning bytes. */
|
|
+L(zero_3):
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
+ /* 2-bytes from next cache line. */
|
|
+END(__memrchr)
|
|
weak_alias (__memrchr, memrchr)
|