94eca17094
Upstream commit: ede8d94d154157d269b18f3601440ac576c1f96a - csu: Implement and use _dl_early_allocate during static startup - Linux: Introduce __brk_call for invoking the brk system call - Linux: Implement a useful version of _startup_fatal - ia64: Always define IA64_USE_NEW_STUB as a flag macro - Linux: Define MMAP_CALL_INTERNAL - i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls - i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S - elf: Remove __libc_init_secure - Linux: Consolidate auxiliary vector parsing (redo) - Linux: Include <dl-auxv.h> in dl-sysdep.c only for SHARED - Revert "Linux: Consolidate auxiliary vector parsing" - Linux: Consolidate auxiliary vector parsing - Linux: Assume that NEED_DL_SYSINFO_DSO is always defined - Linux: Remove DL_FIND_ARG_COMPONENTS - Linux: Remove HAVE_AUX_SECURE, HAVE_AUX_XID, HAVE_AUX_PAGESIZE - elf: Merge dl-sysdep.c into the Linux version - elf: Remove unused NEED_DL_BASE_ADDR and _dl_base_addr - x86: Optimize {str|wcs}rchr-evex - x86: Optimize {str|wcs}rchr-avx2 - x86: Optimize {str|wcs}rchr-sse2 - x86: Cleanup page cross code in memcmp-avx2-movbe.S - x86: Remove memcmp-sse4.S - x86: Small improvements for wcslen - x86: Remove AVX str{n}casecmp - x86: Add EVEX optimized str{n}casecmp - x86: Add AVX2 optimized str{n}casecmp - x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S - x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S - x86: Remove strspn-sse2.S and use the generic implementation - x86: Remove strpbrk-sse2.S and use the generic implementation - x86: Remove strcspn-sse2.S and use the generic implementation - x86: Optimize strspn in strspn-c.c - x86: Optimize strcspn and strpbrk in strcspn-c.c - x86: Code cleanup in strchr-evex and comment justifying branch - x86: Code cleanup in strchr-avx2 and comment justifying branch - x86_64: Remove bcopy optimizations - x86-64: Remove bzero weak alias in SS2 memset - x86_64/multiarch: Sort sysdep_routines and put one entry per line - x86: 
Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) - fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141]
140 lines
4.3 KiB
Diff
140 lines
4.3 KiB
Diff
commit 3605c744078bb048d876298aaf12a2869e8071b8
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Wed Mar 23 16:57:38 2022 -0500
|
|
|
|
x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
|
|
|
|
Slightly faster method of doing TOLOWER that saves an
|
|
instruction.
|
|
|
|
Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
|
|
CET enabled, the hard-coded padding misaligned the entry to strcasecmp.
|
|
|
|
geometric_mean(N=40) of all benchmarks New / Original: .920
|
|
|
|
All string/memory tests pass.
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
|
(cherry picked from commit d154758e618ec9324f5d339c46db0aa27e8b1226)
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
|
index 6197a723b9e0606e..a6825de8195ad8c6 100644
|
|
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
|
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
|
@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp))
|
|
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
|
|
mov %fs:(%rax),%RDX_LP
|
|
|
|
- // XXX 5 byte should be before the function
|
|
- /* 5-byte NOP. */
|
|
- .byte 0x0f,0x1f,0x44,0x00,0x00
|
|
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
|
|
+ .p2align 4
|
|
END (GLABEL(__strcasecmp))
|
|
/* FALLTHROUGH to strcasecmp_l. */
|
|
#endif
|
|
@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp))
|
|
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
|
|
mov %fs:(%rax),%RCX_LP
|
|
|
|
- // XXX 5 byte should be before the function
|
|
- /* 5-byte NOP. */
|
|
- .byte 0x0f,0x1f,0x44,0x00,0x00
|
|
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
|
|
+ .p2align 4
|
|
END (GLABEL(__strncasecmp))
|
|
/* FALLTHROUGH to strncasecmp_l. */
|
|
#endif
|
|
@@ -170,27 +168,22 @@ STRCMP_SSE42:
|
|
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
|
.section .rodata.cst16,"aM",@progbits,16
|
|
.align 16
|
|
-LABEL(belowupper):
|
|
- .quad 0x4040404040404040
|
|
- .quad 0x4040404040404040
|
|
-LABEL(topupper):
|
|
-# ifdef USE_AVX
|
|
- .quad 0x5a5a5a5a5a5a5a5a
|
|
- .quad 0x5a5a5a5a5a5a5a5a
|
|
-# else
|
|
- .quad 0x5b5b5b5b5b5b5b5b
|
|
- .quad 0x5b5b5b5b5b5b5b5b
|
|
-# endif
|
|
-LABEL(touppermask):
|
|
+LABEL(lcase_min):
|
|
+ .quad 0x3f3f3f3f3f3f3f3f
|
|
+ .quad 0x3f3f3f3f3f3f3f3f
|
|
+LABEL(lcase_max):
|
|
+ .quad 0x9999999999999999
|
|
+ .quad 0x9999999999999999
|
|
+LABEL(case_add):
|
|
.quad 0x2020202020202020
|
|
.quad 0x2020202020202020
|
|
.previous
|
|
- movdqa LABEL(belowupper)(%rip), %xmm4
|
|
-# define UCLOW_reg %xmm4
|
|
- movdqa LABEL(topupper)(%rip), %xmm5
|
|
-# define UCHIGH_reg %xmm5
|
|
- movdqa LABEL(touppermask)(%rip), %xmm6
|
|
-# define LCQWORD_reg %xmm6
|
|
+ movdqa LABEL(lcase_min)(%rip), %xmm4
|
|
+# define LCASE_MIN_reg %xmm4
|
|
+ movdqa LABEL(lcase_max)(%rip), %xmm5
|
|
+# define LCASE_MAX_reg %xmm5
|
|
+ movdqa LABEL(case_add)(%rip), %xmm6
|
|
+# define CASE_ADD_reg %xmm6
|
|
#endif
|
|
cmp $0x30, %ecx
|
|
ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
|
|
@@ -201,32 +194,26 @@ LABEL(touppermask):
|
|
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
|
# ifdef USE_AVX
|
|
# define TOLOWER(reg1, reg2) \
|
|
- vpcmpgtb UCLOW_reg, reg1, %xmm7; \
|
|
- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
|
|
- vpcmpgtb UCLOW_reg, reg2, %xmm9; \
|
|
- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
|
|
- vpandn %xmm7, %xmm8, %xmm8; \
|
|
- vpandn %xmm9, %xmm10, %xmm10; \
|
|
- vpand LCQWORD_reg, %xmm8, %xmm8; \
|
|
- vpand LCQWORD_reg, %xmm10, %xmm10; \
|
|
- vpor reg1, %xmm8, reg1; \
|
|
- vpor reg2, %xmm10, reg2
|
|
+ vpaddb LCASE_MIN_reg, reg1, %xmm7; \
|
|
+ vpaddb LCASE_MIN_reg, reg2, %xmm8; \
|
|
+ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
|
|
+ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
|
|
+ vpandn CASE_ADD_reg, %xmm7, %xmm7; \
|
|
+ vpandn CASE_ADD_reg, %xmm8, %xmm8; \
|
|
+ vpaddb %xmm7, reg1, reg1; \
|
|
+ vpaddb %xmm8, reg2, reg2
|
|
# else
|
|
# define TOLOWER(reg1, reg2) \
|
|
- movdqa reg1, %xmm7; \
|
|
- movdqa UCHIGH_reg, %xmm8; \
|
|
- movdqa reg2, %xmm9; \
|
|
- movdqa UCHIGH_reg, %xmm10; \
|
|
- pcmpgtb UCLOW_reg, %xmm7; \
|
|
- pcmpgtb reg1, %xmm8; \
|
|
- pcmpgtb UCLOW_reg, %xmm9; \
|
|
- pcmpgtb reg2, %xmm10; \
|
|
- pand %xmm8, %xmm7; \
|
|
- pand %xmm10, %xmm9; \
|
|
- pand LCQWORD_reg, %xmm7; \
|
|
- pand LCQWORD_reg, %xmm9; \
|
|
- por %xmm7, reg1; \
|
|
- por %xmm9, reg2
|
|
+ movdqa LCASE_MIN_reg, %xmm7; \
|
|
+ movdqa LCASE_MIN_reg, %xmm8; \
|
|
+ paddb reg1, %xmm7; \
|
|
+ paddb reg2, %xmm8; \
|
|
+ pcmpgtb LCASE_MAX_reg, %xmm7; \
|
|
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
|
|
+ pandn CASE_ADD_reg, %xmm7; \
|
|
+ pandn CASE_ADD_reg, %xmm8; \
|
|
+ paddb %xmm7, reg1; \
|
|
+ paddb %xmm8, reg2
|
|
# endif
|
|
TOLOWER (%xmm1, %xmm2)
|
|
#else
|