glibc/glibc-upstream-2.34-177.patch

commit baf3ece63453adac59c5688930324a78ced5b2e4
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Sat Oct 23 01:26:47 2021 -0400

    x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
    
    This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
    
    It could potentially be dangerous to use SSE2 if this function is ever
    called without using 'vzeroupper' beforehand. While compilers appear
    to use 'vzeroupper' before function calls if AVX2 has been used, using
    SSE2 here is more brittle. Since it is not absolutely necessary it
    should be avoided.
    
    It costs 2 extra bytes but the extra bytes should only eat into
    alignment padding.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    
    (cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb)

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 2761b54f2e7dea9f..640f6757fac8a356 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -561,13 +561,13 @@ L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 
 	/* Use movups to save code size.  */
-	movups	(%rsi), %xmm2
+	vmovdqu	(%rsi), %xmm2
 	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
 	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
Sync with upstream branch release/2.34/master
Upstream commit: 55640ed3fde48360a8e8083be4843bd2dc7cecfe
- i386: Regenerate ulps
- linux: Fix missing internal 64 bit time_t stat usage
- x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
- x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
- x86-64: Use notl in EVEX strcmp [BZ #28646]
- x86: Shrink memcmp-sse4.S code size
- x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
- x86: Optimize memmove-vec-unaligned-erms.S
- x86-64: Replace movzx with movzbl
- x86-64: Remove Prefer_AVX2_STRCMP
- x86-64: Improve EVEX strcmp with masked load
- x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
- x86: Optimize memset-vec-unaligned-erms.S
- x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
- x86: Modify ENTRY in sysdep.h so that p2align can be specified
- x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
- scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
- dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
- INSTALL: Rephrase -with-default-link documentation
- misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
- Default to --with-default-link=no (bug 25812)
- scripts: Add glibcelf.py module
2022-04-28 02:27:50 +00:00
			`commit baf3ece63453adac59c5688930324a78ced5b2e4`
			`Author: Noah Goldstein <goldstein.w.n@gmail.com>`
			`Date: Sat Oct 23 01:26:47 2021 -0400`

			`x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S`

			`This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.`

			`It could potentially be dangerous to use SSE2 if this function is ever`
			`called without using 'vzeroupper' beforehand. While compilers appear`
			`to use 'vzeroupper' before function calls if AVX2 has been used, using`
			`SSE2 here is more brittle. Since it is not absolutely necessary it`
			`should be avoided.`

			`It costs 2 extra bytes but the extra bytes should only eat into`
			`alignment padding.`
			`Reviewed-by: H.J. Lu <hjl.tools@gmail.com>`

			`(cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb)`

			`diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S`
			`index 2761b54f2e7dea9f..640f6757fac8a356 100644`
			`--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S`
			`+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S`
			`@@ -561,13 +561,13 @@ L(between_16_31):`
			`/* From 16 to 31 bytes. No branch when size == 16. */`

			`/* Use movups to save code size. */`
			`- movups (%rsi), %xmm2`
			`+ vmovdqu (%rsi), %xmm2`
			`VPCMP $4, (%rdi), %xmm2, %k1`
			`kmovd %k1, %eax`
			`testl %eax, %eax`
			`jnz L(return_vec_0_lv)`
			`/* Use overlapping loads to avoid branches. */`
			`- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2`
			`+ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2`
			`VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1`
			`addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx`
			`kmovd %k1, %eax`