Sync with upstream branch release/2.34/master

Upstream commit: 55640ed3fde48360a8e8083be4843bd2dc7cecfe

- i386: Regenerate ulps
- linux: Fix missing internal 64 bit time_t stat usage
- x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
- x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
- x86-64: Use notl in EVEX strcmp [BZ #28646]
- x86: Shrink memcmp-sse4.S code size
- x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
- x86: Optimize memmove-vec-unaligned-erms.S
- x86-64: Replace movzx with movzbl
- x86-64: Remove Prefer_AVX2_STRCMP
- x86-64: Improve EVEX strcmp with masked load
- x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
- x86: Optimize memset-vec-unaligned-erms.S
- x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
- x86: Modify ENTRY in sysdep.h so that p2align can be specified
- x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
- scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
- dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
- INSTALL: Rephrase -with-default-link documentation
- misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
- Default to --with-default-link=no (bug 25812)
- scripts: Add glibcelf.py module
This commit is contained in:
Carlos O'Donell 2022-04-27 22:27:50 -04:00
parent a8db42ba53
commit 4e3257320c
23 changed files with 8751 additions and 4 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,407 @@
commit f0c71b34f96c816292c49122d50da3a511b67bf2
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Apr 11 11:30:31 2022 +0200
Default to --with-default-link=no (bug 25812)
This is necessary to place the libio vtables into the RELRO segment.
New tests elf/tst-relro-ldso and elf/tst-relro-libc are added to
verify that this is what actually happens.
The new tests fail on ia64 due to lack of (default) RELRO support
inbutils, so they are XFAILed there.
(cherry picked from commit 198abcbb94618730dae1b3f4393efaa49e0ec8c7)
diff --git a/INSTALL b/INSTALL
index d8d4e9f155f56616..60d01568d77645c7 100644
--- a/INSTALL
+++ b/INSTALL
@@ -90,6 +90,12 @@ if 'CFLAGS' is specified it must enable optimization. For example:
library will still be usable, but functionality may be lost--for
example, you can't build a shared libc with old binutils.
+'--with-default-link=FLAG'
+ With '--with-default-link=yes', the build system does not use a
+ custom linker script for linking shared objects. The default for
+ FLAG is the opposite, 'no', because the custom linker script is
+ needed for full RELRO protection.
+
'--with-nonshared-cflags=CFLAGS'
Use additional compiler flags CFLAGS to build the parts of the
library which are always statically linked into applications and
diff --git a/configure b/configure
index 03f4e59e754b5463..34c64f8de44e3086 100755
--- a/configure
+++ b/configure
@@ -3373,7 +3373,7 @@ fi
if test "${with_default_link+set}" = set; then :
withval=$with_default_link; use_default_link=$withval
else
- use_default_link=default
+ use_default_link=no
fi
@@ -6085,69 +6085,6 @@ fi
$as_echo "$libc_cv_hashstyle" >&6; }
-# The linker's default -shared behavior is good enough if it
-# does these things that our custom linker scripts ensure that
-# all allocated NOTE sections come first.
-if test "$use_default_link" = default; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sufficient default -shared layout" >&5
-$as_echo_n "checking for sufficient default -shared layout... " >&6; }
-if ${libc_cv_use_default_link+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- libc_cv_use_default_link=no
- cat > conftest.s <<\EOF
- .section .note.a,"a",%note
- .balign 4
- .long 4,4,9
- .string "GNU"
- .string "foo"
- .section .note.b,"a",%note
- .balign 4
- .long 4,4,9
- .string "GNU"
- .string "bar"
-EOF
- if { ac_try=' ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&5'
- { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
- (eval $ac_try) 2>&5
- ac_status=$?
- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
- test $ac_status = 0; }; } &&
- ac_try=`$READELF -S conftest.so | sed -n \
- '${x;p;}
- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/
- t a
- b
- : a
- H'`
- then
- libc_seen_a=no libc_seen_b=no
- set -- $ac_try
- while test $# -ge 2 -a "$1" = NOTE; do
- case "$2" in
- .note.a) libc_seen_a=yes ;;
- .note.b) libc_seen_b=yes ;;
- esac
- shift 2
- done
- case "$libc_seen_a$libc_seen_b" in
- yesyes)
- libc_cv_use_default_link=yes
- ;;
- *)
- echo >&5 "\
-$libc_seen_a$libc_seen_b from:
-$ac_try"
- ;;
- esac
- fi
- rm -f conftest*
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_use_default_link" >&5
-$as_echo "$libc_cv_use_default_link" >&6; }
- use_default_link=$libc_cv_use_default_link
-fi
-
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for GLOB_DAT reloc" >&5
$as_echo_n "checking for GLOB_DAT reloc... " >&6; }
if ${libc_cv_has_glob_dat+:} false; then :
diff --git a/configure.ac b/configure.ac
index eb9431875fae1b0e..2c69af0807266e7e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -153,7 +153,7 @@ AC_ARG_WITH([default-link],
AS_HELP_STRING([--with-default-link],
[do not use explicit linker scripts]),
[use_default_link=$withval],
- [use_default_link=default])
+ [use_default_link=no])
dnl Additional build flags injection.
AC_ARG_WITH([nonshared-cflags],
@@ -1378,59 +1378,6 @@ fi
rm -f conftest*])
AC_SUBST(libc_cv_hashstyle)
-# The linker's default -shared behavior is good enough if it
-# does these things that our custom linker scripts ensure that
-# all allocated NOTE sections come first.
-if test "$use_default_link" = default; then
- AC_CACHE_CHECK([for sufficient default -shared layout],
- libc_cv_use_default_link, [dnl
- libc_cv_use_default_link=no
- cat > conftest.s <<\EOF
- .section .note.a,"a",%note
- .balign 4
- .long 4,4,9
- .string "GNU"
- .string "foo"
- .section .note.b,"a",%note
- .balign 4
- .long 4,4,9
- .string "GNU"
- .string "bar"
-EOF
- if AC_TRY_COMMAND([dnl
- ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD]) &&
- ac_try=`$READELF -S conftest.so | sed -n \
- ['${x;p;}
- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/
- t a
- b
- : a
- H']`
- then
- libc_seen_a=no libc_seen_b=no
- set -- $ac_try
- while test $# -ge 2 -a "$1" = NOTE; do
- case "$2" in
- .note.a) libc_seen_a=yes ;;
- .note.b) libc_seen_b=yes ;;
- esac
- shift 2
- done
- case "$libc_seen_a$libc_seen_b" in
- yesyes)
- libc_cv_use_default_link=yes
- ;;
- *)
- echo >&AS_MESSAGE_LOG_FD "\
-$libc_seen_a$libc_seen_b from:
-$ac_try"
- ;;
- esac
- fi
- rm -f conftest*])
- use_default_link=$libc_cv_use_default_link
-fi
-
AC_CACHE_CHECK(for GLOB_DAT reloc,
libc_cv_has_glob_dat, [dnl
cat > conftest.c <<EOF
diff --git a/elf/Makefile b/elf/Makefile
index 8afbe3f6ab259331..fec6e23b5b625e3b 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -504,6 +504,40 @@ tests-execstack-yes = \
# tests-execstack-yes
endif
endif
+
+tests-special += $(objpfx)tst-relro-ldso.out $(objpfx)tst-relro-libc.out
+$(objpfx)tst-relro-ldso.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
+ $(objpfx)ld.so
+ $(PYTHON) tst-relro-symbols.py $(objpfx)ld.so \
+ --required=_rtld_global_ro \
+ > $@ 2>&1; $(evaluate-test)
+# The optional symbols are present in libc only if the architecture has
+# the GLIBC_2.0 symbol set in libc.
+$(objpfx)tst-relro-libc.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
+ $(common-objpfx)libc.so
+ $(PYTHON) tst-relro-symbols.py $(common-objpfx)libc.so \
+ --required=_IO_cookie_jumps \
+ --required=_IO_file_jumps \
+ --required=_IO_file_jumps_maybe_mmap \
+ --required=_IO_file_jumps_mmap \
+ --required=_IO_helper_jumps \
+ --required=_IO_mem_jumps \
+ --required=_IO_obstack_jumps \
+ --required=_IO_proc_jumps \
+ --required=_IO_str_chk_jumps \
+ --required=_IO_str_jumps \
+ --required=_IO_strn_jumps \
+ --required=_IO_wfile_jumps \
+ --required=_IO_wfile_jumps_maybe_mmap \
+ --required=_IO_wfile_jumps_mmap \
+ --required=_IO_wmem_jumps \
+ --required=_IO_wstr_jumps \
+ --required=_IO_wstrn_jumps \
+ --optional=_IO_old_cookie_jumps \
+ --optional=_IO_old_file_jumps \
+ --optional=_IO_old_proc_jumps \
+ > $@ 2>&1; $(evaluate-test)
+
tests += $(tests-execstack-$(have-z-execstack))
ifeq ($(run-built-tests),yes)
tests-special += \
diff --git a/elf/tst-relro-symbols.py b/elf/tst-relro-symbols.py
new file mode 100644
index 0000000000000000..368ea3349f86bd81
--- /dev/null
+++ b/elf/tst-relro-symbols.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python3
+# Verify that certain symbols are covered by RELRO.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+"""Analyze a (shared) object to verify that certain symbols are
+present and covered by the PT_GNU_RELRO segment.
+
+"""
+
+import argparse
+import os.path
+import sys
+
+# Make available glibc Python modules.
+sys.path.append(os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), os.path.pardir, 'scripts'))
+
+import glibcelf
+
+def find_relro(path: str, img: glibcelf.Image) -> (int, int):
+ """Discover the address range of the PT_GNU_RELRO segment."""
+ for phdr in img.phdrs():
+ if phdr.p_type == glibcelf.Pt.PT_GNU_RELRO:
+ # The computation is not entirely accurate because
+ # _dl_protect_relro in elf/dl-reloc.c rounds both the
+ # start end and downwards using the run-time page size.
+ return phdr.p_vaddr, phdr.p_vaddr + phdr.p_memsz
+ sys.stdout.write('{}: error: no PT_GNU_RELRO segment\n'.format(path))
+ sys.exit(1)
+
+def check_in_relro(kind, relro_begin, relro_end, name, start, size, error):
+ """Check if a section or symbol falls within in the RELRO segment."""
+ end = start + size - 1
+ if not (relro_begin <= start < end < relro_end):
+ error(
+ '{} {!r} of size {} at 0x{:x} is not in RELRO range [0x{:x}, 0x{:x})'.format(
+ kind, name.decode('UTF-8'), start, size,
+ relro_begin, relro_end))
+
+def get_parser():
+ """Return an argument parser for this script."""
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument('object', help='path to object file to check')
+ parser.add_argument('--required', metavar='NAME', default=(),
+ help='required symbol names', nargs='*')
+ parser.add_argument('--optional', metavar='NAME', default=(),
+ help='required symbol names', nargs='*')
+ return parser
+
+def main(argv):
+ """The main entry point."""
+ parser = get_parser()
+ opts = parser.parse_args(argv)
+ img = glibcelf.Image.readfile(opts.object)
+
+ required_symbols = frozenset([sym.encode('UTF-8')
+ for sym in opts.required])
+ optional_symbols = frozenset([sym.encode('UTF-8')
+ for sym in opts.optional])
+ check_symbols = required_symbols | optional_symbols
+
+ # Tracks the symbols in check_symbols that have been found.
+ symbols_found = set()
+
+ # Discover the extent of the RELRO segment.
+ relro_begin, relro_end = find_relro(opts.object, img)
+ symbol_table_found = False
+
+ errors = False
+ def error(msg: str) -> None:
+ """Record an error condition and write a message to standard output."""
+ nonlocal errors
+ errors = True
+ sys.stdout.write('{}: error: {}\n'.format(opts.object, msg))
+
+ # Iterate over section headers to find the symbol table.
+ for shdr in img.shdrs():
+ if shdr.sh_type == glibcelf.Sht.SHT_SYMTAB:
+ symbol_table_found = True
+ for sym in img.syms(shdr):
+ if sym.st_name in check_symbols:
+ symbols_found.add(sym.st_name)
+
+ # Validate symbol type, section, and size.
+ if sym.st_info.type != glibcelf.Stt.STT_OBJECT:
+ error('symbol {!r} has wrong type {}'.format(
+ sym.st_name.decode('UTF-8'), sym.st_info.type))
+ if sym.st_shndx in glibcelf.Shn:
+ error('symbol {!r} has reserved section {}'.format(
+ sym.st_name.decode('UTF-8'), sym.st_shndx))
+ continue
+ if sym.st_size == 0:
+ error('symbol {!r} has size zero'.format(
+ sym.st_name.decode('UTF-8')))
+ continue
+
+ check_in_relro('symbol', relro_begin, relro_end,
+ sym.st_name, sym.st_value, sym.st_size,
+ error)
+ continue # SHT_SYMTAB
+ if shdr.sh_name == b'.data.rel.ro' \
+ or shdr.sh_name.startswith(b'.data.rel.ro.'):
+ check_in_relro('section', relro_begin, relro_end,
+ shdr.sh_name, shdr.sh_addr, shdr.sh_size,
+ error)
+ continue
+
+ if required_symbols - symbols_found:
+ for sym in sorted(required_symbols - symbols_found):
+ error('symbol {!r} not found'.format(sym.decode('UTF-8')))
+
+ if errors:
+ sys.exit(1)
+
+ if not symbol_table_found:
+ sys.stdout.write(
+ '{}: warning: no symbol table found (stripped object)\n'.format(
+ opts.object))
+ sys.exit(77)
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff --git a/manual/install.texi b/manual/install.texi
index 816b77a0a25a88a7..36a5af62bc5722b0 100644
--- a/manual/install.texi
+++ b/manual/install.texi
@@ -117,6 +117,12 @@ problem and suppress these constructs, so that the library will still be
usable, but functionality may be lost---for example, you can't build a
shared libc with old binutils.
+@item --with-default-link=@var{FLAG}
+With @code{--with-default-link=yes}, the build system does not use a
+custom linker script for linking shared objects. The default for
+@var{FLAG} is the opposite, @samp{no}, because the custom linker script
+is needed for full RELRO protection.
+
@item --with-nonshared-cflags=@var{cflags}
Use additional compiler flags @var{cflags} to build the parts of the
library which are always statically linked into applications and
diff --git a/sysdeps/unix/sysv/linux/ia64/Makefile b/sysdeps/unix/sysv/linux/ia64/Makefile
index da85ba43e2d0ddef..c5cc41b3677d4a2a 100644
--- a/sysdeps/unix/sysv/linux/ia64/Makefile
+++ b/sysdeps/unix/sysv/linux/ia64/Makefile
@@ -1,3 +1,9 @@
+ifeq ($(subdir),elf)
+# ia64 does not support PT_GNU_RELRO.
+test-xfail-tst-relro-ldso = yes
+test-xfail-tst-relro-libc = yes
+endif
+
ifeq ($(subdir),misc)
sysdep_headers += sys/rse.h
endif

View File

@ -0,0 +1,87 @@
commit ca0faa140ff8cebe4c041d935f0f5eb480873d99
Author: Joan Bruguera <joanbrugueram@gmail.com>
Date: Mon Apr 11 19:49:56 2022 +0200
misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
If `__glibc_objsize (__o) == (size_t) -1` (i.e. `__o` is unknown size), fortify
checks should pass, and `__whatever_alias` should be called.
Previously, `__glibc_objsize (__o) == (size_t) -1` was explicitly checked, but
on commit a643f60c53876b, this was moved into `__glibc_safe_or_unknown_len`.
A comment says the -1 case should work as: "The -1 check is redundant because
since it implies that __glibc_safe_len_cond is true.". But this fails when:
* `__s > 1`
* `__osz == -1` (i.e. unknown size at compile time)
* `__l` is big enough
* `__l * __s <= __osz` can be folded to a constant
(I only found this to be true for `mbsrtowcs` and other functions in wchar2.h)
In this case `__l * __s <= __osz` is false, and `__whatever_chk_warn` will be
called by `__glibc_fortify` or `__glibc_fortify_n` and crash the program.
This commit adds the explicit `__osz == -1` check again.
moc crashes on startup due to this, see: https://bugs.archlinux.org/task/74041
Minimal test case (test.c):
#include <wchar.h>
int main (void)
{
const char *hw = "HelloWorld";
mbsrtowcs (NULL, &hw, (size_t)-1, NULL);
return 0;
}
Build with:
gcc -O2 -Wp,-D_FORTIFY_SOURCE=2 test.c -o test && ./test
Output:
*** buffer overflow detected ***: terminated
Fixes: BZ #29030
Signed-off-by: Joan Bruguera <joanbrugueram@gmail.com>
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
(cherry picked from commit 33e03f9cd2be4f2cd62f93fda539cc07d9c8130e)
diff --git a/debug/tst-fortify.c b/debug/tst-fortify.c
index 8b5902423cf0ad88..fb02452f5993c594 100644
--- a/debug/tst-fortify.c
+++ b/debug/tst-fortify.c
@@ -1505,6 +1505,11 @@ do_test (void)
CHK_FAIL_END
#endif
+ /* Bug 29030 regresion check */
+ cp = "HelloWorld";
+ if (mbsrtowcs (NULL, &cp, (size_t)-1, &s) != 10)
+ FAIL ();
+
cp = "A";
if (mbstowcs (wenough, cp, 10) != 1
|| wcscmp (wenough, L"A") != 0)
diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
index 515fb681a0547217..b36013b9a6b4d9c3 100644
--- a/misc/sys/cdefs.h
+++ b/misc/sys/cdefs.h
@@ -161,13 +161,13 @@
|| (__builtin_constant_p (__l) && (__l) > 0))
/* Length is known to be safe at compile time if the __L * __S <= __OBJSZ
- condition can be folded to a constant and if it is true. The -1 check is
- redundant because since it implies that __glibc_safe_len_cond is true. */
+ condition can be folded to a constant and if it is true, or unknown (-1) */
#define __glibc_safe_or_unknown_len(__l, __s, __osz) \
- (__glibc_unsigned_or_positive (__l) \
- && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
- __s, __osz)) \
- && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), __s, __osz))
+ ((__osz) == (__SIZE_TYPE__) -1 \
+ || (__glibc_unsigned_or_positive (__l) \
+ && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
+ (__s), (__osz))) \
+ && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), (__s), (__osz))))
/* Conversely, we know at compile time that the length is unsafe if the
__L * __S <= __OBJSZ condition can be folded to a constant and if it is

View File

@ -0,0 +1,49 @@
commit 0d477e92c49db2906b32e44135b98746ccc73c7b
Author: Florian Weimer <fweimer@redhat.com>
Date: Tue Apr 26 14:22:10 2022 +0200
INSTALL: Rephrase -with-default-link documentation
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit c935789bdf40ba22b5698da869d3a4789797e09f)
diff --git a/INSTALL b/INSTALL
index 60d01568d77645c7..10a3dcdc0a8db665 100644
--- a/INSTALL
+++ b/INSTALL
@@ -90,10 +90,10 @@ if 'CFLAGS' is specified it must enable optimization. For example:
library will still be usable, but functionality may be lost--for
example, you can't build a shared libc with old binutils.
-'--with-default-link=FLAG'
- With '--with-default-link=yes', the build system does not use a
- custom linker script for linking shared objects. The default for
- FLAG is the opposite, 'no', because the custom linker script is
+'--with-default-link'
+ With '--with-default-link', the build system does not use a custom
+ linker script for linking shared objects. The default is
+ '--without-default-link', because the custom linker script is
needed for full RELRO protection.
'--with-nonshared-cflags=CFLAGS'
diff --git a/manual/install.texi b/manual/install.texi
index 36a5af62bc5722b0..8e34ff7e1847f3ae 100644
--- a/manual/install.texi
+++ b/manual/install.texi
@@ -117,11 +117,11 @@ problem and suppress these constructs, so that the library will still be
usable, but functionality may be lost---for example, you can't build a
shared libc with old binutils.
-@item --with-default-link=@var{FLAG}
-With @code{--with-default-link=yes}, the build system does not use a
-custom linker script for linking shared objects. The default for
-@var{FLAG} is the opposite, @samp{no}, because the custom linker script
-is needed for full RELRO protection.
+@item --with-default-link
+With @code{--with-default-link}, the build system does not use a custom
+linker script for linking shared objects. The default is
+@code{--without-default-link}, because the custom linker script is
+needed for full RELRO protection.
@item --with-nonshared-cflags=@var{cflags}
Use additional compiler flags @var{cflags} to build the parts of the

View File

@ -0,0 +1,377 @@
commit bc56ab1f4aa937665034373d3e320d0779a839aa
Author: Florian Weimer <fweimer@redhat.com>
Date: Tue Apr 26 14:23:02 2022 +0200
dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
When audit modules are loaded, ld.so initialization is not yet
complete, and rtld_active () returns false even though ld.so is
mostly working. Instead, the static dlopen hook is used, but that
does not work at all because this is not a static dlopen situation.
Commit 466c1ea15f461edb8e3ffaf5d86d708876343bbf ("dlfcn: Rework
static dlopen hooks") moved the hook pointer into _rtld_global_ro,
which means that separate protection is not needed anymore and the
hook pointer can be checked directly.
The guard for disabling libio vtable hardening in _IO_vtable_check
should stay for now.
Fixes commit 8e1472d2c1e25e6eabc2059170731365f6d5b3d1 ("ld.so:
Examine GLRO to detect inactive loader [BZ #20204]").
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 8dcb6d0af07fda3607b541857e4f3970a74ed55b)
diff --git a/dlfcn/dladdr.c b/dlfcn/dladdr.c
index 1cc305f0c46e7c3b..0d07ae1cd4dbb7a2 100644
--- a/dlfcn/dladdr.c
+++ b/dlfcn/dladdr.c
@@ -24,7 +24,7 @@ int
__dladdr (const void *address, Dl_info *info)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dladdr (address, info);
#endif
return _dl_addr (address, info, NULL, NULL);
diff --git a/dlfcn/dladdr1.c b/dlfcn/dladdr1.c
index 78560dbac208c316..93ce68c1d6067fe2 100644
--- a/dlfcn/dladdr1.c
+++ b/dlfcn/dladdr1.c
@@ -24,7 +24,7 @@ int
__dladdr1 (const void *address, Dl_info *info, void **extra, int flags)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dladdr1 (address, info, extra, flags);
#endif
diff --git a/dlfcn/dlclose.c b/dlfcn/dlclose.c
index 6a013a81bb648191..07ecb21bf7d43be4 100644
--- a/dlfcn/dlclose.c
+++ b/dlfcn/dlclose.c
@@ -24,7 +24,7 @@ int
__dlclose (void *handle)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlclose (handle);
#endif
diff --git a/dlfcn/dlerror.c b/dlfcn/dlerror.c
index 5047b140662bc33e..63da79c63000eef0 100644
--- a/dlfcn/dlerror.c
+++ b/dlfcn/dlerror.c
@@ -32,7 +32,7 @@ char *
__dlerror (void)
{
# ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlerror ();
# endif
diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c
index c6f9a1da09ff8622..47d2daa96fa5986f 100644
--- a/dlfcn/dlinfo.c
+++ b/dlfcn/dlinfo.c
@@ -89,7 +89,7 @@ dlinfo_implementation (void *handle, int request, void *arg)
int
___dlinfo (void *handle, int request, void *arg)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlinfo (handle, request, arg);
else
return dlinfo_implementation (handle, request, arg);
diff --git a/dlfcn/dlmopen.c b/dlfcn/dlmopen.c
index c171c8953da20fc7..2309224eb8484b1a 100644
--- a/dlfcn/dlmopen.c
+++ b/dlfcn/dlmopen.c
@@ -80,7 +80,7 @@ dlmopen_implementation (Lmid_t nsid, const char *file, int mode,
void *
___dlmopen (Lmid_t nsid, const char *file, int mode)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlmopen (nsid, file, mode, RETURN_ADDRESS (0));
else
return dlmopen_implementation (nsid, file, mode, RETURN_ADDRESS (0));
diff --git a/dlfcn/dlopen.c b/dlfcn/dlopen.c
index e04b374b82b04337..9c59c751c4eaf7a7 100644
--- a/dlfcn/dlopen.c
+++ b/dlfcn/dlopen.c
@@ -75,7 +75,7 @@ dlopen_implementation (const char *file, int mode, void *dl_caller)
void *
___dlopen (const char *file, int mode)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
else
return dlopen_implementation (file, mode, RETURN_ADDRESS (0));
diff --git a/dlfcn/dlopenold.c b/dlfcn/dlopenold.c
index 9115501ac121eeca..c2f2a42194d50953 100644
--- a/dlfcn/dlopenold.c
+++ b/dlfcn/dlopenold.c
@@ -70,7 +70,7 @@ __dlopen_nocheck (const char *file, int mode)
mode |= RTLD_LAZY;
args.mode = mode;
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
return _dlerror_run (dlopen_doit, &args) ? NULL : args.new;
diff --git a/dlfcn/dlsym.c b/dlfcn/dlsym.c
index 43044cf7bb95801e..d3861170a7631d01 100644
--- a/dlfcn/dlsym.c
+++ b/dlfcn/dlsym.c
@@ -62,7 +62,7 @@ dlsym_implementation (void *handle, const char *name, void *dl_caller)
void *
___dlsym (void *handle, const char *name)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlsym (handle, name, RETURN_ADDRESS (0));
else
return dlsym_implementation (handle, name, RETURN_ADDRESS (0));
diff --git a/dlfcn/dlvsym.c b/dlfcn/dlvsym.c
index 9b76f9afa513e11f..3af02109c306b800 100644
--- a/dlfcn/dlvsym.c
+++ b/dlfcn/dlvsym.c
@@ -65,7 +65,7 @@ dlvsym_implementation (void *handle, const char *name, const char *version,
void *
___dlvsym (void *handle, const char *name, const char *version)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlvsym (handle, name, version,
RETURN_ADDRESS (0));
else
diff --git a/elf/Makefile b/elf/Makefile
index fec6e23b5b625e3b..c89a6a58690646ee 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -376,6 +376,7 @@ tests += \
tst-audit24d \
tst-audit25a \
tst-audit25b \
+ tst-audit26 \
tst-auditmany \
tst-auxobj \
tst-auxobj-dlopen \
@@ -721,6 +722,7 @@ modules-names = \
tst-auditmod24c \
tst-auditmod24d \
tst-auditmod25 \
+ tst-auditmod26 \
tst-auxvalmod \
tst-big-note-lib \
tst-deep1mod1 \
@@ -2194,6 +2196,10 @@ $(objpfx)tst-audit25b: $(objpfx)tst-audit25mod1.so \
LDFLAGS-tst-audit25b = -Wl,-z,now
tst-audit25b-ARGS = -- $(host-test-program-cmd)
+$(objpfx)tst-audit26.out: $(objpfx)tst-auditmod26.so
+$(objpfx)tst-auditmod26.so: $(libsupport)
+tst-audit26-ENV = LD_AUDIT=$(objpfx)tst-auditmod26.so
+
# tst-sonamemove links against an older implementation of the library.
LDFLAGS-tst-sonamemove-linkmod1.so = \
-Wl,--version-script=tst-sonamemove-linkmod1.map \
diff --git a/elf/dl-libc.c b/elf/dl-libc.c
index d5bc4a277f4c6ef3..db4342a3256921f0 100644
--- a/elf/dl-libc.c
+++ b/elf/dl-libc.c
@@ -157,7 +157,7 @@ __libc_dlopen_mode (const char *name, int mode)
args.caller_dlopen = RETURN_ADDRESS (0);
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->libc_dlopen_mode (name, mode);
#endif
return dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map;
@@ -185,7 +185,7 @@ __libc_dlsym (void *map, const char *name)
args.name = name;
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->libc_dlsym (map, name);
#endif
return (dlerror_run (do_dlsym, &args) ? NULL
@@ -199,7 +199,7 @@ void *
__libc_dlvsym (void *map, const char *name, const char *version)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->libc_dlvsym (map, name, version);
#endif
@@ -222,7 +222,7 @@ int
__libc_dlclose (void *map)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->libc_dlclose (map);
#endif
return dlerror_run (do_dlclose, map);
diff --git a/elf/tst-audit26.c b/elf/tst-audit26.c
new file mode 100644
index 0000000000000000..3f920e83bac247a5
--- /dev/null
+++ b/elf/tst-audit26.c
@@ -0,0 +1,35 @@
+/* Check the usability of <dlfcn.h> functions in audit modules.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <gnu/lib-names.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+static int
+do_test (void)
+{
+ /* Check that the audit module has been loaded. */
+ void *handle = xdlopen ("mapped to libc", RTLD_LOCAL | RTLD_NOW);
+ TEST_VERIFY (handle
+ == xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD));
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-auditmod26.c b/elf/tst-auditmod26.c
new file mode 100644
index 0000000000000000..db7ba95abec20f53
--- /dev/null
+++ b/elf/tst-auditmod26.c
@@ -0,0 +1,104 @@
+/* Check the usability of <dlfcn.h> functions in audit modules. Audit module.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <first-versions.h>
+#include <gnu/lib-names.h>
+#include <link.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+unsigned int
+la_version (unsigned int current)
+{
+ /* Exercise various <dlfcn.h> functions. */
+
+ /* Check dlopen, dlsym, dlclose. */
+ void *handle = xdlopen (LIBM_SO, RTLD_LOCAL | RTLD_NOW);
+ void *ptr = xdlsym (handle, "sincos");
+ TEST_VERIFY (ptr != NULL);
+ ptr = dlsym (handle, "SINCOS");
+ TEST_VERIFY (ptr == NULL);
+ const char *message = dlerror ();
+ TEST_VERIFY (strstr (message, ": undefined symbol: SINCOS") != NULL);
+ ptr = dlsym (handle, "SINCOS");
+ TEST_VERIFY (ptr == NULL);
+ xdlclose (handle);
+ TEST_COMPARE_STRING (dlerror (), NULL);
+
+ handle = xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD);
+
+ /* Check dlvsym. _exit is unlikely to gain another symbol
+ version. */
+ TEST_VERIFY (xdlsym (handle, "_exit")
+ == xdlvsym (handle, "_exit", FIRST_VERSION_libc__exit_STRING));
+
+ /* Check dlinfo. */
+ {
+ void *handle2 = NULL;
+ TEST_COMPARE (dlinfo (handle, RTLD_DI_LINKMAP, &handle2), 0);
+ TEST_VERIFY (handle2 == handle);
+ }
+
+ /* Check dladdr and dladdr1. */
+ Dl_info info = { };
+ TEST_VERIFY (dladdr (&_exit, &info) != 0);
+ if (strcmp (info.dli_sname, "_Exit") != 0) /* _Exit is an alias. */
+ TEST_COMPARE_STRING (info.dli_sname, "_exit");
+ TEST_VERIFY (info.dli_saddr == &_exit);
+ TEST_VERIFY (strstr (info.dli_fname, LIBC_SO));
+ void *extra_info;
+ memset (&info, 0, sizeof (info));
+ TEST_VERIFY (dladdr1 (&_exit, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
+ TEST_VERIFY (extra_info == handle);
+
+ /* Verify that dlmopen creates a new namespace. */
+ void *dlmopen_handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW);
+ TEST_VERIFY (dlmopen_handle != handle);
+ memset (&info, 0, sizeof (info));
+ extra_info = NULL;
+ ptr = xdlsym (dlmopen_handle, "_exit");
+ TEST_VERIFY (dladdr1 (ptr, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
+ TEST_VERIFY (extra_info == dlmopen_handle);
+ xdlclose (dlmopen_handle);
+
+ /* Terminate the process with an error state. This does not happen
+ automatically because the audit module state is not shared with
+ the main program. */
+ if (support_record_failure_is_failed ())
+ {
+ fflush (stdout);
+ fflush (stderr);
+ _exit (1);
+ }
+
+ return LAV_CURRENT;
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+ if (strcmp (name, "mapped to libc") == 0)
+ return (char *) LIBC_SO;
+ else
+ return (char *) name;
+}

View File

@ -0,0 +1,28 @@
commit 83cc145830bdbefdabe03787ed884d548bea9c99
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Apr 22 19:34:52 2022 +0200
scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
enum.IntFlag and enum.EnumMeta._missing_ support are not part of
earlier Python versions.
(cherry picked from commit b571f3adffdcbed23f35ea39b0ca43809dbb4f5b)
diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py
index 8f7d0ca184845714..da0d5380f33a195e 100644
--- a/scripts/glibcelf.py
+++ b/scripts/glibcelf.py
@@ -28,6 +28,12 @@ import collections
import enum
import struct
+if not hasattr(enum, 'IntFlag'):
+ import sys
+ sys.stdout.write(
+ 'warning: glibcelf.py needs Python 3.6 for enum support\n')
+ sys.exit(77)
+
class _OpenIntEnum(enum.IntEnum):
"""Integer enumeration that supports arbitrary int values."""
@classmethod

View File

@ -0,0 +1,254 @@
commit 16245986fb9bfe396113fc7dfd1929f69a9e748e
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Aug 20 06:42:24 2021 -0700
x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
Optimize loads of all bits set into ZMM register in AVX512 SVML codes
by replacing
vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
and
vmovups .L_2il0floatpacket.13(%rip), %zmmX
with
vpternlogd $0xff, %zmmX, %zmmX, %zmmX
This fixes BZ #28252.
(cherry picked from commit 78c9ec9000f873abe7a15a91b87080a2e4308260)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
index e68fcdbb16a79f36..58e588a3d42a8bc9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
vmovaps %zmm0, %zmm8
/* Check for large arguments path */
- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
+ vpternlogd $0xff, %zmm2, %zmm2, %zmm2
/*
ARGUMENT RANGE REDUCTION:
@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
jmp .LBL_2_7
#endif
END (_ZGVeN8v_cos_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.16:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.16,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
index dfa2acafc486b56b..f5f117d474f66176 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
/* preserve mantissa, set input exponent to 2^(-10) */
vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm1
vpsrlq $32, %zmm4, %zmm6
/* reciprocal approximation good to at least 11 bits */
@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
jmp .LBL_2_7
#endif
END (_ZGVeN8v_log_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.12:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.12,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
index be8ab7c6e0e33819..48d251db16ccab9d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
andq $-64, %rsp
subq $1280, %rsp
movq __svml_d_trig_data@GOTPCREL(%rip), %rax
- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm14
vmovups __dAbsMask(%rax), %zmm7
vmovups __dInvPI(%rax), %zmm2
vmovups __dRShifter(%rax), %zmm1
@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
jmp .LBL_2_7
#endif
END (_ZGVeN8v_sin_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.14:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.14,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
index 611887082a545854..a4944a4feef6aa98 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
/* SinPoly = SinR*SinPoly */
vfmadd213pd %zmm5, %zmm5, %zmm4
- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
/* Update Cos result's sign */
vxorpd %zmm2, %zmm1, %zmm1
@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
ENTRY (_ZGVeN8vvv_sincos_skx)
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
END (_ZGVeN8vvv_sincos_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.15:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.15,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
index f671d60d5dab5a0e..fe8474fed943e8ad 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
X = X - Y*PI1 - Y*PI2 - Y*PI3
*/
vmovaps %zmm0, %zmm6
- vmovups .L_2il0floatpacket.13(%rip), %zmm12
+ vpternlogd $0xff, %zmm12, %zmm12, %zmm12
vmovups __sRShifter(%rax), %zmm3
vmovups __sPI1_FMA(%rax), %zmm5
vmovups __sA9_FMA(%rax), %zmm9
@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
jmp .LBL_2_7
#endif
END (_ZGVeN16v_cosf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.13:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index 637bfe3c06ab9ad4..229b7828cde04db2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
vmovaps %zmm0, %zmm7
/* compare against threshold */
- vmovups .L_2il0floatpacket.13(%rip), %zmm3
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
vmovups __sInvLn2(%rax), %zmm4
vmovups __sShifter(%rax), %zmm1
vmovups __sLn2hi(%rax), %zmm6
@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
#endif
END (_ZGVeN16v_expf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.13:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
index 9d790fbf0ad6c8ec..fa2aae986f543582 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
andq $-64, %rsp
subq $1280, %rsp
movq __svml_slog_data@GOTPCREL(%rip), %rax
- vmovups .L_2il0floatpacket.7(%rip), %zmm6
+ vpternlogd $0xff, %zmm6, %zmm6, %zmm6
vmovups _iBrkValue(%rax), %zmm4
vmovups _sPoly_7(%rax), %zmm8
@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
#endif
END (_ZGVeN16v_logf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.7:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.7,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
index c5c43c46ff7af5a3..6aea2a4f11d1f85f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
vpsrlq $32, %zmm3, %zmm2
vpmovqd %zmm2, %ymm11
vcvtps2pd %ymm14, %zmm13
- vmovups .L_2il0floatpacket.23(%rip), %zmm14
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
vmovaps %zmm14, %zmm26
vpandd _ABSMASK(%rax), %zmm1, %zmm8
vpcmpd $1, _INF(%rax), %zmm8, %k2
@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
vpmovqd %zmm11, %ymm5
vpxord %zmm10, %zmm10, %zmm10
vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
+ vpternlogd $0xff, %zmm4, %zmm4, %zmm4
vpxord %zmm11, %zmm11, %zmm11
vcvtdq2pd %ymm7, %zmm7
vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
jmp .LBL_2_7
#endif
END (_ZGVeN16vv_powf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.23:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.23,@object
-.L_2il0floatpacket.24:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.24,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
index 9cf359c86ff9bd70..a446c504f63c9399 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
/* Result sign calculations */
vpternlogd $150, %zmm0, %zmm14, %zmm1
- vmovups .L_2il0floatpacket.13(%rip), %zmm14
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
/* Add correction term 0.5 for cos() part */
vaddps %zmm8, %zmm5, %zmm15
@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
ENTRY (_ZGVeN16vvv_sincosf_skx)
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
END (_ZGVeN16vvv_sincosf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.13:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
index bd05109a62181f22..c1b352d0ad1992cd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
movq __svml_s_trig_data@GOTPCREL(%rip), %rax
/* Check for large and special values */
- vmovups .L_2il0floatpacket.11(%rip), %zmm14
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
vmovups __sAbsMask(%rax), %zmm5
vmovups __sInvPI(%rax), %zmm1
vmovups __sRShifter(%rax), %zmm2
@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
jmp .LBL_2_7
#endif
END (_ZGVeN16v_sinf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.11:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.11,@object

View File

@ -0,0 +1,42 @@
commit b5a44a6a471aafd3677659a610f32468c40a666b
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue Sep 21 18:31:49 2021 -0500
x86: Modify ENTRY in sysdep.h so that p2align can be specified
No bug.
This change adds a new macro ENTRY_P2ALIGN which takes a second
argument, log2 of the desired function alignment.
The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
doesn't affect any existing functionality.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit fc5bd179ef3a953dff8d1655bd530d0e230ffe71)
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index cac1d762fb3f99d0..937180c1bd791570 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -78,15 +78,18 @@ enum cf_protection_level
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
/* Define an entry point visible from C. */
-#define ENTRY(name) \
+#define ENTRY_P2ALIGN(name, alignment) \
.globl C_SYMBOL_NAME(name); \
.type C_SYMBOL_NAME(name),@function; \
- .align ALIGNARG(4); \
+ .align ALIGNARG(alignment); \
C_LABEL(name) \
cfi_startproc; \
_CET_ENDBR; \
CALL_MCOUNT
+/* Common entry 16 byte aligns. */
+#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
+
#undef END
#define END(name) \
cfi_endproc; \

View File

@ -0,0 +1,653 @@
commit 5ec3416853c4150c4d13312e05f93a053586d528
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue Sep 21 18:45:03 2021 -0500
x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
No bug.
The frontend optimizations are to:
1. Reorganize logically connected basic blocks so they are either in
the same cache line or adjacent cache lines.
2. Avoid cases when basic blocks unnecissarily cross cache lines.
3. Try and 32 byte align any basic blocks possible without sacrificing
code size. Smaller / Less hot basic blocks are used for this.
Overall code size shrunk by 168 bytes. This should make up for any
extra costs due to aligning to 64 bytes.
In general performance before deviated a great deal dependending on
whether entry alignment % 64 was 0, 16, 32, or 48. These changes
essentially make it so that the current implementation is at least
equal to the best alignment of the original for any arguments.
The only additional optimization is in the page cross case. Branch on
equals case was removed from the size == [4, 7] case. As well the [4,
7] and [2, 3] case where swapped as [4, 7] is likely a more hot
argument size.
test-memcmp and test-wmemcmp are both passing.
(cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
area.
7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
- 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */
+ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+ 1. All code sections that fit are able to run optimally out of the
+ LSD.
+ 2. All code sections that fit are able to run optimally out of the
+ DSB
+ 3. Basic blocks are contained in minimum number of fetch blocks
+ necessary.
+
+Latency:
+ 1. Logically connected basic blocks are put in the same
+ cache-line.
+ 2. Logically connected basic blocks that do not fit in the same
+ cache-line are put in adjacent lines. This can get beneficial
+ L2 spatial prefetching and L1 next-line prefetching. */
# include <sysdep.h>
@@ -47,9 +64,11 @@
# ifdef USE_AS_WMEMCMP
# define CHAR_SIZE 4
# define VPCMP vpcmpd
+# define VPTEST vptestmd
# else
# define CHAR_SIZE 1
# define VPCMP vpcmpub
+# define VPTEST vptestmb
# endif
# define VEC_SIZE 32
@@ -75,7 +94,9 @@
*/
.section .text.evex,"ax",@progbits
-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+ frontend optimization. */
+ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
VPCMP $4, (%rdi), %YMM1, %k1
kmovd %k1, %eax
/* NB: eax must be destination register if going to
- L(return_vec_[0,2]). For L(return_vec_3 destination register
+ L(return_vec_[0,2]). For L(return_vec_3) destination register
must be ecx. */
testl %eax, %eax
jnz L(return_vec_0)
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
testl %ecx, %ecx
jnz L(return_vec_3)
- /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
- compare with zero to get a mask is needed. */
- vpxorq %XMM0, %XMM0, %XMM0
-
/* Go to 4x VEC loop. */
cmpq $(CHAR_PER_VEC * 8), %rdx
ja L(more_8x_vec)
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
- /* Or together YMM1, YMM2, and YMM3 into YMM3. */
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
- oring with YMM3. Result is stored in YMM4. */
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
- VPCMP $4, %YMM4, %YMM0, %k1
+ oring with YMM1. Result is stored in YMM4. */
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+ /* Or together YMM2, YMM3, and YMM4 into YMM4. */
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+ /* Test YMM4 against itself. Store any CHAR mismatches in k1.
+ */
+ VPTEST %YMM4, %YMM4, %k1
+ /* k1 must go to ecx for L(return_vec_0_1_2_3). */
kmovd %k1, %ecx
testl %ecx, %ecx
jnz L(return_vec_0_1_2_3)
/* NB: eax must be zero to reach here. */
ret
- /* NB: aligning 32 here allows for the rest of the jump targets
- to be tuned for 32 byte alignment. Most important this ensures
- the L(more_8x_vec) loop is 32 byte aligned. */
- .p2align 5
-L(less_vec):
- /* Check if one or less CHAR. This is necessary for size = 0 but
- is also faster for size = CHAR_SIZE. */
- cmpl $1, %edx
- jbe L(one_or_less)
+ .p2align 4
+L(8x_end_return_vec_0_1_2_3):
+ movq %rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+ addq %rdi, %rsi
+L(return_vec_0_1_2_3):
+ VPTEST %YMM1, %YMM1, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(return_vec_0)
- /* Check if loading one VEC from either s1 or s2 could cause a
- page cross. This can have false positives but is by far the
- fastest method. */
- movl %edi, %eax
- orl %esi, %eax
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jg L(page_cross_less_vec)
+ VPTEST %YMM2, %YMM2, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(return_vec_1)
- /* No page cross possible. */
- VMOVU (%rsi), %YMM2
- VPCMP $4, (%rdi), %YMM2, %k1
- kmovd %k1, %eax
- /* Create mask in ecx for potentially in bound matches. */
- bzhil %edx, %eax, %eax
- jnz L(return_vec_0)
+ VPTEST %YMM3, %YMM3, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(return_vec_2)
+L(return_vec_3):
+ /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
+ fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+ line. */
+ bsfl %ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ subl %ecx, %eax
+# endif
ret
.p2align 4
@@ -209,10 +240,11 @@ L(return_vec_0):
# endif
ret
- /* NB: No p2align necessary. Alignment % 16 is naturally 1
- which is good enough for a target not in a loop. */
+ .p2align 4
L(return_vec_1):
- tzcntl %eax, %eax
+ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+ fetch block. */
+ bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -226,10 +258,11 @@ L(return_vec_1):
# endif
ret
- /* NB: No p2align necessary. Alignment % 16 is naturally 2
- which is good enough for a target not in a loop. */
+ .p2align 4,, 10
L(return_vec_2):
- tzcntl %eax, %eax
+ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+ fetch block. */
+ bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -243,40 +276,6 @@ L(return_vec_2):
# endif
ret
- .p2align 4
-L(8x_return_vec_0_1_2_3):
- /* Returning from L(more_8x_vec) requires restoring rsi. */
- addq %rdi, %rsi
-L(return_vec_0_1_2_3):
- VPCMP $4, %YMM1, %YMM0, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(return_vec_0)
-
- VPCMP $4, %YMM2, %YMM0, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(return_vec_1)
-
- VPCMP $4, %YMM3, %YMM0, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(return_vec_2)
-L(return_vec_3):
- tzcntl %ecx, %ecx
-# ifdef USE_AS_WMEMCMP
- movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
- xorl %edx, %edx
- cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
- subl %ecx, %eax
-# endif
- ret
-
.p2align 4
L(more_8x_vec):
/* Set end of s1 in rdx. */
@@ -288,21 +287,19 @@ L(more_8x_vec):
andq $-VEC_SIZE, %rdi
/* Adjust because first 4x vec where check already. */
subq $-(VEC_SIZE * 4), %rdi
+
.p2align 4
L(loop_4x_vec):
VMOVU (%rsi, %rdi), %YMM1
vpxorq (%rdi), %YMM1, %YMM1
-
VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
-
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
- VPCMP $4, %YMM4, %YMM0, %k1
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jnz L(8x_return_vec_0_1_2_3)
@@ -319,28 +316,25 @@ L(loop_4x_vec):
cmpl $(VEC_SIZE * 2), %edi
jae L(8x_last_2x_vec)
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
VMOVU (%rsi, %rdx), %YMM1
vpxorq (%rdx), %YMM1, %YMM1
VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
-
- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
- VPCMP $4, %YMM4, %YMM0, %k1
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %ecx
- /* Restore s1 pointer to rdi. */
- movq %rdx, %rdi
testl %ecx, %ecx
- jnz L(8x_return_vec_0_1_2_3)
+ jnz L(8x_end_return_vec_0_1_2_3)
/* NB: eax must be zero to reach here. */
ret
/* Only entry is from L(more_8x_vec). */
- .p2align 4
+ .p2align 4,, 10
L(8x_last_2x_vec):
VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
kmovd %k1, %eax
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
jnz L(8x_return_vec_3)
ret
- .p2align 4
+ /* Not ideally aligned (at offset +9 bytes in fetch block) but
+ not aligning keeps it in the same cache line as
+ L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+ size. */
+ .p2align 4,, 4
+L(8x_return_vec_2):
+ subq $VEC_SIZE, %rdx
+L(8x_return_vec_3):
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
+ movl (VEC_SIZE * 3)(%rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ addq %rdx, %rax
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 3)(%rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+
+ .p2align 4,, 10
L(last_2x_vec):
/* Check second to last VEC. */
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
@@ -374,26 +392,49 @@ L(last_1x_vec):
jnz L(return_vec_0_end)
ret
- .p2align 4
-L(8x_return_vec_2):
- subq $VEC_SIZE, %rdx
-L(8x_return_vec_3):
- tzcntl %eax, %eax
+ .p2align 4,, 10
+L(return_vec_1_end):
+ /* Use bsf to save code size. This is necessary to have
+ L(one_or_less) fit in aligning bytes between. */
+ bsfl %eax, %eax
+ addl %edx, %eax
# ifdef USE_AS_WMEMCMP
- leaq (%rdx, %rax, CHAR_SIZE), %rax
- movl (VEC_SIZE * 3)(%rax), %ecx
+ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
- cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
setg %dl
leal -1(%rdx, %rdx), %eax
# else
- addq %rdx, %rax
- movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
- movzbl (VEC_SIZE * 3)(%rax), %eax
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
+ /* NB: L(one_or_less) fits in alignment padding between
+ L(return_vec_1_end) and L(return_vec_0_end). */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+ jb L(zero)
+ movl (%rdi), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi), %ecx
+ je L(zero)
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+ ret
+# else
+L(one_or_less):
+ jb L(zero)
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
+ subl %ecx, %eax
+ ret
+# endif
+L(zero):
+ xorl %eax, %eax
+ ret
+
.p2align 4
L(return_vec_0_end):
tzcntl %eax, %eax
@@ -412,23 +453,56 @@ L(return_vec_0_end):
ret
.p2align 4
-L(return_vec_1_end):
+L(less_vec):
+ /* Check if one or less CHAR. This is necessary for size == 0
+ but is also faster for size == CHAR_SIZE. */
+ cmpl $1, %edx
+ jbe L(one_or_less)
+
+ /* Check if loading one VEC from either s1 or s2 could cause a
+ page cross. This can have false positives but is by far the
+ fastest method. */
+ movl %edi, %eax
+ orl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(page_cross_less_vec)
+
+ /* No page cross possible. */
+ VMOVU (%rsi), %YMM2
+ VPCMP $4, (%rdi), %YMM2, %k1
+ kmovd %k1, %eax
+ /* Check if any matches where in bounds. Intentionally not
+ storing result in eax to limit dependency chain if it goes to
+ L(return_vec_0_lv). */
+ bzhil %edx, %eax, %edx
+ jnz L(return_vec_0_lv)
+ xorl %eax, %eax
+ ret
+
+ /* Essentially duplicate of L(return_vec_0). Ends up not costing
+ any code as shrinks L(less_vec) by allowing 2-byte encoding of
+ the jump and ends up fitting in aligning bytes. As well fits on
+ same cache line as L(less_vec) so also saves a line from having
+ to be fetched on cold calls to memcmp. */
+ .p2align 4,, 4
+L(return_vec_0_lv):
tzcntl %eax, %eax
- addl %edx, %eax
# ifdef USE_AS_WMEMCMP
- movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
- cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
- movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
- movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
+ movzbl (%rsi, %rax), %ecx
+ movzbl (%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
-
.p2align 4
L(page_cross_less_vec):
/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
cmpl $8, %edx
jae L(between_8_15)
cmpl $4, %edx
- jae L(between_4_7)
-L(between_2_3):
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
- ret
- .p2align 4
-L(one_or_less):
- jb L(zero)
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
- subl %ecx, %eax
+ jb L(between_2_3)
+
+ /* Load as big endian with overlapping movbe to avoid branches.
+ */
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ /* edx is guranteed to be positive int32 in range [4, 7]. */
+ cmovne %edx, %eax
+ /* ecx is -1 if rcx > rax. Otherwise 0. */
+ sbbl %ecx, %ecx
+ /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+ rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+ eax doesn't matter. */
+ orl %ecx, %eax
ret
- .p2align 4
+ .p2align 4,, 8
L(between_8_15):
# endif
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
- vmovq (%rdi), %XMM1
- vmovq (%rsi), %XMM2
- VPCMP $4, %XMM1, %XMM2, %k1
+ vmovq (%rdi), %xmm1
+ vmovq (%rsi), %xmm2
+ VPCMP $4, %xmm1, %xmm2, %k1
kmovd %k1, %eax
testl %eax, %eax
- jnz L(return_vec_0)
+ jnz L(return_vec_0_lv)
/* Use overlapping loads to avoid branches. */
- leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi
- leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi
- vmovq (%rdi), %XMM1
- vmovq (%rsi), %XMM2
- VPCMP $4, %XMM1, %XMM2, %k1
+ vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
+ vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
+ VPCMP $4, %xmm1, %xmm2, %k1
+ addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
kmovd %k1, %eax
testl %eax, %eax
- jnz L(return_vec_0)
- ret
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
+ jnz L(return_vec_0_end)
ret
- .p2align 4
+ .p2align 4,, 8
L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
- VMOVU (%rsi), %XMM2
- VPCMP $4, (%rdi), %XMM2, %k1
+
+ /* Use movups to save code size. */
+ movups (%rsi), %xmm2
+ VPCMP $4, (%rdi), %xmm2, %k1
kmovd %k1, %eax
testl %eax, %eax
- jnz L(return_vec_0)
-
+ jnz L(return_vec_0_lv)
/* Use overlapping loads to avoid branches. */
-
- VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2
- leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi
- leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi
- VPCMP $4, (%rdi), %XMM2, %k1
+ movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+ addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
kmovd %k1, %eax
testl %eax, %eax
- jnz L(return_vec_0)
- ret
-
-# ifdef USE_AS_WMEMCMP
- .p2align 4
-L(one_or_less):
- jb L(zero)
- movl (%rdi), %ecx
- xorl %edx, %edx
- cmpl (%rsi), %ecx
- je L(zero)
- setg %dl
- leal -1(%rdx, %rdx), %eax
+ jnz L(return_vec_0_end)
ret
-# else
- .p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- jz L(zero_4_7)
- sbbl %eax, %eax
- orl $1, %eax
-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ shll $8, %eax
+ shll $8, %ecx
+ bswap %eax
+ bswap %ecx
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ /* Subtraction is okay because the upper 8 bits are zero. */
+ subl %ecx, %eax
ret
# endif
-
END (MEMCMP)
#endif

View File

@ -0,0 +1,497 @@
commit 6d18a93dbbde2958001d65dff3080beed7ae675a
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Sep 20 16:20:15 2021 -0500
x86: Optimize memset-vec-unaligned-erms.S
No bug.
Optimization are
1. change control flow for L(more_2x_vec) to fall through to loop and
jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
size and saves jumps for length > 4x VEC_SIZE.
2. For EVEX/AVX512 move L(less_vec) closer to entry.
3. Avoid complex address mode for length > 2x VEC_SIZE
4. Slightly better aligning code for the loop from the perspective of
code size and uops.
5. Align targets so they make full use of their fetch block and if
possible cache line.
6. Try and reduce total number of icache lines that will need to be
pulled in for a given length.
7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
jumping to the stosb target in the sse2 code section will almost
certainly be to a new page. The new version does increase code size
marginally by duplicating the target but should get better iTLB
behavior as a result.
test-memset, test-wmemset, and test-bzero are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 7d4a327eba29ecb4..0137eba4cdd9f830 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -18,13 +18,15 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
+#define USE_WITH_SSE2 1
#define VEC_SIZE 16
+#define MOV_SIZE 3
+#define RET_SIZE 1
+
#define VEC(i) xmm##i
-/* Don't use movups and movaps since it will get larger nop paddings for
- alignment. */
-#define VMOVU movdqu
-#define VMOVA movdqa
+#define VMOVU movups
+#define VMOVA movaps
#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index ae0860f36a47d594..1af668af0aeda59e 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,8 +1,14 @@
#if IS_IN (libc)
+# define USE_WITH_AVX2 1
+
# define VEC_SIZE 32
+# define MOV_SIZE 4
+# define RET_SIZE 4
+
# define VEC(i) ymm##i
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
+
+# define VMOVU vmovdqu
+# define VMOVA vmovdqa
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 8ad842fc2f140527..f14d6f8493c21a36 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,11 +1,18 @@
#if IS_IN (libc)
+# define USE_WITH_AVX512 1
+
# define VEC_SIZE 64
+# define MOV_SIZE 6
+# define RET_SIZE 1
+
# define XMM0 xmm16
# define YMM0 ymm16
# define VEC0 zmm16
# define VEC(i) VEC##i
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 640f092903302ad0..64b09e77cc20cc42 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -1,11 +1,18 @@
#if IS_IN (libc)
+# define USE_WITH_EVEX 1
+
# define VEC_SIZE 32
+# define MOV_SIZE 6
+# define RET_SIZE 1
+
# define XMM0 xmm16
# define YMM0 ymm16
# define VEC0 ymm16
# define VEC(i) VEC##i
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index ff196844a093dc3b..e723413a664c088f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,8 +63,27 @@
# endif
#endif
+#if VEC_SIZE == 64
+# define LOOP_4X_OFFSET (VEC_SIZE * 4)
+#else
+# define LOOP_4X_OFFSET (0)
+#endif
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+# define END_REG rcx
+# define LOOP_REG rdi
+#else
+# define END_REG rdi
+# define LOOP_REG rdx
+#endif
+
#define PAGE_SIZE 4096
+/* Macro to calculate size of small memset block for aligning
+ purposes. */
+#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1)
+
+
#ifndef SECTION
# error SECTION is not defined!
#endif
@@ -74,6 +93,7 @@
ENTRY (__bzero)
mov %RDI_LP, %RAX_LP /* Set return value. */
mov %RSI_LP, %RDX_LP /* Set n. */
+ xorl %esi, %esi
pxor %XMM0, %XMM0
jmp L(entry_from_bzero)
END (__bzero)
@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
jb L(less_vec)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(stosb_more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), (%rdi)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
+ */
+ VMOVU %VEC(0), (%rax)
+ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
VZEROUPPER_RETURN
-
- .p2align 4
-L(stosb_more_2x_vec):
- cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
- ja L(stosb)
-#else
- .p2align 4
#endif
-L(more_2x_vec):
- /* Stores to first 2x VEC before cmp as any path forward will
- require it. */
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(0), VEC_SIZE(%rdi)
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_start)
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
-L(return):
-#if VEC_SIZE > 16
- ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ .p2align 4,, 10
+L(last_2x_vec):
+#ifdef USE_LESS_VEC_MASK_STORE
+ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
#else
- ret
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
#endif
+ VZEROUPPER_RETURN
-L(loop_start):
- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
- cmpq $(VEC_SIZE * 8), %rdx
- jbe L(loop_end)
- andq $-(VEC_SIZE * 2), %rdi
- subq $-(VEC_SIZE * 4), %rdi
- leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx
- .p2align 4
-L(loop):
- VMOVA %VEC(0), (%rdi)
- VMOVA %VEC(0), VEC_SIZE(%rdi)
- VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi)
- subq $-(VEC_SIZE * 4), %rdi
- cmpq %rcx, %rdi
- jb L(loop)
-L(loop_end):
- /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
- rdx as length is also unchanged. */
- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
- VZEROUPPER_SHORT_RETURN
-
- .p2align 4
+ /* If have AVX512 mask instructions put L(less_vec) close to
+ entry as it doesn't take much space and is likely a hot target.
+ */
+#ifdef USE_LESS_VEC_MASK_STORE
+ .p2align 4,, 10
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
# endif
-# ifdef USE_LESS_VEC_MASK_STORE
/* Clear high bits from edi. Only keeping bits relevant to page
cross check. Note that we are using rax which is set in
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
- */
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
andl $(PAGE_SIZE - 1), %edi
- /* Check if VEC_SIZE store cross page. Mask stores suffer serious
- performance degradation when it has to fault supress. */
+ /* Check if VEC_SIZE store cross page. Mask stores suffer
+ serious performance degradation when it has to fault supress.
+ */
cmpl $(PAGE_SIZE - VEC_SIZE), %edi
+ /* This is generally considered a cold target. */
ja L(cross_page)
# if VEC_SIZE > 32
movq $-1, %rcx
@@ -247,58 +235,185 @@ L(less_vec):
bzhil %edx, %ecx, %ecx
kmovd %ecx, %k1
# endif
- vmovdqu8 %VEC(0), (%rax) {%k1}
+ vmovdqu8 %VEC(0), (%rax){%k1}
VZEROUPPER_RETURN
+# if defined USE_MULTIARCH && IS_IN (libc)
+ /* Include L(stosb_local) here if including L(less_vec) between
+ L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+ L(stosb_more_2x_vec) target. */
+ .p2align 4,, 10
+L(stosb_local):
+ movzbl %sil, %eax
+ mov %RDX_LP, %RCX_LP
+ mov %RDI_LP, %RDX_LP
+ rep stosb
+ mov %RDX_LP, %RAX_LP
+ VZEROUPPER_RETURN
+# endif
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
.p2align 4
-L(cross_page):
+L(stosb_more_2x_vec):
+ cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
+ ja L(stosb_local)
+#endif
+ /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+ and (4x, 8x] jump to target. */
+L(more_2x_vec):
+
+ /* Two different methods of setting up pointers / compare. The
+ two methods are based on the fact that EVEX/AVX512 mov
+ instructions take more bytes then AVX2/SSE2 mov instructions. As
+ well that EVEX/AVX512 machines also have fast LEA_BID. Both
+ setup and END_REG to avoid complex address mode. For EVEX/AVX512
+ this saves code size and keeps a few targets in one fetch block.
+ For AVX2/SSE2 this helps prevent AGU bottlenecks. */
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+ LOOP_4X_OFFSET) with LEA_BID. */
+
+ /* END_REG is rcx for EVEX/AVX512. */
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+ /* Stores to first 2x VEC before cmp as any path forward will
+ require it. */
+ VMOVU %VEC(0), (%rax)
+ VMOVU %VEC(0), VEC_SIZE(%rax)
+
+
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+ /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
+ addq %rdx, %END_REG
+#endif
+
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_2x_vec)
+
+ /* Store next 2x vec regardless. */
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
+ VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
+ extra offset to addresses in loop. Used for AVX512 to save space
+ as no way to get (VEC_SIZE * 4) in imm8. */
+# if LOOP_4X_OFFSET == 0
+ subq $-(VEC_SIZE * 4), %LOOP_REG
# endif
-# if VEC_SIZE > 32
- cmpb $32, %dl
- jae L(between_32_63)
+ /* Avoid imm32 compare here to save code size. */
+ cmpq %rdi, %rcx
+#else
+ addq $-(VEC_SIZE * 4), %END_REG
+ cmpq $(VEC_SIZE * 8), %rdx
+#endif
+ jbe L(last_4x_vec)
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+ /* Set LOOP_REG (rdx). */
+ leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
+#endif
+ /* Align dst for loop. */
+ andq $(VEC_SIZE * -2), %LOOP_REG
+ .p2align 4
+L(loop):
+ VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
+ VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+ VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+ VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+ subq $-(VEC_SIZE * 4), %LOOP_REG
+ cmpq %END_REG, %LOOP_REG
+ jb L(loop)
+ .p2align 4,, MOV_SIZE
+L(last_4x_vec):
+ VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG)
+ VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return):
+#if VEC_SIZE > 16
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
+ ret
+#endif
+
+ .p2align 4,, 10
+#ifndef USE_LESS_VEC_MASK_STORE
+# if defined USE_MULTIARCH && IS_IN (libc)
+ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+ range for 2-byte jump encoding. */
+L(stosb_local):
+ movzbl %sil, %eax
+ mov %RDX_LP, %RCX_LP
+ mov %RDI_LP, %RDX_LP
+ rep stosb
+ mov %RDX_LP, %RAX_LP
+ VZEROUPPER_RETURN
# endif
-# if VEC_SIZE > 16
- cmpb $16, %dl
+ /* Define L(less_vec) only if not otherwise defined. */
+ .p2align 4
+L(less_vec):
+#endif
+L(cross_page):
+#if VEC_SIZE > 32
+ cmpl $32, %edx
+ jae L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+ cmpl $16, %edx
jae L(between_16_31)
-# endif
- MOVQ %XMM0, %rcx
- cmpb $8, %dl
+#endif
+ MOVQ %XMM0, %rdi
+ cmpl $8, %edx
jae L(between_8_15)
- cmpb $4, %dl
+ cmpl $4, %edx
jae L(between_4_7)
- cmpb $1, %dl
+ cmpl $1, %edx
ja L(between_2_3)
- jb 1f
- movb %cl, (%rax)
-1:
+ jb L(return)
+ movb %sil, (%rax)
VZEROUPPER_RETURN
-# if VEC_SIZE > 32
+
+ /* Align small targets only if not doing so would cross a fetch
+ line. */
+#if VEC_SIZE > 32
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- VMOVU %YMM0, -32(%rax,%rdx)
VMOVU %YMM0, (%rax)
+ VMOVU %YMM0, -32(%rax, %rdx)
VZEROUPPER_RETURN
-# endif
-# if VEC_SIZE > 16
- /* From 16 to 31. No branch when size == 16. */
+#endif
+
+#if VEC_SIZE >= 32
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
L(between_16_31):
- VMOVU %XMM0, -16(%rax,%rdx)
+ /* From 16 to 31. No branch when size == 16. */
VMOVU %XMM0, (%rax)
+ VMOVU %XMM0, -16(%rax, %rdx)
VZEROUPPER_RETURN
-# endif
- /* From 8 to 15. No branch when size == 8. */
+#endif
+
+ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
L(between_8_15):
- movq %rcx, -8(%rax,%rdx)
- movq %rcx, (%rax)
+ /* From 8 to 15. No branch when size == 8. */
+ movq %rdi, (%rax)
+ movq %rdi, -8(%rax, %rdx)
VZEROUPPER_RETURN
+
+ .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
- movl %ecx, -4(%rax,%rdx)
- movl %ecx, (%rax)
+ movl %edi, (%rax)
+ movl %edi, -4(%rax, %rdx)
VZEROUPPER_RETURN
+
+ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
- movw %cx, -2(%rax,%rdx)
- movw %cx, (%rax)
+ movw %di, (%rax)
+ movb %dil, -1(%rax, %rdx)
VZEROUPPER_RETURN
END (MEMSET_SYMBOL (__memset, unaligned_erms))

View File

@ -0,0 +1,40 @@
commit baf3ece63453adac59c5688930324a78ced5b2e4
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sat Oct 23 01:26:47 2021 -0400
x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
it could potentially be dangerous to use SSE2 if this function is ever
called without using 'vzeroupper' beforehand. While compilers appear
to use 'vzeroupper' before function calls if AVX2 has been used, using
SSE2 here is more brittle. Since it is not absolutely necessary it
should be avoided.
It costs 2-extra bytes but the extra bytes should only eat into
alignment padding.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb)
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 2761b54f2e7dea9f..640f6757fac8a356 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -561,13 +561,13 @@ L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
/* Use movups to save code size. */
- movups (%rsi), %xmm2
+ vmovdqu (%rsi), %xmm2
VPCMP $4, (%rdi), %xmm2, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_0_lv)
/* Use overlapping loads to avoid branches. */
- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
kmovd %k1, %eax

View File

@ -0,0 +1,690 @@
commit f35ad30da4880a1574996df0674986ecf82fa7ae
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Oct 29 12:40:20 2021 -0700
x86-64: Improve EVEX strcmp with masked load
In strcmp-evex.S, to compare 2 32-byte strings, replace
VMOVU (%rdi, %rdx), %YMM0
VMOVU (%rsi, %rdx), %YMM1
/* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
VPCMP $4, %YMM0, %YMM1, %k0
VPCMP $0, %YMMZERO, %YMM0, %k1
VPCMP $0, %YMMZERO, %YMM1, %k2
/* Each bit in K1 represents a NULL in YMM0 or YMM1. */
kord %k1, %k2, %k1
/* Each bit in K1 represents a NULL or a mismatch. */
kord %k0, %k1, %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jne L(last_vector)
with
VMOVU (%rdi, %rdx), %YMM0
VPTESTM %YMM0, %YMM0, %k2
/* Each bit cleared in K1 represents a mismatch or a null CHAR
in YMM0 and 32 bytes at (%rsi, %rdx). */
VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
kmovd %k1, %ecx
incl %ecx
jne L(last_vector)
It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
and Ice Lake.
Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index d5aa6daa46c7ed25..82f12ac89bcae20b 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -41,6 +41,8 @@
# ifdef USE_AS_WCSCMP
/* Compare packed dwords. */
# define VPCMP vpcmpd
+# define VPMINU vpminud
+# define VPTESTM vptestmd
# define SHIFT_REG32 r8d
# define SHIFT_REG64 r8
/* 1 dword char == 4 bytes. */
@@ -48,6 +50,8 @@
# else
/* Compare packed bytes. */
# define VPCMP vpcmpb
+# define VPMINU vpminub
+# define VPTESTM vptestmb
# define SHIFT_REG32 ecx
# define SHIFT_REG64 rcx
/* 1 byte char == 1 byte. */
@@ -67,6 +71,9 @@
# define YMM5 ymm22
# define YMM6 ymm23
# define YMM7 ymm24
+# define YMM8 ymm25
+# define YMM9 ymm26
+# define YMM10 ymm27
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -76,7 +83,7 @@
/* The main idea of the string comparison (byte or dword) using 256-bit
EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
latter can be on either packed bytes or dwords depending on
- USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
+ USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
@@ -123,27 +130,21 @@ ENTRY (STRCMP)
jg L(cross_page)
/* Start comparing 4 vectors. */
VMOVU (%rdi), %YMM0
- VMOVU (%rsi), %YMM1
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
- VPCMP $4, %YMM0, %YMM1, %k0
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k2
- /* Check for NULL in YMM0. */
- VPCMP $0, %YMMZERO, %YMM0, %k1
- /* Check for NULL in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k2
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
- kord %k1, %k2, %k1
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (%rsi). */
+ VPCMP $0, (%rsi), %YMM0, %k1{%k2}
- /* Each bit in K1 represents:
- 1. A mismatch in YMM0 and YMM1. Or
- 2. A NULL in YMM0 or YMM1.
- */
- kord %k0, %k1, %k1
-
- ktestd %k1, %k1
- je L(next_3_vectors)
kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
+ je L(next_3_vectors)
tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -172,9 +173,7 @@ L(return):
# endif
ret
- .p2align 4
L(return_vec_size):
- kmovd %k1, %ecx
tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -210,9 +209,7 @@ L(return_vec_size):
# endif
ret
- .p2align 4
L(return_2_vec_size):
- kmovd %k1, %ecx
tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -248,9 +245,7 @@ L(return_2_vec_size):
# endif
ret
- .p2align 4
L(return_3_vec_size):
- kmovd %k1, %ecx
tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -289,43 +284,45 @@ L(return_3_vec_size):
.p2align 4
L(next_3_vectors):
VMOVU VEC_SIZE(%rdi), %YMM0
- VMOVU VEC_SIZE(%rsi), %YMM1
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
- VPCMP $4, %YMM0, %YMM1, %k0
- VPCMP $0, %YMMZERO, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM1, %k2
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
- ktestd %k1, %k1
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at VEC_SIZE(%rsi). */
+ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+ kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
jne L(return_vec_size)
- VMOVU (VEC_SIZE * 2)(%rdi), %YMM2
- VMOVU (VEC_SIZE * 3)(%rdi), %YMM3
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM4
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM5
-
- /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */
- VPCMP $4, %YMM2, %YMM4, %k0
- VPCMP $0, %YMMZERO, %YMM2, %k1
- VPCMP $0, %YMMZERO, %YMM4, %k2
- /* Each bit in K1 represents a NULL in YMM2 or YMM4. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
- ktestd %k1, %k1
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
+ VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+ kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
jne L(return_2_vec_size)
- /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */
- VPCMP $4, %YMM3, %YMM5, %k0
- VPCMP $0, %YMMZERO, %YMM3, %k1
- VPCMP $0, %YMMZERO, %YMM5, %k2
- /* Each bit in K1 represents a NULL in YMM3 or YMM5. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
- ktestd %k1, %k1
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
+ VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+ kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
jne L(return_3_vec_size)
L(main_loop_header):
leaq (VEC_SIZE * 4)(%rdi), %rdx
@@ -375,56 +372,51 @@ L(back_to_loop):
VMOVA VEC_SIZE(%rax), %YMM2
VMOVA (VEC_SIZE * 2)(%rax), %YMM4
VMOVA (VEC_SIZE * 3)(%rax), %YMM6
- VMOVU (%rdx), %YMM1
- VMOVU VEC_SIZE(%rdx), %YMM3
- VMOVU (VEC_SIZE * 2)(%rdx), %YMM5
- VMOVU (VEC_SIZE * 3)(%rdx), %YMM7
-
- VPCMP $4, %YMM0, %YMM1, %k0
- VPCMP $0, %YMMZERO, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM1, %k2
- kord %k1, %k2, %k1
- /* Each bit in K4 represents a NULL or a mismatch in YMM0 and
- YMM1. */
- kord %k0, %k1, %k4
-
- VPCMP $4, %YMM2, %YMM3, %k0
- VPCMP $0, %YMMZERO, %YMM2, %k1
- VPCMP $0, %YMMZERO, %YMM3, %k2
- kord %k1, %k2, %k1
- /* Each bit in K5 represents a NULL or a mismatch in YMM2 and
- YMM3. */
- kord %k0, %k1, %k5
-
- VPCMP $4, %YMM4, %YMM5, %k0
- VPCMP $0, %YMMZERO, %YMM4, %k1
- VPCMP $0, %YMMZERO, %YMM5, %k2
- kord %k1, %k2, %k1
- /* Each bit in K6 represents a NULL or a mismatch in YMM4 and
- YMM5. */
- kord %k0, %k1, %k6
-
- VPCMP $4, %YMM6, %YMM7, %k0
- VPCMP $0, %YMMZERO, %YMM6, %k1
- VPCMP $0, %YMMZERO, %YMM7, %k2
- kord %k1, %k2, %k1
- /* Each bit in K7 represents a NULL or a mismatch in YMM6 and
- YMM7. */
- kord %k0, %k1, %k7
-
- kord %k4, %k5, %k0
- kord %k6, %k7, %k1
-
- /* Test each mask (32 bits) individually because for VEC_SIZE
- == 32 is not possible to OR the four masks and keep all bits
- in a 64-bit integer register, differing from SSE2 strcmp
- where ORing is possible. */
- kortestd %k0, %k1
- je L(loop)
- ktestd %k4, %k4
+
+ VPMINU %YMM0, %YMM2, %YMM8
+ VPMINU %YMM4, %YMM6, %YMM9
+
+ /* A zero CHAR in YMM8 means that there is a null CHAR. */
+ VPMINU %YMM8, %YMM9, %YMM8
+
+ /* Each bit set in K1 represents a non-null CHAR in YMM8. */
+ VPTESTM %YMM8, %YMM8, %k1
+
+ /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */
+ vpxorq (%rdx), %YMM0, %YMM1
+ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
+ vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
+
+ vporq %YMM1, %YMM3, %YMM9
+ vporq %YMM5, %YMM7, %YMM10
+
+ /* A non-zero CHAR in YMM9 represents a mismatch. */
+ vporq %YMM9, %YMM10, %YMM9
+
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR. */
+ VPCMP $0, %YMMZERO, %YMM9, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
+ je L(loop)
+
+ /* Each bit set in K1 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k1
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
+ in YMM0 and (%rdx). */
+ VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
je L(test_vec)
- kmovd %k4, %edi
- tzcntl %edi, %ecx
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %ecx
@@ -466,9 +458,18 @@ L(test_vec):
cmpq $VEC_SIZE, %r11
jbe L(zero)
# endif
- ktestd %k5, %k5
+ /* Each bit set in K1 represents a non-null CHAR in YMM2. */
+ VPTESTM %YMM2, %YMM2, %k1
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
+ in YMM2 and VEC_SIZE(%rdx). */
+ VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
je L(test_2_vec)
- kmovd %k5, %ecx
tzcntl %ecx, %edi
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -512,9 +513,18 @@ L(test_2_vec):
cmpq $(VEC_SIZE * 2), %r11
jbe L(zero)
# endif
- ktestd %k6, %k6
+ /* Each bit set in K1 represents a non-null CHAR in YMM4. */
+ VPTESTM %YMM4, %YMM4, %k1
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
+ in YMM4 and (VEC_SIZE * 2)(%rdx). */
+ VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
je L(test_3_vec)
- kmovd %k6, %ecx
tzcntl %ecx, %edi
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -558,8 +568,18 @@ L(test_3_vec):
cmpq $(VEC_SIZE * 3), %r11
jbe L(zero)
# endif
- kmovd %k7, %esi
- tzcntl %esi, %ecx
+ /* Each bit set in K1 represents a non-null CHAR in YMM6. */
+ VPTESTM %YMM6, %YMM6, %k1
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
+ in YMM6 and (VEC_SIZE * 3)(%rdx). */
+ VPCMP $0, %YMMZERO, %YMM7, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %ecx
@@ -615,39 +635,51 @@ L(loop_cross_page):
VMOVU (%rax, %r10), %YMM2
VMOVU VEC_SIZE(%rax, %r10), %YMM3
- VMOVU (%rdx, %r10), %YMM4
- VMOVU VEC_SIZE(%rdx, %r10), %YMM5
-
- VPCMP $4, %YMM4, %YMM2, %k0
- VPCMP $0, %YMMZERO, %YMM2, %k1
- VPCMP $0, %YMMZERO, %YMM4, %k2
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch in YMM2 and
- YMM4. */
- kord %k0, %k1, %k1
-
- VPCMP $4, %YMM5, %YMM3, %k3
- VPCMP $0, %YMMZERO, %YMM3, %k4
- VPCMP $0, %YMMZERO, %YMM5, %k5
- kord %k4, %k5, %k4
- /* Each bit in K3 represents a NULL or a mismatch in YMM3 and
- YMM5. */
- kord %k3, %k4, %k3
+
+ /* Each bit set in K2 represents a non-null CHAR in YMM2. */
+ VPTESTM %YMM2, %YMM2, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM2 and 32 bytes at (%rdx, %r10). */
+ VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2}
+ kmovd %k1, %r9d
+ /* Don't use subl since it is the lower 16/32 bits of RDI
+ below. */
+ notl %r9d
+# ifdef USE_AS_WCSCMP
+ /* Only last 8 bits are valid. */
+ andl $0xff, %r9d
+# endif
+
+ /* Each bit set in K4 represents a non-null CHAR in YMM3. */
+ VPTESTM %YMM3, %YMM3, %k4
+ /* Each bit cleared in K3 represents a mismatch or a null CHAR
+ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
+ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+ kmovd %k3, %edi
+# ifdef USE_AS_WCSCMP
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
+ notl %edi
+ andl $0xff, %edi
+# else
+ incl %edi
+# endif
# ifdef USE_AS_WCSCMP
- /* NB: Each bit in K1/K3 represents 4-byte element. */
- kshiftlw $8, %k3, %k2
+ /* NB: Each bit in EDI/R9D represents 4-byte element. */
+ sall $8, %edi
/* NB: Divide shift count by 4 since each bit in K1 represent 4
bytes. */
movl %ecx, %SHIFT_REG32
sarl $2, %SHIFT_REG32
+
+ /* Each bit in EDI represents a null CHAR or a mismatch. */
+ orl %r9d, %edi
# else
- kshiftlq $32, %k3, %k2
-# endif
+ salq $32, %rdi
- /* Each bit in K1 represents a NULL or a mismatch. */
- korq %k1, %k2, %k1
- kmovq %k1, %rdi
+ /* Each bit in RDI represents a null CHAR or a mismatch. */
+ orq %r9, %rdi
+# endif
/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
shrxq %SHIFT_REG64, %rdi, %rdi
@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
/* The first VEC_SIZE * 2 bytes match or are ignored. */
VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0
VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1
- VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2
- VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3
-
- VPCMP $4, %YMM0, %YMM2, %k0
- VPCMP $0, %YMMZERO, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM2, %k2
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch in YMM0 and
- YMM2. */
- kord %k0, %k1, %k1
-
- VPCMP $4, %YMM1, %YMM3, %k3
- VPCMP $0, %YMMZERO, %YMM1, %k4
- VPCMP $0, %YMMZERO, %YMM3, %k5
- kord %k4, %k5, %k4
- /* Each bit in K3 represents a NULL or a mismatch in YMM1 and
- YMM3. */
- kord %k3, %k4, %k3
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */
+ VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
+ kmovd %k1, %r9d
+ /* Don't use subl since it is the lower 16/32 bits of RDI
+ below. */
+ notl %r9d
# ifdef USE_AS_WCSCMP
- /* NB: Each bit in K1/K3 represents 4-byte element. */
- kshiftlw $8, %k3, %k2
+ /* Only last 8 bits are valid. */
+ andl $0xff, %r9d
+# endif
+
+ VPTESTM %YMM1, %YMM1, %k4
+ /* Each bit cleared in K3 represents a mismatch or a null CHAR
+ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
+ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+ kmovd %k3, %edi
+# ifdef USE_AS_WCSCMP
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
+ notl %edi
+ andl $0xff, %edi
# else
- kshiftlq $32, %k3, %k2
+ incl %edi
# endif
- /* Each bit in K1 represents a NULL or a mismatch. */
- korq %k1, %k2, %k1
- kmovq %k1, %rdi
+# ifdef USE_AS_WCSCMP
+ /* NB: Each bit in EDI/R9D represents 4-byte element. */
+ sall $8, %edi
+
+ /* Each bit in EDI represents a null CHAR or a mismatch. */
+ orl %r9d, %edi
+# else
+ salq $32, %rdi
+
+ /* Each bit in RDI represents a null CHAR or a mismatch. */
+ orq %r9, %rdi
+# endif
xorl %r8d, %r8d
/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
/* R8 has number of bytes skipped. */
movl %ecx, %r8d
# ifdef USE_AS_WCSCMP
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ /* NB: Divide shift count by 4 since each bit in RDI represent 4
bytes. */
sarl $2, %ecx
-# endif
+ /* Skip ECX bytes. */
+ shrl %cl, %edi
+# else
/* Skip ECX bytes. */
shrq %cl, %rdi
+# endif
1:
/* Before jumping back to the loop, set ESI to the number of
VEC_SIZE * 4 blocks before page crossing. */
@@ -818,7 +863,7 @@ L(cross_page_loop):
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %ecx
# endif
- /* Check null char. */
+ /* Check null CHAR. */
testl %eax, %eax
jne L(cross_page_loop)
/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
@@ -901,18 +946,17 @@ L(cross_page):
jg L(cross_page_1_vector)
L(loop_1_vector):
VMOVU (%rdi, %rdx), %YMM0
- VMOVU (%rsi, %rdx), %YMM1
-
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
- VPCMP $4, %YMM0, %YMM1, %k0
- VPCMP $0, %YMMZERO, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM1, %k2
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
+
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (%rsi, %rdx). */
+ VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
kmovd %k1, %ecx
- testl %ecx, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
jne L(last_vector)
addl $VEC_SIZE, %edx
@@ -931,18 +975,17 @@ L(cross_page_1_vector):
cmpl $(PAGE_SIZE - 16), %eax
jg L(cross_page_1_xmm)
VMOVU (%rdi, %rdx), %XMM0
- VMOVU (%rsi, %rdx), %XMM1
-
- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
- VPCMP $4, %XMM0, %XMM1, %k0
- VPCMP $0, %XMMZERO, %XMM0, %k1
- VPCMP $0, %XMMZERO, %XMM1, %k2
- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
- korw %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- korw %k0, %k1, %k1
- kmovw %k1, %ecx
- testl %ecx, %ecx
+
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in XMM0 and 16 bytes at (%rsi, %rdx). */
+ VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2}
+ kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xf, %ecx
+# else
+ subl $0xffff, %ecx
+# endif
jne L(last_vector)
addl $16, %edx
@@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
vmovq (%rdi, %rdx), %XMM0
vmovq (%rsi, %rdx), %XMM1
- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
- VPCMP $4, %XMM0, %XMM1, %k0
- VPCMP $0, %XMMZERO, %XMM0, %k1
- VPCMP $0, %XMMZERO, %XMM1, %k2
- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
- kmovd %k1, %ecx
-
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in XMM0 and XMM1. */
+ VPCMP $0, %XMM1, %XMM0, %k1{%k2}
+ kmovb %k1, %ecx
# ifdef USE_AS_WCSCMP
- /* Only last 2 bits are valid. */
- andl $0x3, %ecx
+ subl $0x3, %ecx
# else
- /* Only last 8 bits are valid. */
- andl $0xff, %ecx
+ subl $0xff, %ecx
# endif
-
- testl %ecx, %ecx
jne L(last_vector)
addl $8, %edx
@@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
vmovd (%rdi, %rdx), %XMM0
vmovd (%rsi, %rdx), %XMM1
- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
- VPCMP $4, %XMM0, %XMM1, %k0
- VPCMP $0, %XMMZERO, %XMM0, %k1
- VPCMP $0, %XMMZERO, %XMM1, %k2
- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in XMM0 and XMM1. */
+ VPCMP $0, %XMM1, %XMM0, %k1{%k2}
kmovd %k1, %ecx
-
# ifdef USE_AS_WCSCMP
- /* Only the last bit is valid. */
- andl $0x1, %ecx
+ subl $0x1, %ecx
# else
- /* Only last 4 bits are valid. */
- andl $0xf, %ecx
+ subl $0xf, %ecx
# endif
-
- testl %ecx, %ecx
jne L(last_vector)
addl $4, %edx

View File

@ -0,0 +1,85 @@
commit a182bb7a3922404f79def09d79ef89678b4049f0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Oct 29 12:56:53 2021 -0700
x86-64: Remove Prefer_AVX2_STRCMP
Remove Prefer_AVX2_STRCMP to enable EVEX strcmp. When comparing 2 32-byte
strings, EVEX strcmp has been improved to require 1 load, 1 VPTESTM, 1
VPCMP, 1 KMOVD and 1 INCL instead of 2 loads, 3 VPCMPs, 2 KORDs, 1 KMOVD
and 1 TESTL while AVX2 strcmp requires 1 load, 2 VPCMPEQs, 1 VPMINU, 1
VPMOVMSKB and 1 TESTL. EVEX strcmp is now faster than AVX2 strcmp by up
to 40% on Tiger Lake and Ice Lake.
(cherry picked from commit 14dbbf46a007ae5df36646b51ad0c9e5f5259f30)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index de4e3c3b7258120d..f4d4049e391cbabd 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -574,14 +574,6 @@ disable_tsx:
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|= bit_arch_Prefer_No_VZEROUPPER;
-
- /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
- requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
- requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
- AVX2 strcmp is faster than EVEX strcmp. */
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
- cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
- |= bit_arch_Prefer_AVX2_STRCMP;
}
/* Avoid avoid short distance REP MOVSB on processor with FSRM. */
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 58f2fad4323d5d91..957db3ad229ba39f 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -239,8 +239,6 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
Fast_Copy_Backward,
disable, 18);
- CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
- (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
}
break;
case 19:
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 3bdc76cf71007948..8250bfcbecd29a9f 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -31,5 +31,4 @@ BIT (Prefer_ERMS)
BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
-BIT (Prefer_AVX2_STRCMP)
BIT (Avoid_Short_Distance_REP_MOVSB)
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 62b7abeeee646ab4..7c2901bf44456259 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index 60ba0fe356b31779..f94a421784bfe923 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))

View File

@ -0,0 +1,48 @@
commit 2e64237a8744dd50f9222293275fa52e7248ff76
Author: Fangrui Song <maskray@google.com>
Date: Tue Nov 2 20:59:52 2021 -0700
x86-64: Replace movzx with movzbl
Clang cannot assemble movzx in the AT&T dialect mode.
../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
movzx (%rsi), %ecx
^~~~
Change movzx to movzbl, which follows the AT&T dialect and is used
elsewhere in the file.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 6720d36b6623c5e48c070d86acf61198b33e144e)
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index bc19547b09639071..6197a723b9e0606e 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
.p2align 4
// XXX Same as code above
LABEL(Byte0):
- movzx (%rsi), %ecx
- movzx (%rdi), %eax
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 824e648230a15739..7f8a1bc756f86aee 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
.p2align 4
LABEL(Byte0):
- movzx (%rsi), %ecx
- movzx (%rdi), %eax
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx

View File

@ -0,0 +1,843 @@
commit a7392db2ff2b9dd906500941ac6361dbe2211b0d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Nov 1 00:49:51 2021 -0500
x86: Optimize memmove-vec-unaligned-erms.S
No bug.
The optimizations are as follows:
1) Always align entry to 64 bytes. This makes behavior more
predictable and makes other frontend optimizations easier.
2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
significant benefits in the case that:
0 < (dst - src) < [256, 512]
3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
improvement and for FSRM [-10%, 25%].
In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6)
diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
index db106a7a1f23f268..b2b318084823dceb 100644
--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@
/* Use movups and movaps for smaller code sizes. */
#define VMOVU movups
#define VMOVA movaps
-
+#define MOV_SIZE 3
#define SECTION(p) p
#ifdef USE_MULTIARCH
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 1ec1962e861dbf63..67a55f0c85af841c 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -4,7 +4,7 @@
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-
+# define MOV_SIZE 4
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index e195e93f153c9512..975ae6c0515b83cb 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-
+# define MOV_SIZE 4
# define SECTION(p) p##.avx
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 848848ab39ff9326..0fa7126830af7acb 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -25,7 +25,7 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define VZEROUPPER
-
+# define MOV_SIZE 6
# define SECTION(p) p##.evex512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 0cbce8f944da51a0..88715441feaaccf5 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -25,7 +25,7 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define VZEROUPPER
-
+# define MOV_SIZE 6
# define SECTION(p) p##.evex
# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index abde8438d41f2320..7b27cbdda5fb99f7 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -76,6 +76,25 @@
# endif
#endif
+/* Whether to align before movsb. Ultimately we want 64 byte
+ align and not worth it to load 4x VEC for VEC_SIZE == 16. */
+#define ALIGN_MOVSB (VEC_SIZE > 16)
+/* Number of bytes to align movsb to. */
+#define MOVSB_ALIGN_TO 64
+
+#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE (MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET (4)
+#else
+# define SMALL_SIZE_OFFSET (0)
+#endif
+
#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif
@@ -199,25 +218,21 @@ L(start):
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
+ /* Load regardless. */
+ VMOVU (%rsi), %VEC(0)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
- ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+ ZERO_UPPER_VEC_REGISTERS_RETURN
#else
VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
cmp %RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
movq %rdi, %rax
L(start_erms):
# ifdef __ILP32__
@@ -298,310 +313,448 @@ L(start_erms):
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
+ /* Load regardless. */
+ VMOVU (%rsi), %VEC(0)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
-L(last_2x_vec):
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %VEC(0)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
+ */
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
ret
+# endif
#endif
-L(movsb):
- cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
- jae L(more_8x_vec)
- cmpq %rsi, %rdi
- jb 1f
- /* Source == destination is less common. */
- je L(nop)
- leaq (%rsi,%rdx), %r9
- cmpq %r9, %rdi
- /* Avoid slow backward REP MOVSB. */
- jb L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
- jz 3f
- movq %rdi, %rcx
- subq %rsi, %rcx
- jmp 2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
- jz 3f
- movq %rsi, %rcx
- subq %rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
- is N*4GB + [1..63] with N >= 0. */
- cmpl $63, %ecx
- jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
-3:
-# endif
- mov %RDX_LP, %RCX_LP
- rep movsb
-L(nop):
+#if LARGE_MOV_SIZE
+ /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+ ENTRY block and L(less_vec). */
+ .p2align 4,, 8
+L(between_4_7):
+ /* From 4 to 7. No branch when size == 4. */
+ movl (%rsi), %ecx
+ movl (%rsi, %rdx), %esi
+ movl %ecx, (%rdi)
+ movl %esi, (%rdi, %rdx)
ret
#endif
+ .p2align 4
L(less_vec):
/* Less than 1 VEC. */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
- cmpb $32, %dl
+ cmpl $32, %edx
jae L(between_32_63)
#endif
#if VEC_SIZE > 16
- cmpb $16, %dl
+ cmpl $16, %edx
jae L(between_16_31)
#endif
- cmpb $8, %dl
+ cmpl $8, %edx
jae L(between_8_15)
- cmpb $4, %dl
+#if SMALL_MOV_SIZE
+ cmpl $4, %edx
+#else
+ subq $4, %rdx
+#endif
jae L(between_4_7)
- cmpb $1, %dl
- ja L(between_2_3)
- jb 1f
- movzbl (%rsi), %ecx
+ cmpl $(1 - SMALL_SIZE_OFFSET), %edx
+ jl L(copy_0)
+ movb (%rsi), %cl
+ je L(copy_1)
+ movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+ movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
movb %cl, (%rdi)
-1:
+L(copy_0):
ret
+
+#if SMALL_MOV_SIZE
+ .p2align 4,, 8
+L(between_4_7):
+ /* From 4 to 7. No branch when size == 4. */
+ movl -4(%rsi, %rdx), %ecx
+ movl (%rsi), %esi
+ movl %ecx, -4(%rdi, %rdx)
+ movl %esi, (%rdi)
+ ret
+#endif
+
+#if VEC_SIZE > 16
+ /* From 16 to 31. No branch when size == 16. */
+ .p2align 4,, 8
+L(between_16_31):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -16(%rsi, %rdx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -16(%rdi, %rdx)
+ /* No ymm registers have been touched. */
+ ret
+#endif
+
#if VEC_SIZE > 32
+ .p2align 4,, 10
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
VMOVU (%rsi), %YMM0
- VMOVU -32(%rsi,%rdx), %YMM1
+ VMOVU -32(%rsi, %rdx), %YMM1
VMOVU %YMM0, (%rdi)
- VMOVU %YMM1, -32(%rdi,%rdx)
- VZEROUPPER_RETURN
-#endif
-#if VEC_SIZE > 16
- /* From 16 to 31. No branch when size == 16. */
-L(between_16_31):
- VMOVU (%rsi), %XMM0
- VMOVU -16(%rsi,%rdx), %XMM1
- VMOVU %XMM0, (%rdi)
- VMOVU %XMM1, -16(%rdi,%rdx)
+ VMOVU %YMM1, -32(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
+
+ .p2align 4,, 10
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
- movq -8(%rsi,%rdx), %rcx
+ movq -8(%rsi, %rdx), %rcx
movq (%rsi), %rsi
- movq %rcx, -8(%rdi,%rdx)
movq %rsi, (%rdi)
+ movq %rcx, -8(%rdi, %rdx)
ret
-L(between_4_7):
- /* From 4 to 7. No branch when size == 4. */
- movl -4(%rsi,%rdx), %ecx
- movl (%rsi), %esi
- movl %ecx, -4(%rdi,%rdx)
- movl %esi, (%rdi)
- ret
-L(between_2_3):
- /* From 2 to 3. No branch when size == 2. */
- movzwl -2(%rsi,%rdx), %ecx
- movzwl (%rsi), %esi
- movw %cx, -2(%rdi,%rdx)
- movw %si, (%rdi)
- ret
+ .p2align 4,, 10
+L(last_4x_vec):
+ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+
+ /* VEC(0) and VEC(1) have already been loaded. */
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+ VZEROUPPER_RETURN
+
+ .p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
ja L(movsb)
#endif
L(more_2x_vec):
- /* More than 2 * VEC and there may be overlap between destination
- and source. */
+ /* More than 2 * VEC and there may be overlap between
+ destination and source. */
cmpq $(VEC_SIZE * 8), %rdx
ja L(more_8x_vec)
+ /* Load VEC(1) regardless. VEC(0) has already been loaded. */
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_4x_vec)
- /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
+ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
- VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
- VZEROUPPER_RETURN
-L(last_4x_vec):
- /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)
+ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
VZEROUPPER_RETURN
+ .p2align 4,, 4
L(more_8x_vec):
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+ /* Go to backwards temporal copy if overlap no matter what as
+ backward REP MOVSB is slow and we don't want to use NT stores if
+ there is overlap. */
+ cmpq %rdx, %rcx
+ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */
+ jb L(more_8x_vec_backward_check_nop)
/* Check if non-temporal move candidate. */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_memcpy_2x)
#endif
- /* Entry if rdx is greater than non-temporal threshold but there
- is overlap. */
+ /* To reach this point there cannot be overlap and dst > src. So
+ check for overlap and src > dst in which case correctness
+ requires forward copy. Otherwise decide between backward/forward
+ copy depending on address aliasing. */
+
+ /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+ but less than __x86_shared_non_temporal_threshold. */
L(more_8x_vec_check):
- cmpq %rsi, %rdi
- ja L(more_8x_vec_backward)
- /* Source == destination is less common. */
- je L(nop)
- /* Load the first VEC and last 4 * VEC to support overlapping
- addresses. */
- VMOVU (%rsi), %VEC(4)
+ /* rcx contains dst - src. Add back length (rdx). */
+ leaq (%rcx, %rdx), %r8
+ /* If r8 has different sign than rcx then there is overlap so we
+ must do forward copy. */
+ xorq %rcx, %r8
+ /* Isolate just sign bit of r8. */
+ shrq $63, %r8
+ /* Get 4k difference dst - src. */
+ andl $(PAGE_SIZE - 256), %ecx
+ /* If r8 is non-zero must do foward for correctness. Otherwise
+ if ecx is non-zero there is 4k False Alaising so do backward
+ copy. */
+ addl %r8d, %ecx
+ jz L(more_8x_vec_backward)
+
+ /* if rdx is greater than __x86_shared_non_temporal_threshold
+ but there is overlap, or from short distance movsb. */
+L(more_8x_vec_forward):
+ /* Load first and last 4 * VEC to support overlapping addresses.
+ */
+
+ /* First vec was already loaded into VEC(0). */
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+ /* Save begining of dst. */
+ movq %rdi, %rcx
+ /* Align dst to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
- /* Save start and stop of the destination buffer. */
- movq %rdi, %r11
- leaq -VEC_SIZE(%rdi, %rdx), %rcx
- /* Align destination for aligned stores in the loop. Compute
- how much destination is misaligned. */
- movq %rdi, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Get the negative of offset for alignment. */
- subq $VEC_SIZE, %r8
- /* Adjust source. */
- subq %r8, %rsi
- /* Adjust destination which should be aligned now. */
- subq %r8, %rdi
- /* Adjust length. */
- addq %r8, %rdx
- .p2align 4
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rcx, %rsi
+ /* Finish aligning dst. */
+ incq %rdi
+ /* Restore src adjusted with new value for aligned dst. */
+ addq %rdi, %rsi
+ /* Store end of buffer minus tail in rdx. */
+ leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+ /* Dont use multi-byte nop to align. */
+ .p2align 4,, 11
L(loop_4x_vec_forward):
/* Copy 4 * VEC a time forward. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ VMOVU (%rsi), %VEC(1)
+ VMOVU VEC_SIZE(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
subq $-(VEC_SIZE * 4), %rsi
- addq $-(VEC_SIZE * 4), %rdx
- VMOVA %VEC(0), (%rdi)
- VMOVA %VEC(1), VEC_SIZE(%rdi)
- VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+ VMOVA %VEC(1), (%rdi)
+ VMOVA %VEC(2), VEC_SIZE(%rdi)
+ VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi)
+ VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
- cmpq $(VEC_SIZE * 4), %rdx
+ cmpq %rdi, %rdx
ja L(loop_4x_vec_forward)
/* Store the last 4 * VEC. */
- VMOVU %VEC(5), (%rcx)
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+ VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)
+ VMOVU %VEC(7), VEC_SIZE(%rdx)
+ VMOVU %VEC(8), (%rdx)
/* Store the first VEC. */
- VMOVU %VEC(4), (%r11)
+ VMOVU %VEC(0), (%rcx)
+ /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+ */
+L(nop_backward):
VZEROUPPER_RETURN
+ .p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+ /* rcx contains dst - src. Test for dst == src to skip all of
+ memmove. */
+ testq %rcx, %rcx
+ jz L(nop_backward)
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
addresses. */
- VMOVU (%rsi), %VEC(4)
+
+ /* First vec was also loaded into VEC(0). */
VMOVU VEC_SIZE(%rsi), %VEC(5)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+ /* Begining of region for 4x backward copy stored in rcx. */
+ leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
- /* Save stop of the destination buffer. */
- leaq -VEC_SIZE(%rdi, %rdx), %r11
- /* Align destination end for aligned stores in the loop. Compute
- how much destination end is misaligned. */
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
- movq %r11, %r9
- movq %r11, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Adjust source. */
- subq %r8, %rcx
- /* Adjust the end of destination which should be aligned now. */
- subq %r8, %r9
- /* Adjust length. */
- subq %r8, %rdx
-
- .p2align 4
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rdi, %rsi
+ /* Align dst. */
+ andq $-(VEC_SIZE), %rcx
+ /* Restore src. */
+ addq %rcx, %rsi
+
+ /* Don't use multi-byte nop to align. */
+ .p2align 4,, 11
L(loop_4x_vec_backward):
/* Copy 4 * VEC a time backward. */
- VMOVU (%rcx), %VEC(0)
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- addq $-(VEC_SIZE * 4), %rcx
- addq $-(VEC_SIZE * 4), %rdx
- VMOVA %VEC(0), (%r9)
- VMOVA %VEC(1), -VEC_SIZE(%r9)
- VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
- addq $-(VEC_SIZE * 4), %r9
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec_backward)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3)
+ VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4)
+ addq $(VEC_SIZE * -4), %rsi
+ VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
+ VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx)
+ VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx)
+ addq $(VEC_SIZE * -4), %rcx
+ cmpq %rcx, %rdi
+ jb L(loop_4x_vec_backward)
/* Store the first 4 * VEC. */
- VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(0), (%rdi)
VMOVU %VEC(5), VEC_SIZE(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
- VMOVU %VEC(8), (%r11)
+ VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi)
+ VZEROUPPER_RETURN
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ /* L(skip_short_movsb_check) is only used with ERMS. Not for
+ FSRM. */
+ .p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+# endif
+# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+# error Unsupported MOVSB_ALIGN_TO
+# endif
+ /* If CPU does not have FSRM two options for aligning. Align src
+ if dst and src 4k alias. Otherwise align dst. */
+ testl $(PAGE_SIZE - 512), %ecx
+ jnz L(movsb_align_dst)
+ /* Fall through. dst and src 4k alias. It's better to align src
+ here because the bottleneck will be loads dues to the false
+ dependency on dst. */
+
+ /* rcx already has dst - src. */
+ movq %rcx, %r9
+ /* Add src to len. Subtract back after src aligned. -1 because
+ src is initially aligned to MOVSB_ALIGN_TO - 1. */
+ leaq -1(%rsi, %rdx), %rcx
+ /* Inclusively align src to MOVSB_ALIGN_TO - 1. */
+ orq $(MOVSB_ALIGN_TO - 1), %rsi
+ /* Restore dst and len adjusted with new values for aligned dst.
+ */
+ leaq 1(%rsi, %r9), %rdi
+ subq %rsi, %rcx
+ /* Finish aligning src. */
+ incq %rsi
+
+ rep movsb
+
+ VMOVU %VEC(0), (%r8)
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU %VEC(1), VEC_SIZE(%r8)
+# endif
VZEROUPPER_RETURN
+# endif
+
+ .p2align 4,, 12
+L(movsb):
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+ /* Go to backwards temporal copy if overlap no matter what as
+ backward REP MOVSB is slow and we don't want to use NT stores if
+ there is overlap. */
+ cmpq %rdx, %rcx
+ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */
+ jb L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+ /* Save dest for storing aligning VECs later. */
+ movq %rdi, %r8
+# endif
+ /* If above __x86_rep_movsb_stop_threshold most likely is
+ candidate for NT moves aswell. */
+ cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+ jae L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+ /* Only avoid short movsb if CPU has FSRM. */
+ testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+ jz L(skip_short_movsb_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ /* Avoid "rep movsb" if RCX, the distance between source and
+ destination, is N*4GB + [1..63] with N >= 0. */
+
+ /* ecx contains dst - src. Early check for backward copy
+ conditions means only case of slow movsb with src = dst + [0,
+ 63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
+ for that case. */
+ cmpl $-64, %ecx
+ ja L(more_8x_vec_forward)
+# endif
+# endif
+# if ALIGN_MOVSB
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+# endif
+# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+# error Unsupported MOVSB_ALIGN_TO
+# endif
+ /* Fall through means cpu has FSRM. In that case exclusively
+ align destination. */
+L(movsb_align_dst):
+ /* Subtract dst from src. Add back after dst aligned. */
+ subq %rdi, %rsi
+ /* Exclusively align dst to MOVSB_ALIGN_TO (64). */
+ addq $(MOVSB_ALIGN_TO - 1), %rdi
+ /* Add dst to len. Subtract back after dst aligned. */
+ leaq (%r8, %rdx), %rcx
+ /* Finish aligning dst. */
+ andq $-(MOVSB_ALIGN_TO), %rdi
+ /* Restore src and len adjusted with new values for aligned dst.
+ */
+ addq %rdi, %rsi
+ subq %rdi, %rcx
+
+ rep movsb
+
+ /* Store VECs loaded for aligning. */
+ VMOVU %VEC(0), (%r8)
+# if MOVSB_ALIGN_TO > VEC_SIZE
+ VMOVU %VEC(1), VEC_SIZE(%r8)
+# endif
+ VZEROUPPER_RETURN
+# else /* !ALIGN_MOVSB. */
+L(skip_short_movsb_check):
+ mov %RDX_LP, %RCX_LP
+ rep movsb
+ ret
+# endif
+#endif
+ .p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- .p2align 4
+L(large_memcpy_2x_check):
+ cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
+ jb L(more_8x_vec_check)
L(large_memcpy_2x):
- /* Compute absolute value of difference between source and
- destination. */
- movq %rdi, %r9
- subq %rsi, %r9
- movq %r9, %r8
- leaq -1(%r9), %rcx
- sarq $63, %r8
- xorq %r8, %r9
- subq %r8, %r9
- /* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache when
- source is loaded. */
- cmpq %r9, %rdx
- ja L(more_8x_vec_check)
+ /* To reach this point it is impossible for dst > src and
+ overlap. Remaining to check is src > dst and overlap. rcx
+ already contains dst - src. Negate rcx to get src - dst. If
+ length > rcx then there is overlap and forward copy is best. */
+ negq %rcx
+ cmpq %rcx, %rdx
+ ja L(more_8x_vec_forward)
/* Cache align destination. First store the first 64 bytes then
adjust alignments. */
- VMOVU (%rsi), %VEC(8)
-#if VEC_SIZE < 64
- VMOVU VEC_SIZE(%rsi), %VEC(9)
-#if VEC_SIZE < 32
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
-#endif
-#endif
- VMOVU %VEC(8), (%rdi)
-#if VEC_SIZE < 64
- VMOVU %VEC(9), VEC_SIZE(%rdi)
-#if VEC_SIZE < 32
- VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
-#endif
-#endif
+
+ /* First vec was also loaded into VEC(0). */
+# if VEC_SIZE < 64
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+# if VEC_SIZE < 32
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+# endif
+# endif
+ VMOVU %VEC(0), (%rdi)
+# if VEC_SIZE < 64
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+# if VEC_SIZE < 32
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+# endif
+# endif
+
/* Adjust source, destination, and size. */
movq %rdi, %r8
andq $63, %r8
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
/* Adjust length. */
addq %r8, %rdx
- /* Test if source and destination addresses will alias. If they do
- the larger pipeline in large_memcpy_4x alleviated the
+ /* Test if source and destination addresses will alias. If they
+ do the larger pipeline in large_memcpy_4x alleviated the
performance drop. */
+
+ /* ecx contains -(dst - src). not ecx will return dst - src - 1
+ which works for testing aliasing. */
+ notl %ecx
testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
jz L(large_memcpy_4x)
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
/* ecx stores inner loop counter. */
movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
- /* Only one prefetch set per page as doing 4 pages give more time
- for prefetcher to keep up. */
+ /* Only one prefetch set per page as doing 4 pages give more
+ time for prefetcher to keep up. */
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)

View File

@ -0,0 +1,131 @@
commit cecbac52123456e2fbcff062a4165bf7b9174797
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Nov 1 00:49:52 2021 -0500
x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
No bug.
This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks the vector copy loop, especially now that it handles 4k
aliasing, is better for these medium ranged.
On Skylake with ERMS:
Size, Align1, Align2, dst>src,(rep movsb) / (vec copy)
4096, 0, 0, 0, 0.975
4096, 0, 0, 1, 0.953
4096, 12, 0, 0, 0.969
4096, 12, 0, 1, 0.872
4096, 44, 0, 0, 0.979
4096, 44, 0, 1, 0.83
4096, 0, 12, 0, 1.006
4096, 0, 12, 1, 0.989
4096, 0, 44, 0, 0.739
4096, 0, 44, 1, 0.942
4096, 12, 12, 0, 1.009
4096, 12, 12, 1, 0.973
4096, 44, 44, 0, 0.791
4096, 44, 44, 1, 0.961
4096, 2048, 0, 0, 0.978
4096, 2048, 0, 1, 0.951
4096, 2060, 0, 0, 0.986
4096, 2060, 0, 1, 0.963
4096, 2048, 12, 0, 0.971
4096, 2048, 12, 1, 0.941
4096, 2060, 12, 0, 0.977
4096, 2060, 12, 1, 0.949
8192, 0, 0, 0, 0.85
8192, 0, 0, 1, 0.845
8192, 13, 0, 0, 0.937
8192, 13, 0, 1, 0.939
8192, 45, 0, 0, 0.932
8192, 45, 0, 1, 0.927
8192, 0, 13, 0, 0.621
8192, 0, 13, 1, 0.62
8192, 0, 45, 0, 0.53
8192, 0, 45, 1, 0.516
8192, 13, 13, 0, 0.664
8192, 13, 13, 1, 0.659
8192, 45, 45, 0, 0.593
8192, 45, 45, 1, 0.575
8192, 2048, 0, 0, 0.854
8192, 2048, 0, 1, 0.834
8192, 2061, 0, 0, 0.863
8192, 2061, 0, 1, 0.857
8192, 2048, 13, 0, 0.63
8192, 2048, 13, 1, 0.629
8192, 2061, 13, 0, 0.627
8192, 2061, 13, 1, 0.62
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
unsigned int minimum_rep_movsb_threshold;
#endif
- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */
+ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
+ threshold is 2048 * (VEC_SIZE / 16). */
unsigned int rep_movsb_threshold;
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
{
- rep_movsb_threshold = 2048 * (64 / 16);
+ rep_movsb_threshold = 4096 * (64 / 16);
#if HAVE_TUNABLES
minimum_rep_movsb_threshold = 64 * 8;
#endif
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
else if (CPU_FEATURE_PREFERRED_P (cpu_features,
AVX_Fast_Unaligned_Load))
{
- rep_movsb_threshold = 2048 * (32 / 16);
+ rep_movsb_threshold = 4096 * (32 / 16);
#if HAVE_TUNABLES
minimum_rep_movsb_threshold = 32 * 8;
#endif
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index dd6e1d65c9490d4f..419313804d49cf65 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
}
x86_rep_movsb_threshold {
type: SIZE_T
- # Since there is overhead to set up REP MOVSB operation, REP MOVSB
- # isn't faster on short data. The memcpy micro benchmark in glibc
- # shows that 2KB is the approximate value above which REP MOVSB
- # becomes faster than SSE2 optimization on processors with Enhanced
- # REP MOVSB. Since larger register size can move more data with a
- # single load and store, the threshold is higher with larger register
- # size. Note: Since the REP MOVSB threshold must be greater than 8
- # times of vector size and the default value is 2048 * (vector size
- # / 16), the default value and the minimum value must be updated at
- # run-time. NB: Don't set the default value since we can't tell if
- # the tunable value is set by user or not [BZ #27069].
+ # Since there is overhead to set up REP MOVSB operation, REP
+ # MOVSB isn't faster on short data. The memcpy micro benchmark
+ # in glibc shows that 2KB is the approximate value above which
+ # REP MOVSB becomes faster than SSE2 optimization on processors
+ # with Enhanced REP MOVSB. Since larger register size can move
+ # more data with a single load and store, the threshold is
+ # higher with larger register size. Micro benchmarks show AVX
+ # REP MOVSB becomes faster apprximately at 8KB. The AVX512
+ # threshold is extrapolated to 16KB. For machines with FSRM the
+ # threshold is universally set at 2112 bytes. Note: Since the
+ # REP MOVSB threshold must be greater than 8 times of vector
+ # size and the default value is 4096 * (vector size / 16), the
+ # default value and the minimum value must be updated at
+ # run-time. NB: Don't set the default value since we can't tell
+ # if the tunable value is set by user or not [BZ #27069].
minval: 1
}
x86_rep_stosb_threshold {

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,104 @@
commit 4bbd0f866ad0ff197f72346f776ebee9b7e1a706
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Dec 3 15:29:25 2021 -0800
x86-64: Use notl in EVEX strcmp [BZ #28646]
Must use notl %edi here as lower bits are for CHAR comparisons
potentially out of range thus can be 0 without indicating mismatch.
This fixes BZ #28646.
Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02)
diff --git a/string/test-strcmp.c b/string/test-strcmp.c
index 7feababf4ddc5603..a0255b9625fbcedd 100644
--- a/string/test-strcmp.c
+++ b/string/test-strcmp.c
@@ -25,6 +25,7 @@
# define TEST_NAME "strcmp"
#endif
#include "test-string.h"
+#include <support/test-driver.h>
#ifdef WIDE
# include <wchar.h>
@@ -392,6 +393,32 @@ check2 (void)
}
}
+static void
+check3 (void)
+{
+ size_t size = 0xd000 + 0x4000;
+ CHAR *s1, *s2;
+ CHAR *buffer1 = mmap (NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ CHAR *buffer2 = mmap (NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (buffer1 == MAP_FAILED || buffer1 == MAP_FAILED)
+ error (EXIT_UNSUPPORTED, errno, "mmap failed");
+
+ s1 = (CHAR *) (buffer1 + 0x8f8 / sizeof (CHAR));
+ s2 = (CHAR *) (buffer2 + 0xcff3 / sizeof (CHAR));
+
+ STRCPY(s1, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/util/PathDocFileFactory.java"));
+ STRCPY(s2, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/taglets/ThrowsTaglet.java"));
+
+ int exp_result = SIMPLE_STRCMP (s1, s2);
+ FOR_EACH_IMPL (impl, 0)
+ check_result (impl, s1, s2, exp_result);
+
+ munmap ((void *) buffer1, size);
+ munmap ((void *) buffer2, size);
+}
+
int
test_main (void)
{
@@ -400,6 +427,7 @@ test_main (void)
test_init ();
check();
check2 ();
+ check3 ();
printf ("%23s", "");
FOR_EACH_IMPL (impl, 0)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 82f12ac89bcae20b..6f5c4bf984da2b80 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -656,12 +656,13 @@ L(loop_cross_page):
in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
kmovd %k3, %edi
+ /* Must use notl %edi here as lower bits are for CHAR
+ comparisons potentially out of range thus can be 0 without
+ indicating mismatch. */
+ notl %edi
# ifdef USE_AS_WCSCMP
/* Don't use subl since it is the upper 8 bits of EDI below. */
- notl %edi
andl $0xff, %edi
-# else
- incl %edi
# endif
# ifdef USE_AS_WCSCMP
@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
kmovd %k3, %edi
+ /* Must use notl %edi here as lower bits are for CHAR
+ comparisons potentially out of range thus can be 0 without
+ indicating mismatch. */
+ notl %edi
# ifdef USE_AS_WCSCMP
/* Don't use subl since it is the upper 8 bits of EDI below. */
- notl %edi
andl $0xff, %edi
-# else
- incl %edi
# endif
# ifdef USE_AS_WCSCMP

View File

@ -0,0 +1,30 @@
commit f3a99b2216114f89b20329ae7664b764248b4bbd
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Dec 6 07:14:12 2021 -0800
x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
they won't lower CPU frequency when ZMM load and store instructions are
used.
(cherry picked from commit ceeffe968c01b1202e482f4855cb6baf5c6cb713)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index f4d4049e391cbabd..09590d8794b1c6fb 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -566,8 +566,11 @@ disable_tsx:
|= bit_arch_Prefer_No_VZEROUPPER;
else
{
- cpu_features->preferred[index_arch_Prefer_No_AVX512]
- |= bit_arch_Prefer_No_AVX512;
+ /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
+ when ZMM load and store instructions are used. */
+ if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
+ cpu_features->preferred[index_arch_Prefer_No_AVX512]
+ |= bit_arch_Prefer_No_AVX512;
/* Avoid RTM abort triggered by VZEROUPPER inside a
transactionally executing RTM region. */

View File

@ -0,0 +1,384 @@
commit c796418d00f65c8c5fbed477f3ba6da2bee64ece
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Dec 24 18:54:41 2021 -0600
x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
No bug.
Optimizations are twofold.
1) Replace page cross and 0/1 checks with masked load instructions in
L(less_vec). In applications this reduces branch-misses in the
hot [0, 32] case.
2) Change controlflow so that L(less_vec) case gets the fall through.
Change 2) helps copies in the [0, 32] size range but comes at the cost
of copies in the [33, 64] size range. From profiles of GCC and
Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
appears to the the right tradeoff.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit abddd61de090ae84e380aff68a98bd94ef704667)
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 640f6757fac8a356..d2899e7c7078cd41 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -62,15 +62,18 @@ Latency:
# define VMOVU vmovdqu64
# ifdef USE_AS_WMEMCMP
+# define VMOVU_MASK vmovdqu32
# define CHAR_SIZE 4
# define VPCMP vpcmpd
# define VPTEST vptestmd
# else
+# define VMOVU_MASK vmovdqu8
# define CHAR_SIZE 1
# define VPCMP vpcmpub
# define VPTEST vptestmb
# endif
+
# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
movl %edx, %edx
# endif
cmp $CHAR_PER_VEC, %RDX_LP
- jb L(less_vec)
+ /* Fall through for [0, VEC_SIZE] as its the hottest. */
+ ja L(more_1x_vec)
+
+ /* Create mask for CHAR's we want to compare. This allows us to
+ avoid having to include page cross logic. */
+ movl $-1, %ecx
+ bzhil %edx, %ecx, %ecx
+ kmovd %ecx, %k2
+
+ /* Safe to load full ymm with mask. */
+ VMOVU_MASK (%rsi), %YMM2{%k2}
+ VPCMP $4,(%rdi), %YMM2, %k1{%k2}
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(return_vec_0)
+ ret
+ .p2align 4
+L(return_vec_0):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (%rsi, %rax), %ecx
+ movzbl (%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+
+
+ .p2align 4
+L(more_1x_vec):
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %YMM1
/* Use compare not equals to directly check for mismatch. */
- VPCMP $4, (%rdi), %YMM1, %k1
+ VPCMP $4,(%rdi), %YMM1, %k1
kmovd %k1, %eax
/* NB: eax must be destination register if going to
L(return_vec_[0,2]). For L(return_vec_3) destination register
@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
/* Check third and fourth VEC no matter what. */
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jnz L(return_vec_3)
@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
oring with YMM1. Result is stored in YMM4. */
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
/* Or together YMM2, YMM3, and YMM4 into YMM4. */
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
/* NB: eax must be zero to reach here. */
ret
- .p2align 4
+
+ .p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
movq %rdx, %rdi
L(8x_return_vec_0_1_2_3):
@@ -222,23 +262,6 @@ L(return_vec_3):
# endif
ret
- .p2align 4
-L(return_vec_0):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rax, CHAR_SIZE), %ecx
- xorl %edx, %edx
- cmpl (%rsi, %rax, CHAR_SIZE), %ecx
- /* NB: no partial register stall here because xorl zero idiom
- above. */
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rsi, %rax), %ecx
- movzbl (%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
.p2align 4
L(return_vec_1):
@@ -297,7 +320,7 @@ L(loop_4x_vec):
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %ecx
@@ -324,7 +347,7 @@ L(loop_4x_vec):
VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %ecx
@@ -336,14 +359,14 @@ L(loop_4x_vec):
/* Only entry is from L(more_8x_vec). */
.p2align 4,, 10
L(8x_last_2x_vec):
- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(8x_return_vec_2)
/* Naturally aligned to 16 bytes. */
L(8x_last_1x_vec):
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(8x_return_vec_3)
@@ -392,7 +415,9 @@ L(last_1x_vec):
jnz L(return_vec_0_end)
ret
- .p2align 4,, 10
+
+ /* Don't align. Takes 2-fetch blocks either way and aligning
+ will cause code to spill into another cacheline. */
L(return_vec_1_end):
/* Use bsf to save code size. This is necessary to have
L(one_or_less) fit in aligning bytes between. */
@@ -411,31 +436,8 @@ L(return_vec_1_end):
# endif
ret
- /* NB: L(one_or_less) fits in alignment padding between
- L(return_vec_1_end) and L(return_vec_0_end). */
-# ifdef USE_AS_WMEMCMP
-L(one_or_less):
- jb L(zero)
- movl (%rdi), %ecx
- xorl %edx, %edx
- cmpl (%rsi), %ecx
- je L(zero)
- setg %dl
- leal -1(%rdx, %rdx), %eax
- ret
-# else
-L(one_or_less):
- jb L(zero)
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
- subl %ecx, %eax
- ret
-# endif
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
+ /* Don't align. Takes 2-fetch blocks either way and aligning
+ will cause code to spill into another cacheline. */
L(return_vec_0_end):
tzcntl %eax, %eax
addl %edx, %eax
@@ -451,146 +453,7 @@ L(return_vec_0_end):
subl %ecx, %eax
# endif
ret
+ /* 1-byte until next cache line. */
- .p2align 4
-L(less_vec):
- /* Check if one or less CHAR. This is necessary for size == 0
- but is also faster for size == CHAR_SIZE. */
- cmpl $1, %edx
- jbe L(one_or_less)
-
- /* Check if loading one VEC from either s1 or s2 could cause a
- page cross. This can have false positives but is by far the
- fastest method. */
- movl %edi, %eax
- orl %esi, %eax
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jg L(page_cross_less_vec)
-
- /* No page cross possible. */
- VMOVU (%rsi), %YMM2
- VPCMP $4, (%rdi), %YMM2, %k1
- kmovd %k1, %eax
- /* Check if any matches where in bounds. Intentionally not
- storing result in eax to limit dependency chain if it goes to
- L(return_vec_0_lv). */
- bzhil %edx, %eax, %edx
- jnz L(return_vec_0_lv)
- xorl %eax, %eax
- ret
-
- /* Essentially duplicate of L(return_vec_0). Ends up not costing
- any code as shrinks L(less_vec) by allowing 2-byte encoding of
- the jump and ends up fitting in aligning bytes. As well fits on
- same cache line as L(less_vec) so also saves a line from having
- to be fetched on cold calls to memcmp. */
- .p2align 4,, 4
-L(return_vec_0_lv):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rax, CHAR_SIZE), %ecx
- xorl %edx, %edx
- cmpl (%rsi, %rax, CHAR_SIZE), %ecx
- /* NB: no partial register stall here because xorl zero idiom
- above. */
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rsi, %rax), %ecx
- movzbl (%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(page_cross_less_vec):
- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
- bytes. */
- cmpl $(16 / CHAR_SIZE), %edx
- jae L(between_16_31)
-# ifndef USE_AS_WMEMCMP
- cmpl $8, %edx
- jae L(between_8_15)
- cmpl $4, %edx
- jb L(between_2_3)
-
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- /* edx is guranteed to be positive int32 in range [4, 7]. */
- cmovne %edx, %eax
- /* ecx is -1 if rcx > rax. Otherwise 0. */
- sbbl %ecx, %ecx
- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
- rax then eax and ecx are zero. If rax < rax then ecx is -1 so
- eax doesn't matter. */
- orl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(between_8_15):
-# endif
- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
- vmovq (%rdi), %xmm1
- vmovq (%rsi), %xmm2
- VPCMP $4, %xmm1, %xmm2, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(return_vec_0_lv)
- /* Use overlapping loads to avoid branches. */
- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
- VPCMP $4, %xmm1, %xmm2, %k1
- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(return_vec_0_end)
- ret
-
- .p2align 4,, 8
-L(between_16_31):
- /* From 16 to 31 bytes. No branch when size == 16. */
-
- /* Use movups to save code size. */
- vmovdqu (%rsi), %xmm2
- VPCMP $4, (%rdi), %xmm2, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(return_vec_0_lv)
- /* Use overlapping loads to avoid branches. */
- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(return_vec_0_end)
- ret
-
-# ifndef USE_AS_WMEMCMP
-L(between_2_3):
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
- ret
-# endif
END (MEMCMP)
#endif

View File

@ -0,0 +1,42 @@
commit 9681691402052b727e01ae3375c73e0f76566593
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Wed Apr 27 13:59:26 2022 -0300
linux: Fix missing internal 64 bit time_t stat usage
These are two missing spots initially done by 52a5fe70a2c77935.
Checked on i686-linux-gnu.
(cherry picked from commit 834ddd0432f68d6dc85b6aac95065721af0d86e9)
diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c
index 13160d32499c4e58..00e4ce7f80ee2dfe 100644
--- a/sysdeps/unix/sysv/linux/faccessat.c
+++ b/sysdeps/unix/sysv/linux/faccessat.c
@@ -39,8 +39,8 @@ __faccessat (int fd, const char *file, int mode, int flag)
if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
- struct stat64 stats;
- if (__fstatat64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
+ struct __stat64_t64 stats;
+ if (__fstatat64_time64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
return -1;
mode &= (X_OK | W_OK | R_OK); /* Clear any bogus bits. */
diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c
index b599a66c930cad4d..f79930303118ebcd 100644
--- a/sysdeps/unix/sysv/linux/pathconf.c
+++ b/sysdeps/unix/sysv/linux/pathconf.c
@@ -110,8 +110,8 @@ distinguish_extX (const struct statfs *fsbuf, const char *file, int fd)
&& strcmp (mntbuf.mnt_type, "ext4") != 0)
continue;
- struct stat64 fsst;
- if (__stat64 (mntbuf.mnt_dir, &fsst) >= 0
+ struct __stat64_t64 fsst;
+ if (__stat64_time64 (mntbuf.mnt_dir, &fsst) >= 0
&& st.st_dev == fsst.st_dev)
{
if (strcmp (mntbuf.mnt_type, "ext4") == 0)

View File

@ -0,0 +1,39 @@
commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe
Author: Carlos O'Donell <carlos@redhat.com>
Date: Tue Apr 26 10:52:41 2022 -0400
i386: Regenerate ulps
These failures were caught while building glibc master for Fedora
Rawhide which is built with '-mtune=generic -msse2 -mfpmath=sse'
using gcc 11.3 (gcc-11.3.1-2.fc35) on a Cascadelake Intel Xeon
processor.
(cherry picked from commit e465d97653311c3687aee49de782177353acfe86)
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index 7601049110789201..84e6686eba5fe79a 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -668,7 +668,7 @@ ldouble: 4
Function: Imaginary part of "clog10":
double: 2
-float: 1
+float: 2
float128: 2
ldouble: 2
diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
index a39c89cec1141935..cc21e6907fe8b6a3 100644
--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -668,7 +668,7 @@ ldouble: 4
Function: Imaginary part of "clog10":
double: 2
-float: 1
+float: 2
float128: 2
ldouble: 2

View File

@ -148,7 +148,7 @@ end \
Summary: The GNU libc libraries
Name: glibc
Version: %{glibcversion}
Release: 30%{?dist}
Release: 31%{?dist}
# In general, GPLv2+ is used by programs, LGPLv2+ is used for
# libraries.
@ -379,17 +379,17 @@ Patch175: glibc-rh2058224-2.patch
Patch176: glibc-rh2058230.patch
Patch177: glibc-rh2054789.patch
Patch178: glibc-upstream-2.34-108.patch
Patch179: glibc-upstream-2.34-110.patch
# glibc-2.34-109-gd64b08d5ba only changes NEWS.
Patch179: glibc-upstream-2.34-110.patch
Patch180: glibc-upstream-2.34-111.patch
Patch181: glibc-upstream-2.34-112.patch
Patch182: glibc-upstream-2.34-113.patch
Patch183: glibc-upstream-2.34-114.patch
# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
Patch184: glibc-upstream-2.34-117.patch
Patch185: glibc-upstream-2.34-118.patch
Patch186: glibc-upstream-2.34-119.patch
# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
Patch187: glibc-upstream-2.34-120.patch
Patch188: glibc-upstream-2.34-121.patch
Patch189: glibc-upstream-2.34-122.patch
@ -437,6 +437,28 @@ Patch229: glibc-upstream-2.34-163.patch
Patch230: glibc-upstream-2.34-164.patch
Patch231: glibc-upstream-2.34-165.patch
Patch232: glibc-upstream-2.34-166.patch
Patch233: glibc-upstream-2.34-167.patch
Patch234: glibc-upstream-2.34-168.patch
Patch235: glibc-upstream-2.34-169.patch
Patch236: glibc-upstream-2.34-170.patch
Patch237: glibc-upstream-2.34-171.patch
Patch238: glibc-upstream-2.34-172.patch
Patch239: glibc-upstream-2.34-173.patch
Patch240: glibc-upstream-2.34-174.patch
Patch241: glibc-upstream-2.34-175.patch
Patch242: glibc-upstream-2.34-176.patch
Patch243: glibc-upstream-2.34-177.patch
Patch244: glibc-upstream-2.34-178.patch
Patch245: glibc-upstream-2.34-179.patch
Patch246: glibc-upstream-2.34-180.patch
Patch247: glibc-upstream-2.34-181.patch
Patch248: glibc-upstream-2.34-182.patch
Patch249: glibc-upstream-2.34-183.patch
Patch250: glibc-upstream-2.34-184.patch
Patch251: glibc-upstream-2.34-185.patch
Patch252: glibc-upstream-2.34-186.patch
Patch253: glibc-upstream-2.34-187.patch
Patch254: glibc-upstream-2.34-188.patch
##############################################################################
# Continued list of core "glibc" package information:
@ -2493,6 +2515,32 @@ fi
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
%changelog
* Wed Apr 27 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-31
- Sync with upstream branch release/2.34/master,
commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe:
- i386: Regenerate ulps
- linux: Fix missing internal 64 bit time_t stat usage
- x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
- x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
- x86-64: Use notl in EVEX strcmp [BZ #28646]
- x86: Shrink memcmp-sse4.S code size
- x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
- x86: Optimize memmove-vec-unaligned-erms.S
- x86-64: Replace movzx with movzbl
- x86-64: Remove Prefer_AVX2_STRCMP
- x86-64: Improve EVEX strcmp with masked load
- x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
- x86: Optimize memset-vec-unaligned-erms.S
- x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
- x86: Modify ENTRY in sysdep.h so that p2align can be specified
- x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
- scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
- dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
- INSTALL: Rephrase -with-default-link documentation
- misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
- Default to --with-default-link=no (bug 25812)
- scripts: Add glibcelf.py module
* Thu Apr 21 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-30
- Sync with upstream branch release/2.34/master,
commit 71326f1f2fd09dafb9c34404765fb88129e94237: