Sync with upstream branch release/2.34/master
Upstream commit: 55640ed3fde48360a8e8083be4843bd2dc7cecfe - i386: Regenerate ulps - linux: Fix missing internal 64 bit time_t stat usage - x86: Optimize L(less_vec) case in memcmp-evex-movbe.S - x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI - x86-64: Use notl in EVEX strcmp [BZ #28646] - x86: Shrink memcmp-sse4.S code size - x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h - x86: Optimize memmove-vec-unaligned-erms.S - x86-64: Replace movzx with movzbl - x86-64: Remove Prefer_AVX2_STRCMP - x86-64: Improve EVEX strcmp with masked load - x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S - x86: Optimize memset-vec-unaligned-erms.S - x86: Optimize memcmp-evex-movbe.S for frontend behavior and size - x86: Modify ENTRY in sysdep.h so that p2align can be specified - x86-64: Optimize load of all bits set into ZMM register [BZ #28252] - scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier - dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078) - INSTALL: Rephrase -with-default-link documentation - misc: Fix rare fortify crash on wchar funcs. [BZ 29030] - Default to --with-default-link=no (bug 25812) - scripts: Add glibcelf.py module
This commit is contained in:
parent
a8db42ba53
commit
4e3257320c
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,407 @@
|
|||
commit f0c71b34f96c816292c49122d50da3a511b67bf2
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Mon Apr 11 11:30:31 2022 +0200
|
||||
|
||||
Default to --with-default-link=no (bug 25812)
|
||||
|
||||
This is necessary to place the libio vtables into the RELRO segment.
|
||||
New tests elf/tst-relro-ldso and elf/tst-relro-libc are added to
|
||||
verify that this is what actually happens.
|
||||
|
||||
The new tests fail on ia64 due to lack of (default) RELRO support
|
||||
inbutils, so they are XFAILed there.
|
||||
|
||||
(cherry picked from commit 198abcbb94618730dae1b3f4393efaa49e0ec8c7)
|
||||
|
||||
diff --git a/INSTALL b/INSTALL
|
||||
index d8d4e9f155f56616..60d01568d77645c7 100644
|
||||
--- a/INSTALL
|
||||
+++ b/INSTALL
|
||||
@@ -90,6 +90,12 @@ if 'CFLAGS' is specified it must enable optimization. For example:
|
||||
library will still be usable, but functionality may be lost--for
|
||||
example, you can't build a shared libc with old binutils.
|
||||
|
||||
+'--with-default-link=FLAG'
|
||||
+ With '--with-default-link=yes', the build system does not use a
|
||||
+ custom linker script for linking shared objects. The default for
|
||||
+ FLAG is the opposite, 'no', because the custom linker script is
|
||||
+ needed for full RELRO protection.
|
||||
+
|
||||
'--with-nonshared-cflags=CFLAGS'
|
||||
Use additional compiler flags CFLAGS to build the parts of the
|
||||
library which are always statically linked into applications and
|
||||
diff --git a/configure b/configure
|
||||
index 03f4e59e754b5463..34c64f8de44e3086 100755
|
||||
--- a/configure
|
||||
+++ b/configure
|
||||
@@ -3373,7 +3373,7 @@ fi
|
||||
if test "${with_default_link+set}" = set; then :
|
||||
withval=$with_default_link; use_default_link=$withval
|
||||
else
|
||||
- use_default_link=default
|
||||
+ use_default_link=no
|
||||
fi
|
||||
|
||||
|
||||
@@ -6085,69 +6085,6 @@ fi
|
||||
$as_echo "$libc_cv_hashstyle" >&6; }
|
||||
|
||||
|
||||
-# The linker's default -shared behavior is good enough if it
|
||||
-# does these things that our custom linker scripts ensure that
|
||||
-# all allocated NOTE sections come first.
|
||||
-if test "$use_default_link" = default; then
|
||||
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sufficient default -shared layout" >&5
|
||||
-$as_echo_n "checking for sufficient default -shared layout... " >&6; }
|
||||
-if ${libc_cv_use_default_link+:} false; then :
|
||||
- $as_echo_n "(cached) " >&6
|
||||
-else
|
||||
- libc_cv_use_default_link=no
|
||||
- cat > conftest.s <<\EOF
|
||||
- .section .note.a,"a",%note
|
||||
- .balign 4
|
||||
- .long 4,4,9
|
||||
- .string "GNU"
|
||||
- .string "foo"
|
||||
- .section .note.b,"a",%note
|
||||
- .balign 4
|
||||
- .long 4,4,9
|
||||
- .string "GNU"
|
||||
- .string "bar"
|
||||
-EOF
|
||||
- if { ac_try=' ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&5'
|
||||
- { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
|
||||
- (eval $ac_try) 2>&5
|
||||
- ac_status=$?
|
||||
- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
|
||||
- test $ac_status = 0; }; } &&
|
||||
- ac_try=`$READELF -S conftest.so | sed -n \
|
||||
- '${x;p;}
|
||||
- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/
|
||||
- t a
|
||||
- b
|
||||
- : a
|
||||
- H'`
|
||||
- then
|
||||
- libc_seen_a=no libc_seen_b=no
|
||||
- set -- $ac_try
|
||||
- while test $# -ge 2 -a "$1" = NOTE; do
|
||||
- case "$2" in
|
||||
- .note.a) libc_seen_a=yes ;;
|
||||
- .note.b) libc_seen_b=yes ;;
|
||||
- esac
|
||||
- shift 2
|
||||
- done
|
||||
- case "$libc_seen_a$libc_seen_b" in
|
||||
- yesyes)
|
||||
- libc_cv_use_default_link=yes
|
||||
- ;;
|
||||
- *)
|
||||
- echo >&5 "\
|
||||
-$libc_seen_a$libc_seen_b from:
|
||||
-$ac_try"
|
||||
- ;;
|
||||
- esac
|
||||
- fi
|
||||
- rm -f conftest*
|
||||
-fi
|
||||
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_use_default_link" >&5
|
||||
-$as_echo "$libc_cv_use_default_link" >&6; }
|
||||
- use_default_link=$libc_cv_use_default_link
|
||||
-fi
|
||||
-
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for GLOB_DAT reloc" >&5
|
||||
$as_echo_n "checking for GLOB_DAT reloc... " >&6; }
|
||||
if ${libc_cv_has_glob_dat+:} false; then :
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index eb9431875fae1b0e..2c69af0807266e7e 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -153,7 +153,7 @@ AC_ARG_WITH([default-link],
|
||||
AS_HELP_STRING([--with-default-link],
|
||||
[do not use explicit linker scripts]),
|
||||
[use_default_link=$withval],
|
||||
- [use_default_link=default])
|
||||
+ [use_default_link=no])
|
||||
|
||||
dnl Additional build flags injection.
|
||||
AC_ARG_WITH([nonshared-cflags],
|
||||
@@ -1378,59 +1378,6 @@ fi
|
||||
rm -f conftest*])
|
||||
AC_SUBST(libc_cv_hashstyle)
|
||||
|
||||
-# The linker's default -shared behavior is good enough if it
|
||||
-# does these things that our custom linker scripts ensure that
|
||||
-# all allocated NOTE sections come first.
|
||||
-if test "$use_default_link" = default; then
|
||||
- AC_CACHE_CHECK([for sufficient default -shared layout],
|
||||
- libc_cv_use_default_link, [dnl
|
||||
- libc_cv_use_default_link=no
|
||||
- cat > conftest.s <<\EOF
|
||||
- .section .note.a,"a",%note
|
||||
- .balign 4
|
||||
- .long 4,4,9
|
||||
- .string "GNU"
|
||||
- .string "foo"
|
||||
- .section .note.b,"a",%note
|
||||
- .balign 4
|
||||
- .long 4,4,9
|
||||
- .string "GNU"
|
||||
- .string "bar"
|
||||
-EOF
|
||||
- if AC_TRY_COMMAND([dnl
|
||||
- ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD]) &&
|
||||
- ac_try=`$READELF -S conftest.so | sed -n \
|
||||
- ['${x;p;}
|
||||
- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/
|
||||
- t a
|
||||
- b
|
||||
- : a
|
||||
- H']`
|
||||
- then
|
||||
- libc_seen_a=no libc_seen_b=no
|
||||
- set -- $ac_try
|
||||
- while test $# -ge 2 -a "$1" = NOTE; do
|
||||
- case "$2" in
|
||||
- .note.a) libc_seen_a=yes ;;
|
||||
- .note.b) libc_seen_b=yes ;;
|
||||
- esac
|
||||
- shift 2
|
||||
- done
|
||||
- case "$libc_seen_a$libc_seen_b" in
|
||||
- yesyes)
|
||||
- libc_cv_use_default_link=yes
|
||||
- ;;
|
||||
- *)
|
||||
- echo >&AS_MESSAGE_LOG_FD "\
|
||||
-$libc_seen_a$libc_seen_b from:
|
||||
-$ac_try"
|
||||
- ;;
|
||||
- esac
|
||||
- fi
|
||||
- rm -f conftest*])
|
||||
- use_default_link=$libc_cv_use_default_link
|
||||
-fi
|
||||
-
|
||||
AC_CACHE_CHECK(for GLOB_DAT reloc,
|
||||
libc_cv_has_glob_dat, [dnl
|
||||
cat > conftest.c <<EOF
|
||||
diff --git a/elf/Makefile b/elf/Makefile
|
||||
index 8afbe3f6ab259331..fec6e23b5b625e3b 100644
|
||||
--- a/elf/Makefile
|
||||
+++ b/elf/Makefile
|
||||
@@ -504,6 +504,40 @@ tests-execstack-yes = \
|
||||
# tests-execstack-yes
|
||||
endif
|
||||
endif
|
||||
+
|
||||
+tests-special += $(objpfx)tst-relro-ldso.out $(objpfx)tst-relro-libc.out
|
||||
+$(objpfx)tst-relro-ldso.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
|
||||
+ $(objpfx)ld.so
|
||||
+ $(PYTHON) tst-relro-symbols.py $(objpfx)ld.so \
|
||||
+ --required=_rtld_global_ro \
|
||||
+ > $@ 2>&1; $(evaluate-test)
|
||||
+# The optional symbols are present in libc only if the architecture has
|
||||
+# the GLIBC_2.0 symbol set in libc.
|
||||
+$(objpfx)tst-relro-libc.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
|
||||
+ $(common-objpfx)libc.so
|
||||
+ $(PYTHON) tst-relro-symbols.py $(common-objpfx)libc.so \
|
||||
+ --required=_IO_cookie_jumps \
|
||||
+ --required=_IO_file_jumps \
|
||||
+ --required=_IO_file_jumps_maybe_mmap \
|
||||
+ --required=_IO_file_jumps_mmap \
|
||||
+ --required=_IO_helper_jumps \
|
||||
+ --required=_IO_mem_jumps \
|
||||
+ --required=_IO_obstack_jumps \
|
||||
+ --required=_IO_proc_jumps \
|
||||
+ --required=_IO_str_chk_jumps \
|
||||
+ --required=_IO_str_jumps \
|
||||
+ --required=_IO_strn_jumps \
|
||||
+ --required=_IO_wfile_jumps \
|
||||
+ --required=_IO_wfile_jumps_maybe_mmap \
|
||||
+ --required=_IO_wfile_jumps_mmap \
|
||||
+ --required=_IO_wmem_jumps \
|
||||
+ --required=_IO_wstr_jumps \
|
||||
+ --required=_IO_wstrn_jumps \
|
||||
+ --optional=_IO_old_cookie_jumps \
|
||||
+ --optional=_IO_old_file_jumps \
|
||||
+ --optional=_IO_old_proc_jumps \
|
||||
+ > $@ 2>&1; $(evaluate-test)
|
||||
+
|
||||
tests += $(tests-execstack-$(have-z-execstack))
|
||||
ifeq ($(run-built-tests),yes)
|
||||
tests-special += \
|
||||
diff --git a/elf/tst-relro-symbols.py b/elf/tst-relro-symbols.py
|
||||
new file mode 100644
|
||||
index 0000000000000000..368ea3349f86bd81
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-relro-symbols.py
|
||||
@@ -0,0 +1,137 @@
|
||||
+#!/usr/bin/python3
|
||||
+# Verify that certain symbols are covered by RELRO.
|
||||
+# Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+# This file is part of the GNU C Library.
|
||||
+#
|
||||
+# The GNU C Library is free software; you can redistribute it and/or
|
||||
+# modify it under the terms of the GNU Lesser General Public
|
||||
+# License as published by the Free Software Foundation; either
|
||||
+# version 2.1 of the License, or (at your option) any later version.
|
||||
+#
|
||||
+# The GNU C Library is distributed in the hope that it will be useful,
|
||||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+# Lesser General Public License for more details.
|
||||
+#
|
||||
+# You should have received a copy of the GNU Lesser General Public
|
||||
+# License along with the GNU C Library; if not, see
|
||||
+# <https://www.gnu.org/licenses/>.
|
||||
+
|
||||
+"""Analyze a (shared) object to verify that certain symbols are
|
||||
+present and covered by the PT_GNU_RELRO segment.
|
||||
+
|
||||
+"""
|
||||
+
|
||||
+import argparse
|
||||
+import os.path
|
||||
+import sys
|
||||
+
|
||||
+# Make available glibc Python modules.
|
||||
+sys.path.append(os.path.join(
|
||||
+ os.path.dirname(os.path.realpath(__file__)), os.path.pardir, 'scripts'))
|
||||
+
|
||||
+import glibcelf
|
||||
+
|
||||
+def find_relro(path: str, img: glibcelf.Image) -> (int, int):
|
||||
+ """Discover the address range of the PT_GNU_RELRO segment."""
|
||||
+ for phdr in img.phdrs():
|
||||
+ if phdr.p_type == glibcelf.Pt.PT_GNU_RELRO:
|
||||
+ # The computation is not entirely accurate because
|
||||
+ # _dl_protect_relro in elf/dl-reloc.c rounds both the
|
||||
+ # start end and downwards using the run-time page size.
|
||||
+ return phdr.p_vaddr, phdr.p_vaddr + phdr.p_memsz
|
||||
+ sys.stdout.write('{}: error: no PT_GNU_RELRO segment\n'.format(path))
|
||||
+ sys.exit(1)
|
||||
+
|
||||
+def check_in_relro(kind, relro_begin, relro_end, name, start, size, error):
|
||||
+ """Check if a section or symbol falls within in the RELRO segment."""
|
||||
+ end = start + size - 1
|
||||
+ if not (relro_begin <= start < end < relro_end):
|
||||
+ error(
|
||||
+ '{} {!r} of size {} at 0x{:x} is not in RELRO range [0x{:x}, 0x{:x})'.format(
|
||||
+ kind, name.decode('UTF-8'), start, size,
|
||||
+ relro_begin, relro_end))
|
||||
+
|
||||
+def get_parser():
|
||||
+ """Return an argument parser for this script."""
|
||||
+ parser = argparse.ArgumentParser(description=__doc__)
|
||||
+ parser.add_argument('object', help='path to object file to check')
|
||||
+ parser.add_argument('--required', metavar='NAME', default=(),
|
||||
+ help='required symbol names', nargs='*')
|
||||
+ parser.add_argument('--optional', metavar='NAME', default=(),
|
||||
+ help='required symbol names', nargs='*')
|
||||
+ return parser
|
||||
+
|
||||
+def main(argv):
|
||||
+ """The main entry point."""
|
||||
+ parser = get_parser()
|
||||
+ opts = parser.parse_args(argv)
|
||||
+ img = glibcelf.Image.readfile(opts.object)
|
||||
+
|
||||
+ required_symbols = frozenset([sym.encode('UTF-8')
|
||||
+ for sym in opts.required])
|
||||
+ optional_symbols = frozenset([sym.encode('UTF-8')
|
||||
+ for sym in opts.optional])
|
||||
+ check_symbols = required_symbols | optional_symbols
|
||||
+
|
||||
+ # Tracks the symbols in check_symbols that have been found.
|
||||
+ symbols_found = set()
|
||||
+
|
||||
+ # Discover the extent of the RELRO segment.
|
||||
+ relro_begin, relro_end = find_relro(opts.object, img)
|
||||
+ symbol_table_found = False
|
||||
+
|
||||
+ errors = False
|
||||
+ def error(msg: str) -> None:
|
||||
+ """Record an error condition and write a message to standard output."""
|
||||
+ nonlocal errors
|
||||
+ errors = True
|
||||
+ sys.stdout.write('{}: error: {}\n'.format(opts.object, msg))
|
||||
+
|
||||
+ # Iterate over section headers to find the symbol table.
|
||||
+ for shdr in img.shdrs():
|
||||
+ if shdr.sh_type == glibcelf.Sht.SHT_SYMTAB:
|
||||
+ symbol_table_found = True
|
||||
+ for sym in img.syms(shdr):
|
||||
+ if sym.st_name in check_symbols:
|
||||
+ symbols_found.add(sym.st_name)
|
||||
+
|
||||
+ # Validate symbol type, section, and size.
|
||||
+ if sym.st_info.type != glibcelf.Stt.STT_OBJECT:
|
||||
+ error('symbol {!r} has wrong type {}'.format(
|
||||
+ sym.st_name.decode('UTF-8'), sym.st_info.type))
|
||||
+ if sym.st_shndx in glibcelf.Shn:
|
||||
+ error('symbol {!r} has reserved section {}'.format(
|
||||
+ sym.st_name.decode('UTF-8'), sym.st_shndx))
|
||||
+ continue
|
||||
+ if sym.st_size == 0:
|
||||
+ error('symbol {!r} has size zero'.format(
|
||||
+ sym.st_name.decode('UTF-8')))
|
||||
+ continue
|
||||
+
|
||||
+ check_in_relro('symbol', relro_begin, relro_end,
|
||||
+ sym.st_name, sym.st_value, sym.st_size,
|
||||
+ error)
|
||||
+ continue # SHT_SYMTAB
|
||||
+ if shdr.sh_name == b'.data.rel.ro' \
|
||||
+ or shdr.sh_name.startswith(b'.data.rel.ro.'):
|
||||
+ check_in_relro('section', relro_begin, relro_end,
|
||||
+ shdr.sh_name, shdr.sh_addr, shdr.sh_size,
|
||||
+ error)
|
||||
+ continue
|
||||
+
|
||||
+ if required_symbols - symbols_found:
|
||||
+ for sym in sorted(required_symbols - symbols_found):
|
||||
+ error('symbol {!r} not found'.format(sym.decode('UTF-8')))
|
||||
+
|
||||
+ if errors:
|
||||
+ sys.exit(1)
|
||||
+
|
||||
+ if not symbol_table_found:
|
||||
+ sys.stdout.write(
|
||||
+ '{}: warning: no symbol table found (stripped object)\n'.format(
|
||||
+ opts.object))
|
||||
+ sys.exit(77)
|
||||
+
|
||||
+if __name__ == '__main__':
|
||||
+ main(sys.argv[1:])
|
||||
diff --git a/manual/install.texi b/manual/install.texi
|
||||
index 816b77a0a25a88a7..36a5af62bc5722b0 100644
|
||||
--- a/manual/install.texi
|
||||
+++ b/manual/install.texi
|
||||
@@ -117,6 +117,12 @@ problem and suppress these constructs, so that the library will still be
|
||||
usable, but functionality may be lost---for example, you can't build a
|
||||
shared libc with old binutils.
|
||||
|
||||
+@item --with-default-link=@var{FLAG}
|
||||
+With @code{--with-default-link=yes}, the build system does not use a
|
||||
+custom linker script for linking shared objects. The default for
|
||||
+@var{FLAG} is the opposite, @samp{no}, because the custom linker script
|
||||
+is needed for full RELRO protection.
|
||||
+
|
||||
@item --with-nonshared-cflags=@var{cflags}
|
||||
Use additional compiler flags @var{cflags} to build the parts of the
|
||||
library which are always statically linked into applications and
|
||||
diff --git a/sysdeps/unix/sysv/linux/ia64/Makefile b/sysdeps/unix/sysv/linux/ia64/Makefile
|
||||
index da85ba43e2d0ddef..c5cc41b3677d4a2a 100644
|
||||
--- a/sysdeps/unix/sysv/linux/ia64/Makefile
|
||||
+++ b/sysdeps/unix/sysv/linux/ia64/Makefile
|
||||
@@ -1,3 +1,9 @@
|
||||
+ifeq ($(subdir),elf)
|
||||
+# ia64 does not support PT_GNU_RELRO.
|
||||
+test-xfail-tst-relro-ldso = yes
|
||||
+test-xfail-tst-relro-libc = yes
|
||||
+endif
|
||||
+
|
||||
ifeq ($(subdir),misc)
|
||||
sysdep_headers += sys/rse.h
|
||||
endif
|
|
@ -0,0 +1,87 @@
|
|||
commit ca0faa140ff8cebe4c041d935f0f5eb480873d99
|
||||
Author: Joan Bruguera <joanbrugueram@gmail.com>
|
||||
Date: Mon Apr 11 19:49:56 2022 +0200
|
||||
|
||||
misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
|
||||
|
||||
If `__glibc_objsize (__o) == (size_t) -1` (i.e. `__o` is unknown size), fortify
|
||||
checks should pass, and `__whatever_alias` should be called.
|
||||
|
||||
Previously, `__glibc_objsize (__o) == (size_t) -1` was explicitly checked, but
|
||||
on commit a643f60c53876b, this was moved into `__glibc_safe_or_unknown_len`.
|
||||
|
||||
A comment says the -1 case should work as: "The -1 check is redundant because
|
||||
since it implies that __glibc_safe_len_cond is true.". But this fails when:
|
||||
* `__s > 1`
|
||||
* `__osz == -1` (i.e. unknown size at compile time)
|
||||
* `__l` is big enough
|
||||
* `__l * __s <= __osz` can be folded to a constant
|
||||
(I only found this to be true for `mbsrtowcs` and other functions in wchar2.h)
|
||||
|
||||
In this case `__l * __s <= __osz` is false, and `__whatever_chk_warn` will be
|
||||
called by `__glibc_fortify` or `__glibc_fortify_n` and crash the program.
|
||||
|
||||
This commit adds the explicit `__osz == -1` check again.
|
||||
moc crashes on startup due to this, see: https://bugs.archlinux.org/task/74041
|
||||
|
||||
Minimal test case (test.c):
|
||||
#include <wchar.h>
|
||||
|
||||
int main (void)
|
||||
{
|
||||
const char *hw = "HelloWorld";
|
||||
mbsrtowcs (NULL, &hw, (size_t)-1, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
Build with:
|
||||
gcc -O2 -Wp,-D_FORTIFY_SOURCE=2 test.c -o test && ./test
|
||||
|
||||
Output:
|
||||
*** buffer overflow detected ***: terminated
|
||||
|
||||
Fixes: BZ #29030
|
||||
Signed-off-by: Joan Bruguera <joanbrugueram@gmail.com>
|
||||
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
(cherry picked from commit 33e03f9cd2be4f2cd62f93fda539cc07d9c8130e)
|
||||
|
||||
diff --git a/debug/tst-fortify.c b/debug/tst-fortify.c
|
||||
index 8b5902423cf0ad88..fb02452f5993c594 100644
|
||||
--- a/debug/tst-fortify.c
|
||||
+++ b/debug/tst-fortify.c
|
||||
@@ -1505,6 +1505,11 @@ do_test (void)
|
||||
CHK_FAIL_END
|
||||
#endif
|
||||
|
||||
+ /* Bug 29030 regresion check */
|
||||
+ cp = "HelloWorld";
|
||||
+ if (mbsrtowcs (NULL, &cp, (size_t)-1, &s) != 10)
|
||||
+ FAIL ();
|
||||
+
|
||||
cp = "A";
|
||||
if (mbstowcs (wenough, cp, 10) != 1
|
||||
|| wcscmp (wenough, L"A") != 0)
|
||||
diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
|
||||
index 515fb681a0547217..b36013b9a6b4d9c3 100644
|
||||
--- a/misc/sys/cdefs.h
|
||||
+++ b/misc/sys/cdefs.h
|
||||
@@ -161,13 +161,13 @@
|
||||
|| (__builtin_constant_p (__l) && (__l) > 0))
|
||||
|
||||
/* Length is known to be safe at compile time if the __L * __S <= __OBJSZ
|
||||
- condition can be folded to a constant and if it is true. The -1 check is
|
||||
- redundant because since it implies that __glibc_safe_len_cond is true. */
|
||||
+ condition can be folded to a constant and if it is true, or unknown (-1) */
|
||||
#define __glibc_safe_or_unknown_len(__l, __s, __osz) \
|
||||
- (__glibc_unsigned_or_positive (__l) \
|
||||
- && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
|
||||
- __s, __osz)) \
|
||||
- && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), __s, __osz))
|
||||
+ ((__osz) == (__SIZE_TYPE__) -1 \
|
||||
+ || (__glibc_unsigned_or_positive (__l) \
|
||||
+ && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
|
||||
+ (__s), (__osz))) \
|
||||
+ && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), (__s), (__osz))))
|
||||
|
||||
/* Conversely, we know at compile time that the length is unsafe if the
|
||||
__L * __S <= __OBJSZ condition can be folded to a constant and if it is
|
|
@ -0,0 +1,49 @@
|
|||
commit 0d477e92c49db2906b32e44135b98746ccc73c7b
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Tue Apr 26 14:22:10 2022 +0200
|
||||
|
||||
INSTALL: Rephrase -with-default-link documentation
|
||||
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit c935789bdf40ba22b5698da869d3a4789797e09f)
|
||||
|
||||
diff --git a/INSTALL b/INSTALL
|
||||
index 60d01568d77645c7..10a3dcdc0a8db665 100644
|
||||
--- a/INSTALL
|
||||
+++ b/INSTALL
|
||||
@@ -90,10 +90,10 @@ if 'CFLAGS' is specified it must enable optimization. For example:
|
||||
library will still be usable, but functionality may be lost--for
|
||||
example, you can't build a shared libc with old binutils.
|
||||
|
||||
-'--with-default-link=FLAG'
|
||||
- With '--with-default-link=yes', the build system does not use a
|
||||
- custom linker script for linking shared objects. The default for
|
||||
- FLAG is the opposite, 'no', because the custom linker script is
|
||||
+'--with-default-link'
|
||||
+ With '--with-default-link', the build system does not use a custom
|
||||
+ linker script for linking shared objects. The default is
|
||||
+ '--without-default-link', because the custom linker script is
|
||||
needed for full RELRO protection.
|
||||
|
||||
'--with-nonshared-cflags=CFLAGS'
|
||||
diff --git a/manual/install.texi b/manual/install.texi
|
||||
index 36a5af62bc5722b0..8e34ff7e1847f3ae 100644
|
||||
--- a/manual/install.texi
|
||||
+++ b/manual/install.texi
|
||||
@@ -117,11 +117,11 @@ problem and suppress these constructs, so that the library will still be
|
||||
usable, but functionality may be lost---for example, you can't build a
|
||||
shared libc with old binutils.
|
||||
|
||||
-@item --with-default-link=@var{FLAG}
|
||||
-With @code{--with-default-link=yes}, the build system does not use a
|
||||
-custom linker script for linking shared objects. The default for
|
||||
-@var{FLAG} is the opposite, @samp{no}, because the custom linker script
|
||||
-is needed for full RELRO protection.
|
||||
+@item --with-default-link
|
||||
+With @code{--with-default-link}, the build system does not use a custom
|
||||
+linker script for linking shared objects. The default is
|
||||
+@code{--without-default-link}, because the custom linker script is
|
||||
+needed for full RELRO protection.
|
||||
|
||||
@item --with-nonshared-cflags=@var{cflags}
|
||||
Use additional compiler flags @var{cflags} to build the parts of the
|
|
@ -0,0 +1,377 @@
|
|||
commit bc56ab1f4aa937665034373d3e320d0779a839aa
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Tue Apr 26 14:23:02 2022 +0200
|
||||
|
||||
dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
|
||||
|
||||
When audit modules are loaded, ld.so initialization is not yet
|
||||
complete, and rtld_active () returns false even though ld.so is
|
||||
mostly working. Instead, the static dlopen hook is used, but that
|
||||
does not work at all because this is not a static dlopen situation.
|
||||
|
||||
Commit 466c1ea15f461edb8e3ffaf5d86d708876343bbf ("dlfcn: Rework
|
||||
static dlopen hooks") moved the hook pointer into _rtld_global_ro,
|
||||
which means that separate protection is not needed anymore and the
|
||||
hook pointer can be checked directly.
|
||||
|
||||
The guard for disabling libio vtable hardening in _IO_vtable_check
|
||||
should stay for now.
|
||||
|
||||
Fixes commit 8e1472d2c1e25e6eabc2059170731365f6d5b3d1 ("ld.so:
|
||||
Examine GLRO to detect inactive loader [BZ #20204]").
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
(cherry picked from commit 8dcb6d0af07fda3607b541857e4f3970a74ed55b)
|
||||
|
||||
diff --git a/dlfcn/dladdr.c b/dlfcn/dladdr.c
|
||||
index 1cc305f0c46e7c3b..0d07ae1cd4dbb7a2 100644
|
||||
--- a/dlfcn/dladdr.c
|
||||
+++ b/dlfcn/dladdr.c
|
||||
@@ -24,7 +24,7 @@ int
|
||||
__dladdr (const void *address, Dl_info *info)
|
||||
{
|
||||
#ifdef SHARED
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dladdr (address, info);
|
||||
#endif
|
||||
return _dl_addr (address, info, NULL, NULL);
|
||||
diff --git a/dlfcn/dladdr1.c b/dlfcn/dladdr1.c
|
||||
index 78560dbac208c316..93ce68c1d6067fe2 100644
|
||||
--- a/dlfcn/dladdr1.c
|
||||
+++ b/dlfcn/dladdr1.c
|
||||
@@ -24,7 +24,7 @@ int
|
||||
__dladdr1 (const void *address, Dl_info *info, void **extra, int flags)
|
||||
{
|
||||
#ifdef SHARED
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dladdr1 (address, info, extra, flags);
|
||||
#endif
|
||||
|
||||
diff --git a/dlfcn/dlclose.c b/dlfcn/dlclose.c
|
||||
index 6a013a81bb648191..07ecb21bf7d43be4 100644
|
||||
--- a/dlfcn/dlclose.c
|
||||
+++ b/dlfcn/dlclose.c
|
||||
@@ -24,7 +24,7 @@ int
|
||||
__dlclose (void *handle)
|
||||
{
|
||||
#ifdef SHARED
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dlclose (handle);
|
||||
#endif
|
||||
|
||||
diff --git a/dlfcn/dlerror.c b/dlfcn/dlerror.c
|
||||
index 5047b140662bc33e..63da79c63000eef0 100644
|
||||
--- a/dlfcn/dlerror.c
|
||||
+++ b/dlfcn/dlerror.c
|
||||
@@ -32,7 +32,7 @@ char *
|
||||
__dlerror (void)
|
||||
{
|
||||
# ifdef SHARED
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dlerror ();
|
||||
# endif
|
||||
|
||||
diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c
|
||||
index c6f9a1da09ff8622..47d2daa96fa5986f 100644
|
||||
--- a/dlfcn/dlinfo.c
|
||||
+++ b/dlfcn/dlinfo.c
|
||||
@@ -89,7 +89,7 @@ dlinfo_implementation (void *handle, int request, void *arg)
|
||||
int
|
||||
___dlinfo (void *handle, int request, void *arg)
|
||||
{
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dlinfo (handle, request, arg);
|
||||
else
|
||||
return dlinfo_implementation (handle, request, arg);
|
||||
diff --git a/dlfcn/dlmopen.c b/dlfcn/dlmopen.c
|
||||
index c171c8953da20fc7..2309224eb8484b1a 100644
|
||||
--- a/dlfcn/dlmopen.c
|
||||
+++ b/dlfcn/dlmopen.c
|
||||
@@ -80,7 +80,7 @@ dlmopen_implementation (Lmid_t nsid, const char *file, int mode,
|
||||
void *
|
||||
___dlmopen (Lmid_t nsid, const char *file, int mode)
|
||||
{
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dlmopen (nsid, file, mode, RETURN_ADDRESS (0));
|
||||
else
|
||||
return dlmopen_implementation (nsid, file, mode, RETURN_ADDRESS (0));
|
||||
diff --git a/dlfcn/dlopen.c b/dlfcn/dlopen.c
|
||||
index e04b374b82b04337..9c59c751c4eaf7a7 100644
|
||||
--- a/dlfcn/dlopen.c
|
||||
+++ b/dlfcn/dlopen.c
|
||||
@@ -75,7 +75,7 @@ dlopen_implementation (const char *file, int mode, void *dl_caller)
|
||||
void *
|
||||
___dlopen (const char *file, int mode)
|
||||
{
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
|
||||
else
|
||||
return dlopen_implementation (file, mode, RETURN_ADDRESS (0));
|
||||
diff --git a/dlfcn/dlopenold.c b/dlfcn/dlopenold.c
|
||||
index 9115501ac121eeca..c2f2a42194d50953 100644
|
||||
--- a/dlfcn/dlopenold.c
|
||||
+++ b/dlfcn/dlopenold.c
|
||||
@@ -70,7 +70,7 @@ __dlopen_nocheck (const char *file, int mode)
|
||||
mode |= RTLD_LAZY;
|
||||
args.mode = mode;
|
||||
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
|
||||
|
||||
return _dlerror_run (dlopen_doit, &args) ? NULL : args.new;
|
||||
diff --git a/dlfcn/dlsym.c b/dlfcn/dlsym.c
|
||||
index 43044cf7bb95801e..d3861170a7631d01 100644
|
||||
--- a/dlfcn/dlsym.c
|
||||
+++ b/dlfcn/dlsym.c
|
||||
@@ -62,7 +62,7 @@ dlsym_implementation (void *handle, const char *name, void *dl_caller)
|
||||
void *
|
||||
___dlsym (void *handle, const char *name)
|
||||
{
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dlsym (handle, name, RETURN_ADDRESS (0));
|
||||
else
|
||||
return dlsym_implementation (handle, name, RETURN_ADDRESS (0));
|
||||
diff --git a/dlfcn/dlvsym.c b/dlfcn/dlvsym.c
|
||||
index 9b76f9afa513e11f..3af02109c306b800 100644
|
||||
--- a/dlfcn/dlvsym.c
|
||||
+++ b/dlfcn/dlvsym.c
|
||||
@@ -65,7 +65,7 @@ dlvsym_implementation (void *handle, const char *name, const char *version,
|
||||
void *
|
||||
___dlvsym (void *handle, const char *name, const char *version)
|
||||
{
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->dlvsym (handle, name, version,
|
||||
RETURN_ADDRESS (0));
|
||||
else
|
||||
diff --git a/elf/Makefile b/elf/Makefile
|
||||
index fec6e23b5b625e3b..c89a6a58690646ee 100644
|
||||
--- a/elf/Makefile
|
||||
+++ b/elf/Makefile
|
||||
@@ -376,6 +376,7 @@ tests += \
|
||||
tst-audit24d \
|
||||
tst-audit25a \
|
||||
tst-audit25b \
|
||||
+ tst-audit26 \
|
||||
tst-auditmany \
|
||||
tst-auxobj \
|
||||
tst-auxobj-dlopen \
|
||||
@@ -721,6 +722,7 @@ modules-names = \
|
||||
tst-auditmod24c \
|
||||
tst-auditmod24d \
|
||||
tst-auditmod25 \
|
||||
+ tst-auditmod26 \
|
||||
tst-auxvalmod \
|
||||
tst-big-note-lib \
|
||||
tst-deep1mod1 \
|
||||
@@ -2194,6 +2196,10 @@ $(objpfx)tst-audit25b: $(objpfx)tst-audit25mod1.so \
|
||||
LDFLAGS-tst-audit25b = -Wl,-z,now
|
||||
tst-audit25b-ARGS = -- $(host-test-program-cmd)
|
||||
|
||||
+$(objpfx)tst-audit26.out: $(objpfx)tst-auditmod26.so
|
||||
+$(objpfx)tst-auditmod26.so: $(libsupport)
|
||||
+tst-audit26-ENV = LD_AUDIT=$(objpfx)tst-auditmod26.so
|
||||
+
|
||||
# tst-sonamemove links against an older implementation of the library.
|
||||
LDFLAGS-tst-sonamemove-linkmod1.so = \
|
||||
-Wl,--version-script=tst-sonamemove-linkmod1.map \
|
||||
diff --git a/elf/dl-libc.c b/elf/dl-libc.c
|
||||
index d5bc4a277f4c6ef3..db4342a3256921f0 100644
|
||||
--- a/elf/dl-libc.c
|
||||
+++ b/elf/dl-libc.c
|
||||
@@ -157,7 +157,7 @@ __libc_dlopen_mode (const char *name, int mode)
|
||||
args.caller_dlopen = RETURN_ADDRESS (0);
|
||||
|
||||
#ifdef SHARED
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->libc_dlopen_mode (name, mode);
|
||||
#endif
|
||||
return dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map;
|
||||
@@ -185,7 +185,7 @@ __libc_dlsym (void *map, const char *name)
|
||||
args.name = name;
|
||||
|
||||
#ifdef SHARED
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->libc_dlsym (map, name);
|
||||
#endif
|
||||
return (dlerror_run (do_dlsym, &args) ? NULL
|
||||
@@ -199,7 +199,7 @@ void *
|
||||
__libc_dlvsym (void *map, const char *name, const char *version)
|
||||
{
|
||||
#ifdef SHARED
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->libc_dlvsym (map, name, version);
|
||||
#endif
|
||||
|
||||
@@ -222,7 +222,7 @@ int
|
||||
__libc_dlclose (void *map)
|
||||
{
|
||||
#ifdef SHARED
|
||||
- if (!rtld_active ())
|
||||
+ if (GLRO (dl_dlfcn_hook) != NULL)
|
||||
return GLRO (dl_dlfcn_hook)->libc_dlclose (map);
|
||||
#endif
|
||||
return dlerror_run (do_dlclose, map);
|
||||
diff --git a/elf/tst-audit26.c b/elf/tst-audit26.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..3f920e83bac247a5
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-audit26.c
|
||||
@@ -0,0 +1,35 @@
|
||||
+/* Check the usability of <dlfcn.h> functions in audit modules.
|
||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <gnu/lib-names.h>
|
||||
+
|
||||
+#include <support/check.h>
|
||||
+#include <support/xdlfcn.h>
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ /* Check that the audit module has been loaded. */
|
||||
+ void *handle = xdlopen ("mapped to libc", RTLD_LOCAL | RTLD_NOW);
|
||||
+ TEST_VERIFY (handle
|
||||
+ == xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD));
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/elf/tst-auditmod26.c b/elf/tst-auditmod26.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..db7ba95abec20f53
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-auditmod26.c
|
||||
@@ -0,0 +1,104 @@
|
||||
+/* Check the usability of <dlfcn.h> functions in audit modules. Audit module.
|
||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <dlfcn.h>
|
||||
+#include <first-versions.h>
|
||||
+#include <gnu/lib-names.h>
|
||||
+#include <link.h>
|
||||
+#include <stdio.h>
|
||||
+#include <string.h>
|
||||
+#include <unistd.h>
|
||||
+
|
||||
+#include <support/check.h>
|
||||
+#include <support/xdlfcn.h>
|
||||
+
|
||||
+unsigned int
|
||||
+la_version (unsigned int current)
|
||||
+{
|
||||
+ /* Exercise various <dlfcn.h> functions. */
|
||||
+
|
||||
+ /* Check dlopen, dlsym, dlclose. */
|
||||
+ void *handle = xdlopen (LIBM_SO, RTLD_LOCAL | RTLD_NOW);
|
||||
+ void *ptr = xdlsym (handle, "sincos");
|
||||
+ TEST_VERIFY (ptr != NULL);
|
||||
+ ptr = dlsym (handle, "SINCOS");
|
||||
+ TEST_VERIFY (ptr == NULL);
|
||||
+ const char *message = dlerror ();
|
||||
+ TEST_VERIFY (strstr (message, ": undefined symbol: SINCOS") != NULL);
|
||||
+ ptr = dlsym (handle, "SINCOS");
|
||||
+ TEST_VERIFY (ptr == NULL);
|
||||
+ xdlclose (handle);
|
||||
+ TEST_COMPARE_STRING (dlerror (), NULL);
|
||||
+
|
||||
+ handle = xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD);
|
||||
+
|
||||
+ /* Check dlvsym. _exit is unlikely to gain another symbol
|
||||
+ version. */
|
||||
+ TEST_VERIFY (xdlsym (handle, "_exit")
|
||||
+ == xdlvsym (handle, "_exit", FIRST_VERSION_libc__exit_STRING));
|
||||
+
|
||||
+ /* Check dlinfo. */
|
||||
+ {
|
||||
+ void *handle2 = NULL;
|
||||
+ TEST_COMPARE (dlinfo (handle, RTLD_DI_LINKMAP, &handle2), 0);
|
||||
+ TEST_VERIFY (handle2 == handle);
|
||||
+ }
|
||||
+
|
||||
+ /* Check dladdr and dladdr1. */
|
||||
+ Dl_info info = { };
|
||||
+ TEST_VERIFY (dladdr (&_exit, &info) != 0);
|
||||
+ if (strcmp (info.dli_sname, "_Exit") != 0) /* _Exit is an alias. */
|
||||
+ TEST_COMPARE_STRING (info.dli_sname, "_exit");
|
||||
+ TEST_VERIFY (info.dli_saddr == &_exit);
|
||||
+ TEST_VERIFY (strstr (info.dli_fname, LIBC_SO));
|
||||
+ void *extra_info;
|
||||
+ memset (&info, 0, sizeof (info));
|
||||
+ TEST_VERIFY (dladdr1 (&_exit, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
|
||||
+ TEST_VERIFY (extra_info == handle);
|
||||
+
|
||||
+ /* Verify that dlmopen creates a new namespace. */
|
||||
+ void *dlmopen_handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW);
|
||||
+ TEST_VERIFY (dlmopen_handle != handle);
|
||||
+ memset (&info, 0, sizeof (info));
|
||||
+ extra_info = NULL;
|
||||
+ ptr = xdlsym (dlmopen_handle, "_exit");
|
||||
+ TEST_VERIFY (dladdr1 (ptr, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
|
||||
+ TEST_VERIFY (extra_info == dlmopen_handle);
|
||||
+ xdlclose (dlmopen_handle);
|
||||
+
|
||||
+ /* Terminate the process with an error state. This does not happen
|
||||
+ automatically because the audit module state is not shared with
|
||||
+ the main program. */
|
||||
+ if (support_record_failure_is_failed ())
|
||||
+ {
|
||||
+ fflush (stdout);
|
||||
+ fflush (stderr);
|
||||
+ _exit (1);
|
||||
+ }
|
||||
+
|
||||
+ return LAV_CURRENT;
|
||||
+}
|
||||
+
|
||||
+char *
|
||||
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
|
||||
+{
|
||||
+ if (strcmp (name, "mapped to libc") == 0)
|
||||
+ return (char *) LIBC_SO;
|
||||
+ else
|
||||
+ return (char *) name;
|
||||
+}
|
|
@ -0,0 +1,28 @@
|
|||
commit 83cc145830bdbefdabe03787ed884d548bea9c99
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Fri Apr 22 19:34:52 2022 +0200
|
||||
|
||||
scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
|
||||
|
||||
enum.IntFlag and enum.EnumMeta._missing_ support are not part of
|
||||
earlier Python versions.
|
||||
|
||||
(cherry picked from commit b571f3adffdcbed23f35ea39b0ca43809dbb4f5b)
|
||||
|
||||
diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py
|
||||
index 8f7d0ca184845714..da0d5380f33a195e 100644
|
||||
--- a/scripts/glibcelf.py
|
||||
+++ b/scripts/glibcelf.py
|
||||
@@ -28,6 +28,12 @@ import collections
|
||||
import enum
|
||||
import struct
|
||||
|
||||
+if not hasattr(enum, 'IntFlag'):
|
||||
+ import sys
|
||||
+ sys.stdout.write(
|
||||
+ 'warning: glibcelf.py needs Python 3.6 for enum support\n')
|
||||
+ sys.exit(77)
|
||||
+
|
||||
class _OpenIntEnum(enum.IntEnum):
|
||||
"""Integer enumeration that supports arbitrary int values."""
|
||||
@classmethod
|
|
@ -0,0 +1,254 @@
|
|||
commit 16245986fb9bfe396113fc7dfd1929f69a9e748e
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Fri Aug 20 06:42:24 2021 -0700
|
||||
|
||||
x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
|
||||
|
||||
Optimize loads of all bits set into ZMM register in AVX512 SVML codes
|
||||
by replacing
|
||||
|
||||
vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
|
||||
|
||||
and
|
||||
|
||||
vmovups .L_2il0floatpacket.13(%rip), %zmmX
|
||||
|
||||
with
|
||||
vpternlogd $0xff, %zmmX, %zmmX, %zmmX
|
||||
|
||||
This fixes BZ #28252.
|
||||
|
||||
(cherry picked from commit 78c9ec9000f873abe7a15a91b87080a2e4308260)
|
||||
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
|
||||
index e68fcdbb16a79f36..58e588a3d42a8bc9 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
|
||||
@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
|
||||
vmovaps %zmm0, %zmm8
|
||||
|
||||
/* Check for large arguments path */
|
||||
- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
|
||||
+ vpternlogd $0xff, %zmm2, %zmm2, %zmm2
|
||||
|
||||
/*
|
||||
ARGUMENT RANGE REDUCTION:
|
||||
@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN8v_cos_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.16:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.16,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
|
||||
index dfa2acafc486b56b..f5f117d474f66176 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
|
||||
@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
|
||||
|
||||
/* preserve mantissa, set input exponent to 2^(-10) */
|
||||
vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
|
||||
- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
|
||||
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm1
|
||||
vpsrlq $32, %zmm4, %zmm6
|
||||
|
||||
/* reciprocal approximation good to at least 11 bits */
|
||||
@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN8v_log_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.12:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.12,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
|
||||
index be8ab7c6e0e33819..48d251db16ccab9d 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
|
||||
@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
|
||||
andq $-64, %rsp
|
||||
subq $1280, %rsp
|
||||
movq __svml_d_trig_data@GOTPCREL(%rip), %rax
|
||||
- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm14
|
||||
vmovups __dAbsMask(%rax), %zmm7
|
||||
vmovups __dInvPI(%rax), %zmm2
|
||||
vmovups __dRShifter(%rax), %zmm1
|
||||
@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN8v_sin_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.14:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.14,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
|
||||
index 611887082a545854..a4944a4feef6aa98 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
|
||||
@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
|
||||
|
||||
/* SinPoly = SinR*SinPoly */
|
||||
vfmadd213pd %zmm5, %zmm5, %zmm4
|
||||
- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
|
||||
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
|
||||
|
||||
/* Update Cos result's sign */
|
||||
vxorpd %zmm2, %zmm1, %zmm1
|
||||
@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
|
||||
ENTRY (_ZGVeN8vvv_sincos_skx)
|
||||
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
|
||||
END (_ZGVeN8vvv_sincos_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.15:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.15,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
|
||||
index f671d60d5dab5a0e..fe8474fed943e8ad 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
|
||||
@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
|
||||
X = X - Y*PI1 - Y*PI2 - Y*PI3
|
||||
*/
|
||||
vmovaps %zmm0, %zmm6
|
||||
- vmovups .L_2il0floatpacket.13(%rip), %zmm12
|
||||
+ vpternlogd $0xff, %zmm12, %zmm12, %zmm12
|
||||
vmovups __sRShifter(%rax), %zmm3
|
||||
vmovups __sPI1_FMA(%rax), %zmm5
|
||||
vmovups __sA9_FMA(%rax), %zmm9
|
||||
@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN16v_cosf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.13:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.13,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
|
||||
index 637bfe3c06ab9ad4..229b7828cde04db2 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
|
||||
@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
|
||||
vmovaps %zmm0, %zmm7
|
||||
|
||||
/* compare against threshold */
|
||||
- vmovups .L_2il0floatpacket.13(%rip), %zmm3
|
||||
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
|
||||
vmovups __sInvLn2(%rax), %zmm4
|
||||
vmovups __sShifter(%rax), %zmm1
|
||||
vmovups __sLn2hi(%rax), %zmm6
|
||||
@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
|
||||
|
||||
#endif
|
||||
END (_ZGVeN16v_expf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.13:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.13,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
|
||||
index 9d790fbf0ad6c8ec..fa2aae986f543582 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
|
||||
@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
|
||||
andq $-64, %rsp
|
||||
subq $1280, %rsp
|
||||
movq __svml_slog_data@GOTPCREL(%rip), %rax
|
||||
- vmovups .L_2il0floatpacket.7(%rip), %zmm6
|
||||
+ vpternlogd $0xff, %zmm6, %zmm6, %zmm6
|
||||
vmovups _iBrkValue(%rax), %zmm4
|
||||
vmovups _sPoly_7(%rax), %zmm8
|
||||
|
||||
@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
|
||||
|
||||
#endif
|
||||
END (_ZGVeN16v_logf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.7:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.7,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
|
||||
index c5c43c46ff7af5a3..6aea2a4f11d1f85f 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
|
||||
@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
|
||||
vpsrlq $32, %zmm3, %zmm2
|
||||
vpmovqd %zmm2, %ymm11
|
||||
vcvtps2pd %ymm14, %zmm13
|
||||
- vmovups .L_2il0floatpacket.23(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
|
||||
vmovaps %zmm14, %zmm26
|
||||
vpandd _ABSMASK(%rax), %zmm1, %zmm8
|
||||
vpcmpd $1, _INF(%rax), %zmm8, %k2
|
||||
@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
|
||||
vpmovqd %zmm11, %ymm5
|
||||
vpxord %zmm10, %zmm10, %zmm10
|
||||
vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
|
||||
- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
|
||||
+ vpternlogd $0xff, %zmm4, %zmm4, %zmm4
|
||||
vpxord %zmm11, %zmm11, %zmm11
|
||||
vcvtdq2pd %ymm7, %zmm7
|
||||
vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
|
||||
@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN16vv_powf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.23:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.23,@object
|
||||
-.L_2il0floatpacket.24:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.24,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
|
||||
index 9cf359c86ff9bd70..a446c504f63c9399 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
|
||||
@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
|
||||
|
||||
/* Result sign calculations */
|
||||
vpternlogd $150, %zmm0, %zmm14, %zmm1
|
||||
- vmovups .L_2il0floatpacket.13(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
|
||||
|
||||
/* Add correction term 0.5 for cos() part */
|
||||
vaddps %zmm8, %zmm5, %zmm15
|
||||
@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
|
||||
ENTRY (_ZGVeN16vvv_sincosf_skx)
|
||||
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
|
||||
END (_ZGVeN16vvv_sincosf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.13:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.13,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
|
||||
index bd05109a62181f22..c1b352d0ad1992cd 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
|
||||
@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
|
||||
movq __svml_s_trig_data@GOTPCREL(%rip), %rax
|
||||
|
||||
/* Check for large and special values */
|
||||
- vmovups .L_2il0floatpacket.11(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
|
||||
vmovups __sAbsMask(%rax), %zmm5
|
||||
vmovups __sInvPI(%rax), %zmm1
|
||||
vmovups __sRShifter(%rax), %zmm2
|
||||
@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN16v_sinf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.11:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.11,@object
|
|
@ -0,0 +1,42 @@
|
|||
commit b5a44a6a471aafd3677659a610f32468c40a666b
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Tue Sep 21 18:31:49 2021 -0500
|
||||
|
||||
x86: Modify ENTRY in sysdep.h so that p2align can be specified
|
||||
|
||||
No bug.
|
||||
|
||||
This change adds a new macro ENTRY_P2ALIGN which takes a second
|
||||
argument, log2 of the desired function alignment.
|
||||
|
||||
The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
|
||||
doesn't affect any existing functionality.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
(cherry picked from commit fc5bd179ef3a953dff8d1655bd530d0e230ffe71)
|
||||
|
||||
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
|
||||
index cac1d762fb3f99d0..937180c1bd791570 100644
|
||||
--- a/sysdeps/x86/sysdep.h
|
||||
+++ b/sysdeps/x86/sysdep.h
|
||||
@@ -78,15 +78,18 @@ enum cf_protection_level
|
||||
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
|
||||
|
||||
/* Define an entry point visible from C. */
|
||||
-#define ENTRY(name) \
|
||||
+#define ENTRY_P2ALIGN(name, alignment) \
|
||||
.globl C_SYMBOL_NAME(name); \
|
||||
.type C_SYMBOL_NAME(name),@function; \
|
||||
- .align ALIGNARG(4); \
|
||||
+ .align ALIGNARG(alignment); \
|
||||
C_LABEL(name) \
|
||||
cfi_startproc; \
|
||||
_CET_ENDBR; \
|
||||
CALL_MCOUNT
|
||||
|
||||
+/* Common entry 16 byte aligns. */
|
||||
+#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
|
||||
+
|
||||
#undef END
|
||||
#define END(name) \
|
||||
cfi_endproc; \
|
|
@ -0,0 +1,653 @@
|
|||
commit 5ec3416853c4150c4d13312e05f93a053586d528
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Tue Sep 21 18:45:03 2021 -0500
|
||||
|
||||
x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
|
||||
|
||||
No bug.
|
||||
|
||||
The frontend optimizations are to:
|
||||
1. Reorganize logically connected basic blocks so they are either in
|
||||
the same cache line or adjacent cache lines.
|
||||
2. Avoid cases when basic blocks unnecissarily cross cache lines.
|
||||
3. Try and 32 byte align any basic blocks possible without sacrificing
|
||||
code size. Smaller / Less hot basic blocks are used for this.
|
||||
|
||||
Overall code size shrunk by 168 bytes. This should make up for any
|
||||
extra costs due to aligning to 64 bytes.
|
||||
|
||||
In general performance before deviated a great deal dependending on
|
||||
whether entry alignment % 64 was 0, 16, 32, or 48. These changes
|
||||
essentially make it so that the current implementation is at least
|
||||
equal to the best alignment of the original for any arguments.
|
||||
|
||||
The only additional optimization is in the page cross case. Branch on
|
||||
equals case was removed from the size == [4, 7] case. As well the [4,
|
||||
7] and [2, 3] case where swapped as [4, 7] is likely a more hot
|
||||
argument size.
|
||||
|
||||
test-memcmp and test-wmemcmp are both passing.
|
||||
|
||||
(cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
@@ -34,7 +34,24 @@
|
||||
area.
|
||||
7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
|
||||
8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
|
||||
- 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */
|
||||
+ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
|
||||
+
|
||||
+When possible the implementation tries to optimize for frontend in the
|
||||
+following ways:
|
||||
+Throughput:
|
||||
+ 1. All code sections that fit are able to run optimally out of the
|
||||
+ LSD.
|
||||
+ 2. All code sections that fit are able to run optimally out of the
|
||||
+ DSB
|
||||
+ 3. Basic blocks are contained in minimum number of fetch blocks
|
||||
+ necessary.
|
||||
+
|
||||
+Latency:
|
||||
+ 1. Logically connected basic blocks are put in the same
|
||||
+ cache-line.
|
||||
+ 2. Logically connected basic blocks that do not fit in the same
|
||||
+ cache-line are put in adjacent lines. This can get beneficial
|
||||
+ L2 spatial prefetching and L1 next-line prefetching. */
|
||||
|
||||
# include <sysdep.h>
|
||||
|
||||
@@ -47,9 +64,11 @@
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
# define CHAR_SIZE 4
|
||||
# define VPCMP vpcmpd
|
||||
+# define VPTEST vptestmd
|
||||
# else
|
||||
# define CHAR_SIZE 1
|
||||
# define VPCMP vpcmpub
|
||||
+# define VPTEST vptestmb
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
@@ -75,7 +94,9 @@
|
||||
*/
|
||||
|
||||
.section .text.evex,"ax",@progbits
|
||||
-ENTRY (MEMCMP)
|
||||
+/* Cache align memcmp entry. This allows for much more thorough
|
||||
+ frontend optimization. */
|
||||
+ENTRY_P2ALIGN (MEMCMP, 6)
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
|
||||
VPCMP $4, (%rdi), %YMM1, %k1
|
||||
kmovd %k1, %eax
|
||||
/* NB: eax must be destination register if going to
|
||||
- L(return_vec_[0,2]). For L(return_vec_3 destination register
|
||||
+ L(return_vec_[0,2]). For L(return_vec_3) destination register
|
||||
must be ecx. */
|
||||
testl %eax, %eax
|
||||
jnz L(return_vec_0)
|
||||
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
|
||||
testl %ecx, %ecx
|
||||
jnz L(return_vec_3)
|
||||
|
||||
- /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
|
||||
- compare with zero to get a mask is needed. */
|
||||
- vpxorq %XMM0, %XMM0, %XMM0
|
||||
-
|
||||
/* Go to 4x VEC loop. */
|
||||
cmpq $(CHAR_PER_VEC * 8), %rdx
|
||||
ja L(more_8x_vec)
|
||||
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
|
||||
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
||||
- /* Or together YMM1, YMM2, and YMM3 into YMM3. */
|
||||
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
||||
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
|
||||
- oring with YMM3. Result is stored in YMM4. */
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
|
||||
- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
|
||||
- VPCMP $4, %YMM4, %YMM0, %k1
|
||||
+ oring with YMM1. Result is stored in YMM4. */
|
||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
+
|
||||
+ /* Or together YMM2, YMM3, and YMM4 into YMM4. */
|
||||
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
+
|
||||
+ /* Test YMM4 against itself. Store any CHAR mismatches in k1.
|
||||
+ */
|
||||
+ VPTEST %YMM4, %YMM4, %k1
|
||||
+ /* k1 must go to ecx for L(return_vec_0_1_2_3). */
|
||||
kmovd %k1, %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(return_vec_0_1_2_3)
|
||||
/* NB: eax must be zero to reach here. */
|
||||
ret
|
||||
|
||||
- /* NB: aligning 32 here allows for the rest of the jump targets
|
||||
- to be tuned for 32 byte alignment. Most important this ensures
|
||||
- the L(more_8x_vec) loop is 32 byte aligned. */
|
||||
- .p2align 5
|
||||
-L(less_vec):
|
||||
- /* Check if one or less CHAR. This is necessary for size = 0 but
|
||||
- is also faster for size = CHAR_SIZE. */
|
||||
- cmpl $1, %edx
|
||||
- jbe L(one_or_less)
|
||||
+ .p2align 4
|
||||
+L(8x_end_return_vec_0_1_2_3):
|
||||
+ movq %rdx, %rdi
|
||||
+L(8x_return_vec_0_1_2_3):
|
||||
+ addq %rdi, %rsi
|
||||
+L(return_vec_0_1_2_3):
|
||||
+ VPTEST %YMM1, %YMM1, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
|
||||
- /* Check if loading one VEC from either s1 or s2 could cause a
|
||||
- page cross. This can have false positives but is by far the
|
||||
- fastest method. */
|
||||
- movl %edi, %eax
|
||||
- orl %esi, %eax
|
||||
- andl $(PAGE_SIZE - 1), %eax
|
||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
- jg L(page_cross_less_vec)
|
||||
+ VPTEST %YMM2, %YMM2, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_1)
|
||||
|
||||
- /* No page cross possible. */
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMP $4, (%rdi), %YMM2, %k1
|
||||
- kmovd %k1, %eax
|
||||
- /* Create mask in ecx for potentially in bound matches. */
|
||||
- bzhil %edx, %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
+ VPTEST %YMM3, %YMM3, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_2)
|
||||
+L(return_vec_3):
|
||||
+ /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
|
||||
+ fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
|
||||
+ line. */
|
||||
+ bsfl %ecx, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
@@ -209,10 +240,11 @@ L(return_vec_0):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- /* NB: No p2align necessary. Alignment % 16 is naturally 1
|
||||
- which is good enough for a target not in a loop. */
|
||||
+ .p2align 4
|
||||
L(return_vec_1):
|
||||
- tzcntl %eax, %eax
|
||||
+ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
|
||||
+ fetch block. */
|
||||
+ bsfl %eax, %eax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
xorl %edx, %edx
|
||||
@@ -226,10 +258,11 @@ L(return_vec_1):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- /* NB: No p2align necessary. Alignment % 16 is naturally 2
|
||||
- which is good enough for a target not in a loop. */
|
||||
+ .p2align 4,, 10
|
||||
L(return_vec_2):
|
||||
- tzcntl %eax, %eax
|
||||
+ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
|
||||
+ fetch block. */
|
||||
+ bsfl %eax, %eax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
xorl %edx, %edx
|
||||
@@ -243,40 +276,6 @@ L(return_vec_2):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(8x_return_vec_0_1_2_3):
|
||||
- /* Returning from L(more_8x_vec) requires restoring rsi. */
|
||||
- addq %rdi, %rsi
|
||||
-L(return_vec_0_1_2_3):
|
||||
- VPCMP $4, %YMM1, %YMM0, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
-
|
||||
- VPCMP $4, %YMM2, %YMM0, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_1)
|
||||
-
|
||||
- VPCMP $4, %YMM3, %YMM0, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_2)
|
||||
-L(return_vec_3):
|
||||
- tzcntl %ecx, %ecx
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
|
||||
- xorl %edx, %edx
|
||||
- cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
.p2align 4
|
||||
L(more_8x_vec):
|
||||
/* Set end of s1 in rdx. */
|
||||
@@ -288,21 +287,19 @@ L(more_8x_vec):
|
||||
andq $-VEC_SIZE, %rdi
|
||||
/* Adjust because first 4x vec where check already. */
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
+
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
VMOVU (%rsi, %rdi), %YMM1
|
||||
vpxorq (%rdi), %YMM1, %YMM1
|
||||
-
|
||||
VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
|
||||
vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
|
||||
-
|
||||
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
|
||||
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
||||
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
||||
-
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
|
||||
- VPCMP $4, %YMM4, %YMM0, %k1
|
||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
+ VPTEST %YMM4, %YMM4, %k1
|
||||
kmovd %k1, %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(8x_return_vec_0_1_2_3)
|
||||
@@ -319,28 +316,25 @@ L(loop_4x_vec):
|
||||
cmpl $(VEC_SIZE * 2), %edi
|
||||
jae L(8x_last_2x_vec)
|
||||
|
||||
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
|
||||
+
|
||||
VMOVU (%rsi, %rdx), %YMM1
|
||||
vpxorq (%rdx), %YMM1, %YMM1
|
||||
|
||||
VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
|
||||
vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
|
||||
-
|
||||
- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
|
||||
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
||||
-
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
|
||||
- VPCMP $4, %YMM4, %YMM0, %k1
|
||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
|
||||
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
+ VPTEST %YMM4, %YMM4, %k1
|
||||
kmovd %k1, %ecx
|
||||
- /* Restore s1 pointer to rdi. */
|
||||
- movq %rdx, %rdi
|
||||
testl %ecx, %ecx
|
||||
- jnz L(8x_return_vec_0_1_2_3)
|
||||
+ jnz L(8x_end_return_vec_0_1_2_3)
|
||||
/* NB: eax must be zero to reach here. */
|
||||
ret
|
||||
|
||||
/* Only entry is from L(more_8x_vec). */
|
||||
- .p2align 4
|
||||
+ .p2align 4,, 10
|
||||
L(8x_last_2x_vec):
|
||||
VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
|
||||
kmovd %k1, %eax
|
||||
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
|
||||
jnz L(8x_return_vec_3)
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ /* Not ideally aligned (at offset +9 bytes in fetch block) but
|
||||
+ not aligning keeps it in the same cache line as
|
||||
+ L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
|
||||
+ size. */
|
||||
+ .p2align 4,, 4
|
||||
+L(8x_return_vec_2):
|
||||
+ subq $VEC_SIZE, %rdx
|
||||
+L(8x_return_vec_3):
|
||||
+ bsfl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
|
||||
+ movl (VEC_SIZE * 3)(%rax), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ addq %rdx, %rax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ movzbl (VEC_SIZE * 3)(%rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4,, 10
|
||||
L(last_2x_vec):
|
||||
/* Check second to last VEC. */
|
||||
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
|
||||
@@ -374,26 +392,49 @@ L(last_1x_vec):
|
||||
jnz L(return_vec_0_end)
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(8x_return_vec_2):
|
||||
- subq $VEC_SIZE, %rdx
|
||||
-L(8x_return_vec_3):
|
||||
- tzcntl %eax, %eax
|
||||
+ .p2align 4,, 10
|
||||
+L(return_vec_1_end):
|
||||
+ /* Use bsf to save code size. This is necessary to have
|
||||
+ L(one_or_less) fit in aligning bytes between. */
|
||||
+ bsfl %eax, %eax
|
||||
+ addl %edx, %eax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- leaq (%rdx, %rax, CHAR_SIZE), %rax
|
||||
- movl (VEC_SIZE * 3)(%rax), %ecx
|
||||
+ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
xorl %edx, %edx
|
||||
- cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
|
||||
setg %dl
|
||||
leal -1(%rdx, %rdx), %eax
|
||||
# else
|
||||
- addq %rdx, %rax
|
||||
- movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
- movzbl (VEC_SIZE * 3)(%rax), %eax
|
||||
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
|
||||
subl %ecx, %eax
|
||||
# endif
|
||||
ret
|
||||
|
||||
+ /* NB: L(one_or_less) fits in alignment padding between
|
||||
+ L(return_vec_1_end) and L(return_vec_0_end). */
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+L(one_or_less):
|
||||
+ jb L(zero)
|
||||
+ movl (%rdi), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (%rsi), %ecx
|
||||
+ je L(zero)
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+ ret
|
||||
+# else
|
||||
+L(one_or_less):
|
||||
+ jb L(zero)
|
||||
+ movzbl (%rsi), %ecx
|
||||
+ movzbl (%rdi), %eax
|
||||
+ subl %ecx, %eax
|
||||
+ ret
|
||||
+# endif
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
.p2align 4
|
||||
L(return_vec_0_end):
|
||||
tzcntl %eax, %eax
|
||||
@@ -412,23 +453,56 @@ L(return_vec_0_end):
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(return_vec_1_end):
|
||||
+L(less_vec):
|
||||
+ /* Check if one or less CHAR. This is necessary for size == 0
|
||||
+ but is also faster for size == CHAR_SIZE. */
|
||||
+ cmpl $1, %edx
|
||||
+ jbe L(one_or_less)
|
||||
+
|
||||
+ /* Check if loading one VEC from either s1 or s2 could cause a
|
||||
+ page cross. This can have false positives but is by far the
|
||||
+ fastest method. */
|
||||
+ movl %edi, %eax
|
||||
+ orl %esi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ jg L(page_cross_less_vec)
|
||||
+
|
||||
+ /* No page cross possible. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMP $4, (%rdi), %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ /* Check if any matches where in bounds. Intentionally not
|
||||
+ storing result in eax to limit dependency chain if it goes to
|
||||
+ L(return_vec_0_lv). */
|
||||
+ bzhil %edx, %eax, %edx
|
||||
+ jnz L(return_vec_0_lv)
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
+ /* Essentially duplicate of L(return_vec_0). Ends up not costing
|
||||
+ any code as shrinks L(less_vec) by allowing 2-byte encoding of
|
||||
+ the jump and ends up fitting in aligning bytes. As well fits on
|
||||
+ same cache line as L(less_vec) so also saves a line from having
|
||||
+ to be fetched on cold calls to memcmp. */
|
||||
+ .p2align 4,, 4
|
||||
+L(return_vec_0_lv):
|
||||
tzcntl %eax, %eax
|
||||
- addl %edx, %eax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
|
||||
xorl %edx, %edx
|
||||
- cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ /* NB: no partial register stall here because xorl zero idiom
|
||||
+ above. */
|
||||
setg %dl
|
||||
leal -1(%rdx, %rdx), %eax
|
||||
# else
|
||||
- movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
- movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
|
||||
+ movzbl (%rsi, %rax), %ecx
|
||||
+ movzbl (%rdi, %rax), %eax
|
||||
subl %ecx, %eax
|
||||
# endif
|
||||
ret
|
||||
|
||||
-
|
||||
.p2align 4
|
||||
L(page_cross_less_vec):
|
||||
/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
|
||||
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
|
||||
cmpl $8, %edx
|
||||
jae L(between_8_15)
|
||||
cmpl $4, %edx
|
||||
- jae L(between_4_7)
|
||||
-L(between_2_3):
|
||||
- /* Load as big endian to avoid branches. */
|
||||
- movzwl (%rdi), %eax
|
||||
- movzwl (%rsi), %ecx
|
||||
- shll $8, %eax
|
||||
- shll $8, %ecx
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
- movzbl -1(%rdi, %rdx), %edi
|
||||
- movzbl -1(%rsi, %rdx), %esi
|
||||
- orl %edi, %eax
|
||||
- orl %esi, %ecx
|
||||
- /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
- subl %ecx, %eax
|
||||
- ret
|
||||
- .p2align 4
|
||||
-L(one_or_less):
|
||||
- jb L(zero)
|
||||
- movzbl (%rsi), %ecx
|
||||
- movzbl (%rdi), %eax
|
||||
- subl %ecx, %eax
|
||||
+ jb L(between_2_3)
|
||||
+
|
||||
+ /* Load as big endian with overlapping movbe to avoid branches.
|
||||
+ */
|
||||
+ movbe (%rdi), %eax
|
||||
+ movbe (%rsi), %ecx
|
||||
+ shlq $32, %rax
|
||||
+ shlq $32, %rcx
|
||||
+ movbe -4(%rdi, %rdx), %edi
|
||||
+ movbe -4(%rsi, %rdx), %esi
|
||||
+ orq %rdi, %rax
|
||||
+ orq %rsi, %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ /* edx is guranteed to be positive int32 in range [4, 7]. */
|
||||
+ cmovne %edx, %eax
|
||||
+ /* ecx is -1 if rcx > rax. Otherwise 0. */
|
||||
+ sbbl %ecx, %ecx
|
||||
+ /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
|
||||
+ rax then eax and ecx are zero. If rax < rax then ecx is -1 so
|
||||
+ eax doesn't matter. */
|
||||
+ orl %ecx, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 4,, 8
|
||||
L(between_8_15):
|
||||
# endif
|
||||
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
||||
- vmovq (%rdi), %XMM1
|
||||
- vmovq (%rsi), %XMM2
|
||||
- VPCMP $4, %XMM1, %XMM2, %k1
|
||||
+ vmovq (%rdi), %xmm1
|
||||
+ vmovq (%rsi), %xmm2
|
||||
+ VPCMP $4, %xmm1, %xmm2, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
+ jnz L(return_vec_0_lv)
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
- leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi
|
||||
- leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi
|
||||
- vmovq (%rdi), %XMM1
|
||||
- vmovq (%rsi), %XMM2
|
||||
- VPCMP $4, %XMM1, %XMM2, %k1
|
||||
+ vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
|
||||
+ vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
+ VPCMP $4, %xmm1, %xmm2, %k1
|
||||
+ addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
+ jnz L(return_vec_0_end)
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 4,, 8
|
||||
L(between_16_31):
|
||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
||||
- VMOVU (%rsi), %XMM2
|
||||
- VPCMP $4, (%rdi), %XMM2, %k1
|
||||
+
|
||||
+ /* Use movups to save code size. */
|
||||
+ movups (%rsi), %xmm2
|
||||
+ VPCMP $4, (%rdi), %xmm2, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
-
|
||||
+ jnz L(return_vec_0_lv)
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
-
|
||||
- VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2
|
||||
- leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi
|
||||
- leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi
|
||||
- VPCMP $4, (%rdi), %XMM2, %k1
|
||||
+ movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
+ VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
|
||||
+ addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
- ret
|
||||
-
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- .p2align 4
|
||||
-L(one_or_less):
|
||||
- jb L(zero)
|
||||
- movl (%rdi), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi), %ecx
|
||||
- je L(zero)
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
+ jnz L(return_vec_0_end)
|
||||
ret
|
||||
-# else
|
||||
|
||||
- .p2align 4
|
||||
-L(between_4_7):
|
||||
- /* Load as big endian with overlapping movbe to avoid branches.
|
||||
- */
|
||||
- movbe (%rdi), %eax
|
||||
- movbe (%rsi), %ecx
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
- movbe -4(%rdi, %rdx), %edi
|
||||
- movbe -4(%rsi, %rdx), %esi
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- jz L(zero_4_7)
|
||||
- sbbl %eax, %eax
|
||||
- orl $1, %eax
|
||||
-L(zero_4_7):
|
||||
+# ifndef USE_AS_WMEMCMP
|
||||
+L(between_2_3):
|
||||
+ /* Load as big endian to avoid branches. */
|
||||
+ movzwl (%rdi), %eax
|
||||
+ movzwl (%rsi), %ecx
|
||||
+ shll $8, %eax
|
||||
+ shll $8, %ecx
|
||||
+ bswap %eax
|
||||
+ bswap %ecx
|
||||
+ movzbl -1(%rdi, %rdx), %edi
|
||||
+ movzbl -1(%rsi, %rdx), %esi
|
||||
+ orl %edi, %eax
|
||||
+ orl %esi, %ecx
|
||||
+ /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
+ subl %ecx, %eax
|
||||
ret
|
||||
# endif
|
||||
-
|
||||
END (MEMCMP)
|
||||
#endif
|
|
@@ -0,0 +1,497 @@
|
|||
commit 6d18a93dbbde2958001d65dff3080beed7ae675a
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon Sep 20 16:20:15 2021 -0500
|
||||
|
||||
x86: Optimize memset-vec-unaligned-erms.S
|
||||
|
||||
No bug.
|
||||
|
||||
Optimizations are:
|
||||
|
||||
1. Change control flow for L(more_2x_vec) to fall through to loop and
|
||||
jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
|
||||
size and saves jumps for length > 4x VEC_SIZE.
|
||||
|
||||
2. For EVEX/AVX512 move L(less_vec) closer to entry.
|
||||
|
||||
3. Avoid complex address mode for length > 2x VEC_SIZE.
|
||||
|
||||
4. Slightly better aligning code for the loop from the perspective of
|
||||
code size and uops.
|
||||
|
||||
5. Align targets so they make full use of their fetch block and if
|
||||
possible cache line.
|
||||
|
||||
6. Try to reduce the total number of icache lines that will need to be
|
||||
pulled in for a given length.
|
||||
|
||||
7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
|
||||
jumping to the stosb target in the sse2 code section will almost
|
||||
certainly be to a new page. The new version does increase code size
|
||||
marginally by duplicating the target but should get better iTLB
|
||||
behavior as a result.
|
||||
|
||||
test-memset, test-wmemset, and test-bzero are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e)
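The small-size paths below rely on an overlapping head/tail store
pattern. A scalar C model of it (a sketch under the assumption that
the caller has already checked the size range; CHUNK stands in for
VEC_SIZE and the names are illustrative):

    #include <string.h>
    #include <stddef.h>

    #define CHUNK 16

    /* Caller guarantees CHUNK <= n <= 2 * CHUNK.  One store at the
       start and one at the end cover every byte of [0, n) without a
       size branch; the two stores simply overlap when n < 2 * CHUNK.  */
    static void
    set_chunk_to_2x_chunk (unsigned char *dst, int c, size_t n)
    {
      unsigned char v[CHUNK];
      memset (v, c, CHUNK);               /* stands in for splatted VEC(0) */
      memcpy (dst, v, CHUNK);             /* VMOVU %VEC(0), (%rax) */
      memcpy (dst + n - CHUNK, v, CHUNK); /* VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) */
    }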
|
||||
|
||||
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
||||
index 7d4a327eba29ecb4..0137eba4cdd9f830 100644
|
||||
--- a/sysdeps/x86_64/memset.S
|
||||
+++ b/sysdeps/x86_64/memset.S
|
||||
@@ -18,13 +18,15 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
+#define USE_WITH_SSE2 1
|
||||
|
||||
#define VEC_SIZE 16
|
||||
+#define MOV_SIZE 3
|
||||
+#define RET_SIZE 1
|
||||
+
|
||||
#define VEC(i) xmm##i
|
||||
-/* Don't use movups and movaps since it will get larger nop paddings for
|
||||
- alignment. */
|
||||
-#define VMOVU movdqu
|
||||
-#define VMOVA movdqa
|
||||
+#define VMOVU movups
|
||||
+#define VMOVA movaps
|
||||
|
||||
#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
movd d, %xmm0; \
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
index ae0860f36a47d594..1af668af0aeda59e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
@@ -1,8 +1,14 @@
|
||||
#if IS_IN (libc)
|
||||
+# define USE_WITH_AVX2 1
|
||||
+
|
||||
# define VEC_SIZE 32
|
||||
+# define MOV_SIZE 4
|
||||
+# define RET_SIZE 4
|
||||
+
|
||||
# define VEC(i) ymm##i
|
||||
-# define VMOVU vmovdqu
|
||||
-# define VMOVA vmovdqa
|
||||
+
|
||||
+# define VMOVU vmovdqu
|
||||
+# define VMOVA vmovdqa
|
||||
|
||||
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
vmovd d, %xmm0; \
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
index 8ad842fc2f140527..f14d6f8493c21a36 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
@@ -1,11 +1,18 @@
|
||||
#if IS_IN (libc)
|
||||
+# define USE_WITH_AVX512 1
|
||||
+
|
||||
# define VEC_SIZE 64
|
||||
+# define MOV_SIZE 6
|
||||
+# define RET_SIZE 1
|
||||
+
|
||||
# define XMM0 xmm16
|
||||
# define YMM0 ymm16
|
||||
# define VEC0 zmm16
|
||||
# define VEC(i) VEC##i
|
||||
-# define VMOVU vmovdqu64
|
||||
-# define VMOVA vmovdqa64
|
||||
+
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
+
|
||||
# define VZEROUPPER
|
||||
|
||||
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
index 640f092903302ad0..64b09e77cc20cc42 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
@@ -1,11 +1,18 @@
|
||||
#if IS_IN (libc)
|
||||
+# define USE_WITH_EVEX 1
|
||||
+
|
||||
# define VEC_SIZE 32
|
||||
+# define MOV_SIZE 6
|
||||
+# define RET_SIZE 1
|
||||
+
|
||||
# define XMM0 xmm16
|
||||
# define YMM0 ymm16
|
||||
# define VEC0 ymm16
|
||||
# define VEC(i) VEC##i
|
||||
-# define VMOVU vmovdqu64
|
||||
-# define VMOVA vmovdqa64
|
||||
+
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
+
|
||||
# define VZEROUPPER
|
||||
|
||||
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index ff196844a093dc3b..e723413a664c088f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -63,8 +63,27 @@
|
||||
# endif
|
||||
#endif
|
||||
|
||||
+#if VEC_SIZE == 64
|
||||
+# define LOOP_4X_OFFSET (VEC_SIZE * 4)
|
||||
+#else
|
||||
+# define LOOP_4X_OFFSET (0)
|
||||
+#endif
|
||||
+
|
||||
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
+# define END_REG rcx
|
||||
+# define LOOP_REG rdi
|
||||
+#else
|
||||
+# define END_REG rdi
|
||||
+# define LOOP_REG rdx
|
||||
+#endif
|
||||
+
|
||||
#define PAGE_SIZE 4096
|
||||
|
||||
+/* Macro to calculate size of small memset block for aligning
|
||||
+ purposes. */
|
||||
+#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1)
|
||||
+
|
||||
+
|
||||
#ifndef SECTION
|
||||
# error SECTION is not defined!
|
||||
#endif
|
||||
@@ -74,6 +93,7 @@
|
||||
ENTRY (__bzero)
|
||||
mov %RDI_LP, %RAX_LP /* Set return value. */
|
||||
mov %RSI_LP, %RDX_LP /* Set n. */
|
||||
+ xorl %esi, %esi
|
||||
pxor %XMM0, %XMM0
|
||||
jmp L(entry_from_bzero)
|
||||
END (__bzero)
|
||||
@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||||
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
|
||||
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
jb L(less_vec)
|
||||
cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(stosb_more_2x_vec)
|
||||
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
- VMOVU %VEC(0), (%rdi)
|
||||
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
|
||||
+ */
|
||||
+ VMOVU %VEC(0), (%rax)
|
||||
+ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
-
|
||||
- .p2align 4
|
||||
-L(stosb_more_2x_vec):
|
||||
- cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
|
||||
- ja L(stosb)
|
||||
-#else
|
||||
- .p2align 4
|
||||
#endif
|
||||
-L(more_2x_vec):
|
||||
- /* Stores to first 2x VEC before cmp as any path forward will
|
||||
- require it. */
|
||||
- VMOVU %VEC(0), (%rdi)
|
||||
- VMOVU %VEC(0), VEC_SIZE(%rdi)
|
||||
- cmpq $(VEC_SIZE * 4), %rdx
|
||||
- ja L(loop_start)
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
-L(return):
|
||||
-#if VEC_SIZE > 16
|
||||
- ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+
|
||||
+ .p2align 4,, 10
|
||||
+L(last_2x_vec):
|
||||
+#ifdef USE_LESS_VEC_MASK_STORE
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
|
||||
#else
|
||||
- ret
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
|
||||
#endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
-L(loop_start):
|
||||
- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
|
||||
- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
|
||||
- cmpq $(VEC_SIZE * 8), %rdx
|
||||
- jbe L(loop_end)
|
||||
- andq $-(VEC_SIZE * 2), %rdi
|
||||
- subq $-(VEC_SIZE * 4), %rdi
|
||||
- leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx
|
||||
- .p2align 4
|
||||
-L(loop):
|
||||
- VMOVA %VEC(0), (%rdi)
|
||||
- VMOVA %VEC(0), VEC_SIZE(%rdi)
|
||||
- VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi)
|
||||
- VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi)
|
||||
- subq $-(VEC_SIZE * 4), %rdi
|
||||
- cmpq %rcx, %rdi
|
||||
- jb L(loop)
|
||||
-L(loop_end):
|
||||
- /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
|
||||
- rdx as length is also unchanged. */
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
||||
- VZEROUPPER_SHORT_RETURN
|
||||
-
|
||||
- .p2align 4
|
||||
+ /* If have AVX512 mask instructions put L(less_vec) close to
|
||||
+ entry as it doesn't take much space and is likely a hot target.
|
||||
+ */
|
||||
+#ifdef USE_LESS_VEC_MASK_STORE
|
||||
+ .p2align 4,, 10
|
||||
L(less_vec):
|
||||
/* Less than 1 VEC. */
|
||||
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||
# error Unsupported VEC_SIZE!
|
||||
# endif
|
||||
-# ifdef USE_LESS_VEC_MASK_STORE
|
||||
/* Clear high bits from edi. Only keeping bits relevant to page
|
||||
cross check. Note that we are using rax which is set in
|
||||
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
|
||||
- */
|
||||
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
|
||||
andl $(PAGE_SIZE - 1), %edi
|
||||
- /* Check if VEC_SIZE store cross page. Mask stores suffer serious
|
||||
- performance degradation when it has to fault supress. */
|
||||
+ /* Check if VEC_SIZE store cross page. Mask stores suffer
|
||||
+ serious performance degradation when it has to fault supress.
|
||||
+ */
|
||||
cmpl $(PAGE_SIZE - VEC_SIZE), %edi
|
||||
+ /* This is generally considered a cold target. */
|
||||
ja L(cross_page)
|
||||
# if VEC_SIZE > 32
|
||||
movq $-1, %rcx
|
||||
@@ -247,58 +235,185 @@ L(less_vec):
|
||||
bzhil %edx, %ecx, %ecx
|
||||
kmovd %ecx, %k1
|
||||
# endif
|
||||
- vmovdqu8 %VEC(0), (%rax) {%k1}
|
||||
+ vmovdqu8 %VEC(0), (%rax){%k1}
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+# if defined USE_MULTIARCH && IS_IN (libc)
|
||||
+ /* Include L(stosb_local) here if including L(less_vec) between
|
||||
+ L(stosb_more_2x_vec) and ENTRY. This is to cache align the
|
||||
+ L(stosb_more_2x_vec) target. */
|
||||
+ .p2align 4,, 10
|
||||
+L(stosb_local):
|
||||
+ movzbl %sil, %eax
|
||||
+ mov %RDX_LP, %RCX_LP
|
||||
+ mov %RDI_LP, %RDX_LP
|
||||
+ rep stosb
|
||||
+ mov %RDX_LP, %RAX_LP
|
||||
+ VZEROUPPER_RETURN
|
||||
+# endif
|
||||
+#endif
|
||||
+
|
||||
+#if defined USE_MULTIARCH && IS_IN (libc)
|
||||
.p2align 4
|
||||
-L(cross_page):
|
||||
+L(stosb_more_2x_vec):
|
||||
+ cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
|
||||
+ ja L(stosb_local)
|
||||
+#endif
|
||||
+ /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
|
||||
+ and (4x, 8x] jump to target. */
|
||||
+L(more_2x_vec):
|
||||
+
|
||||
+ /* Two different methods of setting up pointers / compare. The
|
||||
+ two methods are based on the fact that EVEX/AVX512 mov
|
||||
+ instructions take more bytes then AVX2/SSE2 mov instructions. As
|
||||
+ well that EVEX/AVX512 machines also have fast LEA_BID. Both
|
||||
+ setup and END_REG to avoid complex address mode. For EVEX/AVX512
|
||||
+ this saves code size and keeps a few targets in one fetch block.
|
||||
+ For AVX2/SSE2 this helps prevent AGU bottlenecks. */
|
||||
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
|
||||
+ LOOP_4X_OFFSET) with LEA_BID. */
|
||||
+
|
||||
+ /* END_REG is rcx for EVEX/AVX512. */
|
||||
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
||||
+#endif
|
||||
+
|
||||
+ /* Stores to first 2x VEC before cmp as any path forward will
|
||||
+ require it. */
|
||||
+ VMOVU %VEC(0), (%rax)
|
||||
+ VMOVU %VEC(0), VEC_SIZE(%rax)
|
||||
+
|
||||
+
|
||||
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
|
||||
+ /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
|
||||
+ addq %rdx, %END_REG
|
||||
+#endif
|
||||
+
|
||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
||||
+ jbe L(last_2x_vec)
|
||||
+
|
||||
+ /* Store next 2x vec regardless. */
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
|
||||
+
|
||||
+
|
||||
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
+ /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
|
||||
+ extra offset to addresses in loop. Used for AVX512 to save space
|
||||
+ as no way to get (VEC_SIZE * 4) in imm8. */
|
||||
+# if LOOP_4X_OFFSET == 0
|
||||
+ subq $-(VEC_SIZE * 4), %LOOP_REG
|
||||
# endif
|
||||
-# if VEC_SIZE > 32
|
||||
- cmpb $32, %dl
|
||||
- jae L(between_32_63)
|
||||
+ /* Avoid imm32 compare here to save code size. */
|
||||
+ cmpq %rdi, %rcx
|
||||
+#else
|
||||
+ addq $-(VEC_SIZE * 4), %END_REG
|
||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
||||
+#endif
|
||||
+ jbe L(last_4x_vec)
|
||||
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
|
||||
+ /* Set LOOP_REG (rdx). */
|
||||
+ leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
|
||||
+#endif
|
||||
+ /* Align dst for loop. */
|
||||
+ andq $(VEC_SIZE * -2), %LOOP_REG
|
||||
+ .p2align 4
|
||||
+L(loop):
|
||||
+ VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
|
||||
+ VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
|
||||
+ VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
|
||||
+ VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
|
||||
+ subq $-(VEC_SIZE * 4), %LOOP_REG
|
||||
+ cmpq %END_REG, %LOOP_REG
|
||||
+ jb L(loop)
|
||||
+ .p2align 4,, MOV_SIZE
|
||||
+L(last_4x_vec):
|
||||
+ VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
|
||||
+L(return):
|
||||
+#if VEC_SIZE > 16
|
||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+#else
|
||||
+ ret
|
||||
+#endif
|
||||
+
|
||||
+ .p2align 4,, 10
|
||||
+#ifndef USE_LESS_VEC_MASK_STORE
|
||||
+# if defined USE_MULTIARCH && IS_IN (libc)
|
||||
+ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
|
||||
+ range for 2-byte jump encoding. */
|
||||
+L(stosb_local):
|
||||
+ movzbl %sil, %eax
|
||||
+ mov %RDX_LP, %RCX_LP
|
||||
+ mov %RDI_LP, %RDX_LP
|
||||
+ rep stosb
|
||||
+ mov %RDX_LP, %RAX_LP
|
||||
+ VZEROUPPER_RETURN
|
||||
# endif
|
||||
-# if VEC_SIZE > 16
|
||||
- cmpb $16, %dl
|
||||
+ /* Define L(less_vec) only if not otherwise defined. */
|
||||
+ .p2align 4
|
||||
+L(less_vec):
|
||||
+#endif
|
||||
+L(cross_page):
|
||||
+#if VEC_SIZE > 32
|
||||
+ cmpl $32, %edx
|
||||
+ jae L(between_32_63)
|
||||
+#endif
|
||||
+#if VEC_SIZE > 16
|
||||
+ cmpl $16, %edx
|
||||
jae L(between_16_31)
|
||||
-# endif
|
||||
- MOVQ %XMM0, %rcx
|
||||
- cmpb $8, %dl
|
||||
+#endif
|
||||
+ MOVQ %XMM0, %rdi
|
||||
+ cmpl $8, %edx
|
||||
jae L(between_8_15)
|
||||
- cmpb $4, %dl
|
||||
+ cmpl $4, %edx
|
||||
jae L(between_4_7)
|
||||
- cmpb $1, %dl
|
||||
+ cmpl $1, %edx
|
||||
ja L(between_2_3)
|
||||
- jb 1f
|
||||
- movb %cl, (%rax)
|
||||
-1:
|
||||
+ jb L(return)
|
||||
+ movb %sil, (%rax)
|
||||
VZEROUPPER_RETURN
|
||||
-# if VEC_SIZE > 32
|
||||
+
|
||||
+ /* Align small targets only if not doing so would cross a fetch
|
||||
+ line. */
|
||||
+#if VEC_SIZE > 32
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
L(between_32_63):
|
||||
- VMOVU %YMM0, -32(%rax,%rdx)
|
||||
VMOVU %YMM0, (%rax)
|
||||
+ VMOVU %YMM0, -32(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
-# endif
|
||||
-# if VEC_SIZE > 16
|
||||
- /* From 16 to 31. No branch when size == 16. */
|
||||
+#endif
|
||||
+
|
||||
+#if VEC_SIZE >= 32
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||||
L(between_16_31):
|
||||
- VMOVU %XMM0, -16(%rax,%rdx)
|
||||
+ /* From 16 to 31. No branch when size == 16. */
|
||||
VMOVU %XMM0, (%rax)
|
||||
+ VMOVU %XMM0, -16(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
-# endif
|
||||
- /* From 8 to 15. No branch when size == 8. */
|
||||
+#endif
|
||||
+
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||||
L(between_8_15):
|
||||
- movq %rcx, -8(%rax,%rdx)
|
||||
- movq %rcx, (%rax)
|
||||
+ /* From 8 to 15. No branch when size == 8. */
|
||||
+ movq %rdi, (%rax)
|
||||
+ movq %rdi, -8(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
|
||||
L(between_4_7):
|
||||
/* From 4 to 7. No branch when size == 4. */
|
||||
- movl %ecx, -4(%rax,%rdx)
|
||||
- movl %ecx, (%rax)
|
||||
+ movl %edi, (%rax)
|
||||
+ movl %edi, -4(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||||
L(between_2_3):
|
||||
/* From 2 to 3. No branch when size == 2. */
|
||||
- movw %cx, -2(%rax,%rdx)
|
||||
- movw %cx, (%rax)
|
||||
+ movw %di, (%rax)
|
||||
+ movb %dil, -1(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
|
@@ -0,0 +1,40 @@
|
|||
commit baf3ece63453adac59c5688930324a78ced5b2e4
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sat Oct 23 01:26:47 2021 -0400
|
||||
|
||||
x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
|
||||
|
||||
This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
|
||||
|
||||
It could potentially be dangerous to use SSE2 if this function is ever
|
||||
called without using 'vzeroupper' beforehand. While compilers appear
|
||||
to use 'vzeroupper' before function calls if AVX2 has been used, using
|
||||
SSE2 here is more brittle. Since it is not absolutely necessary it
|
||||
should be avoided.
|
||||
|
||||
It costs 2 extra bytes, but the extra bytes should only eat into
|
||||
alignment padding.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
|
||||
(cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
index 2761b54f2e7dea9f..640f6757fac8a356 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
@@ -561,13 +561,13 @@ L(between_16_31):
|
||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
||||
|
||||
/* Use movups to save code size. */
|
||||
- movups (%rsi), %xmm2
|
||||
+ vmovdqu (%rsi), %xmm2
|
||||
VPCMP $4, (%rdi), %xmm2, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(return_vec_0_lv)
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
+ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
|
||||
addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
|
||||
kmovd %k1, %eax
|
|
@@ -0,0 +1,690 @@
|
|||
commit f35ad30da4880a1574996df0674986ecf82fa7ae
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Fri Oct 29 12:40:20 2021 -0700
|
||||
|
||||
x86-64: Improve EVEX strcmp with masked load
|
||||
|
||||
In strcmp-evex.S, to compare 2 32-byte strings, replace
|
||||
|
||||
VMOVU (%rdi, %rdx), %YMM0
|
||||
VMOVU (%rsi, %rdx), %YMM1
|
||||
/* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
||||
VPCMP $4, %YMM0, %YMM1, %k0
|
||||
VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
VPCMP $0, %YMMZERO, %YMM1, %k2
|
||||
/* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
||||
kord %k1, %k2, %k1
|
||||
/* Each bit in K1 represents a NULL or a mismatch. */
|
||||
kord %k0, %k1, %k1
|
||||
kmovd %k1, %ecx
|
||||
testl %ecx, %ecx
|
||||
jne L(last_vector)
|
||||
|
||||
with
|
||||
|
||||
VMOVU (%rdi, %rdx), %YMM0
|
||||
VPTESTM %YMM0, %YMM0, %k2
|
||||
/* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
in YMM0 and 32 bytes at (%rsi, %rdx). */
|
||||
VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
|
||||
kmovd %k1, %ecx
|
||||
incl %ecx
|
||||
jne L(last_vector)
|
||||
|
||||
It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
|
||||
and Ice Lake.
|
||||
|
||||
Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
(cherry picked from commit c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55)
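A scalar C model of the new masked compare (byte flavour; the dword
flavour checks k1 against 0xff with subl instead of incl, since only
8 lanes are valid):

    #include <stdint.h>

    /* Bit i of k2 is set when s1[i] != 0 (VPTESTM); bit i of k1 is
       set when that bit survives the mask *and* s1[i] == s2[i]
       (VPCMP $0 with zero-masking).  k1 + 1 == 0 iff all 32 bytes
       match and none is a null terminator -- exactly the incl/je
       test in the diff below.  */
    static uint32_t
    cmp_block_mask (const unsigned char *s1, const unsigned char *s2)
    {
      uint32_t k1 = 0;
      for (int i = 0; i < 32; i++)
        if (s1[i] != 0 && s1[i] == s2[i])
          k1 |= (uint32_t) 1 << i;
      return k1;
    }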
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
index d5aa6daa46c7ed25..82f12ac89bcae20b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
@@ -41,6 +41,8 @@
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* Compare packed dwords. */
|
||||
# define VPCMP vpcmpd
|
||||
+# define VPMINU vpminud
|
||||
+# define VPTESTM vptestmd
|
||||
# define SHIFT_REG32 r8d
|
||||
# define SHIFT_REG64 r8
|
||||
/* 1 dword char == 4 bytes. */
|
||||
@@ -48,6 +50,8 @@
|
||||
# else
|
||||
/* Compare packed bytes. */
|
||||
# define VPCMP vpcmpb
|
||||
+# define VPMINU vpminub
|
||||
+# define VPTESTM vptestmb
|
||||
# define SHIFT_REG32 ecx
|
||||
# define SHIFT_REG64 rcx
|
||||
/* 1 byte char == 1 byte. */
|
||||
@@ -67,6 +71,9 @@
|
||||
# define YMM5 ymm22
|
||||
# define YMM6 ymm23
|
||||
# define YMM7 ymm24
|
||||
+# define YMM8 ymm25
|
||||
+# define YMM9 ymm26
|
||||
+# define YMM10 ymm27
|
||||
|
||||
/* Warning!
|
||||
wcscmp/wcsncmp have to use SIGNED comparison for elements.
|
||||
@@ -76,7 +83,7 @@
|
||||
/* The main idea of the string comparison (byte or dword) using 256-bit
|
||||
EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
|
||||
latter can be on either packed bytes or dwords depending on
|
||||
- USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
|
||||
+ USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
|
||||
matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
|
||||
KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
|
||||
are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
|
||||
@@ -123,27 +130,21 @@ ENTRY (STRCMP)
|
||||
jg L(cross_page)
|
||||
/* Start comparing 4 vectors. */
|
||||
VMOVU (%rdi), %YMM0
|
||||
- VMOVU (%rsi), %YMM1
|
||||
|
||||
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
||||
- VPCMP $4, %YMM0, %YMM1, %k0
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
|
||||
- /* Check for NULL in YMM0. */
|
||||
- VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
- /* Check for NULL in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k2
|
||||
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
||||
- kord %k1, %k2, %k1
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at (%rsi). */
|
||||
+ VPCMP $0, (%rsi), %YMM0, %k1{%k2}
|
||||
|
||||
- /* Each bit in K1 represents:
|
||||
- 1. A mismatch in YMM0 and YMM1. Or
|
||||
- 2. A NULL in YMM0 or YMM1.
|
||||
- */
|
||||
- kord %k0, %k1, %k1
|
||||
-
|
||||
- ktestd %k1, %k1
|
||||
- je L(next_3_vectors)
|
||||
kmovd %k1, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
+ je L(next_3_vectors)
|
||||
tzcntl %ecx, %edx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -172,9 +173,7 @@ L(return):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
L(return_vec_size):
|
||||
- kmovd %k1, %ecx
|
||||
tzcntl %ecx, %edx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -210,9 +209,7 @@ L(return_vec_size):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
L(return_2_vec_size):
|
||||
- kmovd %k1, %ecx
|
||||
tzcntl %ecx, %edx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -248,9 +245,7 @@ L(return_2_vec_size):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
L(return_3_vec_size):
|
||||
- kmovd %k1, %ecx
|
||||
tzcntl %ecx, %edx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -289,43 +284,45 @@ L(return_3_vec_size):
|
||||
.p2align 4
|
||||
L(next_3_vectors):
|
||||
VMOVU VEC_SIZE(%rdi), %YMM0
|
||||
- VMOVU VEC_SIZE(%rsi), %YMM1
|
||||
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
||||
- VPCMP $4, %YMM0, %YMM1, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k2
|
||||
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- kord %k0, %k1, %k1
|
||||
- ktestd %k1, %k1
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at VEC_SIZE(%rsi). */
|
||||
+ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
|
||||
+ kmovd %k1, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
jne L(return_vec_size)
|
||||
|
||||
- VMOVU (VEC_SIZE * 2)(%rdi), %YMM2
|
||||
- VMOVU (VEC_SIZE * 3)(%rdi), %YMM3
|
||||
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM4
|
||||
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM5
|
||||
-
|
||||
- /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */
|
||||
- VPCMP $4, %YMM2, %YMM4, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM4, %k2
|
||||
- /* Each bit in K1 represents a NULL in YMM2 or YMM4. */
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- kord %k0, %k1, %k1
|
||||
- ktestd %k1, %k1
|
||||
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
|
||||
+ kmovd %k1, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
jne L(return_2_vec_size)
|
||||
|
||||
- /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */
|
||||
- VPCMP $4, %YMM3, %YMM5, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM3, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM5, %k2
|
||||
- /* Each bit in K1 represents a NULL in YMM3 or YMM5. */
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- kord %k0, %k1, %k1
|
||||
- ktestd %k1, %k1
|
||||
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
|
||||
+ VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
|
||||
+ kmovd %k1, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
jne L(return_3_vec_size)
|
||||
L(main_loop_header):
|
||||
leaq (VEC_SIZE * 4)(%rdi), %rdx
|
||||
@@ -375,56 +372,51 @@ L(back_to_loop):
|
||||
VMOVA VEC_SIZE(%rax), %YMM2
|
||||
VMOVA (VEC_SIZE * 2)(%rax), %YMM4
|
||||
VMOVA (VEC_SIZE * 3)(%rax), %YMM6
|
||||
- VMOVU (%rdx), %YMM1
|
||||
- VMOVU VEC_SIZE(%rdx), %YMM3
|
||||
- VMOVU (VEC_SIZE * 2)(%rdx), %YMM5
|
||||
- VMOVU (VEC_SIZE * 3)(%rdx), %YMM7
|
||||
-
|
||||
- VPCMP $4, %YMM0, %YMM1, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K4 represents a NULL or a mismatch in YMM0 and
|
||||
- YMM1. */
|
||||
- kord %k0, %k1, %k4
|
||||
-
|
||||
- VPCMP $4, %YMM2, %YMM3, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM3, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K5 represents a NULL or a mismatch in YMM2 and
|
||||
- YMM3. */
|
||||
- kord %k0, %k1, %k5
|
||||
-
|
||||
- VPCMP $4, %YMM4, %YMM5, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM4, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM5, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K6 represents a NULL or a mismatch in YMM4 and
|
||||
- YMM5. */
|
||||
- kord %k0, %k1, %k6
|
||||
-
|
||||
- VPCMP $4, %YMM6, %YMM7, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM6, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM7, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K7 represents a NULL or a mismatch in YMM6 and
|
||||
- YMM7. */
|
||||
- kord %k0, %k1, %k7
|
||||
-
|
||||
- kord %k4, %k5, %k0
|
||||
- kord %k6, %k7, %k1
|
||||
-
|
||||
- /* Test each mask (32 bits) individually because for VEC_SIZE
|
||||
- == 32 is not possible to OR the four masks and keep all bits
|
||||
- in a 64-bit integer register, differing from SSE2 strcmp
|
||||
- where ORing is possible. */
|
||||
- kortestd %k0, %k1
|
||||
- je L(loop)
|
||||
- ktestd %k4, %k4
|
||||
+
|
||||
+ VPMINU %YMM0, %YMM2, %YMM8
|
||||
+ VPMINU %YMM4, %YMM6, %YMM9
|
||||
+
|
||||
+ /* A zero CHAR in YMM8 means that there is a null CHAR. */
|
||||
+ VPMINU %YMM8, %YMM9, %YMM8
|
||||
+
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM8. */
|
||||
+ VPTESTM %YMM8, %YMM8, %k1
|
||||
+
|
||||
+ /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */
|
||||
+ vpxorq (%rdx), %YMM0, %YMM1
|
||||
+ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3
|
||||
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
|
||||
+ vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
|
||||
+
|
||||
+ vporq %YMM1, %YMM3, %YMM9
|
||||
+ vporq %YMM5, %YMM7, %YMM10
|
||||
+
|
||||
+ /* A non-zero CHAR in YMM9 represents a mismatch. */
|
||||
+ vporq %YMM9, %YMM10, %YMM9
|
||||
+
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR. */
|
||||
+ VPCMP $0, %YMMZERO, %YMM9, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
+ je L(loop)
|
||||
+
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k1
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and (%rdx). */
|
||||
+ VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
je L(test_vec)
|
||||
- kmovd %k4, %edi
|
||||
- tzcntl %edi, %ecx
|
||||
+ tzcntl %ecx, %ecx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %ecx
|
||||
@@ -466,9 +458,18 @@ L(test_vec):
|
||||
cmpq $VEC_SIZE, %r11
|
||||
jbe L(zero)
|
||||
# endif
|
||||
- ktestd %k5, %k5
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM2. */
|
||||
+ VPTESTM %YMM2, %YMM2, %k1
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
|
||||
+ in YMM2 and VEC_SIZE(%rdx). */
|
||||
+ VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
je L(test_2_vec)
|
||||
- kmovd %k5, %ecx
|
||||
tzcntl %ecx, %edi
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -512,9 +513,18 @@ L(test_2_vec):
|
||||
cmpq $(VEC_SIZE * 2), %r11
|
||||
jbe L(zero)
|
||||
# endif
|
||||
- ktestd %k6, %k6
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM4. */
|
||||
+ VPTESTM %YMM4, %YMM4, %k1
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
|
||||
+ in YMM4 and (VEC_SIZE * 2)(%rdx). */
|
||||
+ VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
je L(test_3_vec)
|
||||
- kmovd %k6, %ecx
|
||||
tzcntl %ecx, %edi
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -558,8 +568,18 @@ L(test_3_vec):
|
||||
cmpq $(VEC_SIZE * 3), %r11
|
||||
jbe L(zero)
|
||||
# endif
|
||||
- kmovd %k7, %esi
|
||||
- tzcntl %esi, %ecx
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM6. */
|
||||
+ VPTESTM %YMM6, %YMM6, %k1
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
|
||||
+ in YMM6 and (VEC_SIZE * 3)(%rdx). */
|
||||
+ VPCMP $0, %YMMZERO, %YMM7, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
+ tzcntl %ecx, %ecx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %ecx
|
||||
@@ -615,39 +635,51 @@ L(loop_cross_page):
|
||||
|
||||
VMOVU (%rax, %r10), %YMM2
|
||||
VMOVU VEC_SIZE(%rax, %r10), %YMM3
|
||||
- VMOVU (%rdx, %r10), %YMM4
|
||||
- VMOVU VEC_SIZE(%rdx, %r10), %YMM5
|
||||
-
|
||||
- VPCMP $4, %YMM4, %YMM2, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM4, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch in YMM2 and
|
||||
- YMM4. */
|
||||
- kord %k0, %k1, %k1
|
||||
-
|
||||
- VPCMP $4, %YMM5, %YMM3, %k3
|
||||
- VPCMP $0, %YMMZERO, %YMM3, %k4
|
||||
- VPCMP $0, %YMMZERO, %YMM5, %k5
|
||||
- kord %k4, %k5, %k4
|
||||
- /* Each bit in K3 represents a NULL or a mismatch in YMM3 and
|
||||
- YMM5. */
|
||||
- kord %k3, %k4, %k3
|
||||
+
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM2. */
|
||||
+ VPTESTM %YMM2, %YMM2, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM2 and 32 bytes at (%rdx, %r10). */
|
||||
+ VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2}
|
||||
+ kmovd %k1, %r9d
|
||||
+ /* Don't use subl since it is the lower 16/32 bits of RDI
|
||||
+ below. */
|
||||
+ notl %r9d
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ /* Only last 8 bits are valid. */
|
||||
+ andl $0xff, %r9d
|
||||
+# endif
|
||||
+
|
||||
+ /* Each bit set in K4 represents a non-null CHAR in YMM3. */
|
||||
+ VPTESTM %YMM3, %YMM3, %k4
|
||||
+ /* Each bit cleared in K3 represents a mismatch or a null CHAR
|
||||
+ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
|
||||
+ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
|
||||
+ kmovd %k3, %edi
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
|
||||
+ notl %edi
|
||||
+ andl $0xff, %edi
|
||||
+# else
|
||||
+ incl %edi
|
||||
+# endif
|
||||
|
||||
# ifdef USE_AS_WCSCMP
|
||||
- /* NB: Each bit in K1/K3 represents 4-byte element. */
|
||||
- kshiftlw $8, %k3, %k2
|
||||
+ /* NB: Each bit in EDI/R9D represents 4-byte element. */
|
||||
+ sall $8, %edi
|
||||
/* NB: Divide shift count by 4 since each bit in K1 represent 4
|
||||
bytes. */
|
||||
movl %ecx, %SHIFT_REG32
|
||||
sarl $2, %SHIFT_REG32
|
||||
+
|
||||
+ /* Each bit in EDI represents a null CHAR or a mismatch. */
|
||||
+ orl %r9d, %edi
|
||||
# else
|
||||
- kshiftlq $32, %k3, %k2
|
||||
-# endif
|
||||
+ salq $32, %rdi

-	/* Each bit in K1 represents a NULL or a mismatch. */
-	korq %k1, %k2, %k1
-	kmovq %k1, %rdi
+	/* Each bit in RDI represents a null CHAR or a mismatch. */
+	orq %r9, %rdi
+# endif

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
	shrxq %SHIFT_REG64, %rdi, %rdi
@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored. */
	VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0
	VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1
-	VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2
-	VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3
-
-	VPCMP $4, %YMM0, %YMM2, %k0
-	VPCMP $0, %YMMZERO, %YMM0, %k1
-	VPCMP $0, %YMMZERO, %YMM2, %k2
-	kord %k1, %k2, %k1
-	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
-	   YMM2. */
-	kord %k0, %k1, %k1
-
-	VPCMP $4, %YMM1, %YMM3, %k3
-	VPCMP $0, %YMMZERO, %YMM1, %k4
-	VPCMP $0, %YMMZERO, %YMM3, %k5
-	kord %k4, %k5, %k4
-	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
-	   YMM3. */
-	kord %k3, %k4, %k3

+	VPTESTM %YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */
+	VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
+	kmovd %k1, %r9d
+	/* Don't use subl since it is the lower 16/32 bits of RDI
+	   below. */
+	notl %r9d
# ifdef USE_AS_WCSCMP
-	/* NB: Each bit in K1/K3 represents 4-byte element. */
-	kshiftlw $8, %k3, %k2
+	/* Only last 8 bits are valid. */
+	andl $0xff, %r9d
+# endif
+
+	VPTESTM %YMM1, %YMM1, %k4
+	/* Each bit cleared in K3 represents a mismatch or a null CHAR
+	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
+	VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+	kmovd %k3, %edi
+# ifdef USE_AS_WCSCMP
+	/* Don't use subl since it is the upper 8 bits of EDI below. */
+	notl %edi
+	andl $0xff, %edi
# else
-	kshiftlq $32, %k3, %k2
+	incl %edi
# endif

-	/* Each bit in K1 represents a NULL or a mismatch. */
-	korq %k1, %k2, %k1
-	kmovq %k1, %rdi
+# ifdef USE_AS_WCSCMP
+	/* NB: Each bit in EDI/R9D represents 4-byte element. */
+	sall $8, %edi
+
+	/* Each bit in EDI represents a null CHAR or a mismatch. */
+	orl %r9d, %edi
+# else
+	salq $32, %rdi
+
+	/* Each bit in RDI represents a null CHAR or a mismatch. */
+	orq %r9, %rdi
+# endif

	xorl %r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
	/* R8 has number of bytes skipped. */
	movl %ecx, %r8d
# ifdef USE_AS_WCSCMP
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	/* NB: Divide shift count by 4 since each bit in RDI represents 4
	   bytes. */
	sarl $2, %ecx
-# endif
+	/* Skip ECX bytes. */
+	shrl %cl, %edi
+# else
	/* Skip ECX bytes. */
	shrq %cl, %rdi
+# endif
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing. */
@@ -818,7 +863,7 @@ L(cross_page_loop):
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %ecx
# endif
-	/* Check null char. */
+	/* Check null CHAR. */
	testl %eax, %eax
	jne L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
@@ -901,18 +946,17 @@ L(cross_page):
	jg L(cross_page_1_vector)
L(loop_1_vector):
	VMOVU (%rdi, %rdx), %YMM0
-	VMOVU (%rsi, %rdx), %YMM1
-
-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
-	VPCMP $4, %YMM0, %YMM1, %k0
-	VPCMP $0, %YMMZERO, %YMM0, %k1
-	VPCMP $0, %YMMZERO, %YMM1, %k2
-	/* Each bit in K1 represents a NULL in YMM0 or YMM1. */
-	kord %k1, %k2, %k1
-	/* Each bit in K1 represents a NULL or a mismatch. */
-	kord %k0, %k1, %k1
+
+	VPTESTM %YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (%rsi, %rdx). */
+	VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
	kmovd %k1, %ecx
-	testl %ecx, %ecx
+# ifdef USE_AS_WCSCMP
+	subl $0xff, %ecx
+# else
+	incl %ecx
+# endif
	jne L(last_vector)

	addl $VEC_SIZE, %edx
@@ -931,18 +975,17 @@ L(cross_page_1_vector):
	cmpl $(PAGE_SIZE - 16), %eax
	jg L(cross_page_1_xmm)
	VMOVU (%rdi, %rdx), %XMM0
-	VMOVU (%rsi, %rdx), %XMM1
-
-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
-	VPCMP $4, %XMM0, %XMM1, %k0
-	VPCMP $0, %XMMZERO, %XMM0, %k1
-	VPCMP $0, %XMMZERO, %XMM1, %k2
-	/* Each bit in K1 represents a NULL in XMM0 or XMM1. */
-	korw %k1, %k2, %k1
-	/* Each bit in K1 represents a NULL or a mismatch. */
-	korw %k0, %k1, %k1
-	kmovw %k1, %ecx
-	testl %ecx, %ecx
+
+	VPTESTM %YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in XMM0 and 16 bytes at (%rsi, %rdx). */
+	VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2}
+	kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl $0xf, %ecx
+# else
+	subl $0xffff, %ecx
+# endif
	jne L(last_vector)

	addl $16, %edx
@@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
	vmovq (%rdi, %rdx), %XMM0
	vmovq (%rsi, %rdx), %XMM1

-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
-	VPCMP $4, %XMM0, %XMM1, %k0
-	VPCMP $0, %XMMZERO, %XMM0, %k1
-	VPCMP $0, %XMMZERO, %XMM1, %k2
-	/* Each bit in K1 represents a NULL in XMM0 or XMM1. */
-	kord %k1, %k2, %k1
-	/* Each bit in K1 represents a NULL or a mismatch. */
-	kord %k0, %k1, %k1
-	kmovd %k1, %ecx
-
+	VPTESTM %YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in XMM0 and XMM1. */
+	VPCMP $0, %XMM1, %XMM0, %k1{%k2}
+	kmovb %k1, %ecx
# ifdef USE_AS_WCSCMP
-	/* Only last 2 bits are valid. */
-	andl $0x3, %ecx
+	subl $0x3, %ecx
# else
-	/* Only last 8 bits are valid. */
-	andl $0xff, %ecx
+	subl $0xff, %ecx
# endif
-
-	testl %ecx, %ecx
	jne L(last_vector)

	addl $8, %edx
@@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
	vmovd (%rdi, %rdx), %XMM0
	vmovd (%rsi, %rdx), %XMM1

-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
-	VPCMP $4, %XMM0, %XMM1, %k0
-	VPCMP $0, %XMMZERO, %XMM0, %k1
-	VPCMP $0, %XMMZERO, %XMM1, %k2
-	/* Each bit in K1 represents a NULL in XMM0 or XMM1. */
-	kord %k1, %k2, %k1
-	/* Each bit in K1 represents a NULL or a mismatch. */
-	kord %k0, %k1, %k1
+	VPTESTM %YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in XMM0 and XMM1. */
+	VPCMP $0, %XMM1, %XMM0, %k1{%k2}
	kmovd %k1, %ecx
-
# ifdef USE_AS_WCSCMP
-	/* Only the last bit is valid. */
-	andl $0x1, %ecx
+	subl $0x1, %ecx
# else
-	/* Only last 4 bits are valid. */
-	andl $0xf, %ecx
+	subl $0xf, %ecx
# endif
-
-	testl %ecx, %ecx
	jne L(last_vector)

	addl $4, %edx
@@ -0,0 +1,85 @@
commit a182bb7a3922404f79def09d79ef89678b4049f0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Oct 29 12:56:53 2021 -0700

x86-64: Remove Prefer_AVX2_STRCMP

Remove Prefer_AVX2_STRCMP to enable EVEX strcmp. When comparing 2 32-byte
strings, EVEX strcmp has been improved to require 1 load, 1 VPTESTM, 1
VPCMP, 1 KMOVD and 1 INCL instead of 2 loads, 3 VPCMPs, 2 KORDs, 1 KMOVD
and 1 TESTL while AVX2 strcmp requires 1 load, 2 VPCMPEQs, 1 VPMINU, 1
VPMOVMSKB and 1 TESTL. EVEX strcmp is now faster than AVX2 strcmp by up
to 40% on Tiger Lake and Ice Lake.

(cherry picked from commit 14dbbf46a007ae5df36646b51ad0c9e5f5259f30)
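
For reference, the mask arithmetic behind the new instruction count can
be sketched in portable C. This is illustrative only; cmp_mask_32 is an
invented helper, not a glibc symbol:

#include <stdint.h>

/* Scalar model of one 32-byte EVEX step: VPTESTM builds a mask of
   non-null CHARs, and the masked VPCMP ($0) keeps a bit set only
   where the CHARs also compare equal.  */
static inline uint32_t
cmp_mask_32 (const unsigned char *s1, const unsigned char *s2)
{
  uint32_t m = 0;
  for (int i = 0; i < 32; i++)
    if (s1[i] != 0 && s1[i] == s2[i])
      m |= (uint32_t) 1 << i;
  return m;
}

/* m + 1 == 0 (the single INCL above) iff all 32 positions matched and
   were non-null; otherwise the lowest clear bit of m marks the first
   mismatch or null CHAR.  */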

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index de4e3c3b7258120d..f4d4049e391cbabd 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -574,14 +574,6 @@ disable_tsx:
	if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
	cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
	|= bit_arch_Prefer_No_VZEROUPPER;
-
-	/* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
-	   requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
-	   requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
-	   AVX2 strcmp is faster than EVEX strcmp. */
-	if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
-	cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
-	|= bit_arch_Prefer_AVX2_STRCMP;
	}

	/* Avoid short distance REP MOVSB on processor with FSRM. */
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 58f2fad4323d5d91..957db3ad229ba39f 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -239,8 +239,6 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
	CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
	Fast_Copy_Backward,
	disable, 18);
-	CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
-	(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
	}
	break;
	case 19:
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 3bdc76cf71007948..8250bfcbecd29a9f 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -31,5 +31,4 @@ BIT (Prefer_ERMS)
BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
-BIT (Prefer_AVX2_STRCMP)
BIT (Avoid_Short_Distance_REP_MOVSB)
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 62b7abeeee646ab4..7c2901bf44456259 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
{
	if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
	&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-	&& CPU_FEATURE_USABLE_P (cpu_features, BMI2)
-	&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+	&& CPU_FEATURE_USABLE_P (cpu_features, BMI2))
	return OPTIMIZE (evex);

	if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index 60ba0fe356b31779..f94a421784bfe923 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -43,8 +43,7 @@ IFUNC_SELECTOR (void)
{
	if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
	&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-	&& CPU_FEATURE_USABLE_P (cpu_features, BMI2)
-	&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+	&& CPU_FEATURE_USABLE_P (cpu_features, BMI2))
	return OPTIMIZE (evex);

	if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
@@ -0,0 +1,48 @@
commit 2e64237a8744dd50f9222293275fa52e7248ff76
Author: Fangrui Song <maskray@google.com>
Date: Tue Nov 2 20:59:52 2021 -0700

x86-64: Replace movzx with movzbl

Clang cannot assemble movzx in the AT&T dialect mode.

../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
movzx (%rsi), %ecx
^~~~

Change movzx to movzbl, which follows the AT&T dialect and is used
elsewhere in the file.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 6720d36b6623c5e48c070d86acf61198b33e144e)
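
A minimal standalone reproducer of the dialect issue, as GNU C inline
asm (hypothetical example, not from the patch): the movzbl spelling
assembles under both GNU as and Clang's integrated assembler, while
movzx with this operand pair is rejected by the latter.

/* cc -c movz.c (also builds with clang) -- illustrative only.  */
int
load_byte (const unsigned char *p)
{
  int v;
  /* movzbl: zero-extend the byte at *p into a 32-bit register.  */
  __asm__ ("movzbl (%1), %0" : "=r" (v) : "r" (p) : "memory");
  return v;
}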

diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index bc19547b09639071..6197a723b9e0606e 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
	.p2align 4
	// XXX Same as code above
LABEL(Byte0):
-	movzx (%rsi), %ecx
-	movzx (%rdi), %eax
+	movzbl (%rsi), %ecx
+	movzbl (%rdi), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 824e648230a15739..7f8a1bc756f86aee 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):

	.p2align 4
LABEL(Byte0):
-	movzx (%rsi), %ecx
-	movzx (%rdi), %eax
+	movzbl (%rsi), %ecx
+	movzbl (%rdi), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
@@ -0,0 +1,843 @@
commit a7392db2ff2b9dd906500941ac6361dbe2211b0d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Nov 1 00:49:51 2021 -0500

x86: Optimize memmove-vec-unaligned-erms.S

No bug.

The optimizations are as follows:

1) Always align entry to 64 bytes. This makes behavior more
predictable and makes other frontend optimizations easier.

2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
significant benefits in the case that:
0 < (dst - src) < [256, 512]

3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
improvement and for FSRM [-10%, 25%].

In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6)
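
The new forward/backward decision can be summarized in C. This is a
simplified sketch of the logic in the diff below, not the exact branch
layout; choose_direction is an invented name:

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

enum copy_dir { COPY_FORWARD, COPY_BACKWARD };

static enum copy_dir
choose_direction (char *dst, const char *src, size_t len)
{
  uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;

  /* dst overlaps the end of src: backward copy is required.  */
  if (diff < len)
    return COPY_BACKWARD;
  /* src overlaps the end of dst: forward copy is required.  */
  if ((uintptr_t) 0 - diff < len)
    return COPY_FORWARD;

  /* No overlap: if dst - src lands on a 4k-aliasing offset, the
     forward loop's loads would conflict with its stores, so copy
     backward instead (mirrors the andl $(PAGE_SIZE - 256) test).  */
  if ((diff & (PAGE_SIZE - 256)) == 0)
    return COPY_BACKWARD;
  return COPY_FORWARD;
}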

diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
index db106a7a1f23f268..b2b318084823dceb 100644
--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@
/* Use movups and movaps for smaller code sizes. */
#define VMOVU movups
#define VMOVA movaps
-
+#define MOV_SIZE 3
#define SECTION(p) p

#ifdef USE_MULTIARCH
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 1ec1962e861dbf63..67a55f0c85af841c 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -4,7 +4,7 @@
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-
+# define MOV_SIZE 4
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index e195e93f153c9512..975ae6c0515b83cb 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-
+# define MOV_SIZE 4
# define SECTION(p) p##.avx
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s

diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 848848ab39ff9326..0fa7126830af7acb 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -25,7 +25,7 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define VZEROUPPER
-
+# define MOV_SIZE 6
# define SECTION(p) p##.evex512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s

diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 0cbce8f944da51a0..88715441feaaccf5 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -25,7 +25,7 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define VZEROUPPER
-
+# define MOV_SIZE 6
# define SECTION(p) p##.evex
# define MEMMOVE_SYMBOL(p,s) p##_evex_##s

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index abde8438d41f2320..7b27cbdda5fb99f7 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -76,6 +76,25 @@
# endif
#endif

+/* Whether to align before movsb. Ultimately we want 64 byte
+   align and not worth it to load 4x VEC for VEC_SIZE == 16. */
+#define ALIGN_MOVSB (VEC_SIZE > 16)
+/* Number of bytes to align movsb to. */
+#define MOVSB_ALIGN_TO 64
+
+#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE (MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET (4)
+#else
+# define SMALL_SIZE_OFFSET (0)
+#endif
+
#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif
@@ -199,25 +218,21 @@ L(start):
# endif
	cmp $VEC_SIZE, %RDX_LP
	jb L(less_vec)
+	/* Load regardless. */
+	VMOVU (%rsi), %VEC(0)
	cmp $(VEC_SIZE * 2), %RDX_LP
	ja L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
	/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
-	VMOVU (%rsi), %VEC(0)
	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU %VEC(0), (%rdi)
	VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp %RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
	movq %rdi, %rax
L(start_erms):
# ifdef __ILP32__
@@ -298,310 +313,448 @@ L(start_erms):
# endif
	cmp $VEC_SIZE, %RDX_LP
	jb L(less_vec)
+	/* Load regardless. */
+	VMOVU (%rsi), %VEC(0)
	cmp $(VEC_SIZE * 2), %RDX_LP
	ja L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
-	VMOVU (%rsi), %VEC(0)
-	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+	/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
+	 */
+	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
	VMOVU %VEC(0), (%rdi)
-	VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
+# else
	ret
+# endif
#endif

-L(movsb):
-	cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae L(more_8x_vec)
-	cmpq %rsi, %rdi
-	jb 1f
-	/* Source == destination is less common. */
-	je L(nop)
-	leaq (%rsi,%rdx), %r9
-	cmpq %r9, %rdi
-	/* Avoid slow backward REP MOVSB. */
-	jb L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz 3f
-	movq %rdi, %rcx
-	subq %rsi, %rcx
-	jmp 2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz 3f
-	movq %rsi, %rcx
-	subq %rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0. */
-	cmpl $63, %ecx
-	jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
-3:
-# endif
-	mov %RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+#if LARGE_MOV_SIZE
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+	   ENTRY block and L(less_vec). */
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7. No branch when size == 4. */
+	movl (%rsi), %ecx
+	movl (%rsi, %rdx), %esi
+	movl %ecx, (%rdi)
+	movl %esi, (%rdi, %rdx)
	ret
#endif

+	.p2align 4
L(less_vec):
	/* Less than 1 VEC. */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
-	cmpb $32, %dl
+	cmpl $32, %edx
	jae L(between_32_63)
#endif
#if VEC_SIZE > 16
-	cmpb $16, %dl
+	cmpl $16, %edx
	jae L(between_16_31)
#endif
-	cmpb $8, %dl
+	cmpl $8, %edx
	jae L(between_8_15)
-	cmpb $4, %dl
+#if SMALL_MOV_SIZE
+	cmpl $4, %edx
+#else
+	subq $4, %rdx
+#endif
	jae L(between_4_7)
-	cmpb $1, %dl
-	ja L(between_2_3)
-	jb 1f
-	movzbl (%rsi), %ecx
+	cmpl $(1 - SMALL_SIZE_OFFSET), %edx
+	jl L(copy_0)
+	movb (%rsi), %cl
+	je L(copy_1)
+	movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+	movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
	movb %cl, (%rdi)
-1:
+L(copy_0):
	ret
+
+#if SMALL_MOV_SIZE
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7. No branch when size == 4. */
+	movl -4(%rsi, %rdx), %ecx
+	movl (%rsi), %esi
+	movl %ecx, -4(%rdi, %rdx)
+	movl %esi, (%rdi)
+	ret
+#endif
+
+#if VEC_SIZE > 16
+	/* From 16 to 31. No branch when size == 16. */
+	.p2align 4,, 8
+L(between_16_31):
+	vmovdqu (%rsi), %xmm0
+	vmovdqu -16(%rsi, %rdx), %xmm1
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm1, -16(%rdi, %rdx)
+	/* No ymm registers have been touched. */
+	ret
+#endif
+
#if VEC_SIZE > 32
+	.p2align 4,, 10
L(between_32_63):
	/* From 32 to 63. No branch when size == 32. */
	VMOVU (%rsi), %YMM0
-	VMOVU -32(%rsi,%rdx), %YMM1
+	VMOVU -32(%rsi, %rdx), %YMM1
	VMOVU %YMM0, (%rdi)
-	VMOVU %YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-#if VEC_SIZE > 16
-	/* From 16 to 31. No branch when size == 16. */
-L(between_16_31):
-	VMOVU (%rsi), %XMM0
-	VMOVU -16(%rsi,%rdx), %XMM1
-	VMOVU %XMM0, (%rdi)
-	VMOVU %XMM1, -16(%rdi,%rdx)
+	VMOVU %YMM1, -32(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif
+
+	.p2align 4,, 10
L(between_8_15):
	/* From 8 to 15. No branch when size == 8. */
-	movq -8(%rsi,%rdx), %rcx
+	movq -8(%rsi, %rdx), %rcx
	movq (%rsi), %rsi
-	movq %rcx, -8(%rdi,%rdx)
	movq %rsi, (%rdi)
+	movq %rcx, -8(%rdi, %rdx)
	ret
-L(between_4_7):
-	/* From 4 to 7. No branch when size == 4. */
-	movl -4(%rsi,%rdx), %ecx
-	movl (%rsi), %esi
-	movl %ecx, -4(%rdi,%rdx)
-	movl %esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3. No branch when size == 2. */
-	movzwl -2(%rsi,%rdx), %ecx
-	movzwl (%rsi), %esi
-	movw %cx, -2(%rdi,%rdx)
-	movw %si, (%rdi)
-	ret

+	.p2align 4,, 10
+L(last_4x_vec):
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+
+	/* VEC(0) and VEC(1) have already been loaded. */
+	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	VMOVU %VEC(0), (%rdi)
+	VMOVU %VEC(1), VEC_SIZE(%rdi)
+	VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
	ja L(movsb)
#endif
L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source. */
+	/* More than 2 * VEC and there may be overlap between
+	   destination and source. */
	cmpq $(VEC_SIZE * 8), %rdx
	ja L(more_8x_vec)
+	/* Load VEC(1) regardless. VEC(0) has already been loaded. */
+	VMOVU VEC_SIZE(%rsi), %VEC(1)
	cmpq $(VEC_SIZE * 4), %rdx
	jbe L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
-	VMOVU (%rsi), %VEC(0)
-	VMOVU VEC_SIZE(%rsi), %VEC(1)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
	VMOVU %VEC(0), (%rdi)
	VMOVU %VEC(1), VEC_SIZE(%rdi)
	VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER_RETURN
-L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
-	VMOVU (%rsi), %VEC(0)
-	VMOVU VEC_SIZE(%rsi), %VEC(1)
-	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU %VEC(0), (%rdi)
-	VMOVU %VEC(1), VEC_SIZE(%rdi)
-	VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
	VZEROUPPER_RETURN

+	.p2align 4,, 4
L(more_8x_vec):
+	movq %rdi, %rcx
+	subq %rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap. */
+	cmpq %rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst. */
+	jb L(more_8x_vec_backward_check_nop)
	/* Check if non-temporal move candidate. */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold. */
-	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja L(large_memcpy_2x)
#endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-	   is overlap. */
+	/* To reach this point there cannot be overlap and dst > src. So
+	   check for overlap and src > dst in which case correctness
+	   requires forward copy. Otherwise decide between backward/forward
+	   copy depending on address aliasing. */
+
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+	   but less than __x86_shared_non_temporal_threshold. */
L(more_8x_vec_check):
-	cmpq %rsi, %rdi
-	ja L(more_8x_vec_backward)
-	/* Source == destination is less common. */
-	je L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses. */
-	VMOVU (%rsi), %VEC(4)
+	/* rcx contains dst - src. Add back length (rdx). */
+	leaq (%rcx, %rdx), %r8
+	/* If r8 has different sign than rcx then there is overlap so we
+	   must do forward copy. */
+	xorq %rcx, %r8
+	/* Isolate just sign bit of r8. */
+	shrq $63, %r8
+	/* Get 4k difference dst - src. */
+	andl $(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero must do forward for correctness. Otherwise
+	   if ecx is non-zero there is 4k False Aliasing so do backward
+	   copy. */
+	addl %r8d, %ecx
+	jz L(more_8x_vec_backward)
+
+	/* if rdx is greater than __x86_shared_non_temporal_threshold
+	   but there is overlap, or from short distance movsb. */
+L(more_8x_vec_forward):
+	/* Load first and last 4 * VEC to support overlapping addresses.
+	 */
+
+	/* First vec was already loaded into VEC(0). */
	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save beginning of dst. */
+	movq %rdi, %rcx
+	/* Align dst to VEC_SIZE - 1. */
+	orq $(VEC_SIZE - 1), %rdi
	VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer. */
-	movq %rdi, %r11
-	leaq -VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop. Compute
-	   how much destination is misaligned. */
-	movq %rdi, %r8
-	andq $(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment. */
-	subq $VEC_SIZE, %r8
-	/* Adjust source. */
-	subq %r8, %rsi
-	/* Adjust destination which should be aligned now. */
-	subq %r8, %rdi
-	/* Adjust length. */
-	addq %r8, %rdx

-	.p2align 4
+	/* Subtract dst from src. Add back after dst aligned. */
+	subq %rcx, %rsi
+	/* Finish aligning dst. */
+	incq %rdi
+	/* Restore src adjusted with new value for aligned dst. */
+	addq %rdi, %rsi
+	/* Store end of buffer minus tail in rdx. */
+	leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Don't use multi-byte nop to align. */
+	.p2align 4,, 11
L(loop_4x_vec_forward):
	/* Copy 4 * VEC a time forward. */
-	VMOVU (%rsi), %VEC(0)
-	VMOVU VEC_SIZE(%rsi), %VEC(1)
-	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU (%rsi), %VEC(1)
+	VMOVU VEC_SIZE(%rsi), %VEC(2)
+	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
+	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
	subq $-(VEC_SIZE * 4), %rsi
-	addq $-(VEC_SIZE * 4), %rdx
-	VMOVA %VEC(0), (%rdi)
-	VMOVA %VEC(1), VEC_SIZE(%rdi)
-	VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA %VEC(1), (%rdi)
+	VMOVA %VEC(2), VEC_SIZE(%rdi)
+	VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi)
	subq $-(VEC_SIZE * 4), %rdi
-	cmpq $(VEC_SIZE * 4), %rdx
+	cmpq %rdi, %rdx
	ja L(loop_4x_vec_forward)
	/* Store the last 4 * VEC. */
-	VMOVU %VEC(5), (%rcx)
-	VMOVU %VEC(6), -VEC_SIZE(%rcx)
-	VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU %VEC(7), VEC_SIZE(%rdx)
+	VMOVU %VEC(8), (%rdx)
	/* Store the first VEC. */
-	VMOVU %VEC(4), (%r11)
+	VMOVU %VEC(0), (%rcx)
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+	 */
+L(nop_backward):
	VZEROUPPER_RETURN

+	.p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+	/* rcx contains dst - src. Test for dst == src to skip all of
+	   memmove. */
+	testq %rcx, %rcx
+	jz L(nop_backward)
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses. */
-	VMOVU (%rsi), %VEC(4)
+
+	/* First vec was also loaded into VEC(0). */
	VMOVU VEC_SIZE(%rsi), %VEC(5)
	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Beginning of region for 4x backward copy stored in rcx. */
+	leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer. */
-	leaq -VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop. Compute
-	   how much destination end is misaligned. */
-	leaq -VEC_SIZE(%rsi, %rdx), %rcx
-	movq %r11, %r9
-	movq %r11, %r8
-	andq $(VEC_SIZE - 1), %r8
-	/* Adjust source. */
-	subq %r8, %rcx
-	/* Adjust the end of destination which should be aligned now. */
-	subq %r8, %r9
-	/* Adjust length. */
-	subq %r8, %rdx
-
-	.p2align 4
+	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned. */
+	subq %rdi, %rsi
+	/* Align dst. */
+	andq $-(VEC_SIZE), %rcx
+	/* Restore src. */
+	addq %rcx, %rsi
+
+	/* Don't use multi-byte nop to align. */
+	.p2align 4,, 11
L(loop_4x_vec_backward):
	/* Copy 4 * VEC a time backward. */
-	VMOVU (%rcx), %VEC(0)
-	VMOVU -VEC_SIZE(%rcx), %VEC(1)
-	VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq $-(VEC_SIZE * 4), %rcx
-	addq $-(VEC_SIZE * 4), %rdx
-	VMOVA %VEC(0), (%r9)
-	VMOVA %VEC(1), -VEC_SIZE(%r9)
-	VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq $-(VEC_SIZE * 4), %r9
-	cmpq $(VEC_SIZE * 4), %rdx
-	ja L(loop_4x_vec_backward)
+	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3)
+	VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4)
+	addq $(VEC_SIZE * -4), %rsi
+	VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx)
+	addq $(VEC_SIZE * -4), %rcx
+	cmpq %rcx, %rdi
+	jb L(loop_4x_vec_backward)
	/* Store the first 4 * VEC. */
-	VMOVU %VEC(4), (%rdi)
+	VMOVU %VEC(0), (%rdi)
	VMOVU %VEC(5), VEC_SIZE(%rdi)
	VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC. */
-	VMOVU %VEC(8), (%r11)
+	VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VZEROUPPER_RETURN
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	/* L(skip_short_movsb_check) is only used with ERMS. Not for
+	   FSRM. */
+	.p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* If CPU does not have FSRM two options for aligning. Align src
+	   if dst and src 4k alias. Otherwise align dst. */
+	testl $(PAGE_SIZE - 512), %ecx
+	jnz L(movsb_align_dst)
+	/* Fall through. dst and src 4k alias. It's better to align src
+	   here because the bottleneck will be loads due to the false
+	   dependency on dst. */
+
+	/* rcx already has dst - src. */
+	movq %rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because
+	   src is initially aligned to MOVSB_ALIGN_TO - 1. */
+	leaq -1(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1. */
+	orq $(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned dst.
+	 */
+	leaq 1(%rsi, %r9), %rdi
+	subq %rsi, %rcx
+	/* Finish aligning src. */
+	incq %rsi
+
+	rep movsb
+
+	VMOVU %VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU %VEC(1), VEC_SIZE(%r8)
+#  endif
	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 12
+L(movsb):
+	movq %rdi, %rcx
+	subq %rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap. */
+	cmpq %rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst. */
+	jb L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+	/* Save dest for storing aligning VECs later. */
+	movq %rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold most likely is
+	   candidate for NT moves as well. */
+	cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+	/* Only avoid short movsb if CPU has FSRM. */
+	testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz L(skip_short_movsb_check)
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0. */
+
+	/* ecx contains dst - src. Early check for backward copy
+	   conditions means only case of slow movsb with src = dst + [0,
+	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
+	   for that case. */
+	cmpl $-64, %ecx
+	ja L(more_8x_vec_forward)
+#  endif
+# endif
+# if ALIGN_MOVSB
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Fall through means cpu has FSRM. In that case exclusively
+	   align destination. */
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned. */
+	subq %rdi, %rsi
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64). */
+	addq $(MOVSB_ALIGN_TO - 1), %rdi
+	/* Add dst to len. Subtract back after dst aligned. */
+	leaq (%r8, %rdx), %rcx
+	/* Finish aligning dst. */
+	andq $-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.
+	 */
+	addq %rdi, %rsi
+	subq %rdi, %rcx
+
+	rep movsb
+
+	/* Store VECs loaded for aligning. */
+	VMOVU %VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU %VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else /* !ALIGN_MOVSB. */
+L(skip_short_movsb_check):
+	mov %RDX_LP, %RCX_LP
+	rep movsb
+	ret
+# endif
+#endif

+	.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	.p2align 4
+L(large_memcpy_2x_check):
+	cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
+	jb L(more_8x_vec_check)
L(large_memcpy_2x):
-	/* Compute absolute value of difference between source and
-	   destination. */
-	movq %rdi, %r9
-	subq %rsi, %r9
-	movq %r9, %r8
-	leaq -1(%r9), %rcx
-	sarq $63, %r8
-	xorq %r8, %r9
-	subq %r8, %r9
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache when
-	   source is loaded. */
-	cmpq %r9, %rdx
-	ja L(more_8x_vec_check)
+	/* To reach this point it is impossible for dst > src and
+	   overlap. Remaining to check is src > dst and overlap. rcx
+	   already contains dst - src. Negate rcx to get src - dst. If
+	   length > rcx then there is overlap and forward copy is best. */
+	negq %rcx
+	cmpq %rcx, %rdx
+	ja L(more_8x_vec_forward)

	/* Cache align destination. First store the first 64 bytes then
	   adjust alignments. */
-	VMOVU (%rsi), %VEC(8)
-#if VEC_SIZE < 64
-	VMOVU VEC_SIZE(%rsi), %VEC(9)
-#if VEC_SIZE < 32
-	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
-	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
-#endif
-#endif
-	VMOVU %VEC(8), (%rdi)
-#if VEC_SIZE < 64
-	VMOVU %VEC(9), VEC_SIZE(%rdi)
-#if VEC_SIZE < 32
-	VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
-	VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
-#endif
-#endif
+
+	/* First vec was also loaded into VEC(0). */
+# if VEC_SIZE < 64
+	VMOVU VEC_SIZE(%rsi), %VEC(1)
+#  if VEC_SIZE < 32
+	VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+#  endif
+# endif
+	VMOVU %VEC(0), (%rdi)
+# if VEC_SIZE < 64
+	VMOVU %VEC(1), VEC_SIZE(%rdi)
+#  if VEC_SIZE < 32
+	VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+#  endif
+# endif
+
	/* Adjust source, destination, and size. */
	movq %rdi, %r8
	andq $63, %r8
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
	/* Adjust length. */
	addq %r8, %rdx

-	/* Test if source and destination addresses will alias. If they do
-	   the larger pipeline in large_memcpy_4x alleviated the
+	/* Test if source and destination addresses will alias. If they
+	   do the larger pipeline in large_memcpy_4x alleviated the
	   performance drop. */
+
+	/* ecx contains -(dst - src). not ecx will return dst - src - 1
+	   which works for testing aliasing. */
+	notl %ecx
	testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
	jz L(large_memcpy_4x)

@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
	/* ecx stores inner loop counter. */
	movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
-	/* Only one prefetch set per page as doing 4 pages give more time
-	   for prefetcher to keep up. */
+	/* Only one prefetch set per page as doing 4 pages gives more
+	   time for prefetcher to keep up. */
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
@@ -0,0 +1,131 @@
commit cecbac52123456e2fbcff062a4165bf7b9174797
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Nov 1 00:49:52 2021 -0500

x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h

No bug.

This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks the vector copy loop, especially now that it handles 4k
aliasing, is better for these medium ranged copies.

On Skylake with ERMS:

Size, Align1, Align2, dst>src,(rep movsb) / (vec copy)
4096, 0, 0, 0, 0.975
4096, 0, 0, 1, 0.953
4096, 12, 0, 0, 0.969
4096, 12, 0, 1, 0.872
4096, 44, 0, 0, 0.979
4096, 44, 0, 1, 0.83
4096, 0, 12, 0, 1.006
4096, 0, 12, 1, 0.989
4096, 0, 44, 0, 0.739
4096, 0, 44, 1, 0.942
4096, 12, 12, 0, 1.009
4096, 12, 12, 1, 0.973
4096, 44, 44, 0, 0.791
4096, 44, 44, 1, 0.961
4096, 2048, 0, 0, 0.978
4096, 2048, 0, 1, 0.951
4096, 2060, 0, 0, 0.986
4096, 2060, 0, 1, 0.963
4096, 2048, 12, 0, 0.971
4096, 2048, 12, 1, 0.941
4096, 2060, 12, 0, 0.977
4096, 2060, 12, 1, 0.949
8192, 0, 0, 0, 0.85
8192, 0, 0, 1, 0.845
8192, 13, 0, 0, 0.937
8192, 13, 0, 1, 0.939
8192, 45, 0, 0, 0.932
8192, 45, 0, 1, 0.927
8192, 0, 13, 0, 0.621
8192, 0, 13, 1, 0.62
8192, 0, 45, 0, 0.53
8192, 0, 45, 1, 0.516
8192, 13, 13, 0, 0.664
8192, 13, 13, 1, 0.659
8192, 45, 45, 0, 0.593
8192, 45, 45, 1, 0.575
8192, 2048, 0, 0, 0.854
8192, 2048, 0, 1, 0.834
8192, 2061, 0, 0, 0.863
8192, 2061, 0, 1, 0.857
8192, 2048, 13, 0, 0.63
8192, 2048, 13, 1, 0.629
8192, 2061, 13, 0, 0.627
8192, 2061, 13, 1, 0.62

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
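
The resulting defaults can be restated as a small C helper. This is an
illustrative sketch of the dl-cacheinfo.h logic in the diff below;
default_rep_movsb_threshold is an invented name:

/* Default REP MOVSB threshold by vector width, per the new values in
   dl-cacheinfo.h: 16 KB for AVX512 (64-byte vectors), 8 KB for AVX
   (32-byte vectors), and the unchanged 2 KB for SSE2 (16 bytes).  */
static unsigned int
default_rep_movsb_threshold (unsigned int vec_size)
{
  if (vec_size >= 32)
    return 4096 * (vec_size / 16);
  return 2048 * (vec_size / 16);
}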

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
	/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
	unsigned int minimum_rep_movsb_threshold;
#endif
-	/* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */
+	/* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+	   VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
+	   threshold is 2048 * (VEC_SIZE / 16). */
	unsigned int rep_movsb_threshold;
	if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
	&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
	{
-	rep_movsb_threshold = 2048 * (64 / 16);
+	rep_movsb_threshold = 4096 * (64 / 16);
#if HAVE_TUNABLES
	minimum_rep_movsb_threshold = 64 * 8;
#endif
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
	else if (CPU_FEATURE_PREFERRED_P (cpu_features,
	AVX_Fast_Unaligned_Load))
	{
-	rep_movsb_threshold = 2048 * (32 / 16);
+	rep_movsb_threshold = 4096 * (32 / 16);
#if HAVE_TUNABLES
	minimum_rep_movsb_threshold = 32 * 8;
#endif
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index dd6e1d65c9490d4f..419313804d49cf65 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
	}
	x86_rep_movsb_threshold {
	type: SIZE_T
-	# Since there is overhead to set up REP MOVSB operation, REP MOVSB
-	# isn't faster on short data. The memcpy micro benchmark in glibc
-	# shows that 2KB is the approximate value above which REP MOVSB
-	# becomes faster than SSE2 optimization on processors with Enhanced
-	# REP MOVSB. Since larger register size can move more data with a
-	# single load and store, the threshold is higher with larger register
-	# size. Note: Since the REP MOVSB threshold must be greater than 8
-	# times of vector size and the default value is 2048 * (vector size
-	# / 16), the default value and the minimum value must be updated at
-	# run-time. NB: Don't set the default value since we can't tell if
-	# the tunable value is set by user or not [BZ #27069].
+	# Since there is overhead to set up REP MOVSB operation, REP
+	# MOVSB isn't faster on short data. The memcpy micro benchmark
+	# in glibc shows that 2KB is the approximate value above which
+	# REP MOVSB becomes faster than SSE2 optimization on processors
+	# with Enhanced REP MOVSB. Since larger register size can move
+	# more data with a single load and store, the threshold is
+	# higher with larger register size. Micro benchmarks show AVX
+	# REP MOVSB becomes faster approximately at 8KB. The AVX512
+	# threshold is extrapolated to 16KB. For machines with FSRM the
+	# threshold is universally set at 2112 bytes. Note: Since the
+	# REP MOVSB threshold must be greater than 8 times of vector
+	# size and the default value is 4096 * (vector size / 16), the
+	# default value and the minimum value must be updated at
+	# run-time. NB: Don't set the default value since we can't tell
+	# if the tunable value is set by user or not [BZ #27069].
	minval: 1
	}
	x86_rep_stosb_threshold {
File diff suppressed because it is too large
@@ -0,0 +1,104 @@
commit 4bbd0f866ad0ff197f72346f776ebee9b7e1a706
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Dec 3 15:29:25 2021 -0800

x86-64: Use notl in EVEX strcmp [BZ #28646]

Must use notl %edi here as lower bits are for CHAR comparisons
potentially out of range thus can be 0 without indicating mismatch.
This fixes BZ #28646.

Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02)
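
In C terms, the distinction the fix relies on looks like this. This is
an illustrative sketch, not glibc code; first_candidate_mask is an
invented name:

#include <stdint.h>

/* k holds one bit per CHAR, set when the CHARs matched and were
   non-null.  In the page-cross path the low bits can cover CHARs
   before the real start of the strings, so they may be 0 without
   meaning "mismatch".  */
static inline uint32_t
first_candidate_mask (uint32_t k)
{
  /* ~k (NOTL) keeps a set bit at exactly the positions that failed,
     so the garbage low bits can still be shifted away before the
     tzcnt that locates the first real mismatch.  */
  return ~k;
  /* By contrast k + 1 (the old INCL) only tests k == 0xffffffff: if a
     garbage low bit of k is 0, k + 1 fabricates a bogus bit 0 that
     looks like a mismatch at position 0.  */
}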

diff --git a/string/test-strcmp.c b/string/test-strcmp.c
index 7feababf4ddc5603..a0255b9625fbcedd 100644
--- a/string/test-strcmp.c
+++ b/string/test-strcmp.c
@@ -25,6 +25,7 @@
# define TEST_NAME "strcmp"
#endif
#include "test-string.h"
+#include <support/test-driver.h>

#ifdef WIDE
# include <wchar.h>
@@ -392,6 +393,32 @@ check2 (void)
	}
}

+static void
+check3 (void)
+{
+	size_t size = 0xd000 + 0x4000;
+	CHAR *s1, *s2;
+	CHAR *buffer1 = mmap (NULL, size, PROT_READ | PROT_WRITE,
+	MAP_PRIVATE | MAP_ANON, -1, 0);
+	CHAR *buffer2 = mmap (NULL, size, PROT_READ | PROT_WRITE,
+	MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (buffer1 == MAP_FAILED || buffer2 == MAP_FAILED)
+	error (EXIT_UNSUPPORTED, errno, "mmap failed");
+
+	s1 = (CHAR *) (buffer1 + 0x8f8 / sizeof (CHAR));
+	s2 = (CHAR *) (buffer2 + 0xcff3 / sizeof (CHAR));
+
+	STRCPY(s1, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/util/PathDocFileFactory.java"));
+	STRCPY(s2, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/taglets/ThrowsTaglet.java"));
+
+	int exp_result = SIMPLE_STRCMP (s1, s2);
+	FOR_EACH_IMPL (impl, 0)
+	check_result (impl, s1, s2, exp_result);
+
+	munmap ((void *) buffer1, size);
+	munmap ((void *) buffer2, size);
+}
+
int
test_main (void)
{
@@ -400,6 +427,7 @@ test_main (void)
	test_init ();
	check();
	check2 ();
+	check3 ();

	printf ("%23s", "");
	FOR_EACH_IMPL (impl, 0)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 82f12ac89bcae20b..6f5c4bf984da2b80 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -656,12 +656,13 @@ L(loop_cross_page):
	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
	VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
	kmovd %k3, %edi
+	/* Must use notl %edi here as lower bits are for CHAR
+	   comparisons potentially out of range thus can be 0 without
+	   indicating mismatch. */
+	notl %edi
# ifdef USE_AS_WCSCMP
	/* Don't use subl since it is the upper 8 bits of EDI below. */
-	notl %edi
	andl $0xff, %edi
-# else
-	incl %edi
# endif

# ifdef USE_AS_WCSCMP
@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
	VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
	kmovd %k3, %edi
+	/* Must use notl %edi here as lower bits are for CHAR
+	   comparisons potentially out of range thus can be 0 without
+	   indicating mismatch. */
+	notl %edi
# ifdef USE_AS_WCSCMP
	/* Don't use subl since it is the upper 8 bits of EDI below. */
-	notl %edi
	andl $0xff, %edi
-# else
-	incl %edi
# endif

# ifdef USE_AS_WCSCMP
@@ -0,0 +1,30 @@
commit f3a99b2216114f89b20329ae7664b764248b4bbd
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Dec 6 07:14:12 2021 -0800

x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI

Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
they won't lower CPU frequency when ZMM load and store instructions are
used.

(cherry picked from commit ceeffe968c01b1202e482f4855cb6baf5c6cb713)

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index f4d4049e391cbabd..09590d8794b1c6fb 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -566,8 +566,11 @@ disable_tsx:
	|= bit_arch_Prefer_No_VZEROUPPER;
	else
	{
-	cpu_features->preferred[index_arch_Prefer_No_AVX512]
-	|= bit_arch_Prefer_No_AVX512;
+	/* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
+	   when ZMM load and store instructions are used. */
+	if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
+	cpu_features->preferred[index_arch_Prefer_No_AVX512]
+	|= bit_arch_Prefer_No_AVX512;

	/* Avoid RTM abort triggered by VZEROUPPER inside a
	   transactionally executing RTM region. */
@@ -0,0 +1,384 @@
commit c796418d00f65c8c5fbed477f3ba6da2bee64ece
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Dec 24 18:54:41 2021 -0600

x86: Optimize L(less_vec) case in memcmp-evex-movbe.S

No bug.
Optimizations are twofold.

1) Replace page cross and 0/1 checks with masked load instructions in
L(less_vec). In applications this reduces branch-misses in the
hot [0, 32] case.
2) Change control flow so that L(less_vec) case gets the fall through.

Change 2) helps copies in the [0, 32] size range but comes at the cost
of copies in the [33, 64] size range. From profiles of GCC and
Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
appears to be the right tradeoff.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit abddd61de090ae84e380aff68a98bd94ef704667)
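
The masked-load trick in change 1) reduces to building a length-limited
bit mask, sketched here in C. This is illustrative; less_vec_mask is an
invented name, and len <= 32 is assumed:

#include <stdint.h>

/* Build a mask with the low `len' bits set (what BZHIL computes from
   an all-ones register).  Loading through this mask means the excluded
   bytes are never touched, so no page-cross logic is needed for sizes
   in [0, 32].  */
static inline uint32_t
less_vec_mask (unsigned int len)   /* len <= 32 */
{
  return (uint32_t) (((uint64_t) 1 << len) - 1);
}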

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 640f6757fac8a356..d2899e7c7078cd41 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -62,15 +62,18 @@ Latency:
# define VMOVU vmovdqu64

# ifdef USE_AS_WMEMCMP
+# define VMOVU_MASK vmovdqu32
# define CHAR_SIZE 4
# define VPCMP vpcmpd
# define VPTEST vptestmd
# else
+# define VMOVU_MASK vmovdqu8
# define CHAR_SIZE 1
# define VPCMP vpcmpub
# define VPTEST vptestmb
# endif

+
# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
	movl %edx, %edx
# endif
	cmp $CHAR_PER_VEC, %RDX_LP
-	jb L(less_vec)
+	/* Fall through for [0, VEC_SIZE] as it's the hottest. */
+	ja L(more_1x_vec)
+
+	/* Create mask for CHAR's we want to compare. This allows us to
+	   avoid having to include page cross logic. */
+	movl $-1, %ecx
+	bzhil %edx, %ecx, %ecx
+	kmovd %ecx, %k2
+
+	/* Safe to load full ymm with mask. */
+	VMOVU_MASK (%rsi), %YMM2{%k2}
+	VPCMP $4,(%rdi), %YMM2, %k1{%k2}
+	kmovd %k1, %eax
+	testl %eax, %eax
+	jnz L(return_vec_0)
+	ret

+	.p2align 4
+L(return_vec_0):
+	tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl (%rdi, %rax, CHAR_SIZE), %ecx
+	xorl %edx, %edx
+	cmpl (%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above. */
+	setg %dl
+	leal -1(%rdx, %rdx), %eax
+# else
+	movzbl (%rsi, %rax), %ecx
+	movzbl (%rdi, %rax), %eax
+	subl %ecx, %eax
+# endif
+	ret
+
+
+	.p2align 4
+L(more_1x_vec):
	/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
	VMOVU (%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch. */
-	VPCMP $4, (%rdi), %YMM1, %k1
+	VPCMP $4,(%rdi), %YMM1, %k1
	kmovd %k1, %eax
	/* NB: eax must be destination register if going to
	   L(return_vec_[0,2]). For L(return_vec_3) destination register
@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)

	/* Check third and fourth VEC no matter what. */
	VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd %k1, %eax
	testl %eax, %eax
	jnz L(return_vec_2)

	VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd %k1, %ecx
	testl %ecx, %ecx
	jnz L(return_vec_3)
@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
	VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
	   oring with YMM1. Result is stored in YMM4. */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4. */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
	/* NB: eax must be zero to reach here. */
	ret

-	.p2align 4
+
+	.p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
	movq %rdx, %rdi
L(8x_return_vec_0_1_2_3):
@@ -222,23 +262,6 @@ L(return_vec_3):
# endif
	ret

-	.p2align 4
-L(return_vec_0):
-	tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCMP
-	movl (%rdi, %rax, CHAR_SIZE), %ecx
-	xorl %edx, %edx
-	cmpl (%rsi, %rax, CHAR_SIZE), %ecx
-	/* NB: no partial register stall here because xorl zero idiom
-	   above. */
-	setg %dl
-	leal -1(%rdx, %rdx), %eax
-# else
-	movzbl (%rsi, %rax), %ecx
-	movzbl (%rdi, %rax), %eax
-	subl %ecx, %eax
-# endif
-	ret

	.p2align 4
L(return_vec_1):
@@ -297,7 +320,7 @@ L(loop_4x_vec):
|
||||
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
|
||||
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
VPTEST %YMM4, %YMM4, %k1
|
||||
kmovd %k1, %ecx
|
||||
@@ -324,7 +347,7 @@ L(loop_4x_vec):
|
||||
VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
|
||||
vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
|
||||
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
|
||||
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
VPTEST %YMM4, %YMM4, %k1
|
||||
kmovd %k1, %ecx
|
||||
@@ -336,14 +359,14 @@ L(loop_4x_vec):
|
||||
/* Only entry is from L(more_8x_vec). */
|
||||
.p2align 4,, 10
|
||||
L(8x_last_2x_vec):
|
||||
- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
|
||||
+ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(8x_return_vec_2)
|
||||
/* Naturally aligned to 16 bytes. */
|
||||
L(8x_last_1x_vec):
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
|
||||
- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
|
||||
+ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(8x_return_vec_3)
|
||||
@@ -392,7 +415,9 @@ L(last_1x_vec):
|
||||
jnz L(return_vec_0_end)
|
||||
ret
|
||||
|
||||
- .p2align 4,, 10
|
||||
+
|
||||
+ /* Don't align. Takes 2-fetch blocks either way and aligning
|
||||
+ will cause code to spill into another cacheline. */
|
||||
L(return_vec_1_end):
|
||||
/* Use bsf to save code size. This is necessary to have
|
||||
L(one_or_less) fit in aligning bytes between. */
|
||||
@@ -411,31 +436,8 @@ L(return_vec_1_end):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- /* NB: L(one_or_less) fits in alignment padding between
|
||||
- L(return_vec_1_end) and L(return_vec_0_end). */
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
-L(one_or_less):
|
||||
- jb L(zero)
|
||||
- movl (%rdi), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi), %ecx
|
||||
- je L(zero)
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
- ret
|
||||
-# else
|
||||
-L(one_or_less):
|
||||
- jb L(zero)
|
||||
- movzbl (%rsi), %ecx
|
||||
- movzbl (%rdi), %eax
|
||||
- subl %ecx, %eax
|
||||
- ret
|
||||
-# endif
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
+ /* Don't align. Takes 2-fetch blocks either way and aligning
|
||||
+ will cause code to spill into another cacheline. */
|
||||
L(return_vec_0_end):
|
||||
tzcntl %eax, %eax
|
||||
addl %edx, %eax
|
||||
@@ -451,146 +453,7 @@ L(return_vec_0_end):
|
||||
subl %ecx, %eax
|
||||
# endif
|
||||
ret
|
||||
+ /* 1-byte until next cache line. */
|
||||
|
||||
- .p2align 4
|
||||
-L(less_vec):
|
||||
- /* Check if one or less CHAR. This is necessary for size == 0
|
||||
- but is also faster for size == CHAR_SIZE. */
|
||||
- cmpl $1, %edx
|
||||
- jbe L(one_or_less)
|
||||
-
|
||||
- /* Check if loading one VEC from either s1 or s2 could cause a
|
||||
- page cross. This can have false positives but is by far the
|
||||
- fastest method. */
|
||||
- movl %edi, %eax
|
||||
- orl %esi, %eax
|
||||
- andl $(PAGE_SIZE - 1), %eax
|
||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
- jg L(page_cross_less_vec)
|
||||
-
|
||||
- /* No page cross possible. */
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMP $4, (%rdi), %YMM2, %k1
|
||||
- kmovd %k1, %eax
|
||||
- /* Check if any matches where in bounds. Intentionally not
|
||||
- storing result in eax to limit dependency chain if it goes to
|
||||
- L(return_vec_0_lv). */
|
||||
- bzhil %edx, %eax, %edx
|
||||
- jnz L(return_vec_0_lv)
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-
|
||||
- /* Essentially duplicate of L(return_vec_0). Ends up not costing
|
||||
- any code as shrinks L(less_vec) by allowing 2-byte encoding of
|
||||
- the jump and ends up fitting in aligning bytes. As well fits on
|
||||
- same cache line as L(less_vec) so also saves a line from having
|
||||
- to be fetched on cold calls to memcmp. */
|
||||
- .p2align 4,, 4
|
||||
-L(return_vec_0_lv):
|
||||
- tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl (%rdi, %rax, CHAR_SIZE), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi, %rax, CHAR_SIZE), %ecx
|
||||
- /* NB: no partial register stall here because xorl zero idiom
|
||||
- above. */
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl (%rsi, %rax), %ecx
|
||||
- movzbl (%rdi, %rax), %eax
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(page_cross_less_vec):
|
||||
- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
|
||||
- bytes. */
|
||||
- cmpl $(16 / CHAR_SIZE), %edx
|
||||
- jae L(between_16_31)
|
||||
-# ifndef USE_AS_WMEMCMP
|
||||
- cmpl $8, %edx
|
||||
- jae L(between_8_15)
|
||||
- cmpl $4, %edx
|
||||
- jb L(between_2_3)
|
||||
-
|
||||
- /* Load as big endian with overlapping movbe to avoid branches.
|
||||
- */
|
||||
- movbe (%rdi), %eax
|
||||
- movbe (%rsi), %ecx
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
- movbe -4(%rdi, %rdx), %edi
|
||||
- movbe -4(%rsi, %rdx), %esi
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- /* edx is guranteed to be positive int32 in range [4, 7]. */
|
||||
- cmovne %edx, %eax
|
||||
- /* ecx is -1 if rcx > rax. Otherwise 0. */
|
||||
- sbbl %ecx, %ecx
|
||||
- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
|
||||
- rax then eax and ecx are zero. If rax < rax then ecx is -1 so
|
||||
- eax doesn't matter. */
|
||||
- orl %ecx, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(between_8_15):
|
||||
-# endif
|
||||
- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
||||
- vmovq (%rdi), %xmm1
|
||||
- vmovq (%rsi), %xmm2
|
||||
- VPCMP $4, %xmm1, %xmm2, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0_lv)
|
||||
- /* Use overlapping loads to avoid branches. */
|
||||
- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
|
||||
- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
- VPCMP $4, %xmm1, %xmm2, %k1
|
||||
- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(between_16_31):
|
||||
- /* From 16 to 31 bytes. No branch when size == 16. */
|
||||
-
|
||||
- /* Use movups to save code size. */
|
||||
- vmovdqu (%rsi), %xmm2
|
||||
- VPCMP $4, (%rdi), %xmm2, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0_lv)
|
||||
- /* Use overlapping loads to avoid branches. */
|
||||
- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
|
||||
- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0_end)
|
||||
- ret
|
||||
-
|
||||
-# ifndef USE_AS_WMEMCMP
|
||||
-L(between_2_3):
|
||||
- /* Load as big endian to avoid branches. */
|
||||
- movzwl (%rdi), %eax
|
||||
- movzwl (%rsi), %ecx
|
||||
- shll $8, %eax
|
||||
- shll $8, %ecx
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
- movzbl -1(%rdi, %rdx), %edi
|
||||
- movzbl -1(%rsi, %rdx), %esi
|
||||
- orl %edi, %eax
|
||||
- orl %esi, %ecx
|
||||
- /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
- subl %ecx, %eax
|
||||
- ret
|
||||
-# endif
|
||||
END (MEMCMP)
|
||||
#endif
|
|
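The heart of this patch (upstream "x86: Optimize L(less_vec) case in memcmp-evex-movbe.S") is the masked load at the top: movl $-1 plus bzhil build a value whose low size bits are set, kmovd moves it into %k2, and the {%k2}-qualified load and compare then touch only the bytes actually being compared. A short memcmp can therefore never read past either buffer, which is what lets the old L(less_vec) and L(page_cross_less_vec) code be deleted wholesale. A minimal C intrinsics sketch of the same technique, assuming an AVX512VL/AVX512BW/BMI2 target; the function name and scalar return path are illustrative, not glibc's:

#include <immintrin.h>
#include <stddef.h>

/* memcmp for n <= 32: one masked load per buffer, no page-cross logic.
   Masked-off bytes are never loaded, so they cannot fault.  */
static int
memcmp_small (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  __mmask32 k = _bzhi_u32 (0xffffffff, (unsigned int) n); /* low n bits set */
  __m256i a = _mm256_maskz_loadu_epi8 (k, s1);
  __m256i b = _mm256_maskz_loadu_epi8 (k, s2);
  __mmask32 neq = _mm256_mask_cmpneq_epu8_mask (k, a, b);
  if (neq == 0)
    return 0;
  unsigned int i = _tzcnt_u32 (neq);  /* index of first mismatching byte */
  return s1[i] - s2[i];
}

The wmemcmp return path in L(return_vec_0) (xorl; cmpl; setg; leal -1(%rdx, %rdx)) is the branchless scalar counterpart: it maps the ordered comparison to -1 or 1 without a conditional jump (the equal case cannot reach that label, since %eax was already known nonzero).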
@@ -0,0 +1,42 @@
commit 9681691402052b727e01ae3375c73e0f76566593
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Wed Apr 27 13:59:26 2022 -0300

linux: Fix missing internal 64 bit time_t stat usage

These are two missing spots initially done by 52a5fe70a2c77935.

Checked on i686-linux-gnu.

(cherry picked from commit 834ddd0432f68d6dc85b6aac95065721af0d86e9)

diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c
index 13160d32499c4e58..00e4ce7f80ee2dfe 100644
--- a/sysdeps/unix/sysv/linux/faccessat.c
+++ b/sysdeps/unix/sysv/linux/faccessat.c
@@ -39,8 +39,8 @@ __faccessat (int fd, const char *file, int mode, int flag)
if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
return INLINE_SYSCALL (faccessat, 3, fd, file, mode);

- struct stat64 stats;
- if (__fstatat64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
+ struct __stat64_t64 stats;
+ if (__fstatat64_time64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
return -1;

mode &= (X_OK | W_OK | R_OK); /* Clear any bogus bits. */
diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c
index b599a66c930cad4d..f79930303118ebcd 100644
--- a/sysdeps/unix/sysv/linux/pathconf.c
+++ b/sysdeps/unix/sysv/linux/pathconf.c
@@ -110,8 +110,8 @@ distinguish_extX (const struct statfs *fsbuf, const char *file, int fd)
&& strcmp (mntbuf.mnt_type, "ext4") != 0)
continue;

- struct stat64 fsst;
- if (__stat64 (mntbuf.mnt_dir, &fsst) >= 0
+ struct __stat64_t64 fsst;
+ if (__stat64_time64 (mntbuf.mnt_dir, &fsst) >= 0
&& st.st_dev == fsst.st_dev)
{
if (strcmp (mntbuf.mnt_type, "ext4") == 0)
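Both hunks matter only on 32-bit targets such as i686, where the legacy struct stat64 still carries 32-bit time fields; the internal struct __stat64_t64 and the *_time64 calls always use 64-bit timestamps. Application code gets the equivalent guarantee by selecting 64-bit time_t at build time, which glibc supports since 2.34. A small illustrative check (the file path is arbitrary):

/* Build 32-bit with 64-bit time_t:
   gcc -m32 -D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64 stat64-check.c  */
#include <sys/stat.h>
#include <stdio.h>

int
main (void)
{
  struct stat st;
  if (stat ("/etc/passwd", &st) != 0)
    return 1;
  /* With _TIME_BITS=64 this stays correct past 2038-01-19.  */
  printf ("mtime=%lld, sizeof (time_t)=%zu\n",
          (long long) st.st_mtime, sizeof (time_t));
  return 0;
}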
@@ -0,0 +1,39 @@
commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe
Author: Carlos O'Donell <carlos@redhat.com>
Date: Tue Apr 26 10:52:41 2022 -0400

i386: Regenerate ulps

These failures were caught while building glibc master for Fedora
Rawhide which is built with '-mtune=generic -msse2 -mfpmath=sse'
using gcc 11.3 (gcc-11.3.1-2.fc35) on a Cascadelake Intel Xeon
processor.

(cherry picked from commit e465d97653311c3687aee49de782177353acfe86)

diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index 7601049110789201..84e6686eba5fe79a 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -668,7 +668,7 @@ ldouble: 4

Function: Imaginary part of "clog10":
double: 2
-float: 1
+float: 2
float128: 2
ldouble: 2

diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
index a39c89cec1141935..cc21e6907fe8b6a3 100644
--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -668,7 +668,7 @@ ldouble: 4

Function: Imaginary part of "clog10":
double: 2
-float: 1
+float: 2
float128: 2
ldouble: 2
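For readers who have not met these files: each entry is a maximum accepted error in ulps (units in the last place), so this change loosens the tolerance for the imaginary part of clog10f from 1 to 2 ulps on that compiler/CPU combination; the files are regenerated from test runs (glibc's regen-ulps make target) rather than edited by hand. For finite floats of the same sign, the ulp distance is simply the difference of the two bit patterns, as this hedged sketch shows (the reference value is made up; link with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bit pattern of a float without aliasing UB.  */
static uint32_t
float_bits (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

/* ulp distance; valid for finite floats of the same sign.  */
static uint32_t
ulp_distance (float a, float b)
{
  uint32_t ua = float_bits (a), ub = float_bits (b);
  return ua > ub ? ua - ub : ub - ua;
}

int
main (void)
{
  float ref = 0.30103f;  /* stand-in for a correctly rounded result */
  float computed = nextafterf (nextafterf (ref, 1.0f), 1.0f);
  printf ("%u ulps\n", ulp_distance (computed, ref));  /* prints 2 */
  return 0;
}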
glibc.spec (56 changed lines)
@@ -148,7 +148,7 @@ end \
Summary: The GNU libc libraries
Name: glibc
Version: %{glibcversion}
-Release: 30%{?dist}
+Release: 31%{?dist}

# In general, GPLv2+ is used by programs, LGPLv2+ is used for
# libraries.
@@ -379,17 +379,17 @@ Patch175: glibc-rh2058224-2.patch
Patch176: glibc-rh2058230.patch
Patch177: glibc-rh2054789.patch
Patch178: glibc-upstream-2.34-108.patch
-Patch179: glibc-upstream-2.34-110.patch
# glibc-2.34-109-gd64b08d5ba only changes NEWS.
+Patch179: glibc-upstream-2.34-110.patch
Patch180: glibc-upstream-2.34-111.patch
Patch181: glibc-upstream-2.34-112.patch
Patch182: glibc-upstream-2.34-113.patch
Patch183: glibc-upstream-2.34-114.patch
+# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
+# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
Patch184: glibc-upstream-2.34-117.patch
Patch185: glibc-upstream-2.34-118.patch
Patch186: glibc-upstream-2.34-119.patch
-# glibc-2.34-115-gd5d1c95aaf only changes NEWS.
-# glibc-2.34-116-g852361b5a3 is glibc-rh2054789.patch.
Patch187: glibc-upstream-2.34-120.patch
Patch188: glibc-upstream-2.34-121.patch
Patch189: glibc-upstream-2.34-122.patch
@@ -437,6 +437,28 @@ Patch229: glibc-upstream-2.34-163.patch
Patch230: glibc-upstream-2.34-164.patch
Patch231: glibc-upstream-2.34-165.patch
Patch232: glibc-upstream-2.34-166.patch
+Patch233: glibc-upstream-2.34-167.patch
+Patch234: glibc-upstream-2.34-168.patch
+Patch235: glibc-upstream-2.34-169.patch
+Patch236: glibc-upstream-2.34-170.patch
+Patch237: glibc-upstream-2.34-171.patch
+Patch238: glibc-upstream-2.34-172.patch
+Patch239: glibc-upstream-2.34-173.patch
+Patch240: glibc-upstream-2.34-174.patch
+Patch241: glibc-upstream-2.34-175.patch
+Patch242: glibc-upstream-2.34-176.patch
+Patch243: glibc-upstream-2.34-177.patch
+Patch244: glibc-upstream-2.34-178.patch
+Patch245: glibc-upstream-2.34-179.patch
+Patch246: glibc-upstream-2.34-180.patch
+Patch247: glibc-upstream-2.34-181.patch
+Patch248: glibc-upstream-2.34-182.patch
+Patch249: glibc-upstream-2.34-183.patch
+Patch250: glibc-upstream-2.34-184.patch
+Patch251: glibc-upstream-2.34-185.patch
+Patch252: glibc-upstream-2.34-186.patch
+Patch253: glibc-upstream-2.34-187.patch
+Patch254: glibc-upstream-2.34-188.patch

##############################################################################
# Continued list of core "glibc" package information:
@@ -2493,6 +2515,32 @@ fi
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared

%changelog
+* Wed Apr 27 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-31
+- Sync with upstream branch release/2.34/master,
+  commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe:
+- i386: Regenerate ulps
+- linux: Fix missing internal 64 bit time_t stat usage
+- x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+- x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
+- x86-64: Use notl in EVEX strcmp [BZ #28646]
+- x86: Shrink memcmp-sse4.S code size
+- x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
+- x86: Optimize memmove-vec-unaligned-erms.S
+- x86-64: Replace movzx with movzbl
+- x86-64: Remove Prefer_AVX2_STRCMP
+- x86-64: Improve EVEX strcmp with masked load
+- x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
+- x86: Optimize memset-vec-unaligned-erms.S
+- x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
+- x86: Modify ENTRY in sysdep.h so that p2align can be specified
+- x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
+- scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier
+- dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078)
+- INSTALL: Rephrase -with-default-link documentation
+- misc: Fix rare fortify crash on wchar funcs. [BZ 29030]
+- Default to --with-default-link=no (bug 25812)
+- scripts: Add glibcelf.py module
+
* Thu Apr 21 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-30
- Sync with upstream branch release/2.34/master,
  commit 71326f1f2fd09dafb9c34404765fb88129e94237: