From 624dbc2790d4dda7c34bb6a8cf6c3d19887bf731 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 11 Jun 2018 17:37:06 +0200 Subject: [PATCH 01/44] Split sequential libraries from base package to openblas-serial. --- openblas.spec | 52 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/openblas.spec b/openblas.spec index ec6749e..8911488 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.0 -Release: 1%{?dist} +Release: 2%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -97,90 +97,111 @@ Computational Science, ISCAS. http://www.rdcps.ac.cn %{base_description} %package Rblas -Summary: A version of OpenBLAS for R to use as libRblas -Group: Development/Libraries +Summary: A version of OpenBLAS for R to use as libRblas +Group: Development/Libraries %description Rblas %{base_description} +%package serial +Summary: An optimized BLAS library based on GotoBLAS2, serial version +Group: Development/Libraries +Requires: %{name} = %{version}-%{release} + +%description serial +%{base_description} + +This package contains the sequential library compiled with a 32-bit +integer interface. + %package openmp Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version Group: Development/Libraries +Requires: %{name} = %{version}-%{release} %description openmp %{base_description} -This package contains the library compiled with OpenMP support. +This package contains the library compiled with OpenMP support with +32-bit integer interface. %package threads Summary: An optimized BLAS library based on GotoBLAS2, pthreads version Group: Development/Libraries +Requires: %{name} = %{version}-%{release} %description threads %{base_description} -This package contains the library compiled with threading support. +This package contains the library compiled with threading support and +a 32-bit integer interface. 
%if %build64 %package serial64 Summary: An optimized BLAS library based on GotoBLAS2, serial version Group: Development/Libraries +Requires: %{name} = %{version}-%{release} %description serial64 %{base_description} This package contains the sequential library compiled with a 64-bit -interface. +integer interface. %package openmp64 Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version Group: Development/Libraries +Requires: %{name} = %{version}-%{release} %description openmp64 %{base_description} This package contains the library compiled with OpenMP support and -64-bit interface. +64-bit integer interface. %package threads64 Summary: An optimized BLAS library based on GotoBLAS2, pthreads version Group: Development/Libraries +Requires: %{name} = %{version}-%{release} %description threads64 %{base_description} This package contains the library compiled with threading support and -64-bit interface. +64-bit integer interface. %package serial64_ Summary: An optimized BLAS library based on GotoBLAS2, serial version Group: Development/Libraries +Requires: %{name} = %{version}-%{release} %description serial64_ %{base_description} This package contains the sequential library compiled with a 64-bit -interface and a symbol name suffix. +integer interface and a symbol name suffix. %package openmp64_ Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version Group: Development/Libraries +Requires: %{name} = %{version}-%{release} %description openmp64_ %{base_description} This package contains the library compiled with OpenMP support and -64-bit interface and a symbol name suffix. +64-bit integer interface and a symbol name suffix. %package threads64_ Summary: An optimized BLAS library based on GotoBLAS2, pthreads version Group: Development/Libraries +Requires: %{name} = %{version}-%{release} %description threads64_ %{base_description} This package contains the library compiled with threading support and -64-bit interface and a symbol name suffix. 
+64-bit integer interface and a symbol name suffix. %endif @@ -188,6 +209,7 @@ This package contains the library compiled with threading support and Summary: Development headers and libraries for OpenBLAS Group: Development/Libraries Requires: %{name}%{?_isa} = %{version}-%{release} +Requires: %{name}-serial%{?_isa} = %{version}-%{release} Requires: %{name}-openmp%{?_isa} = %{version}-%{release} Requires: %{name}-threads%{?_isa} = %{version}-%{release} %if %build64 @@ -594,7 +616,10 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %files -%doc serial/Changelog.txt serial/GotoBLAS* serial/LICENSE +%license serial/LICENSE +%doc serial/Changelog.txt serial/GotoBLAS* + +%files serial %{_libdir}/lib%{name}-*.so %{_libdir}/lib%{name}.so.* @@ -663,6 +688,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Mon Jun 11 2018 Susi Lehtola - 0.3.0-2 +- Split sequential libraries from core package to openblas-serial. + * Thu May 24 2018 Susi Lehtola - 0.3.0-1 - Update to 0.3.0. From 7b9322f323d505dc62d44247a21b0d9905bbcbfd Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sun, 1 Jul 2018 19:14:18 +0200 Subject: [PATCH 02/44] Update to 0.3.1. --- .gitignore | 1 + openblas.spec | 9 ++++++--- sources | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index c8e184f..f5cf4d3 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ /v0.2.18.tar.gz /v0.2.19.tar.gz /v0.3.0.tar.gz +/v0.3.1.tar.gz diff --git a/openblas.spec b/openblas.spec index 8911488..d0ae6c3 100644 --- a/openblas.spec +++ b/openblas.spec @@ -14,8 +14,8 @@ # "obsoleted" features are still kept in the spec. 
Name: openblas -Version: 0.3.0 -Release: 2%{?dist} +Version: 0.3.1 +Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -364,7 +364,7 @@ LAPACKE="NO_LAPACKE=1" NMAX="NUM_THREADS=128" %ifarch %{ix86} x86_64 -TARGET="TARGET=CORE2 DYNAMIC_ARCH=1" +TARGET="TARGET=CORE2 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" # Compability for old versions of GCC %if 0%{?rhel} == 5 @@ -688,6 +688,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Sun Jul 01 2018 Susi Lehtola - 0.3.1-1 +- Update to 0.3.1. + * Mon Jun 11 2018 Susi Lehtola - 0.3.0-2 - Split sequential libraries from core package to openblas-serial. diff --git a/sources b/sources index 24188fc..845060a 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (v0.3.0.tar.gz) = 6a982d2adc13febc162d5c0077cedc116c771409ee1dbb651da6a073e5f6a439e1d0eba0349b3b2e506b274d3014adbf48fc96625ed942a61a54a0c936576b89 +SHA512 (v0.3.1.tar.gz) = 4c8d50114378518c7c85b92dd52cb649290f3bdc119316c0e236ea56353447d1fcef18afb344d4f125ec8b379b10f9d43ab008316517d897bf73e8f0000ee8f7 From 7647c49de045d70594dadaada4b3dfbb51d2d28f Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sun, 1 Jul 2018 19:25:36 +0200 Subject: [PATCH 03/44] Drop obsoleted patch. 
--- 1572.patch | 23 ----------------------- openblas.spec | 3 --- 2 files changed, 26 deletions(-) delete mode 100644 1572.patch diff --git a/1572.patch b/1572.patch deleted file mode 100644 index 76247ac..0000000 --- a/1572.patch +++ /dev/null @@ -1,23 +0,0 @@ -From 961d25e9c7e4a1758adb1dbeaa15187de69dd052 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 23 May 2018 22:54:39 +0200 -Subject: [PATCH] Use the new zrot.c on POWER8 for crot as well - -fixes #1571 (the old zrot.S assembly does not handle incx=0 correctly) ---- - kernel/power/KERNEL.POWER8 | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 -index 00ff8682a..1aa061078 100644 ---- a/kernel/power/KERNEL.POWER8 -+++ b/kernel/power/KERNEL.POWER8 -@@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c - # - SROTKERNEL = srot.c - DROTKERNEL = drot.c --#CROTKERNEL = ../arm/zrot.c -+CROTKERNEL = zrot.c - ZROTKERNEL = zrot.c - # - SSCALKERNEL = sscal.c diff --git a/openblas.spec b/openblas.spec index d0ae6c3..cd6c4f5 100644 --- a/openblas.spec +++ b/openblas.spec @@ -29,8 +29,6 @@ Patch1: openblas-0.2.5-libname.patch Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.2.19-tests.patch -# Upstream pull 1572, fixes fail on ppc64le -Patch4: 1572.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -252,7 +250,6 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests -%patch4 -p1 -b .ppc64le # Fix source permissions find -name \*.f -exec chmod 644 {} \; From 1f6eccf36004a4aebf660f03c306f448e7ec4c2d Mon Sep 17 00:00:00 2001 From: Fedora Release Engineering Date: Fri, 13 Jul 2018 15:02:45 +0000 Subject: [PATCH 04/44] - Rebuilt for https://fedoraproject.org/wiki/Fedora_29_Mass_Rebuild Signed-off-by: Fedora Release Engineering --- openblas.spec | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 
cd6c4f5..0d3c174 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.1 -Release: 1%{?dist} +Release: 2%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -685,6 +685,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Fri Jul 13 2018 Fedora Release Engineering - 0.3.1-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_29_Mass_Rebuild + * Sun Jul 01 2018 Susi Lehtola - 0.3.1-1 - Update to 0.3.1. From a5bfa867a0624625cc1224ca3c37855c00f9e146 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sun, 22 Jul 2018 17:05:31 +0200 Subject: [PATCH 05/44] Fix crash with multiple instances. --- 1695.patch | 24 ++++++++++++++++++++++++ openblas.spec | 6 ++++++ 2 files changed, 30 insertions(+) create mode 100644 1695.patch diff --git a/1695.patch b/1695.patch new file mode 100644 index 0000000..cf1573f --- /dev/null +++ b/1695.patch @@ -0,0 +1,24 @@ +From 43ac839c168c652e52320267b0504e6933cb9f60 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sun, 22 Jul 2018 09:19:19 +0200 +Subject: [PATCH] Unset memory table entry, not just the temporary pointer to + it on shutdown + +to fix crash with multiple instances of OpenBLAS, #1692 +--- + driver/others/memory.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/driver/others/memory.c b/driver/others/memory.c +index 98bcfb216..3bf6ba019 100644 +--- a/driver/others/memory.c ++++ b/driver/others/memory.c +@@ -1279,7 +1279,7 @@ void blas_shutdown(void){ + struct alloc_t *alloc_info = local_memory_table[thread][pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); +- alloc_info = (void *)0; ++ local_memory_table[thread][pos] = (void *)0; + } + } + } diff --git a/openblas.spec b/openblas.spec index 0d3c174..31dd6af 100644 --- a/openblas.spec +++ b/openblas.spec @@ -29,6 +29,8 @@ Patch1: openblas-0.2.5-libname.patch Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the 
test makefile Patch3: openblas-0.2.19-tests.patch +# Fix crash with multiple instances +Patch4: https://github.com/xianyi/OpenBLAS/pull/1695.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -250,6 +252,7 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests +%patch4 -p1 -b .multiinst # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -685,6 +688,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Sun Jul 22 2018 Susi Lehtola - 0.3.1-3 +- Fix crash with multiple instances (BZ #1605231). + * Fri Jul 13 2018 Fedora Release Engineering - 0.3.1-2 - Rebuilt for https://fedoraproject.org/wiki/Fedora_29_Mass_Rebuild From 0e4914d7aa2aa4fcebefceb5600387f27d06eefb Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 23 Jul 2018 09:21:48 +0200 Subject: [PATCH 06/44] Fix build on s390x. --- openblas.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 31dd6af..39e2384 100644 --- a/openblas.spec +++ b/openblas.spec @@ -461,7 +461,7 @@ suffix="_power8" suffix="_armv8" %endif %ifarch s390x -suffix="_zarch_generic" +suffix="__z13" %endif slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a From 3ede871da17486491d573270ff5455b8506941bf Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 23 Jul 2018 10:47:32 +0200 Subject: [PATCH 07/44] Only one underscore necessary. 
--- openblas.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 39e2384..cf96c0c 100644 --- a/openblas.spec +++ b/openblas.spec @@ -461,7 +461,7 @@ suffix="_power8" suffix="_armv8" %endif %ifarch s390x -suffix="__z13" +suffix="_z13" %endif slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a From 38b97aef71dfd4cbbacf5917fda2216bd3617c77 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Thu, 2 Aug 2018 14:07:57 +0200 Subject: [PATCH 08/44] Update to 0.3.2. --- .gitignore | 1 + 1695.patch | 24 ------------------------ openblas.spec | 12 ++++++------ sources | 2 +- 4 files changed, 8 insertions(+), 31 deletions(-) delete mode 100644 1695.patch diff --git a/.gitignore b/.gitignore index f5cf4d3..9b6016d 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ /v0.2.19.tar.gz /v0.3.0.tar.gz /v0.3.1.tar.gz +/openblas-0.3.2.tar.gz diff --git a/1695.patch b/1695.patch deleted file mode 100644 index cf1573f..0000000 --- a/1695.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 43ac839c168c652e52320267b0504e6933cb9f60 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sun, 22 Jul 2018 09:19:19 +0200 -Subject: [PATCH] Unset memory table entry, not just the temporary pointer to - it on shutdown - -to fix crash with multiple instances of OpenBLAS, #1692 ---- - driver/others/memory.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/driver/others/memory.c b/driver/others/memory.c -index 98bcfb216..3bf6ba019 100644 ---- a/driver/others/memory.c -+++ b/driver/others/memory.c -@@ -1279,7 +1279,7 @@ void blas_shutdown(void){ - struct alloc_t *alloc_info = local_memory_table[thread][pos]; - if (alloc_info) { - alloc_info->release_func(alloc_info); -- alloc_info = (void *)0; -+ local_memory_table[thread][pos] = (void *)0; - } - } - } diff --git a/openblas.spec b/openblas.spec index cf96c0c..ea087b4 100644 --- 
a/openblas.spec +++ b/openblas.spec @@ -14,13 +14,13 @@ # "obsoleted" features are still kept in the spec. Name: openblas -Version: 0.3.1 -Release: 2%{?dist} +Version: 0.3.2 +Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD URL: https://github.com/xianyi/OpenBLAS/ -Source0: https://github.com/xianyi/OpenBLAS/archive/v%{version}.tar.gz +Source0: https://github.com/xianyi/OpenBLAS/archive/v%{version}/openblas-%{version}.tar.gz # Use system lapack Patch0: openblas-0.2.15-system_lapack.patch # Drop extra p from threaded library name @@ -29,8 +29,6 @@ Patch1: openblas-0.2.5-libname.patch Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.2.19-tests.patch -# Fix crash with multiple instances -Patch4: https://github.com/xianyi/OpenBLAS/pull/1695.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -252,7 +250,6 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests -%patch4 -p1 -b .multiinst # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -688,6 +685,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Thu Aug 02 2018 Susi Lehtola - 0.3.2-1 +- Update to 0.3.2. + * Sun Jul 22 2018 Susi Lehtola - 0.3.1-3 - Fix crash with multiple instances (BZ #1605231). diff --git a/sources b/sources index 845060a..bd91689 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (v0.3.1.tar.gz) = 4c8d50114378518c7c85b92dd52cb649290f3bdc119316c0e236ea56353447d1fcef18afb344d4f125ec8b379b10f9d43ab008316517d897bf73e8f0000ee8f7 +SHA512 (openblas-0.3.2.tar.gz) = 13bc2aae763dadfe3ded33d12eb56fa3a042007047e9e84f6d18ed576394a45590f9f248d72c954c90cd2612e5cb0e1238a8bf83520160467fa720cf89ddb101 From bf5eb9904a45bfb0712f7402fc6694419057216e Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Thu, 2 Aug 2018 14:15:24 +0200 Subject: [PATCH 09/44] Update patch. 
--- openblas-0.3.2-tests.patch | 18 ++++++++++++++++++ openblas.spec | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 openblas-0.3.2-tests.patch diff --git a/openblas-0.3.2-tests.patch b/openblas-0.3.2-tests.patch new file mode 100644 index 0000000..0c75289 --- /dev/null +++ b/openblas-0.3.2-tests.patch @@ -0,0 +1,18 @@ +diff -up OpenBLAS-0.3.2/Makefile.tests OpenBLAS-0.3.2/Makefile +--- OpenBLAS-0.3.2/Makefile.tests 2018-08-02 14:12:01.615117002 +0200 ++++ OpenBLAS-0.3.2/Makefile 2018-08-02 14:13:29.582918971 +0200 +@@ -122,11 +122,11 @@ tests : + ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + touch $(LIBNAME) + ifndef NO_FBLAS +- $(MAKE) -C test all +- $(MAKE) -C utest all ++ $(MAKE) -C test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all ++ $(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + ifndef NO_CBLAS +- $(MAKE) -C ctest all ++ $(MAKE) -C ctest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + endif + diff --git a/openblas.spec b/openblas.spec index ea087b4..004fae6 100644 --- a/openblas.spec +++ b/openblas.spec @@ -28,7 +28,7 @@ Patch1: openblas-0.2.5-libname.patch # Don't use constructor priorities on too old architectures Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile -Patch3: openblas-0.2.19-tests.patch +Patch3: openblas-0.3.2-tests.patch BuildRequires: gcc BuildRequires: gcc-gfortran From 1cd0139cf2fe71404ee5fa0ab6de197567a2c5a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dan=20Hor=C3=A1k?= Date: Wed, 15 Aug 2018 09:21:30 +0200 Subject: [PATCH 10/44] - Explicitly set the target to generic on s390x to avoid surprises (#1615760) --- openblas.spec | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/openblas.spec b/openblas.spec index 004fae6..71390eb 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas 
Version: 0.3.2 -Release: 1%{?dist} +Release: 2%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -387,6 +387,9 @@ TARGET="TARGET=POWER8 DYNAMIC_ARCH=0" %ifarch aarch64 TARGET="TARGET=ARMV8 DYNAMIC_ARCH=0" %endif +%ifarch s390x +TARGET="TARGET=ZARCH_GENERIC DYNAMIC_ARCH=0" +%endif %if 0%{?rhel} == 5 # Gfortran too old to recognize -frecursive @@ -458,7 +461,7 @@ suffix="_power8" suffix="_armv8" %endif %ifarch s390x -suffix="_z13" +suffix="_zarch_generic" %endif slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a @@ -685,6 +688,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Wed Aug 15 2018 Dan Horák - 0.3.2-2 +- Explicitly set the target to generic on s390x to avoid surprises (#1615760) + * Thu Aug 02 2018 Susi Lehtola - 0.3.2-1 - Update to 0.3.2. From 72915d546431250f5e4d52f7fb1316b745de8a2d Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Fri, 24 Aug 2018 18:40:04 +0200 Subject: [PATCH 11/44] Add missing %{optflags}. --- openblas.spec | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 71390eb..91ef48a 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.2 -Release: 2%{?dist} +Release: 3%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -396,6 +396,7 @@ TARGET="TARGET=ZARCH_GENERIC DYNAMIC_ARCH=0" COMMON="%{optflags} -fPIC" FCOMMON="%{optflags} -fPIC" %else +COMMON="%{optflags} -fPIC" FCOMMON="%{optflags} -fPIC -frecursive" %endif # Use Fedora linker flags @@ -688,6 +689,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Thu Aug 02 2018 Susi Lehtola - 0.3.2-3 +- Add missing %%{optflags} to COMMON (see discussion in #1619074). 
+ * Wed Aug 15 2018 Dan Horák - 0.3.2-2 - Explicitly set the target to generic on s390x to avoid surprises (#1615760) From b921c4b4561f9959acd8fdefeea60a1a47f3fff5 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Fri, 24 Aug 2018 18:41:12 +0200 Subject: [PATCH 12/44] Fix date typo. --- openblas.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 91ef48a..7c41111 100644 --- a/openblas.spec +++ b/openblas.spec @@ -689,7 +689,7 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog -* Thu Aug 02 2018 Susi Lehtola - 0.3.2-3 +* Thu Aug 24 2018 Susi Lehtola - 0.3.2-3 - Add missing %%{optflags} to COMMON (see discussion in #1619074). * Wed Aug 15 2018 Dan Horák - 0.3.2-2 From b24abed397662f6af903a6dabae4db9e862b2846 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Fri, 24 Aug 2018 18:41:43 +0200 Subject: [PATCH 13/44] Fix second date typo... --- openblas.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 7c41111..13bfa2a 100644 --- a/openblas.spec +++ b/openblas.spec @@ -689,7 +689,7 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog -* Thu Aug 24 2018 Susi Lehtola - 0.3.2-3 +* Fri Aug 24 2018 Susi Lehtola - 0.3.2-3 - Add missing %%{optflags} to COMMON (see discussion in #1619074). * Wed Aug 15 2018 Dan Horák - 0.3.2-2 From 56e91ff6be1c1a9a6f22941118433cfd7d364df7 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Fri, 24 Aug 2018 18:48:32 +0200 Subject: [PATCH 14/44] Add patch to avoid threading issues. 
--- openblas-0.3.2-threads.patch | 12 ++++++++++++ openblas.spec | 6 ++++++ 2 files changed, 18 insertions(+) create mode 100644 openblas-0.3.2-threads.patch diff --git a/openblas-0.3.2-threads.patch b/openblas-0.3.2-threads.patch new file mode 100644 index 0000000..0fa1d35 --- /dev/null +++ b/openblas-0.3.2-threads.patch @@ -0,0 +1,12 @@ +diff -up OpenBLAS-0.3.2/driver/others/memory.c.threads OpenBLAS-0.3.2/driver/others/memory.c +--- OpenBLAS-0.3.2/driver/others/memory.c.threads 2018-07-30 07:25:01.000000000 +0200 ++++ OpenBLAS-0.3.2/driver/others/memory.c 2018-08-24 18:45:36.826078344 +0200 +@@ -497,7 +497,7 @@ static const int allocation_block_size = + #if defined(SMP) && !defined(USE_OPENMP_UNUSED) + /* This is the number of threads than can be spawned by the server, which is the + server plus the number of threads in the thread pool */ +-# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2 ++# define MAX_ALLOCATING_THREADS 8192 + static int next_memory_table_pos = 0; + # if defined(HAS_COMPILER_TLS) + /* Use compiler generated thread-local-storage */ diff --git a/openblas.spec b/openblas.spec index 13bfa2a..abc2d75 100644 --- a/openblas.spec +++ b/openblas.spec @@ -29,6 +29,8 @@ Patch1: openblas-0.2.5-libname.patch Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.2-tests.patch +# Crude hack to avoid problems with threading code (see https://github.com/xianyi/OpenBLAS/issues/1735) +Patch4: openblas-0.3.2-threads.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -250,6 +252,7 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests +%patch4 -p1 -b .threads # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -689,6 +692,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Fri Aug 24 2018 Susi Lehtola - 0.3.2-4 +- Patch to avoid threading issues. 
+ * Fri Aug 24 2018 Susi Lehtola - 0.3.2-3 - Add missing %%{optflags} to COMMON (see discussion in #1619074). From 1aae703b57ef1fbd909eaecafbf4c212b64b6269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dan=20Hor=C3=A1k?= Date: Wed, 29 Aug 2018 20:02:08 +0200 Subject: [PATCH 15/44] - Fix precision in generic target on s390x --- openblas-0.3.2-zarch.patch | 24 ++++++++++++++++++++++++ openblas.spec | 8 +++++++- 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 openblas-0.3.2-zarch.patch diff --git a/openblas-0.3.2-zarch.patch b/openblas-0.3.2-zarch.patch new file mode 100644 index 0000000..9857dd8 --- /dev/null +++ b/openblas-0.3.2-zarch.patch @@ -0,0 +1,24 @@ +From f3fd44a731c1997b1d79d4d16abc25d78dce88a7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Tue, 28 Aug 2018 21:34:07 +0200 +Subject: [PATCH] Set USE_TRMM for all ZARCH variants to fix TRMM faults with + zarch-generic + +fixes #1743 +--- + kernel/Makefile.L3 | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 +index b37e536ef..9258f216d 100644 +--- a/kernel/Makefile.L3 ++++ b/kernel/Makefile.L3 +@@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8) + USE_TRMM = 1 + endif + +-ifeq ($(CORE), Z13) ++ifeq ($(ARCH), zarch) + USE_TRMM = 1 + endif + diff --git a/openblas.spec b/openblas.spec index abc2d75..874a614 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.2 -Release: 3%{?dist} +Release: 5%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -31,6 +31,8 @@ Patch2: openblas-0.2.15-constructor.patch Patch3: openblas-0.3.2-tests.patch # Crude hack to avoid problems with threading code (see https://github.com/xianyi/OpenBLAS/issues/1735) Patch4: openblas-0.3.2-threads.patch +# https://github.com/xianyi/OpenBLAS/commit/e11126b26ada8d97b4a522e461ca92311653bfc6 +Patch5: openblas-0.3.2-zarch.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -253,6 
+255,7 @@ cd OpenBLAS-%{version} %endif %patch3 -p1 -b .tests %patch4 -p1 -b .threads +%patch5 -p1 -b .zarch # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -692,6 +695,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Wed Aug 29 2018 Dan Horák - 0.3.2-5 +- Fix precision in generic target on s390x + * Fri Aug 24 2018 Susi Lehtola - 0.3.2-4 - Patch to avoid threading issues. From 9fdb102afd3d83d6ea4262d88d4b9487966fccdb Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sun, 9 Sep 2018 10:26:33 +0200 Subject: [PATCH 16/44] Update to 0.3.3. --- openblas-0.3.2-threads.patch | 12 ------------ openblas-0.3.2-zarch.patch | 24 ------------------------ openblas.spec | 13 +++++-------- 3 files changed, 5 insertions(+), 44 deletions(-) delete mode 100644 openblas-0.3.2-threads.patch delete mode 100644 openblas-0.3.2-zarch.patch diff --git a/openblas-0.3.2-threads.patch b/openblas-0.3.2-threads.patch deleted file mode 100644 index 0fa1d35..0000000 --- a/openblas-0.3.2-threads.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff -up OpenBLAS-0.3.2/driver/others/memory.c.threads OpenBLAS-0.3.2/driver/others/memory.c ---- OpenBLAS-0.3.2/driver/others/memory.c.threads 2018-07-30 07:25:01.000000000 +0200 -+++ OpenBLAS-0.3.2/driver/others/memory.c 2018-08-24 18:45:36.826078344 +0200 -@@ -497,7 +497,7 @@ static const int allocation_block_size = - #if defined(SMP) && !defined(USE_OPENMP_UNUSED) - /* This is the number of threads than can be spawned by the server, which is the - server plus the number of threads in the thread pool */ --# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2 -+# define MAX_ALLOCATING_THREADS 8192 - static int next_memory_table_pos = 0; - # if defined(HAS_COMPILER_TLS) - /* Use compiler generated thread-local-storage */ diff --git a/openblas-0.3.2-zarch.patch b/openblas-0.3.2-zarch.patch deleted file mode 100644 index 9857dd8..0000000 --- a/openblas-0.3.2-zarch.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 
f3fd44a731c1997b1d79d4d16abc25d78dce88a7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 28 Aug 2018 21:34:07 +0200 -Subject: [PATCH] Set USE_TRMM for all ZARCH variants to fix TRMM faults with - zarch-generic - -fixes #1743 ---- - kernel/Makefile.L3 | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 -index b37e536ef..9258f216d 100644 ---- a/kernel/Makefile.L3 -+++ b/kernel/Makefile.L3 -@@ -44,7 +44,7 @@ ifeq ($(CORE), POWER8) - USE_TRMM = 1 - endif - --ifeq ($(CORE), Z13) -+ifeq ($(ARCH), zarch) - USE_TRMM = 1 - endif - diff --git a/openblas.spec b/openblas.spec index 874a614..90e2a30 100644 --- a/openblas.spec +++ b/openblas.spec @@ -14,8 +14,8 @@ # "obsoleted" features are still kept in the spec. Name: openblas -Version: 0.3.2 -Release: 5%{?dist} +Version: 0.3.3 +Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -29,10 +29,6 @@ Patch1: openblas-0.2.5-libname.patch Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.2-tests.patch -# Crude hack to avoid problems with threading code (see https://github.com/xianyi/OpenBLAS/issues/1735) -Patch4: openblas-0.3.2-threads.patch -# https://github.com/xianyi/OpenBLAS/commit/e11126b26ada8d97b4a522e461ca92311653bfc6 -Patch5: openblas-0.3.2-zarch.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -254,8 +250,6 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests -%patch4 -p1 -b .threads -%patch5 -p1 -b .zarch # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -695,6 +689,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Sun Sep 09 2018 Susi Lehtola - 0.3.3-1 +- Update to 0.3.3. 
+ * Wed Aug 29 2018 Dan Horák - 0.3.2-5 - Fix precision in generic target on s390x From 7838a94118094a6348386c967852c7d737cfafbf Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sun, 9 Sep 2018 10:33:57 +0200 Subject: [PATCH 17/44] Update to 0.3.3. --- sources | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources b/sources index bd91689..f0b7069 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openblas-0.3.2.tar.gz) = 13bc2aae763dadfe3ded33d12eb56fa3a042007047e9e84f6d18ed576394a45590f9f248d72c954c90cd2612e5cb0e1238a8bf83520160467fa720cf89ddb101 +SHA512 (openblas-0.3.3.tar.gz) = 1c72dbe2b85675f564e777a807d0a8f2ab836abee8223b15ac4eb001c6ca06eeb2db7fa83a66d3f9e8420202b5afca6b6b1acb920e52abb3cec27b6f4629e618 From ebcadc8f7a420cc8d8b1eec6345c7d5ec03280bb Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sat, 29 Sep 2018 19:04:59 +0200 Subject: [PATCH 18/44] Fix segfault. --- openblas-0.3.3-tls.patch | 12 ++++++++++++ openblas.spec | 6 ++++++ 2 files changed, 18 insertions(+) create mode 100644 openblas-0.3.3-tls.patch diff --git a/openblas-0.3.3-tls.patch b/openblas-0.3.3-tls.patch new file mode 100644 index 0000000..38ebefc --- /dev/null +++ b/openblas-0.3.3-tls.patch @@ -0,0 +1,12 @@ +diff -up OpenBLAS-0.3.3/Makefile.rule.tls OpenBLAS-0.3.3/Makefile.rule +--- OpenBLAS-0.3.3/Makefile.rule.tls 2018-08-31 00:07:48.000000000 +0200 ++++ OpenBLAS-0.3.3/Makefile.rule 2018-09-29 19:00:47.804678736 +0200 +@@ -113,7 +113,7 @@ USE_SIMPLE_THREADED_LEVEL3 = 1 + # thread-local storage instead of a central memory buffer in memory.c + # Note that if your system uses GLIBC, it needs to have at least glibc 2.21 + # for this to work. +-USE_TLS = 1 ++#USE_TLS = 1 + + # If you want to drive whole 64bit region by BLAS. Not all Fortran + # compiler supports this. 
It's safe to keep comment it out if you diff --git a/openblas.spec b/openblas.spec index 90e2a30..a4f6107 100644 --- a/openblas.spec +++ b/openblas.spec @@ -29,6 +29,8 @@ Patch1: openblas-0.2.5-libname.patch Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.2-tests.patch +# Fix BZ #1634060 +Patch4: openblas-0.3.3-tls.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -250,6 +252,7 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests +%patch4 -p1 -b .tls # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -689,6 +692,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Sat Sep 29 2018 Susi Lehtola - 0.3.3-2 +- Fix segfault (BZ #1634060). + * Sun Sep 09 2018 Susi Lehtola - 0.3.3-1 - Update to 0.3.3. From 2465d2679387354e02c8c0714274681911fb2786 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sat, 29 Sep 2018 19:05:45 +0200 Subject: [PATCH 19/44] Fix segfault. --- openblas.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index a4f6107..d06a325 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.3 -Release: 1%{?dist} +Release: 2%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD From 5ba251ab16024c878c8e8ad4f1210860b602527f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nikola=20Forr=C3=B3?= Date: Fri, 9 Nov 2018 13:02:51 +0100 Subject: [PATCH 20/44] Fix i686-x86_64 multilib difference. Get rid of executable stack in libRblas.so. 
--- openblas.spec | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index d06a325..554c4a5 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.3 -Release: 2%{?dist} +Release: 3%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -35,6 +35,7 @@ Patch4: openblas-0.3.3-tls.patch BuildRequires: gcc BuildRequires: gcc-gfortran BuildRequires: perl-devel +BuildRequires: multilib-rpm-config # Do we have execstack? %if 0%{?rhel} == 7 @@ -448,6 +449,9 @@ make -C serial USE_THREAD=0 PREFIX=%{buildroot} OPENBLAS_LIBRARY_DIR=%{buildroot cp -a %{_includedir}/lapacke %{buildroot}%{_includedir}/%{name} %endif +# Fix i686-x86_64 multilib difference +%multilib_fix_c_header --file %{_includedir}/openblas/openblas_config.h + # Fix name of libraries %ifarch armv7hl suffix="_armv7" @@ -583,6 +587,9 @@ ln -sf ${pname64_}.so lib%{name}p64_.so.0 for lib in %{buildroot}%{_libdir}/libopenblas*.so; do execstack -c $lib done +for lib in %{buildroot}%{_libdir}/R/lib/libRblas*.so; do + execstack -c $lib +done %endif # Get rid of generated CMake config @@ -692,6 +699,10 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Fri Nov 09 2018 Nikola Forró - 0.3.3-3 +- Fix i686-x86_64 multilib difference. +- Get rid of executable stack in libRblas.so. + * Sat Sep 29 2018 Susi Lehtola - 0.3.3-2 - Fix segfault (BZ #1634060). From 53f8e436b9780c291d1fddc9bf6166a5e009d8db Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 7 Jan 2019 16:09:51 +0100 Subject: [PATCH 21/44] Update to 0.3.5, enabling dynamic CPU detection on aarch64. 
--- openblas-0.3.3-tls.patch | 12 ------------ openblas.spec | 12 ++++++------ sources | 2 +- 3 files changed, 7 insertions(+), 19 deletions(-) delete mode 100644 openblas-0.3.3-tls.patch diff --git a/openblas-0.3.3-tls.patch b/openblas-0.3.3-tls.patch deleted file mode 100644 index 38ebefc..0000000 --- a/openblas-0.3.3-tls.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff -up OpenBLAS-0.3.3/Makefile.rule.tls OpenBLAS-0.3.3/Makefile.rule ---- OpenBLAS-0.3.3/Makefile.rule.tls 2018-08-31 00:07:48.000000000 +0200 -+++ OpenBLAS-0.3.3/Makefile.rule 2018-09-29 19:00:47.804678736 +0200 -@@ -113,7 +113,7 @@ USE_SIMPLE_THREADED_LEVEL3 = 1 - # thread-local storage instead of a central memory buffer in memory.c - # Note that if your system uses GLIBC, it needs to have at least glibc 2.21 - # for this to work. --USE_TLS = 1 -+#USE_TLS = 1 - - # If you want to drive whole 64bit region by BLAS. Not all Fortran - # compiler supports this. It's safe to keep comment it out if you diff --git a/openblas.spec b/openblas.spec index 554c4a5..3029a02 100644 --- a/openblas.spec +++ b/openblas.spec @@ -14,8 +14,8 @@ # "obsoleted" features are still kept in the spec. 
Name: openblas -Version: 0.3.3 -Release: 3%{?dist} +Version: 0.3.5 +Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -29,8 +29,6 @@ Patch1: openblas-0.2.5-libname.patch Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.2-tests.patch -# Fix BZ #1634060 -Patch4: openblas-0.3.3-tls.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -253,7 +251,6 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests -%patch4 -p1 -b .tls # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -389,7 +386,7 @@ TARGET="TARGET=POWER7 DYNAMIC_ARCH=0" TARGET="TARGET=POWER8 DYNAMIC_ARCH=0" %endif %ifarch aarch64 -TARGET="TARGET=ARMV8 DYNAMIC_ARCH=0" +TARGET="TARGET=ARMV8 DYNAMIC_ARCH=1" %endif %ifarch s390x TARGET="TARGET=ZARCH_GENERIC DYNAMIC_ARCH=0" @@ -699,6 +696,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Mon Jan 07 2018 Susi Lehtola - 0.3.5-1 +- Update to 0.3.5, with dynamic CPU detection on aarch64. + * Fri Nov 09 2018 Nikola Forró - 0.3.3-3 - Fix i686-x86_64 multilib difference. - Get rid of executable stack in libRblas.so. 
diff --git a/sources b/sources index f0b7069..e303585 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openblas-0.3.3.tar.gz) = 1c72dbe2b85675f564e777a807d0a8f2ab836abee8223b15ac4eb001c6ca06eeb2db7fa83a66d3f9e8420202b5afca6b6b1acb920e52abb3cec27b6f4629e618 +SHA512 (openblas-0.3.5.tar.gz) = 91b3074eb922453bf843158b4281cde65db9e8bbdd7590e75e9e6cdcb486157f7973f2936f327bb3eb4f1702ce0ba51ae6729d8d4baf2d986c50771e8f696df0 From aa1356a8e5cd5308044264685b8fbde648e51294 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 7 Jan 2019 16:10:20 +0100 Subject: [PATCH 22/44] Fix typo --- openblas.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 3029a02..215d27c 100644 --- a/openblas.spec +++ b/openblas.spec @@ -696,7 +696,7 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog -* Mon Jan 07 2018 Susi Lehtola - 0.3.5-1 +* Mon Jan 07 2019 Susi Lehtola - 0.3.5-1 - Update to 0.3.5, with dynamic CPU detection on aarch64. * Fri Nov 09 2018 Nikola Forró - 0.3.3-3 From c2e6ea0c694bceb3828139d99b991822a7c1daf4 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 7 Jan 2019 16:41:20 +0100 Subject: [PATCH 23/44] No more suffix on aarch64 --- openblas.spec | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 215d27c..7d05589 100644 --- a/openblas.spec +++ b/openblas.spec @@ -450,6 +450,7 @@ cp -a %{_includedir}/lapacke %{buildroot}%{_includedir}/%{name} %multilib_fix_c_header --file %{_includedir}/openblas/openblas_config.h # Fix name of libraries +suffix="" %ifarch armv7hl suffix="_armv7" %endif @@ -463,7 +464,7 @@ suffix="_power7" suffix="_power8" %endif %ifarch aarch64 -suffix="_armv8" +# Runtime CPU detection, no suffix %endif %ifarch s390x suffix="_zarch_generic" From 09825fb09a3abc9f530ac952ff4657fe9702d185 Mon Sep 17 00:00:00 2001 From: Igor Gnatenko Date: Tue, 22 Jan 2019 18:40:34 +0100 Subject: [PATCH 24/44] Remove obsolete ldconfig scriptlets References: 
https://fedoraproject.org/wiki/Changes/RemoveObsoleteScriptlets Signed-off-by: Igor Gnatenko --- openblas.spec | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/openblas.spec b/openblas.spec index 7d05589..f6d790b 100644 --- a/openblas.spec +++ b/openblas.spec @@ -595,33 +595,23 @@ rm -rf %{buildroot}%{_libdir}/cmake # Get rid of generated pkgconfig rm -rf %{buildroot}%{_libdir}/pkgconfig -%post -p /sbin/ldconfig -%postun -p /sbin/ldconfig +%ldconfig_scriptlets -%post openmp -p /sbin/ldconfig -%postun openmp -p /sbin/ldconfig +%ldconfig_scriptlets openmp -%post Rblas -p /sbin/ldconfig -%postun Rblas -p /sbin/ldconfig +%ldconfig_scriptlets Rblas -%post threads -p /sbin/ldconfig -%postun threads -p /sbin/ldconfig +%ldconfig_scriptlets threads %if %build64 -%post openmp64 -p /sbin/ldconfig -%postun openmp64 -p /sbin/ldconfig -%post openmp64_ -p /sbin/ldconfig -%postun openmp64_ -p /sbin/ldconfig +%ldconfig_scriptlets openmp64 +%ldconfig_scriptlets openmp64_ -%post serial64 -p /sbin/ldconfig -%postun serial64 -p /sbin/ldconfig -%post serial64_ -p /sbin/ldconfig -%postun serial64_ -p /sbin/ldconfig +%ldconfig_scriptlets serial64 +%ldconfig_scriptlets serial64_ -%post threads64 -p /sbin/ldconfig -%postun threads64 -p /sbin/ldconfig -%post threads64_ -p /sbin/ldconfig -%postun threads64_ -p /sbin/ldconfig +%ldconfig_scriptlets threads64 +%ldconfig_scriptlets threads64_ %endif %files From 19946032f956bc27cfd0cd0717f3c6656197d602 Mon Sep 17 00:00:00 2001 From: Igor Gnatenko Date: Mon, 28 Jan 2019 20:17:59 +0100 Subject: [PATCH 25/44] Remove obsolete Group tag References: https://fedoraproject.org/wiki/Changes/Remove_Group_Tag --- openblas.spec | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/openblas.spec b/openblas.spec index f6d790b..302aeca 100644 --- a/openblas.spec +++ b/openblas.spec @@ -17,7 +17,6 @@ Name: openblas Version: 0.3.5 Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 
-Group: Development/Libraries License: BSD URL: https://github.com/xianyi/OpenBLAS/ Source0: https://github.com/xianyi/OpenBLAS/archive/v%{version}/openblas-%{version}.tar.gz @@ -97,14 +96,12 @@ Computational Science, ISCAS. http://www.rdcps.ac.cn %package Rblas Summary: A version of OpenBLAS for R to use as libRblas -Group: Development/Libraries %description Rblas %{base_description} %package serial Summary: An optimized BLAS library based on GotoBLAS2, serial version -Group: Development/Libraries Requires: %{name} = %{version}-%{release} %description serial @@ -115,7 +112,6 @@ integer interface. %package openmp Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version -Group: Development/Libraries Requires: %{name} = %{version}-%{release} %description openmp @@ -126,7 +122,6 @@ This package contains the library compiled with OpenMP support with %package threads Summary: An optimized BLAS library based on GotoBLAS2, pthreads version -Group: Development/Libraries Requires: %{name} = %{version}-%{release} %description threads @@ -138,7 +133,6 @@ a 32-bit integer interface. %if %build64 %package serial64 Summary: An optimized BLAS library based on GotoBLAS2, serial version -Group: Development/Libraries Requires: %{name} = %{version}-%{release} %description serial64 @@ -149,7 +143,6 @@ integer interface. 
%package openmp64 Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version -Group: Development/Libraries Requires: %{name} = %{version}-%{release} %description openmp64 @@ -160,7 +153,6 @@ This package contains the library compiled with OpenMP support and %package threads64 Summary: An optimized BLAS library based on GotoBLAS2, pthreads version -Group: Development/Libraries Requires: %{name} = %{version}-%{release} %description threads64 @@ -171,7 +163,6 @@ This package contains the library compiled with threading support and %package serial64_ Summary: An optimized BLAS library based on GotoBLAS2, serial version -Group: Development/Libraries Requires: %{name} = %{version}-%{release} %description serial64_ @@ -182,7 +173,6 @@ integer interface and a symbol name suffix. %package openmp64_ Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version -Group: Development/Libraries Requires: %{name} = %{version}-%{release} %description openmp64_ @@ -193,7 +183,6 @@ This package contains the library compiled with OpenMP support and %package threads64_ Summary: An optimized BLAS library based on GotoBLAS2, pthreads version -Group: Development/Libraries Requires: %{name} = %{version}-%{release} %description threads64_ @@ -206,7 +195,6 @@ This package contains the library compiled with threading support and %package devel Summary: Development headers and libraries for OpenBLAS -Group: Development/Libraries Requires: %{name}%{?_isa} = %{version}-%{release} Requires: %{name}-serial%{?_isa} = %{version}-%{release} Requires: %{name}-openmp%{?_isa} = %{version}-%{release} @@ -228,7 +216,6 @@ This package contains the development headers and libraries. 
%package static Summary: Static version of OpenBLAS -Group: Development/Libraries Requires: %{name}-devel%{?_isa} = %{version}-%{release} %description static From 2529d97e842e4dc325ae3a68ca83d5339375ce95 Mon Sep 17 00:00:00 2001 From: Fedora Release Engineering Date: Fri, 1 Feb 2019 17:18:57 +0000 Subject: [PATCH 26/44] - Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild Signed-off-by: Fedora Release Engineering --- openblas.spec | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 302aeca..495358c 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.5 -Release: 1%{?dist} +Release: 2%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -674,6 +674,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Fri Feb 01 2019 Fedora Release Engineering - 0.3.5-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild + * Mon Jan 07 2019 Susi Lehtola - 0.3.5-1 - Update to 0.3.5, with dynamic CPU detection on aarch64. From 2a8a1574d78ffdd825b263cbebbf7f200db65e57 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sun, 17 Feb 2019 21:36:32 +0100 Subject: [PATCH 27/44] Import patches from upstream to fix gcc 9 compatibility. 
--- 2010.patch | 499 ++++++++++++++++++ 2018.patch | 27 + 2019.patch | 274 ++++++++++ 2021.patch | 255 ++++++++++ 2023.patch | 874 ++++++++++++++++++++++++++++++++ 2024.patch | 1349 +++++++++++++++++++++++++++++++++++++++++++++++++ openblas.spec | 20 +- 7 files changed, 3297 insertions(+), 1 deletion(-) create mode 100644 2010.patch create mode 100644 2018.patch create mode 100644 2019.patch create mode 100644 2021.patch create mode 100644 2023.patch create mode 100644 2024.patch diff --git a/2010.patch b/2010.patch new file mode 100644 index 0000000..2393325 --- /dev/null +++ b/2010.patch @@ -0,0 +1,499 @@ +From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Tue, 12 Feb 2019 15:33:48 +0100 +Subject: [PATCH 1/4] Fix declaration of input arguments in the x86_64 + s/dGEMV_T and s/dGEMV_N kernels + +Arguments 0 and 1 need to be tagged as both input and output +--- + kernel/x86_64/dgemv_n_4.c | 10 +++++----- + kernel/x86_64/dgemv_t_4.c | 18 +++++++++--------- + kernel/x86_64/sgemv_n_4.c | 14 +++++++------- + kernel/x86_64/sgemv_t_4.c | 18 +++++++++--------- + 4 files changed, 30 insertions(+), 30 deletions(-) + +diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c +index 6d2530e81..6d33641e9 100644 +--- a/kernel/x86_64/dgemv_n_4.c ++++ b/kernel/x86_64/dgemv_n_4.c +@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT + "jnz 1b \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 +@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a + "jnz 1b \n\t" + + : ++ "+r" (i), // 0 ++ "+r" (n) // 1 + : +- "r" (i), // 0 +- "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 +diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c +index a7478e3a8..ed672a757 100644 +--- a/kernel/x86_64/dgemv_t_4.c ++++ 
b/kernel/x86_64/dgemv_t_4.c +@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT + "movsd %%xmm11,8(%2) \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 +@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) + "movsd %%xmm10, (%2) \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 +@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d + "jnz 1b \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (&da), // 2 + "r" (src), // 3 + "r" (dest) // 4 +diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c +index 65305ac59..63697970f 100644 +--- a/kernel/x86_64/sgemv_n_4.c ++++ b/kernel/x86_64/sgemv_n_4.c +@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT + "jnz 1b \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 +@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a + + "3: \n\t" + : ++ "+r" (i), // 0 ++ "+r" (n1) // 1 + : +- "r" (i), // 0 +- "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 +@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) + "jnz 1b \n\t" + + : ++ "+r" (i), // 0 ++ "+r" (n) // 1 + : +- "r" (i), // 0 +- "r" (n), // 1 + "r" (src), // 2 + "r" (dest) // 3 + : "cc", +diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c +index 065e5b385..86ecaf516 100644 +--- a/kernel/x86_64/sgemv_t_4.c ++++ b/kernel/x86_64/sgemv_t_4.c +@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT + "movss 
%%xmm11,4(%2) \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 +@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) + "movss %%xmm10, (%2) \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 +@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d + "jnz 1b \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (&da), // 2 + "r" (src), // 3 + "r" (dest) // 4 + +From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Tue, 12 Feb 2019 15:51:43 +0100 +Subject: [PATCH 2/4] Fix declaration of input arguments in inline assembly + +Argument 0 is modified as it doubles as a counter +--- + kernel/x86_64/dscal.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c +index ef9a0a6ba..d0d7801fd 100644 +--- a/kernel/x86_64/dscal.c ++++ b/kernel/x86_64/dscal.c +@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ + "jnz 1b \n\t" + + : ++ "+r" (n) // 0 + : +- "r" (n), // 0 + "r" (x), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + +From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Tue, 12 Feb 2019 16:00:18 +0100 +Subject: [PATCH 3/4] Fix declaration of assembly arguments in SSYMV and DSYMV + microkernels + +Arguments 0 and 1 are both input and output +--- + kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++--- + kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++--- + kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++--- + kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++--- + kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++--- + kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++--- + 
kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++--- + kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++--- + 8 files changed, 24 insertions(+), 24 deletions(-) + +diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +index d7166fe4b..ae287b6d8 100644 +--- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c ++++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c +index d83d20f8e..4778f644a 100644 +--- a/kernel/x86_64/dsymv_U_microk_haswell-2.c ++++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c +@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vzeroupper \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c +index 1344c75f7..065182286 100644 +--- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c ++++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c +@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "movsd %%xmm3 , 24(%9) \n\t" // save temp2 + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c +index 1ef6fbafd..d84e703bd 100644 +--- a/kernel/x86_64/dsymv_U_microk_sandy-2.c ++++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c +@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + 
"vzeroupper \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +index 8c01ab806..4a4f4d68d 100644 +--- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c ++++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c +index a32e59b44..e6a09ccf8 100644 +--- a/kernel/x86_64/ssymv_U_microk_haswell-2.c ++++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c +@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vzeroupper \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c +index b8e6ee732..c56ff3b15 100644 +--- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c ++++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c +@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "movss %%xmm3 , 12(%9) \n\t" // save temp2 + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c +index e8650650c..c4919a39a 100644 +--- a/kernel/x86_64/ssymv_U_microk_sandy-2.c ++++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c +@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vzeroupper 
\n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + +From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Tue, 12 Feb 2019 16:14:02 +0100 +Subject: [PATCH 4/4] Fix declaration of arguments in inline assembly + +Argument 0 is modified so should be input and output +--- + kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++-- + kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++-- + kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++-- + kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++-- + kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++-- + kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++-- + kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++-- + kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++---- + 8 files changed, 18 insertions(+), 18 deletions(-) + +diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +index d84470cc4..bfa07b6d0 100644 +--- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c ++++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c +index 866782ee6..6241879d5 100644 +--- a/kernel/x86_64/dsymv_L_microk_haswell-2.c ++++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c +@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c +index 38479f77a..a161dcd8b 100644 +--- 
a/kernel/x86_64/dsymv_L_microk_nehalem-2.c ++++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c +@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "movsd %%xmm3 , 24(%9) \n\t" // save temp2 + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c +index b4e6ab369..b205b1019 100644 +--- a/kernel/x86_64/dsymv_L_microk_sandy-2.c ++++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c +@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +index 9002228f3..602c3edf2 100644 +--- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c ++++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c +index 69db008b6..fdfe4349a 100644 +--- a/kernel/x86_64/ssymv_L_microk_haswell-2.c ++++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c +@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c +index c0fe5d640..6bb9c02f6 100644 +--- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c ++++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c +@@ -113,8 +113,8 @@ static void 
ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F + "movss %%xmm3 , 12(%9) \n\t" // save temp2 + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c +index 093ca8073..0c78212e7 100644 +--- a/kernel/x86_64/ssymv_L_microk_sandy-2.c ++++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c +@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 diff --git a/2018.patch b/2018.patch new file mode 100644 index 0000000..594a4c4 --- /dev/null +++ b/2018.patch @@ -0,0 +1,27 @@ +From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001 +From: Bart Oldeman +Date: Thu, 14 Feb 2019 16:19:41 +0000 +Subject: [PATCH] dgemv_kernel_4x4(Haswell): add missing clobbers for + xmm0,xmm1,xmm2,xmm3 + +This fixes a crash in dblat2 when OpenBLAS is compiled using +-march=znver1 -ftree-vectorize -O2 + +See also: +https://github.com/easybuilders/easybuild-easyconfigs/issues/7180 +--- + kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c +index 584a6c6b5..da0fa2fff 100644 +--- a/kernel/x86_64/dgemv_n_microk_haswell-4.c ++++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c +@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", ++ "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", diff 
--git a/2019.patch b/2019.patch new file mode 100644 index 0000000..a3aa674 --- /dev/null +++ b/2019.patch @@ -0,0 +1,274 @@ +From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 14 Feb 2019 22:43:18 +0100 +Subject: [PATCH 1/2] Save and restore input argument 8 (lda4) + +Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009) +--- + kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c +index 2c90f8aa9..e89a16785 100644 +--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c ++++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c +@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +- + #define HAVE_KERNEL_4x8 1 + static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +@@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + ++ "movq %8, %%xmm10 \n\t" //save lda ++ + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + +@@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "4: \n\t" + "vzeroupper \n\t" ++ "movq %%xmm10, %8 \n\t" //restore lda + + : + "+r" (i), // 0 +@@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", ++ "%xmm10", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); +@@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + } + + +- + #define HAVE_KERNEL_4x4 1 + static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ 
((noinline)); + +@@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT + + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + ++ + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + + +From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 15 Feb 2019 10:10:04 +0100 +Subject: [PATCH 2/2] Rename operands to put lda on the input/output constraint + list + +--- + kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------ + 1 file changed, 61 insertions(+), 65 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c +index e89a16785..93e1e26e8 100644 +--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c ++++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c +@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + __asm__ __volatile__ + ( + "vzeroupper \n\t" +- "vbroadcastss (%2), %%ymm12 \n\t" // x0 +- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 +- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 +- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 +- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 +- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 +- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 +- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 ++ "vbroadcastss (%3), %%ymm12 \n\t" // x0 ++ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 ++ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 ++ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 ++ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 ++ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 ++ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 ++ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 + + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + +- "movq %8, %%xmm10 \n\t" //save lda +- + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + +- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y ++ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y + "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" + +- "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" 
+- "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" +- "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" +- "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" ++ "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" ++ "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" ++ "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" ++ "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" + +- "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" +- "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" +- "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" +- "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" ++ "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" ++ "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" ++ "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" ++ "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" + +- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y + +- "addq $4 , %8 \n\t" ++ "addq $4 , %2 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + +@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "testq $0x08, %1 \n\t" + "jz 3f \n\t" + +- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y ++ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + +- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" +- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" +- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" +- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" ++ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" + +- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" +- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" +- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" +- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" ++ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 
\n\t" ++ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" ++ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" ++ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + +- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y + +- "addq $8 , %8 \n\t" ++ "addq $8 , %2 \n\t" + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + +@@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" +- "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y +- "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y +- +- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" +- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" +- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" +- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" +- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" +- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" +- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" +- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" +- +- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" ++ "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y ++ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y ++ ++ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" ++ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" ++ "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" ++ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" ++ "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" ++ ++ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" + "addq $16, %0 \n\t" +- "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" +- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" +- "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" +- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" +- "vfmadd231ps 32(%6,%8,4), 
%%ymm2 , %%ymm5 \n\t" +- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" +- "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" ++ "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" ++ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" ++ "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" ++ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" ++ "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" ++ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" ++ "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + +- "addq $16, %8 \n\t" +- "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y ++ "addq $16, %2 \n\t" ++ "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y + "subq $16, %1 \n\t" +- "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y + + "jnz 1b \n\t" + + "4: \n\t" + "vzeroupper \n\t" +- "movq %%xmm10, %8 \n\t" //restore lda + + : + "+r" (i), // 0 +- "+r" (n) // 1 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 + : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (lda4), // 8 ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 7 ++ "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", +@@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", +- "%xmm10", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); diff --git a/2021.patch b/2021.patch new file mode 100644 index 0000000..7724f38 --- /dev/null +++ b/2021.patch @@ -0,0 +1,255 @@ +From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 15 Feb 2019 15:08:16 +0100 +Subject: [PATCH] Fix wrong constraints in inline assembly + +for #2009 +--- + kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++------------- + 1 file changed, 49 insertions(+), 49 
deletions(-) + +diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c +index fcab8e2c7..9ab78fc8e 100644 +--- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c ++++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c +@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " cmpq $0, %0 \n\t" + " je 4f \n\t" + +- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a +- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 +- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 ++ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a ++ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 ++ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 + + + " addq $8, %1 \n\t" +@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .p2align 4 \n\t" + "1: \n\t" + +- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a ++ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a + " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" + +- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 ++ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 + " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" + " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" + + " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" +- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 ++ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 + " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" +@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + " jz 22f \n\t" + +- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a ++ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a + + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" + + " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" +- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 ++ " vmovups (%9,%1,8), %%ymm1 \n\t" // read 
b0 + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" + + " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" +- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 ++ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" + +@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 + + " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" +- " vmovups (%9), %%ymm0 \n\t" ++ " vmovups (%3), %%ymm0 \n\t" + " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" + " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" + " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" +@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" + + " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" +- " vmovups 32(%9), %%ymm4 \n\t" ++ " vmovups 32(%3), %%ymm4 \n\t" + " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" + " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" + " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" +@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "5: \n\t" // i = 0 + +- " addq $64, %9 \n\t" // b=b+8 ++ " addq $64, %3 \n\t" // b=b+8 + + " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb +- " vmovups (%9), %%ymm0 \n\t" +- " vmovups %%ymm8 , (%8) \n\t" // write a ++ " vmovups (%3), %%ymm0 \n\t" ++ " vmovups %%ymm8 , (%2) \n\t" // write a + " vmovups %%ymm8 , (%4) \n\t" // write c + + " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" +- " vmovups 32(%9), %%ymm1 \n\t" ++ " vmovups 32(%3), %%ymm1 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" + " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" +@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" + 
+- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + + + " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb +- " vmovups (%9), %%ymm0 \n\t" +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm9 , (%8) \n\t" // write a ++ " vmovups (%3), %%ymm0 \n\t" ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm9 , (%2) \n\t" // write a + " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c + + " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" +@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb +- " vmovups (%9), %%ymm0 \n\t" +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm10, (%8) \n\t" // write a ++ " vmovups (%3), %%ymm0 \n\t" ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm10, (%2) \n\t" // write a + " vmovups %%ymm10, (%4,%7,2) \n\t" // write c + + " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" +@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" + + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + + + " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm11, (%8) \n\t" // write a ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm11, (%2) \n\t" // write a + " vmovups %%ymm11, (%5) \n\t" // write c + + " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" +@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" + + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // 
a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + + " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm12, (%8) \n\t" // write a ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm12, (%2) \n\t" // write a + " vmovups %%ymm12, (%5,%7,1) \n\t" // write c + + " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" +@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm13, (%8) \n\t" // write a ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm13, (%2) \n\t" // write a + " vmovups %%ymm13, (%5,%7,2) \n\t" // write c + + " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" +@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" + + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + + " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm14, (%8) \n\t" // write a ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm14, (%2) \n\t" // write a + " vmovups %%ymm14, (%6) \n\t" // write c + + " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" + + " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" + +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $32, %2 \n\t" // a=a+8 + + " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb +- " vmovups %%ymm15, (%8) \n\t" // write a ++ " vmovups %%ymm15, (%2) \n\t" // write a + " vmovups %%ymm15, (%6,%7,1) \n\t" // write c + + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) 
// 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 + "r" (c), // 4 + "r" (c3), // 5 + "r" (c6), // 6 + "r" (ldc), // 7 +- "r" (as), // 8 +- "r" (bs) // 9 ++ "r" (a), // 8 ++ "r" (b) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/2023.patch b/2023.patch new file mode 100644 index 0000000..225a8a2 --- /dev/null +++ b/2023.patch @@ -0,0 +1,874 @@ +From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sat, 16 Feb 2019 18:24:11 +0100 +Subject: [PATCH 1/4] Fix inline assembly constraints + +rework indices to allow marking argument lda4 as input and output. For #2009 +--- + kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------ + 1 file changed, 27 insertions(+), 27 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c +index 11a3e943b..d21232bfa 100644 +--- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c ++++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c +@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + __asm__ __volatile__ + ( +- "movss (%2), %%xmm12 \n\t" // x0 +- "movss 4(%2), %%xmm13 \n\t" // x1 +- "movss 8(%2), %%xmm14 \n\t" // x2 +- "movss 12(%2), %%xmm15 \n\t" // x3 ++ "movss (%3), %%xmm12 \n\t" // x0 ++ "movss 4(%3), %%xmm13 \n\t" // x1 ++ "movss 8(%3), %%xmm14 \n\t" // x2 ++ "movss 12(%3), %%xmm15 \n\t" // x3 + "shufps $0, %%xmm12, %%xmm12\n\t" + "shufps $0, %%xmm13, %%xmm13\n\t" + "shufps $0, %%xmm14, %%xmm14\n\t" + "shufps $0, %%xmm15, %%xmm15\n\t" + +- "movss 16(%2), %%xmm0 \n\t" // x4 +- "movss 20(%2), %%xmm1 \n\t" // x5 +- "movss 24(%2), %%xmm2 \n\t" // x6 +- "movss 28(%2), %%xmm3 \n\t" // x7 ++ "movss 16(%3), %%xmm0 \n\t" // x4 ++ "movss 20(%3), %%xmm1 \n\t" // x5 ++ "movss 24(%3), %%xmm2 \n\t" // x6 ++ "movss 28(%3), %%xmm3 \n\t" // x7 + "shufps $0, %%xmm0 , %%xmm0 \n\t" + "shufps $0, %%xmm1 , %%xmm1 \n\t" 
+ "shufps $0, %%xmm2 , %%xmm2 \n\t" +@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "1: \n\t" + "xorps %%xmm4 , %%xmm4 \n\t" + "xorps %%xmm5 , %%xmm5 \n\t" +- "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y ++ "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y + + ".p2align 1 \n\t" +- "movups (%4,%0,4), %%xmm8 \n\t" +- "movups (%5,%0,4), %%xmm9 \n\t" +- "movups (%6,%0,4), %%xmm10 \n\t" +- "movups (%7,%0,4), %%xmm11 \n\t" ++ "movups (%5,%0,4), %%xmm8 \n\t" ++ "movups (%6,%0,4), %%xmm9 \n\t" ++ "movups (%7,%0,4), %%xmm10 \n\t" ++ "movups (%8,%0,4), %%xmm11 \n\t" + ".p2align 1 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" +@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "addps %%xmm10, %%xmm4 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + +- "movups (%4,%8,4), %%xmm8 \n\t" +- "movups (%5,%8,4), %%xmm9 \n\t" +- "movups (%6,%8,4), %%xmm10 \n\t" +- "movups (%7,%8,4), %%xmm11 \n\t" ++ "movups (%5,%2,4), %%xmm8 \n\t" ++ "movups (%6,%2,4), %%xmm9 \n\t" ++ "movups (%7,%2,4), %%xmm10 \n\t" ++ "movups (%8,%2,4), %%xmm11 \n\t" + ".p2align 1 \n\t" + "mulps %%xmm0 , %%xmm8 \n\t" + "mulps %%xmm1 , %%xmm9 \n\t" +@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "addps %%xmm10, %%xmm4 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + +- "addq $4 , %8 \n\t" ++ "addq $4 , %2 \n\t" + "addps %%xmm5 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "mulps %%xmm6 , %%xmm4 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm4 , %%xmm7 \n\t" + +- "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y ++ "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y + + "jnz 1b \n\t" + + : + "+r" (i), // 0 +- "+r" (n) // 1 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 + : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (lda4), // 8 ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 
7 ++ "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + +From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sat, 16 Feb 2019 18:36:39 +0100 +Subject: [PATCH 2/4] Fix inline assembly constraints + +rework indices to allow marking argument lda as input and output. +--- + kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++------------- + 1 file changed, 65 insertions(+), 65 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c +index b35daa35b..3fc46542b 100644 +--- a/kernel/x86_64/sgemv_n_microk_sandy-4.c ++++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c +@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + __asm__ __volatile__ + ( + "vzeroupper \n\t" +- "vbroadcastss (%2), %%ymm12 \n\t" // x0 +- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 +- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 +- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 +- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 +- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 +- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 +- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 ++ "vbroadcastss (%3), %%ymm12 \n\t" // x0 ++ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 ++ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 ++ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 ++ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 ++ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 ++ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 ++ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 + + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + +@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" +- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y ++ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y + +- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" +- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" +- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" +- 
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" ++ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" ++ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" ++ "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" ++ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" + +- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" +- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" +- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" +- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" ++ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" ++ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" ++ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" ++ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" +@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + +- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y + +- "addq $4, %8 \n\t" ++ "addq $4, %2 \n\t" + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + +@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" +- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y ++ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y + +- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" +- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" +- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" +- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" ++ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" ++ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" ++ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" ++ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm5, 
%%ymm11, %%ymm5 \n\t" + +- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" +- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" +- "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" +- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" ++ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" ++ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" ++ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" ++ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" +@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + +- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y + +- "addq $8, %8 \n\t" ++ "addq $8, %2 \n\t" + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + +@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + +- "prefetcht0 192(%4,%0,4) \n\t" +- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" +- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" +- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" +- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" ++ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" ++ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" ++ "prefetcht0 192(%6,%0,4) \n\t" ++ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" ++ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + +- "prefetcht0 192(%6,%0,4) \n\t" +- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" +- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" +- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" +- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" ++ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" ++ "vmulps 32(%7,%0,4), 
%%ymm14, %%ymm9 \n\t" ++ "prefetcht0 192(%8,%0,4) \n\t" ++ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" ++ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + +- "prefetcht0 192(%4,%8,4) \n\t" +- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" +- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" +- "prefetcht0 192(%5,%8,4) \n\t" +- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" +- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" ++ "prefetcht0 192(%5,%2,4) \n\t" ++ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" ++ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" ++ "prefetcht0 192(%6,%2,4) \n\t" ++ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" ++ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + +- "prefetcht0 192(%6,%8,4) \n\t" +- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" +- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" +- "prefetcht0 192(%7,%8,4) \n\t" +- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" +- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" ++ "prefetcht0 192(%7,%2,4) \n\t" ++ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" ++ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" ++ "prefetcht0 192(%8,%2,4) \n\t" ++ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" ++ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" +@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + +- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y +- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y ++ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y ++ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + +- 
"vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y +- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y + +- "addq $16, %8 \n\t" ++ "addq $16, %2 \n\t" + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" +@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + : + "+r" (i), // 0 +- "+r" (n) // 1 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 + : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (lda4), // 8 ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 7 ++ "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + +From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sat, 16 Feb 2019 18:46:17 +0100 +Subject: [PATCH 3/4] Fix inline assembly constraints + +--- + kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++----------- + 1 file changed, 97 insertions(+), 97 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +index 31001c7f3..bbf06c84b 100644 +--- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c ++++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + __asm__ __volatile__ + ( +- "vbroadcastss (%2), %%xmm12 \n\t" // x0 +- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 +- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 +- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 +- "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 +- "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 +- "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 +- "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 ++ "vbroadcastss (%3), %%xmm12 \n\t" // x0 ++ "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 ++ "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 ++ "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 
++ "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 ++ "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 ++ "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 ++ "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 + + "vbroadcastss (%9), %%xmm8 \n\t" // alpha + +@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + +- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" +- "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" +- "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" + "addq $4 , %0 \n\t" + +- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" +- "addq $4 , %8 \n\t" ++ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" ++ "addq $4 , %2 \n\t" + + "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" +- "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" ++ "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" + "subq $4 , %1 \n\t" +- "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y + + "2: \n\t" + +@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + +- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 
16(%5,%0,4), %%xmm13, %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" +- +- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" ++ ++ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" + +- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" +- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" +- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y +- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y ++ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" ++ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" ++ "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y + + "addq $8 , %0 
\n\t" +- "addq $8 , %8 \n\t" ++ "addq $8 , %2 \n\t" + "subq $8 , %1 \n\t" + + +@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" + +- "prefetcht0 192(%4,%0,4) \n\t" +- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" +- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" +- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" +- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" ++ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" ++ "prefetcht0 192(%8,%0,4) \n\t" ++ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" + ".align 2 \n\t" +- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" +- +- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" +- +- "prefetcht0 192(%4,%8,4) \n\t" +- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" +- "prefetcht0 192(%5,%8,4) \n\t" +- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 
, %%xmm5 \n\t" +- "prefetcht0 192(%6,%8,4) \n\t" +- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" +- "prefetcht0 192(%7,%8,4) \n\t" +- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" ++ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" ++ ++ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" ++ ++ "prefetcht0 192(%5,%2,4) \n\t" ++ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" ++ "prefetcht0 192(%6,%2,4) \n\t" ++ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" ++ "prefetcht0 192(%7,%2,4) \n\t" ++ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" ++ "prefetcht0 192(%8,%2,4) \n\t" ++ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" + +- "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , 
%%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" + +- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" +- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" +- "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" +- "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" ++ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" ++ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" ++ "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" ++ "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" + + "addq $16, %0 \n\t" +- "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y +- "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y +- "addq $16, %8 \n\t" +- "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y +- "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y ++ "addq $16, %2 \n\t" ++ "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y + + "subq $16, %1 \n\t" + "jnz 1b \n\t" +@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + : + "+r" (i), // 0 +- "+r" (n) // 1 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 + : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (lda4), // 8 ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 7 ++ "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + +From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sat, 16 Feb 2019 18:51:09 +0100 +Subject: [PATCH 4/4] Fix inline assembly constraints + +--- + dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++ + 1 file changed, 247 
insertions(+) + create mode 100644 dgemv_n_microk_piledriver-4.c + +diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c +new file mode 100644 +index 000000000..466931b82 +--- /dev/null ++++ b/dgemv_n_microk_piledriver-4.c +@@ -0,0 +1,247 @@ ++/*************************************************************************** ++Copyright (c) 2014, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/
++
++/* dgemv_kernel_4x8: y[i] += alpha[0] * sum(x[j] * column_j[i]) for j = 0..7, i in [0, n). */
++/* Columns 0-3 are ap[0..3]; columns 4-7 are read at ap[0..3] + lda4 elements.           */
++#define HAVE_KERNEL_4x8 1
++static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
++
++static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
++{
++	// i, n and lda4 are modified by the asm below, hence their "+r" constraints.
++	BLASLONG register i = 0;
++
++	__asm__  __volatile__
++	(
++	"vzeroupper                              \n\t"
++	"vbroadcastsd    (%3), %%ymm12           \n\t"	// x0
++	"vbroadcastsd   8(%3), %%ymm13           \n\t"	// x1
++	"vbroadcastsd  16(%3), %%ymm14           \n\t"	// x2
++	"vbroadcastsd  24(%3), %%ymm15           \n\t"	// x3
++	"vbroadcastsd  32(%3), %%ymm0            \n\t"	// x4
++	"vbroadcastsd  40(%3), %%ymm1            \n\t"	// x5
++	"vbroadcastsd  48(%3), %%ymm2            \n\t"	// x6
++	"vbroadcastsd  56(%3), %%ymm3            \n\t"	// x7
++
++	"vbroadcastsd    (%9), %%ymm6            \n\t"	// alpha
++	// Handle 4 rows up front when bit 2 of n is set.
++	"testq          $0x04, %1                \n\t"
++	"jz             2f                       \n\t"
++
++	"vmovupd        (%4,%0,8), %%ymm7        \n\t"	// 4 * y
++	"vxorpd         %%ymm4 , %%ymm4, %%ymm4  \n\t"
++	"vxorpd         %%ymm5 , %%ymm5, %%ymm5  \n\t"
++
++	"vfmadd231pd   (%5,%0,8), %%ymm12, %%ymm4 \n\t"
++	"vfmadd231pd   (%6,%0,8), %%ymm13, %%ymm5 \n\t"
++	"vfmadd231pd   (%7,%0,8), %%ymm14, %%ymm4 \n\t"
++	"vfmadd231pd   (%8,%0,8), %%ymm15, %%ymm5 \n\t"
++	// Columns 4-7: same base pointers, indexed by lda4 (%2).
++	"vfmadd231pd   (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
++	"vfmadd231pd   (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
++	"vfmadd231pd   (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
++	"vfmadd231pd   (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
++
++	"vaddpd         %%ymm4 , %%ymm5 , %%ymm5 \n\t"
++	"vmulpd         %%ymm6 , %%ymm5 , %%ymm5 \n\t"
++	"vaddpd         %%ymm7 , %%ymm5 , %%ymm5 \n\t"
++
++
++	"vmovupd        %%ymm5, (%4,%0,8)        \n\t"	// 4 * y
++
++	"addq           $4 , %2                  \n\t"
++	"addq           $4 , %0                  \n\t"
++	"subq           $4 , %1                  \n\t"
++
++	"2:                                      \n\t"
++
++	"cmpq           $0, %1                   \n\t"
++	"je             3f                       \n\t"
++	// Main loop: 8 rows per iteration; it exits only when n (%1) reaches exactly 0.
++	// NOTE(review): n is therefore presumably a multiple of 4 -- confirm against callers.
++	".align 16                               \n\t"
++	"1:                                      \n\t"
++
++	"vxorpd         %%ymm4 , %%ymm4, %%ymm4  \n\t"
++	"vxorpd         %%ymm5 , %%ymm5, %%ymm5  \n\t"
++	"vmovupd          (%4,%0,8), %%ymm8      \n\t"	// 4 * y
++	"vmovupd        32(%4,%0,8), %%ymm9      \n\t"	// 4 * y
++
++	"vfmadd231pd     (%5,%0,8), %%ymm12, %%ymm4 \n\t"
++	"vfmadd231pd   32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
++	"vfmadd231pd     (%6,%0,8), %%ymm13, %%ymm4 \n\t"
++	"vfmadd231pd   32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
++	"vfmadd231pd     (%7,%0,8), %%ymm14, %%ymm4 \n\t"
++	"vfmadd231pd   32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
++	"vfmadd231pd     (%8,%0,8), %%ymm15, %%ymm4 \n\t"
++	"vfmadd231pd   32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
++	// Columns 4-7 via %2; %0 is advanced early, so the stores below use negative offsets.
++	"vfmadd231pd     (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
++	"addq           $8 , %0                  \n\t"
++	"vfmadd231pd   32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
++	"vfmadd231pd     (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
++	"vfmadd231pd   32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
++	"vfmadd231pd     (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
++	"vfmadd231pd   32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
++	"vfmadd231pd     (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
++	"vfmadd231pd   32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
++
++	"vfmadd231pd    %%ymm6 , %%ymm4 , %%ymm8 \n\t"
++	"vfmadd231pd    %%ymm6 , %%ymm5 , %%ymm9 \n\t"
++
++	"addq           $8 , %2                  \n\t"
++	"vmovupd        %%ymm8,-64(%4,%0,8)      \n\t"	// 4 * y (y is operand %4; %3 is x)
++	"subq           $8 , %1                  \n\t"
++	"vmovupd        %%ymm9,-32(%4,%0,8)      \n\t"	// 4 * y
++
++	"jnz            1b                       \n\t"
++
++	"3:                                      \n\t"
++	"vzeroupper                              \n\t"
++
++	:
++	  "+r" (i),	// 0
++	  "+r" (n),	// 1
++	  "+r" (lda4)	// 2
++	:
++	  "r" (x),	// 3
++	  "r" (y),	// 4
++	  "r" (ap[0]),	// 5
++	  "r" (ap[1]),	// 6
++	  "r" (ap[2]),	// 7
++	  "r" (ap[3]),	// 8
++	  "r" (alpha)	// 9
++	: "cc",
++	  "%xmm0", "%xmm1",
++	  "%xmm2", "%xmm3",
++	  "%xmm4", "%xmm5",
++	  "%xmm6", "%xmm7",
++	  "%xmm8", "%xmm9",
++	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
++	  "memory"
++	);
++
++}
++
++/* dgemv_kernel_4x4: y[i] += alpha[0] * sum(x[j] * ap[j][i]) for j = 0..3, i in [0, n). */
++/* Same 4-row head / 8-row loop structure as above, without the lda4-offset columns.    */
++#define HAVE_KERNEL_4x4 1
++static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
++
++static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
++{
++	// i and n are modified by the asm below, hence their "+r" constraints.
++	BLASLONG register i = 0;
++
++	__asm__  __volatile__
++	(
++	"vzeroupper                              \n\t"
++	"vbroadcastsd    (%2), %%ymm12           \n\t"	// x0
++	"vbroadcastsd   8(%2), %%ymm13           \n\t"	// x1
++	"vbroadcastsd  16(%2), %%ymm14           \n\t"	// x2
++	"vbroadcastsd  24(%2), %%ymm15           \n\t"	// x3
++
++	"vbroadcastsd    (%8), %%ymm6            \n\t"	// alpha
++	// Handle 4 rows up front when bit 2 of n is set.
++	"testq          $0x04, %1                \n\t"
++	"jz             2f                       \n\t"
++
++	"vxorpd         %%ymm4 , %%ymm4, %%ymm4  \n\t"
++	"vxorpd         %%ymm5 , %%ymm5, %%ymm5  \n\t"
++	"vmovupd        (%3,%0,8), %%ymm7        \n\t"	// 4 * y
++
++	"vfmadd231pd   (%4,%0,8), %%ymm12, %%ymm4 \n\t"
++	"vfmadd231pd   (%5,%0,8), %%ymm13, %%ymm5 \n\t"
++	"vfmadd231pd   (%6,%0,8), %%ymm14, %%ymm4 \n\t"
++	"vfmadd231pd   (%7,%0,8), %%ymm15, %%ymm5 \n\t"
++
++	"vaddpd         %%ymm4 , %%ymm5 , %%ymm5 \n\t"
++	"vmulpd         %%ymm6 , %%ymm5 , %%ymm5 \n\t"
++	"vaddpd         %%ymm7 , %%ymm5 , %%ymm5 \n\t"
++
++	"vmovupd        %%ymm5, (%3,%0,8)        \n\t"	// 4 * y
++
++	"addq           $4 , %0                  \n\t"
++	"subq           $4 , %1                  \n\t"
++
++	"2:                                      \n\t"
++
++	"cmpq           $0, %1                   \n\t"
++	"je             3f                       \n\t"
++	// Main loop: 8 rows per iteration; it exits only when n (%1) reaches exactly 0.
++	// NOTE(review): n is therefore presumably a multiple of 4 -- confirm against callers.
++	".align 16                               \n\t"
++	"1:                                      \n\t"
++	"vxorpd         %%ymm4 , %%ymm4, %%ymm4  \n\t"
++	"vxorpd         %%ymm5 , %%ymm5, %%ymm5  \n\t"
++	"vmovupd          (%3,%0,8), %%ymm8      \n\t"	// 4 * y
++	"vmovupd        32(%3,%0,8), %%ymm9      \n\t"	// 4 * y
++
++	"vfmadd231pd     (%4,%0,8), %%ymm12, %%ymm4 \n\t"
++	"vfmadd231pd   32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
++	"vfmadd231pd     (%5,%0,8), %%ymm13, %%ymm4 \n\t"
++	"vfmadd231pd   32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
++	"vfmadd231pd     (%6,%0,8), %%ymm14, %%ymm4 \n\t"
++	"vfmadd231pd   32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
++	"vfmadd231pd     (%7,%0,8), %%ymm15, %%ymm4 \n\t"
++	"vfmadd231pd   32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
++
++	"vfmadd231pd    %%ymm6 , %%ymm4 , %%ymm8 \n\t"
++	"vfmadd231pd    %%ymm6 , %%ymm5 , %%ymm9 \n\t"
++
++	"vmovupd        %%ymm8,   (%3,%0,8)      \n\t"	// 4 * y
++	"vmovupd        %%ymm9, 32(%3,%0,8)      \n\t"	// 4 * y
++
++	"addq           $8 , %0                  \n\t"
++	"subq           $8 , %1                  \n\t"
++	"jnz            1b                       \n\t"
++
++	"3:                                      \n\t"
++	"vzeroupper                              \n\t"
++
++	:
++	  "+r" (i),	// 0
++	  "+r" (n)	// 1
++	:
++	  "r" (x),	// 2
++	  "r" (y),	// 3
++	  "r" (ap[0]),	// 4
++	  "r" (ap[1]),	// 5
++	  "r" (ap[2]),	// 6
++	  "r" (ap[3]),	// 7
++	  "r" (alpha)	// 8
++	: "cc",
++	  "%xmm4", "%xmm5",
++	  "%xmm6", "%xmm7",
++	  "%xmm8", "%xmm9",
++	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
++	  "memory"
++	);
++
++}
++
++
diff --git a/2024.patch b/2024.patch new file mode 100644 index
0000000..720a9e2 --- /dev/null +++ b/2024.patch @@ -0,0 +1,1349 @@ +From f9bb76d29af48f448a8ab2bdfffc962d9623a3df Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sat, 16 Feb 2019 20:06:48 +0100 +Subject: [PATCH] Fix inline assembly constraints in Bulldozer TRSM kernels + +rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). For #2009 +--- + kernel/x86_64/dtrsm_kernel_RT_bulldozer.c | 96 ++++---- + kernel/x86_64/strsm_kernel_LN_bulldozer.c | 252 ++++++++++----------- + kernel/x86_64/strsm_kernel_LT_bulldozer.c | 256 +++++++++++----------- + kernel/x86_64/strsm_kernel_RN_bulldozer.c | 54 ++--- + kernel/x86_64/strsm_kernel_RT_bulldozer.c | 54 ++--- + 5 files changed, 356 insertions(+), 356 deletions(-) + +diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c +index 54df5b359..35ed4cc01 100644 +--- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c ++++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c +@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" + "1: \n\t" + +- " prefetcht0 384(%2,%1,8) \n\t" +- " prefetcht0 384(%3,%1,8) \n\t" +- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " prefetcht0 384(%6,%1,8) \n\t" ++ " prefetcht0 384(%7,%1,8) \n\t" ++ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + " jz 2f 
\n\t" + +- " prefetcht0 384(%2,%1,8) \n\t" +- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " prefetcht0 384(%6,%1,8) \n\t" ++ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + " jz 2f \n\t" + +- " prefetcht0 384(%2,%1,8) \n\t" +- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " prefetcht0 384(%6,%1,8) \n\t" ++ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + " jz 2f \n\t" + +- " prefetcht0 384(%2,%1,8) \n\t" +- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b +- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " prefetcht0 384(%6,%1,8) \n\t" ++ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b ++ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" ++ " vmovups (%6,%1,8), 
%%xmm4 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" // i = 1 + +- " vmovddup (%7), %%xmm1 \n\t" // read b +- " vmovddup 8(%7), %%xmm0 \n\t" // read bb ++ " vmovddup (%3), %%xmm1 \n\t" // read b ++ " vmovddup 8(%3), %%xmm0 \n\t" // read bb + + " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb + " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb + " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb + " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb + +- " vmovups %%xmm12 , (%6) \n\t" // write a +- " vmovups %%xmm13 , 16(%6) \n\t" // write a +- " vmovups %%xmm14 , 32(%6) \n\t" // write a +- " vmovups %%xmm15 , 48(%6) \n\t" // write a ++ " vmovups %%xmm12 , (%2) \n\t" // write a ++ " vmovups %%xmm13 , 16(%2) \n\t" // write a ++ " vmovups %%xmm14 , 32(%2) \n\t" // write a ++ " vmovups %%xmm15 , 48(%2) \n\t" // write a + + " vmovups %%xmm12 , (%5) \n\t" // write c1 + " vmovups %%xmm13 , 16(%5) \n\t" +@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" + + " \n\t" // i = 0 +- " subq $16 , %7 \n\t" // b = b - 2 +- " subq $64 , %6 \n\t" // a = a - 8 ++ " subq $16 , %3 \n\t" // b = b - 2 ++ " subq $64 , %2 \n\t" // a = a - 8 + +- " vmovddup (%7), %%xmm0 \n\t" // read bb ++ " vmovddup (%3), %%xmm0 \n\t" // read bb + + " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb + " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" + " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" + " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" + +- " vmovups %%xmm8 , (%6) \n\t" // write a +- " vmovups %%xmm9 , 16(%6) \n\t" +- " vmovups %%xmm10 , 32(%6) \n\t" +- " vmovups %%xmm11 , 48(%6) \n\t" ++ " vmovups 
%%xmm8 , (%2) \n\t" // write a ++ " vmovups %%xmm9 , 16(%2) \n\t" ++ " vmovups %%xmm10 , 32(%2) \n\t" ++ " vmovups %%xmm11 , 48(%2) \n\t" + + " vmovups %%xmm8 , (%4) \n\t" // write c0 + " vmovups %%xmm9 , 16(%4) \n\t" +@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 +- "r" (as), // 6 +- "r" (bs) // 7 ++ "r" (a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", +diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c +index 1b8991c6c..3cd215000 100644 +--- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c ++++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c +@@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" + "1: \n\t" + +- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" + +- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] ++ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] + " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm15 , 
%%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] ++ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] + " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss 
%%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] ++ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] + " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read 
a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] ++ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] + " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" 
// b -= n + +- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] ++ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] + " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] ++ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] + " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // 
b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] ++ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] + " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + 
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] ++ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] + " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] ++ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] + " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 
* aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] ++ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] + " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , 
%%xmm13 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] ++ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] + " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] ++ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] + " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * 
aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] ++ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] + " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] ++ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] + " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss 
%%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] ++ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] + " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] ++ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] + " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) 
\n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 +- "r" (as), // 6 +- "r" (bs) // 7 ++ "r" (a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", +diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c +index 0623dddb0..a4a62491c 100644 +--- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c ++++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c +@@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" + "1: \n\t" + +- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" + +- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] ++ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] + " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa + 
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] ++ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] + " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , 
%%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] ++ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] + " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -241,22 +241,22 @@ static void 
strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] ++ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] + " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" +@@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] ++ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] + " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " 
vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" +@@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] ++ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] + " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 16(%6) , %%xmm5 
\n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" +@@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] ++ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] + " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" +@@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, 
FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] ++ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] + " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] ++ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] + " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" 
// b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] ++ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] + " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 40(%6) , %%xmm0 
\n\t" // i=10, read aa[i] ++ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] + " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] ++ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] + " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , 
%%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] ++ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] + " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] ++ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] + " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 48(%2) 
, %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] ++ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] + " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] ++ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] + " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + + " vzeroupper 
\n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 +- "r" (c), // 4 +- "r" (c1), // 5 +- "r" (as), // 6 +- "r" (bs) // 7 ++ "r" (c), // 4 ++ "r" (c1), // 5 ++ "r" (a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", +diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c +index 4cc557d55..c11c84cec 100644 +--- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c ++++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c +@@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" + "1: \n\t" + +- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" // i = 0 + +- " vbroadcastss (%7), %%xmm0 \n\t" // read bb +- " vbroadcastss 4(%7), %%xmm1 \n\t" // read b ++ " vbroadcastss (%3), %%xmm0 \n\t" // read bb ++ " vbroadcastss 4(%3), %%xmm1 \n\t" // read b + + " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb + " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" + " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" + " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" + +- " vmovups %%xmm8 , (%6) \n\t" // write a +- " vmovups %%xmm9 , 16(%6) \n\t" +- " vmovups %%xmm10 , 32(%6) \n\t" +- " vmovups 
%%xmm11 , 48(%6) \n\t" ++ " vmovups %%xmm8 , (%2) \n\t" // write a ++ " vmovups %%xmm9 , 16(%2) \n\t" ++ " vmovups %%xmm10 , 32(%2) \n\t" ++ " vmovups %%xmm11 , 48(%2) \n\t" + + " vmovups %%xmm8 , (%4) \n\t" // write c0 + " vmovups %%xmm9 , 16(%4) \n\t" +@@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" + + " \n\t" // i = 1 +- " addq $8 , %7 \n\t" // b = b + 2 +- " addq $64 , %6 \n\t" // a = a + 16 ++ " addq $8 , %3 \n\t" // b = b + 2 ++ " addq $64 , %2 \n\t" // a = a + 16 + +- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb ++ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb + + " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb + " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb + " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb + " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb + +- " vmovups %%xmm12 , (%6) \n\t" // write a +- " vmovups %%xmm13 , 16(%6) \n\t" // write a +- " vmovups %%xmm14 , 32(%6) \n\t" // write a +- " vmovups %%xmm15 , 48(%6) \n\t" // write a ++ " vmovups %%xmm12 , (%2) \n\t" // write a ++ " vmovups %%xmm13 , 16(%2) \n\t" // write a ++ " vmovups %%xmm14 , 32(%2) \n\t" // write a ++ " vmovups %%xmm15 , 48(%2) \n\t" // write a + + " vmovups %%xmm12 , (%5) \n\t" // write c1 + " vmovups %%xmm13 , 16(%5) \n\t" +@@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 +- "r" (c), // 4 +- "r" (c1), // 5 +- "r" (as), // 6 +- "r" (bs) // 7 ++ "r" (c), // 4 ++ "r" (c1), // 5 ++ "r" (a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", +diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c +index 73f6e8a95..326ca2976 
100644 +--- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c ++++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c +@@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" + "1: \n\t" + +- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" // i = 1 + +- " vbroadcastss (%7), %%xmm1 \n\t" // read b +- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb ++ " vbroadcastss (%3), %%xmm1 \n\t" // read b ++ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb + + " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb + " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb + " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb + " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb + +- " vmovups %%xmm12 , (%6) \n\t" // write a +- " vmovups %%xmm13 , 16(%6) \n\t" // write a +- " vmovups %%xmm14 , 32(%6) \n\t" // write a +- " vmovups %%xmm15 , 48(%6) \n\t" // write a ++ " vmovups %%xmm12 , (%2) \n\t" // write a ++ " vmovups %%xmm13 , 16(%2) \n\t" // write a ++ " vmovups %%xmm14 , 32(%2) \n\t" // write a ++ " vmovups %%xmm15 , 48(%2) \n\t" // write a + + " vmovups %%xmm12 , (%5) \n\t" // write c1 + " vmovups %%xmm13 , 16(%5) \n\t" +@@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , 
%%xmm11 \n\t" + + " \n\t" // i = 0 +- " subq $8 , %7 \n\t" // b = b - 2 +- " subq $64 , %6 \n\t" // a = a - 16 ++ " subq $8 , %3 \n\t" // b = b - 2 ++ " subq $64 , %2 \n\t" // a = a - 16 + +- " vbroadcastss (%7), %%xmm0 \n\t" // read bb ++ " vbroadcastss (%3), %%xmm0 \n\t" // read bb + + " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb + " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" + " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" + " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" + +- " vmovups %%xmm8 , (%6) \n\t" // write a +- " vmovups %%xmm9 , 16(%6) \n\t" +- " vmovups %%xmm10 , 32(%6) \n\t" +- " vmovups %%xmm11 , 48(%6) \n\t" ++ " vmovups %%xmm8 , (%2) \n\t" // write a ++ " vmovups %%xmm9 , 16(%2) \n\t" ++ " vmovups %%xmm10 , 32(%2) \n\t" ++ " vmovups %%xmm11 , 48(%2) \n\t" + + " vmovups %%xmm8 , (%4) \n\t" // write c0 + " vmovups %%xmm9 , 16(%4) \n\t" +@@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 +- "r" (c), // 4 +- "r" (c1), // 5 +- "r" (as), // 6 +- "r" (bs) // 7 ++ "r" (c), // 4 ++ "r" (c1), // 5 ++ "r" (a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/openblas.spec b/openblas.spec index 495358c..6d8fd95 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.5 -Release: 2%{?dist} +Release: 3%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -29,6 +29,14 @@ Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.2-tests.patch +# Fix assembly code +Patch10: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2010.patch +Patch11: 
https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2018.patch +Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2019.patch +Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch +Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch +Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch + BuildRequires: gcc BuildRequires: gcc-gfortran BuildRequires: perl-devel @@ -239,6 +247,13 @@ cd OpenBLAS-%{version} %endif %patch3 -p1 -b .tests +%patch10 -p0 +%patch11 -p0 +%patch12 -p0 +%patch13 -p0 +%patch14 -p0 +%patch15 -p0 + # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -674,6 +689,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Sun Feb 17 2019 Susi Lehtola - 0.3.5-3 +- Patch assembly kernels to satisfy gcc 9 demands. + * Fri Feb 01 2019 Fedora Release Engineering - 0.3.5-2 - Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild From 03740aadcefc9e261fbd433c90dd91da591bff22 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sun, 17 Feb 2019 21:49:09 +0100 Subject: [PATCH 28/44] Fix patch level --- openblas.spec | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/openblas.spec b/openblas.spec index 6d8fd95..919d632 100644 --- a/openblas.spec +++ b/openblas.spec @@ -247,12 +247,12 @@ cd OpenBLAS-%{version} %endif %patch3 -p1 -b .tests -%patch10 -p0 -%patch11 -p0 -%patch12 -p0 -%patch13 -p0 -%patch14 -p0 -%patch15 -p0 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 # Fix source permissions find -name \*.f -exec chmod 644 {} \; From ee6a88c55dbf57369f2bccb04eea130b8bf41ae8 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 25 Feb 2019 10:18:57 +0100 Subject: [PATCH 29/44] Add another assembly kernel patch. 
--- 2028.patch | 412 ++++++++++++++++++++++++++++++++++++++++++++++++++ openblas.spec | 7 +- 2 files changed, 418 insertions(+), 1 deletion(-) create mode 100644 2028.patch diff --git a/2028.patch b/2028.patch new file mode 100644 index 0000000..64d050f --- /dev/null +++ b/2028.patch @@ -0,0 +1,412 @@ +From 6eee1beac524b5582a6c6de14d9d35a78c1ece74 Mon Sep 17 00:00:00 2001 +From: Andrew <16061801+brada4@users.noreply.github.com> +Date: Sun, 24 Feb 2019 20:41:02 +0200 +Subject: [PATCH 2/2] move fix to right place + +--- + dgemv_n_microk_piledriver-4.c | 247 -------------------- + kernel/x86_64/dgemv_n_microk_piledriver-4.c | 98 ++++---- + 2 files changed, 49 insertions(+), 296 deletions(-) + delete mode 100644 dgemv_n_microk_piledriver-4.c + +diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c +deleted file mode 100644 +index 466931b82..000000000 +--- a/dgemv_n_microk_piledriver-4.c ++++ /dev/null +@@ -1,247 +0,0 @@ +-/*************************************************************************** +-Copyright (c) 2014, The OpenBLAS Project +-All rights reserved. +-Redistribution and use in source and binary forms, with or without +-modification, are permitted provided that the following conditions are +-met: +-1. Redistributions of source code must retain the above copyright +-notice, this list of conditions and the following disclaimer. +-2. Redistributions in binary form must reproduce the above copyright +-notice, this list of conditions and the following disclaimer in +-the documentation and/or other materials provided with the +-distribution. +-3. Neither the name of the OpenBLAS project nor the names of +-its contributors may be used to endorse or promote products +-derived from this software without specific prior written permission. 
+-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +-*****************************************************************************/ +- +- +- +-#define HAVE_KERNEL_4x8 1 +-static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); +- +-static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +-{ +- +- BLASLONG register i = 0; +- +- __asm__ __volatile__ +- ( +- "vzeroupper \n\t" +- "vbroadcastsd (%3), %%ymm12 \n\t" // x0 +- "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 +- "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 +- "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 +- "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 +- "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 +- "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 +- "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 +- +- "vbroadcastsd (%9), %%ymm6 \n\t" // alpha +- +- "testq $0x04, %1 \n\t" +- "jz 2f \n\t" +- +- "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y +- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" +- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" +- +- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" +- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" +- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" +- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" 
+- +- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" +- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" +- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" +- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" +- +- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" +- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" +- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" +- +- +- "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y +- +- "addq $4 , %2 \n\t" +- "addq $4 , %0 \n\t" +- "subq $4 , %1 \n\t" +- +- "2: \n\t" +- +- "cmpq $0, %1 \n\t" +- "je 3f \n\t" +- +- +- ".align 16 \n\t" +- "1: \n\t" +- +- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" +- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" +- "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y +- "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y +- +- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" +- "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" +- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" +- "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" +- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" +- "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" +- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" +- "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" +- +- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" +- "addq $8 , %0 \n\t" +- "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" +- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" +- "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" +- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" +- "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" +- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" +- "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" +- +- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" +- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" +- +- "addq $8 , %2 \n\t" +- "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y +- "subq $8 , %1 \n\t" +- "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y +- +- "jnz 1b \n\t" +- +- "3: \n\t" +- "vzeroupper \n\t" +- +- : +- "+r" (i), // 0 +- "+r" (n), // 1 +- "+r" (lda4) // 2 +- : +- "r" (x), // 3 +- "r" (y), // 4 +- "r" (ap[0]), // 5 +- "r" (ap[1]), // 6 +- 
"r" (ap[2]), // 7 +- "r" (ap[3]), // 8 +- "r" (alpha) // 9 +- : "cc", +- "%xmm0", "%xmm1", +- "%xmm2", "%xmm3", +- "%xmm4", "%xmm5", +- "%xmm6", "%xmm7", +- "%xmm8", "%xmm9", +- "%xmm12", "%xmm13", "%xmm14", "%xmm15", +- "memory" +- ); +- +-} +- +- +- +-#define HAVE_KERNEL_4x4 1 +-static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); +- +-static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +-{ +- +- BLASLONG register i = 0; +- +- __asm__ __volatile__ +- ( +- "vzeroupper \n\t" +- "vbroadcastsd (%2), %%ymm12 \n\t" // x0 +- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 +- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 +- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 +- +- "vbroadcastsd (%8), %%ymm6 \n\t" // alpha +- +- "testq $0x04, %1 \n\t" +- "jz 2f \n\t" +- +- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" +- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" +- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y +- +- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" +- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" +- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" +- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" +- +- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" +- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" +- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" +- +- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y +- +- "addq $4 , %0 \n\t" +- "subq $4 , %1 \n\t" +- +- "2: \n\t" +- +- "cmpq $0, %1 \n\t" +- "je 3f \n\t" +- +- +- ".align 16 \n\t" +- "1: \n\t" +- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" +- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" +- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y +- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y +- +- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" +- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" +- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" +- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" +- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" +- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" +- "vfmadd231pd 
(%7,%0,8), %%ymm15, %%ymm4 \n\t" +- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" +- +- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" +- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" +- +- "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y +- "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y +- +- "addq $8 , %0 \n\t" +- "subq $8 , %1 \n\t" +- "jnz 1b \n\t" +- +- "3: \n\t" +- "vzeroupper \n\t" +- +- : +- "+r" (i), // 0 +- "+r" (n) // 1 +- : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (alpha) // 8 +- : "cc", +- "%xmm4", "%xmm5", +- "%xmm6", "%xmm7", +- "%xmm8", "%xmm9", +- "%xmm12", "%xmm13", "%xmm14", "%xmm15", +- "memory" +- ); +- +-} +- +- +diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c +index 530780bab..466931b82 100644 +--- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c ++++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c +@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + __asm__ __volatile__ + ( + "vzeroupper \n\t" +- "vbroadcastsd (%2), %%ymm12 \n\t" // x0 +- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 +- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 +- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 +- "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 +- "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 +- "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 +- "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 ++ "vbroadcastsd (%3), %%ymm12 \n\t" // x0 ++ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 ++ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 ++ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 ++ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 ++ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 ++ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 ++ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 + + "vbroadcastsd (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + +- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y ++ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y + "vxorpd 
%%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + +- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" +- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" +- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" +- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" ++ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" + +- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" +- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" +- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" +- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" ++ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" ++ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" ++ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" ++ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + +- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y ++ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y + +- "addq $4 , %8 \n\t" ++ "addq $4 , %2 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + +@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" +- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y +- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y +- +- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" +- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" +- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" +- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" +- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" +- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" +- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" +- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" +- +- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" ++ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y ++ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y ++ ++ 
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" ++ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" ++ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" ++ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" ++ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" ++ ++ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" +- "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" +- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" +- "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" +- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" +- "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" +- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" +- "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" ++ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" ++ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" ++ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" ++ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" ++ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" ++ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" ++ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + +- "addq $8 , %8 \n\t" ++ "addq $8 , %2 \n\t" + "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y + "subq $8 , %1 \n\t" +- "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y ++ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y + + "jnz 1b \n\t" + +@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + : + "+r" (i), // 0 +- "+r" (n) // 1 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 + : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (lda4), // 8 ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 7 ++ "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", diff --git 
a/openblas.spec b/openblas.spec index 919d632..93e9af3 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.5 -Release: 3%{?dist} +Release: 4%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -36,6 +36,7 @@ Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pul Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch +Patch16: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2028.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -253,6 +254,7 @@ cd OpenBLAS-%{version} %patch13 -p1 %patch14 -p1 %patch15 -p1 +%patch16 -p1 # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -689,6 +691,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Mon Feb 25 2019 Susi Lehtola - 0.3.5-4 +- Another assembly kernel patch. + * Sun Feb 17 2019 Susi Lehtola - 0.3.5-3 - Patch assembly kernels to satisfy gcc 9 demands. 
From 4e591d87253d58798a9bb5f26fd38f14b81a802c Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 26 Feb 2019 01:11:21 +0100 Subject: [PATCH 30/44] More patches --- 1965.patch | 3283 +++++++++++++++++++++++++++++++++++++++++++++++++ 1966.patch | 960 +++++++++++++++ 1967.patch | 99 ++ openblas.spec | 11 +- 4 files changed, 4352 insertions(+), 1 deletion(-) create mode 100644 1965.patch create mode 100644 1966.patch create mode 100644 1967.patch diff --git a/1965.patch b/1965.patch new file mode 100644 index 0000000..5d8b935 --- /dev/null +++ b/1965.patch @@ -0,0 +1,3283 @@ +From f0dd0584306b42289cac77fdafe6997e449d4f38 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 19:56:10 +0100 +Subject: [PATCH 001/111] Tag operands 0 and 1 as both input and output + +For #1964 (basically a continuation of coding problems first seen in #1292) +--- + kernel/x86_64/caxpy_microk_bulldozer-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c +index 33bda0943..cb98f208a 100644 +--- a/kernel/x86_64/caxpy_microk_bulldozer-2.c ++++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c +@@ -115,8 +115,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 +@@ -182,8 +182,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 8a6bbf5a5bf4623795b2ff9aaa8d35467288d6c7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 19:57:27 +0100 +Subject: [PATCH 002/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/caxpy_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git 
a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c +index 00e2e6a42..f31cf9710 100644 +--- a/kernel/x86_64/caxpy_microk_haswell-2.c ++++ b/kernel/x86_64/caxpy_microk_haswell-2.c +@@ -113,8 +113,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 4e6f8fec31e83648c77c47398829b5191e671966 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 19:58:19 +0100 +Subject: [PATCH 003/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/caxpy_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c +index a798fd977..931d1ad47 100644 +--- a/kernel/x86_64/caxpy_microk_sandy-2.c ++++ b/kernel/x86_64/caxpy_microk_sandy-2.c +@@ -97,8 +97,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 663eef3b666e79c0e93f35cf79eada50040d9dd3 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 19:59:59 +0100 +Subject: [PATCH 004/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/caxpy_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c +index 87370b032..9aeb47968 100644 +--- a/kernel/x86_64/caxpy_microk_steamroller-2.c ++++ b/kernel/x86_64/caxpy_microk_steamroller-2.c +@@ -115,8 +115,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 +@@ -182,8 +182,8 @@ 
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From a671e19dd2cad6dc1e2e639f45a4faebf53b6f7f Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:02:01 +0100 +Subject: [PATCH 005/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/cdot_microk_bulldozer-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c +index f587aa036..e6d11f1af 100644 +--- a/kernel/x86_64/cdot_microk_bulldozer-2.c ++++ b/kernel/x86_64/cdot_microk_bulldozer-2.c +@@ -98,8 +98,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -177,8 +177,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 47e2b4592eb31860a58222bedc8a3208c153aa00 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:03:03 +0100 +Subject: [PATCH 006/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/cdot_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c +index fe195a63b..9fee7615d 100644 +--- a/kernel/x86_64/cdot_microk_haswell-2.c ++++ b/kernel/x86_64/cdot_microk_haswell-2.c +@@ -99,8 +99,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 30a7bd8e15fb68d3fa651bbf48e1e65fc6078090 Mon 
Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:03:50 +0100 +Subject: [PATCH 007/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/cdot_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c +index 01816917d..705c80c5c 100644 +--- a/kernel/x86_64/cdot_microk_sandy-2.c ++++ b/kernel/x86_64/cdot_microk_sandy-2.c +@@ -107,8 +107,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 2f5a7c1656b7975f71db2b8da90080938ccd3757 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:05:03 +0100 +Subject: [PATCH 008/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/cdot_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c +index 76a3aa0eb..5a46aed8c 100644 +--- a/kernel/x86_64/cdot_microk_steamroller-2.c ++++ b/kernel/x86_64/cdot_microk_steamroller-2.c +@@ -98,8 +98,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -177,8 +177,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From bb16456fe1ff372b61a7ab042418248f68ddddc6 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:23:58 +0100 +Subject: [PATCH 009/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/daxpy_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 
deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c +index 8c520dcf1..c9a01580e 100644 +--- a/kernel/x86_64/daxpy_microk_bulldozer-2.c ++++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c +@@ -65,8 +65,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 7af8f34df4efcc0ecaaa34c380119edcd5d206de Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:24:55 +0100 +Subject: [PATCH 010/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/daxpy_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c +index bbe8b9550..67431659d 100644 +--- a/kernel/x86_64/daxpy_microk_haswell-2.c ++++ b/kernel/x86_64/daxpy_microk_haswell-2.c +@@ -61,8 +61,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From d94e7da701dae1106854753b2d5b676255c1c0f4 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:25:56 +0100 +Subject: [PATCH 011/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/daxpy_microk_nehalem-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c +index 943d893af..61c99904a 100644 +--- a/kernel/x86_64/daxpy_microk_nehalem-2.c ++++ b/kernel/x86_64/daxpy_microk_nehalem-2.c +@@ -74,8 +74,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 
6008f6531855d615ad98febe65364074b99fa5bf Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:26:55 +0100 +Subject: [PATCH 012/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/daxpy_microk_piledriver-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c +index 95eb953b4..e3d605b75 100644 +--- a/kernel/x86_64/daxpy_microk_piledriver-2.c ++++ b/kernel/x86_64/daxpy_microk_piledriver-2.c +@@ -80,8 +80,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 +@@ -142,8 +142,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 9d46f84f24dc7284fc398574b811621e5c61e2dc Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:27:48 +0100 +Subject: [PATCH 013/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/daxpy_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c +index 85e038cef..1b827e7e2 100644 +--- a/kernel/x86_64/daxpy_microk_sandy-2.c ++++ b/kernel/x86_64/daxpy_microk_sandy-2.c +@@ -101,8 +101,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From ca02ac724f5b06e16a8941ef3b2582c251234679 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:28:56 +0100 +Subject: [PATCH 014/111] Tag operands 0 and 1 as both input and output + +--- + 
kernel/x86_64/daxpy_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c +index e40009037..2cab80067 100644 +--- a/kernel/x86_64/daxpy_microk_steamroller-2.c ++++ b/kernel/x86_64/daxpy_microk_steamroller-2.c +@@ -80,8 +80,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 +@@ -142,8 +142,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From c18c2c9d9b0cd7e82cb98c7b212ffb29648fb9e0 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:32:17 +0100 +Subject: [PATCH 015/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/ddot_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c +index 9756ee46a..379fd3ca1 100644 +--- a/kernel/x86_64/ddot_microk_bulldozer-2.c ++++ b/kernel/x86_64/ddot_microk_bulldozer-2.c +@@ -67,8 +67,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From c23c17163f1b7a5fb7652cbc038a50c01f9440c5 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:33:07 +0100 +Subject: [PATCH 016/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/ddot_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c +index 365737363..c0c277c32 100644 
+--- a/kernel/x86_64/ddot_microk_haswell-2.c ++++ b/kernel/x86_64/ddot_microk_haswell-2.c +@@ -78,8 +78,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From de207d10c1f11ef1f38b4f766909619ab744d64a Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:34:05 +0100 +Subject: [PATCH 017/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/ddot_microk_nehalem-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c +index fb5ec9bca..ea0b4eff1 100644 +--- a/kernel/x86_64/ddot_microk_nehalem-2.c ++++ b/kernel/x86_64/ddot_microk_nehalem-2.c +@@ -77,8 +77,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From c9078eb8b4481fbc1841bcbf36ba438bf2749632 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:35:14 +0100 +Subject: [PATCH 018/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/ddot_microk_piledriver-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c +index ac950885c..f7b74add6 100644 +--- a/kernel/x86_64/ddot_microk_piledriver-2.c ++++ b/kernel/x86_64/ddot_microk_piledriver-2.c +@@ -83,8 +83,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -147,8 +147,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 
1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 00aff05c4049cd697b4000b5f2e726496b34dc54 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:36:08 +0100 +Subject: [PATCH 019/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/ddot_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c +index 160f95604..e57eb37ea 100644 +--- a/kernel/x86_64/ddot_microk_sandy-2.c ++++ b/kernel/x86_64/ddot_microk_sandy-2.c +@@ -83,8 +83,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From dc15f3b5a7689a6cea1d31e004d7a3488bf9b66d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:37:06 +0100 +Subject: [PATCH 020/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/ddot_microk_steamroller-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c +index 5ce20b5de..845c78df1 100644 +--- a/kernel/x86_64/ddot_microk_steamroller-2.c ++++ b/kernel/x86_64/ddot_microk_steamroller-2.c +@@ -80,8 +80,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 3f1719a98da89f0a6f1d435d3f705aa083702ac7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:39:08 +0100 +Subject: [PATCH 021/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/saxpy_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c +index 
3a743d64c..3b03e11a4 100644 +--- a/kernel/x86_64/saxpy_microk_haswell-2.c ++++ b/kernel/x86_64/saxpy_microk_haswell-2.c +@@ -61,8 +61,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From b13f3c3bcfffcecbcc80454c90c31bc05dd5a04d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:39:57 +0100 +Subject: [PATCH 022/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/saxpy_microk_nehalem-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c +index 68f68ea3a..4ffb39acf 100644 +--- a/kernel/x86_64/saxpy_microk_nehalem-2.c ++++ b/kernel/x86_64/saxpy_microk_nehalem-2.c +@@ -74,8 +74,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 2bd18c7b73731d1b8bd900213fc7fa7a2356a357 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:40:50 +0100 +Subject: [PATCH 023/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/saxpy_microk_piledriver-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c +index 204cf8bac..87c5fe3cf 100644 +--- a/kernel/x86_64/saxpy_microk_piledriver-2.c ++++ b/kernel/x86_64/saxpy_microk_piledriver-2.c +@@ -80,8 +80,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 +@@ -141,8 +141,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- 
"r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 6fcb55b22f6e8b80e7f6ffcf228c70c0929915b5 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:41:41 +0100 +Subject: [PATCH 024/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/saxpy_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c +index 0a6bef046..5a8424d66 100644 +--- a/kernel/x86_64/saxpy_microk_sandy-2.c ++++ b/kernel/x86_64/saxpy_microk_sandy-2.c +@@ -101,8 +101,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 922e44897831f393cbeeb1406feb7fcf6e320281 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:42:35 +0100 +Subject: [PATCH 025/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/sdot_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c +index 36e61b077..5a6fc6da2 100644 +--- a/kernel/x86_64/sdot_microk_bulldozer-2.c ++++ b/kernel/x86_64/sdot_microk_bulldozer-2.c +@@ -68,8 +68,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From d384880da564344e92a8d60b08e3183ab02ba75b Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:43:24 +0100 +Subject: [PATCH 026/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/sdot_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git 
a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c +index df367b61f..89d9cfe61 100644 +--- a/kernel/x86_64/sdot_microk_haswell-2.c ++++ b/kernel/x86_64/sdot_microk_haswell-2.c +@@ -81,8 +81,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From cd3a35ee79b4b5fa00e5a446be2a6cceb3230874 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:44:13 +0100 +Subject: [PATCH 027/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/sdot_microk_nehalem-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c +index 1a27177f5..cef41b530 100644 +--- a/kernel/x86_64/sdot_microk_nehalem-2.c ++++ b/kernel/x86_64/sdot_microk_nehalem-2.c +@@ -77,8 +77,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From ba9f792e759ea97e75445b1fe1eaab4f3432f4f1 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:45:08 +0100 +Subject: [PATCH 028/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/sdot_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c +index ca13536f2..e77ba1424 100644 +--- a/kernel/x86_64/sdot_microk_sandy-2.c ++++ b/kernel/x86_64/sdot_microk_sandy-2.c +@@ -84,8 +84,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From c931bb8172bbdcbcfe6d2de281d2f83a7f5a3515 Mon Sep 17 00:00:00 2001 +From: 
Martin Kroeker +Date: Wed, 16 Jan 2019 20:46:19 +0100 +Subject: [PATCH 029/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/sdot_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c +index 6b8b2566b..bedde8fb6 100644 +--- a/kernel/x86_64/sdot_microk_steamroller-2.c ++++ b/kernel/x86_64/sdot_microk_steamroller-2.c +@@ -82,8 +82,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -145,8 +145,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 0172c51829110a5450b4d6d5f454bd4aa4106269 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:48:16 +0100 +Subject: [PATCH 030/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/zaxpy_microk_bulldozer-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c +index 0e15761f7..56493f8cb 100644 +--- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c ++++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c +@@ -115,8 +115,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 +@@ -182,8 +182,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 0cfb647a577058cebeaabadbe6ef62eebd2ce49e Mon Sep 17 00:00:00 2001 
+From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:51:34 +0100 +Subject: [PATCH 031/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/zaxpy_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c +index 30e8b1955..bd52ba01f 100644 +--- a/kernel/x86_64/zaxpy_microk_haswell-2.c ++++ b/kernel/x86_64/zaxpy_microk_haswell-2.c +@@ -113,8 +113,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 2b542d10368cbb8433b7274fb12b77845606d2fe Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:52:35 +0100 +Subject: [PATCH 032/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/zaxpy_microk_sandy-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c +index 233af143a..d6a9ff394 100644 +--- a/kernel/x86_64/zaxpy_microk_sandy-2.c ++++ b/kernel/x86_64/zaxpy_microk_sandy-2.c +@@ -101,8 +101,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 +@@ -178,8 +178,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From af29c99c85d9ea5c27b6e917ebb1dcdbe1292f7b Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 20:53:29 +0100 +Subject: [PATCH 033/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/zaxpy_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + 
+diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c +index 728d09213..58d4c7286 100644 +--- a/kernel/x86_64/zaxpy_microk_steamroller-2.c ++++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c +@@ -115,8 +115,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 +@@ -182,8 +182,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From f78531a9ec8ee28f7790505382231b3f5094b795 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 21:05:31 +0100 +Subject: [PATCH 034/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/zdot_microk_bulldozer-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c +index 30a9552d6..ed66cc674 100644 +--- a/kernel/x86_64/zdot_microk_bulldozer-2.c ++++ b/kernel/x86_64/zdot_microk_bulldozer-2.c +@@ -98,8 +98,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -177,8 +177,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From b6f4ef5aea58e5ea1225283e406cadf9416818fc Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 21:06:54 +0100 +Subject: [PATCH 035/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/zdot_microk_haswell-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + 
+diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c +index 11056a3c1..0e6ac55db 100644 +--- a/kernel/x86_64/zdot_microk_haswell-2.c ++++ b/kernel/x86_64/zdot_microk_haswell-2.c +@@ -103,8 +103,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -188,8 +188,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 715b1f263d6903f1af391c5278a9aa61f1753193 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 21:08:09 +0100 +Subject: [PATCH 036/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/zdot_microk_sandy-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c +index 87c5b0340..416265ae2 100644 +--- a/kernel/x86_64/zdot_microk_sandy-2.c ++++ b/kernel/x86_64/zdot_microk_sandy-2.c +@@ -109,8 +109,8 @@ if ( n < 1280 ) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -201,8 +201,8 @@ if ( n < 1280 ) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From e8d835ea466a1605db2157b6884a4cfe762478fc Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 21:09:03 +0100 +Subject: [PATCH 037/111] Tag operands 0 and 1 as both input and output + +--- + kernel/x86_64/zdot_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c +index 325f74ae3..fe1613fd4 100644 +--- 
a/kernel/x86_64/zdot_microk_steamroller-2.c ++++ b/kernel/x86_64/zdot_microk_steamroller-2.c +@@ -97,8 +97,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -174,8 +174,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From bbc30700e871d84c07d770f54b645ea3eee549fa Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:49:34 +0100 +Subject: [PATCH 038/111] Update saxpy_microk_nehalem-2.c + +--- + kernel/x86_64/saxpy_microk_nehalem-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c +index 4ffb39acf..e25156939 100644 +--- a/kernel/x86_64/saxpy_microk_nehalem-2.c ++++ b/kernel/x86_64/saxpy_microk_nehalem-2.c +@@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 300bb19b3ec0a48b7371d7c1be3ee88a29e87cf9 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:52:04 +0100 +Subject: [PATCH 039/111] Update caxpy_microk_bulldozer-2.c + +--- + kernel/x86_64/caxpy_microk_bulldozer-2.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c +index cb98f208a..faf5cdc40 100644 +--- a/kernel/x86_64/caxpy_microk_bulldozer-2.c ++++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c +@@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "vzeroupper \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + 
"r" (alpha), // 4 +@@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 1878e0c95aee9777f7c082bcc98ff12b04edc75d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:52:54 +0100 +Subject: [PATCH 040/111] Update caxpy_microk_haswell-2.c + +--- + kernel/x86_64/caxpy_microk_haswell-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c +index f31cf9710..a011b2bfa 100644 +--- a/kernel/x86_64/caxpy_microk_haswell-2.c ++++ b/kernel/x86_64/caxpy_microk_haswell-2.c +@@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "vzeroupper \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From f6be89295f4e21572a743d26e677256fc29ee8cf Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:53:35 +0100 +Subject: [PATCH 041/111] Update caxpy_microk_sandy-2.c + +--- + kernel/x86_64/caxpy_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c +index 931d1ad47..c760d6540 100644 +--- a/kernel/x86_64/caxpy_microk_sandy-2.c ++++ b/kernel/x86_64/caxpy_microk_sandy-2.c +@@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 4673e5317861de37b326181b0dfc8514a2b3b69d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:54:39 +0100 +Subject: [PATCH 042/111] Update caxpy_microk_steamroller-2.c + +--- + kernel/x86_64/caxpy_microk_steamroller-2.c | 6 
+++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c +index 9aeb47968..b6eb55f9b 100644 +--- a/kernel/x86_64/caxpy_microk_steamroller-2.c ++++ b/kernel/x86_64/caxpy_microk_steamroller-2.c +@@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 +@@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "vzeroupper \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From ba6d2c77a98f55431d8d2d4de4b6df99814352c1 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:55:38 +0100 +Subject: [PATCH 043/111] Update cdot_microk_bulldozer-2.c + +--- + kernel/x86_64/cdot_microk_bulldozer-2.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c +index e6d11f1af..c2245c6dc 100644 +--- a/kernel/x86_64/cdot_microk_bulldozer-2.c ++++ b/kernel/x86_64/cdot_microk_bulldozer-2.c +@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vzeroupper \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 093a3d7d5790efd7441611ee8c8769d4f3d997c0 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:56:15 +0100 +Subject: [PATCH 044/111] Update cdot_microk_haswell-2.c + +--- + kernel/x86_64/cdot_microk_haswell-2.c | 2 +- + 1 
file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c +index 9fee7615d..396dbeaa7 100644 +--- a/kernel/x86_64/cdot_microk_haswell-2.c ++++ b/kernel/x86_64/cdot_microk_haswell-2.c +@@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vzeroupper \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 2224bcb4f070e607ede67f2f6e089e2e99519517 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:57:01 +0100 +Subject: [PATCH 045/111] Update cdot_microk_sandy-2.c + +--- + kernel/x86_64/cdot_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c +index 705c80c5c..20ba48c00 100644 +--- a/kernel/x86_64/cdot_microk_sandy-2.c ++++ b/kernel/x86_64/cdot_microk_sandy-2.c +@@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 2414f1d796e23f8e9e4abba27e948f5877773640 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:57:56 +0100 +Subject: [PATCH 046/111] Update cdot_microk_steamroller-2.c + +--- + kernel/x86_64/cdot_microk_steamroller-2.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c +index 5a46aed8c..01754b147 100644 +--- a/kernel/x86_64/cdot_microk_steamroller-2.c ++++ b/kernel/x86_64/cdot_microk_steamroller-2.c +@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vzeroupper \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -175,10 
+175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From ae00befb3e3a9632d9545ba0af43f9afb90787b2 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:58:52 +0100 +Subject: [PATCH 047/111] Update daxpy_microk_bulldozer-2.c + +--- + kernel/x86_64/daxpy_microk_bulldozer-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c +index c9a01580e..2e2356fb6 100644 +--- a/kernel/x86_64/daxpy_microk_bulldozer-2.c ++++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c +@@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 604c574542a5fac237b5134610166fab26db1285 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 22:59:30 +0100 +Subject: [PATCH 048/111] Update daxpy_microk_haswell-2.c + +--- + kernel/x86_64/daxpy_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c +index 67431659d..c77fc33ef 100644 +--- a/kernel/x86_64/daxpy_microk_haswell-2.c ++++ b/kernel/x86_64/daxpy_microk_haswell-2.c +@@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 082498ee3b8470e992f33414e3097ca301f9efa7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:00:07 +0100 +Subject: [PATCH 049/111] Update daxpy_microk_nehalem-2.c + +--- + kernel/x86_64/daxpy_microk_nehalem-2.c | 2 +- + 1 file 
changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c +index 61c99904a..b81fe6562 100644 +--- a/kernel/x86_64/daxpy_microk_nehalem-2.c ++++ b/kernel/x86_64/daxpy_microk_nehalem-2.c +@@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 293f5531e66088d7149bebd68bcd7aa564b3a263 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:00:53 +0100 +Subject: [PATCH 050/111] Update daxpy_microk_piledriver-2.c + +--- + kernel/x86_64/daxpy_microk_piledriver-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c +index e3d605b75..efe93dfed 100644 +--- a/kernel/x86_64/daxpy_microk_piledriver-2.c ++++ b/kernel/x86_64/daxpy_microk_piledriver-2.c +@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "subq $16, %1 \n\t" + "jnz 1b \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 +@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "subq $16, %1 \n\t" + "jnz 1b \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 6cee8e0fdd463139f85656292971de1e4810d775 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:01:28 +0100 +Subject: [PATCH 051/111] Update daxpy_microk_sandy-2.c + +--- + kernel/x86_64/daxpy_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c +index 1b827e7e2..3b1214f36 100644 +--- a/kernel/x86_64/daxpy_microk_sandy-2.c ++++ 
b/kernel/x86_64/daxpy_microk_sandy-2.c +@@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 6450bf14afa94cade7d28330749dfbf255697026 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:02:32 +0100 +Subject: [PATCH 052/111] Update daxpy_microk_steamroller-2.c + +--- + kernel/x86_64/daxpy_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c +index 2cab80067..a5143682f 100644 +--- a/kernel/x86_64/daxpy_microk_steamroller-2.c ++++ b/kernel/x86_64/daxpy_microk_steamroller-2.c +@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "subq $16, %1 \n\t" + "jnz 1b \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 +@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "subq $16, %1 \n\t" + "jnz 1b \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From a339b45e51c58e5b13c01c6918282fb31941acdf Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:03:07 +0100 +Subject: [PATCH 053/111] Update ddot_microk_bulldozer-2.c + +--- + kernel/x86_64/ddot_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c +index 379fd3ca1..62bf7e7dc 100644 +--- a/kernel/x86_64/ddot_microk_bulldozer-2.c ++++ b/kernel/x86_64/ddot_microk_bulldozer-2.c +@@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + "vmovsd %%xmm4, (%4) \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 
++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 64fcdadf39137bdc56c56ead1e4d8f1bea32fe2a Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:03:44 +0100 +Subject: [PATCH 054/111] Update ddot_microk_haswell-2.c + +--- + kernel/x86_64/ddot_microk_haswell-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c +index c0c277c32..0cf4ece65 100644 +--- a/kernel/x86_64/ddot_microk_haswell-2.c ++++ b/kernel/x86_64/ddot_microk_haswell-2.c +@@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vzeroupper \n\t" + + : +- : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 504dd44e887cbd985bac3d48a2a7fdc3a03727d8 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:04:20 +0100 +Subject: [PATCH 055/111] Update ddot_microk_nehalem-2.c + +--- + kernel/x86_64/ddot_microk_nehalem-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c +index ea0b4eff1..086a0bb91 100644 +--- a/kernel/x86_64/ddot_microk_nehalem-2.c ++++ b/kernel/x86_64/ddot_microk_nehalem-2.c +@@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + "movsd %%xmm4, (%4) \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 56c67a929a2b215c3980a542c74a016f828e119d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:05:11 +0100 +Subject: [PATCH 056/111] Update ddot_microk_piledriver-2.c + +--- + kernel/x86_64/ddot_microk_piledriver-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c +index f7b74add6..d7347ebdf 100644 +--- 
a/kernel/x86_64/ddot_microk_piledriver-2.c ++++ b/kernel/x86_64/ddot_microk_piledriver-2.c +@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From b7ffbc40eca528e3aae46d004c1ad8e6fd013530 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:05:43 +0100 +Subject: [PATCH 057/111] Update ddot_microk_sandy-2.c + +--- + kernel/x86_64/ddot_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c +index e57eb37ea..28b1a8bd1 100644 +--- a/kernel/x86_64/ddot_microk_sandy-2.c ++++ b/kernel/x86_64/ddot_microk_sandy-2.c +@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovsd %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 0c9c31dbe4817ad24ecc2cc5dc553239a7c31590 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:06:20 +0100 +Subject: [PATCH 058/111] Update ddot_microk_steamroller-2.c + +--- + kernel/x86_64/ddot_microk_steamroller-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c +index 845c78df1..98cf94acf 100644 +--- a/kernel/x86_64/ddot_microk_steamroller-2.c ++++ b/kernel/x86_64/ddot_microk_steamroller-2.c +@@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovsd %%xmm4, (%4) \n\t" + 
"vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From d1b69022c935a37bbe3c8b09eb329a7468339ff0 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:07:04 +0100 +Subject: [PATCH 059/111] Update saxpy_microk_haswell-2.c + +--- + kernel/x86_64/saxpy_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c +index 3b03e11a4..3bc450f7b 100644 +--- a/kernel/x86_64/saxpy_microk_haswell-2.c ++++ b/kernel/x86_64/saxpy_microk_haswell-2.c +@@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 369a2b4af5680dfcbd1d8290077f62a4d74336fb Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:07:54 +0100 +Subject: [PATCH 060/111] Update saxpy_microk_piledriver-2.c + +--- + kernel/x86_64/saxpy_microk_piledriver-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c +index 87c5fe3cf..87e742ac7 100644 +--- a/kernel/x86_64/saxpy_microk_piledriver-2.c ++++ b/kernel/x86_64/saxpy_microk_piledriver-2.c +@@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 +@@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From dc931ad1fe709ad378d6d963fbde5bad421e5514 Mon Sep 17 00:00:00 2001 +From: Martin 
Kroeker +Date: Wed, 16 Jan 2019 23:08:27 +0100 +Subject: [PATCH 061/111] Update saxpy_microk_sandy-2.c + +--- + kernel/x86_64/saxpy_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c +index 5a8424d66..6ce67a7d1 100644 +--- a/kernel/x86_64/saxpy_microk_sandy-2.c ++++ b/kernel/x86_64/saxpy_microk_sandy-2.c +@@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From b2d6fea1cb99f0830c33e3667d1928be4496a31f Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:09:07 +0100 +Subject: [PATCH 062/111] Update sdot_microk_bulldozer-2.c + +--- + kernel/x86_64/sdot_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c +index 5a6fc6da2..c7f8cb1a7 100644 +--- a/kernel/x86_64/sdot_microk_bulldozer-2.c ++++ b/kernel/x86_64/sdot_microk_bulldozer-2.c +@@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + "vmovss %%xmm4, (%4) \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From ffc008663aef2dd318c58275fb8b68cc93de9a42 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:09:50 +0100 +Subject: [PATCH 063/111] Update sdot_microk_haswell-2.c + +--- + kernel/x86_64/sdot_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c +index 89d9cfe61..417fb3862 100644 +--- a/kernel/x86_64/sdot_microk_haswell-2.c ++++ b/kernel/x86_64/sdot_microk_haswell-2.c +@@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT 
*y, FLOAT *dot) + "vmovss %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 88b0dbfbddbc5170263bd06eb0aad0abf85faa81 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:10:30 +0100 +Subject: [PATCH 064/111] Update sdot_microk_nehalem-2.c + +--- + kernel/x86_64/sdot_microk_nehalem-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c +index cef41b530..115e7a410 100644 +--- a/kernel/x86_64/sdot_microk_nehalem-2.c ++++ b/kernel/x86_64/sdot_microk_nehalem-2.c +@@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + "movss %%xmm4, (%4) \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From ba9c3c4328a73821ce6067fb78b01b8817a92fa1 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:11:09 +0100 +Subject: [PATCH 065/111] Update sdot_microk_sandy-2.c + +--- + kernel/x86_64/sdot_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c +index e77ba1424..9d0795181 100644 +--- a/kernel/x86_64/sdot_microk_sandy-2.c ++++ b/kernel/x86_64/sdot_microk_sandy-2.c +@@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovss %%xmm4, (%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 266e72d24b767dbcdb97f597c899c7f495609c6f Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:12:07 +0100 +Subject: [PATCH 066/111] Update sdot_microk_steamroller-2.c + +--- + kernel/x86_64/sdot_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git 
a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c +index bedde8fb6..3475f890d 100644 +--- a/kernel/x86_64/sdot_microk_steamroller-2.c ++++ b/kernel/x86_64/sdot_microk_steamroller-2.c +@@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + "vmovss %%xmm4, (%4) \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + "vmovss %%xmm4, (%4) \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 72c3a4d1bd1daf3a98413dbea081f19fc6ee897d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:13:06 +0100 +Subject: [PATCH 067/111] Update zaxpy_microk_bulldozer-2.c + +--- + kernel/x86_64/zaxpy_microk_bulldozer-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c +index 56493f8cb..eed36ffd0 100644 +--- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c ++++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c +@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 +@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 157e65ff74b7760a19ed38e8796aab6ad0d2a152 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:13:41 +0100 +Subject: [PATCH 068/111] Update zaxpy_microk_haswell-2.c + +--- + kernel/x86_64/zaxpy_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 
deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c +index bd52ba01f..9aeea975b 100644 +--- a/kernel/x86_64/zaxpy_microk_haswell-2.c ++++ b/kernel/x86_64/zaxpy_microk_haswell-2.c +@@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 212b0a106d83491aeac793c6d45b4e494d06d868 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:14:28 +0100 +Subject: [PATCH 069/111] Update zaxpy_microk_sandy-2.c + +--- + kernel/x86_64/zaxpy_microk_sandy-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c +index d6a9ff394..cbd9b378f 100644 +--- a/kernel/x86_64/zaxpy_microk_sandy-2.c ++++ b/kernel/x86_64/zaxpy_microk_sandy-2.c +@@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 +@@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 2fa6d8107c40d780c988c8f23b5d61d6a0f8e8eb Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:15:18 +0100 +Subject: [PATCH 070/111] Update zaxpy_microk_steamroller-2.c + +--- + kernel/x86_64/zaxpy_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c +index 58d4c7286..5fc56aec7 100644 +--- a/kernel/x86_64/zaxpy_microk_steamroller-2.c ++++ 
b/kernel/x86_64/zaxpy_microk_steamroller-2.c +@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 +@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "jnz 1b \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha), // 4 + +From 79d5dd461d13953e8cade9a1dad43ad38cf93aaa Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:16:09 +0100 +Subject: [PATCH 071/111] Update zdot_microk_bulldozer-2.c + +--- + kernel/x86_64/zdot_microk_bulldozer-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c +index ed66cc674..a80eac003 100644 +--- a/kernel/x86_64/zdot_microk_bulldozer-2.c ++++ b/kernel/x86_64/zdot_microk_bulldozer-2.c +@@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From cb5cfffb1765ac8ef1e2f149aea1dc3e5fbb9623 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:16:55 +0100 +Subject: [PATCH 072/111] Update zdot_microk_haswell-2.c + +--- + kernel/x86_64/zdot_microk_haswell-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c +index 0e6ac55db..963d2e3bd 100644 +--- 
a/kernel/x86_64/zdot_microk_haswell-2.c ++++ b/kernel/x86_64/zdot_microk_haswell-2.c +@@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From f4e5f931ae5c14d284749c65d1e9ed08873afaa2 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:17:46 +0100 +Subject: [PATCH 073/111] Update zdot_microk_sandy-2.c + +--- + kernel/x86_64/zdot_microk_sandy-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c +index 416265ae2..88d4e1bbb 100644 +--- a/kernel/x86_64/zdot_microk_sandy-2.c ++++ b/kernel/x86_64/zdot_microk_sandy-2.c +@@ -107,10 +107,10 @@ if ( n < 1280 ) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -199,10 +199,10 @@ if ( n < 1280 ) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From ae2f3e617df8894ebe1779d3bcc78170bcad8b4c Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:18:27 +0100 +Subject: [PATCH 074/111] Update zdot_microk_steamroller-2.c + +--- + kernel/x86_64/zdot_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c +index fe1613fd4..2f11fe562 100644 +--- a/kernel/x86_64/zdot_microk_steamroller-2.c ++++ 
b/kernel/x86_64/zdot_microk_steamroller-2.c +@@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 +@@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + "vmovups %%xmm4, 16(%4) \n\t" + "vzeroupper \n\t" + +- : +- : ++ : + "+r" (i), // 0 + "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + +From 379aa11f4bfc5bb352372a3f423062267e73dd77 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:10:21 +0100 +Subject: [PATCH 075/111] Update caxpy_microk_bulldozer-2.c + +--- + kernel/x86_64/caxpy_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c +index faf5cdc40..ca2209340 100644 +--- a/kernel/x86_64/caxpy_microk_bulldozer-2.c ++++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c +@@ -115,7 +115,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -182,7 +182,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 983c72ab0fc182264a635d1c5286ceebc2b2f3e2 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:10:51 +0100 +Subject: [PATCH 076/111] Update caxpy_microk_haswell-2.c + +--- + kernel/x86_64/caxpy_microk_haswell-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c +index a011b2bfa..b605ea34c 100644 +--- a/kernel/x86_64/caxpy_microk_haswell-2.c ++++ b/kernel/x86_64/caxpy_microk_haswell-2.c +@@ -113,7 +113,7 @@ 
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 6f7f9967f945c145e6e4ceac14162e8dbc551f4c Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:11:21 +0100 +Subject: [PATCH 077/111] Update caxpy_microk_sandy-2.c + +--- + kernel/x86_64/caxpy_microk_sandy-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c +index c760d6540..72d37afed 100644 +--- a/kernel/x86_64/caxpy_microk_sandy-2.c ++++ b/kernel/x86_64/caxpy_microk_sandy-2.c +@@ -97,7 +97,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From aa799573b5f91e786ef41116b9fd030161fb6a10 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:11:59 +0100 +Subject: [PATCH 078/111] Update caxpy_microk_steamroller-2.c + +--- + kernel/x86_64/caxpy_microk_steamroller-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c +index b6eb55f9b..7ca7af070 100644 +--- a/kernel/x86_64/caxpy_microk_steamroller-2.c ++++ b/kernel/x86_64/caxpy_microk_steamroller-2.c +@@ -115,7 +115,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -182,7 +182,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From f9497bdab685ca8b9bea018c900df24b7dd2aad7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:12:37 +0100 +Subject: [PATCH 079/111] Update cdot_microk_bulldozer-2.c + +--- + 
kernel/x86_64/cdot_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c +index c2245c6dc..118655913 100644 +--- a/kernel/x86_64/cdot_microk_bulldozer-2.c ++++ b/kernel/x86_64/cdot_microk_bulldozer-2.c +@@ -98,7 +98,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -177,7 +177,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From de4c5a9258b3c29e1e305660c50e7b4cf8204c46 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:13:09 +0100 +Subject: [PATCH 080/111] Update daxpy_microk_haswell-2.c + +--- + kernel/x86_64/daxpy_microk_haswell-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c +index c77fc33ef..f3682e6d7 100644 +--- a/kernel/x86_64/daxpy_microk_haswell-2.c ++++ b/kernel/x86_64/daxpy_microk_haswell-2.c +@@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 59ca748c9ec75cf57148bcf4de06dc328f227845 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:13:38 +0100 +Subject: [PATCH 081/111] Update daxpy_microk_nehalem-2.c + +--- + kernel/x86_64/daxpy_microk_nehalem-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c +index b81fe6562..8feb9f26c 100644 +--- a/kernel/x86_64/daxpy_microk_nehalem-2.c ++++ b/kernel/x86_64/daxpy_microk_nehalem-2.c +@@ -74,7 +74,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT 
*y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 5f2ef0e70fb180022f3447826029f42c75c6fbb5 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:14:13 +0100 +Subject: [PATCH 082/111] Update daxpy_microk_piledriver-2.c + +--- + kernel/x86_64/daxpy_microk_piledriver-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c +index efe93dfed..4b83124c7 100644 +--- a/kernel/x86_64/daxpy_microk_piledriver-2.c ++++ b/kernel/x86_64/daxpy_microk_piledriver-2.c +@@ -80,7 +80,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -142,7 +142,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From c5b01c8be14c3cc3b364b9067124695e2d91c63a Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:14:43 +0100 +Subject: [PATCH 083/111] Update daxpy_microk_sandy-2.c + +--- + kernel/x86_64/daxpy_microk_sandy-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c +index 3b1214f36..db9a45de8 100644 +--- a/kernel/x86_64/daxpy_microk_sandy-2.c ++++ b/kernel/x86_64/daxpy_microk_sandy-2.c +@@ -101,7 +101,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From d4f3b733dc1026c9d1bfa8bea5696353de3b47c0 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:15:18 +0100 +Subject: [PATCH 084/111] Update daxpy_microk_steamroller-2.c + +--- + kernel/x86_64/daxpy_microk_steamroller-2.c | 4 ++-- + 1 file 
changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c +index a5143682f..8e63fcc1d 100644 +--- a/kernel/x86_64/daxpy_microk_steamroller-2.c ++++ b/kernel/x86_64/daxpy_microk_steamroller-2.c +@@ -80,7 +80,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -142,7 +142,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From dcfab783f725abb0280a77f61a4083be581e89b8 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:15:57 +0100 +Subject: [PATCH 085/111] Update ddot_microk_bulldozer-2.c + +--- + kernel/x86_64/ddot_microk_bulldozer-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c +index 62bf7e7dc..5590c5b17 100644 +--- a/kernel/x86_64/ddot_microk_bulldozer-2.c ++++ b/kernel/x86_64/ddot_microk_bulldozer-2.c +@@ -67,7 +67,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 0779654cb47dbc9984f344d5b7ffa68e39afdbc3 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:16:26 +0100 +Subject: [PATCH 086/111] Update ddot_microk_haswell-2.c + +--- + kernel/x86_64/ddot_microk_haswell-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c +index 0cf4ece65..dbb5487f7 100644 +--- a/kernel/x86_64/ddot_microk_haswell-2.c ++++ b/kernel/x86_64/ddot_microk_haswell-2.c +@@ -78,7 +78,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" 
(n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 29028652213235c1d2e7dc18d49daa86f3356574 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:16:53 +0100 +Subject: [PATCH 087/111] Update ddot_microk_nehalem-2.c + +--- + kernel/x86_64/ddot_microk_nehalem-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c +index 086a0bb91..e5e234e22 100644 +--- a/kernel/x86_64/ddot_microk_nehalem-2.c ++++ b/kernel/x86_64/ddot_microk_nehalem-2.c +@@ -77,7 +77,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 6df88c7c455c37a18a16f1cbd003b640ef6777f0 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:17:43 +0100 +Subject: [PATCH 088/111] Update cdot_microk_haswell-2.c + +--- + kernel/x86_64/cdot_microk_haswell-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c +index 396dbeaa7..8b9d6d104 100644 +--- a/kernel/x86_64/cdot_microk_haswell-2.c ++++ b/kernel/x86_64/cdot_microk_haswell-2.c +@@ -99,7 +99,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 81691c726eb55df75f638794fe3afff70cc3286d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:18:11 +0100 +Subject: [PATCH 089/111] Update cdot_microk_sandy-2.c + +--- + kernel/x86_64/cdot_microk_sandy-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c +index 20ba48c00..fe142c38f 100644 +--- a/kernel/x86_64/cdot_microk_sandy-2.c ++++ b/kernel/x86_64/cdot_microk_sandy-2.c +@@ -107,7 +107,7 @@ static void 
cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From ab8cc007364b9477e13c107a7befce7668c10ebb Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:18:47 +0100 +Subject: [PATCH 090/111] Update cdot_microk_steamroller-2.c + +--- + kernel/x86_64/cdot_microk_steamroller-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c +index 01754b147..7350b21c9 100644 +--- a/kernel/x86_64/cdot_microk_steamroller-2.c ++++ b/kernel/x86_64/cdot_microk_steamroller-2.c +@@ -98,7 +98,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -177,7 +177,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From bdcba6adda368da48e450cdc3b9c9f7b6c52e630 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:19:32 +0100 +Subject: [PATCH 091/111] Update daxpy_microk_bulldozer-2.c + +--- + kernel/x86_64/daxpy_microk_bulldozer-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c +index 2e2356fb6..9c1305b97 100644 +--- a/kernel/x86_64/daxpy_microk_bulldozer-2.c ++++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c +@@ -65,7 +65,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From e9fc4dfdead60ed013e016c62215170d04b5ad9d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:20:20 +0100 +Subject: [PATCH 092/111] Update ddot_microk_piledriver-2.c + +--- + 
kernel/x86_64/ddot_microk_piledriver-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c +index d7347ebdf..cc4bcd90a 100644 +--- a/kernel/x86_64/ddot_microk_piledriver-2.c ++++ b/kernel/x86_64/ddot_microk_piledriver-2.c +@@ -83,7 +83,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -147,7 +147,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 9430424102257485eae76482f495402260e9682d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:20:48 +0100 +Subject: [PATCH 093/111] Update ddot_microk_sandy-2.c + +--- + kernel/x86_64/ddot_microk_sandy-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c +index 28b1a8bd1..84493ec27 100644 +--- a/kernel/x86_64/ddot_microk_sandy-2.c ++++ b/kernel/x86_64/ddot_microk_sandy-2.c +@@ -83,7 +83,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 129a987e4b55f13c413f4eaad58465443051dd43 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:21:26 +0100 +Subject: [PATCH 094/111] Update ddot_microk_steamroller-2.c + +--- + kernel/x86_64/ddot_microk_steamroller-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c +index 98cf94acf..27d5244ce 100644 +--- a/kernel/x86_64/ddot_microk_steamroller-2.c ++++ b/kernel/x86_64/ddot_microk_steamroller-2.c +@@ -80,7 +80,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT 
*y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 49789c39fb2a55dacc146f079c1c5fab45d3ce2e Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:22:17 +0100 +Subject: [PATCH 095/111] Update saxpy_microk_haswell-2.c + +--- + kernel/x86_64/saxpy_microk_haswell-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c +index 3bc450f7b..7099ba4c6 100644 +--- a/kernel/x86_64/saxpy_microk_haswell-2.c ++++ b/kernel/x86_64/saxpy_microk_haswell-2.c +@@ -61,7 +61,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 7f556b81fb40ca6d90529829b802b38adbc747d7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:22:46 +0100 +Subject: [PATCH 096/111] Update saxpy_microk_nehalem-2.c + +--- + kernel/x86_64/saxpy_microk_nehalem-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c +index e25156939..88bbb695d 100644 +--- a/kernel/x86_64/saxpy_microk_nehalem-2.c ++++ b/kernel/x86_64/saxpy_microk_nehalem-2.c +@@ -74,7 +74,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From cb75878f98892850b29fc7a0b427500a56d244dd Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:23:16 +0100 +Subject: [PATCH 097/111] Update saxpy_microk_piledriver-2.c + +--- + kernel/x86_64/saxpy_microk_piledriver-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c +index 87e742ac7..5feea7f24 100644 +--- 
a/kernel/x86_64/saxpy_microk_piledriver-2.c ++++ b/kernel/x86_64/saxpy_microk_piledriver-2.c +@@ -80,7 +80,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -141,7 +141,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 65719fcb41987c499c31455fe7b0290800cacdd6 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:23:44 +0100 +Subject: [PATCH 098/111] Update saxpy_microk_sandy-2.c + +--- + kernel/x86_64/saxpy_microk_sandy-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c +index 6ce67a7d1..0d448d5f8 100644 +--- a/kernel/x86_64/saxpy_microk_sandy-2.c ++++ b/kernel/x86_64/saxpy_microk_sandy-2.c +@@ -101,7 +101,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From b52e763084040ed624fff574fba1fe1bc58b1cc7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:24:16 +0100 +Subject: [PATCH 099/111] Update sdot_microk_bulldozer-2.c + +--- + kernel/x86_64/sdot_microk_bulldozer-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c +index c7f8cb1a7..8958a33dc 100644 +--- a/kernel/x86_64/sdot_microk_bulldozer-2.c ++++ b/kernel/x86_64/sdot_microk_bulldozer-2.c +@@ -68,7 +68,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 2c021aeb9c018e4da2a7a0a5c0315d06d689a3c2 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 
09:24:42 +0100 +Subject: [PATCH 100/111] Update sdot_microk_haswell-2.c + +--- + kernel/x86_64/sdot_microk_haswell-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c +index 417fb3862..91dc928d3 100644 +--- a/kernel/x86_64/sdot_microk_haswell-2.c ++++ b/kernel/x86_64/sdot_microk_haswell-2.c +@@ -81,7 +81,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From bb43f185cf2f4354b62b779a369b53db3607598d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:25:15 +0100 +Subject: [PATCH 101/111] Update sdot_microk_nehalem-2.c + +--- + kernel/x86_64/sdot_microk_nehalem-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c +index 115e7a410..5a715d008 100644 +--- a/kernel/x86_64/sdot_microk_nehalem-2.c ++++ b/kernel/x86_64/sdot_microk_nehalem-2.c +@@ -77,7 +77,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 3b98d1e16d48f08540952624e9aa7843d5384ceb Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:25:43 +0100 +Subject: [PATCH 102/111] Update sdot_microk_sandy-2.c + +--- + kernel/x86_64/sdot_microk_sandy-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c +index 9d0795181..ae25d5a50 100644 +--- a/kernel/x86_64/sdot_microk_sandy-2.c ++++ b/kernel/x86_64/sdot_microk_sandy-2.c +@@ -84,7 +84,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 
7009a0337f674911ebe6d9ce6d1bf9b21472e05e Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:26:24 +0100 +Subject: [PATCH 103/111] Update sdot_microk_steamroller-2.c + +--- + kernel/x86_64/sdot_microk_steamroller-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c +index 3475f890d..bf6a5f287 100644 +--- a/kernel/x86_64/sdot_microk_steamroller-2.c ++++ b/kernel/x86_64/sdot_microk_steamroller-2.c +@@ -82,7 +82,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -145,7 +145,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From f117a2e4aa3e100015d479dd61530019db66e53f Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:27:34 +0100 +Subject: [PATCH 104/111] Update zaxpy_microk_bulldozer-2.c + +--- + kernel/x86_64/zaxpy_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c +index eed36ffd0..15d367971 100644 +--- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c ++++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c +@@ -115,7 +115,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -182,7 +182,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 752d4e88089ce1ff5ab27b25de382750b5e4a9c7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:28:00 +0100 +Subject: [PATCH 105/111] Update 
zaxpy_microk_haswell-2.c + +--- + kernel/x86_64/zaxpy_microk_haswell-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c +index 9aeea975b..89d23daf3 100644 +--- a/kernel/x86_64/zaxpy_microk_haswell-2.c ++++ b/kernel/x86_64/zaxpy_microk_haswell-2.c +@@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 0f905d346e8c0bda5bbf7cb6ae7f7a6ad137aa76 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:28:40 +0100 +Subject: [PATCH 106/111] Update zaxpy_microk_sandy-2.c + +--- + kernel/x86_64/zaxpy_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c +index cbd9b378f..17b8b24f7 100644 +--- a/kernel/x86_64/zaxpy_microk_sandy-2.c ++++ b/kernel/x86_64/zaxpy_microk_sandy-2.c +@@ -101,7 +101,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -178,7 +178,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 39a29ef0ce2de84526cf8e71881e6117b4532f84 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:29:18 +0100 +Subject: [PATCH 107/111] Update zaxpy_microk_steamroller-2.c + +--- + kernel/x86_64/zaxpy_microk_steamroller-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c +index 5fc56aec7..907b1ae00 100644 +--- a/kernel/x86_64/zaxpy_microk_steamroller-2.c ++++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c +@@ -115,7 +115,7 @@ 
static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -182,7 +182,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 1496c1a69f4d0c521d797b1847363c38e46958d5 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:30:03 +0100 +Subject: [PATCH 108/111] Update zdot_microk_bulldozer-2.c + +--- + kernel/x86_64/zdot_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c +index a80eac003..db9a48cce 100644 +--- a/kernel/x86_64/zdot_microk_bulldozer-2.c ++++ b/kernel/x86_64/zdot_microk_bulldozer-2.c +@@ -98,7 +98,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -177,7 +177,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 8f09f06f2c964ece75730dadd99e569844497fe6 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:30:43 +0100 +Subject: [PATCH 109/111] Update zdot_microk_haswell-2.c + +--- + kernel/x86_64/zdot_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c +index 963d2e3bd..9f2fc2c1d 100644 +--- a/kernel/x86_64/zdot_microk_haswell-2.c ++++ b/kernel/x86_64/zdot_microk_haswell-2.c +@@ -103,7 +103,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -188,7 +188,7 @@ static void 
zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From fca3f8610fbeb0a4a4198eb0f2fc74f91cd6e85d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:31:24 +0100 +Subject: [PATCH 110/111] Update zdot_microk_sandy-2.c + +--- + kernel/x86_64/zdot_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c +index 88d4e1bbb..33415e26e 100644 +--- a/kernel/x86_64/zdot_microk_sandy-2.c ++++ b/kernel/x86_64/zdot_microk_sandy-2.c +@@ -109,7 +109,7 @@ if ( n < 1280 ) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -201,7 +201,7 @@ if ( n < 1280 ) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From 6976222962772b395054016e99faac34986b5e59 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:32:05 +0100 +Subject: [PATCH 111/111] Update zdot_microk_steamroller-2.c + +--- + kernel/x86_64/zdot_microk_steamroller-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c +index 2f11fe562..87138fe9a 100644 +--- a/kernel/x86_64/zdot_microk_steamroller-2.c ++++ b/kernel/x86_64/zdot_microk_steamroller-2.c +@@ -97,7 +97,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 +@@ -174,7 +174,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 diff --git a/1966.patch b/1966.patch new file mode 100644 index 0000000..c2663cd --- /dev/null +++ b/1966.patch @@ -0,0 +1,960 @@ +From 
63cdd8f4a04f3a5ac1733e202b6b3678c34fb8dd Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:27:38 +0100 +Subject: [PATCH 01/18] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/cscal_microk_bulldozer-2.c | 32 ++++++++++++------------ + 1 file changed, 16 insertions(+), 16 deletions(-) + +diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c +index 3abffc4cf..f526fd611 100644 +--- a/kernel/x86_64/cscal_microk_bulldozer-2.c ++++ b/kernel/x86_64/cscal_microk_bulldozer-2.c +@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11", + +From b6136be686e415fbdb035267c5020cb08e4e49ac Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:30:03 +0100 +Subject: [PATCH 02/18] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/cscal_microk_haswell-2.c | 30 +++++++++++++------------- + 1 file changed, 15 insertions(+), 15 deletions(-) + +diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c +index 0a4eb683c..8623dcd10 100644 +--- a/kernel/x86_64/cscal_microk_haswell-2.c ++++ b/kernel/x86_64/cscal_microk_haswell-2.c +@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"0", "1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 + : "cc", // "0", "1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", +@@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 + : "cc", //"%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", +@@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + "vzeroupper \n\t" + +- : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ : ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"0", "1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + +From f447fb4c54870710cd6304553df59f50ff51b8f5 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker 
+Date: Wed, 16 Jan 2019 23:32:48 +0100 +Subject: [PATCH 03/18] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++----------- + 1 file changed, 16 insertions(+), 16 deletions(-) + +diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c +index 8346e1748..fbeb857e2 100644 +--- a/kernel/x86_64/cscal_microk_steamroller-2.c ++++ b/kernel/x86_64/cscal_microk_steamroller-2.c +@@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"0", "1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + + "vzeroupper \n\t" + ++ : ++ "+r" (n), // 0 ++ "+r" (x), // 1 + : +- : +- "r" (n), // 0 +- "r" (x), // 1 + "r" (alpha) // 2 +- : "cc", //"0", "1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"0", "1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + +From 
fcd7fde5702cf7270332a5dd747f83efe7be93dd Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:35:18 +0100 +Subject: [PATCH 04/18] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c +index de53b0bc4..71d3a9846 100644 +--- a/kernel/x86_64/dscal_microk_bulldozer-2.c ++++ b/kernel/x86_64/dscal_microk_bulldozer-2.c +@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n1), // 0 +- "r" (x), // 1 ++ "+r" (n1), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", +@@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n1), // 0 +- "r" (x), // 1 ++ "+r" (n1), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + +From 05e961994401bfc6dc8639fa9bc159148569ca9d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:36:37 +0100 +Subject: [PATCH 05/18] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c +index e732a2718..90790cfdc 100644 +--- a/kernel/x86_64/dscal_microk_haswell-2.c ++++ b/kernel/x86_64/dscal_microk_haswell-2.c +@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n1), // 0 +- "r" (x), // 1 ++ "+r" (n1), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", +@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + "vzeroupper \n\t" + ++ : ++ "+r" (n1), // 0 ++ "+r" (x), // 1 + : +- : +- "r" (n1), // 0 
+- "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + +From 7a11cc5b9f7c9669ee1f9818a1ea3f44c2f6d98d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:37:49 +0100 +Subject: [PATCH 06/18] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c +index 8d855072b..0f187ba88 100644 +--- a/kernel/x86_64/dscal_microk_sandy-2.c ++++ b/kernel/x86_64/dscal_microk_sandy-2.c +@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n1), // 0 +- "r" (x), // 1 ++ "+r" (n1), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", +@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + "vzeroupper \n\t" + ++ : ++ "+r" (n1), // 0 ++ "+r" (x), // 1 + : +- : +- "r" (n1), // 0 +- "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + +From a6c06bffe1ec60ec359b300b8cc9e18b30c72d0d Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:40:28 +0100 +Subject: [PATCH 07/18] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c +index 03882d6b6..1ce59d2c7 100644 +--- a/kernel/x86_64/zscal_microk_bulldozer-2.c ++++ b/kernel/x86_64/zscal_microk_bulldozer-2.c +@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", 
"%xmm11", +@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + +From 5efc7ce079fd87de9ab7ca20aaaf8c5c627170fa Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:42:34 +0100 +Subject: [PATCH 08/18] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++++------------- + 1 file changed, 16 insertions(+), 16 deletions(-) + +diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c +index d9253c1ed..534370959 100644 +--- a/kernel/x86_64/zscal_microk_haswell-2.c ++++ b/kernel/x86_64/zscal_microk_haswell-2.c +@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", 
"%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + +From 1a1471c6be597a176a4dbfe2757c134eb3780af0 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Wed, 16 Jan 2019 23:44:42 +0100 +Subject: [PATCH 09/18] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++----------- + 1 file changed, 16 insertions(+), 16 deletions(-) + +diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c +index 97b07add6..4b489d9f3 100644 +--- a/kernel/x86_64/zscal_microk_steamroller-2.c ++++ b/kernel/x86_64/zscal_microk_steamroller-2.c +@@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + + "vzeroupper \n\t" + ++ : ++ "+r" (n), // 0 ++ "+r" (x), // 1 + : +- : +- "r" (n), // 0 +- "r" (x), // 1 + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", 
"%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", +@@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x), // 1 ++ : + "r" (alpha) // 2 +- : "cc", //"%0", "%1", ++ : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + +From 90e28665183cd8da3a6129016977f57dd415c6a9 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:38:20 +0100 +Subject: [PATCH 10/18] Remove stray comma + +--- + kernel/x86_64/cscal_microk_bulldozer-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c +index f526fd611..31451aa6c 100644 +--- a/kernel/x86_64/cscal_microk_bulldozer-2.c ++++ b/kernel/x86_64/cscal_microk_bulldozer-2.c +@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", + +From b8dd71bddcb41d3d88af1a1eb77f845760452f5f Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:39:23 +0100 +Subject: [PATCH 11/18] Remove stray comma + +--- + kernel/x86_64/cscal_microk_haswell-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 
deletions(-) + +diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c +index 8623dcd10..a04a4c4ab 100644 +--- a/kernel/x86_64/cscal_microk_haswell-2.c ++++ b/kernel/x86_64/cscal_microk_haswell-2.c +@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", // "0", "1", +@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", //"%0", "%1", +@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", + +From 8c9a6356eaba102124147856422b9a0570daeb55 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:40:25 +0100 +Subject: [PATCH 12/18] Remove stray comma + +--- + kernel/x86_64/cscal_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c +index fbeb857e2..e8073d485 100644 +--- a/kernel/x86_64/cscal_microk_steamroller-2.c ++++ b/kernel/x86_64/cscal_microk_steamroller-2.c +@@ -118,7 +118,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -210,7 +210,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -287,7 +287,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 
+- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -332,7 +332,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", + +From ebe8882eb23e88d410f824d8d6a113f0fca94a3b Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:41:27 +0100 +Subject: [PATCH 13/18] Remove stray comma + +--- + kernel/x86_64/dscal_microk_bulldozer-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c +index 71d3a9846..096662781 100644 +--- a/kernel/x86_64/dscal_microk_bulldozer-2.c ++++ b/kernel/x86_64/dscal_microk_bulldozer-2.c +@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n1), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha), // 2 + "r" (n2) // 3 +@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n1), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha), // 2 + "r" (n2) // 3 + +From fd3e2c862286019589530ece0a61be6d86a01e92 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:42:12 +0100 +Subject: [PATCH 14/18] Remove stray comma + +--- + kernel/x86_64/dscal_microk_sandy-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c +index 0f187ba88..9982b8e58 100644 +--- a/kernel/x86_64/dscal_microk_sandy-2.c ++++ b/kernel/x86_64/dscal_microk_sandy-2.c +@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n1), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha), // 2 + "r" (n2) // 3 +@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n1), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha), // 2 + 
"r" (n2) // 3 + +From 45339034256043b4405fd6330f918cbed3660ac4 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:43:14 +0100 +Subject: [PATCH 15/18] Remove stray comma + +--- + kernel/x86_64/dscal_microk_haswell-2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c +index 90790cfdc..77ed59a4e 100644 +--- a/kernel/x86_64/dscal_microk_haswell-2.c ++++ b/kernel/x86_64/dscal_microk_haswell-2.c +@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n1), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha), // 2 + "r" (n2) // 3 +@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n1), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha), // 2 + "r" (n2) // 3 + +From 3b0b5ce0f69a45753b126d8bd96a48de2f882a4c Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:46:05 +0100 +Subject: [PATCH 16/18] Remove stray comma + +--- + kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c +index 1ce59d2c7..5e733ffda 100644 +--- a/kernel/x86_64/zscal_microk_bulldozer-2.c ++++ b/kernel/x86_64/zscal_microk_bulldozer-2.c +@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + "vzeroupper \n\t" + + : +- : +- "r" (n), // 0 +- "r" (x), // 1 ++ "+r" (n), // 0 ++ "+r" (x) // 1 ++ : + "r" (alpha) // 
2 + : "cc", //"%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", +@@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + "vzeroupper \n\t" + ++ : ++ "+r" (n), // 0 ++ "+r" (x) // 1 + : +- : +- "r" (n), // 0 +- "r" (x), // 1 + "r" (alpha) // 2 + : "cc", //"%0", "%1", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + +From c17d2f61c2387b5a6cfab22d964d70afcce69b23 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:47:12 +0100 +Subject: [PATCH 17/18] Remove stray comma + +--- + kernel/x86_64/zscal_microk_haswell-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c +index 534370959..8c8f5b75c 100644 +--- a/kernel/x86_64/zscal_microk_haswell-2.c ++++ b/kernel/x86_64/zscal_microk_haswell-2.c +@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -286,7 +286,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -331,7 +331,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", + +From ccb2b2175751037b5625b4ec3c60ddca26a04394 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:48:40 +0100 +Subject: [PATCH 18/18] Remove stray comma + +--- + kernel/x86_64/zscal_microk_steamroller-2.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c +index 
4b489d9f3..c9267ee0c 100644 +--- a/kernel/x86_64/zscal_microk_steamroller-2.c ++++ b/kernel/x86_64/zscal_microk_steamroller-2.c +@@ -118,7 +118,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -210,7 +210,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -287,7 +287,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", +@@ -332,7 +332,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) + + : + "+r" (n), // 0 +- "+r" (x), // 1 ++ "+r" (x) // 1 + : + "r" (alpha) // 2 + : "cc", diff --git a/1967.patch b/1967.patch new file mode 100644 index 0000000..c7066fa --- /dev/null +++ b/1967.patch @@ -0,0 +1,99 @@ +From 7ff08e4b06e2c643829b566a4f2c1daba25b1029 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 00:04:44 +0100 +Subject: [PATCH 1/4] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/dger_microk_sandy-2.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c +index 2bf966a5f..944d4c6f1 100644 +--- a/kernel/x86_64/dger_microk_sandy-2.c ++++ b/kernel/x86_64/dger_microk_sandy-2.c +@@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "vzeroupper \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 003583675d31ce5ddabfede7fc0f93cfbac51e5f Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 00:05:47 +0100 +Subject: [PATCH 2/4] Tag arguments 0 and 1 as both input and output + +--- + kernel/x86_64/sger_microk_sandy-2.c 
| 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c +index 79180b991..d38fdd551 100644 +--- a/kernel/x86_64/sger_microk_sandy-2.c ++++ b/kernel/x86_64/sger_microk_sandy-2.c +@@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + "vzeroupper \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n), // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + +From 78aeb19e4613104c1ae8ea1c67022451dcfed7e6 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:34:12 +0100 +Subject: [PATCH 3/4] Remove stray comma + +--- + kernel/x86_64/sger_microk_sandy-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c +index d38fdd551..14f13475b 100644 +--- a/kernel/x86_64/sger_microk_sandy-2.c ++++ b/kernel/x86_64/sger_microk_sandy-2.c +@@ -106,7 +106,7 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + +From d3e7e25bfb73e16bdbf89ee07d0ab584339be2a0 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 17 Jan 2019 09:35:56 +0100 +Subject: [PATCH 4/4] Remove stray comma + +--- + kernel/x86_64/dger_microk_sandy-2.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c +index 944d4c6f1..e8494500f 100644 +--- a/kernel/x86_64/dger_microk_sandy-2.c ++++ b/kernel/x86_64/dger_microk_sandy-2.c +@@ -106,7 +106,7 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) + + : + "+r" (i), // 0 +- "+r" (n), // 1 ++ "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 diff --git a/openblas.spec b/openblas.spec index 93e9af3..45cc85f 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 
+15,7 @@ Name: openblas Version: 0.3.5 -Release: 4%{?dist} +Release: 5%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -37,6 +37,9 @@ Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pul Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch Patch16: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2028.patch +Patch17: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1965.patch +Patch18: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1966.patch +Patch19: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1967.patch BuildRequires: gcc BuildRequires: gcc-gfortran @@ -255,6 +258,9 @@ cd OpenBLAS-%{version} %patch14 -p1 %patch15 -p1 %patch16 -p1 +%patch17 -p1 +%patch18 -p1 +%patch19 -p1 # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -691,6 +697,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Tue Feb 26 2019 Susi Lehtola - 0.3.5-5 +- Even more assembly kernel patches. + * Mon Feb 25 2019 Susi Lehtola - 0.3.5-4 - Another assembly kernel patch. From 64c2df1d8536381f8af32d18071f9a3af81821dc Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 30 Apr 2019 12:00:26 +0200 Subject: [PATCH 31/44] Update to 0.3.6. 
--- .gitignore | 1 + 1965.patch | 3283 ------------------------------------------------- 1966.patch | 960 --------------- 1967.patch | 99 -- 2010.patch | 499 -------- 2018.patch | 27 - 2019.patch | 274 ----- 2021.patch | 255 ---- 2023.patch | 874 ------------- 2024.patch | 1349 -------------------- 2028.patch | 412 ------- openblas.spec | 30 +- sources | 2 +- 13 files changed, 7 insertions(+), 8058 deletions(-) delete mode 100644 1965.patch delete mode 100644 1966.patch delete mode 100644 1967.patch delete mode 100644 2010.patch delete mode 100644 2018.patch delete mode 100644 2019.patch delete mode 100644 2021.patch delete mode 100644 2023.patch delete mode 100644 2024.patch delete mode 100644 2028.patch diff --git a/.gitignore b/.gitignore index 9b6016d..36744a3 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ /v0.3.0.tar.gz /v0.3.1.tar.gz /openblas-0.3.2.tar.gz +/openblas-0.3.6.tar.gz diff --git a/1965.patch b/1965.patch deleted file mode 100644 index 5d8b935..0000000 --- a/1965.patch +++ /dev/null @@ -1,3283 +0,0 @@ -From f0dd0584306b42289cac77fdafe6997e449d4f38 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:56:10 +0100 -Subject: [PATCH 001/111] Tag operands 0 and 1 as both input and output - -For #1964 (basically a continuation of coding problems first seen in #1292) ---- - kernel/x86_64/caxpy_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c -index 33bda0943..cb98f208a 100644 ---- a/kernel/x86_64/caxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c -@@ -115,8 +115,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : 
- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 8a6bbf5a5bf4623795b2ff9aaa8d35467288d6c7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:57:27 +0100 -Subject: [PATCH 002/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/caxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c -index 00e2e6a42..f31cf9710 100644 ---- a/kernel/x86_64/caxpy_microk_haswell-2.c -+++ b/kernel/x86_64/caxpy_microk_haswell-2.c -@@ -113,8 +113,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 4e6f8fec31e83648c77c47398829b5191e671966 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:58:19 +0100 -Subject: [PATCH 003/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/caxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c -index a798fd977..931d1ad47 100644 ---- a/kernel/x86_64/caxpy_microk_sandy-2.c -+++ b/kernel/x86_64/caxpy_microk_sandy-2.c -@@ -97,8 +97,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 663eef3b666e79c0e93f35cf79eada50040d9dd3 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:59:59 +0100 -Subject: [PATCH 004/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/caxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git 
a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c -index 87370b032..9aeb47968 100644 ---- a/kernel/x86_64/caxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c -@@ -115,8 +115,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From a671e19dd2cad6dc1e2e639f45a4faebf53b6f7f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:02:01 +0100 -Subject: [PATCH 005/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c -index f587aa036..e6d11f1af 100644 ---- a/kernel/x86_64/cdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/cdot_microk_bulldozer-2.c -@@ -98,8 +98,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -177,8 +177,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 47e2b4592eb31860a58222bedc8a3208c153aa00 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:03:03 +0100 -Subject: [PATCH 006/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git 
a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c -index fe195a63b..9fee7615d 100644 ---- a/kernel/x86_64/cdot_microk_haswell-2.c -+++ b/kernel/x86_64/cdot_microk_haswell-2.c -@@ -99,8 +99,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 30a7bd8e15fb68d3fa651bbf48e1e65fc6078090 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:03:50 +0100 -Subject: [PATCH 007/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c -index 01816917d..705c80c5c 100644 ---- a/kernel/x86_64/cdot_microk_sandy-2.c -+++ b/kernel/x86_64/cdot_microk_sandy-2.c -@@ -107,8 +107,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 2f5a7c1656b7975f71db2b8da90080938ccd3757 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:05:03 +0100 -Subject: [PATCH 008/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c -index 76a3aa0eb..5a46aed8c 100644 ---- a/kernel/x86_64/cdot_microk_steamroller-2.c -+++ b/kernel/x86_64/cdot_microk_steamroller-2.c -@@ -98,8 +98,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -177,8 +177,8 @@ static void cdot_kernel_16( 
BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From bb16456fe1ff372b61a7ab042418248f68ddddc6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:23:58 +0100 -Subject: [PATCH 009/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c -index 8c520dcf1..c9a01580e 100644 ---- a/kernel/x86_64/daxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c -@@ -65,8 +65,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 7af8f34df4efcc0ecaaa34c380119edcd5d206de Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:24:55 +0100 -Subject: [PATCH 010/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c -index bbe8b9550..67431659d 100644 ---- a/kernel/x86_64/daxpy_microk_haswell-2.c -+++ b/kernel/x86_64/daxpy_microk_haswell-2.c -@@ -61,8 +61,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From d94e7da701dae1106854753b2d5b676255c1c0f4 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:25:56 +0100 -Subject: [PATCH 011/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 
insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c -index 943d893af..61c99904a 100644 ---- a/kernel/x86_64/daxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c -@@ -74,8 +74,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6008f6531855d615ad98febe65364074b99fa5bf Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:26:55 +0100 -Subject: [PATCH 012/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c -index 95eb953b4..e3d605b75 100644 ---- a/kernel/x86_64/daxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/daxpy_microk_piledriver-2.c -@@ -80,8 +80,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -142,8 +142,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 9d46f84f24dc7284fc398574b811621e5c61e2dc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:27:48 +0100 -Subject: [PATCH 013/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c -index 85e038cef..1b827e7e2 100644 ---- a/kernel/x86_64/daxpy_microk_sandy-2.c -+++ 
b/kernel/x86_64/daxpy_microk_sandy-2.c -@@ -101,8 +101,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From ca02ac724f5b06e16a8941ef3b2582c251234679 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:28:56 +0100 -Subject: [PATCH 014/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c -index e40009037..2cab80067 100644 ---- a/kernel/x86_64/daxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/daxpy_microk_steamroller-2.c -@@ -80,8 +80,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -142,8 +142,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From c18c2c9d9b0cd7e82cb98c7b212ffb29648fb9e0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:32:17 +0100 -Subject: [PATCH 015/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c -index 9756ee46a..379fd3ca1 100644 ---- a/kernel/x86_64/ddot_microk_bulldozer-2.c -+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c -@@ -67,8 +67,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), 
// 2 - "r" (y), // 3 - "r" (dot) // 4 - -From c23c17163f1b7a5fb7652cbc038a50c01f9440c5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:33:07 +0100 -Subject: [PATCH 016/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c -index 365737363..c0c277c32 100644 ---- a/kernel/x86_64/ddot_microk_haswell-2.c -+++ b/kernel/x86_64/ddot_microk_haswell-2.c -@@ -78,8 +78,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From de207d10c1f11ef1f38b4f766909619ab744d64a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:34:05 +0100 -Subject: [PATCH 017/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c -index fb5ec9bca..ea0b4eff1 100644 ---- a/kernel/x86_64/ddot_microk_nehalem-2.c -+++ b/kernel/x86_64/ddot_microk_nehalem-2.c -@@ -77,8 +77,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From c9078eb8b4481fbc1841bcbf36ba438bf2749632 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:35:14 +0100 -Subject: [PATCH 018/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c -index ac950885c..f7b74add6 100644 ---- 
a/kernel/x86_64/ddot_microk_piledriver-2.c -+++ b/kernel/x86_64/ddot_microk_piledriver-2.c -@@ -83,8 +83,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -147,8 +147,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 00aff05c4049cd697b4000b5f2e726496b34dc54 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:36:08 +0100 -Subject: [PATCH 019/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c -index 160f95604..e57eb37ea 100644 ---- a/kernel/x86_64/ddot_microk_sandy-2.c -+++ b/kernel/x86_64/ddot_microk_sandy-2.c -@@ -83,8 +83,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From dc15f3b5a7689a6cea1d31e004d7a3488bf9b66d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:37:06 +0100 -Subject: [PATCH 020/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c -index 5ce20b5de..845c78df1 100644 ---- a/kernel/x86_64/ddot_microk_steamroller-2.c -+++ b/kernel/x86_64/ddot_microk_steamroller-2.c -@@ -80,8 +80,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" 
(x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 3f1719a98da89f0a6f1d435d3f705aa083702ac7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:39:08 +0100 -Subject: [PATCH 021/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c -index 3a743d64c..3b03e11a4 100644 ---- a/kernel/x86_64/saxpy_microk_haswell-2.c -+++ b/kernel/x86_64/saxpy_microk_haswell-2.c -@@ -61,8 +61,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From b13f3c3bcfffcecbcc80454c90c31bc05dd5a04d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:39:57 +0100 -Subject: [PATCH 022/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c -index 68f68ea3a..4ffb39acf 100644 ---- a/kernel/x86_64/saxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c -@@ -74,8 +74,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 2bd18c7b73731d1b8bd900213fc7fa7a2356a357 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:40:50 +0100 -Subject: [PATCH 023/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c -index 
204cf8bac..87c5fe3cf 100644 ---- a/kernel/x86_64/saxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c -@@ -80,8 +80,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -141,8 +141,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6fcb55b22f6e8b80e7f6ffcf228c70c0929915b5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:41:41 +0100 -Subject: [PATCH 024/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c -index 0a6bef046..5a8424d66 100644 ---- a/kernel/x86_64/saxpy_microk_sandy-2.c -+++ b/kernel/x86_64/saxpy_microk_sandy-2.c -@@ -101,8 +101,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 922e44897831f393cbeeb1406feb7fcf6e320281 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:42:35 +0100 -Subject: [PATCH 025/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c -index 36e61b077..5a6fc6da2 100644 ---- a/kernel/x86_64/sdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c -@@ -68,8 +68,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" 
(n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From d384880da564344e92a8d60b08e3183ab02ba75b Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:43:24 +0100 -Subject: [PATCH 026/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c -index df367b61f..89d9cfe61 100644 ---- a/kernel/x86_64/sdot_microk_haswell-2.c -+++ b/kernel/x86_64/sdot_microk_haswell-2.c -@@ -81,8 +81,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From cd3a35ee79b4b5fa00e5a446be2a6cceb3230874 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:44:13 +0100 -Subject: [PATCH 027/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c -index 1a27177f5..cef41b530 100644 ---- a/kernel/x86_64/sdot_microk_nehalem-2.c -+++ b/kernel/x86_64/sdot_microk_nehalem-2.c -@@ -77,8 +77,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ba9f792e759ea97e75445b1fe1eaab4f3432f4f1 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:45:08 +0100 -Subject: [PATCH 028/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c 
-index ca13536f2..e77ba1424 100644 ---- a/kernel/x86_64/sdot_microk_sandy-2.c -+++ b/kernel/x86_64/sdot_microk_sandy-2.c -@@ -84,8 +84,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From c931bb8172bbdcbcfe6d2de281d2f83a7f5a3515 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:46:19 +0100 -Subject: [PATCH 029/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c -index 6b8b2566b..bedde8fb6 100644 ---- a/kernel/x86_64/sdot_microk_steamroller-2.c -+++ b/kernel/x86_64/sdot_microk_steamroller-2.c -@@ -82,8 +82,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -145,8 +145,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 0172c51829110a5450b4d6d5f454bd4aa4106269 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:48:16 +0100 -Subject: [PATCH 030/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -index 0e15761f7..56493f8cb 100644 ---- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -@@ -115,8 +115,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- 
"r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 0cfb647a577058cebeaabadbe6ef62eebd2ce49e Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:51:34 +0100 -Subject: [PATCH 031/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c -index 30e8b1955..bd52ba01f 100644 ---- a/kernel/x86_64/zaxpy_microk_haswell-2.c -+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c -@@ -113,8 +113,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 2b542d10368cbb8433b7274fb12b77845606d2fe Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:52:35 +0100 -Subject: [PATCH 032/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c -index 233af143a..d6a9ff394 100644 ---- a/kernel/x86_64/zaxpy_microk_sandy-2.c -+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c -@@ -101,8 +101,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -178,8 +178,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" 
(n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From af29c99c85d9ea5c27b6e917ebb1dcdbe1292f7b Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:53:29 +0100 -Subject: [PATCH 033/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c -index 728d09213..58d4c7286 100644 ---- a/kernel/x86_64/zaxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c -@@ -115,8 +115,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From f78531a9ec8ee28f7790505382231b3f5094b795 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:05:31 +0100 -Subject: [PATCH 034/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c -index 30a9552d6..ed66cc674 100644 ---- a/kernel/x86_64/zdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/zdot_microk_bulldozer-2.c -@@ -98,8 +98,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -177,8 +177,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- 
"r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From b6f4ef5aea58e5ea1225283e406cadf9416818fc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:06:54 +0100 -Subject: [PATCH 035/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c -index 11056a3c1..0e6ac55db 100644 ---- a/kernel/x86_64/zdot_microk_haswell-2.c -+++ b/kernel/x86_64/zdot_microk_haswell-2.c -@@ -103,8 +103,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -188,8 +188,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 715b1f263d6903f1af391c5278a9aa61f1753193 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:08:09 +0100 -Subject: [PATCH 036/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c -index 87c5b0340..416265ae2 100644 ---- a/kernel/x86_64/zdot_microk_sandy-2.c -+++ b/kernel/x86_64/zdot_microk_sandy-2.c -@@ -109,8 +109,8 @@ if ( n < 1280 ) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -201,8 +201,8 @@ if ( n < 1280 ) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From e8d835ea466a1605db2157b6884a4cfe762478fc Mon Sep 17 
00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:09:03 +0100 -Subject: [PATCH 037/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c -index 325f74ae3..fe1613fd4 100644 ---- a/kernel/x86_64/zdot_microk_steamroller-2.c -+++ b/kernel/x86_64/zdot_microk_steamroller-2.c -@@ -97,8 +97,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -174,8 +174,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From bbc30700e871d84c07d770f54b645ea3eee549fa Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:49:34 +0100 -Subject: [PATCH 038/111] Update saxpy_microk_nehalem-2.c - ---- - kernel/x86_64/saxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c -index 4ffb39acf..e25156939 100644 ---- a/kernel/x86_64/saxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c -@@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 300bb19b3ec0a48b7371d7c1be3ee88a29e87cf9 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:52:04 +0100 -Subject: [PATCH 039/111] Update caxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/caxpy_microk_bulldozer-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git 
a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c -index cb98f208a..faf5cdc40 100644 ---- a/kernel/x86_64/caxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c -@@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 1878e0c95aee9777f7c082bcc98ff12b04edc75d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:52:54 +0100 -Subject: [PATCH 040/111] Update caxpy_microk_haswell-2.c - ---- - kernel/x86_64/caxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c -index f31cf9710..a011b2bfa 100644 ---- a/kernel/x86_64/caxpy_microk_haswell-2.c -+++ b/kernel/x86_64/caxpy_microk_haswell-2.c -@@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From f6be89295f4e21572a743d26e677256fc29ee8cf Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:53:35 +0100 -Subject: [PATCH 041/111] Update caxpy_microk_sandy-2.c - ---- - kernel/x86_64/caxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c -index 931d1ad47..c760d6540 100644 ---- a/kernel/x86_64/caxpy_microk_sandy-2.c -+++ b/kernel/x86_64/caxpy_microk_sandy-2.c -@@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT 
*y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 4673e5317861de37b326181b0dfc8514a2b3b69d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:54:39 +0100 -Subject: [PATCH 042/111] Update caxpy_microk_steamroller-2.c - ---- - kernel/x86_64/caxpy_microk_steamroller-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c -index 9aeb47968..b6eb55f9b 100644 ---- a/kernel/x86_64/caxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c -@@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From ba6d2c77a98f55431d8d2d4de4b6df99814352c1 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:55:38 +0100 -Subject: [PATCH 043/111] Update cdot_microk_bulldozer-2.c - ---- - kernel/x86_64/cdot_microk_bulldozer-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c -index e6d11f1af..c2245c6dc 100644 ---- a/kernel/x86_64/cdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/cdot_microk_bulldozer-2.c -@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT 
*dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 093a3d7d5790efd7441611ee8c8769d4f3d997c0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:56:15 +0100 -Subject: [PATCH 044/111] Update cdot_microk_haswell-2.c - ---- - kernel/x86_64/cdot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c -index 9fee7615d..396dbeaa7 100644 ---- a/kernel/x86_64/cdot_microk_haswell-2.c -+++ b/kernel/x86_64/cdot_microk_haswell-2.c -@@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 2224bcb4f070e607ede67f2f6e089e2e99519517 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:57:01 +0100 -Subject: [PATCH 045/111] Update cdot_microk_sandy-2.c - ---- - kernel/x86_64/cdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c -index 705c80c5c..20ba48c00 100644 ---- a/kernel/x86_64/cdot_microk_sandy-2.c -+++ b/kernel/x86_64/cdot_microk_sandy-2.c -@@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 2414f1d796e23f8e9e4abba27e948f5877773640 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:57:56 +0100 -Subject: [PATCH 046/111] Update cdot_microk_steamroller-2.c - ---- - kernel/x86_64/cdot_microk_steamroller-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git 
a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c -index 5a46aed8c..01754b147 100644 ---- a/kernel/x86_64/cdot_microk_steamroller-2.c -+++ b/kernel/x86_64/cdot_microk_steamroller-2.c -@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ae00befb3e3a9632d9545ba0af43f9afb90787b2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:58:52 +0100 -Subject: [PATCH 047/111] Update daxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/daxpy_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c -index c9a01580e..2e2356fb6 100644 ---- a/kernel/x86_64/daxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c -@@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 604c574542a5fac237b5134610166fab26db1285 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:59:30 +0100 -Subject: [PATCH 048/111] Update daxpy_microk_haswell-2.c - ---- - kernel/x86_64/daxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c -index 67431659d..c77fc33ef 100644 ---- a/kernel/x86_64/daxpy_microk_haswell-2.c -+++ b/kernel/x86_64/daxpy_microk_haswell-2.c -@@ -59,10 +59,10 @@ static void daxpy_kernel_8( 
BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 082498ee3b8470e992f33414e3097ca301f9efa7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:00:07 +0100 -Subject: [PATCH 049/111] Update daxpy_microk_nehalem-2.c - ---- - kernel/x86_64/daxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c -index 61c99904a..b81fe6562 100644 ---- a/kernel/x86_64/daxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c -@@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 293f5531e66088d7149bebd68bcd7aa564b3a263 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:00:53 +0100 -Subject: [PATCH 050/111] Update daxpy_microk_piledriver-2.c - ---- - kernel/x86_64/daxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c -index e3d605b75..efe93dfed 100644 ---- a/kernel/x86_64/daxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/daxpy_microk_piledriver-2.c -@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6cee8e0fdd463139f85656292971de1e4810d775 Mon 
Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:01:28 +0100 -Subject: [PATCH 051/111] Update daxpy_microk_sandy-2.c - ---- - kernel/x86_64/daxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c -index 1b827e7e2..3b1214f36 100644 ---- a/kernel/x86_64/daxpy_microk_sandy-2.c -+++ b/kernel/x86_64/daxpy_microk_sandy-2.c -@@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6450bf14afa94cade7d28330749dfbf255697026 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:02:32 +0100 -Subject: [PATCH 052/111] Update daxpy_microk_steamroller-2.c - ---- - kernel/x86_64/daxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c -index 2cab80067..a5143682f 100644 ---- a/kernel/x86_64/daxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/daxpy_microk_steamroller-2.c -@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From a339b45e51c58e5b13c01c6918282fb31941acdf Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:03:07 +0100 -Subject: [PATCH 053/111] Update ddot_microk_bulldozer-2.c - ---- - kernel/x86_64/ddot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 
insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c -index 379fd3ca1..62bf7e7dc 100644 ---- a/kernel/x86_64/ddot_microk_bulldozer-2.c -+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c -@@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovsd %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 64fcdadf39137bdc56c56ead1e4d8f1bea32fe2a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:03:44 +0100 -Subject: [PATCH 054/111] Update ddot_microk_haswell-2.c - ---- - kernel/x86_64/ddot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c -index c0c277c32..0cf4ece65 100644 ---- a/kernel/x86_64/ddot_microk_haswell-2.c -+++ b/kernel/x86_64/ddot_microk_haswell-2.c -@@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 504dd44e887cbd985bac3d48a2a7fdc3a03727d8 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:04:20 +0100 -Subject: [PATCH 055/111] Update ddot_microk_nehalem-2.c - ---- - kernel/x86_64/ddot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c -index ea0b4eff1..086a0bb91 100644 ---- a/kernel/x86_64/ddot_microk_nehalem-2.c -+++ b/kernel/x86_64/ddot_microk_nehalem-2.c -@@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "movsd %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 56c67a929a2b215c3980a542c74a016f828e119d 
Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:05:11 +0100 -Subject: [PATCH 056/111] Update ddot_microk_piledriver-2.c - ---- - kernel/x86_64/ddot_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c -index f7b74add6..d7347ebdf 100644 ---- a/kernel/x86_64/ddot_microk_piledriver-2.c -+++ b/kernel/x86_64/ddot_microk_piledriver-2.c -@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From b7ffbc40eca528e3aae46d004c1ad8e6fd013530 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:05:43 +0100 -Subject: [PATCH 057/111] Update ddot_microk_sandy-2.c - ---- - kernel/x86_64/ddot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c -index e57eb37ea..28b1a8bd1 100644 ---- a/kernel/x86_64/ddot_microk_sandy-2.c -+++ b/kernel/x86_64/ddot_microk_sandy-2.c -@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 0c9c31dbe4817ad24ecc2cc5dc553239a7c31590 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:06:20 +0100 -Subject: [PATCH 058/111] Update ddot_microk_steamroller-2.c - ---- - kernel/x86_64/ddot_microk_steamroller-2.c | 4 ++-- - 1 file 
changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c -index 845c78df1..98cf94acf 100644 ---- a/kernel/x86_64/ddot_microk_steamroller-2.c -+++ b/kernel/x86_64/ddot_microk_steamroller-2.c -@@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From d1b69022c935a37bbe3c8b09eb329a7468339ff0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:07:04 +0100 -Subject: [PATCH 059/111] Update saxpy_microk_haswell-2.c - ---- - kernel/x86_64/saxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c -index 3b03e11a4..3bc450f7b 100644 ---- a/kernel/x86_64/saxpy_microk_haswell-2.c -+++ b/kernel/x86_64/saxpy_microk_haswell-2.c -@@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 369a2b4af5680dfcbd1d8290077f62a4d74336fb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:07:54 +0100 -Subject: [PATCH 060/111] Update saxpy_microk_piledriver-2.c - ---- - kernel/x86_64/saxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c -index 87c5fe3cf..87e742ac7 100644 ---- a/kernel/x86_64/saxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c -@@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - 
"+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From dc931ad1fe709ad378d6d963fbde5bad421e5514 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:08:27 +0100 -Subject: [PATCH 061/111] Update saxpy_microk_sandy-2.c - ---- - kernel/x86_64/saxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c -index 5a8424d66..6ce67a7d1 100644 ---- a/kernel/x86_64/saxpy_microk_sandy-2.c -+++ b/kernel/x86_64/saxpy_microk_sandy-2.c -@@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From b2d6fea1cb99f0830c33e3667d1928be4496a31f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:09:07 +0100 -Subject: [PATCH 062/111] Update sdot_microk_bulldozer-2.c - ---- - kernel/x86_64/sdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c -index 5a6fc6da2..c7f8cb1a7 100644 ---- a/kernel/x86_64/sdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c -@@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ffc008663aef2dd318c58275fb8b68cc93de9a42 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:09:50 +0100 -Subject: [PATCH 063/111] Update sdot_microk_haswell-2.c - 
---- - kernel/x86_64/sdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c -index 89d9cfe61..417fb3862 100644 ---- a/kernel/x86_64/sdot_microk_haswell-2.c -+++ b/kernel/x86_64/sdot_microk_haswell-2.c -@@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovss %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 88b0dbfbddbc5170263bd06eb0aad0abf85faa81 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:10:30 +0100 -Subject: [PATCH 064/111] Update sdot_microk_nehalem-2.c - ---- - kernel/x86_64/sdot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c -index cef41b530..115e7a410 100644 ---- a/kernel/x86_64/sdot_microk_nehalem-2.c -+++ b/kernel/x86_64/sdot_microk_nehalem-2.c -@@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "movss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ba9c3c4328a73821ce6067fb78b01b8817a92fa1 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:11:09 +0100 -Subject: [PATCH 065/111] Update sdot_microk_sandy-2.c - ---- - kernel/x86_64/sdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c -index e77ba1424..9d0795181 100644 ---- a/kernel/x86_64/sdot_microk_sandy-2.c -+++ b/kernel/x86_64/sdot_microk_sandy-2.c -@@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovss %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - 
"+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 266e72d24b767dbcdb97f597c899c7f495609c6f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:12:07 +0100 -Subject: [PATCH 066/111] Update sdot_microk_steamroller-2.c - ---- - kernel/x86_64/sdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c -index bedde8fb6..3475f890d 100644 ---- a/kernel/x86_64/sdot_microk_steamroller-2.c -+++ b/kernel/x86_64/sdot_microk_steamroller-2.c -@@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 72c3a4d1bd1daf3a98413dbea081f19fc6ee897d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:13:06 +0100 -Subject: [PATCH 067/111] Update zaxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/zaxpy_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -index 56493f8cb..eed36ffd0 100644 ---- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : 
-+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 157e65ff74b7760a19ed38e8796aab6ad0d2a152 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:13:41 +0100 -Subject: [PATCH 068/111] Update zaxpy_microk_haswell-2.c - ---- - kernel/x86_64/zaxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c -index bd52ba01f..9aeea975b 100644 ---- a/kernel/x86_64/zaxpy_microk_haswell-2.c -+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c -@@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 212b0a106d83491aeac793c6d45b4e494d06d868 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:14:28 +0100 -Subject: [PATCH 069/111] Update zaxpy_microk_sandy-2.c - ---- - kernel/x86_64/zaxpy_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c -index d6a9ff394..cbd9b378f 100644 ---- a/kernel/x86_64/zaxpy_microk_sandy-2.c -+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c -@@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 2fa6d8107c40d780c988c8f23b5d61d6a0f8e8eb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:15:18 +0100 -Subject: 
[PATCH 070/111] Update zaxpy_microk_steamroller-2.c - ---- - kernel/x86_64/zaxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c -index 58d4c7286..5fc56aec7 100644 ---- a/kernel/x86_64/zaxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c -@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 79d5dd461d13953e8cade9a1dad43ad38cf93aaa Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:16:09 +0100 -Subject: [PATCH 071/111] Update zdot_microk_bulldozer-2.c - ---- - kernel/x86_64/zdot_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c -index ed66cc674..a80eac003 100644 ---- a/kernel/x86_64/zdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/zdot_microk_bulldozer-2.c -@@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From cb5cfffb1765ac8ef1e2f149aea1dc3e5fbb9623 Mon Sep 17 00:00:00 2001 
-From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:16:55 +0100 -Subject: [PATCH 072/111] Update zdot_microk_haswell-2.c - ---- - kernel/x86_64/zdot_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c -index 0e6ac55db..963d2e3bd 100644 ---- a/kernel/x86_64/zdot_microk_haswell-2.c -+++ b/kernel/x86_64/zdot_microk_haswell-2.c -@@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From f4e5f931ae5c14d284749c65d1e9ed08873afaa2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:17:46 +0100 -Subject: [PATCH 073/111] Update zdot_microk_sandy-2.c - ---- - kernel/x86_64/zdot_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c -index 416265ae2..88d4e1bbb 100644 ---- a/kernel/x86_64/zdot_microk_sandy-2.c -+++ b/kernel/x86_64/zdot_microk_sandy-2.c -@@ -107,10 +107,10 @@ if ( n < 1280 ) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -199,10 +199,10 @@ if ( n < 1280 ) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ae2f3e617df8894ebe1779d3bcc78170bcad8b4c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:18:27 +0100 -Subject: 
[PATCH 074/111] Update zdot_microk_steamroller-2.c - ---- - kernel/x86_64/zdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c -index fe1613fd4..2f11fe562 100644 ---- a/kernel/x86_64/zdot_microk_steamroller-2.c -+++ b/kernel/x86_64/zdot_microk_steamroller-2.c -@@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 379aa11f4bfc5bb352372a3f423062267e73dd77 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:10:21 +0100 -Subject: [PATCH 075/111] Update caxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/caxpy_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c -index faf5cdc40..ca2209340 100644 ---- a/kernel/x86_64/caxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c -@@ -115,7 +115,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 983c72ab0fc182264a635d1c5286ceebc2b2f3e2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:10:51 +0100 -Subject: [PATCH 076/111] Update 
caxpy_microk_haswell-2.c - ---- - kernel/x86_64/caxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c -index a011b2bfa..b605ea34c 100644 ---- a/kernel/x86_64/caxpy_microk_haswell-2.c -+++ b/kernel/x86_64/caxpy_microk_haswell-2.c -@@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 6f7f9967f945c145e6e4ceac14162e8dbc551f4c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:11:21 +0100 -Subject: [PATCH 077/111] Update caxpy_microk_sandy-2.c - ---- - kernel/x86_64/caxpy_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c -index c760d6540..72d37afed 100644 ---- a/kernel/x86_64/caxpy_microk_sandy-2.c -+++ b/kernel/x86_64/caxpy_microk_sandy-2.c -@@ -97,7 +97,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From aa799573b5f91e786ef41116b9fd030161fb6a10 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:11:59 +0100 -Subject: [PATCH 078/111] Update caxpy_microk_steamroller-2.c - ---- - kernel/x86_64/caxpy_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c -index b6eb55f9b..7ca7af070 100644 ---- a/kernel/x86_64/caxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c -@@ -115,7 +115,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static 
void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From f9497bdab685ca8b9bea018c900df24b7dd2aad7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:12:37 +0100 -Subject: [PATCH 079/111] Update cdot_microk_bulldozer-2.c - ---- - kernel/x86_64/cdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c -index c2245c6dc..118655913 100644 ---- a/kernel/x86_64/cdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/cdot_microk_bulldozer-2.c -@@ -98,7 +98,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -177,7 +177,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From de4c5a9258b3c29e1e305660c50e7b4cf8204c46 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:13:09 +0100 -Subject: [PATCH 080/111] Update daxpy_microk_haswell-2.c - ---- - kernel/x86_64/daxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c -index c77fc33ef..f3682e6d7 100644 ---- a/kernel/x86_64/daxpy_microk_haswell-2.c -+++ b/kernel/x86_64/daxpy_microk_haswell-2.c -@@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 59ca748c9ec75cf57148bcf4de06dc328f227845 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:13:38 +0100 -Subject: [PATCH 081/111] Update daxpy_microk_nehalem-2.c - ---- - 
kernel/x86_64/daxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c -index b81fe6562..8feb9f26c 100644 ---- a/kernel/x86_64/daxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c -@@ -74,7 +74,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 5f2ef0e70fb180022f3447826029f42c75c6fbb5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:14:13 +0100 -Subject: [PATCH 082/111] Update daxpy_microk_piledriver-2.c - ---- - kernel/x86_64/daxpy_microk_piledriver-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c -index efe93dfed..4b83124c7 100644 ---- a/kernel/x86_64/daxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/daxpy_microk_piledriver-2.c -@@ -80,7 +80,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -142,7 +142,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From c5b01c8be14c3cc3b364b9067124695e2d91c63a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:14:43 +0100 -Subject: [PATCH 083/111] Update daxpy_microk_sandy-2.c - ---- - kernel/x86_64/daxpy_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c -index 3b1214f36..db9a45de8 100644 ---- a/kernel/x86_64/daxpy_microk_sandy-2.c -+++ b/kernel/x86_64/daxpy_microk_sandy-2.c -@@ -101,7 +101,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, 
FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From d4f3b733dc1026c9d1bfa8bea5696353de3b47c0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:15:18 +0100 -Subject: [PATCH 084/111] Update daxpy_microk_steamroller-2.c - ---- - kernel/x86_64/daxpy_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c -index a5143682f..8e63fcc1d 100644 ---- a/kernel/x86_64/daxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/daxpy_microk_steamroller-2.c -@@ -80,7 +80,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -142,7 +142,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From dcfab783f725abb0280a77f61a4083be581e89b8 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:15:57 +0100 -Subject: [PATCH 085/111] Update ddot_microk_bulldozer-2.c - ---- - kernel/x86_64/ddot_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c -index 62bf7e7dc..5590c5b17 100644 ---- a/kernel/x86_64/ddot_microk_bulldozer-2.c -+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c -@@ -67,7 +67,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 0779654cb47dbc9984f344d5b7ffa68e39afdbc3 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:16:26 +0100 -Subject: [PATCH 086/111] Update ddot_microk_haswell-2.c - ---- - kernel/x86_64/ddot_microk_haswell-2.c | 2 +- - 1 
file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c -index 0cf4ece65..dbb5487f7 100644 ---- a/kernel/x86_64/ddot_microk_haswell-2.c -+++ b/kernel/x86_64/ddot_microk_haswell-2.c -@@ -78,7 +78,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 29028652213235c1d2e7dc18d49daa86f3356574 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:16:53 +0100 -Subject: [PATCH 087/111] Update ddot_microk_nehalem-2.c - ---- - kernel/x86_64/ddot_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c -index 086a0bb91..e5e234e22 100644 ---- a/kernel/x86_64/ddot_microk_nehalem-2.c -+++ b/kernel/x86_64/ddot_microk_nehalem-2.c -@@ -77,7 +77,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 6df88c7c455c37a18a16f1cbd003b640ef6777f0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:17:43 +0100 -Subject: [PATCH 088/111] Update cdot_microk_haswell-2.c - ---- - kernel/x86_64/cdot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c -index 396dbeaa7..8b9d6d104 100644 ---- a/kernel/x86_64/cdot_microk_haswell-2.c -+++ b/kernel/x86_64/cdot_microk_haswell-2.c -@@ -99,7 +99,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 81691c726eb55df75f638794fe3afff70cc3286d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:18:11 +0100 -Subject: [PATCH 
089/111] Update cdot_microk_sandy-2.c - ---- - kernel/x86_64/cdot_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c -index 20ba48c00..fe142c38f 100644 ---- a/kernel/x86_64/cdot_microk_sandy-2.c -+++ b/kernel/x86_64/cdot_microk_sandy-2.c -@@ -107,7 +107,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From ab8cc007364b9477e13c107a7befce7668c10ebb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:18:47 +0100 -Subject: [PATCH 090/111] Update cdot_microk_steamroller-2.c - ---- - kernel/x86_64/cdot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c -index 01754b147..7350b21c9 100644 ---- a/kernel/x86_64/cdot_microk_steamroller-2.c -+++ b/kernel/x86_64/cdot_microk_steamroller-2.c -@@ -98,7 +98,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -177,7 +177,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From bdcba6adda368da48e450cdc3b9c9f7b6c52e630 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:19:32 +0100 -Subject: [PATCH 091/111] Update daxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/daxpy_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c -index 2e2356fb6..9c1305b97 100644 ---- a/kernel/x86_64/daxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c -@@ -65,7 +65,7 @@ 
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From e9fc4dfdead60ed013e016c62215170d04b5ad9d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:20:20 +0100 -Subject: [PATCH 092/111] Update ddot_microk_piledriver-2.c - ---- - kernel/x86_64/ddot_microk_piledriver-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c -index d7347ebdf..cc4bcd90a 100644 ---- a/kernel/x86_64/ddot_microk_piledriver-2.c -+++ b/kernel/x86_64/ddot_microk_piledriver-2.c -@@ -83,7 +83,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -147,7 +147,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 9430424102257485eae76482f495402260e9682d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:20:48 +0100 -Subject: [PATCH 093/111] Update ddot_microk_sandy-2.c - ---- - kernel/x86_64/ddot_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c -index 28b1a8bd1..84493ec27 100644 ---- a/kernel/x86_64/ddot_microk_sandy-2.c -+++ b/kernel/x86_64/ddot_microk_sandy-2.c -@@ -83,7 +83,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 129a987e4b55f13c413f4eaad58465443051dd43 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:21:26 +0100 -Subject: [PATCH 094/111] Update ddot_microk_steamroller-2.c - ---- - 
kernel/x86_64/ddot_microk_steamroller-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c -index 98cf94acf..27d5244ce 100644 ---- a/kernel/x86_64/ddot_microk_steamroller-2.c -+++ b/kernel/x86_64/ddot_microk_steamroller-2.c -@@ -80,7 +80,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 49789c39fb2a55dacc146f079c1c5fab45d3ce2e Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:22:17 +0100 -Subject: [PATCH 095/111] Update saxpy_microk_haswell-2.c - ---- - kernel/x86_64/saxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c -index 3bc450f7b..7099ba4c6 100644 ---- a/kernel/x86_64/saxpy_microk_haswell-2.c -+++ b/kernel/x86_64/saxpy_microk_haswell-2.c -@@ -61,7 +61,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 7f556b81fb40ca6d90529829b802b38adbc747d7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:22:46 +0100 -Subject: [PATCH 096/111] Update saxpy_microk_nehalem-2.c - ---- - kernel/x86_64/saxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c -index e25156939..88bbb695d 100644 ---- a/kernel/x86_64/saxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c -@@ -74,7 +74,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From cb75878f98892850b29fc7a0b427500a56d244dd Mon Sep 17 00:00:00 
2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:23:16 +0100 -Subject: [PATCH 097/111] Update saxpy_microk_piledriver-2.c - ---- - kernel/x86_64/saxpy_microk_piledriver-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c -index 87e742ac7..5feea7f24 100644 ---- a/kernel/x86_64/saxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c -@@ -80,7 +80,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -141,7 +141,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 65719fcb41987c499c31455fe7b0290800cacdd6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:23:44 +0100 -Subject: [PATCH 098/111] Update saxpy_microk_sandy-2.c - ---- - kernel/x86_64/saxpy_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c -index 6ce67a7d1..0d448d5f8 100644 ---- a/kernel/x86_64/saxpy_microk_sandy-2.c -+++ b/kernel/x86_64/saxpy_microk_sandy-2.c -@@ -101,7 +101,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From b52e763084040ed624fff574fba1fe1bc58b1cc7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:24:16 +0100 -Subject: [PATCH 099/111] Update sdot_microk_bulldozer-2.c - ---- - kernel/x86_64/sdot_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c -index c7f8cb1a7..8958a33dc 100644 ---- 
a/kernel/x86_64/sdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c -@@ -68,7 +68,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 2c021aeb9c018e4da2a7a0a5c0315d06d689a3c2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:24:42 +0100 -Subject: [PATCH 100/111] Update sdot_microk_haswell-2.c - ---- - kernel/x86_64/sdot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c -index 417fb3862..91dc928d3 100644 ---- a/kernel/x86_64/sdot_microk_haswell-2.c -+++ b/kernel/x86_64/sdot_microk_haswell-2.c -@@ -81,7 +81,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From bb43f185cf2f4354b62b779a369b53db3607598d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:25:15 +0100 -Subject: [PATCH 101/111] Update sdot_microk_nehalem-2.c - ---- - kernel/x86_64/sdot_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c -index 115e7a410..5a715d008 100644 ---- a/kernel/x86_64/sdot_microk_nehalem-2.c -+++ b/kernel/x86_64/sdot_microk_nehalem-2.c -@@ -77,7 +77,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 3b98d1e16d48f08540952624e9aa7843d5384ceb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:25:43 +0100 -Subject: [PATCH 102/111] Update sdot_microk_sandy-2.c - ---- - kernel/x86_64/sdot_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git 
a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c -index 9d0795181..ae25d5a50 100644 ---- a/kernel/x86_64/sdot_microk_sandy-2.c -+++ b/kernel/x86_64/sdot_microk_sandy-2.c -@@ -84,7 +84,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 7009a0337f674911ebe6d9ce6d1bf9b21472e05e Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:26:24 +0100 -Subject: [PATCH 103/111] Update sdot_microk_steamroller-2.c - ---- - kernel/x86_64/sdot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c -index 3475f890d..bf6a5f287 100644 ---- a/kernel/x86_64/sdot_microk_steamroller-2.c -+++ b/kernel/x86_64/sdot_microk_steamroller-2.c -@@ -82,7 +82,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -145,7 +145,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From f117a2e4aa3e100015d479dd61530019db66e53f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:27:34 +0100 -Subject: [PATCH 104/111] Update zaxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/zaxpy_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -index eed36ffd0..15d367971 100644 ---- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -@@ -115,7 +115,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), 
// 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 752d4e88089ce1ff5ab27b25de382750b5e4a9c7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:28:00 +0100 -Subject: [PATCH 105/111] Update zaxpy_microk_haswell-2.c - ---- - kernel/x86_64/zaxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c -index 9aeea975b..89d23daf3 100644 ---- a/kernel/x86_64/zaxpy_microk_haswell-2.c -+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c -@@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 0f905d346e8c0bda5bbf7cb6ae7f7a6ad137aa76 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:28:40 +0100 -Subject: [PATCH 106/111] Update zaxpy_microk_sandy-2.c - ---- - kernel/x86_64/zaxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c -index cbd9b378f..17b8b24f7 100644 ---- a/kernel/x86_64/zaxpy_microk_sandy-2.c -+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c -@@ -101,7 +101,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -178,7 +178,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 39a29ef0ce2de84526cf8e71881e6117b4532f84 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:29:18 +0100 -Subject: [PATCH 107/111] Update 
zaxpy_microk_steamroller-2.c - ---- - kernel/x86_64/zaxpy_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c -index 5fc56aec7..907b1ae00 100644 ---- a/kernel/x86_64/zaxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c -@@ -115,7 +115,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 1496c1a69f4d0c521d797b1847363c38e46958d5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:30:03 +0100 -Subject: [PATCH 108/111] Update zdot_microk_bulldozer-2.c - ---- - kernel/x86_64/zdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c -index a80eac003..db9a48cce 100644 ---- a/kernel/x86_64/zdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/zdot_microk_bulldozer-2.c -@@ -98,7 +98,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -177,7 +177,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 8f09f06f2c964ece75730dadd99e569844497fe6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:30:43 +0100 -Subject: [PATCH 109/111] Update zdot_microk_haswell-2.c - ---- - kernel/x86_64/zdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git 
a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c -index 963d2e3bd..9f2fc2c1d 100644 ---- a/kernel/x86_64/zdot_microk_haswell-2.c -+++ b/kernel/x86_64/zdot_microk_haswell-2.c -@@ -103,7 +103,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -188,7 +188,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From fca3f8610fbeb0a4a4198eb0f2fc74f91cd6e85d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:31:24 +0100 -Subject: [PATCH 110/111] Update zdot_microk_sandy-2.c - ---- - kernel/x86_64/zdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c -index 88d4e1bbb..33415e26e 100644 ---- a/kernel/x86_64/zdot_microk_sandy-2.c -+++ b/kernel/x86_64/zdot_microk_sandy-2.c -@@ -109,7 +109,7 @@ if ( n < 1280 ) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -201,7 +201,7 @@ if ( n < 1280 ) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 6976222962772b395054016e99faac34986b5e59 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:32:05 +0100 -Subject: [PATCH 111/111] Update zdot_microk_steamroller-2.c - ---- - kernel/x86_64/zdot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c -index 2f11fe562..87138fe9a 100644 ---- a/kernel/x86_64/zdot_microk_steamroller-2.c -+++ b/kernel/x86_64/zdot_microk_steamroller-2.c -@@ -97,7 +97,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 
-- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -174,7 +174,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 diff --git a/1966.patch b/1966.patch deleted file mode 100644 index c2663cd..0000000 --- a/1966.patch +++ /dev/null @@ -1,960 +0,0 @@ -From 63cdd8f4a04f3a5ac1733e202b6b3678c34fb8dd Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:27:38 +0100 -Subject: [PATCH 01/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/cscal_microk_bulldozer-2.c | 32 ++++++++++++------------ - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c -index 3abffc4cf..f526fd611 100644 ---- a/kernel/x86_64/cscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c -@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", 
"%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From b6136be686e415fbdb035267c5020cb08e4e49ac Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:30:03 +0100 -Subject: [PATCH 02/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/cscal_microk_haswell-2.c | 30 +++++++++++++------------- - 1 file changed, 15 insertions(+), 15 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c -index 0a4eb683c..8623dcd10 100644 ---- a/kernel/x86_64/cscal_microk_haswell-2.c -+++ b/kernel/x86_64/cscal_microk_haswell-2.c -@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 - : "cc", // "0", "1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", -@@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 - : "cc", //"%0", "%1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", -@@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT 
*alpha, FLOAT *x) - - "vzeroupper \n\t" - -- : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ : -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From f447fb4c54870710cd6304553df59f50ff51b8f5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:32:48 +0100 -Subject: [PATCH 03/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++----------- - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c -index 8346e1748..fbeb857e2 100644 ---- a/kernel/x86_64/cscal_microk_steamroller-2.c -+++ b/kernel/x86_64/cscal_microk_steamroller-2.c -@@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n), // 0 -- "r" (x), // 1 - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -331,11 
+331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From fcd7fde5702cf7270332a5dd747f83efe7be93dd Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:35:18 +0100 -Subject: [PATCH 04/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c -index de53b0bc4..71d3a9846 100644 ---- a/kernel/x86_64/dscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c -@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", -@@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - -From 05e961994401bfc6dc8639fa9bc159148569ca9d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:36:37 +0100 -Subject: [PATCH 05/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c -index e732a2718..90790cfdc 100644 ---- a/kernel/x86_64/dscal_microk_haswell-2.c -+++ b/kernel/x86_64/dscal_microk_haswell-2.c -@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, 
FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", -@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n1), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - -From 7a11cc5b9f7c9669ee1f9818a1ea3f44c2f6d98d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:37:49 +0100 -Subject: [PATCH 06/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c -index 8d855072b..0f187ba88 100644 ---- a/kernel/x86_64/dscal_microk_sandy-2.c -+++ b/kernel/x86_64/dscal_microk_sandy-2.c -@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", -@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n1), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - -From a6c06bffe1ec60ec359b300b8cc9e18b30c72d0d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:40:28 +0100 -Subject: [PATCH 07/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c -index 03882d6b6..1ce59d2c7 100644 ---- a/kernel/x86_64/zscal_microk_bulldozer-2.c -+++ 
b/kernel/x86_64/zscal_microk_bulldozer-2.c -@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From 5efc7ce079fd87de9ab7ca20aaaf8c5c627170fa Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:42:34 +0100 -Subject: [PATCH 08/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++++------------- - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c -index d9253c1ed..534370959 100644 ---- a/kernel/x86_64/zscal_microk_haswell-2.c -+++ b/kernel/x86_64/zscal_microk_haswell-2.c -@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - 
"%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From 1a1471c6be597a176a4dbfe2757c134eb3780af0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:44:42 +0100 -Subject: [PATCH 09/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++----------- - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c -index 97b07add6..4b489d9f3 100644 ---- a/kernel/x86_64/zscal_microk_steamroller-2.c -+++ b/kernel/x86_64/zscal_microk_steamroller-2.c -@@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n), // 0 -- "r" (x), // 1 - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", 
"%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From 90e28665183cd8da3a6129016977f57dd415c6a9 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:38:20 +0100 -Subject: [PATCH 10/18] Remove stray comma - ---- - kernel/x86_64/cscal_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c -index f526fd611..31451aa6c 100644 ---- a/kernel/x86_64/cscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c -@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( 
BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From b8dd71bddcb41d3d88af1a1eb77f845760452f5f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:39:23 +0100 -Subject: [PATCH 11/18] Remove stray comma - ---- - kernel/x86_64/cscal_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c -index 8623dcd10..a04a4c4ab 100644 ---- a/kernel/x86_64/cscal_microk_haswell-2.c -+++ b/kernel/x86_64/cscal_microk_haswell-2.c -@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", // "0", "1", -@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", //"%0", "%1", -@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From 8c9a6356eaba102124147856422b9a0570daeb55 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:40:25 +0100 -Subject: [PATCH 12/18] Remove stray comma - ---- - kernel/x86_64/cscal_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c -index fbeb857e2..e8073d485 100644 ---- a/kernel/x86_64/cscal_microk_steamroller-2.c -+++ b/kernel/x86_64/cscal_microk_steamroller-2.c -@@ -118,7 +118,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT 
*alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -210,7 +210,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -287,7 +287,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -332,7 +332,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From ebe8882eb23e88d410f824d8d6a113f0fca94a3b Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:41:27 +0100 -Subject: [PATCH 13/18] Remove stray comma - ---- - kernel/x86_64/dscal_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c -index 71d3a9846..096662781 100644 ---- a/kernel/x86_64/dscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c -@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 -@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 - -From fd3e2c862286019589530ece0a61be6d86a01e92 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:42:12 +0100 -Subject: [PATCH 14/18] Remove stray comma - ---- - kernel/x86_64/dscal_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c -index 0f187ba88..9982b8e58 100644 ---- 
a/kernel/x86_64/dscal_microk_sandy-2.c -+++ b/kernel/x86_64/dscal_microk_sandy-2.c -@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 -@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 - -From 45339034256043b4405fd6330f918cbed3660ac4 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:43:14 +0100 -Subject: [PATCH 15/18] Remove stray comma - ---- - kernel/x86_64/dscal_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c -index 90790cfdc..77ed59a4e 100644 ---- a/kernel/x86_64/dscal_microk_haswell-2.c -+++ b/kernel/x86_64/dscal_microk_haswell-2.c -@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 -@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 - -From 3b0b5ce0f69a45753b126d8bd96a48de2f882a4c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:46:05 +0100 -Subject: [PATCH 16/18] Remove stray comma - ---- - kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c -index 1ce59d2c7..5e733ffda 100644 ---- a/kernel/x86_64/zscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/zscal_microk_bulldozer-2.c -@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ 
"+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x) // 1 -+ : - "r" (alpha) // 2 - : "cc", //"%0", "%1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", -@@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n), // 0 -+ "+r" (x) // 1 - : -- : -- "r" (n), // 0 -- "r" (x), // 1 - "r" (alpha) // 2 - : "cc", //"%0", "%1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - -From c17d2f61c2387b5a6cfab22d964d70afcce69b23 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:47:12 +0100 -Subject: [PATCH 17/18] Remove stray comma - ---- - kernel/x86_64/zscal_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c -index 534370959..8c8f5b75c 100644 ---- a/kernel/x86_64/zscal_microk_haswell-2.c -+++ b/kernel/x86_64/zscal_microk_haswell-2.c -@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -286,7 +286,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -331,7 +331,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" 
(alpha) // 2 - : "cc", - -From ccb2b2175751037b5625b4ec3c60ddca26a04394 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:48:40 +0100 -Subject: [PATCH 18/18] Remove stray comma - ---- - kernel/x86_64/zscal_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c -index 4b489d9f3..c9267ee0c 100644 ---- a/kernel/x86_64/zscal_microk_steamroller-2.c -+++ b/kernel/x86_64/zscal_microk_steamroller-2.c -@@ -118,7 +118,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -210,7 +210,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -287,7 +287,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -332,7 +332,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", diff --git a/1967.patch b/1967.patch deleted file mode 100644 index c7066fa..0000000 --- a/1967.patch +++ /dev/null @@ -1,99 +0,0 @@ -From 7ff08e4b06e2c643829b566a4f2c1daba25b1029 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 00:04:44 +0100 -Subject: [PATCH 1/4] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dger_microk_sandy-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c -index 2bf966a5f..944d4c6f1 100644 ---- a/kernel/x86_64/dger_microk_sandy-2.c -+++ b/kernel/x86_64/dger_microk_sandy-2.c -@@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT 
*x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 003583675d31ce5ddabfede7fc0f93cfbac51e5f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 00:05:47 +0100 -Subject: [PATCH 2/4] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/sger_microk_sandy-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c -index 79180b991..d38fdd551 100644 ---- a/kernel/x86_64/sger_microk_sandy-2.c -+++ b/kernel/x86_64/sger_microk_sandy-2.c -@@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 78aeb19e4613104c1ae8ea1c67022451dcfed7e6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:34:12 +0100 -Subject: [PATCH 3/4] Remove stray comma - ---- - kernel/x86_64/sger_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c -index d38fdd551..14f13475b 100644 ---- a/kernel/x86_64/sger_microk_sandy-2.c -+++ b/kernel/x86_64/sger_microk_sandy-2.c -@@ -106,7 +106,7 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From d3e7e25bfb73e16bdbf89ee07d0ab584339be2a0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:35:56 +0100 -Subject: [PATCH 4/4] Remove stray comma - ---- - kernel/x86_64/dger_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c 
-index 944d4c6f1..e8494500f 100644 ---- a/kernel/x86_64/dger_microk_sandy-2.c -+++ b/kernel/x86_64/dger_microk_sandy-2.c -@@ -106,7 +106,7 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 diff --git a/2010.patch b/2010.patch deleted file mode 100644 index 2393325..0000000 --- a/2010.patch +++ /dev/null @@ -1,499 +0,0 @@ -From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 15:33:48 +0100 -Subject: [PATCH 1/4] Fix declaration of input arguments in the x86_64 - s/dGEMV_T and s/dGEMV_N kernels - -Arguments 0 and 1 need to be tagged as both input and output ---- - kernel/x86_64/dgemv_n_4.c | 10 +++++----- - kernel/x86_64/dgemv_t_4.c | 18 +++++++++--------- - kernel/x86_64/sgemv_n_4.c | 14 +++++++------- - kernel/x86_64/sgemv_t_4.c | 18 +++++++++--------- - 4 files changed, 30 insertions(+), 30 deletions(-) - -diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c -index 6d2530e81..6d33641e9 100644 ---- a/kernel/x86_64/dgemv_n_4.c -+++ b/kernel/x86_64/dgemv_n_4.c -@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 -@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a - "jnz 1b \n\t" - - : -+ "+r" (i), // 0 -+ "+r" (n) // 1 - : -- "r" (i), // 0 -- "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap), // 4 -diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c -index a7478e3a8..ed672a757 100644 ---- a/kernel/x86_64/dgemv_t_4.c -+++ b/kernel/x86_64/dgemv_t_4.c -@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT - "movsd %%xmm11,8(%2) \n\t" - - : -- : -- "r" (i), // 0 
-- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap0), // 3 - "r" (ap1), // 4 -@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) - "movsd %%xmm10, (%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap), // 3 - "r" (x) // 4 -@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (&da), // 2 - "r" (src), // 3 - "r" (dest) // 4 -diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c -index 65305ac59..63697970f 100644 ---- a/kernel/x86_64/sgemv_n_4.c -+++ b/kernel/x86_64/sgemv_n_4.c -@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 -@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a - - "3: \n\t" - : -+ "+r" (i), // 0 -+ "+r" (n1) // 1 - : -- "r" (i), // 0 -- "r" (n1), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap), // 4 -@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) - "jnz 1b \n\t" - - : -+ "+r" (i), // 0 -+ "+r" (n) // 1 - : -- "r" (i), // 0 -- "r" (n), // 1 - "r" (src), // 2 - "r" (dest) // 3 - : "cc", -diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c -index 065e5b385..86ecaf516 100644 ---- a/kernel/x86_64/sgemv_t_4.c -+++ b/kernel/x86_64/sgemv_t_4.c -@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT - "movss %%xmm11,4(%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap0), // 3 - "r" (ap1), // 4 -@@ -208,9 +208,9 @@ static void 
sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) - "movss %%xmm10, (%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap), // 3 - "r" (x) // 4 -@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (&da), // 2 - "r" (src), // 3 - "r" (dest) // 4 - -From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 15:51:43 +0100 -Subject: [PATCH 2/4] Fix declaration of input arguments in inline assembly - -Argument 0 is modified as it doubles as a counter ---- - kernel/x86_64/dscal.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c -index ef9a0a6ba..d0d7801fd 100644 ---- a/kernel/x86_64/dscal.c -+++ b/kernel/x86_64/dscal.c -@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ - "jnz 1b \n\t" - - : -+ "+r" (n) // 0 - : -- "r" (n), // 0 - "r" (x), // 1 - "r" (x1), // 2 - "r" (alpha), // 3 - -From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 16:00:18 +0100 -Subject: [PATCH 3/4] Fix declaration of assembly arguments in SSYMV and DSYMV - microkernels - -Arguments 0 and 1 are both input and output ---- - kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++--- - kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++--- - kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++--- - kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++--- - 8 files changed, 24 insertions(+), 24 deletions(-) - -diff --git 
a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c -index d7166fe4b..ae287b6d8 100644 ---- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c -+++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c -@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c -index d83d20f8e..4778f644a 100644 ---- a/kernel/x86_64/dsymv_U_microk_haswell-2.c -+++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c -@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c -index 1344c75f7..065182286 100644 ---- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c -+++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c -@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "movsd %%xmm3 , 24(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c -index 1ef6fbafd..d84e703bd 100644 ---- a/kernel/x86_64/dsymv_U_microk_sandy-2.c -+++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c -@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git 
a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c -index 8c01ab806..4a4f4d68d 100644 ---- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c -+++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c -@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c -index a32e59b44..e6a09ccf8 100644 ---- a/kernel/x86_64/ssymv_U_microk_haswell-2.c -+++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c -@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c -index b8e6ee732..c56ff3b15 100644 ---- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c -+++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c -@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "movss %%xmm3 , 12(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c -index e8650650c..c4919a39a 100644 ---- a/kernel/x86_64/ssymv_U_microk_sandy-2.c -+++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c -@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 - -From 
ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 16:14:02 +0100 -Subject: [PATCH 4/4] Fix declaration of arguments in inline assembly - -Argument 0 is modified so should be input and output ---- - kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++-- - kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++-- - kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++-- - kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++---- - 8 files changed, 18 insertions(+), 18 deletions(-) - -diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c -index d84470cc4..bfa07b6d0 100644 ---- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c -+++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c -@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c -index 866782ee6..6241879d5 100644 ---- a/kernel/x86_64/dsymv_L_microk_haswell-2.c -+++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c -@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c -index 38479f77a..a161dcd8b 100644 ---- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c -+++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c -@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, 
FLOAT *x, FL - "movsd %%xmm3 , 24(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c -index b4e6ab369..b205b1019 100644 ---- a/kernel/x86_64/dsymv_L_microk_sandy-2.c -+++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c -@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c -index 9002228f3..602c3edf2 100644 ---- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c -+++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c -@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c -index 69db008b6..fdfe4349a 100644 ---- a/kernel/x86_64/ssymv_L_microk_haswell-2.c -+++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c -@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c -index c0fe5d640..6bb9c02f6 100644 ---- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c -+++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c -@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F - "movss %%xmm3 , 12(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 
- "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c -index 093ca8073..0c78212e7 100644 ---- a/kernel/x86_64/ssymv_L_microk_sandy-2.c -+++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c -@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 diff --git a/2018.patch b/2018.patch deleted file mode 100644 index 594a4c4..0000000 --- a/2018.patch +++ /dev/null @@ -1,27 +0,0 @@ -From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001 -From: Bart Oldeman -Date: Thu, 14 Feb 2019 16:19:41 +0000 -Subject: [PATCH] dgemv_kernel_4x4(Haswell): add missing clobbers for - xmm0,xmm1,xmm2,xmm3 - -This fixes a crash in dblat2 when OpenBLAS is compiled using --march=znver1 -ftree-vectorize -O2 - -See also: -https://github.com/easybuilders/easybuild-easyconfigs/issues/7180 ---- - kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c -index 584a6c6b5..da0fa2fff 100644 ---- a/kernel/x86_64/dgemv_n_microk_haswell-4.c -+++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c -@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - "r" (ap[3]), // 7 - "r" (alpha) // 8 - : "cc", -+ "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", diff --git a/2019.patch b/2019.patch deleted file mode 100644 index a3aa674..0000000 --- a/2019.patch +++ /dev/null @@ -1,274 +0,0 @@ -From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 
17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 14 Feb 2019 22:43:18 +0100 -Subject: [PATCH 1/2] Save and restore input argument 8 (lda4) - -Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009) ---- - kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c -index 2c90f8aa9..e89a16785 100644 ---- a/kernel/x86_64/sgemv_n_microk_haswell-4.c -+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c -@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - - -- - #define HAVE_KERNEL_4x8 1 - static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); - -@@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vbroadcastss (%9), %%ymm6 \n\t" // alpha - -+ "movq %8, %%xmm10 \n\t" //save lda -+ - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - -@@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "4: \n\t" - "vzeroupper \n\t" -+ "movq %%xmm10, %8 \n\t" //restore lda - - : - "+r" (i), // 0 -@@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", -+ "%xmm10", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); -@@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - } - - -- - #define HAVE_KERNEL_4x4 1 - static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); - -@@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - - "vbroadcastss (%8), %%ymm6 \n\t" // alpha - -+ - "testq $0x04, %1 
\n\t" - "jz 2f \n\t" - - -From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Fri, 15 Feb 2019 10:10:04 +0100 -Subject: [PATCH 2/2] Rename operands to put lda on the input/output constraint - list - ---- - kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------ - 1 file changed, 61 insertions(+), 65 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c -index e89a16785..93e1e26e8 100644 ---- a/kernel/x86_64/sgemv_n_microk_haswell-4.c -+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c -@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - __asm__ __volatile__ - ( - "vzeroupper \n\t" -- "vbroadcastss (%2), %%ymm12 \n\t" // x0 -- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 -- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 -- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 -- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 -- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 -- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 -- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 -+ "vbroadcastss (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 - - "vbroadcastss (%9), %%ymm6 \n\t" // alpha - -- "movq %8, %%xmm10 \n\t" //save lda -- - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - -- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y -+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" - "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" - -- "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" -- "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" -+ "vfmadd231ps (%5,%0,4), %%xmm12, 
%%xmm4 \n\t" -+ "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" - -- "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" -+ "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" - - "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" - "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" - "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" - -- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - -- "addq $4 , %8 \n\t" -+ "addq $4 , %2 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - -@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "testq $0x08, %1 \n\t" - "jz 3f \n\t" - -- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y -+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - -- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" -- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" -- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" -- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" -+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" - -- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" -- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" -+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" - - "vaddps %%ymm4 , %%ymm5 , 
%%ymm5 \n\t" - "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - -- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - -- "addq $8 , %8 \n\t" -+ "addq $8 , %2 \n\t" - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - -@@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y -- "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y -- -- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" -- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" -- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" -- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" -- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" -- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" -- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" -- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" -+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y -+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y -+ -+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" - "addq $16, %0 \n\t" -- "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" -- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" -- "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" -- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" -- "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" -+ 
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" -+ "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" -+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" -+ "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" - -- "addq $16, %8 \n\t" -- "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y -+ "addq $16, %2 \n\t" -+ "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y - "subq $16, %1 \n\t" -- "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y - - "jnz 1b \n\t" - - "4: \n\t" - "vzeroupper \n\t" -- "movq %%xmm10, %8 \n\t" //restore lda - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", -@@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", -- "%xmm10", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); diff --git a/2021.patch b/2021.patch deleted file mode 100644 index 7724f38..0000000 --- a/2021.patch +++ /dev/null @@ -1,255 +0,0 @@ -From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Fri, 15 Feb 2019 15:08:16 +0100 -Subject: [PATCH] Fix wrong constraints in inline assembly - -for #2009 ---- - kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++------------- - 1 file changed, 49 insertions(+), 49 deletions(-) - -diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c -index fcab8e2c7..9ab78fc8e 100644 ---- 
a/kernel/x86_64/dtrsm_kernel_RN_haswell.c -+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c -@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " cmpq $0, %0 \n\t" - " je 4f \n\t" - -- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a -- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 -- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 -+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a -+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 -+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 - - - " addq $8, %1 \n\t" -@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .p2align 4 \n\t" - "1: \n\t" - -- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a -+ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" - - " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" - " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - -- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 -+ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 - " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" - " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" - - " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" -- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 -+ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" - " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" - " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" -@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 22f \n\t" - -- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a -+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a - - " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" - " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" -- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 -+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 - " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" - " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" - - " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" -- " 
vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 -+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 - " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" - " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - -@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 - - " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" -- " vmovups (%9), %%ymm0 \n\t" -+ " vmovups (%3), %%ymm0 \n\t" - " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" - " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" - " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" -@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" - - " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" -- " vmovups 32(%9), %%ymm4 \n\t" -+ " vmovups 32(%3), %%ymm4 \n\t" - " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" - " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" - " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" -@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "5: \n\t" // i = 0 - -- " addq $64, %9 \n\t" // b=b+8 -+ " addq $64, %3 \n\t" // b=b+8 - - " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb -- " vmovups (%9), %%ymm0 \n\t" -- " vmovups %%ymm8 , (%8) \n\t" // write a -+ " vmovups (%3), %%ymm0 \n\t" -+ " vmovups %%ymm8 , (%2) \n\t" // write a - " vmovups %%ymm8 , (%4) \n\t" // write c - - " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" -- " vmovups 32(%9), %%ymm1 \n\t" -+ " vmovups 32(%3), %%ymm1 \n\t" - " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" - " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" -@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" - " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - - " 
vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb -- " vmovups (%9), %%ymm0 \n\t" -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm9 , (%8) \n\t" // write a -+ " vmovups (%3), %%ymm0 \n\t" -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm9 , (%2) \n\t" // write a - " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c - - " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" -@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" - " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb -- " vmovups (%9), %%ymm0 \n\t" -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm10, (%8) \n\t" // write a -+ " vmovups (%3), %%ymm0 \n\t" -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm10, (%2) \n\t" // write a - " vmovups %%ymm10, (%4,%7,2) \n\t" // write c - - " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" -@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - - " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm11, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm11, (%2) \n\t" // write a - " vmovups %%ymm11, (%5) \n\t" // write c - - " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" -@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" - - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb -- " vmovups 
32(%9), %%ymm1 \n\t" -- " vmovups %%ymm12, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm12, (%2) \n\t" // write a - " vmovups %%ymm12, (%5,%7,1) \n\t" // write c - - " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" -@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" - " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm13, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm13, (%2) \n\t" // write a - " vmovups %%ymm13, (%5,%7,2) \n\t" // write c - - " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" -@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" - - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm14, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm14, (%2) \n\t" // write a - " vmovups %%ymm14, (%6) \n\t" // write c - - " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" - - " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" - -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $32, %2 \n\t" // a=a+8 - - " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb -- " vmovups %%ymm15, (%8) \n\t" // write a -+ " vmovups %%ymm15, (%2) \n\t" // write a - " vmovups %%ymm15, (%6,%7,1) \n\t" // write c - - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 - "r" (c), // 4 - "r" (c3), // 5 - "r" (c6), // 6 - "r" (ldc), 
// 7 -- "r" (as), // 8 -- "r" (bs) // 9 -+ "r" (a), // 8 -+ "r" (b) // 9 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/2023.patch b/2023.patch deleted file mode 100644 index 225a8a2..0000000 --- a/2023.patch +++ /dev/null @@ -1,874 +0,0 @@ -From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:24:11 +0100 -Subject: [PATCH 1/4] Fix inline assembly constraints - -rework indices to allow marking argument lda4 as input and output. For #2009 ---- - kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------ - 1 file changed, 27 insertions(+), 27 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c -index 11a3e943b..d21232bfa 100644 ---- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c -+++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c -@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - __asm__ __volatile__ - ( -- "movss (%2), %%xmm12 \n\t" // x0 -- "movss 4(%2), %%xmm13 \n\t" // x1 -- "movss 8(%2), %%xmm14 \n\t" // x2 -- "movss 12(%2), %%xmm15 \n\t" // x3 -+ "movss (%3), %%xmm12 \n\t" // x0 -+ "movss 4(%3), %%xmm13 \n\t" // x1 -+ "movss 8(%3), %%xmm14 \n\t" // x2 -+ "movss 12(%3), %%xmm15 \n\t" // x3 - "shufps $0, %%xmm12, %%xmm12\n\t" - "shufps $0, %%xmm13, %%xmm13\n\t" - "shufps $0, %%xmm14, %%xmm14\n\t" - "shufps $0, %%xmm15, %%xmm15\n\t" - -- "movss 16(%2), %%xmm0 \n\t" // x4 -- "movss 20(%2), %%xmm1 \n\t" // x5 -- "movss 24(%2), %%xmm2 \n\t" // x6 -- "movss 28(%2), %%xmm3 \n\t" // x7 -+ "movss 16(%3), %%xmm0 \n\t" // x4 -+ "movss 20(%3), %%xmm1 \n\t" // x5 -+ "movss 24(%3), %%xmm2 \n\t" // x6 -+ "movss 28(%3), %%xmm3 \n\t" // x7 - "shufps $0, %%xmm0 , %%xmm0 \n\t" - "shufps $0, %%xmm1 , %%xmm1 \n\t" - "shufps $0, %%xmm2 , %%xmm2 \n\t" -@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO 
- "1: \n\t" - "xorps %%xmm4 , %%xmm4 \n\t" - "xorps %%xmm5 , %%xmm5 \n\t" -- "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y -+ "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y - - ".p2align 1 \n\t" -- "movups (%4,%0,4), %%xmm8 \n\t" -- "movups (%5,%0,4), %%xmm9 \n\t" -- "movups (%6,%0,4), %%xmm10 \n\t" -- "movups (%7,%0,4), %%xmm11 \n\t" -+ "movups (%5,%0,4), %%xmm8 \n\t" -+ "movups (%6,%0,4), %%xmm9 \n\t" -+ "movups (%7,%0,4), %%xmm10 \n\t" -+ "movups (%8,%0,4), %%xmm11 \n\t" - ".p2align 1 \n\t" - "mulps %%xmm12, %%xmm8 \n\t" - "mulps %%xmm13, %%xmm9 \n\t" -@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "addps %%xmm10, %%xmm4 \n\t" - "addps %%xmm11, %%xmm5 \n\t" - -- "movups (%4,%8,4), %%xmm8 \n\t" -- "movups (%5,%8,4), %%xmm9 \n\t" -- "movups (%6,%8,4), %%xmm10 \n\t" -- "movups (%7,%8,4), %%xmm11 \n\t" -+ "movups (%5,%2,4), %%xmm8 \n\t" -+ "movups (%6,%2,4), %%xmm9 \n\t" -+ "movups (%7,%2,4), %%xmm10 \n\t" -+ "movups (%8,%2,4), %%xmm11 \n\t" - ".p2align 1 \n\t" - "mulps %%xmm0 , %%xmm8 \n\t" - "mulps %%xmm1 , %%xmm9 \n\t" -@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "addps %%xmm10, %%xmm4 \n\t" - "addps %%xmm11, %%xmm5 \n\t" - -- "addq $4 , %8 \n\t" -+ "addq $4 , %2 \n\t" - "addps %%xmm5 , %%xmm4 \n\t" - "addq $4 , %0 \n\t" - "mulps %%xmm6 , %%xmm4 \n\t" - "subq $4 , %1 \n\t" - "addps %%xmm4 , %%xmm7 \n\t" - -- "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y -+ "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y - - "jnz 1b \n\t" - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - -From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 
00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:36:39 +0100 -Subject: [PATCH 2/4] Fix inline assembly constraints - -rework indices to allow marking argument lda as input and output. ---- - kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++------------- - 1 file changed, 65 insertions(+), 65 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c -index b35daa35b..3fc46542b 100644 ---- a/kernel/x86_64/sgemv_n_microk_sandy-4.c -+++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c -@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - __asm__ __volatile__ - ( - "vzeroupper \n\t" -- "vbroadcastss (%2), %%ymm12 \n\t" // x0 -- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 -- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 -- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 -- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 -- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 -- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 -- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 -+ "vbroadcastss (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 - - "vbroadcastss (%9), %%ymm6 \n\t" // alpha - -@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" - "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" -- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y -+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - -- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" -- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" -- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" -- "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" -+ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" -+ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" -+ 
"vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" -+ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" - "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" - "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - -- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" -- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" -- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" -- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" -+ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" -+ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" -+ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" -+ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" - "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" - "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" -@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" - "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" - -- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - -- "addq $4, %8 \n\t" -+ "addq $4, %2 \n\t" - "addq $4, %0 \n\t" - "subq $4, %1 \n\t" - -@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" -- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y -+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - -- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" -- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" -- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" -- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" -+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" -+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" -+ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" -+ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" -- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" -- "vmulps (%6,%8,4), %%ymm2 
, %%ymm9 \n\t" -- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" -+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" -+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" -+ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" -+ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" -@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" - "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" - -- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - -- "addq $8, %8 \n\t" -+ "addq $8, %2 \n\t" - "addq $8, %0 \n\t" - "subq $8, %1 \n\t" - -@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - -- "prefetcht0 192(%4,%0,4) \n\t" -- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" -- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" -- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" -- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" -+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" -+ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" -+ "prefetcht0 192(%6,%0,4) \n\t" -+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" -+ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "prefetcht0 192(%6,%0,4) \n\t" -- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" -- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" -- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" -- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" -+ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" -+ "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" -+ "prefetcht0 192(%8,%0,4) \n\t" -+ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" -+ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 
\n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "prefetcht0 192(%4,%8,4) \n\t" -- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" -- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" -- "prefetcht0 192(%5,%8,4) \n\t" -- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" -- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" -+ "prefetcht0 192(%5,%2,4) \n\t" -+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" -+ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" -+ "prefetcht0 192(%6,%2,4) \n\t" -+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" -+ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "prefetcht0 192(%6,%8,4) \n\t" -- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" -- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" -- "prefetcht0 192(%7,%8,4) \n\t" -- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" -- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" -+ "prefetcht0 192(%7,%2,4) \n\t" -+ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" -+ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" -+ "prefetcht0 192(%8,%2,4) \n\t" -+ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" -+ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" -@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" - "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" - -- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y -- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y -+ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y -+ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y - -- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y -- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y -+ 
"vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y - -- "addq $16, %8 \n\t" -+ "addq $16, %2 \n\t" - "addq $16, %0 \n\t" - "subq $16, %1 \n\t" - "jnz 1b \n\t" -@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - -From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:46:17 +0100 -Subject: [PATCH 3/4] Fix inline assembly constraints - ---- - kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++----------- - 1 file changed, 97 insertions(+), 97 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c -index 31001c7f3..bbf06c84b 100644 ---- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c -+++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c -@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - __asm__ __volatile__ - ( -- "vbroadcastss (%2), %%xmm12 \n\t" // x0 -- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 -- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 -- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 -- "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 -- "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 -- "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 -- "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 -+ "vbroadcastss (%3), %%xmm12 \n\t" // x0 -+ "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 -+ "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 -+ "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 -+ "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 -+ "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 -+ "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 -+ 
"vbroadcastss 28(%3), %%xmm3 \n\t" // x7 - - "vbroadcastss (%9), %%xmm8 \n\t" // alpha - -@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" - "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - -- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" - "addq $4 , %0 \n\t" - -- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" -- "addq $4 , %8 \n\t" -+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" -+ "addq $4 , %2 \n\t" - - "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" -- "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" -+ "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" - "subq $4 , %1 \n\t" -- "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y - - "2: \n\t" - -@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" - "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - -- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, 
%%xmm5 \n\t" -- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" -- -- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" -+ -+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - -- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y -- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y -+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -+ "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y - - "addq $8 , %0 \n\t" -- "addq $8 , %8 \n\t" -+ "addq $8 , %2 \n\t" - "subq $8 , %1 \n\t" - - -@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( 
BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" - "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" - -- "prefetcht0 192(%4,%0,4) \n\t" -- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" -- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" -- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" -- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" -+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" -+ "prefetcht0 192(%8,%0,4) \n\t" -+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" - ".align 2 \n\t" -- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" -- -- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" -- -- "prefetcht0 192(%4,%8,4) \n\t" -- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" -- "prefetcht0 192(%5,%8,4) \n\t" -- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "prefetcht0 192(%6,%8,4) \n\t" -- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 
16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" -- "prefetcht0 192(%7,%8,4) \n\t" -- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" -+ -+ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" -+ -+ "prefetcht0 192(%5,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" -+ "prefetcht0 192(%6,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "prefetcht0 192(%7,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" -+ "prefetcht0 192(%8,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - -- "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" -+ 
"vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" - -- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -- "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" -- "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" -+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -+ "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" -+ "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" - - "addq $16, %0 \n\t" -- "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y -- "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y -- "addq $16, %8 \n\t" -- "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y -- "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y -+ "addq $16, %2 \n\t" -+ "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y - - "subq $16, %1 \n\t" - "jnz 1b \n\t" -@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - -From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:51:09 +0100 -Subject: [PATCH 4/4] Fix inline assembly constraints - ---- - dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++ - 1 file changed, 247 insertions(+) - create mode 100644 dgemv_n_microk_piledriver-4.c - -diff --git a/dgemv_n_microk_piledriver-4.c 
b/dgemv_n_microk_piledriver-4.c -new file mode 100644 -index 000000000..466931b82 ---- /dev/null -+++ b/dgemv_n_microk_piledriver-4.c -@@ -0,0 +1,247 @@ -+/*************************************************************************** -+Copyright (c) 2014, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+ -+ -+#define HAVE_KERNEL_4x8 1 -+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -+ -+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -+{ -+ -+ BLASLONG register i = 0; -+ -+ __asm__ __volatile__ -+ ( -+ "vzeroupper \n\t" -+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 -+ -+ "vbroadcastsd (%9), %%ymm6 \n\t" // alpha -+ -+ "testq $0x04, %1 \n\t" -+ "jz 2f \n\t" -+ -+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" -+ -+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -+ -+ -+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y -+ -+ "addq $4 , %2 \n\t" -+ "addq $4 , %0 \n\t" -+ "subq $4 , %1 \n\t" -+ -+ "2: \n\t" -+ -+ "cmpq $0, %1 \n\t" -+ "je 3f \n\t" -+ -+ -+ ".align 16 \n\t" -+ "1: \n\t" -+ -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y -+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y -+ -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" 
-+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -+ "addq $8 , %0 \n\t" -+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" -+ -+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -+ -+ "addq $8 , %2 \n\t" -+ "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y -+ "subq $8 , %1 \n\t" -+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y -+ -+ "jnz 1b \n\t" -+ -+ "3: \n\t" -+ "vzeroupper \n\t" -+ -+ : -+ "+r" (i), // 0 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 -+ : -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 -+ "r" (alpha) // 9 -+ : "cc", -+ "%xmm0", "%xmm1", -+ "%xmm2", "%xmm3", -+ "%xmm4", "%xmm5", -+ "%xmm6", "%xmm7", -+ "%xmm8", "%xmm9", -+ "%xmm12", "%xmm13", "%xmm14", "%xmm15", -+ "memory" -+ ); -+ -+} -+ -+ -+ -+#define HAVE_KERNEL_4x4 1 -+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -+ -+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -+{ -+ -+ BLASLONG register i = 0; -+ -+ __asm__ __volatile__ -+ ( -+ "vzeroupper \n\t" -+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0 -+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 -+ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 -+ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 -+ -+ "vbroadcastsd 
(%8), %%ymm6 \n\t" // alpha -+ -+ "testq $0x04, %1 \n\t" -+ "jz 2f \n\t" -+ -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y -+ -+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -+ -+ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y -+ -+ "addq $4 , %0 \n\t" -+ "subq $4 , %1 \n\t" -+ -+ "2: \n\t" -+ -+ "cmpq $0, %1 \n\t" -+ "je 3f \n\t" -+ -+ -+ ".align 16 \n\t" -+ "1: \n\t" -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y -+ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y -+ -+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -+ -+ "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y -+ "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y -+ -+ "addq $8 , %0 \n\t" -+ "subq $8 , %1 \n\t" -+ "jnz 1b \n\t" -+ -+ "3: \n\t" -+ "vzeroupper \n\t" -+ -+ : -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : -+ "r" (x), // 2 -+ "r" (y), // 3 -+ "r" (ap[0]), // 4 -+ "r" (ap[1]), // 5 -+ "r" (ap[2]), // 6 -+ "r" (ap[3]), // 7 -+ "r" (alpha) // 8 -+ : "cc", -+ "%xmm4", "%xmm5", -+ "%xmm6", "%xmm7", -+ "%xmm8", "%xmm9", -+ "%xmm12", "%xmm13", "%xmm14", "%xmm15", -+ "memory" -+ ); -+ -+} -+ -+ diff --git a/2024.patch b/2024.patch deleted file mode 100644 index 
720a9e2..0000000 --- a/2024.patch +++ /dev/null @@ -1,1349 +0,0 @@ -From f9bb76d29af48f448a8ab2bdfffc962d9623a3df Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 20:06:48 +0100 -Subject: [PATCH] Fix inline assembly constraints in Bulldozer TRSM kernels - -rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). For #2009 ---- - kernel/x86_64/dtrsm_kernel_RT_bulldozer.c | 96 ++++---- - kernel/x86_64/strsm_kernel_LN_bulldozer.c | 252 ++++++++++----------- - kernel/x86_64/strsm_kernel_LT_bulldozer.c | 256 +++++++++++----------- - kernel/x86_64/strsm_kernel_RN_bulldozer.c | 54 ++--- - kernel/x86_64/strsm_kernel_RT_bulldozer.c | 54 ++--- - 5 files changed, 356 insertions(+), 356 deletions(-) - -diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c -index 54df5b359..35ed4cc01 100644 ---- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c -+++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c -@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " prefetcht0 384(%3,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " prefetcht0 384(%7,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 2f 
\n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 2f \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 2f \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups (%6,%1,8), 
%%xmm4 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" // i = 1 - -- " vmovddup (%7), %%xmm1 \n\t" // read b -- " vmovddup 8(%7), %%xmm0 \n\t" // read bb -+ " vmovddup (%3), %%xmm1 \n\t" // read b -+ " vmovddup 8(%3), %%xmm0 \n\t" // read bb - - " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb - " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb - " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb - " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - -- " vmovups %%xmm12 , (%6) \n\t" // write a -- " vmovups %%xmm13 , 16(%6) \n\t" // write a -- " vmovups %%xmm14 , 32(%6) \n\t" // write a -- " vmovups %%xmm15 , 48(%6) \n\t" // write a -+ " vmovups %%xmm12 , (%2) \n\t" // write a -+ " vmovups %%xmm13 , 16(%2) \n\t" // write a -+ " vmovups %%xmm14 , 32(%2) \n\t" // write a -+ " vmovups %%xmm15 , 48(%2) \n\t" // write a - - " vmovups %%xmm12 , (%5) \n\t" // write c1 - " vmovups %%xmm13 , 16(%5) \n\t" -@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" - - " \n\t" // i = 0 -- " subq $16 , %7 \n\t" // b = b - 2 -- " subq $64 , %6 \n\t" // a = a - 8 -+ " subq $16 , %3 \n\t" // b = b - 2 -+ " subq $64 , %2 \n\t" // a = a - 8 - -- " vmovddup (%7), %%xmm0 \n\t" // read bb -+ " vmovddup (%3), %%xmm0 \n\t" // read bb - - " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb - " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" - " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" - " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" - -- " vmovups %%xmm8 , (%6) \n\t" // write a -- " vmovups %%xmm9 , 16(%6) \n\t" -- " vmovups %%xmm10 , 32(%6) \n\t" -- " vmovups %%xmm11 , 48(%6) \n\t" -+ " vmovups 
%%xmm8 , (%2) \n\t" // write a -+ " vmovups %%xmm9 , 16(%2) \n\t" -+ " vmovups %%xmm10 , 32(%2) \n\t" -+ " vmovups %%xmm11 , 48(%2) \n\t" - - " vmovups %%xmm8 , (%4) \n\t" // write c0 - " vmovups %%xmm9 , 16(%4) \n\t" -@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c -index 1b8991c6c..3cd215000 100644 ---- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c -@@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" - -- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] -+ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] - " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm15 , 
%%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] -+ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] - " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss 
%%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] -+ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] - " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read 
a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] -+ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] - " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" 
// b -= n - -- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] -+ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] - " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] -+ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] - " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // 
b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] -+ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] - " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - 
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] -+ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] - " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] -+ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] - " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 
* aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] -+ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] - " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , 
%%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] -+ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] - " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] -+ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] - " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * 
aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] -+ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] - " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] -+ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] - " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss 
%%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] -+ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] - " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] -+ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] - " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) 
\n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c -index 0623dddb0..a4a62491c 100644 ---- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c -@@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" - -- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] -+ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] - " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa - 
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] -+ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] - " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , 
%%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] -+ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] - " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -241,22 +241,22 @@ static void 
strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] -+ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] - " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] -+ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] - " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " 
vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] -+ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] - " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 
\n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] -+ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] - " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, 
FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] -+ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] - " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] -+ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] - " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" 
// b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] -+ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] - " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 40(%6) , %%xmm0 
\n\t" // i=10, read aa[i] -+ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] - " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] -+ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] - " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , 
%%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] -+ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] - " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] -+ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] - " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) 
, %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] -+ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] - " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] -+ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] - " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - - " vzeroupper 
\n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 -- "r" (c), // 4 -- "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (c), // 4 -+ "r" (c1), // 5 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c -index 4cc557d55..c11c84cec 100644 ---- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c -@@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" // i = 0 - -- " vbroadcastss (%7), %%xmm0 \n\t" // read bb -- " vbroadcastss 4(%7), %%xmm1 \n\t" // read b -+ " vbroadcastss (%3), %%xmm0 \n\t" // read bb -+ " vbroadcastss 4(%3), %%xmm1 \n\t" // read b - - " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb - " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" - " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" - " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - -- " vmovups %%xmm8 , (%6) \n\t" // write a -- " vmovups %%xmm9 , 16(%6) \n\t" -- " vmovups %%xmm10 , 32(%6) \n\t" -- " vmovups 
%%xmm11 , 48(%6) \n\t" -+ " vmovups %%xmm8 , (%2) \n\t" // write a -+ " vmovups %%xmm9 , 16(%2) \n\t" -+ " vmovups %%xmm10 , 32(%2) \n\t" -+ " vmovups %%xmm11 , 48(%2) \n\t" - - " vmovups %%xmm8 , (%4) \n\t" // write c0 - " vmovups %%xmm9 , 16(%4) \n\t" -@@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" - - " \n\t" // i = 1 -- " addq $8 , %7 \n\t" // b = b + 2 -- " addq $64 , %6 \n\t" // a = a + 16 -+ " addq $8 , %3 \n\t" // b = b + 2 -+ " addq $64 , %2 \n\t" // a = a + 16 - -- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb -+ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb - - " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb - " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb - " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb - " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - -- " vmovups %%xmm12 , (%6) \n\t" // write a -- " vmovups %%xmm13 , 16(%6) \n\t" // write a -- " vmovups %%xmm14 , 32(%6) \n\t" // write a -- " vmovups %%xmm15 , 48(%6) \n\t" // write a -+ " vmovups %%xmm12 , (%2) \n\t" // write a -+ " vmovups %%xmm13 , 16(%2) \n\t" // write a -+ " vmovups %%xmm14 , 32(%2) \n\t" // write a -+ " vmovups %%xmm15 , 48(%2) \n\t" // write a - - " vmovups %%xmm12 , (%5) \n\t" // write c1 - " vmovups %%xmm13 , 16(%5) \n\t" -@@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 -- "r" (c), // 4 -- "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (c), // 4 -+ "r" (c1), // 5 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c -index 73f6e8a95..326ca2976 
100644 ---- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c -@@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" // i = 1 - -- " vbroadcastss (%7), %%xmm1 \n\t" // read b -- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb -+ " vbroadcastss (%3), %%xmm1 \n\t" // read b -+ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb - - " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb - " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb - " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb - " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - -- " vmovups %%xmm12 , (%6) \n\t" // write a -- " vmovups %%xmm13 , 16(%6) \n\t" // write a -- " vmovups %%xmm14 , 32(%6) \n\t" // write a -- " vmovups %%xmm15 , 48(%6) \n\t" // write a -+ " vmovups %%xmm12 , (%2) \n\t" // write a -+ " vmovups %%xmm13 , 16(%2) \n\t" // write a -+ " vmovups %%xmm14 , 32(%2) \n\t" // write a -+ " vmovups %%xmm15 , 48(%2) \n\t" // write a - - " vmovups %%xmm12 , (%5) \n\t" // write c1 - " vmovups %%xmm13 , 16(%5) \n\t" -@@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , 
%%xmm11 \n\t" - - " \n\t" // i = 0 -- " subq $8 , %7 \n\t" // b = b - 2 -- " subq $64 , %6 \n\t" // a = a - 16 -+ " subq $8 , %3 \n\t" // b = b - 2 -+ " subq $64 , %2 \n\t" // a = a - 16 - -- " vbroadcastss (%7), %%xmm0 \n\t" // read bb -+ " vbroadcastss (%3), %%xmm0 \n\t" // read bb - - " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb - " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" - " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" - " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - -- " vmovups %%xmm8 , (%6) \n\t" // write a -- " vmovups %%xmm9 , 16(%6) \n\t" -- " vmovups %%xmm10 , 32(%6) \n\t" -- " vmovups %%xmm11 , 48(%6) \n\t" -+ " vmovups %%xmm8 , (%2) \n\t" // write a -+ " vmovups %%xmm9 , 16(%2) \n\t" -+ " vmovups %%xmm10 , 32(%2) \n\t" -+ " vmovups %%xmm11 , 48(%2) \n\t" - - " vmovups %%xmm8 , (%4) \n\t" // write c0 - " vmovups %%xmm9 , 16(%4) \n\t" -@@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 -- "r" (c), // 4 -- "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (c), // 4 -+ "r" (c1), // 5 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/2028.patch b/2028.patch deleted file mode 100644 index 64d050f..0000000 --- a/2028.patch +++ /dev/null @@ -1,412 +0,0 @@ -From 6eee1beac524b5582a6c6de14d9d35a78c1ece74 Mon Sep 17 00:00:00 2001 -From: Andrew <16061801+brada4@users.noreply.github.com> -Date: Sun, 24 Feb 2019 20:41:02 +0200 -Subject: [PATCH 2/2] move fix to right place - ---- - dgemv_n_microk_piledriver-4.c | 247 -------------------- - kernel/x86_64/dgemv_n_microk_piledriver-4.c | 98 ++++---- - 2 files changed, 49 insertions(+), 296 deletions(-) - delete mode 100644 dgemv_n_microk_piledriver-4.c - -diff --git a/dgemv_n_microk_piledriver-4.c 
b/dgemv_n_microk_piledriver-4.c -deleted file mode 100644 -index 466931b82..000000000 ---- a/dgemv_n_microk_piledriver-4.c -+++ /dev/null -@@ -1,247 +0,0 @@ --/*************************************************************************** --Copyright (c) 2014, The OpenBLAS Project --All rights reserved. --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are --met: --1. Redistributions of source code must retain the above copyright --notice, this list of conditions and the following disclaimer. --2. Redistributions in binary form must reproduce the above copyright --notice, this list of conditions and the following disclaimer in --the documentation and/or other materials provided with the --distribution. --3. Neither the name of the OpenBLAS project nor the names of --its contributors may be used to endorse or promote products --derived from this software without specific prior written permission. --THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" --AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE --IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE --ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE --LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR --SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER --CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, --OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE --USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
--*****************************************************************************/ -- -- -- --#define HAVE_KERNEL_4x8 1 --static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -- --static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) --{ -- -- BLASLONG register i = 0; -- -- __asm__ __volatile__ -- ( -- "vzeroupper \n\t" -- "vbroadcastsd (%3), %%ymm12 \n\t" // x0 -- "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 -- "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 -- "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 -- "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 -- "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 -- "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 -- "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 -- -- "vbroadcastsd (%9), %%ymm6 \n\t" // alpha -- -- "testq $0x04, %1 \n\t" -- "jz 2f \n\t" -- -- "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- -- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" -- -- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -- -- -- "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y -- -- "addq $4 , %2 \n\t" -- "addq $4 , %0 \n\t" -- "subq $4 , %1 \n\t" -- -- "2: \n\t" -- -- "cmpq $0, %1 \n\t" -- "je 3f \n\t" -- -- -- ".align 16 \n\t" -- "1: \n\t" -- -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y -- "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y -- -- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" 
-- "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" -- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" -- "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -- "addq $8 , %0 \n\t" -- "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" -- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" -- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" -- "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" -- -- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -- -- "addq $8 , %2 \n\t" -- "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y -- "subq $8 , %1 \n\t" -- "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y -- -- "jnz 1b \n\t" -- -- "3: \n\t" -- "vzeroupper \n\t" -- -- : -- "+r" (i), // 0 -- "+r" (n), // 1 -- "+r" (lda4) // 2 -- : -- "r" (x), // 3 -- "r" (y), // 4 -- "r" (ap[0]), // 5 -- "r" (ap[1]), // 6 -- "r" (ap[2]), // 7 -- "r" (ap[3]), // 8 -- "r" (alpha) // 9 -- : "cc", -- "%xmm0", "%xmm1", -- "%xmm2", "%xmm3", -- "%xmm4", "%xmm5", -- "%xmm6", "%xmm7", -- "%xmm8", "%xmm9", -- "%xmm12", "%xmm13", "%xmm14", "%xmm15", -- "memory" -- ); -- --} -- -- -- --#define HAVE_KERNEL_4x4 1 --static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -- --static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) --{ -- -- BLASLONG register i = 0; -- -- __asm__ __volatile__ -- ( -- "vzeroupper \n\t" -- "vbroadcastsd (%2), %%ymm12 \n\t" // x0 -- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 -- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 -- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 -- -- "vbroadcastsd 
(%8), %%ymm6 \n\t" // alpha -- -- "testq $0x04, %1 \n\t" -- "jz 2f \n\t" -- -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y -- -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -- -- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y -- -- "addq $4 , %0 \n\t" -- "subq $4 , %1 \n\t" -- -- "2: \n\t" -- -- "cmpq $0, %1 \n\t" -- "je 3f \n\t" -- -- -- ".align 16 \n\t" -- "1: \n\t" -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y -- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y -- -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -- -- "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y -- "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y -- -- "addq $8 , %0 \n\t" -- "subq $8 , %1 \n\t" -- "jnz 1b \n\t" -- -- "3: \n\t" -- "vzeroupper \n\t" -- -- : -- "+r" (i), // 0 -- "+r" (n) // 1 -- : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (alpha) // 8 -- : "cc", -- "%xmm4", "%xmm5", -- "%xmm6", "%xmm7", -- "%xmm8", "%xmm9", -- "%xmm12", "%xmm13", "%xmm14", "%xmm15", -- "memory" -- ); -- --} -- -- -diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c 
b/kernel/x86_64/dgemv_n_microk_piledriver-4.c -index 530780bab..466931b82 100644 ---- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c -+++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c -@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - __asm__ __volatile__ - ( - "vzeroupper \n\t" -- "vbroadcastsd (%2), %%ymm12 \n\t" // x0 -- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 -- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 -- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 -- "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 -- "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 -- "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 -- "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 -+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 - - "vbroadcastsd (%9), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - -- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y -+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - -- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" -- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ 
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - -- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y -+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - -- "addq $4 , %8 \n\t" -+ "addq $4 , %2 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - -@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y -- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y -- -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" -+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y -+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y -+ -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "addq $8 , %0 \n\t" -- "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" -- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" -- "vfmadd231pd (%7,%8,8), %%ymm3 , 
%%ymm4 \n\t" -- "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - -- "addq $8 , %8 \n\t" -+ "addq $8 , %2 \n\t" - "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y - "subq $8 , %1 \n\t" -- "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y -+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y - - "jnz 1b \n\t" - -@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", diff --git a/openblas.spec b/openblas.spec index 45cc85f..e28699d 100644 --- a/openblas.spec +++ b/openblas.spec @@ -14,8 +14,8 @@ # "obsoleted" features are still kept in the spec. 
Name: openblas -Version: 0.3.5 -Release: 5%{?dist} +Version: 0.3.6 +Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -29,18 +29,6 @@ Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.2-tests.patch -# Fix assembly code -Patch10: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2010.patch -Patch11: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2018.patch -Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2019.patch -Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch -Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch -Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch -Patch16: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2028.patch -Patch17: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1965.patch -Patch18: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1966.patch -Patch19: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1967.patch - BuildRequires: gcc BuildRequires: gcc-gfortran BuildRequires: perl-devel @@ -251,17 +239,6 @@ cd OpenBLAS-%{version} %endif %patch3 -p1 -b .tests -%patch10 -p1 -%patch11 -p1 -%patch12 -p1 -%patch13 -p1 -%patch14 -p1 -%patch15 -p1 -%patch16 -p1 -%patch17 -p1 -%patch18 -p1 -%patch19 -p1 - # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -697,6 +674,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Tue Apr 30 2019 Susi Lehtola - 0.3.6-1 +- Update to 0.3.6. + * Tue Feb 26 2019 Susi Lehtola - 0.3.5-5 - Even more assembly kernel patches. 
diff --git a/sources b/sources index e303585..a1a5ace 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openblas-0.3.5.tar.gz) = 91b3074eb922453bf843158b4281cde65db9e8bbdd7590e75e9e6cdcb486157f7973f2936f327bb3eb4f1702ce0ba51ae6729d8d4baf2d986c50771e8f696df0 +SHA512 (openblas-0.3.6.tar.gz) = 1ad980176a51f70d8b0b2d158da8c01f30f77b7cf385b24a6340d3c5feb1513bd04b9390487d05cc9557db7dc5f7c135b1688dec9f17ebef35dba884ef7ddee9 From 492f197de6ad8ad36a5ea2d127bf6802f4aa6599 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 2 Jul 2019 14:33:39 +0200 Subject: [PATCH 32/44] Bump spec --- openblas.spec | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index e28699d..8373c02 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.6 -Release: 1%{?dist} +Release: 2%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -674,6 +674,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Tue Jul 02 2019 Susi Lehtola - 0.3.6-2 +- Rebuild since older build doesn't show up in updates system. + * Tue Apr 30 2019 Susi Lehtola - 0.3.6-1 - Update to 0.3.6. 
From e16a28cf6f2a86e2ab7a1659011dd1299eb3fe82 Mon Sep 17 00:00:00 2001 From: Fedora Release Engineering Date: Thu, 25 Jul 2019 23:22:12 +0000 Subject: [PATCH 33/44] - Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild Signed-off-by: Fedora Release Engineering --- openblas.spec | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 8373c02..5065871 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.6 -Release: 2%{?dist} +Release: 3%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -674,6 +674,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Thu Jul 25 2019 Fedora Release Engineering - 0.3.6-3 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild + * Tue Jul 02 2019 Susi Lehtola - 0.3.6-2 - Rebuild since older build doesn't show up in updates system. From a94fde6be8ccad92dcf1b99ec89e42ff24ca2df6 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 12 Aug 2019 11:13:10 +0200 Subject: [PATCH 34/44] Update to 0.3.7. --- .gitignore | 1 + openblas.spec | 7 +++++-- sources | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 36744a3..5c1cb22 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ /v0.3.1.tar.gz /openblas-0.3.2.tar.gz /openblas-0.3.6.tar.gz +/openblas-0.3.7.tar.gz diff --git a/openblas.spec b/openblas.spec index 5065871..76f7fa5 100644 --- a/openblas.spec +++ b/openblas.spec @@ -14,8 +14,8 @@ # "obsoleted" features are still kept in the spec. Name: openblas -Version: 0.3.6 -Release: 3%{?dist} +Version: 0.3.7 +Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -674,6 +674,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Mon Aug 12 2019 Susi Lehtola - 0.3.7-1 +- Update to 0.3.7. 
+ * Thu Jul 25 2019 Fedora Release Engineering - 0.3.6-3 - Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild diff --git a/sources b/sources index a1a5ace..a7cb7bd 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openblas-0.3.6.tar.gz) = 1ad980176a51f70d8b0b2d158da8c01f30f77b7cf385b24a6340d3c5feb1513bd04b9390487d05cc9557db7dc5f7c135b1688dec9f17ebef35dba884ef7ddee9 +SHA512 (openblas-0.3.7.tar.gz) = 9c4898301c675471bbce2bb99b6bbe7c90724784fac06504416d4bd5da3cd4488f727b0a118c9a38ea342daac2af9e32597a847004241cc57de693b58b856262 From 5b3b53ac69456df626d14754c475657ae3f6d45a Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 12 Aug 2019 11:33:02 +0200 Subject: [PATCH 35/44] Update tests patch. --- ....2-tests.patch => openblas-0.3.7-tests.patch | 17 ++++++++++------- openblas.spec | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) rename openblas-0.3.2-tests.patch => openblas-0.3.7-tests.patch (54%) diff --git a/openblas-0.3.2-tests.patch b/openblas-0.3.7-tests.patch similarity index 54% rename from openblas-0.3.2-tests.patch rename to openblas-0.3.7-tests.patch index 0c75289..a3c78a9 100644 --- a/openblas-0.3.2-tests.patch +++ b/openblas-0.3.7-tests.patch @@ -1,18 +1,21 @@ -diff -up OpenBLAS-0.3.2/Makefile.tests OpenBLAS-0.3.2/Makefile ---- OpenBLAS-0.3.2/Makefile.tests 2018-08-02 14:12:01.615117002 +0200 -+++ OpenBLAS-0.3.2/Makefile 2018-08-02 14:13:29.582918971 +0200 -@@ -122,11 +122,11 @@ tests : +diff -up OpenBLAS-0.3.7/Makefile.tests OpenBLAS-0.3.7/Makefile +--- OpenBLAS-0.3.7/Makefile.tests 2019-08-11 23:23:27.000000000 +0200 ++++ OpenBLAS-0.3.7/Makefile 2019-08-12 11:32:09.937281485 +0200 +@@ -123,13 +123,13 @@ tests : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS - $(MAKE) -C test all -- $(MAKE) -C utest all + $(MAKE) -C test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all -+ $(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" 
FCOMMON_OPT="$(FCOMMON_OPT)" all endif +- $(MAKE) -C utest all ++ $(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all ifndef NO_CBLAS - $(MAKE) -C ctest all + $(MAKE) -C ctest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + ifeq ($(CPP_THREAD_SAFETY_TEST), 1) +- $(MAKE) -C cpp_thread_test all ++ $(MAKE) -C cpp_thread_test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif endif endif - diff --git a/openblas.spec b/openblas.spec index 76f7fa5..5102f4a 100644 --- a/openblas.spec +++ b/openblas.spec @@ -27,7 +27,7 @@ Patch1: openblas-0.2.5-libname.patch # Don't use constructor priorities on too old architectures Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile -Patch3: openblas-0.3.2-tests.patch +Patch3: openblas-0.3.7-tests.patch BuildRequires: gcc BuildRequires: gcc-gfortran From ed0e4ed4e573c5f8081fdfb79a8b258ac37bac41 Mon Sep 17 00:00:00 2001 From: Dominik 'Rathann' Mierzejewski Date: Wed, 11 Dec 2019 14:18:35 +0100 Subject: [PATCH 36/44] fix USEOPENMP/USE_OPENMP typo in Rblas make call --- openblas.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index 5102f4a..9f03e93 100644 --- a/openblas.spec +++ b/openblas.spec @@ -390,7 +390,7 @@ FCOMMON="%{optflags} -fPIC -frecursive" # Use Fedora linker flags export LDFLAGS="%{__global_ldflags}" -make -C Rblas $TARGET USE_THREAD=0 USEOPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libRblas" LIBSONAME="libRblas.so" $AVX $LAPACKE INTERFACE64=0 +make -C Rblas $TARGET USE_THREAD=0 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libRblas" LIBSONAME="libRblas.so" $AVX $LAPACKE INTERFACE64=0 # Declare some necessary build flags COMMON="%{optflags} -fPIC" From ee852fd091003f384a673df309de99f5bd022a28 Mon Sep 17 00:00:00 2001 From: 
Dominik 'Rathann' Mierzejewski Date: Wed, 11 Dec 2019 14:27:23 +0100 Subject: [PATCH 37/44] enable C++ thread safety test where applicable See: https://github.com/xianyi/OpenBLAS/blob/develop/Makefile#L131 --- openblas-0.3.7-tests.patch | 21 +++++++++++++++++++++ openblas.spec | 14 ++++++++++---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/openblas-0.3.7-tests.patch b/openblas-0.3.7-tests.patch index a3c78a9..fbe2fdb 100644 --- a/openblas-0.3.7-tests.patch +++ b/openblas-0.3.7-tests.patch @@ -19,3 +19,24 @@ diff -up OpenBLAS-0.3.7/Makefile.tests OpenBLAS-0.3.7/Makefile endif endif endif +diff -up OpenBLAS-0.3.7/cpp_thread_test/Makefile.tests OpenBLAS-0.3.7/cpp_thread_test/Makefile +--- OpenBLAS-0.3.7/cpp_thread_test/Makefile.tests 2019-08-11 19:23:00.000000000 +0000 ++++ OpenBLAS-0.3.7/cpp_thread_test/Makefile 2019-12-12 11:05:51.426334062 +0000 +@@ -1,13 +1,14 @@ +-include ../Makefile.rule ++TOPDIR = .. ++include $(TOPDIR)/Makefile.system + + all :: dgemv_tester dgemm_tester + + dgemv_tester : +- $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester ++ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) -lpthread -o dgemv_tester + ./dgemv_tester + + dgemm_tester : dgemv_tester +- $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester ++ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) -lpthread -o dgemm_tester + ./dgemm_tester + + clean :: diff --git a/openblas.spec b/openblas.spec index 9f03e93..c6924a3 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.7 -Release: 1%{?dist} +Release: 2%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -30,6 +30,7 @@ Patch2: 
openblas-0.2.15-constructor.patch Patch3: openblas-0.3.7-tests.patch BuildRequires: gcc +BuildRequires: gcc-c++ BuildRequires: gcc-gfortran BuildRequires: perl-devel BuildRequires: multilib-rpm-config @@ -71,8 +72,10 @@ Provides: bundled(lapack) = %{lapackver} # Build 64-bit interface binaries? %if 0%{?__isa_bits} == 64 %global build64 1 +%bcond_without cpp_thread_check %else %global build64 0 +%bcond_with cpp_thread_check %endif %if %{with system_lapack} @@ -401,7 +404,7 @@ make -C threaded $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_O # USE_THREAD determines use of SMP, not of pthreads COMMON="%{optflags} -fPIC -fopenmp -pthread" FCOMMON="$COMMON -frecursive" -make -C openmp $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso" $AVX $LAPACKE INTERFACE64=0 +make -C openmp $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso" $AVX $LAPACKE INTERFACE64=0 %{with cpp_thread_check:CPP_THREAD_SAFETY_TEST=1} %if %build64 COMMON="%{optflags} -fPIC" @@ -411,7 +414,7 @@ make -C threaded64 $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_O COMMON="%{optflags} -fPIC -fopenmp -pthread" FCOMMON="$COMMON -frecursive -fdefault-integer-8" -make -C openmp64 $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso64" $AVX $LAPACKE INTERFACE64=1 +make -C openmp64 $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso64" $AVX $LAPACKE INTERFACE64=1 CPP_THREAD_SAFETY_TEST=1 COMMON="%{optflags} -fPIC" FCOMMON="$COMMON -frecursive -fdefault-integer-8" @@ -420,7 +423,7 @@ make -C threaded64_ $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_ COMMON="%{optflags} -fPIC -fopenmp -pthread" FCOMMON="$COMMON -frecursive -fdefault-integer-8" -make -C openmp64_ $TARGET 
USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ +make -C openmp64_ $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ CPP_THREAD_SAFETY_TEST=1 %endif %install @@ -674,6 +677,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Wed Dec 11 2019 Dominik Mierzejewski - 0.3.7-2 +- enable C++ thread safety test where possible + * Mon Aug 12 2019 Susi Lehtola - 0.3.7-1 - Update to 0.3.7. From 46e517974f5c553502a2bb799603795a760b19e7 Mon Sep 17 00:00:00 2001 From: Fedora Release Engineering Date: Wed, 29 Jan 2020 20:12:03 +0000 Subject: [PATCH 38/44] - Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild Signed-off-by: Fedora Release Engineering --- openblas.spec | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openblas.spec b/openblas.spec index c6924a3..3f6e674 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.7 -Release: 2%{?dist} +Release: 3%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -677,6 +677,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Wed Jan 29 2020 Fedora Release Engineering - 0.3.7-3 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild + * Wed Dec 11 2019 Dominik Mierzejewski - 0.3.7-2 - enable C++ thread safety test where possible From 33580586399961ff1894e1dc633525d8707a79a4 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 11 Feb 2020 02:38:00 +0100 Subject: [PATCH 39/44] Update to 0.3.8, featuring dynamic cpu detection on all architectures. 
--- .gitignore | 1 + openblas.spec | 59 +++++++++++++++------------------------------------ sources | 2 +- 3 files changed, 19 insertions(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index 5c1cb22..1fabb10 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ /openblas-0.3.2.tar.gz /openblas-0.3.6.tar.gz /openblas-0.3.7.tar.gz +/openblas-0.3.8.tar.gz diff --git a/openblas.spec b/openblas.spec index 3f6e674..9d6f368 100644 --- a/openblas.spec +++ b/openblas.spec @@ -1,6 +1,6 @@ %bcond_with system_lapack # Version of bundled lapack -%global lapackver 3.8.0 +%global lapackver 3.9.0 # DO NOT "CLEAN UP" OR MODIFY THIS SPEC FILE WITHOUT ASKING THE # MAINTAINER FIRST! @@ -14,8 +14,8 @@ # "obsoleted" features are still kept in the spec. Name: openblas -Version: 0.3.7 -Release: 3%{?dist} +Version: 0.3.8 +Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -364,22 +364,22 @@ export AVX="NO_AVX2=1" %endif %ifarch armv7hl -TARGET="TARGET=ARMV7 DYNAMIC_ARCH=0" +TARGET="TARGET=ARMV7 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" %endif %ifarch ppc64 -TARGET="TARGET=POWER6 DYNAMIC_ARCH=0" +TARGET="TARGET=POWER6 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" %endif %ifarch ppc64p7 -TARGET="TARGET=POWER7 DYNAMIC_ARCH=0" +TARGET="TARGET=POWER7 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" %endif %ifarch ppc64le -TARGET="TARGET=POWER8 DYNAMIC_ARCH=0" +TARGET="TARGET=POWER8 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" %endif %ifarch aarch64 -TARGET="TARGET=ARMV8 DYNAMIC_ARCH=1" +TARGET="TARGET=ARMV8 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" %endif %ifarch s390x -TARGET="TARGET=ZARCH_GENERIC DYNAMIC_ARCH=0" +TARGET="TARGET=ZARCH_GENERIC DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" %endif %if 0%{?rhel} == 5 @@ -440,33 +440,9 @@ cp -a %{_includedir}/lapacke %{buildroot}%{_includedir}/%{name} %multilib_fix_c_header --file %{_includedir}/openblas/openblas_config.h # Fix name of libraries -suffix="" -%ifarch armv7hl -suffix="_armv7" -%endif -%ifarch ppc64 -suffix="_power6" 
-%endif -%ifarch ppc64p7 -suffix="_power7" -%endif -%ifarch ppc64le -suffix="_power8" -%endif -%ifarch aarch64 -# Runtime CPU detection, no suffix -%endif -%ifarch s390x -suffix="_zarch_generic" -%endif -slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` +slibname=`basename %{buildroot}%{_libdir}/libopenblas-*.so .so` mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a -if [[ "$suffix" != "" ]]; then - sname=$(echo $slibname | sed "s|$suffix||g") - mv %{buildroot}%{_libdir}/${slibname}.so %{buildroot}%{_libdir}/${sname}.so -else - sname=${slibname} -fi +sname=${slibname} # Install the Rblas library mkdir -p %{buildroot}%{_libdir}/R/lib/ @@ -499,13 +475,8 @@ install -D -p -m 644 serial64/${slibname64}.a %{buildroot}%{_libdir}/lib%{name}6 slibname64_=`echo ${slibname} | sed "s|lib%{name}|lib%{name}64_|g"` install -D -p -m 644 serial64_/${slibname64_}.a %{buildroot}%{_libdir}/lib%{name}64_.a -if [[ "$suffix" != "" ]]; then - sname64=$(echo ${slibname64} | sed "s|$suffix||g") - sname64_=$(echo ${slibname64_} | sed "s|$suffix||g") -else - sname64=${slibname64} - sname64_=${slibname64_} -fi +sname64=${slibname64} +sname64_=${slibname64_} install -D -p -m 755 serial64/${slibname64}.so %{buildroot}%{_libdir}/${sname64}.so install -D -p -m 755 serial64_/${slibname64_}.so %{buildroot}%{_libdir}/${sname64_}.so @@ -677,6 +648,10 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Tue Feb 11 2020 Susi Lehtola - 0.3.8-1 +- Update to 0.3.8; dynamic runtime cpu detection on all architectures. +- Also updates bundled LAPACK to 3.9.0. 
+ * Wed Jan 29 2020 Fedora Release Engineering - 0.3.7-3 - Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild diff --git a/sources b/sources index a7cb7bd..1fd6105 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openblas-0.3.7.tar.gz) = 9c4898301c675471bbce2bb99b6bbe7c90724784fac06504416d4bd5da3cd4488f727b0a118c9a38ea342daac2af9e32597a847004241cc57de693b58b856262 +SHA512 (openblas-0.3.8.tar.gz) = d557a332b1f905399d97dd5392ca10ca4eed47d669cae4abea374ed7c2e6c1ab29a4415df1224e940b9041e1545fa5ede2bdfb266986230436014795e7d3289d From 7dd89ce67cc535cd637ec91e415b661125bcf361 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 11 Feb 2020 02:48:35 +0100 Subject: [PATCH 40/44] Fix typo in s390x code. --- openblas-0.3.8-zarch.patch | 22 ++++++++++++++++++++++ openblas.spec | 4 ++++ 2 files changed, 26 insertions(+) create mode 100644 openblas-0.3.8-zarch.patch diff --git a/openblas-0.3.8-zarch.patch b/openblas-0.3.8-zarch.patch new file mode 100644 index 0000000..002cbf9 --- /dev/null +++ b/openblas-0.3.8-zarch.patch @@ -0,0 +1,22 @@ +From dff173e50e01d94e0741e4b4eaa1cf0aa01cf320 Mon Sep 17 00:00:00 2001 +From: Susi Lehtola +Date: Tue, 11 Feb 2020 14:46:30 +1300 +Subject: [PATCH] Fix typo in dynamic_zarch.c + +--- + driver/others/dynamic_zarch.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c +index 1206bf870..896e65bb4 100644 +--- a/driver/others/dynamic_zarch.c ++++ b/driver/others/dynamic_zarch.c +@@ -31,7 +31,7 @@ char* gotoblas_corename(void) { + } + + // __builtin_cpu_is is not supported by zarch +-static gotolabs_t* get_coretype(void) { ++static gotoblas_t* get_coretype(void) { + FILE* infile; + char buffer[512], * p; + diff --git a/openblas.spec b/openblas.spec index 9d6f368..a25c3b0 100644 --- a/openblas.spec +++ b/openblas.spec @@ -29,6 +29,9 @@ Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: 
openblas-0.3.7-tests.patch +# https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2405.patch +Patch4: openblas-0.3.8-zarch.patch + BuildRequires: gcc BuildRequires: gcc-c++ BuildRequires: gcc-gfortran @@ -241,6 +244,7 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests +%patch4 -p1 -b .zarch # Fix source permissions find -name \*.f -exec chmod 644 {} \; From 4d572b6acaae00a6b3607baf6f78446ec6a1a14d Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 11 Feb 2020 03:09:46 +0100 Subject: [PATCH 41/44] No Z15 kernel appears to exist for now. --- openblas-0.3.8-noz15.patch | 55 ++++++++++++++++++++++++++++++++++++++ openblas.spec | 3 +++ 2 files changed, 58 insertions(+) create mode 100644 openblas-0.3.8-noz15.patch diff --git a/openblas-0.3.8-noz15.patch b/openblas-0.3.8-noz15.patch new file mode 100644 index 0000000..fbb6bd5 --- /dev/null +++ b/openblas-0.3.8-noz15.patch @@ -0,0 +1,55 @@ +From 5a6bba3061f19923eb9972378021e6498bf8e5ed Mon Sep 17 00:00:00 2001 +From: Susi Lehtola +Date: Tue, 11 Feb 2020 15:07:33 +1300 +Subject: [PATCH] Patch out instances of Z15 in dynamic_zarch.c + +There does not appear to be a Z15 kernel yet, causing link errors from the code. This patch fixes the issue. 
+--- + driver/others/dynamic_zarch.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c +index 1206bf870..c7b82e4df 100644 +--- a/driver/others/dynamic_zarch.c ++++ b/driver/others/dynamic_zarch.c +@@ -3,12 +3,12 @@ + + extern gotoblas_t gotoblas_Z13; + extern gotoblas_t gotoblas_Z14; +-extern gotoblas_t gotoblas_Z15; ++//extern gotoblas_t gotoblas_Z15; + //#if (!defined C_GCC) || (GCC_VERSION >= 60000) + //extern gotoblas_t gotoblas_Z14; + //#endif + +-#define NUM_CORETYPES 5 ++#define NUM_CORETYPES 4 + + extern void openblas_warning(int verbose, const char* msg); + +@@ -16,14 +16,14 @@ static char* corename[] = { + "unknown", + "Z13", + "Z14", +- "Z15", ++// "Z15", + "ZARCH_GENERIC", + }; + + char* gotoblas_corename(void) { + if (gotoblas == &gotoblas_Z13) return corename[1]; + if (gotoblas == &gotoblas_Z14) return corename[2]; +- if (gotoblas == &gotoblas_Z15) return corename[3]; ++// if (gotoblas == &gotoblas_Z15) return corename[3]; + //#if (!defined C_GCC) || (GCC_VERSION >= 60000) + // if (gotoblas == &gotoblas_POWER9) return corename[3]; + //#endif +@@ -78,7 +78,7 @@ static gotoblas_t* force_coretype(char* coretype) { + { + case 1: return (&gotoblas_Z13); + case 2: return (&gotoblas_Z14); +- case 3: return (&gotoblas_Z15); ++// case 3: return (&gotoblas_Z15); + //#if (!defined C_GCC) || (GCC_VERSION >= 60000) + // case 3: return (&gotoblas_POWER9); + //#endif diff --git a/openblas.spec b/openblas.spec index a25c3b0..e580bab 100644 --- a/openblas.spec +++ b/openblas.spec @@ -31,6 +31,8 @@ Patch3: openblas-0.3.7-tests.patch # https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2405.patch Patch4: openblas-0.3.8-zarch.patch +# https://github.com/xianyi/OpenBLAS/pull/2407 +Patch5: openblas-0.3.8-noz15.patch BuildRequires: gcc BuildRequires: gcc-c++ @@ -245,6 +247,7 @@ cd OpenBLAS-%{version} %endif %patch3 -p1 -b .tests %patch4 -p1 -b .zarch +%patch5 -p1 -b 
.noz15 # Fix source permissions find -name \*.f -exec chmod 644 {} \; From f3b68ef59a7a48b494ec725d9aec8e164ea6a3c5 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 11 Feb 2020 21:35:50 +0100 Subject: [PATCH 42/44] ARMv7 still doesn't have runtime cpu detection. --- openblas.spec | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/openblas.spec b/openblas.spec index e580bab..6e4b99e 100644 --- a/openblas.spec +++ b/openblas.spec @@ -371,7 +371,8 @@ export AVX="NO_AVX2=1" %endif %ifarch armv7hl -TARGET="TARGET=ARMV7 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" +# ARM v7 still doesn't have runtime cpu detection... +TARGET="TARGET=ARMV7 DYNAMIC_ARCH=0" %endif %ifarch ppc64 TARGET="TARGET=POWER6 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" @@ -446,10 +447,20 @@ cp -a %{_includedir}/lapacke %{buildroot}%{_includedir}/%{name} # Fix i686-x86_64 multilib difference %multilib_fix_c_header --file %{_includedir}/openblas/openblas_config.h -# Fix name of libraries -slibname=`basename %{buildroot}%{_libdir}/libopenblas-*.so .so` +# Fix name of libraries: runtime CPU detection has none +suffix="" +# but archs that don't have it do have one +%ifarch armv7hl +suffix="_armv7" +%endif +slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a -sname=${slibname} +if [[ "$suffix" != "" ]]; then + sname=$(echo $slibname | sed "s|$suffix||g") + mv %{buildroot}%{_libdir}/${slibname}.so %{buildroot}%{_libdir}/${sname}.so +else + sname=${slibname} +fi # Install the Rblas library mkdir -p %{buildroot}%{_libdir}/R/lib/ @@ -482,8 +493,13 @@ install -D -p -m 644 serial64/${slibname64}.a %{buildroot}%{_libdir}/lib%{name}6 slibname64_=`echo ${slibname} | sed "s|lib%{name}|lib%{name}64_|g"` install -D -p -m 644 serial64_/${slibname64_}.a %{buildroot}%{_libdir}/lib%{name}64_.a -sname64=${slibname64} -sname64_=${slibname64_} +if [[ "$suffix" != "" ]]; then + sname64=$(echo 
${slibname64} | sed "s|$suffix||g") + sname64_=$(echo ${slibname64_} | sed "s|$suffix||g") +else + sname64=${slibname64} + sname64_=${slibname64_} +fi install -D -p -m 755 serial64/${slibname64}.so %{buildroot}%{_libdir}/${sname64}.so install -D -p -m 755 serial64_/${slibname64_}.so %{buildroot}%{_libdir}/${sname64_}.so From 99cb63155f6126d4049dba0284ad25c2e8b01735 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Mon, 2 Mar 2020 06:00:45 +0100 Subject: [PATCH 43/44] Update to 0.3.9. --- .gitignore | 1 + openblas-0.3.8-noz15.patch | 55 -------------------------------------- openblas-0.3.8-zarch.patch | 22 --------------- openblas.spec | 12 +++------ sources | 2 +- 5 files changed, 6 insertions(+), 86 deletions(-) delete mode 100644 openblas-0.3.8-noz15.patch delete mode 100644 openblas-0.3.8-zarch.patch diff --git a/.gitignore b/.gitignore index 1fabb10..83cd3f9 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ /openblas-0.3.6.tar.gz /openblas-0.3.7.tar.gz /openblas-0.3.8.tar.gz +/openblas-0.3.9.tar.gz diff --git a/openblas-0.3.8-noz15.patch b/openblas-0.3.8-noz15.patch deleted file mode 100644 index fbb6bd5..0000000 --- a/openblas-0.3.8-noz15.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 5a6bba3061f19923eb9972378021e6498bf8e5ed Mon Sep 17 00:00:00 2001 -From: Susi Lehtola -Date: Tue, 11 Feb 2020 15:07:33 +1300 -Subject: [PATCH] Patch out instances of Z15 in dynamic_zarch.c - -There does not appear to be a Z15 kernel yet, causing link errors from the code. This patch fixes the issue. 
---- - driver/others/dynamic_zarch.c | 10 +++++----- - 1 file changed, 5 insertions(+), 5 deletions(-) - -diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c -index 1206bf870..c7b82e4df 100644 ---- a/driver/others/dynamic_zarch.c -+++ b/driver/others/dynamic_zarch.c -@@ -3,12 +3,12 @@ - - extern gotoblas_t gotoblas_Z13; - extern gotoblas_t gotoblas_Z14; --extern gotoblas_t gotoblas_Z15; -+//extern gotoblas_t gotoblas_Z15; - //#if (!defined C_GCC) || (GCC_VERSION >= 60000) - //extern gotoblas_t gotoblas_Z14; - //#endif - --#define NUM_CORETYPES 5 -+#define NUM_CORETYPES 4 - - extern void openblas_warning(int verbose, const char* msg); - -@@ -16,14 +16,14 @@ static char* corename[] = { - "unknown", - "Z13", - "Z14", -- "Z15", -+// "Z15", - "ZARCH_GENERIC", - }; - - char* gotoblas_corename(void) { - if (gotoblas == &gotoblas_Z13) return corename[1]; - if (gotoblas == &gotoblas_Z14) return corename[2]; -- if (gotoblas == &gotoblas_Z15) return corename[3]; -+// if (gotoblas == &gotoblas_Z15) return corename[3]; - //#if (!defined C_GCC) || (GCC_VERSION >= 60000) - // if (gotoblas == &gotoblas_POWER9) return corename[3]; - //#endif -@@ -78,7 +78,7 @@ static gotoblas_t* force_coretype(char* coretype) { - { - case 1: return (&gotoblas_Z13); - case 2: return (&gotoblas_Z14); -- case 3: return (&gotoblas_Z15); -+// case 3: return (&gotoblas_Z15); - //#if (!defined C_GCC) || (GCC_VERSION >= 60000) - // case 3: return (&gotoblas_POWER9); - //#endif diff --git a/openblas-0.3.8-zarch.patch b/openblas-0.3.8-zarch.patch deleted file mode 100644 index 002cbf9..0000000 --- a/openblas-0.3.8-zarch.patch +++ /dev/null @@ -1,22 +0,0 @@ -From dff173e50e01d94e0741e4b4eaa1cf0aa01cf320 Mon Sep 17 00:00:00 2001 -From: Susi Lehtola -Date: Tue, 11 Feb 2020 14:46:30 +1300 -Subject: [PATCH] Fix typo in dynamic_zarch.c - ---- - driver/others/dynamic_zarch.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/driver/others/dynamic_zarch.c 
b/driver/others/dynamic_zarch.c -index 1206bf870..896e65bb4 100644 ---- a/driver/others/dynamic_zarch.c -+++ b/driver/others/dynamic_zarch.c -@@ -31,7 +31,7 @@ char* gotoblas_corename(void) { - } - - // __builtin_cpu_is is not supported by zarch --static gotolabs_t* get_coretype(void) { -+static gotoblas_t* get_coretype(void) { - FILE* infile; - char buffer[512], * p; - diff --git a/openblas.spec b/openblas.spec index 6e4b99e..8c78853 100644 --- a/openblas.spec +++ b/openblas.spec @@ -14,7 +14,7 @@ # "obsoleted" features are still kept in the spec. Name: openblas -Version: 0.3.8 +Version: 0.3.9 Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD @@ -29,11 +29,6 @@ Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.7-tests.patch -# https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2405.patch -Patch4: openblas-0.3.8-zarch.patch -# https://github.com/xianyi/OpenBLAS/pull/2407 -Patch5: openblas-0.3.8-noz15.patch - BuildRequires: gcc BuildRequires: gcc-c++ BuildRequires: gcc-gfortran @@ -246,8 +241,6 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests -%patch4 -p1 -b .zarch -%patch5 -p1 -b .noz15 # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -671,6 +664,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Mon Mar 02 2020 Susi Lehtola - 0.3.9-1 +- Update to 0.3.9. + * Tue Feb 11 2020 Susi Lehtola - 0.3.8-1 - Update to 0.3.8; dynamic runtime cpu detection on all architectures. - Also updates bundled LAPACK to 3.9.0. 
diff --git a/sources b/sources index 1fd6105..3587dc1 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openblas-0.3.8.tar.gz) = d557a332b1f905399d97dd5392ca10ca4eed47d669cae4abea374ed7c2e6c1ab29a4415df1224e940b9041e1545fa5ede2bdfb266986230436014795e7d3289d +SHA512 (openblas-0.3.9.tar.gz) = e34da25b3aaf959ec12826ac68c81e739e453d44f2dba28b15e57d7a827edc4d5f42988e9b6d98ac07999940be7b5876246cb3a980e590ae87f77f4c2f12f40a From 0dc13c877efb1ce89b278d09691945d3b13e2010 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Thu, 2 Apr 2020 14:00:31 +0200 Subject: [PATCH 44/44] Patch for C++ compatibility. --- ...758278b5d82b7242f505ea694f082ef65879.patch | 73 +++++++++++++++++++ openblas.spec | 9 ++- 2 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 ee2e758278b5d82b7242f505ea694f082ef65879.patch diff --git a/ee2e758278b5d82b7242f505ea694f082ef65879.patch b/ee2e758278b5d82b7242f505ea694f082ef65879.patch new file mode 100644 index 0000000..e4fc0d4 --- /dev/null +++ b/ee2e758278b5d82b7242f505ea694f082ef65879.patch @@ -0,0 +1,73 @@ +From ee2e758278b5d82b7242f505ea694f082ef65879 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 13 Mar 2020 20:34:13 +0100 +Subject: [PATCH] Move declarations of lapack_complex_custom types outside the + extern C + +fixes #2510 +--- + lapack-netlib/LAPACKE/include/lapack.h | 44 ++++++++++++++------------ + 1 file changed, 23 insertions(+), 21 deletions(-) + +diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h +index 0a6226fe4..36e53ec24 100644 +--- a/lapack-netlib/LAPACKE/include/lapack.h ++++ b/lapack-netlib/LAPACKE/include/lapack.h +@@ -12,27 +12,6 @@ + + #include <stdlib.h> + +-#ifdef __cplusplus +-extern "C" { +-#endif +- +-/*----------------------------------------------------------------------------*/ +-#ifndef lapack_int +-#define lapack_int int +-#endif +- +-#ifndef lapack_logical +-#define lapack_logical lapack_int +-#endif +- +-/* f2c, hence clapack and MacOS Accelerate, returns
double instead of float +- * for sdot, slange, clange, etc. */ +-#if defined(LAPACK_F2C) +- typedef double lapack_float_return; +-#else +- typedef float lapack_float_return; +-#endif +- + /* Complex types are structures equivalent to the + * Fortran complex types COMPLEX(4) and COMPLEX(8). + * +@@ -88,6 +67,29 @@ extern "C" { + + #endif /* LAPACK_COMPLEX_CUSTOM */ + ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/*----------------------------------------------------------------------------*/ ++#ifndef lapack_int ++#define lapack_int int ++#endif ++ ++#ifndef lapack_logical ++#define lapack_logical lapack_int ++#endif ++ ++/* f2c, hence clapack and MacOS Accelerate, returns double instead of float ++ * for sdot, slange, clange, etc. */ ++#if defined(LAPACK_F2C) ++ typedef double lapack_float_return; ++#else ++ typedef float lapack_float_return; ++#endif ++ ++ + /* Callback logical functions of one, two, or three arguments are used + * to select eigenvalues to sort to the top left of the Schur form. + * The value is selected if function returns TRUE (non-zero). 
*/ diff --git a/openblas.spec b/openblas.spec index 8c78853..4016e83 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.9 -Release: 1%{?dist} +Release: 2%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -29,6 +29,9 @@ Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.7-tests.patch +# Fix C++ compatibility (BZ #1820131) +Patch4: https://github.com/xianyi/OpenBLAS/commit/ee2e758278b5d82b7242f505ea694f082ef65879.patch + BuildRequires: gcc BuildRequires: gcc-c++ BuildRequires: gcc-gfortran @@ -241,6 +244,7 @@ cd OpenBLAS-%{version} %patch2 -p1 -b .constructor %endif %patch3 -p1 -b .tests +%patch4 -p1 -b .cplusplus # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -664,6 +668,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Thu Apr 02 2020 Susi Lehtola - 0.3.9-2 +- Patch for BZ #1820131. + * Mon Mar 02 2020 Susi Lehtola - 0.3.9-1 - Update to 0.3.9.