From 64c2df1d8536381f8af32d18071f9a3af81821dc Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Tue, 30 Apr 2019 12:00:26 +0200 Subject: [PATCH] Update to 0.3.6. --- .gitignore | 1 + 1965.patch | 3283 ------------------------------------------------- 1966.patch | 960 --------------- 1967.patch | 99 -- 2010.patch | 499 -------- 2018.patch | 27 - 2019.patch | 274 ----- 2021.patch | 255 ---- 2023.patch | 874 ------------- 2024.patch | 1349 -------------------- 2028.patch | 412 ------- openblas.spec | 30 +- sources | 2 +- 13 files changed, 7 insertions(+), 8058 deletions(-) delete mode 100644 1965.patch delete mode 100644 1966.patch delete mode 100644 1967.patch delete mode 100644 2010.patch delete mode 100644 2018.patch delete mode 100644 2019.patch delete mode 100644 2021.patch delete mode 100644 2023.patch delete mode 100644 2024.patch delete mode 100644 2028.patch diff --git a/.gitignore b/.gitignore index 9b6016d..36744a3 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ /v0.3.0.tar.gz /v0.3.1.tar.gz /openblas-0.3.2.tar.gz +/openblas-0.3.6.tar.gz diff --git a/1965.patch b/1965.patch deleted file mode 100644 index 5d8b935..0000000 --- a/1965.patch +++ /dev/null @@ -1,3283 +0,0 @@ -From f0dd0584306b42289cac77fdafe6997e449d4f38 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:56:10 +0100 -Subject: [PATCH 001/111] Tag operands 0 and 1 as both input and output - -For #1964 (basically a continuation of coding problems first seen in #1292) ---- - kernel/x86_64/caxpy_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c -index 33bda0943..cb98f208a 100644 ---- a/kernel/x86_64/caxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c -@@ -115,8 +115,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 8a6bbf5a5bf4623795b2ff9aaa8d35467288d6c7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:57:27 +0100 -Subject: [PATCH 002/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/caxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c -index 00e2e6a42..f31cf9710 100644 ---- a/kernel/x86_64/caxpy_microk_haswell-2.c -+++ b/kernel/x86_64/caxpy_microk_haswell-2.c -@@ -113,8 +113,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 4e6f8fec31e83648c77c47398829b5191e671966 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:58:19 +0100 -Subject: [PATCH 003/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/caxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c -index a798fd977..931d1ad47 100644 ---- a/kernel/x86_64/caxpy_microk_sandy-2.c -+++ b/kernel/x86_64/caxpy_microk_sandy-2.c -@@ -97,8 +97,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 663eef3b666e79c0e93f35cf79eada50040d9dd3 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:59:59 +0100 -Subject: [PATCH 004/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/caxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c -index 87370b032..9aeb47968 100644 ---- a/kernel/x86_64/caxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c -@@ -115,8 +115,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From a671e19dd2cad6dc1e2e639f45a4faebf53b6f7f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:02:01 +0100 -Subject: [PATCH 005/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c -index f587aa036..e6d11f1af 100644 ---- a/kernel/x86_64/cdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/cdot_microk_bulldozer-2.c -@@ -98,8 +98,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -177,8 +177,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 47e2b4592eb31860a58222bedc8a3208c153aa00 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:03:03 +0100 -Subject: [PATCH 006/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c -index fe195a63b..9fee7615d 100644 ---- a/kernel/x86_64/cdot_microk_haswell-2.c -+++ b/kernel/x86_64/cdot_microk_haswell-2.c -@@ -99,8 +99,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 30a7bd8e15fb68d3fa651bbf48e1e65fc6078090 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:03:50 +0100 -Subject: [PATCH 007/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c -index 01816917d..705c80c5c 100644 ---- a/kernel/x86_64/cdot_microk_sandy-2.c -+++ b/kernel/x86_64/cdot_microk_sandy-2.c -@@ -107,8 +107,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 2f5a7c1656b7975f71db2b8da90080938ccd3757 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:05:03 +0100 -Subject: [PATCH 008/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c -index 76a3aa0eb..5a46aed8c 100644 ---- a/kernel/x86_64/cdot_microk_steamroller-2.c -+++ b/kernel/x86_64/cdot_microk_steamroller-2.c -@@ -98,8 +98,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -177,8 +177,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From bb16456fe1ff372b61a7ab042418248f68ddddc6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:23:58 +0100 -Subject: [PATCH 009/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c -index 8c520dcf1..c9a01580e 100644 ---- a/kernel/x86_64/daxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c -@@ -65,8 +65,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 7af8f34df4efcc0ecaaa34c380119edcd5d206de Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:24:55 +0100 -Subject: [PATCH 010/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c -index bbe8b9550..67431659d 100644 ---- a/kernel/x86_64/daxpy_microk_haswell-2.c -+++ b/kernel/x86_64/daxpy_microk_haswell-2.c -@@ -61,8 +61,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From d94e7da701dae1106854753b2d5b676255c1c0f4 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:25:56 +0100 -Subject: [PATCH 011/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c -index 943d893af..61c99904a 100644 ---- a/kernel/x86_64/daxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c -@@ -74,8 +74,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6008f6531855d615ad98febe65364074b99fa5bf Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:26:55 +0100 -Subject: [PATCH 012/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c -index 95eb953b4..e3d605b75 100644 ---- a/kernel/x86_64/daxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/daxpy_microk_piledriver-2.c -@@ -80,8 +80,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -142,8 +142,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 9d46f84f24dc7284fc398574b811621e5c61e2dc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:27:48 +0100 -Subject: [PATCH 013/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c -index 85e038cef..1b827e7e2 100644 ---- a/kernel/x86_64/daxpy_microk_sandy-2.c -+++ b/kernel/x86_64/daxpy_microk_sandy-2.c -@@ -101,8 +101,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From ca02ac724f5b06e16a8941ef3b2582c251234679 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:28:56 +0100 -Subject: [PATCH 014/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c -index e40009037..2cab80067 100644 ---- a/kernel/x86_64/daxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/daxpy_microk_steamroller-2.c -@@ -80,8 +80,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -142,8 +142,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From c18c2c9d9b0cd7e82cb98c7b212ffb29648fb9e0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:32:17 +0100 -Subject: [PATCH 015/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c -index 9756ee46a..379fd3ca1 100644 ---- a/kernel/x86_64/ddot_microk_bulldozer-2.c -+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c -@@ -67,8 +67,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From c23c17163f1b7a5fb7652cbc038a50c01f9440c5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:33:07 +0100 -Subject: [PATCH 016/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c -index 365737363..c0c277c32 100644 ---- a/kernel/x86_64/ddot_microk_haswell-2.c -+++ b/kernel/x86_64/ddot_microk_haswell-2.c -@@ -78,8 +78,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From de207d10c1f11ef1f38b4f766909619ab744d64a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:34:05 +0100 -Subject: [PATCH 017/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c -index fb5ec9bca..ea0b4eff1 100644 ---- a/kernel/x86_64/ddot_microk_nehalem-2.c -+++ b/kernel/x86_64/ddot_microk_nehalem-2.c -@@ -77,8 +77,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From c9078eb8b4481fbc1841bcbf36ba438bf2749632 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:35:14 +0100 -Subject: [PATCH 018/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c -index ac950885c..f7b74add6 100644 ---- a/kernel/x86_64/ddot_microk_piledriver-2.c -+++ b/kernel/x86_64/ddot_microk_piledriver-2.c -@@ -83,8 +83,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -147,8 +147,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 00aff05c4049cd697b4000b5f2e726496b34dc54 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:36:08 +0100 -Subject: [PATCH 019/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c -index 160f95604..e57eb37ea 100644 ---- a/kernel/x86_64/ddot_microk_sandy-2.c -+++ b/kernel/x86_64/ddot_microk_sandy-2.c -@@ -83,8 +83,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From dc15f3b5a7689a6cea1d31e004d7a3488bf9b66d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:37:06 +0100 -Subject: [PATCH 020/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c -index 5ce20b5de..845c78df1 100644 ---- a/kernel/x86_64/ddot_microk_steamroller-2.c -+++ b/kernel/x86_64/ddot_microk_steamroller-2.c -@@ -80,8 +80,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 3f1719a98da89f0a6f1d435d3f705aa083702ac7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:39:08 +0100 -Subject: [PATCH 021/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c -index 3a743d64c..3b03e11a4 100644 ---- a/kernel/x86_64/saxpy_microk_haswell-2.c -+++ b/kernel/x86_64/saxpy_microk_haswell-2.c -@@ -61,8 +61,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From b13f3c3bcfffcecbcc80454c90c31bc05dd5a04d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:39:57 +0100 -Subject: [PATCH 022/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c -index 68f68ea3a..4ffb39acf 100644 ---- a/kernel/x86_64/saxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c -@@ -74,8 +74,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 2bd18c7b73731d1b8bd900213fc7fa7a2356a357 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:40:50 +0100 -Subject: [PATCH 023/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c -index 204cf8bac..87c5fe3cf 100644 ---- a/kernel/x86_64/saxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c -@@ -80,8 +80,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -141,8 +141,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6fcb55b22f6e8b80e7f6ffcf228c70c0929915b5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:41:41 +0100 -Subject: [PATCH 024/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c -index 0a6bef046..5a8424d66 100644 ---- a/kernel/x86_64/saxpy_microk_sandy-2.c -+++ b/kernel/x86_64/saxpy_microk_sandy-2.c -@@ -101,8 +101,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 922e44897831f393cbeeb1406feb7fcf6e320281 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:42:35 +0100 -Subject: [PATCH 025/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c -index 36e61b077..5a6fc6da2 100644 ---- a/kernel/x86_64/sdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c -@@ -68,8 +68,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From d384880da564344e92a8d60b08e3183ab02ba75b Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:43:24 +0100 -Subject: [PATCH 026/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c -index df367b61f..89d9cfe61 100644 ---- a/kernel/x86_64/sdot_microk_haswell-2.c -+++ b/kernel/x86_64/sdot_microk_haswell-2.c -@@ -81,8 +81,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From cd3a35ee79b4b5fa00e5a446be2a6cceb3230874 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:44:13 +0100 -Subject: [PATCH 027/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c -index 1a27177f5..cef41b530 100644 ---- a/kernel/x86_64/sdot_microk_nehalem-2.c -+++ b/kernel/x86_64/sdot_microk_nehalem-2.c -@@ -77,8 +77,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ba9f792e759ea97e75445b1fe1eaab4f3432f4f1 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:45:08 +0100 -Subject: [PATCH 028/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c -index ca13536f2..e77ba1424 100644 ---- a/kernel/x86_64/sdot_microk_sandy-2.c -+++ b/kernel/x86_64/sdot_microk_sandy-2.c -@@ -84,8 +84,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From c931bb8172bbdcbcfe6d2de281d2f83a7f5a3515 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:46:19 +0100 -Subject: [PATCH 029/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c -index 6b8b2566b..bedde8fb6 100644 ---- a/kernel/x86_64/sdot_microk_steamroller-2.c -+++ b/kernel/x86_64/sdot_microk_steamroller-2.c -@@ -82,8 +82,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -145,8 +145,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 0172c51829110a5450b4d6d5f454bd4aa4106269 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:48:16 +0100 -Subject: [PATCH 030/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -index 0e15761f7..56493f8cb 100644 ---- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -@@ -115,8 +115,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 0cfb647a577058cebeaabadbe6ef62eebd2ce49e Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:51:34 +0100 -Subject: [PATCH 031/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c -index 30e8b1955..bd52ba01f 100644 ---- a/kernel/x86_64/zaxpy_microk_haswell-2.c -+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c -@@ -113,8 +113,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 2b542d10368cbb8433b7274fb12b77845606d2fe Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:52:35 +0100 -Subject: [PATCH 032/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c -index 233af143a..d6a9ff394 100644 ---- a/kernel/x86_64/zaxpy_microk_sandy-2.c -+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c -@@ -101,8 +101,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -178,8 +178,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From af29c99c85d9ea5c27b6e917ebb1dcdbe1292f7b Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:53:29 +0100 -Subject: [PATCH 033/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c -index 728d09213..58d4c7286 100644 ---- a/kernel/x86_64/zaxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c -@@ -115,8 +115,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From f78531a9ec8ee28f7790505382231b3f5094b795 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:05:31 +0100 -Subject: [PATCH 034/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c -index 30a9552d6..ed66cc674 100644 ---- a/kernel/x86_64/zdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/zdot_microk_bulldozer-2.c -@@ -98,8 +98,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -177,8 +177,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From b6f4ef5aea58e5ea1225283e406cadf9416818fc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:06:54 +0100 -Subject: [PATCH 035/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c -index 11056a3c1..0e6ac55db 100644 ---- a/kernel/x86_64/zdot_microk_haswell-2.c -+++ b/kernel/x86_64/zdot_microk_haswell-2.c -@@ -103,8 +103,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -188,8 +188,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 715b1f263d6903f1af391c5278a9aa61f1753193 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:08:09 +0100 -Subject: [PATCH 036/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c -index 87c5b0340..416265ae2 100644 ---- a/kernel/x86_64/zdot_microk_sandy-2.c -+++ b/kernel/x86_64/zdot_microk_sandy-2.c -@@ -109,8 +109,8 @@ if ( n < 1280 ) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -201,8 +201,8 @@ if ( n < 1280 ) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From e8d835ea466a1605db2157b6884a4cfe762478fc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:09:03 +0100 -Subject: [PATCH 037/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c -index 325f74ae3..fe1613fd4 100644 ---- a/kernel/x86_64/zdot_microk_steamroller-2.c -+++ b/kernel/x86_64/zdot_microk_steamroller-2.c -@@ -97,8 +97,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -174,8 +174,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From bbc30700e871d84c07d770f54b645ea3eee549fa Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:49:34 +0100 -Subject: [PATCH 038/111] Update saxpy_microk_nehalem-2.c - ---- - kernel/x86_64/saxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c -index 4ffb39acf..e25156939 100644 ---- a/kernel/x86_64/saxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c -@@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 300bb19b3ec0a48b7371d7c1be3ee88a29e87cf9 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:52:04 +0100 -Subject: [PATCH 039/111] Update caxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/caxpy_microk_bulldozer-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c -index cb98f208a..faf5cdc40 100644 ---- a/kernel/x86_64/caxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c -@@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 1878e0c95aee9777f7c082bcc98ff12b04edc75d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:52:54 +0100 -Subject: [PATCH 040/111] Update caxpy_microk_haswell-2.c - ---- - kernel/x86_64/caxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c -index f31cf9710..a011b2bfa 100644 ---- a/kernel/x86_64/caxpy_microk_haswell-2.c -+++ b/kernel/x86_64/caxpy_microk_haswell-2.c -@@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From f6be89295f4e21572a743d26e677256fc29ee8cf Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:53:35 +0100 -Subject: [PATCH 041/111] Update caxpy_microk_sandy-2.c - ---- - kernel/x86_64/caxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c -index 931d1ad47..c760d6540 100644 ---- a/kernel/x86_64/caxpy_microk_sandy-2.c -+++ b/kernel/x86_64/caxpy_microk_sandy-2.c -@@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 4673e5317861de37b326181b0dfc8514a2b3b69d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:54:39 +0100 -Subject: [PATCH 042/111] Update caxpy_microk_steamroller-2.c - ---- - kernel/x86_64/caxpy_microk_steamroller-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c -index 9aeb47968..b6eb55f9b 100644 ---- a/kernel/x86_64/caxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c -@@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From ba6d2c77a98f55431d8d2d4de4b6df99814352c1 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:55:38 +0100 -Subject: [PATCH 043/111] Update cdot_microk_bulldozer-2.c - ---- - kernel/x86_64/cdot_microk_bulldozer-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c -index e6d11f1af..c2245c6dc 100644 ---- a/kernel/x86_64/cdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/cdot_microk_bulldozer-2.c -@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 093a3d7d5790efd7441611ee8c8769d4f3d997c0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:56:15 +0100 -Subject: [PATCH 044/111] Update cdot_microk_haswell-2.c - ---- - kernel/x86_64/cdot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c -index 9fee7615d..396dbeaa7 100644 ---- a/kernel/x86_64/cdot_microk_haswell-2.c -+++ b/kernel/x86_64/cdot_microk_haswell-2.c -@@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 2224bcb4f070e607ede67f2f6e089e2e99519517 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:57:01 +0100 -Subject: [PATCH 045/111] Update cdot_microk_sandy-2.c - ---- - kernel/x86_64/cdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c -index 705c80c5c..20ba48c00 100644 ---- a/kernel/x86_64/cdot_microk_sandy-2.c -+++ b/kernel/x86_64/cdot_microk_sandy-2.c -@@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 2414f1d796e23f8e9e4abba27e948f5877773640 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:57:56 +0100 -Subject: [PATCH 046/111] Update cdot_microk_steamroller-2.c - ---- - kernel/x86_64/cdot_microk_steamroller-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c -index 5a46aed8c..01754b147 100644 ---- a/kernel/x86_64/cdot_microk_steamroller-2.c -+++ b/kernel/x86_64/cdot_microk_steamroller-2.c -@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ae00befb3e3a9632d9545ba0af43f9afb90787b2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:58:52 +0100 -Subject: [PATCH 047/111] Update daxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/daxpy_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c -index c9a01580e..2e2356fb6 100644 ---- a/kernel/x86_64/daxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c -@@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 604c574542a5fac237b5134610166fab26db1285 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:59:30 +0100 -Subject: [PATCH 048/111] Update daxpy_microk_haswell-2.c - ---- - kernel/x86_64/daxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c -index 67431659d..c77fc33ef 100644 ---- a/kernel/x86_64/daxpy_microk_haswell-2.c -+++ b/kernel/x86_64/daxpy_microk_haswell-2.c -@@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 082498ee3b8470e992f33414e3097ca301f9efa7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:00:07 +0100 -Subject: [PATCH 049/111] Update daxpy_microk_nehalem-2.c - ---- - kernel/x86_64/daxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c -index 61c99904a..b81fe6562 100644 ---- a/kernel/x86_64/daxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c -@@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 293f5531e66088d7149bebd68bcd7aa564b3a263 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:00:53 +0100 -Subject: [PATCH 050/111] Update daxpy_microk_piledriver-2.c - ---- - kernel/x86_64/daxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c -index e3d605b75..efe93dfed 100644 ---- a/kernel/x86_64/daxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/daxpy_microk_piledriver-2.c -@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6cee8e0fdd463139f85656292971de1e4810d775 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:01:28 +0100 -Subject: [PATCH 051/111] Update daxpy_microk_sandy-2.c - ---- - kernel/x86_64/daxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c -index 1b827e7e2..3b1214f36 100644 ---- a/kernel/x86_64/daxpy_microk_sandy-2.c -+++ b/kernel/x86_64/daxpy_microk_sandy-2.c -@@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6450bf14afa94cade7d28330749dfbf255697026 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:02:32 +0100 -Subject: [PATCH 052/111] Update daxpy_microk_steamroller-2.c - ---- - kernel/x86_64/daxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c -index 2cab80067..a5143682f 100644 ---- a/kernel/x86_64/daxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/daxpy_microk_steamroller-2.c -@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From a339b45e51c58e5b13c01c6918282fb31941acdf Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:03:07 +0100 -Subject: [PATCH 053/111] Update ddot_microk_bulldozer-2.c - ---- - kernel/x86_64/ddot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c -index 379fd3ca1..62bf7e7dc 100644 ---- a/kernel/x86_64/ddot_microk_bulldozer-2.c -+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c -@@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovsd %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 64fcdadf39137bdc56c56ead1e4d8f1bea32fe2a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:03:44 +0100 -Subject: [PATCH 054/111] Update ddot_microk_haswell-2.c - ---- - kernel/x86_64/ddot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c -index c0c277c32..0cf4ece65 100644 ---- a/kernel/x86_64/ddot_microk_haswell-2.c -+++ b/kernel/x86_64/ddot_microk_haswell-2.c -@@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 504dd44e887cbd985bac3d48a2a7fdc3a03727d8 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:04:20 +0100 -Subject: [PATCH 055/111] Update ddot_microk_nehalem-2.c - ---- - kernel/x86_64/ddot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c -index ea0b4eff1..086a0bb91 100644 ---- a/kernel/x86_64/ddot_microk_nehalem-2.c -+++ b/kernel/x86_64/ddot_microk_nehalem-2.c -@@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "movsd %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 56c67a929a2b215c3980a542c74a016f828e119d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:05:11 +0100 -Subject: [PATCH 056/111] Update ddot_microk_piledriver-2.c - ---- - kernel/x86_64/ddot_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c -index f7b74add6..d7347ebdf 100644 ---- a/kernel/x86_64/ddot_microk_piledriver-2.c -+++ b/kernel/x86_64/ddot_microk_piledriver-2.c -@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From b7ffbc40eca528e3aae46d004c1ad8e6fd013530 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:05:43 +0100 -Subject: [PATCH 057/111] Update ddot_microk_sandy-2.c - ---- - kernel/x86_64/ddot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c -index e57eb37ea..28b1a8bd1 100644 ---- a/kernel/x86_64/ddot_microk_sandy-2.c -+++ b/kernel/x86_64/ddot_microk_sandy-2.c -@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 0c9c31dbe4817ad24ecc2cc5dc553239a7c31590 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:06:20 +0100 -Subject: [PATCH 058/111] Update ddot_microk_steamroller-2.c - ---- - kernel/x86_64/ddot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c -index 845c78df1..98cf94acf 100644 ---- a/kernel/x86_64/ddot_microk_steamroller-2.c -+++ b/kernel/x86_64/ddot_microk_steamroller-2.c -@@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From d1b69022c935a37bbe3c8b09eb329a7468339ff0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:07:04 +0100 -Subject: [PATCH 059/111] Update saxpy_microk_haswell-2.c - ---- - kernel/x86_64/saxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c -index 3b03e11a4..3bc450f7b 100644 ---- a/kernel/x86_64/saxpy_microk_haswell-2.c -+++ b/kernel/x86_64/saxpy_microk_haswell-2.c -@@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 369a2b4af5680dfcbd1d8290077f62a4d74336fb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:07:54 +0100 -Subject: [PATCH 060/111] Update saxpy_microk_piledriver-2.c - ---- - kernel/x86_64/saxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c -index 87c5fe3cf..87e742ac7 100644 ---- a/kernel/x86_64/saxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c -@@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From dc931ad1fe709ad378d6d963fbde5bad421e5514 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:08:27 +0100 -Subject: [PATCH 061/111] Update saxpy_microk_sandy-2.c - ---- - kernel/x86_64/saxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c -index 5a8424d66..6ce67a7d1 100644 ---- a/kernel/x86_64/saxpy_microk_sandy-2.c -+++ b/kernel/x86_64/saxpy_microk_sandy-2.c -@@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From b2d6fea1cb99f0830c33e3667d1928be4496a31f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:09:07 +0100 -Subject: [PATCH 062/111] Update sdot_microk_bulldozer-2.c - ---- - kernel/x86_64/sdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c -index 5a6fc6da2..c7f8cb1a7 100644 ---- a/kernel/x86_64/sdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c -@@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ffc008663aef2dd318c58275fb8b68cc93de9a42 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:09:50 +0100 -Subject: [PATCH 063/111] Update sdot_microk_haswell-2.c - ---- - kernel/x86_64/sdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c -index 89d9cfe61..417fb3862 100644 ---- a/kernel/x86_64/sdot_microk_haswell-2.c -+++ b/kernel/x86_64/sdot_microk_haswell-2.c -@@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovss %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 88b0dbfbddbc5170263bd06eb0aad0abf85faa81 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:10:30 +0100 -Subject: [PATCH 064/111] Update sdot_microk_nehalem-2.c - ---- - kernel/x86_64/sdot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c -index cef41b530..115e7a410 100644 ---- a/kernel/x86_64/sdot_microk_nehalem-2.c -+++ b/kernel/x86_64/sdot_microk_nehalem-2.c -@@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "movss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ba9c3c4328a73821ce6067fb78b01b8817a92fa1 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:11:09 +0100 -Subject: [PATCH 065/111] Update sdot_microk_sandy-2.c - ---- - kernel/x86_64/sdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c -index e77ba1424..9d0795181 100644 ---- a/kernel/x86_64/sdot_microk_sandy-2.c -+++ b/kernel/x86_64/sdot_microk_sandy-2.c -@@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovss %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 266e72d24b767dbcdb97f597c899c7f495609c6f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:12:07 +0100 -Subject: [PATCH 066/111] Update sdot_microk_steamroller-2.c - ---- - kernel/x86_64/sdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c -index bedde8fb6..3475f890d 100644 ---- a/kernel/x86_64/sdot_microk_steamroller-2.c -+++ b/kernel/x86_64/sdot_microk_steamroller-2.c -@@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 72c3a4d1bd1daf3a98413dbea081f19fc6ee897d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:13:06 +0100 -Subject: [PATCH 067/111] Update zaxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/zaxpy_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -index 56493f8cb..eed36ffd0 100644 ---- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 157e65ff74b7760a19ed38e8796aab6ad0d2a152 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:13:41 +0100 -Subject: [PATCH 068/111] Update zaxpy_microk_haswell-2.c - ---- - kernel/x86_64/zaxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c -index bd52ba01f..9aeea975b 100644 ---- a/kernel/x86_64/zaxpy_microk_haswell-2.c -+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c -@@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 212b0a106d83491aeac793c6d45b4e494d06d868 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:14:28 +0100 -Subject: [PATCH 069/111] Update zaxpy_microk_sandy-2.c - ---- - kernel/x86_64/zaxpy_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c -index d6a9ff394..cbd9b378f 100644 ---- a/kernel/x86_64/zaxpy_microk_sandy-2.c -+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c -@@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 2fa6d8107c40d780c988c8f23b5d61d6a0f8e8eb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:15:18 +0100 -Subject: [PATCH 070/111] Update zaxpy_microk_steamroller-2.c - ---- - kernel/x86_64/zaxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c -index 58d4c7286..5fc56aec7 100644 ---- a/kernel/x86_64/zaxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c -@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 79d5dd461d13953e8cade9a1dad43ad38cf93aaa Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:16:09 +0100 -Subject: [PATCH 071/111] Update zdot_microk_bulldozer-2.c - ---- - kernel/x86_64/zdot_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c -index ed66cc674..a80eac003 100644 ---- a/kernel/x86_64/zdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/zdot_microk_bulldozer-2.c -@@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From cb5cfffb1765ac8ef1e2f149aea1dc3e5fbb9623 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:16:55 +0100 -Subject: [PATCH 072/111] Update zdot_microk_haswell-2.c - ---- - kernel/x86_64/zdot_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c -index 0e6ac55db..963d2e3bd 100644 ---- a/kernel/x86_64/zdot_microk_haswell-2.c -+++ b/kernel/x86_64/zdot_microk_haswell-2.c -@@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From f4e5f931ae5c14d284749c65d1e9ed08873afaa2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:17:46 +0100 -Subject: [PATCH 073/111] Update zdot_microk_sandy-2.c - ---- - kernel/x86_64/zdot_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c -index 416265ae2..88d4e1bbb 100644 ---- a/kernel/x86_64/zdot_microk_sandy-2.c -+++ b/kernel/x86_64/zdot_microk_sandy-2.c -@@ -107,10 +107,10 @@ if ( n < 1280 ) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -199,10 +199,10 @@ if ( n < 1280 ) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ae2f3e617df8894ebe1779d3bcc78170bcad8b4c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:18:27 +0100 -Subject: [PATCH 074/111] Update zdot_microk_steamroller-2.c - ---- - kernel/x86_64/zdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c -index fe1613fd4..2f11fe562 100644 ---- a/kernel/x86_64/zdot_microk_steamroller-2.c -+++ b/kernel/x86_64/zdot_microk_steamroller-2.c -@@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 379aa11f4bfc5bb352372a3f423062267e73dd77 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:10:21 +0100 -Subject: [PATCH 075/111] Update caxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/caxpy_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c -index faf5cdc40..ca2209340 100644 ---- a/kernel/x86_64/caxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c -@@ -115,7 +115,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 983c72ab0fc182264a635d1c5286ceebc2b2f3e2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:10:51 +0100 -Subject: [PATCH 076/111] Update caxpy_microk_haswell-2.c - ---- - kernel/x86_64/caxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c -index a011b2bfa..b605ea34c 100644 ---- a/kernel/x86_64/caxpy_microk_haswell-2.c -+++ b/kernel/x86_64/caxpy_microk_haswell-2.c -@@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 6f7f9967f945c145e6e4ceac14162e8dbc551f4c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:11:21 +0100 -Subject: [PATCH 077/111] Update caxpy_microk_sandy-2.c - ---- - kernel/x86_64/caxpy_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c -index c760d6540..72d37afed 100644 ---- a/kernel/x86_64/caxpy_microk_sandy-2.c -+++ b/kernel/x86_64/caxpy_microk_sandy-2.c -@@ -97,7 +97,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From aa799573b5f91e786ef41116b9fd030161fb6a10 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:11:59 +0100 -Subject: [PATCH 078/111] Update caxpy_microk_steamroller-2.c - ---- - kernel/x86_64/caxpy_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c -index b6eb55f9b..7ca7af070 100644 ---- a/kernel/x86_64/caxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c -@@ -115,7 +115,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From f9497bdab685ca8b9bea018c900df24b7dd2aad7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:12:37 +0100 -Subject: [PATCH 079/111] Update cdot_microk_bulldozer-2.c - ---- - kernel/x86_64/cdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c -index c2245c6dc..118655913 100644 ---- a/kernel/x86_64/cdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/cdot_microk_bulldozer-2.c -@@ -98,7 +98,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -177,7 +177,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From de4c5a9258b3c29e1e305660c50e7b4cf8204c46 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:13:09 +0100 -Subject: [PATCH 080/111] Update daxpy_microk_haswell-2.c - ---- - kernel/x86_64/daxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c -index c77fc33ef..f3682e6d7 100644 ---- a/kernel/x86_64/daxpy_microk_haswell-2.c -+++ b/kernel/x86_64/daxpy_microk_haswell-2.c -@@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 59ca748c9ec75cf57148bcf4de06dc328f227845 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:13:38 +0100 -Subject: [PATCH 081/111] Update daxpy_microk_nehalem-2.c - ---- - kernel/x86_64/daxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c -index b81fe6562..8feb9f26c 100644 ---- a/kernel/x86_64/daxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c -@@ -74,7 +74,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 5f2ef0e70fb180022f3447826029f42c75c6fbb5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:14:13 +0100 -Subject: [PATCH 082/111] Update daxpy_microk_piledriver-2.c - ---- - kernel/x86_64/daxpy_microk_piledriver-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c -index efe93dfed..4b83124c7 100644 ---- a/kernel/x86_64/daxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/daxpy_microk_piledriver-2.c -@@ -80,7 +80,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -142,7 +142,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From c5b01c8be14c3cc3b364b9067124695e2d91c63a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:14:43 +0100 -Subject: [PATCH 083/111] Update daxpy_microk_sandy-2.c - ---- - kernel/x86_64/daxpy_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c -index 3b1214f36..db9a45de8 100644 ---- a/kernel/x86_64/daxpy_microk_sandy-2.c -+++ b/kernel/x86_64/daxpy_microk_sandy-2.c -@@ -101,7 +101,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From d4f3b733dc1026c9d1bfa8bea5696353de3b47c0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:15:18 +0100 -Subject: [PATCH 084/111] Update daxpy_microk_steamroller-2.c - ---- - kernel/x86_64/daxpy_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c -index a5143682f..8e63fcc1d 100644 ---- a/kernel/x86_64/daxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/daxpy_microk_steamroller-2.c -@@ -80,7 +80,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -142,7 +142,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From dcfab783f725abb0280a77f61a4083be581e89b8 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:15:57 +0100 -Subject: [PATCH 085/111] Update ddot_microk_bulldozer-2.c - ---- - kernel/x86_64/ddot_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c -index 62bf7e7dc..5590c5b17 100644 ---- a/kernel/x86_64/ddot_microk_bulldozer-2.c -+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c -@@ -67,7 +67,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 0779654cb47dbc9984f344d5b7ffa68e39afdbc3 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:16:26 +0100 -Subject: [PATCH 086/111] Update ddot_microk_haswell-2.c - ---- - kernel/x86_64/ddot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c -index 0cf4ece65..dbb5487f7 100644 ---- a/kernel/x86_64/ddot_microk_haswell-2.c -+++ b/kernel/x86_64/ddot_microk_haswell-2.c -@@ -78,7 +78,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 29028652213235c1d2e7dc18d49daa86f3356574 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:16:53 +0100 -Subject: [PATCH 087/111] Update ddot_microk_nehalem-2.c - ---- - kernel/x86_64/ddot_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c -index 086a0bb91..e5e234e22 100644 ---- a/kernel/x86_64/ddot_microk_nehalem-2.c -+++ b/kernel/x86_64/ddot_microk_nehalem-2.c -@@ -77,7 +77,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 6df88c7c455c37a18a16f1cbd003b640ef6777f0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:17:43 +0100 -Subject: [PATCH 088/111] Update cdot_microk_haswell-2.c - ---- - kernel/x86_64/cdot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c -index 396dbeaa7..8b9d6d104 100644 ---- a/kernel/x86_64/cdot_microk_haswell-2.c -+++ b/kernel/x86_64/cdot_microk_haswell-2.c -@@ -99,7 +99,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 81691c726eb55df75f638794fe3afff70cc3286d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:18:11 +0100 -Subject: [PATCH 089/111] Update cdot_microk_sandy-2.c - ---- - kernel/x86_64/cdot_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c -index 20ba48c00..fe142c38f 100644 ---- a/kernel/x86_64/cdot_microk_sandy-2.c -+++ b/kernel/x86_64/cdot_microk_sandy-2.c -@@ -107,7 +107,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From ab8cc007364b9477e13c107a7befce7668c10ebb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:18:47 +0100 -Subject: [PATCH 090/111] Update cdot_microk_steamroller-2.c - ---- - kernel/x86_64/cdot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c -index 01754b147..7350b21c9 100644 ---- a/kernel/x86_64/cdot_microk_steamroller-2.c -+++ b/kernel/x86_64/cdot_microk_steamroller-2.c -@@ -98,7 +98,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -177,7 +177,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From bdcba6adda368da48e450cdc3b9c9f7b6c52e630 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:19:32 +0100 -Subject: [PATCH 091/111] Update daxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/daxpy_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c -index 2e2356fb6..9c1305b97 100644 ---- a/kernel/x86_64/daxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c -@@ -65,7 +65,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From e9fc4dfdead60ed013e016c62215170d04b5ad9d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:20:20 +0100 -Subject: [PATCH 092/111] Update ddot_microk_piledriver-2.c - ---- - kernel/x86_64/ddot_microk_piledriver-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c -index d7347ebdf..cc4bcd90a 100644 ---- a/kernel/x86_64/ddot_microk_piledriver-2.c -+++ b/kernel/x86_64/ddot_microk_piledriver-2.c -@@ -83,7 +83,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -147,7 +147,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 9430424102257485eae76482f495402260e9682d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:20:48 +0100 -Subject: [PATCH 093/111] Update ddot_microk_sandy-2.c - ---- - kernel/x86_64/ddot_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c -index 28b1a8bd1..84493ec27 100644 ---- a/kernel/x86_64/ddot_microk_sandy-2.c -+++ b/kernel/x86_64/ddot_microk_sandy-2.c -@@ -83,7 +83,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 129a987e4b55f13c413f4eaad58465443051dd43 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:21:26 +0100 -Subject: [PATCH 094/111] Update ddot_microk_steamroller-2.c - ---- - kernel/x86_64/ddot_microk_steamroller-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c -index 98cf94acf..27d5244ce 100644 ---- a/kernel/x86_64/ddot_microk_steamroller-2.c -+++ b/kernel/x86_64/ddot_microk_steamroller-2.c -@@ -80,7 +80,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 49789c39fb2a55dacc146f079c1c5fab45d3ce2e Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:22:17 +0100 -Subject: [PATCH 095/111] Update saxpy_microk_haswell-2.c - ---- - kernel/x86_64/saxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c -index 3bc450f7b..7099ba4c6 100644 ---- a/kernel/x86_64/saxpy_microk_haswell-2.c -+++ b/kernel/x86_64/saxpy_microk_haswell-2.c -@@ -61,7 +61,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 7f556b81fb40ca6d90529829b802b38adbc747d7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:22:46 +0100 -Subject: [PATCH 096/111] Update saxpy_microk_nehalem-2.c - ---- - kernel/x86_64/saxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c -index e25156939..88bbb695d 100644 ---- a/kernel/x86_64/saxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c -@@ -74,7 +74,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From cb75878f98892850b29fc7a0b427500a56d244dd Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:23:16 +0100 -Subject: [PATCH 097/111] Update saxpy_microk_piledriver-2.c - ---- - kernel/x86_64/saxpy_microk_piledriver-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c -index 87e742ac7..5feea7f24 100644 ---- a/kernel/x86_64/saxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c -@@ -80,7 +80,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -141,7 +141,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 65719fcb41987c499c31455fe7b0290800cacdd6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:23:44 +0100 -Subject: [PATCH 098/111] Update saxpy_microk_sandy-2.c - ---- - kernel/x86_64/saxpy_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c -index 6ce67a7d1..0d448d5f8 100644 ---- a/kernel/x86_64/saxpy_microk_sandy-2.c -+++ b/kernel/x86_64/saxpy_microk_sandy-2.c -@@ -101,7 +101,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From b52e763084040ed624fff574fba1fe1bc58b1cc7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:24:16 +0100 -Subject: [PATCH 099/111] Update sdot_microk_bulldozer-2.c - ---- - kernel/x86_64/sdot_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c -index c7f8cb1a7..8958a33dc 100644 ---- a/kernel/x86_64/sdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c -@@ -68,7 +68,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 2c021aeb9c018e4da2a7a0a5c0315d06d689a3c2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:24:42 +0100 -Subject: [PATCH 100/111] Update sdot_microk_haswell-2.c - ---- - kernel/x86_64/sdot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c -index 417fb3862..91dc928d3 100644 ---- a/kernel/x86_64/sdot_microk_haswell-2.c -+++ b/kernel/x86_64/sdot_microk_haswell-2.c -@@ -81,7 +81,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From bb43f185cf2f4354b62b779a369b53db3607598d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:25:15 +0100 -Subject: [PATCH 101/111] Update sdot_microk_nehalem-2.c - ---- - kernel/x86_64/sdot_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c -index 115e7a410..5a715d008 100644 ---- a/kernel/x86_64/sdot_microk_nehalem-2.c -+++ b/kernel/x86_64/sdot_microk_nehalem-2.c -@@ -77,7 +77,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 3b98d1e16d48f08540952624e9aa7843d5384ceb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:25:43 +0100 -Subject: [PATCH 102/111] Update sdot_microk_sandy-2.c - ---- - kernel/x86_64/sdot_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c -index 9d0795181..ae25d5a50 100644 ---- a/kernel/x86_64/sdot_microk_sandy-2.c -+++ b/kernel/x86_64/sdot_microk_sandy-2.c -@@ -84,7 +84,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 7009a0337f674911ebe6d9ce6d1bf9b21472e05e Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:26:24 +0100 -Subject: [PATCH 103/111] Update sdot_microk_steamroller-2.c - ---- - kernel/x86_64/sdot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c -index 3475f890d..bf6a5f287 100644 ---- a/kernel/x86_64/sdot_microk_steamroller-2.c -+++ b/kernel/x86_64/sdot_microk_steamroller-2.c -@@ -82,7 +82,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -145,7 +145,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From f117a2e4aa3e100015d479dd61530019db66e53f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:27:34 +0100 -Subject: [PATCH 104/111] Update zaxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/zaxpy_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -index eed36ffd0..15d367971 100644 ---- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -@@ -115,7 +115,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 752d4e88089ce1ff5ab27b25de382750b5e4a9c7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:28:00 +0100 -Subject: [PATCH 105/111] Update zaxpy_microk_haswell-2.c - ---- - kernel/x86_64/zaxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c -index 9aeea975b..89d23daf3 100644 ---- a/kernel/x86_64/zaxpy_microk_haswell-2.c -+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c -@@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 0f905d346e8c0bda5bbf7cb6ae7f7a6ad137aa76 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:28:40 +0100 -Subject: [PATCH 106/111] Update zaxpy_microk_sandy-2.c - ---- - kernel/x86_64/zaxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c -index cbd9b378f..17b8b24f7 100644 ---- a/kernel/x86_64/zaxpy_microk_sandy-2.c -+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c -@@ -101,7 +101,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -178,7 +178,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 39a29ef0ce2de84526cf8e71881e6117b4532f84 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:29:18 +0100 -Subject: [PATCH 107/111] Update zaxpy_microk_steamroller-2.c - ---- - kernel/x86_64/zaxpy_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c -index 5fc56aec7..907b1ae00 100644 ---- a/kernel/x86_64/zaxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c -@@ -115,7 +115,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 1496c1a69f4d0c521d797b1847363c38e46958d5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:30:03 +0100 -Subject: [PATCH 108/111] Update zdot_microk_bulldozer-2.c - ---- - kernel/x86_64/zdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c -index a80eac003..db9a48cce 100644 ---- a/kernel/x86_64/zdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/zdot_microk_bulldozer-2.c -@@ -98,7 +98,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -177,7 +177,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 8f09f06f2c964ece75730dadd99e569844497fe6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:30:43 +0100 -Subject: [PATCH 109/111] Update zdot_microk_haswell-2.c - ---- - kernel/x86_64/zdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c -index 963d2e3bd..9f2fc2c1d 100644 ---- a/kernel/x86_64/zdot_microk_haswell-2.c -+++ b/kernel/x86_64/zdot_microk_haswell-2.c -@@ -103,7 +103,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -188,7 +188,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From fca3f8610fbeb0a4a4198eb0f2fc74f91cd6e85d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:31:24 +0100 -Subject: [PATCH 110/111] Update zdot_microk_sandy-2.c - ---- - kernel/x86_64/zdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c -index 88d4e1bbb..33415e26e 100644 ---- a/kernel/x86_64/zdot_microk_sandy-2.c -+++ b/kernel/x86_64/zdot_microk_sandy-2.c -@@ -109,7 +109,7 @@ if ( n < 1280 ) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -201,7 +201,7 @@ if ( n < 1280 ) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 6976222962772b395054016e99faac34986b5e59 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:32:05 +0100 -Subject: [PATCH 111/111] Update zdot_microk_steamroller-2.c - ---- - kernel/x86_64/zdot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c -index 2f11fe562..87138fe9a 100644 ---- a/kernel/x86_64/zdot_microk_steamroller-2.c -+++ b/kernel/x86_64/zdot_microk_steamroller-2.c -@@ -97,7 +97,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -174,7 +174,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 diff --git a/1966.patch b/1966.patch deleted file mode 100644 index c2663cd..0000000 --- a/1966.patch +++ /dev/null @@ -1,960 +0,0 @@ -From 63cdd8f4a04f3a5ac1733e202b6b3678c34fb8dd Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:27:38 +0100 -Subject: [PATCH 01/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/cscal_microk_bulldozer-2.c | 32 ++++++++++++------------ - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c -index 3abffc4cf..f526fd611 100644 ---- a/kernel/x86_64/cscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c -@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From b6136be686e415fbdb035267c5020cb08e4e49ac Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:30:03 +0100 -Subject: [PATCH 02/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/cscal_microk_haswell-2.c | 30 +++++++++++++------------- - 1 file changed, 15 insertions(+), 15 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c -index 0a4eb683c..8623dcd10 100644 ---- a/kernel/x86_64/cscal_microk_haswell-2.c -+++ b/kernel/x86_64/cscal_microk_haswell-2.c -@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 - : "cc", // "0", "1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", -@@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 - : "cc", //"%0", "%1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", -@@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -- : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ : -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From f447fb4c54870710cd6304553df59f50ff51b8f5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:32:48 +0100 -Subject: [PATCH 03/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++----------- - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c -index 8346e1748..fbeb857e2 100644 ---- a/kernel/x86_64/cscal_microk_steamroller-2.c -+++ b/kernel/x86_64/cscal_microk_steamroller-2.c -@@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n), // 0 -- "r" (x), // 1 - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From fcd7fde5702cf7270332a5dd747f83efe7be93dd Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:35:18 +0100 -Subject: [PATCH 04/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c -index de53b0bc4..71d3a9846 100644 ---- a/kernel/x86_64/dscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c -@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", -@@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - -From 05e961994401bfc6dc8639fa9bc159148569ca9d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:36:37 +0100 -Subject: [PATCH 05/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c -index e732a2718..90790cfdc 100644 ---- a/kernel/x86_64/dscal_microk_haswell-2.c -+++ b/kernel/x86_64/dscal_microk_haswell-2.c -@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", -@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n1), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - -From 7a11cc5b9f7c9669ee1f9818a1ea3f44c2f6d98d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:37:49 +0100 -Subject: [PATCH 06/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c -index 8d855072b..0f187ba88 100644 ---- a/kernel/x86_64/dscal_microk_sandy-2.c -+++ b/kernel/x86_64/dscal_microk_sandy-2.c -@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", -@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n1), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - -From a6c06bffe1ec60ec359b300b8cc9e18b30c72d0d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:40:28 +0100 -Subject: [PATCH 07/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c -index 03882d6b6..1ce59d2c7 100644 ---- a/kernel/x86_64/zscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/zscal_microk_bulldozer-2.c -@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From 5efc7ce079fd87de9ab7ca20aaaf8c5c627170fa Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:42:34 +0100 -Subject: [PATCH 08/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++++------------- - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c -index d9253c1ed..534370959 100644 ---- a/kernel/x86_64/zscal_microk_haswell-2.c -+++ b/kernel/x86_64/zscal_microk_haswell-2.c -@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From 1a1471c6be597a176a4dbfe2757c134eb3780af0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:44:42 +0100 -Subject: [PATCH 09/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++----------- - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c -index 97b07add6..4b489d9f3 100644 ---- a/kernel/x86_64/zscal_microk_steamroller-2.c -+++ b/kernel/x86_64/zscal_microk_steamroller-2.c -@@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n), // 0 -- "r" (x), // 1 - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From 90e28665183cd8da3a6129016977f57dd415c6a9 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:38:20 +0100 -Subject: [PATCH 10/18] Remove stray comma - ---- - kernel/x86_64/cscal_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c -index f526fd611..31451aa6c 100644 ---- a/kernel/x86_64/cscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c -@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From b8dd71bddcb41d3d88af1a1eb77f845760452f5f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:39:23 +0100 -Subject: [PATCH 11/18] Remove stray comma - ---- - kernel/x86_64/cscal_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c -index 8623dcd10..a04a4c4ab 100644 ---- a/kernel/x86_64/cscal_microk_haswell-2.c -+++ b/kernel/x86_64/cscal_microk_haswell-2.c -@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", // "0", "1", -@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", //"%0", "%1", -@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From 8c9a6356eaba102124147856422b9a0570daeb55 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:40:25 +0100 -Subject: [PATCH 12/18] Remove stray comma - ---- - kernel/x86_64/cscal_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c -index fbeb857e2..e8073d485 100644 ---- a/kernel/x86_64/cscal_microk_steamroller-2.c -+++ b/kernel/x86_64/cscal_microk_steamroller-2.c -@@ -118,7 +118,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -210,7 +210,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -287,7 +287,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -332,7 +332,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From ebe8882eb23e88d410f824d8d6a113f0fca94a3b Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:41:27 +0100 -Subject: [PATCH 13/18] Remove stray comma - ---- - kernel/x86_64/dscal_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c -index 71d3a9846..096662781 100644 ---- a/kernel/x86_64/dscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c -@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 -@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 - -From fd3e2c862286019589530ece0a61be6d86a01e92 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:42:12 +0100 -Subject: [PATCH 14/18] Remove stray comma - ---- - kernel/x86_64/dscal_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c -index 0f187ba88..9982b8e58 100644 ---- a/kernel/x86_64/dscal_microk_sandy-2.c -+++ b/kernel/x86_64/dscal_microk_sandy-2.c -@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 -@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 - -From 45339034256043b4405fd6330f918cbed3660ac4 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:43:14 +0100 -Subject: [PATCH 15/18] Remove stray comma - ---- - kernel/x86_64/dscal_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c -index 90790cfdc..77ed59a4e 100644 ---- a/kernel/x86_64/dscal_microk_haswell-2.c -+++ b/kernel/x86_64/dscal_microk_haswell-2.c -@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 -@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 - -From 3b0b5ce0f69a45753b126d8bd96a48de2f882a4c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:46:05 +0100 -Subject: [PATCH 16/18] Remove stray comma - ---- - kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c -index 1ce59d2c7..5e733ffda 100644 ---- a/kernel/x86_64/zscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/zscal_microk_bulldozer-2.c -@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x) // 1 -+ : - "r" (alpha) // 2 - : "cc", //"%0", "%1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", -@@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n), // 0 -+ "+r" (x) // 1 - : -- : -- "r" (n), // 0 -- "r" (x), // 1 - "r" (alpha) // 2 - : "cc", //"%0", "%1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - -From c17d2f61c2387b5a6cfab22d964d70afcce69b23 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:47:12 +0100 -Subject: [PATCH 17/18] Remove stray comma - ---- - kernel/x86_64/zscal_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c -index 534370959..8c8f5b75c 100644 ---- a/kernel/x86_64/zscal_microk_haswell-2.c -+++ b/kernel/x86_64/zscal_microk_haswell-2.c -@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -286,7 +286,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -331,7 +331,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From ccb2b2175751037b5625b4ec3c60ddca26a04394 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:48:40 +0100 -Subject: [PATCH 18/18] Remove stray comma - ---- - kernel/x86_64/zscal_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c -index 4b489d9f3..c9267ee0c 100644 ---- a/kernel/x86_64/zscal_microk_steamroller-2.c -+++ b/kernel/x86_64/zscal_microk_steamroller-2.c -@@ -118,7 +118,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -210,7 +210,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -287,7 +287,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -332,7 +332,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", diff --git a/1967.patch b/1967.patch deleted file mode 100644 index c7066fa..0000000 --- a/1967.patch +++ /dev/null @@ -1,99 +0,0 @@ -From 7ff08e4b06e2c643829b566a4f2c1daba25b1029 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 00:04:44 +0100 -Subject: [PATCH 1/4] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dger_microk_sandy-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c -index 2bf966a5f..944d4c6f1 100644 ---- a/kernel/x86_64/dger_microk_sandy-2.c -+++ b/kernel/x86_64/dger_microk_sandy-2.c -@@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 003583675d31ce5ddabfede7fc0f93cfbac51e5f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 00:05:47 +0100 -Subject: [PATCH 2/4] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/sger_microk_sandy-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c -index 79180b991..d38fdd551 100644 ---- a/kernel/x86_64/sger_microk_sandy-2.c -+++ b/kernel/x86_64/sger_microk_sandy-2.c -@@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 78aeb19e4613104c1ae8ea1c67022451dcfed7e6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:34:12 +0100 -Subject: [PATCH 3/4] Remove stray comma - ---- - kernel/x86_64/sger_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c -index d38fdd551..14f13475b 100644 ---- a/kernel/x86_64/sger_microk_sandy-2.c -+++ b/kernel/x86_64/sger_microk_sandy-2.c -@@ -106,7 +106,7 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From d3e7e25bfb73e16bdbf89ee07d0ab584339be2a0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:35:56 +0100 -Subject: [PATCH 4/4] Remove stray comma - ---- - kernel/x86_64/dger_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c -index 944d4c6f1..e8494500f 100644 ---- a/kernel/x86_64/dger_microk_sandy-2.c -+++ b/kernel/x86_64/dger_microk_sandy-2.c -@@ -106,7 +106,7 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 diff --git a/2010.patch b/2010.patch deleted file mode 100644 index 2393325..0000000 --- a/2010.patch +++ /dev/null @@ -1,499 +0,0 @@ -From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 15:33:48 +0100 -Subject: [PATCH 1/4] Fix declaration of input arguments in the x86_64 - s/dGEMV_T and s/dGEMV_N kernels - -Arguments 0 and 1 need to be tagged as both input and output ---- - kernel/x86_64/dgemv_n_4.c | 10 +++++----- - kernel/x86_64/dgemv_t_4.c | 18 +++++++++--------- - kernel/x86_64/sgemv_n_4.c | 14 +++++++------- - kernel/x86_64/sgemv_t_4.c | 18 +++++++++--------- - 4 files changed, 30 insertions(+), 30 deletions(-) - -diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c -index 6d2530e81..6d33641e9 100644 ---- a/kernel/x86_64/dgemv_n_4.c -+++ b/kernel/x86_64/dgemv_n_4.c -@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 -@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a - "jnz 1b \n\t" - - : -+ "+r" (i), // 0 -+ "+r" (n) // 1 - : -- "r" (i), // 0 -- "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap), // 4 -diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c -index a7478e3a8..ed672a757 100644 ---- a/kernel/x86_64/dgemv_t_4.c -+++ b/kernel/x86_64/dgemv_t_4.c -@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT - "movsd %%xmm11,8(%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap0), // 3 - "r" (ap1), // 4 -@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) - "movsd %%xmm10, (%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap), // 3 - "r" (x) // 4 -@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (&da), // 2 - "r" (src), // 3 - "r" (dest) // 4 -diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c -index 65305ac59..63697970f 100644 ---- a/kernel/x86_64/sgemv_n_4.c -+++ b/kernel/x86_64/sgemv_n_4.c -@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 -@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a - - "3: \n\t" - : -+ "+r" (i), // 0 -+ "+r" (n1) // 1 - : -- "r" (i), // 0 -- "r" (n1), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap), // 4 -@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) - "jnz 1b \n\t" - - : -+ "+r" (i), // 0 -+ "+r" (n) // 1 - : -- "r" (i), // 0 -- "r" (n), // 1 - "r" (src), // 2 - "r" (dest) // 3 - : "cc", -diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c -index 065e5b385..86ecaf516 100644 ---- a/kernel/x86_64/sgemv_t_4.c -+++ b/kernel/x86_64/sgemv_t_4.c -@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT - "movss %%xmm11,4(%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap0), // 3 - "r" (ap1), // 4 -@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) - "movss %%xmm10, (%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap), // 3 - "r" (x) // 4 -@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (&da), // 2 - "r" (src), // 3 - "r" (dest) // 4 - -From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 15:51:43 +0100 -Subject: [PATCH 2/4] Fix declaration of input arguments in inline assembly - -Argument 0 is modified as it doubles as a counter ---- - kernel/x86_64/dscal.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c -index ef9a0a6ba..d0d7801fd 100644 ---- a/kernel/x86_64/dscal.c -+++ b/kernel/x86_64/dscal.c -@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ - "jnz 1b \n\t" - - : -+ "+r" (n) // 0 - : -- "r" (n), // 0 - "r" (x), // 1 - "r" (x1), // 2 - "r" (alpha), // 3 - -From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 16:00:18 +0100 -Subject: [PATCH 3/4] Fix declaration of assembly arguments in SSYMV and DSYMV - microkernels - -Arguments 0 and 1 are both input and output ---- - kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++--- - kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++--- - kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++--- - kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++--- - 8 files changed, 24 insertions(+), 24 deletions(-) - -diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c -index d7166fe4b..ae287b6d8 100644 ---- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c -+++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c -@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c -index d83d20f8e..4778f644a 100644 ---- a/kernel/x86_64/dsymv_U_microk_haswell-2.c -+++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c -@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c -index 1344c75f7..065182286 100644 ---- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c -+++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c -@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "movsd %%xmm3 , 24(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c -index 1ef6fbafd..d84e703bd 100644 ---- a/kernel/x86_64/dsymv_U_microk_sandy-2.c -+++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c -@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c -index 8c01ab806..4a4f4d68d 100644 ---- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c -+++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c -@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c -index a32e59b44..e6a09ccf8 100644 ---- a/kernel/x86_64/ssymv_U_microk_haswell-2.c -+++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c -@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c -index b8e6ee732..c56ff3b15 100644 ---- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c -+++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c -@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "movss %%xmm3 , 12(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c -index e8650650c..c4919a39a 100644 ---- a/kernel/x86_64/ssymv_U_microk_sandy-2.c -+++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c -@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 - -From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 16:14:02 +0100 -Subject: [PATCH 4/4] Fix declaration of arguments in inline assembly - -Argument 0 is modified so should be input and output ---- - kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++-- - kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++-- - kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++-- - kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++---- - 8 files changed, 18 insertions(+), 18 deletions(-) - -diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c -index d84470cc4..bfa07b6d0 100644 ---- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c -+++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c -@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c -index 866782ee6..6241879d5 100644 ---- a/kernel/x86_64/dsymv_L_microk_haswell-2.c -+++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c -@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c -index 38479f77a..a161dcd8b 100644 ---- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c -+++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c -@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "movsd %%xmm3 , 24(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c -index b4e6ab369..b205b1019 100644 ---- a/kernel/x86_64/dsymv_L_microk_sandy-2.c -+++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c -@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c -index 9002228f3..602c3edf2 100644 ---- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c -+++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c -@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c -index 69db008b6..fdfe4349a 100644 ---- a/kernel/x86_64/ssymv_L_microk_haswell-2.c -+++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c -@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c -index c0fe5d640..6bb9c02f6 100644 ---- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c -+++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c -@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F - "movss %%xmm3 , 12(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c -index 093ca8073..0c78212e7 100644 ---- a/kernel/x86_64/ssymv_L_microk_sandy-2.c -+++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c -@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 diff --git a/2018.patch b/2018.patch deleted file mode 100644 index 594a4c4..0000000 --- a/2018.patch +++ /dev/null @@ -1,27 +0,0 @@ -From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001 -From: Bart Oldeman -Date: Thu, 14 Feb 2019 16:19:41 +0000 -Subject: [PATCH] dgemv_kernel_4x4(Haswell): add missing clobbers for - xmm0,xmm1,xmm2,xmm3 - -This fixes a crash in dblat2 when OpenBLAS is compiled using --march=znver1 -ftree-vectorize -O2 - -See also: -https://github.com/easybuilders/easybuild-easyconfigs/issues/7180 ---- - kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c -index 584a6c6b5..da0fa2fff 100644 ---- a/kernel/x86_64/dgemv_n_microk_haswell-4.c -+++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c -@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - "r" (ap[3]), // 7 - "r" (alpha) // 8 - : "cc", -+ "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", diff --git a/2019.patch b/2019.patch deleted file mode 100644 index a3aa674..0000000 --- a/2019.patch +++ /dev/null @@ -1,274 +0,0 @@ -From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 14 Feb 2019 22:43:18 +0100 -Subject: [PATCH 1/2] Save and restore input argument 8 (lda4) - -Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009) ---- - kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c -index 2c90f8aa9..e89a16785 100644 ---- a/kernel/x86_64/sgemv_n_microk_haswell-4.c -+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c -@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - - -- - #define HAVE_KERNEL_4x8 1 - static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); - -@@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vbroadcastss (%9), %%ymm6 \n\t" // alpha - -+ "movq %8, %%xmm10 \n\t" //save lda -+ - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - -@@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "4: \n\t" - "vzeroupper \n\t" -+ "movq %%xmm10, %8 \n\t" //restore lda - - : - "+r" (i), // 0 -@@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", -+ "%xmm10", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); -@@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - } - - -- - #define HAVE_KERNEL_4x4 1 - static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); - -@@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - - "vbroadcastss (%8), %%ymm6 \n\t" // alpha - -+ - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - -From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Fri, 15 Feb 2019 10:10:04 +0100 -Subject: [PATCH 2/2] Rename operands to put lda on the input/output constraint - list - ---- - kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------ - 1 file changed, 61 insertions(+), 65 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c -index e89a16785..93e1e26e8 100644 ---- a/kernel/x86_64/sgemv_n_microk_haswell-4.c -+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c -@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - __asm__ __volatile__ - ( - "vzeroupper \n\t" -- "vbroadcastss (%2), %%ymm12 \n\t" // x0 -- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 -- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 -- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 -- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 -- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 -- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 -- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 -+ "vbroadcastss (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 - - "vbroadcastss (%9), %%ymm6 \n\t" // alpha - -- "movq %8, %%xmm10 \n\t" //save lda -- - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - -- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y -+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" - "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" - -- "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" -- "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" -+ "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" - -- "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" -+ "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" - - "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" - "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" - "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" - -- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - -- "addq $4 , %8 \n\t" -+ "addq $4 , %2 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - -@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "testq $0x08, %1 \n\t" - "jz 3f \n\t" - -- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y -+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - -- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" -- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" -- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" -- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" -+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" - -- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" -- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" -+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" - - "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - -- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - -- "addq $8 , %8 \n\t" -+ "addq $8 , %2 \n\t" - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - -@@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y -- "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y -- -- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" -- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" -- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" -- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" -- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" -- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" -- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" -- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" -+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y -+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y -+ -+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" - "addq $16, %0 \n\t" -- "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" -- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" -- "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" -- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" -- "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" -+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" -+ "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" -+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" -+ "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" - -- "addq $16, %8 \n\t" -- "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y -+ "addq $16, %2 \n\t" -+ "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y - "subq $16, %1 \n\t" -- "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y - - "jnz 1b \n\t" - - "4: \n\t" - "vzeroupper \n\t" -- "movq %%xmm10, %8 \n\t" //restore lda - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", -@@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", -- "%xmm10", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); diff --git a/2021.patch b/2021.patch deleted file mode 100644 index 7724f38..0000000 --- a/2021.patch +++ /dev/null @@ -1,255 +0,0 @@ -From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Fri, 15 Feb 2019 15:08:16 +0100 -Subject: [PATCH] Fix wrong constraints in inline assembly - -for #2009 ---- - kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++------------- - 1 file changed, 49 insertions(+), 49 deletions(-) - -diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c -index fcab8e2c7..9ab78fc8e 100644 ---- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c -+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c -@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " cmpq $0, %0 \n\t" - " je 4f \n\t" - -- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a -- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 -- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 -+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a -+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 -+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 - - - " addq $8, %1 \n\t" -@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .p2align 4 \n\t" - "1: \n\t" - -- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a -+ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" - - " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" - " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - -- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 -+ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 - " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" - " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" - - " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" -- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 -+ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" - " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" - " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" -@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 22f \n\t" - -- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a -+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a - - " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" - " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" -- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 -+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 - " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" - " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" - - " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" -- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 -+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 - " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" - " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - -@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 - - " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" -- " vmovups (%9), %%ymm0 \n\t" -+ " vmovups (%3), %%ymm0 \n\t" - " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" - " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" - " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" -@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" - - " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" -- " vmovups 32(%9), %%ymm4 \n\t" -+ " vmovups 32(%3), %%ymm4 \n\t" - " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" - " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" - " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" -@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "5: \n\t" // i = 0 - -- " addq $64, %9 \n\t" // b=b+8 -+ " addq $64, %3 \n\t" // b=b+8 - - " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb -- " vmovups (%9), %%ymm0 \n\t" -- " vmovups %%ymm8 , (%8) \n\t" // write a -+ " vmovups (%3), %%ymm0 \n\t" -+ " vmovups %%ymm8 , (%2) \n\t" // write a - " vmovups %%ymm8 , (%4) \n\t" // write c - - " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" -- " vmovups 32(%9), %%ymm1 \n\t" -+ " vmovups 32(%3), %%ymm1 \n\t" - " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" - " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" -@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" - " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - - " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb -- " vmovups (%9), %%ymm0 \n\t" -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm9 , (%8) \n\t" // write a -+ " vmovups (%3), %%ymm0 \n\t" -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm9 , (%2) \n\t" // write a - " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c - - " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" -@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" - " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb -- " vmovups (%9), %%ymm0 \n\t" -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm10, (%8) \n\t" // write a -+ " vmovups (%3), %%ymm0 \n\t" -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm10, (%2) \n\t" // write a - " vmovups %%ymm10, (%4,%7,2) \n\t" // write c - - " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" -@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - - " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm11, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm11, (%2) \n\t" // write a - " vmovups %%ymm11, (%5) \n\t" // write c - - " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" -@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" - - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm12, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm12, (%2) \n\t" // write a - " vmovups %%ymm12, (%5,%7,1) \n\t" // write c - - " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" -@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" - " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm13, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm13, (%2) \n\t" // write a - " vmovups %%ymm13, (%5,%7,2) \n\t" // write c - - " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" -@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" - - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm14, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm14, (%2) \n\t" // write a - " vmovups %%ymm14, (%6) \n\t" // write c - - " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" - - " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" - -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $32, %2 \n\t" // a=a+8 - - " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb -- " vmovups %%ymm15, (%8) \n\t" // write a -+ " vmovups %%ymm15, (%2) \n\t" // write a - " vmovups %%ymm15, (%6,%7,1) \n\t" // write c - - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 - "r" (c), // 4 - "r" (c3), // 5 - "r" (c6), // 6 - "r" (ldc), // 7 -- "r" (as), // 8 -- "r" (bs) // 9 -+ "r" (a), // 8 -+ "r" (b) // 9 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/2023.patch b/2023.patch deleted file mode 100644 index 225a8a2..0000000 --- a/2023.patch +++ /dev/null @@ -1,874 +0,0 @@ -From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:24:11 +0100 -Subject: [PATCH 1/4] Fix inline assembly constraints - -rework indices to allow marking argument lda4 as input and output. For #2009 ---- - kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------ - 1 file changed, 27 insertions(+), 27 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c -index 11a3e943b..d21232bfa 100644 ---- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c -+++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c -@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - __asm__ __volatile__ - ( -- "movss (%2), %%xmm12 \n\t" // x0 -- "movss 4(%2), %%xmm13 \n\t" // x1 -- "movss 8(%2), %%xmm14 \n\t" // x2 -- "movss 12(%2), %%xmm15 \n\t" // x3 -+ "movss (%3), %%xmm12 \n\t" // x0 -+ "movss 4(%3), %%xmm13 \n\t" // x1 -+ "movss 8(%3), %%xmm14 \n\t" // x2 -+ "movss 12(%3), %%xmm15 \n\t" // x3 - "shufps $0, %%xmm12, %%xmm12\n\t" - "shufps $0, %%xmm13, %%xmm13\n\t" - "shufps $0, %%xmm14, %%xmm14\n\t" - "shufps $0, %%xmm15, %%xmm15\n\t" - -- "movss 16(%2), %%xmm0 \n\t" // x4 -- "movss 20(%2), %%xmm1 \n\t" // x5 -- "movss 24(%2), %%xmm2 \n\t" // x6 -- "movss 28(%2), %%xmm3 \n\t" // x7 -+ "movss 16(%3), %%xmm0 \n\t" // x4 -+ "movss 20(%3), %%xmm1 \n\t" // x5 -+ "movss 24(%3), %%xmm2 \n\t" // x6 -+ "movss 28(%3), %%xmm3 \n\t" // x7 - "shufps $0, %%xmm0 , %%xmm0 \n\t" - "shufps $0, %%xmm1 , %%xmm1 \n\t" - "shufps $0, %%xmm2 , %%xmm2 \n\t" -@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "1: \n\t" - "xorps %%xmm4 , %%xmm4 \n\t" - "xorps %%xmm5 , %%xmm5 \n\t" -- "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y -+ "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y - - ".p2align 1 \n\t" -- "movups (%4,%0,4), %%xmm8 \n\t" -- "movups (%5,%0,4), %%xmm9 \n\t" -- "movups (%6,%0,4), %%xmm10 \n\t" -- "movups (%7,%0,4), %%xmm11 \n\t" -+ "movups (%5,%0,4), %%xmm8 \n\t" -+ "movups (%6,%0,4), %%xmm9 \n\t" -+ "movups (%7,%0,4), %%xmm10 \n\t" -+ "movups (%8,%0,4), %%xmm11 \n\t" - ".p2align 1 \n\t" - "mulps %%xmm12, %%xmm8 \n\t" - "mulps %%xmm13, %%xmm9 \n\t" -@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "addps %%xmm10, %%xmm4 \n\t" - "addps %%xmm11, %%xmm5 \n\t" - -- "movups (%4,%8,4), %%xmm8 \n\t" -- "movups (%5,%8,4), %%xmm9 \n\t" -- "movups (%6,%8,4), %%xmm10 \n\t" -- "movups (%7,%8,4), %%xmm11 \n\t" -+ "movups (%5,%2,4), %%xmm8 \n\t" -+ "movups (%6,%2,4), %%xmm9 \n\t" -+ "movups (%7,%2,4), %%xmm10 \n\t" -+ "movups (%8,%2,4), %%xmm11 \n\t" - ".p2align 1 \n\t" - "mulps %%xmm0 , %%xmm8 \n\t" - "mulps %%xmm1 , %%xmm9 \n\t" -@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "addps %%xmm10, %%xmm4 \n\t" - "addps %%xmm11, %%xmm5 \n\t" - -- "addq $4 , %8 \n\t" -+ "addq $4 , %2 \n\t" - "addps %%xmm5 , %%xmm4 \n\t" - "addq $4 , %0 \n\t" - "mulps %%xmm6 , %%xmm4 \n\t" - "subq $4 , %1 \n\t" - "addps %%xmm4 , %%xmm7 \n\t" - -- "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y -+ "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y - - "jnz 1b \n\t" - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - -From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:36:39 +0100 -Subject: [PATCH 2/4] Fix inline assembly constraints - -rework indices to allow marking argument lda as input and output. ---- - kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++------------- - 1 file changed, 65 insertions(+), 65 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c -index b35daa35b..3fc46542b 100644 ---- a/kernel/x86_64/sgemv_n_microk_sandy-4.c -+++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c -@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - __asm__ __volatile__ - ( - "vzeroupper \n\t" -- "vbroadcastss (%2), %%ymm12 \n\t" // x0 -- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 -- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 -- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 -- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 -- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 -- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 -- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 -+ "vbroadcastss (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 - - "vbroadcastss (%9), %%ymm6 \n\t" // alpha - -@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" - "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" -- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y -+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - -- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" -- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" -- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" -- "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" -+ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" -+ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" -+ "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" -+ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" - "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" - "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - -- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" -- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" -- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" -- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" -+ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" -+ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" -+ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" -+ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" - "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" - "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" -@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" - "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" - -- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - -- "addq $4, %8 \n\t" -+ "addq $4, %2 \n\t" - "addq $4, %0 \n\t" - "subq $4, %1 \n\t" - -@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" -- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y -+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - -- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" -- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" -- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" -- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" -+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" -+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" -+ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" -+ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" -- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" -- "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" -- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" -+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" -+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" -+ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" -+ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" -@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" - "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" - -- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - -- "addq $8, %8 \n\t" -+ "addq $8, %2 \n\t" - "addq $8, %0 \n\t" - "subq $8, %1 \n\t" - -@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - -- "prefetcht0 192(%4,%0,4) \n\t" -- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" -- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" -- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" -- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" -+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" -+ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" -+ "prefetcht0 192(%6,%0,4) \n\t" -+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" -+ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "prefetcht0 192(%6,%0,4) \n\t" -- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" -- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" -- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" -- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" -+ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" -+ "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" -+ "prefetcht0 192(%8,%0,4) \n\t" -+ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" -+ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "prefetcht0 192(%4,%8,4) \n\t" -- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" -- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" -- "prefetcht0 192(%5,%8,4) \n\t" -- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" -- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" -+ "prefetcht0 192(%5,%2,4) \n\t" -+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" -+ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" -+ "prefetcht0 192(%6,%2,4) \n\t" -+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" -+ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "prefetcht0 192(%6,%8,4) \n\t" -- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" -- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" -- "prefetcht0 192(%7,%8,4) \n\t" -- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" -- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" -+ "prefetcht0 192(%7,%2,4) \n\t" -+ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" -+ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" -+ "prefetcht0 192(%8,%2,4) \n\t" -+ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" -+ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" -@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" - "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" - -- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y -- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y -+ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y -+ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y - -- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y -- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y - -- "addq $16, %8 \n\t" -+ "addq $16, %2 \n\t" - "addq $16, %0 \n\t" - "subq $16, %1 \n\t" - "jnz 1b \n\t" -@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - -From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:46:17 +0100 -Subject: [PATCH 3/4] Fix inline assembly constraints - ---- - kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++----------- - 1 file changed, 97 insertions(+), 97 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c -index 31001c7f3..bbf06c84b 100644 ---- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c -+++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c -@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - __asm__ __volatile__ - ( -- "vbroadcastss (%2), %%xmm12 \n\t" // x0 -- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 -- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 -- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 -- "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 -- "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 -- "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 -- "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 -+ "vbroadcastss (%3), %%xmm12 \n\t" // x0 -+ "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 -+ "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 -+ "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 -+ "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 -+ "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 -+ "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 -+ "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 - - "vbroadcastss (%9), %%xmm8 \n\t" // alpha - -@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" - "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - -- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" - "addq $4 , %0 \n\t" - -- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" -- "addq $4 , %8 \n\t" -+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" -+ "addq $4 , %2 \n\t" - - "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" -- "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" -+ "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" - "subq $4 , %1 \n\t" -- "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y - - "2: \n\t" - -@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" - "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - -- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" -- -- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" -+ -+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - -- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y -- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y -+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -+ "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y - - "addq $8 , %0 \n\t" -- "addq $8 , %8 \n\t" -+ "addq $8 , %2 \n\t" - "subq $8 , %1 \n\t" - - -@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" - "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" - -- "prefetcht0 192(%4,%0,4) \n\t" -- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" -- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" -- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" -- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" -+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" -+ "prefetcht0 192(%8,%0,4) \n\t" -+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" - ".align 2 \n\t" -- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" -- -- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" -- -- "prefetcht0 192(%4,%8,4) \n\t" -- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" -- "prefetcht0 192(%5,%8,4) \n\t" -- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "prefetcht0 192(%6,%8,4) \n\t" -- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" -- "prefetcht0 192(%7,%8,4) \n\t" -- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" -+ -+ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" -+ -+ "prefetcht0 192(%5,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" -+ "prefetcht0 192(%6,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "prefetcht0 192(%7,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" -+ "prefetcht0 192(%8,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - -- "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" - -- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -- "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" -- "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" -+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -+ "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" -+ "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" - - "addq $16, %0 \n\t" -- "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y -- "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y -- "addq $16, %8 \n\t" -- "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y -- "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y -+ "addq $16, %2 \n\t" -+ "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y - - "subq $16, %1 \n\t" - "jnz 1b \n\t" -@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - -From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:51:09 +0100 -Subject: [PATCH 4/4] Fix inline assembly constraints - ---- - dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++ - 1 file changed, 247 insertions(+) - create mode 100644 dgemv_n_microk_piledriver-4.c - -diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c -new file mode 100644 -index 000000000..466931b82 ---- /dev/null -+++ b/dgemv_n_microk_piledriver-4.c -@@ -0,0 +1,247 @@ -+/*************************************************************************** -+Copyright (c) 2014, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+ -+ -+#define HAVE_KERNEL_4x8 1 -+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -+ -+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -+{ -+ -+ BLASLONG register i = 0; -+ -+ __asm__ __volatile__ -+ ( -+ "vzeroupper \n\t" -+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 -+ -+ "vbroadcastsd (%9), %%ymm6 \n\t" // alpha -+ -+ "testq $0x04, %1 \n\t" -+ "jz 2f \n\t" -+ -+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" -+ -+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -+ -+ -+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y -+ -+ "addq $4 , %2 \n\t" -+ "addq $4 , %0 \n\t" -+ "subq $4 , %1 \n\t" -+ -+ "2: \n\t" -+ -+ "cmpq $0, %1 \n\t" -+ "je 3f \n\t" -+ -+ -+ ".align 16 \n\t" -+ "1: \n\t" -+ -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y -+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y -+ -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -+ "addq $8 , %0 \n\t" -+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" -+ -+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -+ -+ "addq $8 , %2 \n\t" -+ "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y -+ "subq $8 , %1 \n\t" -+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y -+ -+ "jnz 1b \n\t" -+ -+ "3: \n\t" -+ "vzeroupper \n\t" -+ -+ : -+ "+r" (i), // 0 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 -+ : -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 -+ "r" (alpha) // 9 -+ : "cc", -+ "%xmm0", "%xmm1", -+ "%xmm2", "%xmm3", -+ "%xmm4", "%xmm5", -+ "%xmm6", "%xmm7", -+ "%xmm8", "%xmm9", -+ "%xmm12", "%xmm13", "%xmm14", "%xmm15", -+ "memory" -+ ); -+ -+} -+ -+ -+ -+#define HAVE_KERNEL_4x4 1 -+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -+ -+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -+{ -+ -+ BLASLONG register i = 0; -+ -+ __asm__ __volatile__ -+ ( -+ "vzeroupper \n\t" -+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0 -+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 -+ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 -+ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 -+ -+ "vbroadcastsd (%8), %%ymm6 \n\t" // alpha -+ -+ "testq $0x04, %1 \n\t" -+ "jz 2f \n\t" -+ -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y -+ -+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -+ -+ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y -+ -+ "addq $4 , %0 \n\t" -+ "subq $4 , %1 \n\t" -+ -+ "2: \n\t" -+ -+ "cmpq $0, %1 \n\t" -+ "je 3f \n\t" -+ -+ -+ ".align 16 \n\t" -+ "1: \n\t" -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y -+ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y -+ -+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -+ -+ "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y -+ "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y -+ -+ "addq $8 , %0 \n\t" -+ "subq $8 , %1 \n\t" -+ "jnz 1b \n\t" -+ -+ "3: \n\t" -+ "vzeroupper \n\t" -+ -+ : -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : -+ "r" (x), // 2 -+ "r" (y), // 3 -+ "r" (ap[0]), // 4 -+ "r" (ap[1]), // 5 -+ "r" (ap[2]), // 6 -+ "r" (ap[3]), // 7 -+ "r" (alpha) // 8 -+ : "cc", -+ "%xmm4", "%xmm5", -+ "%xmm6", "%xmm7", -+ "%xmm8", "%xmm9", -+ "%xmm12", "%xmm13", "%xmm14", "%xmm15", -+ "memory" -+ ); -+ -+} -+ -+ diff --git a/2024.patch b/2024.patch deleted file mode 100644 index 720a9e2..0000000 --- a/2024.patch +++ /dev/null @@ -1,1349 +0,0 @@ -From f9bb76d29af48f448a8ab2bdfffc962d9623a3df Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 20:06:48 +0100 -Subject: [PATCH] Fix inline assembly constraints in Bulldozer TRSM kernels - -rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). For #2009 ---- - kernel/x86_64/dtrsm_kernel_RT_bulldozer.c | 96 ++++---- - kernel/x86_64/strsm_kernel_LN_bulldozer.c | 252 ++++++++++----------- - kernel/x86_64/strsm_kernel_LT_bulldozer.c | 256 +++++++++++----------- - kernel/x86_64/strsm_kernel_RN_bulldozer.c | 54 ++--- - kernel/x86_64/strsm_kernel_RT_bulldozer.c | 54 ++--- - 5 files changed, 356 insertions(+), 356 deletions(-) - -diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c -index 54df5b359..35ed4cc01 100644 ---- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c -+++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c -@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " prefetcht0 384(%3,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " prefetcht0 384(%7,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 2f \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 2f \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 2f \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" // i = 1 - -- " vmovddup (%7), %%xmm1 \n\t" // read b -- " vmovddup 8(%7), %%xmm0 \n\t" // read bb -+ " vmovddup (%3), %%xmm1 \n\t" // read b -+ " vmovddup 8(%3), %%xmm0 \n\t" // read bb - - " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb - " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb - " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb - " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - -- " vmovups %%xmm12 , (%6) \n\t" // write a -- " vmovups %%xmm13 , 16(%6) \n\t" // write a -- " vmovups %%xmm14 , 32(%6) \n\t" // write a -- " vmovups %%xmm15 , 48(%6) \n\t" // write a -+ " vmovups %%xmm12 , (%2) \n\t" // write a -+ " vmovups %%xmm13 , 16(%2) \n\t" // write a -+ " vmovups %%xmm14 , 32(%2) \n\t" // write a -+ " vmovups %%xmm15 , 48(%2) \n\t" // write a - - " vmovups %%xmm12 , (%5) \n\t" // write c1 - " vmovups %%xmm13 , 16(%5) \n\t" -@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" - - " \n\t" // i = 0 -- " subq $16 , %7 \n\t" // b = b - 2 -- " subq $64 , %6 \n\t" // a = a - 8 -+ " subq $16 , %3 \n\t" // b = b - 2 -+ " subq $64 , %2 \n\t" // a = a - 8 - -- " vmovddup (%7), %%xmm0 \n\t" // read bb -+ " vmovddup (%3), %%xmm0 \n\t" // read bb - - " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb - " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" - " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" - " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" - -- " vmovups %%xmm8 , (%6) \n\t" // write a -- " vmovups %%xmm9 , 16(%6) \n\t" -- " vmovups %%xmm10 , 32(%6) \n\t" -- " vmovups %%xmm11 , 48(%6) \n\t" -+ " vmovups %%xmm8 , (%2) \n\t" // write a -+ " vmovups %%xmm9 , 16(%2) \n\t" -+ " vmovups %%xmm10 , 32(%2) \n\t" -+ " vmovups %%xmm11 , 48(%2) \n\t" - - " vmovups %%xmm8 , (%4) \n\t" // write c0 - " vmovups %%xmm9 , 16(%4) \n\t" -@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c -index 1b8991c6c..3cd215000 100644 ---- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c -@@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" - -- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] -+ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] - " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] -+ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] - " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] -+ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] - " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] -+ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] - " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] -+ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] - " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] -+ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] - " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] -+ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] - " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] -+ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] - " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] -+ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] - " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] -+ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] - " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] -+ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] - " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] -+ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] - " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] -+ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] - " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] -+ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] - " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] -+ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] - " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] -+ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] - " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c -index 0623dddb0..a4a62491c 100644 ---- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c -@@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" - -- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] -+ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] - " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] -+ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] - " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] -+ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] - " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] -+ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] - " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] -+ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] - " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] -+ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] - " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] -+ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] - " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] -+ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] - " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] -+ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] - " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] -+ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] - " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] -+ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] - " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] -+ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] - " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] -+ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] - " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] -+ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] - " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] -+ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] - " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] -+ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] - " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 -- "r" (c), // 4 -- "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (c), // 4 -+ "r" (c1), // 5 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c -index 4cc557d55..c11c84cec 100644 ---- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c -@@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" // i = 0 - -- " vbroadcastss (%7), %%xmm0 \n\t" // read bb -- " vbroadcastss 4(%7), %%xmm1 \n\t" // read b -+ " vbroadcastss (%3), %%xmm0 \n\t" // read bb -+ " vbroadcastss 4(%3), %%xmm1 \n\t" // read b - - " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb - " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" - " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" - " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - -- " vmovups %%xmm8 , (%6) \n\t" // write a -- " vmovups %%xmm9 , 16(%6) \n\t" -- " vmovups %%xmm10 , 32(%6) \n\t" -- " vmovups %%xmm11 , 48(%6) \n\t" -+ " vmovups %%xmm8 , (%2) \n\t" // write a -+ " vmovups %%xmm9 , 16(%2) \n\t" -+ " vmovups %%xmm10 , 32(%2) \n\t" -+ " vmovups %%xmm11 , 48(%2) \n\t" - - " vmovups %%xmm8 , (%4) \n\t" // write c0 - " vmovups %%xmm9 , 16(%4) \n\t" -@@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" - - " \n\t" // i = 1 -- " addq $8 , %7 \n\t" // b = b + 2 -- " addq $64 , %6 \n\t" // a = a + 16 -+ " addq $8 , %3 \n\t" // b = b + 2 -+ " addq $64 , %2 \n\t" // a = a + 16 - -- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb -+ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb - - " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb - " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb - " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb - " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - -- " vmovups %%xmm12 , (%6) \n\t" // write a -- " vmovups %%xmm13 , 16(%6) \n\t" // write a -- " vmovups %%xmm14 , 32(%6) \n\t" // write a -- " vmovups %%xmm15 , 48(%6) \n\t" // write a -+ " vmovups %%xmm12 , (%2) \n\t" // write a -+ " vmovups %%xmm13 , 16(%2) \n\t" // write a -+ " vmovups %%xmm14 , 32(%2) \n\t" // write a -+ " vmovups %%xmm15 , 48(%2) \n\t" // write a - - " vmovups %%xmm12 , (%5) \n\t" // write c1 - " vmovups %%xmm13 , 16(%5) \n\t" -@@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 -- "r" (c), // 4 -- "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (c), // 4 -+ "r" (c1), // 5 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c -index 73f6e8a95..326ca2976 100644 ---- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c -@@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" // i = 1 - -- " vbroadcastss (%7), %%xmm1 \n\t" // read b -- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb -+ " vbroadcastss (%3), %%xmm1 \n\t" // read b -+ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb - - " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb - " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb - " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb - " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - -- " vmovups %%xmm12 , (%6) \n\t" // write a -- " vmovups %%xmm13 , 16(%6) \n\t" // write a -- " vmovups %%xmm14 , 32(%6) \n\t" // write a -- " vmovups %%xmm15 , 48(%6) \n\t" // write a -+ " vmovups %%xmm12 , (%2) \n\t" // write a -+ " vmovups %%xmm13 , 16(%2) \n\t" // write a -+ " vmovups %%xmm14 , 32(%2) \n\t" // write a -+ " vmovups %%xmm15 , 48(%2) \n\t" // write a - - " vmovups %%xmm12 , (%5) \n\t" // write c1 - " vmovups %%xmm13 , 16(%5) \n\t" -@@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" - - " \n\t" // i = 0 -- " subq $8 , %7 \n\t" // b = b - 2 -- " subq $64 , %6 \n\t" // a = a - 16 -+ " subq $8 , %3 \n\t" // b = b - 2 -+ " subq $64 , %2 \n\t" // a = a - 16 - -- " vbroadcastss (%7), %%xmm0 \n\t" // read bb -+ " vbroadcastss (%3), %%xmm0 \n\t" // read bb - - " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb - " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" - " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" - " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - -- " vmovups %%xmm8 , (%6) \n\t" // write a -- " vmovups %%xmm9 , 16(%6) \n\t" -- " vmovups %%xmm10 , 32(%6) \n\t" -- " vmovups %%xmm11 , 48(%6) \n\t" -+ " vmovups %%xmm8 , (%2) \n\t" // write a -+ " vmovups %%xmm9 , 16(%2) \n\t" -+ " vmovups %%xmm10 , 32(%2) \n\t" -+ " vmovups %%xmm11 , 48(%2) \n\t" - - " vmovups %%xmm8 , (%4) \n\t" // write c0 - " vmovups %%xmm9 , 16(%4) \n\t" -@@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 -- "r" (c), // 4 -- "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (c), // 4 -+ "r" (c1), // 5 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/2028.patch b/2028.patch deleted file mode 100644 index 64d050f..0000000 --- a/2028.patch +++ /dev/null @@ -1,412 +0,0 @@ -From 6eee1beac524b5582a6c6de14d9d35a78c1ece74 Mon Sep 17 00:00:00 2001 -From: Andrew <16061801+brada4@users.noreply.github.com> -Date: Sun, 24 Feb 2019 20:41:02 +0200 -Subject: [PATCH 2/2] move fix to right place - ---- - dgemv_n_microk_piledriver-4.c | 247 -------------------- - kernel/x86_64/dgemv_n_microk_piledriver-4.c | 98 ++++---- - 2 files changed, 49 insertions(+), 296 deletions(-) - delete mode 100644 dgemv_n_microk_piledriver-4.c - -diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c -deleted file mode 100644 -index 466931b82..000000000 ---- a/dgemv_n_microk_piledriver-4.c -+++ /dev/null -@@ -1,247 +0,0 @@ --/*************************************************************************** --Copyright (c) 2014, The OpenBLAS Project --All rights reserved. --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are --met: --1. Redistributions of source code must retain the above copyright --notice, this list of conditions and the following disclaimer. --2. Redistributions in binary form must reproduce the above copyright --notice, this list of conditions and the following disclaimer in --the documentation and/or other materials provided with the --distribution. --3. Neither the name of the OpenBLAS project nor the names of --its contributors may be used to endorse or promote products --derived from this software without specific prior written permission. --THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" --AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE --IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE --ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE --LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR --SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER --CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, --OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE --USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --*****************************************************************************/ -- -- -- --#define HAVE_KERNEL_4x8 1 --static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -- --static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) --{ -- -- BLASLONG register i = 0; -- -- __asm__ __volatile__ -- ( -- "vzeroupper \n\t" -- "vbroadcastsd (%3), %%ymm12 \n\t" // x0 -- "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 -- "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 -- "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 -- "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 -- "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 -- "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 -- "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 -- -- "vbroadcastsd (%9), %%ymm6 \n\t" // alpha -- -- "testq $0x04, %1 \n\t" -- "jz 2f \n\t" -- -- "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- -- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" -- -- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -- -- -- "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y -- -- "addq $4 , %2 \n\t" -- "addq $4 , %0 \n\t" -- "subq $4 , %1 \n\t" -- -- "2: \n\t" -- -- "cmpq $0, %1 \n\t" -- "je 3f \n\t" -- -- -- ".align 16 \n\t" -- "1: \n\t" -- -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y -- "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y -- -- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" -- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" -- "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -- "addq $8 , %0 \n\t" -- "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" -- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" -- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" -- "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" -- -- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -- -- "addq $8 , %2 \n\t" -- "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y -- "subq $8 , %1 \n\t" -- "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y -- -- "jnz 1b \n\t" -- -- "3: \n\t" -- "vzeroupper \n\t" -- -- : -- "+r" (i), // 0 -- "+r" (n), // 1 -- "+r" (lda4) // 2 -- : -- "r" (x), // 3 -- "r" (y), // 4 -- "r" (ap[0]), // 5 -- "r" (ap[1]), // 6 -- "r" (ap[2]), // 7 -- "r" (ap[3]), // 8 -- "r" (alpha) // 9 -- : "cc", -- "%xmm0", "%xmm1", -- "%xmm2", "%xmm3", -- "%xmm4", "%xmm5", -- "%xmm6", "%xmm7", -- "%xmm8", "%xmm9", -- "%xmm12", "%xmm13", "%xmm14", "%xmm15", -- "memory" -- ); -- --} -- -- -- --#define HAVE_KERNEL_4x4 1 --static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -- --static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) --{ -- -- BLASLONG register i = 0; -- -- __asm__ __volatile__ -- ( -- "vzeroupper \n\t" -- "vbroadcastsd (%2), %%ymm12 \n\t" // x0 -- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 -- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 -- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 -- -- "vbroadcastsd (%8), %%ymm6 \n\t" // alpha -- -- "testq $0x04, %1 \n\t" -- "jz 2f \n\t" -- -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y -- -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -- -- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y -- -- "addq $4 , %0 \n\t" -- "subq $4 , %1 \n\t" -- -- "2: \n\t" -- -- "cmpq $0, %1 \n\t" -- "je 3f \n\t" -- -- -- ".align 16 \n\t" -- "1: \n\t" -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y -- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y -- -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -- -- "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y -- "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y -- -- "addq $8 , %0 \n\t" -- "subq $8 , %1 \n\t" -- "jnz 1b \n\t" -- -- "3: \n\t" -- "vzeroupper \n\t" -- -- : -- "+r" (i), // 0 -- "+r" (n) // 1 -- : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (alpha) // 8 -- : "cc", -- "%xmm4", "%xmm5", -- "%xmm6", "%xmm7", -- "%xmm8", "%xmm9", -- "%xmm12", "%xmm13", "%xmm14", "%xmm15", -- "memory" -- ); -- --} -- -- -diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c -index 530780bab..466931b82 100644 ---- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c -+++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c -@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - __asm__ __volatile__ - ( - "vzeroupper \n\t" -- "vbroadcastsd (%2), %%ymm12 \n\t" // x0 -- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 -- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 -- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 -- "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 -- "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 -- "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 -- "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 -+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 - - "vbroadcastsd (%9), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - -- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y -+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - -- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" -- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - -- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y -+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - -- "addq $4 , %8 \n\t" -+ "addq $4 , %2 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - -@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y -- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y -- -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" -+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y -+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y -+ -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "addq $8 , %0 \n\t" -- "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" -- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" -- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - -- "addq $8 , %8 \n\t" -+ "addq $8 , %2 \n\t" - "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y - "subq $8 , %1 \n\t" -- "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y -+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y - - "jnz 1b \n\t" - -@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", diff --git a/openblas.spec b/openblas.spec index 45cc85f..e28699d 100644 --- a/openblas.spec +++ b/openblas.spec @@ -14,8 +14,8 @@ # "obsoleted" features are still kept in the spec. Name: openblas -Version: 0.3.5 -Release: 5%{?dist} +Version: 0.3.6 +Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -29,18 +29,6 @@ Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.2-tests.patch -# Fix assembly code -Patch10: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2010.patch -Patch11: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2018.patch -Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2019.patch -Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch -Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch -Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch -Patch16: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2028.patch -Patch17: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1965.patch -Patch18: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1966.patch -Patch19: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1967.patch - BuildRequires: gcc BuildRequires: gcc-gfortran BuildRequires: perl-devel @@ -251,17 +239,6 @@ cd OpenBLAS-%{version} %endif %patch3 -p1 -b .tests -%patch10 -p1 -%patch11 -p1 -%patch12 -p1 -%patch13 -p1 -%patch14 -p1 -%patch15 -p1 -%patch16 -p1 -%patch17 -p1 -%patch18 -p1 -%patch19 -p1 - # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -697,6 +674,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Tue Apr 30 2019 Susi Lehtola - 0.3.6-1 +- Update to 0.3.6. + * Tue Feb 26 2019 Susi Lehtola - 0.3.5-5 - Even more assembly kernel patches. diff --git a/sources b/sources index e303585..a1a5ace 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openblas-0.3.5.tar.gz) = 91b3074eb922453bf843158b4281cde65db9e8bbdd7590e75e9e6cdcb486157f7973f2936f327bb3eb4f1702ce0ba51ae6729d8d4baf2d986c50771e8f696df0 +SHA512 (openblas-0.3.6.tar.gz) = 1ad980176a51f70d8b0b2d158da8c01f30f77b7cf385b24a6340d3c5feb1513bd04b9390487d05cc9557db7dc5f7c135b1688dec9f17ebef35dba884ef7ddee9