Update to 0.3.6.

This commit is contained in:
Susi Lehtola 2019-04-30 12:00:26 +02:00
parent 4e591d8725
commit 64c2df1d85
13 changed files with 7 additions and 8058 deletions

1
.gitignore vendored
View File

@@ -15,3 +15,4 @@
/v0.3.0.tar.gz
/v0.3.1.tar.gz
/openblas-0.3.2.tar.gz
/openblas-0.3.6.tar.gz

3283
1965.patch

File diff suppressed because it is too large Load Diff

View File

@@ -1,960 +0,0 @@
From 63cdd8f4a04f3a5ac1733e202b6b3678c34fb8dd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 16 Jan 2019 23:27:38 +0100
Subject: [PATCH 01/18] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/cscal_microk_bulldozer-2.c | 32 ++++++++++++------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c
index 3abffc4cf..f526fd611 100644
--- a/kernel/x86_64/cscal_microk_bulldozer-2.c
+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c
@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
From b6136be686e415fbdb035267c5020cb08e4e49ac Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 16 Jan 2019 23:30:03 +0100
Subject: [PATCH 02/18] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/cscal_microk_haswell-2.c | 30 +++++++++++++-------------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c
index 0a4eb683c..8623dcd10 100644
--- a/kernel/x86_64/cscal_microk_haswell-2.c
+++ b/kernel/x86_64/cscal_microk_haswell-2.c
@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"0", "1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
: "cc", // "0", "1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
: "cc", //"%0", "%1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
- :
- :
- "r" (n), // 0
- "r" (x), // 1
+ :
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"0", "1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
From f447fb4c54870710cd6304553df59f50ff51b8f5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 16 Jan 2019 23:32:48 +0100
Subject: [PATCH 03/18] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++-----------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c
index 8346e1748..fbeb857e2 100644
--- a/kernel/x86_64/cscal_microk_steamroller-2.c
+++ b/kernel/x86_64/cscal_microk_steamroller-2.c
@@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"0", "1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
+ :
+ "+r" (n), // 0
+ "+r" (x), // 1
:
- :
- "r" (n), // 0
- "r" (x), // 1
"r" (alpha) // 2
- : "cc", //"0", "1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"0", "1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
From fcd7fde5702cf7270332a5dd747f83efe7be93dd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 16 Jan 2019 23:35:18 +0100
Subject: [PATCH 04/18] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c
index de53b0bc4..71d3a9846 100644
--- a/kernel/x86_64/dscal_microk_bulldozer-2.c
+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c
@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n1), // 0
- "r" (x), // 1
+ "+r" (n1), // 0
+ "+r" (x), // 1
+ :
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n1), // 0
- "r" (x), // 1
+ "+r" (n1), // 0
+ "+r" (x), // 1
+ :
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
From 05e961994401bfc6dc8639fa9bc159148569ca9d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 16 Jan 2019 23:36:37 +0100
Subject: [PATCH 05/18] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c
index e732a2718..90790cfdc 100644
--- a/kernel/x86_64/dscal_microk_haswell-2.c
+++ b/kernel/x86_64/dscal_microk_haswell-2.c
@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n1), // 0
- "r" (x), // 1
+ "+r" (n1), // 0
+ "+r" (x), // 1
+ :
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
+ :
+ "+r" (n1), // 0
+ "+r" (x), // 1
:
- :
- "r" (n1), // 0
- "r" (x), // 1
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
From 7a11cc5b9f7c9669ee1f9818a1ea3f44c2f6d98d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 16 Jan 2019 23:37:49 +0100
Subject: [PATCH 06/18] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c
index 8d855072b..0f187ba88 100644
--- a/kernel/x86_64/dscal_microk_sandy-2.c
+++ b/kernel/x86_64/dscal_microk_sandy-2.c
@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n1), // 0
- "r" (x), // 1
+ "+r" (n1), // 0
+ "+r" (x), // 1
+ :
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
+ :
+ "+r" (n1), // 0
+ "+r" (x), // 1
:
- :
- "r" (n1), // 0
- "r" (x), // 1
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
From a6c06bffe1ec60ec359b300b8cc9e18b30c72d0d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 16 Jan 2019 23:40:28 +0100
Subject: [PATCH 07/18] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c
index 03882d6b6..1ce59d2c7 100644
--- a/kernel/x86_64/zscal_microk_bulldozer-2.c
+++ b/kernel/x86_64/zscal_microk_bulldozer-2.c
@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
From 5efc7ce079fd87de9ab7ca20aaaf8c5c627170fa Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 16 Jan 2019 23:42:34 +0100
Subject: [PATCH 08/18] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++++-------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c
index d9253c1ed..534370959 100644
--- a/kernel/x86_64/zscal_microk_haswell-2.c
+++ b/kernel/x86_64/zscal_microk_haswell-2.c
@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
From 1a1471c6be597a176a4dbfe2757c134eb3780af0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 16 Jan 2019 23:44:42 +0100
Subject: [PATCH 09/18] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++-----------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c
index 97b07add6..4b489d9f3 100644
--- a/kernel/x86_64/zscal_microk_steamroller-2.c
+++ b/kernel/x86_64/zscal_microk_steamroller-2.c
@@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
+ :
+ "+r" (n), // 0
+ "+r" (x), // 1
:
- :
- "r" (n), // 0
- "r" (x), // 1
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x), // 1
+ :
"r" (alpha) // 2
- : "cc", //"%0", "%1",
+ : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
From 90e28665183cd8da3a6129016977f57dd415c6a9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:38:20 +0100
Subject: [PATCH 10/18] Remove stray comma
---
kernel/x86_64/cscal_microk_bulldozer-2.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c
index f526fd611..31451aa6c 100644
--- a/kernel/x86_64/cscal_microk_bulldozer-2.c
+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c
@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
From b8dd71bddcb41d3d88af1a1eb77f845760452f5f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:39:23 +0100
Subject: [PATCH 11/18] Remove stray comma
---
kernel/x86_64/cscal_microk_haswell-2.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c
index 8623dcd10..a04a4c4ab 100644
--- a/kernel/x86_64/cscal_microk_haswell-2.c
+++ b/kernel/x86_64/cscal_microk_haswell-2.c
@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc", // "0", "1",
@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
From 8c9a6356eaba102124147856422b9a0570daeb55 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:40:25 +0100
Subject: [PATCH 12/18] Remove stray comma
---
kernel/x86_64/cscal_microk_steamroller-2.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c
index fbeb857e2..e8073d485 100644
--- a/kernel/x86_64/cscal_microk_steamroller-2.c
+++ b/kernel/x86_64/cscal_microk_steamroller-2.c
@@ -118,7 +118,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -210,7 +210,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -287,7 +287,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -332,7 +332,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
From ebe8882eb23e88d410f824d8d6a113f0fca94a3b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:41:27 +0100
Subject: [PATCH 13/18] Remove stray comma
---
kernel/x86_64/dscal_microk_bulldozer-2.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c
index 71d3a9846..096662781 100644
--- a/kernel/x86_64/dscal_microk_bulldozer-2.c
+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c
@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n1), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n1), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
From fd3e2c862286019589530ece0a61be6d86a01e92 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:42:12 +0100
Subject: [PATCH 14/18] Remove stray comma
---
kernel/x86_64/dscal_microk_sandy-2.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c
index 0f187ba88..9982b8e58 100644
--- a/kernel/x86_64/dscal_microk_sandy-2.c
+++ b/kernel/x86_64/dscal_microk_sandy-2.c
@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n1), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n1), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
From 45339034256043b4405fd6330f918cbed3660ac4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:43:14 +0100
Subject: [PATCH 15/18] Remove stray comma
---
kernel/x86_64/dscal_microk_haswell-2.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c
index 90790cfdc..77ed59a4e 100644
--- a/kernel/x86_64/dscal_microk_haswell-2.c
+++ b/kernel/x86_64/dscal_microk_haswell-2.c
@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n1), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n1), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
From 3b0b5ce0f69a45753b126d8bd96a48de2f882a4c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:46:05 +0100
Subject: [PATCH 16/18] Remove stray comma
---
kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c
index 1ce59d2c7..5e733ffda 100644
--- a/kernel/x86_64/zscal_microk_bulldozer-2.c
+++ b/kernel/x86_64/zscal_microk_bulldozer-2.c
@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
:
- :
- "r" (n), // 0
- "r" (x), // 1
+ "+r" (n), // 0
+ "+r" (x) // 1
+ :
"r" (alpha) // 2
: "cc", //"%0", "%1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"
+ :
+ "+r" (n), // 0
+ "+r" (x) // 1
:
- :
- "r" (n), // 0
- "r" (x), // 1
"r" (alpha) // 2
: "cc", //"%0", "%1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
From c17d2f61c2387b5a6cfab22d964d70afcce69b23 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:47:12 +0100
Subject: [PATCH 17/18] Remove stray comma
---
kernel/x86_64/zscal_microk_haswell-2.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c
index 534370959..8c8f5b75c 100644
--- a/kernel/x86_64/zscal_microk_haswell-2.c
+++ b/kernel/x86_64/zscal_microk_haswell-2.c
@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -286,7 +286,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -331,7 +331,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
From ccb2b2175751037b5625b4ec3c60ddca26a04394 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:48:40 +0100
Subject: [PATCH 18/18] Remove stray comma
---
kernel/x86_64/zscal_microk_steamroller-2.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c
index 4b489d9f3..c9267ee0c 100644
--- a/kernel/x86_64/zscal_microk_steamroller-2.c
+++ b/kernel/x86_64/zscal_microk_steamroller-2.c
@@ -118,7 +118,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -210,7 +210,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -287,7 +287,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",
@@ -332,7 +332,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
:
"+r" (n), // 0
- "+r" (x), // 1
+ "+r" (x) // 1
:
"r" (alpha) // 2
: "cc",

View File

@@ -1,99 +0,0 @@
From 7ff08e4b06e2c643829b566a4f2c1daba25b1029 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 00:04:44 +0100
Subject: [PATCH 1/4] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/dger_microk_sandy-2.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c
index 2bf966a5f..944d4c6f1 100644
--- a/kernel/x86_64/dger_microk_sandy-2.c
+++ b/kernel/x86_64/dger_microk_sandy-2.c
@@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n), // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
From 003583675d31ce5ddabfede7fc0f93cfbac51e5f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 00:05:47 +0100
Subject: [PATCH 2/4] Tag arguments 0 and 1 as both input and output
---
kernel/x86_64/sger_microk_sandy-2.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c
index 79180b991..d38fdd551 100644
--- a/kernel/x86_64/sger_microk_sandy-2.c
+++ b/kernel/x86_64/sger_microk_sandy-2.c
@@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n), // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
From 78aeb19e4613104c1ae8ea1c67022451dcfed7e6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:34:12 +0100
Subject: [PATCH 3/4] Remove stray comma
---
kernel/x86_64/sger_microk_sandy-2.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c
index d38fdd551..14f13475b 100644
--- a/kernel/x86_64/sger_microk_sandy-2.c
+++ b/kernel/x86_64/sger_microk_sandy-2.c
@@ -106,7 +106,7 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
:
"+r" (i), // 0
- "+r" (n), // 1
+ "+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
From d3e7e25bfb73e16bdbf89ee07d0ab584339be2a0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 17 Jan 2019 09:35:56 +0100
Subject: [PATCH 4/4] Remove stray comma
---
kernel/x86_64/dger_microk_sandy-2.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c
index 944d4c6f1..e8494500f 100644
--- a/kernel/x86_64/dger_microk_sandy-2.c
+++ b/kernel/x86_64/dger_microk_sandy-2.c
@@ -106,7 +106,7 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
:
"+r" (i), // 0
- "+r" (n), // 1
+ "+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3

View File

@@ -1,499 +0,0 @@
From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Feb 2019 15:33:48 +0100
Subject: [PATCH 1/4] Fix declaration of input arguments in the x86_64
s/dGEMV_T and s/dGEMV_N kernels
Arguments 0 and 1 need to be tagged as both input and output
---
kernel/x86_64/dgemv_n_4.c | 10 +++++-----
kernel/x86_64/dgemv_t_4.c | 18 +++++++++---------
kernel/x86_64/sgemv_n_4.c | 14 +++++++-------
kernel/x86_64/sgemv_t_4.c | 18 +++++++++---------
4 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c
index 6d2530e81..6d33641e9 100644
--- a/kernel/x86_64/dgemv_n_4.c
+++ b/kernel/x86_64/dgemv_n_4.c
@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"jnz 1b \n\t"
:
+ "+r" (i), // 0
+ "+r" (n) // 1
:
- "r" (i), // 0
- "r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c
index a7478e3a8..ed672a757 100644
--- a/kernel/x86_64/dgemv_t_4.c
+++ b/kernel/x86_64/dgemv_t_4.c
@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movsd %%xmm11,8(%2) \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movsd %%xmm10, (%2) \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index 65305ac59..63697970f 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"3: \n\t"
:
+ "+r" (i), // 0
+ "+r" (n1) // 1
:
- "r" (i), // 0
- "r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
"jnz 1b \n\t"
:
+ "+r" (i), // 0
+ "+r" (n) // 1
:
- "r" (i), // 0
- "r" (n), // 1
"r" (src), // 2
"r" (dest) // 3
: "cc",
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
index 065e5b385..86ecaf516 100644
--- a/kernel/x86_64/sgemv_t_4.c
+++ b/kernel/x86_64/sgemv_t_4.c
@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movss %%xmm11,4(%2) \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movss %%xmm10, (%2) \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4
From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Feb 2019 15:51:43 +0100
Subject: [PATCH 2/4] Fix declaration of input arguments in inline assembly
Argument 0 is modified as it doubles as a counter
---
kernel/x86_64/dscal.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c
index ef9a0a6ba..d0d7801fd 100644
--- a/kernel/x86_64/dscal.c
+++ b/kernel/x86_64/dscal.c
@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
"jnz 1b \n\t"
:
+ "+r" (n) // 0
:
- "r" (n), // 0
"r" (x), // 1
"r" (x1), // 2
"r" (alpha), // 3
From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Feb 2019 16:00:18 +0100
Subject: [PATCH 3/4] Fix declaration of assembly arguments in SSYMV and DSYMV
microkernels
Arguments 0 and 1 are both input and output
---
kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++---
kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++---
kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++---
kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++---
kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++---
kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++---
kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++---
kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++---
8 files changed, 24 insertions(+), 24 deletions(-)
diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
index d7166fe4b..ae287b6d8 100644
--- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
+++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c
index d83d20f8e..4778f644a 100644
--- a/kernel/x86_64/dsymv_U_microk_haswell-2.c
+++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c
@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
index 1344c75f7..065182286 100644
--- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c
+++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c
index 1ef6fbafd..d84e703bd 100644
--- a/kernel/x86_64/dsymv_U_microk_sandy-2.c
+++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c
@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
index 8c01ab806..4a4f4d68d 100644
--- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
+++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c
index a32e59b44..e6a09ccf8 100644
--- a/kernel/x86_64/ssymv_U_microk_haswell-2.c
+++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c
@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
index b8e6ee732..c56ff3b15 100644
--- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c
+++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movss %%xmm3 , 12(%9) \n\t" // save temp2
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c
index e8650650c..c4919a39a 100644
--- a/kernel/x86_64/ssymv_U_microk_sandy-2.c
+++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c
@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Feb 2019 16:14:02 +0100
Subject: [PATCH 4/4] Fix declaration of arguments in inline assembly
Argument 0 is modified so should be input and output
---
kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++--
kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++--
kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++--
kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++--
kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++--
kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++--
kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++--
kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++----
8 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
index d84470cc4..bfa07b6d0 100644
--- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
+++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c
index 866782ee6..6241879d5 100644
--- a/kernel/x86_64/dsymv_L_microk_haswell-2.c
+++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c
@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
index 38479f77a..a161dcd8b 100644
--- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c
+++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c
index b4e6ab369..b205b1019 100644
--- a/kernel/x86_64/dsymv_L_microk_sandy-2.c
+++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c
@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
index 9002228f3..602c3edf2 100644
--- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
+++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c
index 69db008b6..fdfe4349a 100644
--- a/kernel/x86_64/ssymv_L_microk_haswell-2.c
+++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c
@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
index c0fe5d640..6bb9c02f6 100644
--- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c
+++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
"movss %%xmm3 , 12(%9) \n\t" // save temp2
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c
index 093ca8073..0c78212e7 100644
--- a/kernel/x86_64/ssymv_L_microk_sandy-2.c
+++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c
@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

View File

@@ -1,27 +0,0 @@
From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001
From: Bart Oldeman <bart.oldeman@calculquebec.ca>
Date: Thu, 14 Feb 2019 16:19:41 +0000
Subject: [PATCH] dgemv_kernel_4x4(Haswell): add missing clobbers for
xmm0,xmm1,xmm2,xmm3
This fixes a crash in dblat2 when OpenBLAS is compiled using
-march=znver1 -ftree-vectorize -O2
See also:
https://github.com/easybuilders/easybuild-easyconfigs/issues/7180
---
kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c
index 584a6c6b5..da0fa2fff 100644
--- a/kernel/x86_64/dgemv_n_microk_haswell-4.c
+++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c
@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",

View File

@@ -1,274 +0,0 @@
From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 14 Feb 2019 22:43:18 +0100
Subject: [PATCH 1/2] Save and restore input argument 8 (lda4)
Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009)
---
kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
index 2c90f8aa9..e89a16785 100644
--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
@@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
+ "movq %8, %%xmm10 \n\t" //save lda
+
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
@@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"4: \n\t"
"vzeroupper \n\t"
+ "movq %%xmm10, %8 \n\t" //restore lda
:
"+r" (i), // 0
@@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
+ "%xmm10",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
@@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
}
-
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
@@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
+
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Feb 2019 10:10:04 +0100
Subject: [PATCH 2/2] Rename operands to put lda on the input/output constraint
list
---
kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------
1 file changed, 61 insertions(+), 65 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
index e89a16785..93e1e26e8 100644
--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
+ "vbroadcastss (%3), %%ymm12 \n\t" // x0
+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
- "movq %8, %%xmm10 \n\t" //save lda
-
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
- "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
- "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
+ "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
- "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
- "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
- "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
- "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+ "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
- "addq $4 , %8 \n\t"
+ "addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"testq $0x08, %1 \n\t"
"jz 3f \n\t"
- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
- "addq $8 , %8 \n\t"
+ "addq $8 , %2 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
@@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
- "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
- "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
-
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
-
- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
+
+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"addq $16, %0 \n\t"
- "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
- "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
- "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
- "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
+ "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
+ "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
+ "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
- "addq $16, %8 \n\t"
- "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
+ "addq $16, %2 \n\t"
+ "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
"subq $16, %1 \n\t"
- "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
"jnz 1b \n\t"
"4: \n\t"
"vzeroupper \n\t"
- "movq %%xmm10, %8 \n\t" //restore lda
:
"+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
:
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
@@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
- "%xmm10",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

View File

@@ -1,255 +0,0 @@
From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Feb 2019 15:08:16 +0100
Subject: [PATCH] Fix wrong constraints in inline assembly
for #2009
---
kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++-------------
1 file changed, 49 insertions(+), 49 deletions(-)
diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
index fcab8e2c7..9ab78fc8e 100644
--- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c
+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" cmpq $0, %0 \n\t"
" je 4f \n\t"
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" addq $8, %1 \n\t"
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .p2align 4 \n\t"
"1: \n\t"
- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a
+ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
+ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
+ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 22f \n\t"
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
- " vmovups (%9), %%ymm0 \n\t"
+ " vmovups (%3), %%ymm0 \n\t"
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
- " vmovups 32(%9), %%ymm4 \n\t"
+ " vmovups 32(%3), %%ymm4 \n\t"
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"5: \n\t" // i = 0
- " addq $64, %9 \n\t" // b=b+8
+ " addq $64, %3 \n\t" // b=b+8
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
- " vmovups (%9), %%ymm0 \n\t"
- " vmovups %%ymm8 , (%8) \n\t" // write a
+ " vmovups (%3), %%ymm0 \n\t"
+ " vmovups %%ymm8 , (%2) \n\t" // write a
" vmovups %%ymm8 , (%4) \n\t" // write c
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
- " vmovups 32(%9), %%ymm1 \n\t"
+ " vmovups 32(%3), %%ymm1 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
- " vmovups (%9), %%ymm0 \n\t"
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm9 , (%8) \n\t" // write a
+ " vmovups (%3), %%ymm0 \n\t"
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm9 , (%2) \n\t" // write a
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
- " vmovups (%9), %%ymm0 \n\t"
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm10, (%8) \n\t" // write a
+ " vmovups (%3), %%ymm0 \n\t"
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm10, (%2) \n\t" // write a
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm11, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm11, (%2) \n\t" // write a
" vmovups %%ymm11, (%5) \n\t" // write c
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm12, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm12, (%2) \n\t" // write a
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm13, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm13, (%2) \n\t" // write a
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm14, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm14, (%2) \n\t" // write a
" vmovups %%ymm14, (%6) \n\t" // write c
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
- " addq $32, %8 \n\t" // a=a+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
- " vmovups %%ymm15, (%8) \n\t" // write a
+ " vmovups %%ymm15, (%2) \n\t" // write a
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
" vzeroupper \n\t"
:
+ "+r" (n1), // 0
+ "+a" (i), // 1
+ "+r" (as), // 2
+ "+r" (bs) // 3
:
- "r" (n1), // 0
- "a" (i), // 1
- "r" (a), // 2
- "r" (b), // 3
"r" (c), // 4
"r" (c3), // 5
"r" (c6), // 6
"r" (ldc), // 7
- "r" (as), // 8
- "r" (bs) // 9
+ "r" (a), // 8
+ "r" (b) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

View File

@@ -1,874 +0,0 @@
From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 16 Feb 2019 18:24:11 +0100
Subject: [PATCH 1/4] Fix inline assembly constraints
rework indices to allow marking argument lda4 as input and output. For #2009
---
kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------
1 file changed, 27 insertions(+), 27 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
index 11a3e943b..d21232bfa 100644
--- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c
+++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
- "movss (%2), %%xmm12 \n\t" // x0
- "movss 4(%2), %%xmm13 \n\t" // x1
- "movss 8(%2), %%xmm14 \n\t" // x2
- "movss 12(%2), %%xmm15 \n\t" // x3
+ "movss (%3), %%xmm12 \n\t" // x0
+ "movss 4(%3), %%xmm13 \n\t" // x1
+ "movss 8(%3), %%xmm14 \n\t" // x2
+ "movss 12(%3), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"
- "movss 16(%2), %%xmm0 \n\t" // x4
- "movss 20(%2), %%xmm1 \n\t" // x5
- "movss 24(%2), %%xmm2 \n\t" // x6
- "movss 28(%2), %%xmm3 \n\t" // x7
+ "movss 16(%3), %%xmm0 \n\t" // x4
+ "movss 20(%3), %%xmm1 \n\t" // x5
+ "movss 24(%3), %%xmm2 \n\t" // x6
+ "movss 28(%3), %%xmm3 \n\t" // x7
"shufps $0, %%xmm0 , %%xmm0 \n\t"
"shufps $0, %%xmm1 , %%xmm1 \n\t"
"shufps $0, %%xmm2 , %%xmm2 \n\t"
@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"1: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t"
- "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+ "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
".p2align 1 \n\t"
- "movups (%4,%0,4), %%xmm8 \n\t"
- "movups (%5,%0,4), %%xmm9 \n\t"
- "movups (%6,%0,4), %%xmm10 \n\t"
- "movups (%7,%0,4), %%xmm11 \n\t"
+ "movups (%5,%0,4), %%xmm8 \n\t"
+ "movups (%6,%0,4), %%xmm9 \n\t"
+ "movups (%7,%0,4), %%xmm10 \n\t"
+ "movups (%8,%0,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
- "movups (%4,%8,4), %%xmm8 \n\t"
- "movups (%5,%8,4), %%xmm9 \n\t"
- "movups (%6,%8,4), %%xmm10 \n\t"
- "movups (%7,%8,4), %%xmm11 \n\t"
+ "movups (%5,%2,4), %%xmm8 \n\t"
+ "movups (%6,%2,4), %%xmm9 \n\t"
+ "movups (%7,%2,4), %%xmm10 \n\t"
+ "movups (%8,%2,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm0 , %%xmm8 \n\t"
"mulps %%xmm1 , %%xmm9 \n\t"
@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
- "addq $4 , %8 \n\t"
+ "addq $4 , %2 \n\t"
"addps %%xmm5 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"mulps %%xmm6 , %%xmm4 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm4 , %%xmm7 \n\t"
- "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
+ "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
"jnz 1b \n\t"
:
"+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
:
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 16 Feb 2019 18:36:39 +0100
Subject: [PATCH 2/4] Fix inline assembly constraints
rework indices to allow marking argument lda as input and output.
---
kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++-------------
1 file changed, 65 insertions(+), 65 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c
index b35daa35b..3fc46542b 100644
--- a/kernel/x86_64/sgemv_n_microk_sandy-4.c
+++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c
@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
+ "vbroadcastss (%3), %%ymm12 \n\t" // x0
+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
- "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
+ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
+ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
+ "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
+ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
+ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
+ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
+ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
+ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
- "addq $4, %8 \n\t"
+ "addq $4, %2 \n\t"
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
+ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
- "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
- "addq $8, %8 \n\t"
+ "addq $8, %2 \n\t"
"addq $8, %0 \n\t"
"subq $8, %1 \n\t"
@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
- "prefetcht0 192(%4,%0,4) \n\t"
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
+ "prefetcht0 192(%6,%0,4) \n\t"
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "prefetcht0 192(%6,%0,4) \n\t"
- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
+ "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
+ "prefetcht0 192(%8,%0,4) \n\t"
+ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
+ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "prefetcht0 192(%4,%8,4) \n\t"
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
- "prefetcht0 192(%5,%8,4) \n\t"
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
+ "prefetcht0 192(%5,%2,4) \n\t"
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
+ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
+ "prefetcht0 192(%6,%2,4) \n\t"
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
+ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "prefetcht0 192(%6,%8,4) \n\t"
- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
- "prefetcht0 192(%7,%8,4) \n\t"
- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
+ "prefetcht0 192(%7,%2,4) \n\t"
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
+ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
+ "prefetcht0 192(%8,%2,4) \n\t"
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
+ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
+ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
+ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y
- "addq $16, %8 \n\t"
+ "addq $16, %2 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz 1b \n\t"
@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
:
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 16 Feb 2019 18:46:17 +0100
Subject: [PATCH 3/4] Fix inline assembly constraints
---
kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++-----------
1 file changed, 97 insertions(+), 97 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
index 31001c7f3..bbf06c84b 100644
--- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
+++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
- "vbroadcastss (%2), %%xmm12 \n\t" // x0
- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1
- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2
- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3
- "vbroadcastss 16(%2), %%xmm0 \n\t" // x4
- "vbroadcastss 20(%2), %%xmm1 \n\t" // x5
- "vbroadcastss 24(%2), %%xmm2 \n\t" // x6
- "vbroadcastss 28(%2), %%xmm3 \n\t" // x7
+ "vbroadcastss (%3), %%xmm12 \n\t" // x0
+ "vbroadcastss 4(%3), %%xmm13 \n\t" // x1
+ "vbroadcastss 8(%3), %%xmm14 \n\t" // x2
+ "vbroadcastss 12(%3), %%xmm15 \n\t" // x3
+ "vbroadcastss 16(%3), %%xmm0 \n\t" // x4
+ "vbroadcastss 20(%3), %%xmm1 \n\t" // x5
+ "vbroadcastss 24(%3), %%xmm2 \n\t" // x6
+ "vbroadcastss 28(%3), %%xmm3 \n\t" // x7
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
"addq $4 , %0 \n\t"
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
- "addq $4 , %8 \n\t"
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
+ "addq $4 , %2 \n\t"
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
- "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
+ "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"subq $4 , %1 \n\t"
- "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
"2: \n\t"
@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
-
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+ "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
"addq $8 , %0 \n\t"
- "addq $8 , %8 \n\t"
+ "addq $8 , %2 \n\t"
"subq $8 , %1 \n\t"
@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
- "prefetcht0 192(%4,%0,4) \n\t"
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
+ "prefetcht0 192(%8,%0,4) \n\t"
+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
".align 2 \n\t"
- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
-
- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
-
- "prefetcht0 192(%4,%8,4) \n\t"
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
- "prefetcht0 192(%5,%8,4) \n\t"
- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
- "prefetcht0 192(%6,%8,4) \n\t"
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
- "prefetcht0 192(%7,%8,4) \n\t"
- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
+
+ "prefetcht0 192(%5,%2,4) \n\t"
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
+ "prefetcht0 192(%6,%2,4) \n\t"
+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+ "prefetcht0 192(%7,%2,4) \n\t"
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
+ "prefetcht0 192(%8,%2,4) \n\t"
+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
- "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
- "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
- "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+ "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
+ "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"addq $16, %0 \n\t"
- "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
- "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
- "addq $16, %8 \n\t"
- "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
- "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
+ "addq $16, %2 \n\t"
+ "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
"subq $16, %1 \n\t"
"jnz 1b \n\t"
@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
:
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 16 Feb 2019 18:51:09 +0100
Subject: [PATCH 4/4] Fix inline assembly constraints
---
dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++
1 file changed, 247 insertions(+)
create mode 100644 dgemv_n_microk_piledriver-4.c
diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c
new file mode 100644
index 000000000..466931b82
--- /dev/null
+++ b/dgemv_n_microk_piledriver-4.c
@@ -0,0 +1,247 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+
+#define HAVE_KERNEL_4x8 1
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0
+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
+
+ "vbroadcastsd (%9), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz 2f \n\t"
+
+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
+
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
+
+
+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
+
+ "addq $4 , %2 \n\t"
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ "2: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je 3f \n\t"
+
+
+ ".align 16 \n\t"
+ "1: \n\t"
+
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
+
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
+ "addq $8 , %0 \n\t"
+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
+
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
+
+ "addq $8 , %2 \n\t"
+ "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
+ "subq $8 , %1 \n\t"
+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
+
+ "jnz 1b \n\t"
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ "+r" (i), // 0
+ "+r" (n), // 1
+ "+r" (lda4) // 2
+ :
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
+ "r" (alpha) // 9
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+#define HAVE_KERNEL_4x4 1
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0
+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
+ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
+ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
+
+ "vbroadcastsd (%8), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz 2f \n\t"
+
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
+
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
+
+ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ "2: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je 3f \n\t"
+
+
+ ".align 16 \n\t"
+ "1: \n\t"
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
+ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
+
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
+
+ "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
+ "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+

1349
2024.patch

File diff suppressed because it is too large Load Diff

View File

@ -1,412 +0,0 @@
From 6eee1beac524b5582a6c6de14d9d35a78c1ece74 Mon Sep 17 00:00:00 2001
From: Andrew <16061801+brada4@users.noreply.github.com>
Date: Sun, 24 Feb 2019 20:41:02 +0200
Subject: [PATCH 2/2] move fix to right place
---
dgemv_n_microk_piledriver-4.c | 247 --------------------
kernel/x86_64/dgemv_n_microk_piledriver-4.c | 98 ++++----
2 files changed, 49 insertions(+), 296 deletions(-)
delete mode 100644 dgemv_n_microk_piledriver-4.c
diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c
deleted file mode 100644
index 466931b82..000000000
--- a/dgemv_n_microk_piledriver-4.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-
-
-#define HAVE_KERNEL_4x8 1
-static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
-
-static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "vzeroupper \n\t"
- "vbroadcastsd (%3), %%ymm12 \n\t" // x0
- "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
- "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
- "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
- "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
- "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
- "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
- "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
-
- "vbroadcastsd (%9), %%ymm6 \n\t" // alpha
-
- "testq $0x04, %1 \n\t"
- "jz 2f \n\t"
-
- "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
-
- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
-
- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
-
- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
-
-
- "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
-
- "addq $4 , %2 \n\t"
- "addq $4 , %0 \n\t"
- "subq $4 , %1 \n\t"
-
- "2: \n\t"
-
- "cmpq $0, %1 \n\t"
- "je 3f \n\t"
-
-
- ".align 16 \n\t"
- "1: \n\t"
-
- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
- "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
- "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
-
- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
- "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
- "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
- "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
- "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
-
- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
- "addq $8 , %0 \n\t"
- "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
- "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
- "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
- "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
-
- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
-
- "addq $8 , %2 \n\t"
- "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
- "subq $8 , %1 \n\t"
- "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
-
- "jnz 1b \n\t"
-
- "3: \n\t"
- "vzeroupper \n\t"
-
- :
- "+r" (i), // 0
- "+r" (n), // 1
- "+r" (lda4) // 2
- :
- "r" (x), // 3
- "r" (y), // 4
- "r" (ap[0]), // 5
- "r" (ap[1]), // 6
- "r" (ap[2]), // 7
- "r" (ap[3]), // 8
- "r" (alpha) // 9
- : "cc",
- "%xmm0", "%xmm1",
- "%xmm2", "%xmm3",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
- "%xmm8", "%xmm9",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
-#define HAVE_KERNEL_4x4 1
-static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
-
-static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "vzeroupper \n\t"
- "vbroadcastsd (%2), %%ymm12 \n\t" // x0
- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
-
- "vbroadcastsd (%8), %%ymm6 \n\t" // alpha
-
- "testq $0x04, %1 \n\t"
- "jz 2f \n\t"
-
- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
-
- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
-
- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
-
- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
-
- "addq $4 , %0 \n\t"
- "subq $4 , %1 \n\t"
-
- "2: \n\t"
-
- "cmpq $0, %1 \n\t"
- "je 3f \n\t"
-
-
- ".align 16 \n\t"
- "1: \n\t"
- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
-
- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
-
- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
-
- "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
- "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
-
- "addq $8 , %0 \n\t"
- "subq $8 , %1 \n\t"
- "jnz 1b \n\t"
-
- "3: \n\t"
- "vzeroupper \n\t"
-
- :
- "+r" (i), // 0
- "+r" (n) // 1
- :
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (alpha) // 8
- : "cc",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
- "%xmm8", "%xmm9",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c
index 530780bab..466931b82 100644
--- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c
+++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c
@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
- "vbroadcastsd (%2), %%ymm12 \n\t" // x0
- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
- "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
- "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
- "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
- "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0
+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
- "addq $4 , %8 \n\t"
+ "addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
-
- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
-
- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
+
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
"addq $8 , %0 \n\t"
- "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
- "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
- "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
- "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
- "addq $8 , %8 \n\t"
+ "addq $8 , %2 \n\t"
- "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
+ "vmovupd %%ymm8,-64(%4,%0,8) \n\t" // 4 * y
"subq $8 , %1 \n\t"
- "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
"jnz 1b \n\t"
@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
:
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",

View File

@ -14,8 +14,8 @@
# "obsoleted" features are still kept in the spec.
Name: openblas
Version: 0.3.5
Release: 5%{?dist}
Version: 0.3.6
Release: 1%{?dist}
Summary: An optimized BLAS library based on GotoBLAS2
License: BSD
URL: https://github.com/xianyi/OpenBLAS/
@ -29,18 +29,6 @@ Patch2: openblas-0.2.15-constructor.patch
# Supply the proper flags to the test makefile
Patch3: openblas-0.3.2-tests.patch
# Fix assembly code
Patch10: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2010.patch
Patch11: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2018.patch
Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2019.patch
Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch
Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch
Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch
Patch16: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2028.patch
Patch17: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1965.patch
Patch18: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1966.patch
Patch19: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1967.patch
BuildRequires: gcc
BuildRequires: gcc-gfortran
BuildRequires: perl-devel
@ -251,17 +239,6 @@ cd OpenBLAS-%{version}
%endif
%patch3 -p1 -b .tests
%patch10 -p1
%patch11 -p1
%patch12 -p1
%patch13 -p1
%patch14 -p1
%patch15 -p1
%patch16 -p1
%patch17 -p1
%patch18 -p1
%patch19 -p1
# Fix source permissions
find -name \*.f -exec chmod 644 {} \;
@ -697,6 +674,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig
%endif
%changelog
* Tue Apr 30 2019 Susi Lehtola <jussilehtola@fedoraproject.org> - 0.3.6-1
- Update to 0.3.6.
* Tue Feb 26 2019 Susi Lehtola <jussilehtola@fedoraproject.org> - 0.3.5-5
- Even more assembly kernel patches.

View File

@ -1 +1 @@
SHA512 (openblas-0.3.5.tar.gz) = 91b3074eb922453bf843158b4281cde65db9e8bbdd7590e75e9e6cdcb486157f7973f2936f327bb3eb4f1702ce0ba51ae6729d8d4baf2d986c50771e8f696df0
SHA512 (openblas-0.3.6.tar.gz) = 1ad980176a51f70d8b0b2d158da8c01f30f77b7cf385b24a6340d3c5feb1513bd04b9390487d05cc9557db7dc5f7c135b1688dec9f17ebef35dba884ef7ddee9