openblas/2021.patch

From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Feb 2019 15:08:16 +0100
Subject: [PATCH] Fix wrong constraints in inline assembly

for #2009
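
The asm body advances the work pointers and counters in place (e.g. the
"addq $64, %3" / "addq $32, %2" steps below), but the old operand list
passed all of them as read-only inputs ("r"). That lets the compiler
assume those registers still hold their original values after the asm
statement, which is undefined behaviour and can miscompile at higher
optimisation levels. The fix moves the modified operands (n1, i, as, bs)
into the output list as read-write ("+r" / "+a") operands and keeps the
untouched base pointers a and b as plain inputs, renumbered to %8 / %9.

A minimal sketch of the same constraint pattern, independent of this
kernel (the sum4 helper is illustrative only, not OpenBLAS code):

    /* Sketch only: any operand the asm body modifies must be declared
     * read-write ("+r"), never input-only ("r"). */
    #include <stdio.h>

    static double sum4(const double *p)
    {
        double s = 0.0;
        long   i = 4;
        __asm__ __volatile__ (
            "1:                        \n\t"
            " addsd  (%[p]), %[s]      \n\t"
            " addq   $8, %[p]          \n\t"  /* pointer changes inside the asm, */
            " decq   %[i]              \n\t"  /* and so does the counter,        */
            " jnz    1b                \n\t"
            : [s] "+x" (s), [p] "+r" (p), [i] "+r" (i)  /* hence "+", not "r" */
            :
            : "cc");
        return s;
    }

    int main(void)
    {
        double v[4] = { 1.0, 2.0, 3.0, 4.0 };
        printf("%f\n", sum4(v));   /* prints 10.000000 */
        return 0;
    }

With input-only "r" constraints on p and i, the compiler would be free
to reuse the stale register contents after the statement; "+r" tells it
the values are both consumed and produced.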
---
 kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++-------------
 1 file changed, 49 insertions(+), 49 deletions(-)

diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
index fcab8e2c7..9ab78fc8e 100644
--- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c
+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" cmpq $0, %0 \n\t"
" je 4f \n\t"
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" addq $8, %1 \n\t"
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .p2align 4 \n\t"
"1: \n\t"
- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a
+ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
+ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
+ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 22f \n\t"
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
- " vmovups (%9), %%ymm0 \n\t"
+ " vmovups (%3), %%ymm0 \n\t"
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
- " vmovups 32(%9), %%ymm4 \n\t"
+ " vmovups 32(%3), %%ymm4 \n\t"
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"5: \n\t" // i = 0
- " addq $64, %9 \n\t" // b=b+8
+ " addq $64, %3 \n\t" // b=b+8
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
- " vmovups (%9), %%ymm0 \n\t"
- " vmovups %%ymm8 , (%8) \n\t" // write a
+ " vmovups (%3), %%ymm0 \n\t"
+ " vmovups %%ymm8 , (%2) \n\t" // write a
" vmovups %%ymm8 , (%4) \n\t" // write c
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
- " vmovups 32(%9), %%ymm1 \n\t"
+ " vmovups 32(%3), %%ymm1 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
- " vmovups (%9), %%ymm0 \n\t"
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm9 , (%8) \n\t" // write a
+ " vmovups (%3), %%ymm0 \n\t"
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm9 , (%2) \n\t" // write a
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
- " vmovups (%9), %%ymm0 \n\t"
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm10, (%8) \n\t" // write a
+ " vmovups (%3), %%ymm0 \n\t"
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm10, (%2) \n\t" // write a
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm11, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm11, (%2) \n\t" // write a
" vmovups %%ymm11, (%5) \n\t" // write c
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm12, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm12, (%2) \n\t" // write a
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm13, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm13, (%2) \n\t" // write a
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm14, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm14, (%2) \n\t" // write a
" vmovups %%ymm14, (%6) \n\t" // write c
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
- " addq $32, %8 \n\t" // a=a+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
- " vmovups %%ymm15, (%8) \n\t" // write a
+ " vmovups %%ymm15, (%2) \n\t" // write a
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
" vzeroupper \n\t"
:
+ "+r" (n1), // 0
+ "+a" (i), // 1
+ "+r" (as), // 2
+ "+r" (bs) // 3
:
- "r" (n1), // 0
- "a" (i), // 1
- "r" (a), // 2
- "r" (b), // 3
"r" (c), // 4
"r" (c3), // 5
"r" (c6), // 6
"r" (ldc), // 7
- "r" (as), // 8
- "r" (bs) // 9
+ "r" (a), // 8
+ "r" (b) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",