openblas/2024.patch

1350 lines
82 KiB
Diff

From f9bb76d29af48f448a8ab2bdfffc962d9623a3df Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 16 Feb 2019 20:06:48 +0100
Subject: [PATCH] Fix inline assembly constraints in Bulldozer TRSM kernels
Rework the operand indices so that i, as and bs can be marked as both input and output (operand n1 is marked as well, for simplicity). For #2009
---
kernel/x86_64/dtrsm_kernel_RT_bulldozer.c | 96 ++++----
kernel/x86_64/strsm_kernel_LN_bulldozer.c | 252 ++++++++++-----------
kernel/x86_64/strsm_kernel_LT_bulldozer.c | 256 +++++++++++-----------
kernel/x86_64/strsm_kernel_RN_bulldozer.c | 54 ++---
kernel/x86_64/strsm_kernel_RT_bulldozer.c | 54 ++---
5 files changed, 356 insertions(+), 356 deletions(-)
diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c
index 54df5b359..35ed4cc01 100644
--- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c
+++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c
@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
- " prefetcht0 384(%2,%1,8) \n\t"
- " prefetcht0 384(%3,%1,8) \n\t"
- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
- " vmovups (%2,%1,8), %%xmm4 \n\t"
- " vmovddup 8(%3,%1,2), %%xmm1 \n\t"
- " vmovups 16(%2,%1,8), %%xmm5 \n\t"
- " vmovups 32(%2,%1,8), %%xmm6 \n\t"
- " vmovups 48(%2,%1,8), %%xmm7 \n\t"
+ " prefetcht0 384(%6,%1,8) \n\t"
+ " prefetcht0 384(%7,%1,8) \n\t"
+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
+ " vmovups (%6,%1,8), %%xmm4 \n\t"
+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t"
+ " vmovups 16(%6,%1,8), %%xmm5 \n\t"
+ " vmovups 32(%6,%1,8), %%xmm6 \n\t"
+ " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t"
- " prefetcht0 384(%2,%1,8) \n\t"
- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
- " vmovups (%2,%1,8), %%xmm4 \n\t"
- " vmovddup 8(%3,%1,2), %%xmm1 \n\t"
- " vmovups 16(%2,%1,8), %%xmm5 \n\t"
- " vmovups 32(%2,%1,8), %%xmm6 \n\t"
- " vmovups 48(%2,%1,8), %%xmm7 \n\t"
+ " prefetcht0 384(%6,%1,8) \n\t"
+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
+ " vmovups (%6,%1,8), %%xmm4 \n\t"
+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t"
+ " vmovups 16(%6,%1,8), %%xmm5 \n\t"
+ " vmovups 32(%6,%1,8), %%xmm6 \n\t"
+ " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t"
- " prefetcht0 384(%2,%1,8) \n\t"
- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
- " vmovups (%2,%1,8), %%xmm4 \n\t"
- " vmovddup 8(%3,%1,2), %%xmm1 \n\t"
- " vmovups 16(%2,%1,8), %%xmm5 \n\t"
- " vmovups 32(%2,%1,8), %%xmm6 \n\t"
- " vmovups 48(%2,%1,8), %%xmm7 \n\t"
+ " prefetcht0 384(%6,%1,8) \n\t"
+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
+ " vmovups (%6,%1,8), %%xmm4 \n\t"
+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t"
+ " vmovups 16(%6,%1,8), %%xmm5 \n\t"
+ " vmovups 32(%6,%1,8), %%xmm6 \n\t"
+ " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 2f \n\t"
- " prefetcht0 384(%2,%1,8) \n\t"
- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
- " vmovddup 8(%3,%1,2), %%xmm1 \n\t"
- " vmovups (%2,%1,8), %%xmm4 \n\t"
- " vmovups 16(%2,%1,8), %%xmm5 \n\t"
- " vmovups 32(%2,%1,8), %%xmm6 \n\t"
- " vmovups 48(%2,%1,8), %%xmm7 \n\t"
+ " prefetcht0 384(%6,%1,8) \n\t"
+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t"
+ " vmovups (%6,%1,8), %%xmm4 \n\t"
+ " vmovups 16(%6,%1,8), %%xmm5 \n\t"
+ " vmovups 32(%6,%1,8), %%xmm6 \n\t"
+ " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t" // i = 1
- " vmovddup (%7), %%xmm1 \n\t" // read b
- " vmovddup 8(%7), %%xmm0 \n\t" // read bb
+ " vmovddup (%3), %%xmm1 \n\t" // read b
+ " vmovddup 8(%3), %%xmm0 \n\t" // read bb
" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
- " vmovups %%xmm12 , (%6) \n\t" // write a
- " vmovups %%xmm13 , 16(%6) \n\t" // write a
- " vmovups %%xmm14 , 32(%6) \n\t" // write a
- " vmovups %%xmm15 , 48(%6) \n\t" // write a
+ " vmovups %%xmm12 , (%2) \n\t" // write a
+ " vmovups %%xmm13 , 16(%2) \n\t" // write a
+ " vmovups %%xmm14 , 32(%2) \n\t" // write a
+ " vmovups %%xmm15 , 48(%2) \n\t" // write a
" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
" \n\t" // i = 0
- " subq $16 , %7 \n\t" // b = b - 2
- " subq $64 , %6 \n\t" // a = a - 8
+ " subq $16 , %3 \n\t" // b = b - 2
+ " subq $64 , %2 \n\t" // a = a - 8
- " vmovddup (%7), %%xmm0 \n\t" // read bb
+ " vmovddup (%3), %%xmm0 \n\t" // read bb
" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"
- " vmovups %%xmm8 , (%6) \n\t" // write a
- " vmovups %%xmm9 , 16(%6) \n\t"
- " vmovups %%xmm10 , 32(%6) \n\t"
- " vmovups %%xmm11 , 48(%6) \n\t"
+ " vmovups %%xmm8 , (%2) \n\t" // write a
+ " vmovups %%xmm9 , 16(%2) \n\t"
+ " vmovups %%xmm10 , 32(%2) \n\t"
+ " vmovups %%xmm11 , 48(%2) \n\t"
" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"
:
+ "+r" (n1), // 0
+ "+a" (i), // 1
+ "+r" (as), // 2
+ "+r" (bs) // 3
:
- "r" (n1), // 0
- "a" (i), // 1
- "r" (a), // 2
- "r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
- "r" (as), // 6
- "r" (bs) // 7
+ "r" (a), // 6
+ "r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c
index 1b8991c6c..3cd215000 100644
--- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c
+++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c
@@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
- " vmovups (%2,%1,8), %%xmm4 \n\t"
- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
- " vmovups 16(%2,%1,8), %%xmm5 \n\t"
- " vmovups 32(%2,%1,8), %%xmm6 \n\t"
- " vmovups 48(%2,%1,8), %%xmm7 \n\t"
+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
+ " vmovups (%6,%1,8), %%xmm4 \n\t"
+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
+ " vmovups 16(%6,%1,8), %%xmm5 \n\t"
+ " vmovups 32(%6,%1,8), %%xmm6 \n\t"
+ " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t"
- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i]
+ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i]
" vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i]
+ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i]
" vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i]
+ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i]
" vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i]
+ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i]
" vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i]
+ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i]
" vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i]
+ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i]
" vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i]
+ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i]
" vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i]
+ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i]
" vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i]
+ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i]
" vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i]
+ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i]
" vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i]
+ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i]
" vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i]
+ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i]
" vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i]
+ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i]
" vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i]
+ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i]
" vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i]
+ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i]
" vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
- " subq $64 , %6 \n\t" // a -= m
- " subq $8 , %7 \n\t" // b -= n
+ " subq $64 , %2 \n\t" // a -= m
+ " subq $8 , %3 \n\t" // b -= n
- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i]
+ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i]
" vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vzeroupper \n\t"
:
+ "+r" (n1), // 0
+ "+a" (i), // 1
+ "+r" (as), // 2
+ "+r" (bs) // 3
:
- "r" (n1), // 0
- "a" (i), // 1
- "r" (a), // 2
- "r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
- "r" (as), // 6
- "r" (bs) // 7
+ "r" (a), // 6
+ "r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c
index 0623dddb0..a4a62491c 100644
--- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c
+++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c
@@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
- " vmovups (%2,%1,8), %%xmm4 \n\t"
- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
- " vmovups 16(%2,%1,8), %%xmm5 \n\t"
- " vmovups 32(%2,%1,8), %%xmm6 \n\t"
- " vmovups 48(%2,%1,8), %%xmm7 \n\t"
+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
+ " vmovups (%6,%1,8), %%xmm4 \n\t"
+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
+ " vmovups 16(%6,%1,8), %%xmm5 \n\t"
+ " vmovups 32(%6,%1,8), %%xmm6 \n\t"
+ " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t"
- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i]
+ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i]
" vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i]
+ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i]
" vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i]
+ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i]
" vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i]
+ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i]
" vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i]
+ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i]
" vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i]
+ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i]
" vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i]
+ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i]
" vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i]
+ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i]
" vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i]
+ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i]
" vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i]
+ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i]
" vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i]
+ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i]
" vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i]
+ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i]
" vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i]
+ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i]
" vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i]
+ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i]
" vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i]
+ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i]
" vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"
- " addq $64 , %6 \n\t" // a -= m
- " addq $8 , %7 \n\t" // b -= n
+ " addq $64 , %2 \n\t" // a -= m
+ " addq $8 , %3 \n\t" // b -= n
- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i]
+ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i]
" vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa
- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa
" vzeroupper \n\t"
:
+ "+r" (n1), // 0
+ "+a" (i), // 1
+ "+r" (as), // 2
+ "+r" (bs) // 3
:
- "r" (n1), // 0
- "a" (i), // 1
- "r" (a), // 2
- "r" (b), // 3
- "r" (c), // 4
- "r" (c1), // 5
- "r" (as), // 6
- "r" (bs) // 7
+ "r" (c), // 4
+ "r" (c1), // 5
+ "r" (a), // 6
+ "r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c
index 4cc557d55..c11c84cec 100644
--- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c
+++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c
@@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
- " vmovups (%2,%1,8), %%xmm4 \n\t"
- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
- " vmovups 16(%2,%1,8), %%xmm5 \n\t"
- " vmovups 32(%2,%1,8), %%xmm6 \n\t"
- " vmovups 48(%2,%1,8), %%xmm7 \n\t"
+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
+ " vmovups (%6,%1,8), %%xmm4 \n\t"
+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
+ " vmovups 16(%6,%1,8), %%xmm5 \n\t"
+ " vmovups 32(%6,%1,8), %%xmm6 \n\t"
+ " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t" // i = 0
- " vbroadcastss (%7), %%xmm0 \n\t" // read bb
- " vbroadcastss 4(%7), %%xmm1 \n\t" // read b
+ " vbroadcastss (%3), %%xmm0 \n\t" // read bb
+ " vbroadcastss 4(%3), %%xmm1 \n\t" // read b
" vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t"
- " vmovups %%xmm8 , (%6) \n\t" // write a
- " vmovups %%xmm9 , 16(%6) \n\t"
- " vmovups %%xmm10 , 32(%6) \n\t"
- " vmovups %%xmm11 , 48(%6) \n\t"
+ " vmovups %%xmm8 , (%2) \n\t" // write a
+ " vmovups %%xmm9 , 16(%2) \n\t"
+ " vmovups %%xmm10 , 32(%2) \n\t"
+ " vmovups %%xmm11 , 48(%2) \n\t"
" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t"
" \n\t" // i = 1
- " addq $8 , %7 \n\t" // b = b + 2
- " addq $64 , %6 \n\t" // a = a + 16
+ " addq $8 , %3 \n\t" // b = b + 2
+ " addq $64 , %2 \n\t" // a = a + 16
- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb
+ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb
" vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
- " vmovups %%xmm12 , (%6) \n\t" // write a
- " vmovups %%xmm13 , 16(%6) \n\t" // write a
- " vmovups %%xmm14 , 32(%6) \n\t" // write a
- " vmovups %%xmm15 , 48(%6) \n\t" // write a
+ " vmovups %%xmm12 , (%2) \n\t" // write a
+ " vmovups %%xmm13 , 16(%2) \n\t" // write a
+ " vmovups %%xmm14 , 32(%2) \n\t" // write a
+ " vmovups %%xmm15 , 48(%2) \n\t" // write a
" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"
:
+ "+r" (n1), // 0
+ "+a" (i), // 1
+ "+r" (as), // 2
+ "+r" (bs) // 3
:
- "r" (n1), // 0
- "a" (i), // 1
- "r" (a), // 2
- "r" (b), // 3
- "r" (c), // 4
- "r" (c1), // 5
- "r" (as), // 6
- "r" (bs) // 7
+ "r" (c), // 4
+ "r" (c1), // 5
+ "r" (a), // 6
+ "r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c
index 73f6e8a95..326ca2976 100644
--- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c
+++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c
@@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"
- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
- " vmovups (%2,%1,8), %%xmm4 \n\t"
- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
- " vmovups 16(%2,%1,8), %%xmm5 \n\t"
- " vmovups 32(%2,%1,8), %%xmm6 \n\t"
- " vmovups 48(%2,%1,8), %%xmm7 \n\t"
+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
+ " vmovups (%6,%1,8), %%xmm4 \n\t"
+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
+ " vmovups 16(%6,%1,8), %%xmm5 \n\t"
+ " vmovups 32(%6,%1,8), %%xmm6 \n\t"
+ " vmovups 48(%6,%1,8), %%xmm7 \n\t"
" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"3: \n\t" // i = 1
- " vbroadcastss (%7), %%xmm1 \n\t" // read b
- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb
+ " vbroadcastss (%3), %%xmm1 \n\t" // read b
+ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb
" vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
- " vmovups %%xmm12 , (%6) \n\t" // write a
- " vmovups %%xmm13 , 16(%6) \n\t" // write a
- " vmovups %%xmm14 , 32(%6) \n\t" // write a
- " vmovups %%xmm15 , 48(%6) \n\t" // write a
+ " vmovups %%xmm12 , (%2) \n\t" // write a
+ " vmovups %%xmm13 , 16(%2) \n\t" // write a
+ " vmovups %%xmm14 , 32(%2) \n\t" // write a
+ " vmovups %%xmm15 , 48(%2) \n\t" // write a
" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
" \n\t" // i = 0
- " subq $8 , %7 \n\t" // b = b - 2
- " subq $64 , %6 \n\t" // a = a - 16
+ " subq $8 , %3 \n\t" // b = b - 2
+ " subq $64 , %2 \n\t" // a = a - 16
- " vbroadcastss (%7), %%xmm0 \n\t" // read bb
+ " vbroadcastss (%3), %%xmm0 \n\t" // read bb
" vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t"
- " vmovups %%xmm8 , (%6) \n\t" // write a
- " vmovups %%xmm9 , 16(%6) \n\t"
- " vmovups %%xmm10 , 32(%6) \n\t"
- " vmovups %%xmm11 , 48(%6) \n\t"
+ " vmovups %%xmm8 , (%2) \n\t" // write a
+ " vmovups %%xmm9 , 16(%2) \n\t"
+ " vmovups %%xmm10 , 32(%2) \n\t"
+ " vmovups %%xmm11 , 48(%2) \n\t"
" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"
:
+ "+r" (n1), // 0
+ "+a" (i), // 1
+ "+r" (as), // 2
+ "+r" (bs) // 3
:
- "r" (n1), // 0
- "a" (i), // 1
- "r" (a), // 2
- "r" (b), // 3
- "r" (c), // 4
- "r" (c1), // 5
- "r" (as), // 6
- "r" (bs) // 7
+ "r" (c), // 4
+ "r" (c1), // 5
+ "r" (a), // 6
+ "r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",