Fix-up patch

This commit is contained in:
Björn Esser 2017-02-13 22:49:31 +01:00
parent 33992f7d55
commit fe8187055b
1 changed file with 114 additions and 146 deletions

View File

@ -1,38 +1,107 @@
From b8c0a1f7e25aa18d97e8a330764fc5464939b036 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 3 Feb 2017 21:17:33 +0100
Subject: [PATCH] Fix register clobbers
From 1e70600316ab080d80e318f32868c12eb7d1f2da Mon Sep 17 00:00:00 2001
From: Alan Modra <amodra@gmail.com>
Date: Thu, 9 Feb 2017 08:41:51 +1030
Subject: [PATCH] Fix power8 asm()
Remove PIC registers and memory from clobber list, add vector registers to list - fixes accidental overwriting of callee saved registers and compilation with gcc7
Copied from patch provided by Alan Modra in #1078
---
kernel/power/sasum_microk_power8.c | 233 ++++++++++++++++++-------------------
1 file changed, 112 insertions(+), 121 deletions(-)
Lots of issues here.
- The vsx regs weren't listed as clobbered.
- Poor choice of vsx regs, which along with the lack of clobbers led to
trashing v0..v21 and fr14..fr23. Ideally you'd let gcc choose all
temp vsx regs, but asms currently have a limit of 30 i/o parms.
- Other regs were clobbered unnecessarily, seemingly in an attempt to
clobber inputs, with gcc-7 complaining about the clobber of r2.
(Changed inputs should be also listed as outputs or as an i/o.)
- "r" constraint used instead of "b" for gprs used in insns where the
r0 encoding means zero rather than r0.
- There were unused asm inputs too.
- All memory was clobbered rather than hooking up memory outputs with
proper memory constraints, and that and the lack of proper memory
input constraints meant the asms needed to be volatile and their
containing function noinline.
- Some parameters were being passed unnecessarily via memory.
- When a copy of a pointer input parm was needed, the value passed to
the asm was incremented in C and decremented in asm, rather than
using i/o parms, an early clobber constraint, or a temp output reg
copied in the asm. In most cases a small change to assembly could
be made that obviated the need for the extra pointer.
- A number of functions did not compute the final sum or dot-product
in assembly, instead using scalar code in C.
- dcbt was bogus.
diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c
index 847fffe..f28eb49 100644
--- a/kernel/power/sasum_microk_power8.c
+++ b/kernel/power/sasum_microk_power8.c
@@ -38,9 +38,6 @@ static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((
I've also fixed formatting of the asm.
diff --git a/kernel/power/casum.c b/kernel/power/casum.c
index aeed0ca..d110858 100644
--- a/kernel/power/casum.c
+++ b/kernel/power/casum.c
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
#ifndef HAVE_KERNEL_16
-static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
- svec[2] = 0.0;
- svec[3] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0;
BLASLONG ip=0;
FLOAT sumf = 0.0;
- FLOAT svec[4] __attribute__ ((aligned (16)));;
BLASLONG n1;
BLASLONG inc_x2;
@@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- casum_kernel_16(n1, x, svec);
- sumf = svec[0] + svec[1]+svec[2]+svec[3];
+ sumf = casum_kernel_16(n1, x);
i=n1;
ip = 2 * n1;
}
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
index cb50234..38a1143 100644
--- a/kernel/power/casum_microk_power8.c
+++ b/kernel/power/casum_microk_power8.c
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_16 1
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+static float casum_kernel_16 (long n, float *x)
{
-
-
- BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
@@ -48,130 +45,124 @@ static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
BLASLONG pre = 384;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
+ __asm__
(
- (
-
- "dcbt %2 , %4 \n\t"
-
@ -56,7 +125,7 @@ index 847fffe..f28eb49 100644
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
@ -95,7 +164,7 @@ index 847fffe..f28eb49 100644
- "addi %2, %2, 128 \n\t"
- "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -32 \n\t"
- "addic. %0 , %0 , -16 \n\t"
- "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t"
-
@ -134,103 +203,8 @@ index 847fffe..f28eb49 100644
-
-
- "stxvw4x 32, 0, %3 \n\t"
+ "dcbt %1, %3 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvw4x 40, 0, %1 \n\t"
+ "lxvw4x 41, %4, %1 \n\t"
+ "lxvw4x 42, %5, %1 \n\t"
+ "lxvw4x 43, %6, %1 \n\t"
+ "lxvw4x 44, %7, %1 \n\t"
+ "lxvw4x 45, %8, %1 \n\t"
+ "lxvw4x 46, %9, %1 \n\t"
+ "lxvw4x 47, %10, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+ "addic. %2, %2, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n\t"
+ "1: \n\t"
+ "dcbt %1, %3 \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+
+ "lxvw4x 40, 0, %1 \n\t"
+ "lxvw4x 41, %4, %1 \n\t"
+
+ "xvabssp 52, 44 \n\t"
+ "xvabssp 53, 45 \n\t"
+
+ "lxvw4x 42, %5, %1 \n\t"
+ "lxvw4x 43, %6, %1 \n\t"
+
+ "xvabssp 54, 46 \n\t"
+ "xvabssp 55, 47 \n\t"
+
+ "lxvw4x 44, %7, %1 \n\t"
+ "lxvw4x 45, %8, %1 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+
+ "lxvw4x 46, %9, %1 \n\t"
+ "lxvw4x 47, %10, %1 \n\t"
+
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "addi %1, %1, 128 \n\t"
+ "xvaddsp 36, 36, 52 \n\t"
+ "xvaddsp 37, 37, 53 \n\t"
+ "addic. %2, %2, -32 \n\t"
+ "xvaddsp 38, 38, 54 \n\t"
+ "xvaddsp 39, 39, 55 \n\t"
+
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+ "xvabssp 52, 44 \n\t"
+ "xvabssp 53, 45 \n\t"
+ "xvabssp 54, 46 \n\t"
+ "xvabssp 55, 47 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "xvaddsp 36, 36, 52 \n\t"
+ "xvaddsp 37, 37, 53 \n\t"
+ "xvaddsp 38, 38, 54 \n\t"
+ "xvaddsp 39, 39, 55 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+ "stxvw4x 32, %y0 \n\t"
:
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
@ -245,28 +219,22 @@ index 847fffe..f28eb49 100644
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2", "memory"
+ "=m" (*svec), // 0
+ "+b" (x), // 1
+ "+r" (n) // 2
+ :
+ "r" (pre), // 3
+ "r" (o16), // 4
+ "r" (o32), // 5
+ "r" (o48), // 6
+ "r" (o64), // 7
+ "r" (o80), // 8
+ "r" (o96), // 9
+ "r" (o112) // 10
+ :
+ "cr0","32","33","34","35","36","37","38","39",
+ "40","41","42","43","44","45","46","47",
+ "48","49","50","51","52","53","54","55"
);
- );
-
}
32 \n\t"
-}
-
-
+ float sum;
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"