elpa/elpa-missing-intrinsics.patch
2015-05-04 14:35:03 +02:00

148 lines
4.6 KiB
Diff

diff -up mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
--- mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp.intrinsics 2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp 2015-05-04 13:36:14.915917441 +0200
@@ -604,6 +604,35 @@ static __forceinline void hh_trafo_compl
}
#else
+#if !defined(__SSE3__) && !defined(_mm_loaddup_pd)
+/*
+ * SSE2-only fallback for the SSE3 intrinsic _mm_loaddup_pd: reads the
+ * double at dp and returns a vector holding that value in both lanes.
+ * Skipped when the compiler targets SSE3 (__SSE3__), where the native
+ * intrinsic is expected to come from the compiler headers as a function
+ * that a plain #ifndef cannot see, or when the name is already a macro.
+ */
+static __forceinline __m128d _mm_loaddup_pd(const double *dp)
+{
+    return _mm_load1_pd(dp);
+}
+#endif
+#if !defined(__SSE3__) && !defined(_mm_addsub_pd)
+/*
+ * SSE2-only fallback for the SSE3 intrinsic _mm_addsub_pd.
+ * Returns { a[0] - b[0], a[1] + b[1] }: both the full sum and the full
+ * difference are computed, then _mm_move_sd takes the low lane from the
+ * difference and keeps the high lane of the sum.
+ * Skipped when the compiler targets SSE3 (__SSE3__), where the native
+ * intrinsic is expected to come from the compiler headers as a function
+ * that a plain #ifndef cannot see, or when the name is already a macro.
+ */
+static __forceinline __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+    const __m128d sum = _mm_add_pd(a, b);
+    return _mm_move_sd(sum, _mm_sub_pd(a, b));
+}
+#endif
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
diff -up mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
--- mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp.intrinsics 2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp 2015-05-04 13:37:31.900484062 +0200
@@ -1493,6 +1493,35 @@ static __forceinline void hh_trafo_compl
_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
}
#else
+#if !defined(__SSE3__) && !defined(_mm_loaddup_pd)
+/*
+ * Stand-in for the SSE3 intrinsic _mm_loaddup_pd built from SSE2 only:
+ * loads the double that dp points to and broadcasts it to both lanes.
+ * Not compiled when __SSE3__ is defined (the compiler headers are then
+ * expected to provide the intrinsic as a function, which an #ifndef on
+ * the name cannot detect) or when the name already exists as a macro.
+ */
+static __forceinline __m128d _mm_loaddup_pd(const double *dp)
+{
+    return _mm_load1_pd(dp);
+}
+#endif
+#if !defined(__SSE3__) && !defined(_mm_addsub_pd)
+/*
+ * Stand-in for the SSE3 intrinsic _mm_addsub_pd built from SSE2 only.
+ * Result is { a[0] - b[0], a[1] + b[1] }: the lane-wise sum and the
+ * lane-wise difference are both formed, and _mm_move_sd then combines
+ * the low lane of the difference with the high lane of the sum.
+ * Not compiled when __SSE3__ is defined (the compiler headers are then
+ * expected to provide the intrinsic as a function, which an #ifndef on
+ * the name cannot detect) or when the name already exists as a macro.
+ */
+static __forceinline __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+    const __m128d sum = _mm_add_pd(a, b);
+    return _mm_move_sd(sum, _mm_sub_pd(a, b));
+}
+#endif
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
double* q_dbl = (double*)q;
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c.intrinsics 2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c 2015-05-04 13:30:17.618286220 +0200
@@ -1001,6 +1001,19 @@ void double_hh_trafo_fast_(double* q, do
* matrix vector product with two householder
* vectors + a rank 2 update is performed
*/
+#if !defined(__SSE3__) && !defined(_mm_loaddup_pd)
+/*
+ * SSE2-only replacement for the SSE3 intrinsic _mm_loaddup_pd: loads
+ * the double at dp and duplicates it into both lanes of the result.
+ * Left out when the build targets SSE3 (__SSE3__), because the compiler
+ * headers are then expected to supply the intrinsic as a function that
+ * #ifndef cannot detect, and when the name is already a macro.
+ */
+__forceinline __m128d _mm_loaddup_pd(const double *dp)
+{
+    return _mm_load1_pd(dp);
+}
+#endif
__forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
/////////////////////////////////////////////////////
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c.intrinsics 2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c 2015-05-04 13:30:12.830250942 +0200
@@ -1334,6 +1334,19 @@ __forceinline void hh_trafo_kernel_4_AVX
* matrix vector product with two householder
* vectors + a rank 1 update is performed
*/
+#if !defined(__SSE3__) && !defined(_mm_loaddup_pd)
+/*
+ * Fallback for builds without SSE3: emulates _mm_loaddup_pd with the
+ * SSE2 load _mm_load1_pd, i.e. returns { *dp, *dp }.
+ * Disabled when __SSE3__ is defined, since the compiler headers are
+ * then expected to declare the real intrinsic as a function (invisible
+ * to #ifndef), and when the name is already available as a macro.
+ */
+__forceinline __m128d _mm_loaddup_pd(const double *dp)
+{
+    return _mm_load1_pd(dp);
+}
+#endif
__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
{
/////////////////////////////////////////////////////
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c.intrinsics 2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c 2015-05-04 13:30:06.741206082 +0200
@@ -1744,6 +1744,19 @@ __forceinline void hh_trafo_kernel_4_AVX
_mm256_store_pd(&q[(nb+4)*ldq],q1);
}
#else
+#if !defined(__SSE3__) && !defined(_mm_loaddup_pd)
+/*
+ * Emulation of the SSE3 intrinsic _mm_loaddup_pd for SSE2-only builds:
+ * the double read through dp is placed in both lanes of the result.
+ * Compiled only when __SSE3__ is undefined (otherwise the compiler
+ * headers are expected to declare the intrinsic as a function, which
+ * #ifndef cannot detect) and the name is not already a macro.
+ */
+__forceinline __m128d _mm_loaddup_pd(const double *dp)
+{
+    return _mm_load1_pd(dp);
+}
+#endif
/**
* Unrolled kernel that computes
* 4 rows of Q simultaneously, a