148 lines
4.6 KiB
Diff
148 lines
4.6 KiB
Diff
diff -up mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
|
|
--- mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp.intrinsics 2015-03-16 11:39:58.000000000 +0100
|
|
+++ mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp 2015-05-04 13:36:14.915917441 +0200
|
|
@@ -604,6 +604,35 @@ static __forceinline void hh_trafo_compl
|
|
}
|
|
|
|
#else
|
|
+#ifndef _mm_loaddup_pd /* NOTE(review): this guard only sees a macro of this name; an intrinsic declared as a function by a header is not detected -- confirm against the target toolchain */
|
|
+static __forceinline __m128d _mm_loaddup_pd(double const * dp) /* Fallback helper: returns a __m128d whose two double elements both hold *dp. Presumably stands in for the SSE3 intrinsic of the same name -- confirm. */
|
|
+{
|
|
+ union /* overlays the vector with a 2-element double array for per-element access */
|
|
+ {
|
|
+ __m128d d;
|
|
+ double f64[2];
|
|
+ } a;
|
|
+ a.f64[0] = *dp; /* element 0 <- *dp */
|
|
+ a.f64[1] = *dp; /* element 1 <- the same value (duplicate) */
|
|
+ return a.d; /* hand the overlay back as a vector */
|
|
+}
|
|
+#endif /* !_mm_loaddup_pd */
|
|
+#ifndef _mm_addsub_pd /* NOTE(review): this guard only sees a macro of this name; an intrinsic declared as a function by a header is not detected -- confirm against the target toolchain */
|
|
+static __forceinline __m128d _mm_addsub_pd(__m128d a, __m128d b) /* Fallback helper: returns { a[0] - b[0], a[1] + b[1] }. Presumably stands in for the SSE3 intrinsic of the same name -- confirm. */
|
|
+{
|
|
+ union /* overlays each vector with a 2-element double array for per-element access */
|
|
+ {
|
|
+ __m128d d;
|
|
+ double f64[2];
|
|
+ } A, B;
|
|
+ A.d = a; /* copy the operands into the overlays */
|
|
+ B.d = b;
|
|
+
|
|
+ A.f64[0] -= B.f64[0]; /* element 0: subtract */
|
|
+ A.f64[1] += B.f64[1]; /* element 1: add */
|
|
+ return A.d; /* result is accumulated in place in A */
|
|
+}
|
|
+#endif /* !_mm_addsub_pd */
|
|
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
|
|
{
|
|
double* q_dbl = (double*)q;
|
|
diff -up mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
|
|
--- mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp.intrinsics 2015-03-16 11:39:58.000000000 +0100
|
|
+++ mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp 2015-05-04 13:37:31.900484062 +0200
|
|
@@ -1493,6 +1493,35 @@ static __forceinline void hh_trafo_compl
|
|
_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
|
|
}
|
|
#else
|
|
+#ifndef _mm_loaddup_pd /* NOTE(review): the guard tests for a macro only; a function-style intrinsic from a header would not suppress this definition -- confirm */
|
|
+static __forceinline __m128d _mm_loaddup_pd(double const * dp) /* Fallback helper: reads one double through dp and returns it replicated in both elements of a __m128d. */
|
|
+{
|
|
+ union /* lets the two doubles of the vector be written individually */
|
|
+ {
|
|
+ __m128d d;
|
|
+ double f64[2];
|
|
+ } a;
|
|
+ a.f64[0] = *dp; /* first element */
|
|
+ a.f64[1] = *dp; /* second element, same source value */
|
|
+ return a.d; /* return the vector view of the union */
|
|
+}
|
|
+#endif /* !_mm_loaddup_pd */
|
|
+#ifndef _mm_addsub_pd /* NOTE(review): the guard tests for a macro only; a function-style intrinsic from a header would not suppress this definition -- confirm */
|
|
+static __forceinline __m128d _mm_addsub_pd(__m128d a, __m128d b) /* Fallback helper: element 0 of the result is a - b, element 1 is a + b. */
|
|
+{
|
|
+ union /* lets the two doubles of each vector be accessed individually */
|
|
+ {
|
|
+ __m128d d;
|
|
+ double f64[2];
|
|
+ } A, B;
|
|
+ A.d = a; /* load both operands into their unions */
|
|
+ B.d = b;
|
|
+
|
|
+ A.f64[0] -= B.f64[0]; /* element 0: a[0] - b[0] */
|
|
+ A.f64[1] += B.f64[1]; /* element 1: a[1] + b[1] */
|
|
+ return A.d; /* A now holds the combined result */
|
|
+}
|
|
+#endif /* !_mm_addsub_pd */
|
|
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
|
|
{
|
|
double* q_dbl = (double*)q;
|
|
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
|
|
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c.intrinsics 2015-03-16 11:39:58.000000000 +0100
|
|
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c 2015-05-04 13:30:17.618286220 +0200
|
|
@@ -1001,6 +1001,19 @@ void double_hh_trafo_fast_(double* q, do
|
|
* matrix vector product with two householder
|
|
* vectors + a rank 2 update is performed
|
|
*/
|
|
+#ifndef _mm_loaddup_pd /* NOTE(review): inserted directly after a doc comment that describes the kernel following this helper, not the helper itself; also, this guard only detects a macro of this name -- confirm both */
|
|
+ __forceinline __m128d _mm_loaddup_pd(double const * dp) /* Fallback helper: returns a __m128d with *dp stored in both double elements. */
|
|
+{
|
|
+ union /* overlays the vector with a 2-element double array */
|
|
+ {
|
|
+ __m128d d;
|
|
+ double f64[2];
|
|
+ } a;
|
|
+ a.f64[0] = *dp; /* element 0 <- *dp */
|
|
+ a.f64[1] = *dp; /* element 1 <- *dp */
|
|
+ return a.d; /* return the vector member of the union */
|
|
+}
|
|
+#endif /* !_mm_loaddup_pd */
|
|
__forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
|
|
{
|
|
/////////////////////////////////////////////////////
|
|
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
|
|
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c.intrinsics 2015-03-16 11:39:58.000000000 +0100
|
|
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c 2015-05-04 13:30:12.830250942 +0200
|
|
@@ -1334,6 +1334,19 @@ __forceinline void hh_trafo_kernel_4_AVX
|
|
* matrix vector product with two householder
|
|
* vectors + a rank 1 update is performed
|
|
*/
|
|
+#ifndef _mm_loaddup_pd /* NOTE(review): placed right after a doc comment that belongs to the kernel defined below this helper; the guard also only detects a macro of this name -- confirm both */
|
|
+ __forceinline __m128d _mm_loaddup_pd(double const * dp) /* Fallback helper: duplicates the double read through dp into both elements of the returned __m128d. */
|
|
+{
|
|
+ union /* gives array-style access to the two doubles of the vector */
|
|
+ {
|
|
+ __m128d d;
|
|
+ double f64[2];
|
|
+ } a;
|
|
+ a.f64[0] = *dp; /* write the value to element 0 */
|
|
+ a.f64[1] = *dp; /* and again to element 1 */
|
|
+ return a.d; /* reinterpret the pair as a vector */
|
|
+}
|
|
+#endif /* !_mm_loaddup_pd */
|
|
__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
|
|
{
|
|
/////////////////////////////////////////////////////
|
|
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
|
|
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c.intrinsics 2015-03-16 11:39:58.000000000 +0100
|
|
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c 2015-05-04 13:30:06.741206082 +0200
|
|
@@ -1744,6 +1744,19 @@ __forceinline void hh_trafo_kernel_4_AVX
|
|
_mm256_store_pd(&q[(nb+4)*ldq],q1);
|
|
}
|
|
#else
|
|
+#ifndef _mm_loaddup_pd /* NOTE(review): an #ifndef only detects a macro; if the toolchain header declares this intrinsic as a function the guard will not skip this definition -- confirm */
|
|
+ __forceinline __m128d _mm_loaddup_pd(double const * dp) /* Fallback helper: builds a __m128d in which both double elements equal *dp. */
|
|
+{
|
|
+ union /* same storage viewed as a vector and as two doubles */
|
|
+ {
|
|
+ __m128d d;
|
|
+ double f64[2];
|
|
+ } a;
|
|
+ a.f64[0] = *dp; /* fill element 0 */
|
|
+ a.f64[1] = *dp; /* fill element 1 with the same value */
|
|
+ return a.d; /* return through the vector view */
|
|
+}
|
|
+#endif /* !_mm_loaddup_pd */
|
|
/**
|
|
* Unrolled kernel that computes
|
|
* 4 rows of Q simultaneously, a
|