elpa/elpa-missing-intrinsics.patch

diff -up mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
--- mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp	2015-05-04 13:36:14.915917441 +0200
@@ -604,6 +604,35 @@ static __forceinline void hh_trafo_compl
 }

 #else
+#ifndef __SSE3__
+/*
+ * SSE2 stand-in for SSE3 _mm_loaddup_pd: the double at dp in both lanes.
+ * Private name + macro redirect: <pmmintrin.h> may declare the intrinsic as an
+ * inline function (GCC does), which an #ifndef on its name cannot detect.
+ */
+static __forceinline __m128d elpa_mm_loaddup_pd_fallback(double const * dp)
+{
+	return _mm_set1_pd(*dp);
+}
+#undef _mm_loaddup_pd
+#define _mm_loaddup_pd(dp) elpa_mm_loaddup_pd_fallback(dp)
+#endif /* !__SSE3__ */
+#ifndef __SSE3__
+/*
+ * SSE2 stand-in for SSE3 _mm_addsub_pd: returns { a[0] - b[0], a[1] + b[1] }.
+ * Private name + macro redirect: <pmmintrin.h> may declare the intrinsic as an
+ * inline function (GCC does), which an #ifndef on its name cannot detect.
+ */
+static __forceinline __m128d elpa_mm_addsub_pd_fallback(__m128d a, __m128d b)
+{
+	const __m128d diff = _mm_sub_pd(a, b);
+	const __m128d sum = _mm_add_pd(a, b);
+	/* _mm_move_sd: low lane from its 2nd operand, high lane from its 1st. */
+	return _mm_move_sd(sum, diff);
+}
+#undef _mm_addsub_pd
+#define _mm_addsub_pd(a, b) elpa_mm_addsub_pd_fallback((a), (b))
+#endif /* !__SSE3__ */
 static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
 {
 	double* q_dbl = (double*)q;
diff -up mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
--- mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp	2015-05-04 13:37:31.900484062 +0200
@@ -1493,6 +1493,35 @@ static __forceinline void hh_trafo_compl
 	_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
 }
 #else
+#ifndef __SSE3__
+/*
+ * SSE2 stand-in for SSE3 _mm_loaddup_pd: the double at dp in both lanes.
+ * Private name + macro redirect: <pmmintrin.h> may declare the intrinsic as an
+ * inline function (GCC does), which an #ifndef on its name cannot detect.
+ */
+static __forceinline __m128d elpa_mm_loaddup_pd_fallback(double const * dp)
+{
+	return _mm_set1_pd(*dp);
+}
+#undef _mm_loaddup_pd
+#define _mm_loaddup_pd(dp) elpa_mm_loaddup_pd_fallback(dp)
+#endif /* !__SSE3__ */
+#ifndef __SSE3__
+/*
+ * SSE2 stand-in for SSE3 _mm_addsub_pd: returns { a[0] - b[0], a[1] + b[1] }.
+ * Private name + macro redirect: <pmmintrin.h> may declare the intrinsic as an
+ * inline function (GCC does), which an #ifndef on its name cannot detect.
+ */
+static __forceinline __m128d elpa_mm_addsub_pd_fallback(__m128d a, __m128d b)
+{
+	const __m128d diff = _mm_sub_pd(a, b);
+	const __m128d sum = _mm_add_pd(a, b);
+	/* _mm_move_sd: low lane from its 2nd operand, high lane from its 1st. */
+	return _mm_move_sd(sum, diff);
+}
+#undef _mm_addsub_pd
+#define _mm_addsub_pd(a, b) elpa_mm_addsub_pd_fallback((a), (b))
+#endif /* !__SSE3__ */
 static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
 {
 	double* q_dbl = (double*)q;
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c	2015-05-04 13:30:17.618286220 +0200
@@ -1001,6 +1001,19 @@ void double_hh_trafo_fast_(double* q, do
  * matrix vector product with two householder
  * vectors + a rank 2 update is performed
  */
+#ifndef __SSE3__
+/*
+ * SSE2 stand-in for SSE3 _mm_loaddup_pd: the double at dp in both lanes.
+ * Private name + macro redirect: <pmmintrin.h> may declare the intrinsic as an
+ * inline function (GCC does), which an #ifndef on its name cannot detect.
+ */
+__forceinline __m128d elpa_mm_loaddup_pd_fallback(double const * dp)
+{
+	return _mm_set1_pd(*dp);
+}
+#undef _mm_loaddup_pd
+#define _mm_loaddup_pd(dp) elpa_mm_loaddup_pd_fallback(dp)
+#endif /* !__SSE3__ */
  __forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
 {
 	/////////////////////////////////////////////////////
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c	2015-05-04 13:30:12.830250942 +0200
@@ -1334,6 +1334,19 @@ __forceinline void hh_trafo_kernel_4_AVX
  * matrix vector product with two householder
  * vectors + a rank 1 update is performed
  */
+#ifndef __SSE3__
+/*
+ * SSE2 stand-in for SSE3 _mm_loaddup_pd: the double at dp in both lanes.
+ * Private name + macro redirect: <pmmintrin.h> may declare the intrinsic as an
+ * inline function (GCC does), which an #ifndef on its name cannot detect.
+ */
+__forceinline __m128d elpa_mm_loaddup_pd_fallback(double const * dp)
+{
+	return _mm_set1_pd(*dp);
+}
+#undef _mm_loaddup_pd
+#define _mm_loaddup_pd(dp) elpa_mm_loaddup_pd_fallback(dp)
+#endif /* !__SSE3__ */
 __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
 {
 	/////////////////////////////////////////////////////
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c	2015-05-04 13:30:06.741206082 +0200
@@ -1744,6 +1744,19 @@ __forceinline void hh_trafo_kernel_4_AVX
 	_mm256_store_pd(&q[(nb+4)*ldq],q1);
 }
 #else
+#ifndef __SSE3__
+/*
+ * SSE2 stand-in for SSE3 _mm_loaddup_pd: the double at dp in both lanes.
+ * Private name + macro redirect: <pmmintrin.h> may declare the intrinsic as an
+ * inline function (GCC does), which an #ifndef on its name cannot detect.
+ */
+__forceinline __m128d elpa_mm_loaddup_pd_fallback(double const * dp)
+{
+	return _mm_set1_pd(*dp);
+}
+#undef _mm_loaddup_pd
+#define _mm_loaddup_pd(dp) elpa_mm_loaddup_pd_fallback(dp)
+#endif /* !__SSE3__ */
 /**
  * Unrolled kernel that computes
  * 4 rows of Q simultaneously, a