Add fallback definitions of _mm_loaddup_pd and _mm_addsub_pd to the
elpa2_kernels SSE/AVX sources. Each fallback is compiled only when the
name is not already defined as a preprocessor macro, and emulates the
intrinsic element-wise through a union of __m128d and double[2]:
_mm_loaddup_pd(dp) yields { *dp, *dp };
_mm_addsub_pd(a, b) yields { a[0] - b[0], a[1] + b[1] }.
NOTE(review): #ifndef only sees macros; if the toolchain headers declare
these intrinsics as inline functions the guard will not detect them.
Confirm against the target compiler.
NOTE(review): context-line whitespace was reconstructed (tabs assumed);
apply with "patch -l" if the context does not match exactly.

diff -up mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
--- mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp	2015-05-04 13:36:14.915917441 +0200
@@ -604,6 +604,35 @@ static __forceinline void hh_trafo_compl
 
 }
 #else
+#ifndef _mm_loaddup_pd
+static __forceinline __m128d _mm_loaddup_pd(double const * dp)
+{
+	union
+	{
+		__m128d d;
+		double f64[2];
+	} a;
+	a.f64[0] = *dp;
+	a.f64[1] = *dp;
+	return a.d;
+}
+#endif
+#ifndef _mm_addsub_pd
+static __forceinline __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+	union
+	{
+		__m128d d;
+		double f64[2];
+	} A, B;
+	A.d = a;
+	B.d = b;
+
+	A.f64[0] -= B.f64[0];
+	A.f64[1] += B.f64[1];
+	return A.d;
+}
+#endif
 static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
 {
 	double* q_dbl = (double*)q;
diff -up mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
--- mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp	2015-05-04 13:37:31.900484062 +0200
@@ -1493,6 +1493,35 @@ static __forceinline void hh_trafo_compl
 	_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
 }
 #else
+#ifndef _mm_loaddup_pd
+static __forceinline __m128d _mm_loaddup_pd(double const * dp)
+{
+	union
+	{
+		__m128d d;
+		double f64[2];
+	} a;
+	a.f64[0] = *dp;
+	a.f64[1] = *dp;
+	return a.d;
+}
+#endif
+#ifndef _mm_addsub_pd
+static __forceinline __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+	union
+	{
+		__m128d d;
+		double f64[2];
+	} A, B;
+	A.d = a;
+	B.d = b;
+
+	A.f64[0] -= B.f64[0];
+	A.f64[1] += B.f64[1];
+	return A.d;
+}
+#endif
 static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
 {
 	double* q_dbl = (double*)q;
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c	2015-05-04 13:30:17.618286220 +0200
@@ -1001,6 +1001,19 @@ void double_hh_trafo_fast_(double* q, do
  * matrix vector product with two householder
  * vectors + a rank 2 update is performed
  */
+#ifndef _mm_loaddup_pd
+ __forceinline __m128d _mm_loaddup_pd(double const * dp)
+{
+	union
+	{
+		__m128d d;
+		double f64[2];
+	} a;
+	a.f64[0] = *dp;
+	a.f64[1] = *dp;
+	return a.d;
+}
+#endif
  __forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
 {
 	/////////////////////////////////////////////////////
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c	2015-05-04 13:30:12.830250942 +0200
@@ -1334,6 +1334,19 @@ __forceinline void hh_trafo_kernel_4_AVX
  * matrix vector product with two householder
  * vectors + a rank 1 update is performed
  */
+#ifndef _mm_loaddup_pd
+ __forceinline __m128d _mm_loaddup_pd(double const * dp)
+{
+	union
+	{
+		__m128d d;
+		double f64[2];
+	} a;
+	a.f64[0] = *dp;
+	a.f64[1] = *dp;
+	return a.d;
+}
+#endif
  __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
 {
 	/////////////////////////////////////////////////////
diff -up mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c.intrinsics mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
--- mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c.intrinsics	2015-03-16 11:39:58.000000000 +0100
+++ mpich/src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c	2015-05-04 13:30:06.741206082 +0200
@@ -1744,6 +1744,19 @@ __forceinline void hh_trafo_kernel_4_AVX
 	_mm256_store_pd(&q[(nb+4)*ldq],q1);
 }
 #else
+#ifndef _mm_loaddup_pd
+ __forceinline __m128d _mm_loaddup_pd(double const * dp)
+{
+	union
+	{
+		__m128d d;
+		double f64[2];
+	} a;
+	a.f64[0] = *dp;
+	a.f64[1] = *dp;
+	return a.d;
+}
+#endif
 /**
  * Unrolled kernel that computes
  * 4 rows of Q simultaneously, a