--- doc/config.txt.orig 2016-06-21 12:46:44.000000000 -0600
+++ doc/config.txt 2016-07-20 19:57:16.756611904 -0600
@@ -292,6 +292,7 @@ NTL_GF2X_NOINLINE=off
NTL_GF2X_ALTCODE=off
NTL_GF2X_ALTCODE1=off
NTL_PCLMUL=off
+NTL_LOADTIME_CPU=off

GMP_INCDIR=$(GMP_PREFIX)/include
GMP_LIBDIR=$(GMP_PREFIX)/lib
@@ -644,6 +645,10 @@ NTL_PCLMUL=off
# switch to enable the PCLMUL instruction on x86 machines for faster arithmetic
# over GF(2)[X] (without relying on the gf2x package)

+NTL_LOADTIME_CPU=off
+
+# switch to check CPU characteristics at load time and use routines
+# optimized for the executing CPU.


########## More GMP Options:
--- include/NTL/config.h.orig 2016-06-21 12:46:44.000000000 -0600
+++ include/NTL/config.h 2016-07-20 19:57:16.766611105 -0600
@@ -625,6 +625,23 @@ using the configure script.
#endif


+#if 0
+#define NTL_LOADTIME_CPU
+
+/*
+ * With this flag enabled, detect advanced CPU features at load time instead
+ * of at compile time. This flag is intended for distributions, so that they
+ * can compile for the lowest common denominator CPU, but still support newer
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ *
+ * To re-build after changing this flag:
+ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a
+ */
+
+#endif
+



--- include/NTL/ctools.h.orig 2016-06-21 12:46:44.000000000 -0600
+++ include/NTL/ctools.h 2016-07-20 19:57:16.767611025 -0600
@@ -473,6 +473,137 @@ char *_ntl_make_aligned(char *p, long al
// this should be big enough to satisfy any SIMD instructions,
// and it should also be as big as a cache line

+/* Determine CPU characteristics at runtime */
+#ifdef NTL_LOADTIME_CPU
+#if !defined(__x86_64__)
+#error Runtime CPU support is only available on x86_64.
+#endif
+#ifndef __GNUC__
+#error Runtime CPU support is only available with GCC.
+#endif
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)
+#error Runtime CPU support is only available with GCC 4.6 or later.
+#endif
+
+#include <cpuid.h>
+#ifndef bit_PCLMUL
+#define bit_PCLMUL (1 << 1)
+#endif
+#ifndef bit_AVX
+#define bit_AVX (1 << 28)
+#endif
+#ifndef bit_FMA
+#define bit_FMA (1 << 12)
+#endif
+#ifndef bit_AVX2
+#define bit_AVX2 (1 << 5)
+#endif
+
+#define BASE_FUNC(type,name) static type name##_base
+#define TARGET_FUNC(arch,suffix,type,name) \
+ static type __attribute__((target (arch))) name##_##suffix
+#define PCLMUL_FUNC(type,name) TARGET_FUNC("pclmul",pclmul,type,name)
+#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul",avx,type,name)
+#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul",fma,type,name)
+#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul",avx2,type,name)
+#define PCLMUL_RESOLVER(type,name,params) \
+ extern "C" { \
+ static void __attribute__((optimize ("O0"))) \
+ (*resolve_##name (void))(void) { \
+ if (__builtin_expect(have_pclmul, 0) < 0) { \
+ unsigned int eax, ebx, ecx, edx; \
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \
+ have_pclmul = ((ecx & bit_PCLMUL) != 0); \
+ have_avx = ((ecx & bit_AVX) != 0); \
+ have_fma = ((ecx & bit_FMA) != 0); \
+ } else { \
+ have_pclmul = 0; \
+ have_avx = 0; \
+ have_fma = 0; \
+ } \
+ } \
+ if (have_avx) return (void (*)(void))&name##_avx; \
+ if (have_pclmul) return (void (*)(void))&name##_pclmul; \
+ return (void (*)(void))&name##_base; \
+ } \
+ } \
+ type __attribute__((ifunc ("resolve_" #name))) name params
+#define AVX_RESOLVER(type,name,params) \
+ extern "C" { \
+ static void __attribute__((optimize ("O0"))) \
+ (*resolve_##name (void))(void) { \
+ if (__builtin_expect(have_pclmul, 0) < 0) { \
+ unsigned int eax, ebx, ecx, edx; \
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \
+ have_pclmul = ((ecx & bit_PCLMUL) != 0); \
+ have_avx = ((ecx & bit_AVX) != 0); \
+ have_fma = ((ecx & bit_FMA) != 0); \
+ } else { \
+ have_pclmul = 0; \
+ have_avx = 0; \
+ have_fma = 0; \
+ } \
+ } \
+ return have_avx \
+ ? (void (*)(void))&name##_avx \
+ : (void (*)(void))&name##_base; \
+ } \
+ } \
+ type __attribute__((ifunc ("resolve_" #name))) name params
+#define FMA_RESOLVER(type,name,params) \
+ extern "C" { \
+ static void __attribute__((optimize ("O0"))) \
+ (*resolve_##name (void))(void) { \
+ if (__builtin_expect(have_pclmul, 0) < 0) { \
+ unsigned int eax, ebx, ecx, edx; \
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \
+ have_pclmul = ((ecx & bit_PCLMUL) != 0); \
+ have_avx = ((ecx & bit_AVX) != 0); \
+ have_fma = ((ecx & bit_FMA) != 0); \
+ } else { \
+ have_pclmul = 0; \
+ have_avx = 0; \
+ have_fma = 0; \
+ } \
+ } \
+ return have_fma \
+ ? (void (*)(void))&name##_fma \
+ : (void (*)(void))&name##_avx; \
+ } \
+ } \
+ type __attribute__((ifunc ("resolve_" #name))) name params
+#define AVX2_RESOLVER(type,name,params) \
+ extern "C" { \
+ static void __attribute__((optimize ("O0"))) \
+ (*resolve_##name (void))(void) { \
+ if (__builtin_expect(have_avx2, 0) < 0) { \
+ unsigned int eax, ebx, ecx, edx; \
+ if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { \
+ have_avx2 = ((ebx & bit_AVX2) != 0); \
+ } else { \
+ have_avx2 = 0; \
+ } \
+ } \
+ if (__builtin_expect(have_pclmul, 0) < 0) { \
+ unsigned int eax, ebx, ecx, edx; \
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \
+ have_pclmul = ((ecx & bit_PCLMUL) != 0); \
+ have_avx = ((ecx & bit_AVX) != 0); \
+ have_fma = ((ecx & bit_FMA) != 0); \
+ } else { \
+ have_pclmul = 0; \
+ have_avx = 0; \
+ have_fma = 0; \
+ } \
+ } \
+ unsigned int eax, ebx, ecx, edx; \
+ return have_avx2 \
+ ? (void (*)(void))&name##_avx2 \
+ : (void (*)(void))&name##_fma; \
+ } \
+ } \
+ type __attribute__((ifunc ("resolve_" #name))) name params
+#endif



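(Note on the mechanism, not part of the patch: the *_RESOLVER macros above rely on GCC's indirect-function ("ifunc") support. The resolver runs once, when the dynamic linker first binds the symbol, probes the CPU with __get_cpuid, and returns the address of the best implementation; every later call goes directly to that routine. The standalone sketch below is illustrative only -- the mul1-style names are made up for the example -- and shows the pattern a PCLMUL_RESOLVER expansion produces, assuming GCC 4.6 or later on x86_64 with a GNU-style dynamic linker.)

    // Illustrative sketch only; not taken from the patch.
    #include <cpuid.h>
    #ifndef bit_AVX
    #define bit_AVX (1 << 28)
    #endif

    static int have_avx = -1;   // -1 means "not probed yet"

    static void mul1_base(unsigned long *c, unsigned long a, unsigned long b)
    { c[0] = a ^ b; /* portable fallback body */ }

    static void __attribute__((target("avx,pclmul")))
    mul1_avx(unsigned long *c, unsigned long a, unsigned long b)
    { c[0] = a ^ b; /* AVX/PCLMUL body would go here */ }

    extern "C" {
    static void (*resolve_mul1(void))(void)
    {
       if (have_avx < 0) {
          unsigned int eax, ebx, ecx, edx;
          have_avx = __get_cpuid(1, &eax, &ebx, &ecx, &edx) &&
                     (ecx & bit_AVX) != 0;
       }
       // The dynamic linker binds mul1 to whichever address we return.
       return have_avx ? (void (*)(void)) &mul1_avx
                       : (void (*)(void)) &mul1_base;
    }
    }

    // Callers just call mul1(); the resolver above chose the body at load time.
    static void __attribute__((ifunc("resolve_mul1")))
    mul1(unsigned long *c, unsigned long a, unsigned long b);

    int main()
    {
       unsigned long c[2];
       mul1(c, 3UL, 5UL);
       return c[0] == 6 ? 0 : 1;
    }
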
--- include/NTL/def_config.h.orig 2016-06-21 12:46:44.000000000 -0600
+++ include/NTL/def_config.h 2016-07-20 19:57:16.767611025 -0600
@@ -625,6 +625,22 @@ using the configure script.
#endif


+#if 0
+#define NTL_LOADTIME_CPU
+
+/*
+ * With this flag enabled, detect advanced CPU features at load time instead
+ * of at compile time. This flag is intended for distributions, so that they
+ * can compile for the lowest common denominator CPU, but still support newer
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ *
+ * To re-build after changing this flag:
+ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a
+ */
+
+#endif



--- src/cfile.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/cfile 2016-07-20 19:57:16.768610945 -0600
@@ -625,6 +625,23 @@ using the configure script.
#endif


+#if @{NTL_LOADTIME_CPU}
+#define NTL_LOADTIME_CPU
+
+/*
+ * With this flag enabled, detect advanced CPU features at load time instead
+ * of at compile time. This flag is intended for distributions, so that they
+ * can compile for the lowest common denominator CPU, but still support newer
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ *
+ * To re-build after changing this flag:
+ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a
+ */
+
+#endif
+

@{WIZARD_HACK}

--- src/DispSettings.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/DispSettings.c 2016-07-20 19:57:16.768610945 -0600
@@ -191,6 +191,10 @@ cout << "Performance Options:\n";
cout << "NTL_PCLMUL\n";
#endif

+#ifdef NTL_LOADTIME_CPU
+ cout << "NTL_LOADTIME_CPU\n";
+#endif
+

cout << "***************************/\n";
cout << "\n\n";
--- src/DoConfig.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/DoConfig 2016-07-20 19:57:16.769610865 -0600
@@ -1,7 +1,7 @@
# This is a perl script, invoked from a shell

# use warnings; # this doesn't work on older versions of perl
-
+use Config;


%MakeFlag = (
@@ -83,6 +83,7 @@
'NTL_RANGE_CHECK' => 'off',
'NTL_FFT_BIGTAB' => 'off',
'NTL_FFT_LAZYMUL' => 'off',
+'NTL_LOADTIME_CPU' => 'off',

);

@@ -149,6 +150,15 @@ if ($ConfigFlag{'NTL_THREADS'} eq 'on' &
}


+# special processing: NTL_LOADTIME_CPU on x86/x86_64 only and => NTL_GF2X_NOINLINE
+
+if ($ConfigFlag{'NTL_LOADTIME_CPU'} eq 'on') {
+ if ($Config{archname} !~ /x86_64/) {
+ die "Error: NTL_LOADTIME_CPU currently only available with x86_64...sorry\n";
+ }
+ $ConfigFlag{'NTL_GF2X_NOINLINE'} = 'on';
+}
+

# some special MakeVal values that are determined by SHARED

--- src/GF2X1.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/GF2X1.c 2016-07-20 19:57:16.770610785 -0600
@@ -19,7 +19,7 @@
// simple scaling factor for some crossover points:
// we use a lower crossover of the underlying multiplication
// is faster
-#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL))
+#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL) || defined (NTL_LOADTIME_CPU))
#define XOVER_SCALE (1L)
#else
#define XOVER_SCALE (2L)
--- src/GF2X.c.orig 2016-06-21 12:46:44.000000000 -0600
|
|
+++ src/GF2X.c 2016-07-20 19:57:16.771610706 -0600
|
|
@@ -31,6 +31,22 @@ pclmul_mul1 (unsigned long *c, unsigned
|
|
__m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0));
|
|
_mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0));
|
|
}
|
|
+#elif defined (NTL_LOADTIME_CPU)
|
|
+
|
|
+#include <wmmintrin.h>
|
|
+
|
|
+static int have_pclmul = -1;
|
|
+static int have_avx = -1;
|
|
+static int have_fma = -1;
|
|
+
|
|
+#define NTL_INLINE inline
|
|
+
|
|
+#define pclmul_mul1(c,a,b) do { \
|
|
+ __m128i aa = _mm_setr_epi64( _mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0)); \
|
|
+ __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0)); \
|
|
+ _mm_storeu_si128((__m128i*)(c), _mm_clmulepi64_si128(aa, bb, 0)); \
|
|
+} while (0)
|
|
+
|
|
#else
|
|
|
|
|
|
@@ -579,6 +595,27 @@ void add(GF2X& x, const GF2X& a, const G
|
|
|
|
|
|
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+BASE_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
|
|
+{
|
|
+ NTL_EFF_BB_MUL_CODE0
|
|
+}
|
|
+
|
|
+PCLMUL_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
|
|
+{
|
|
+ pclmul_mul1(c, a, b);
|
|
+}
|
|
+
|
|
+AVX_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
|
|
+{
|
|
+ pclmul_mul1(c, a, b);
|
|
+}
|
|
+
|
|
+PCLMUL_RESOLVER(static void,mul1,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b));
|
|
+
|
|
+#else
|
|
+
|
|
static
|
|
void mul1(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
|
|
{
|
|
@@ -592,6 +629,7 @@ NTL_EFF_BB_MUL_CODE0
|
|
|
|
}
|
|
|
|
+#endif
|
|
|
|
#ifdef NTL_GF2X_NOINLINE
|
|
|
|
@@ -616,6 +654,51 @@ NTL_EFF_BB_MUL_CODE0
|
|
#endif
|
|
|
|
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+BASE_FUNC(void,Mul1)
|
|
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
|
|
+{
|
|
+ NTL_EFF_BB_MUL_CODE1
|
|
+}
|
|
+
|
|
+PCLMUL_FUNC(void,Mul1)
|
|
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
|
|
+{
|
|
+ long i;
|
|
+ unsigned long carry, prod[2];
|
|
+
|
|
+ carry = 0;
|
|
+ for (i = 0; i < sb; i++) {
|
|
+ pclmul_mul1(prod, bp[i], a);
|
|
+ cp[i] = carry ^ prod[0];
|
|
+ carry = prod[1];
|
|
+ }
|
|
+
|
|
+ cp[sb] = carry;
|
|
+}
|
|
+
|
|
+AVX_FUNC(void,Mul1)
|
|
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
|
|
+{
|
|
+ long i;
|
|
+ unsigned long carry, prod[2];
|
|
+
|
|
+ carry = 0;
|
|
+ for (i = 0; i < sb; i++) {
|
|
+ pclmul_mul1(prod, bp[i], a);
|
|
+ cp[i] = carry ^ prod[0];
|
|
+ carry = prod[1];
|
|
+ }
|
|
+
|
|
+ cp[sb] = carry;
|
|
+}
|
|
+
|
|
+PCLMUL_RESOLVER(static void,Mul1,
|
|
+ (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a));
|
|
+
|
|
+#else
|
|
+
|
|
static
|
|
void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
|
|
{
|
|
@@ -643,6 +726,53 @@ NTL_EFF_BB_MUL_CODE1
|
|
|
|
}
|
|
|
|
+#endif
|
|
+
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+BASE_FUNC(void,AddMul1)
|
|
+(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
|
|
+{
|
|
+ NTL_EFF_BB_MUL_CODE2
|
|
+}
|
|
+
|
|
+PCLMUL_FUNC(void,AddMul1)
|
|
+(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
|
|
+{
|
|
+ long i;
|
|
+ unsigned long carry, prod[2];
|
|
+
|
|
+ carry = 0;
|
|
+ for (i = 0; i < sb; i++) {
|
|
+ pclmul_mul1(prod, bp[i], a);
|
|
+ cp[i] ^= carry ^ prod[0];
|
|
+ carry = prod[1];
|
|
+ }
|
|
+
|
|
+ cp[sb] ^= carry;
|
|
+}
|
|
+
|
|
+AVX_FUNC(void,AddMul1)
|
|
+(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
|
|
+{
|
|
+ long i;
|
|
+ unsigned long carry, prod[2];
|
|
+
|
|
+ carry = 0;
|
|
+ for (i = 0; i < sb; i++) {
|
|
+ pclmul_mul1(prod, bp[i], a);
|
|
+ cp[i] ^= carry ^ prod[0];
|
|
+ carry = prod[1];
|
|
+ }
|
|
+
|
|
+ cp[sb] ^= carry;
|
|
+}
|
|
+
|
|
+PCLMUL_RESOLVER(static void,AddMul1,
|
|
+ (_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a));
|
|
+
|
|
+#else
|
|
+
|
|
static
|
|
void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
|
|
{
|
|
@@ -671,6 +801,52 @@ NTL_EFF_BB_MUL_CODE2
|
|
|
|
}
|
|
|
|
+#endif
|
|
+
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+BASE_FUNC(void,Mul1_short)
|
|
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
|
|
+{
|
|
+ NTL_EFF_SHORT_BB_MUL_CODE1
|
|
+}
|
|
+
|
|
+PCLMUL_FUNC(void,Mul1_short)
|
|
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
|
|
+{
|
|
+ long i;
|
|
+ unsigned long carry, prod[2];
|
|
+
|
|
+ carry = 0;
|
|
+ for (i = 0; i < sb; i++) {
|
|
+ pclmul_mul1(prod, bp[i], a);
|
|
+ cp[i] = carry ^ prod[0];
|
|
+ carry = prod[1];
|
|
+ }
|
|
+
|
|
+ cp[sb] = carry;
|
|
+}
|
|
+
|
|
+AVX_FUNC(void,Mul1_short)
|
|
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
|
|
+{
|
|
+ long i;
|
|
+ unsigned long carry, prod[2];
|
|
+
|
|
+ carry = 0;
|
|
+ for (i = 0; i < sb; i++) {
|
|
+ pclmul_mul1(prod, bp[i], a);
|
|
+ cp[i] = carry ^ prod[0];
|
|
+ carry = prod[1];
|
|
+ }
|
|
+
|
|
+ cp[sb] = carry;
|
|
+}
|
|
+
|
|
+PCLMUL_RESOLVER(static void,Mul1_short,
|
|
+ (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a));
|
|
+
|
|
+#else
|
|
|
|
static
|
|
void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
|
|
@@ -699,9 +875,29 @@ NTL_EFF_SHORT_BB_MUL_CODE1
|
|
|
|
}
|
|
|
|
+#endif
|
|
+
|
|
+
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+BASE_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
|
|
+{
|
|
+ NTL_EFF_HALF_BB_MUL_CODE0
|
|
+}
|
|
+
|
|
+PCLMUL_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
|
|
+{
|
|
+ pclmul_mul1(c, a, b);
|
|
+}
|
|
|
|
+AVX_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
|
|
+{
|
|
+ pclmul_mul1(c, a, b);
|
|
+}
|
|
|
|
+PCLMUL_RESOLVER(static void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b));
|
|
|
|
+#else
|
|
|
|
static
|
|
void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
|
|
@@ -716,6 +912,8 @@ NTL_EFF_HALF_BB_MUL_CODE0
|
|
|
|
}
|
|
|
|
+#endif
|
|
+
|
|
|
|
// mul2...mul8 hard-code 2x2...8x8 word multiplies.
|
|
// I adapted these routines from LiDIA (except mul3, see below).
|
|
@@ -1627,6 +1825,77 @@ static const _ntl_ulong sqrtab[256] = {
|
|
|
|
|
|
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+BASE_FUNC(void,sqr)(GF2X& c, const GF2X& a)
|
|
+{
|
|
+ long sa = a.xrep.length();
|
|
+ if (sa <= 0) {
|
|
+ clear(c);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ c.xrep.SetLength(sa << 1);
|
|
+ _ntl_ulong *cp = c.xrep.elts();
|
|
+ const _ntl_ulong *ap = a.xrep.elts();
|
|
+
|
|
+ for (long i = sa-1; i >= 0; i--) {
|
|
+ _ntl_ulong *c = cp + (i << 1);
|
|
+ _ntl_ulong a = ap[i];
|
|
+ _ntl_ulong hi, lo;
|
|
+
|
|
+ NTL_BB_SQR_CODE
|
|
+
|
|
+ c[0] = lo;
|
|
+ c[1] = hi;
|
|
+ }
|
|
+
|
|
+ c.normalize();
|
|
+ return;
|
|
+}
|
|
+
|
|
+PCLMUL_FUNC(void,sqr)(GF2X& c, const GF2X& a)
|
|
+{
|
|
+ long sa = a.xrep.length();
|
|
+ if (sa <= 0) {
|
|
+ clear(c);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ c.xrep.SetLength(sa << 1);
|
|
+ _ntl_ulong *cp = c.xrep.elts();
|
|
+ const _ntl_ulong *ap = a.xrep.elts();
|
|
+
|
|
+ for (long i = sa-1; i >= 0; i--)
|
|
+ pclmul_mul1 (cp + (i << 1), ap[i], ap[i]);
|
|
+
|
|
+ c.normalize();
|
|
+ return;
|
|
+}
|
|
+
|
|
+AVX_FUNC(void,sqr)(GF2X& c, const GF2X& a)
|
|
+{
|
|
+ long sa = a.xrep.length();
|
|
+ if (sa <= 0) {
|
|
+ clear(c);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ c.xrep.SetLength(sa << 1);
|
|
+ _ntl_ulong *cp = c.xrep.elts();
|
|
+ const _ntl_ulong *ap = a.xrep.elts();
|
|
+
|
|
+ for (long i = sa-1; i >= 0; i--)
|
|
+ pclmul_mul1 (cp + (i << 1), ap[i], ap[i]);
|
|
+
|
|
+ c.normalize();
|
|
+ return;
|
|
+}
|
|
+
|
|
+PCLMUL_RESOLVER(void,sqr,(GF2X& c, const GF2X& a));
|
|
+
|
|
+#else
|
|
+
|
|
static inline
|
|
void sqr1(_ntl_ulong *c, _ntl_ulong a)
|
|
{
|
|
@@ -1667,6 +1936,7 @@ void sqr(GF2X& c, const GF2X& a)
|
|
return;
|
|
}
|
|
|
|
+#endif
|
|
|
|
|
|
void LeftShift(GF2X& c, const GF2X& a, long n)
|
|
--- src/InitSettings.c.orig 2016-06-21 12:46:44.000000000 -0600
|
|
+++ src/InitSettings.c 2016-07-20 19:57:16.772610626 -0600
|
|
@@ -156,6 +156,11 @@ int main()
|
|
cout << "NTL_RANGE_CHECK=0\n";
|
|
#endif
|
|
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+ cout << "NTL_LOADTIME_CPU=1\n";
|
|
+#else
|
|
+ cout << "NTL_LOADTIME_CPU=0\n";
|
|
+#endif
|
|
|
|
// the following is synthetically defined
|
|
#ifdef NTL_LONGLONG_SP_MULMOD
|
|
--- src/lzz_pX1.c.orig 2016-06-21 12:46:44.000000000 -0600
|
|
+++ src/lzz_pX1.c 2016-07-20 19:57:16.773610546 -0600
|
|
@@ -4,6 +4,12 @@
|
|
|
|
#ifdef NTL_HAVE_AVX
|
|
#include <immintrin.h>
|
|
+#elif defined(NTL_LOADTIME_CPU)
|
|
+#include <immintrin.h>
|
|
+
|
|
+static int have_pclmul = -1;
|
|
+static int have_avx = -1;
|
|
+static int have_fma = -1;
|
|
#endif
|
|
|
|
|
|
@@ -1076,6 +1082,175 @@ void Comp3Mod(zz_pX& x1, zz_pX& x2, zz_p
|
|
|
|
|
|
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+BASE_FUNC(void,build)
|
|
+(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F)
|
|
+{
|
|
+ altH.orig = &H;
|
|
+ altH.mem.kill();
|
|
+ altH.row.kill();
|
|
+
|
|
+ if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; }
|
|
+
|
|
+ altH.n = F.n;
|
|
+ altH.m = H.H.length()-1;
|
|
+
|
|
+ long p = zz_p::modulus();
|
|
+ long n = altH.n;
|
|
+ long m = altH.m;
|
|
+
|
|
+
|
|
+ if (cast_unsigned(m) <= (~(0UL))/cast_unsigned(p-1) &&
|
|
+ cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) {
|
|
+ altH.strategy = 1;
|
|
+ altH.pinv_L = sp_PrepRem(p);
|
|
+ }
|
|
+ else {
|
|
+ altH.strategy = 2;
|
|
+ altH.pinv_LL = make_sp_ll_reduce_struct(p);
|
|
+ }
|
|
+
|
|
+
|
|
+ if (altH.strategy == 1 || altH.strategy == 2) {
|
|
+
|
|
+ altH.row.SetLength(n);
|
|
+ long **row = altH.row.elts();
|
|
+
|
|
+ const long AllocAmt = 1L << 18;
|
|
+
|
|
+ long BlockSize = (AllocAmt + m - 1)/m;
|
|
+ long NumBlocks = (n + BlockSize - 1)/BlockSize;
|
|
+
|
|
+ altH.mem.SetLength(NumBlocks);
|
|
+
|
|
+ for (long i = 0; i < NumBlocks; i++) {
|
|
+ long first = i*BlockSize;
|
|
+ long last = min(n, first + BlockSize);
|
|
+ altH.mem[i].SetLength((last-first)*m);
|
|
+ for (long j = first; j < last; j++) {
|
|
+ row[j] = altH.mem[i].elts() + (j-first)*m;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ for (long i = 0; i < m; i++) {
|
|
+ const zz_p* ptr = H.H[i].rep.elts();
|
|
+ long len = H.H[i].rep.length();
|
|
+ for (long j = 0; j < len; j++)
|
|
+ row[j][i] = rep(ptr[j]);
|
|
+ for (long j = len; j < n; j++)
|
|
+ row[j][i] = 0;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+AVX_FUNC(void,build)
|
|
+(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F)
|
|
+{
|
|
+ altH.orig = &H;
|
|
+ altH.mem.kill();
|
|
+ altH.row.kill();
|
|
+ altH.dmem.kill();
|
|
+ altH.drow.kill();
|
|
+
|
|
+ if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; }
|
|
+
|
|
+ altH.n = F.n;
|
|
+ altH.m = H.H.length()-1;
|
|
+
|
|
+ long p = zz_p::modulus();
|
|
+ long n = altH.n;
|
|
+ long m = altH.m;
|
|
+
|
|
+ if (n >= 128 && m <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1) &&
|
|
+ m*(p-1) <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1)) {
|
|
+ altH.strategy = 3;
|
|
+ altH.pinv_L = sp_PrepRem(p);
|
|
+ }
|
|
+ else if (cast_unsigned(m) <= (~(0UL))/cast_unsigned(p-1) &&
|
|
+ cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) {
|
|
+ altH.strategy = 1;
|
|
+ altH.pinv_L = sp_PrepRem(p);
|
|
+ }
|
|
+ else {
|
|
+ altH.strategy = 2;
|
|
+ altH.pinv_LL = make_sp_ll_reduce_struct(p);
|
|
+ }
|
|
+
|
|
+
|
|
+ if (altH.strategy == 1 || altH.strategy == 2) {
|
|
+
|
|
+ altH.row.SetLength(n);
|
|
+ long **row = altH.row.elts();
|
|
+
|
|
+ const long AllocAmt = 1L << 18;
|
|
+
|
|
+ long BlockSize = (AllocAmt + m - 1)/m;
|
|
+ long NumBlocks = (n + BlockSize - 1)/BlockSize;
|
|
+
|
|
+ altH.mem.SetLength(NumBlocks);
|
|
+
|
|
+ for (long i = 0; i < NumBlocks; i++) {
|
|
+ long first = i*BlockSize;
|
|
+ long last = min(n, first + BlockSize);
|
|
+ altH.mem[i].SetLength((last-first)*m);
|
|
+ for (long j = first; j < last; j++) {
|
|
+ row[j] = altH.mem[i].elts() + (j-first)*m;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ for (long i = 0; i < m; i++) {
|
|
+ const zz_p* ptr = H.H[i].rep.elts();
|
|
+ long len = H.H[i].rep.length();
|
|
+ for (long j = 0; j < len; j++)
|
|
+ row[j][i] = rep(ptr[j]);
|
|
+ for (long j = len; j < n; j++)
|
|
+ row[j][i] = 0;
|
|
+ }
|
|
+ } else {
|
|
+
|
|
+ // sanity check
|
|
+ if (m >= (1L << (NTL_BITS_PER_LONG-8))) ResourceError("zz_pXAltArgument: overflow");
|
|
+
|
|
+ long npanels = (n+15)/16;
|
|
+ long panel_size = 16*m;
|
|
+
|
|
+ const long AllocAmt = 1L << 18;
|
|
+
|
|
+ long BlockSize = (AllocAmt + panel_size - 1)/panel_size;
|
|
+ long NumBlocks = (npanels + BlockSize - 1)/BlockSize;
|
|
+
|
|
+ altH.dmem.SetLength(NumBlocks);
|
|
+ altH.drow.SetLength(npanels);
|
|
+ double **drow = altH.drow.elts();
|
|
+
|
|
+ for (long i = 0; i < NumBlocks; i++) {
|
|
+ long first = i*BlockSize;
|
|
+ long last = min(npanels, first + BlockSize);
|
|
+ altH.dmem[i].SetLength((last-first)*panel_size);
|
|
+
|
|
+ double *ptr = altH.dmem[i].get();
|
|
+
|
|
+ for (long j = first; j < last; j++)
|
|
+ drow[j] = ptr + (j-first)*panel_size;
|
|
+ }
|
|
+
|
|
+ for (long i = 0; i < m; i++) {
|
|
+ const zz_p *ptr = H.H[i].rep.elts();
|
|
+ long len = H.H[i].rep.length();
|
|
+ for (long j = 0; j < len; j++)
|
|
+ drow[j/16][(i*16) + (j%16)] = rep(ptr[j]);
|
|
+ for (long j = len; j < npanels*16; j++)
|
|
+ drow[j/16][(i*16) + (j%16)] = 0;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+AVX_RESOLVER(void,build,
|
|
+ (zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F));
|
|
+
|
|
+#else
|
|
+
|
|
void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F)
|
|
{
|
|
altH.orig = &H;
|
|
@@ -1194,11 +1369,82 @@ void build(zz_pXAltArgument& altH, const
|
|
#endif
|
|
}
|
|
|
|
+#endif
|
|
+
|
|
|
|
#ifdef NTL_HAVE_LL_TYPE
|
|
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+AVX_FUNC(void,mul16rowsD)
|
|
+(double *x, const double *a, const double *b, long n)
|
|
+{
|
|
+ __m256d avec0, avec1, avec2, avec3;
|
|
+
|
|
+ __m256d acc0 = _mm256_setzero_pd();
|
|
+ __m256d acc1 = _mm256_setzero_pd();
|
|
+ __m256d acc2 = _mm256_setzero_pd();
|
|
+ __m256d acc3 = _mm256_setzero_pd();
|
|
+
|
|
+ __m256d bvec;
|
|
+
|
|
+ for (long i = 0; i < n; i++) {
|
|
+ bvec = _mm256_broadcast_sd(&b[i]);
|
|
+
|
|
+ avec0 = _mm256_load_pd(a); a += 4;
|
|
+ avec1 = _mm256_load_pd(a); a += 4;
|
|
+ avec2 = _mm256_load_pd(a); a += 4;
|
|
+ avec3 = _mm256_load_pd(a); a += 4;
|
|
+
|
|
+ acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0);
|
|
+ acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1);
|
|
+ acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2);
|
|
+ acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3);
|
|
+ }
|
|
+
|
|
+ _mm256_store_pd(x + 0*4, acc0);
|
|
+ _mm256_store_pd(x + 1*4, acc1);
|
|
+ _mm256_store_pd(x + 2*4, acc2);
|
|
+ _mm256_store_pd(x + 3*4, acc3);
|
|
+}
|
|
+
|
|
+FMA_FUNC(void,mul16rowsD)
|
|
+(double *x, const double *a, const double *b, long n)
|
|
+{
|
|
+ __m256d avec0, avec1, avec2, avec3;
|
|
+
|
|
+ __m256d acc0 = _mm256_setzero_pd();
|
|
+ __m256d acc1 = _mm256_setzero_pd();
|
|
+ __m256d acc2 = _mm256_setzero_pd();
|
|
+ __m256d acc3 = _mm256_setzero_pd();
|
|
+
|
|
+ __m256d bvec;
|
|
+
|
|
+ for (long i = 0; i < n; i++) {
|
|
+ bvec = _mm256_broadcast_sd(&b[i]);
|
|
+
|
|
+ avec0 = _mm256_load_pd(a); a += 4;
|
|
+ avec1 = _mm256_load_pd(a); a += 4;
|
|
+ avec2 = _mm256_load_pd(a); a += 4;
|
|
+ avec3 = _mm256_load_pd(a); a += 4;
|
|
+
|
|
+ acc0 = _mm256_fmadd_pd(avec0, bvec, acc0);
|
|
+ acc1 = _mm256_fmadd_pd(avec1, bvec, acc1);
|
|
+ acc2 = _mm256_fmadd_pd(avec2, bvec, acc2);
|
|
+ acc3 = _mm256_fmadd_pd(avec3, bvec, acc3);
|
|
+ }
|
|
+
|
|
+ _mm256_store_pd(x + 0*4, acc0);
|
|
+ _mm256_store_pd(x + 1*4, acc1);
|
|
+ _mm256_store_pd(x + 2*4, acc2);
|
|
+ _mm256_store_pd(x + 3*4, acc3);
|
|
+}
|
|
+
|
|
+FMA_RESOLVER(static void,mul16rowsD,
|
|
+ (double *x, const double *a, const double *b, long n));
|
|
+
|
|
+#elif defined(NTL_HAVE_AVX)
|
|
|
|
-#ifdef NTL_HAVE_AVX
|
|
static
|
|
void mul16rowsD(double *x, const double *a, const double *b, long n)
|
|
{
|
|
@@ -1243,6 +1489,114 @@ void mul16rowsD(double *x, const double
|
|
_mm256_store_pd(x + 3*4, acc3);
|
|
}
|
|
|
|
+#endif
|
|
+
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+AVX_FUNC(void,mul16rows2D)
|
|
+(double *x, double *x_, const double *a, const double *b, const double *b_, long n)
|
|
+{
|
|
+ __m256d avec0, avec1, avec2, avec3;
|
|
+
|
|
+ __m256d acc0 = _mm256_setzero_pd();
|
|
+ __m256d acc1 = _mm256_setzero_pd();
|
|
+ __m256d acc2 = _mm256_setzero_pd();
|
|
+ __m256d acc3 = _mm256_setzero_pd();
|
|
+
|
|
+ __m256d acc0_ = _mm256_setzero_pd();
|
|
+ __m256d acc1_ = _mm256_setzero_pd();
|
|
+ __m256d acc2_ = _mm256_setzero_pd();
|
|
+ __m256d acc3_ = _mm256_setzero_pd();
|
|
+
|
|
+
|
|
+ __m256d bvec;
|
|
+ __m256d bvec_;
|
|
+
|
|
+ for (long i = 0; i < n; i++) {
|
|
+ bvec = _mm256_broadcast_sd(&b[i]);
|
|
+ bvec_ = _mm256_broadcast_sd(&b_[i]);
|
|
+
|
|
+ avec0 = _mm256_load_pd(a); a += 4;
|
|
+ avec1 = _mm256_load_pd(a); a += 4;
|
|
+ avec2 = _mm256_load_pd(a); a += 4;
|
|
+ avec3 = _mm256_load_pd(a); a += 4;
|
|
+
|
|
+ acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0);
|
|
+ acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1);
|
|
+ acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2);
|
|
+ acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3);
|
|
+
|
|
+ acc0_ = _mm256_add_pd(_mm256_mul_pd(avec0, bvec_), acc0_);
|
|
+ acc1_ = _mm256_add_pd(_mm256_mul_pd(avec1, bvec_), acc1_);
|
|
+ acc2_ = _mm256_add_pd(_mm256_mul_pd(avec2, bvec_), acc2_);
|
|
+ acc3_ = _mm256_add_pd(_mm256_mul_pd(avec3, bvec_), acc3_);
|
|
+ }
|
|
+
|
|
+ _mm256_store_pd(x + 0*4, acc0);
|
|
+ _mm256_store_pd(x + 1*4, acc1);
|
|
+ _mm256_store_pd(x + 2*4, acc2);
|
|
+ _mm256_store_pd(x + 3*4, acc3);
|
|
+
|
|
+ _mm256_store_pd(x_ + 0*4, acc0_);
|
|
+ _mm256_store_pd(x_ + 1*4, acc1_);
|
|
+ _mm256_store_pd(x_ + 2*4, acc2_);
|
|
+ _mm256_store_pd(x_ + 3*4, acc3_);
|
|
+}
|
|
+
|
|
+FMA_FUNC(void,mul16rows2D)
|
|
+(double *x, double *x_, const double *a, const double *b, const double *b_, long n)
|
|
+{
|
|
+ __m256d avec0, avec1, avec2, avec3;
|
|
+
|
|
+ __m256d acc0 = _mm256_setzero_pd();
|
|
+ __m256d acc1 = _mm256_setzero_pd();
|
|
+ __m256d acc2 = _mm256_setzero_pd();
|
|
+ __m256d acc3 = _mm256_setzero_pd();
|
|
+
|
|
+ __m256d acc0_ = _mm256_setzero_pd();
|
|
+ __m256d acc1_ = _mm256_setzero_pd();
|
|
+ __m256d acc2_ = _mm256_setzero_pd();
|
|
+ __m256d acc3_ = _mm256_setzero_pd();
|
|
+
|
|
+
|
|
+ __m256d bvec;
|
|
+ __m256d bvec_;
|
|
+
|
|
+ for (long i = 0; i < n; i++) {
|
|
+ bvec = _mm256_broadcast_sd(&b[i]);
|
|
+ bvec_ = _mm256_broadcast_sd(&b_[i]);
|
|
+
|
|
+ avec0 = _mm256_load_pd(a); a += 4;
|
|
+ avec1 = _mm256_load_pd(a); a += 4;
|
|
+ avec2 = _mm256_load_pd(a); a += 4;
|
|
+ avec3 = _mm256_load_pd(a); a += 4;
|
|
+
|
|
+ acc0 = _mm256_fmadd_pd(avec0, bvec, acc0);
|
|
+ acc1 = _mm256_fmadd_pd(avec1, bvec, acc1);
|
|
+ acc2 = _mm256_fmadd_pd(avec2, bvec, acc2);
|
|
+ acc3 = _mm256_fmadd_pd(avec3, bvec, acc3);
|
|
+
|
|
+ acc0_ = _mm256_fmadd_pd(avec0, bvec_, acc0_);
|
|
+ acc1_ = _mm256_fmadd_pd(avec1, bvec_, acc1_);
|
|
+ acc2_ = _mm256_fmadd_pd(avec2, bvec_, acc2_);
|
|
+ acc3_ = _mm256_fmadd_pd(avec3, bvec_, acc3_);
|
|
+ }
|
|
+
|
|
+ _mm256_store_pd(x + 0*4, acc0);
|
|
+ _mm256_store_pd(x + 1*4, acc1);
|
|
+ _mm256_store_pd(x + 2*4, acc2);
|
|
+ _mm256_store_pd(x + 3*4, acc3);
|
|
+
|
|
+ _mm256_store_pd(x_ + 0*4, acc0_);
|
|
+ _mm256_store_pd(x_ + 1*4, acc1_);
|
|
+ _mm256_store_pd(x_ + 2*4, acc2_);
|
|
+ _mm256_store_pd(x_ + 3*4, acc3_);
|
|
+}
|
|
+
|
|
+FMA_RESOLVER(static void,mul16rows2D,
|
|
+ (double *x, double *x_, const double *a, const double *b, const double *b_, long n));
|
|
+
|
|
+#elif defined(NTL_HAVE_AVX)
|
|
static
|
|
void mul16rows2D(double *x, double *x_, const double *a, const double *b, const double *b_, long n)
|
|
{
|
|
@@ -1309,6 +1663,7 @@ void mul16rows2D(double *x, double *x_,
|
|
_mm256_store_pd(x_ + 3*4, acc3_);
|
|
}
|
|
|
|
+#endif
|
|
|
|
#endif
|
|
|
|
@@ -1422,7 +1777,7 @@ void CompMod_L(zz_pX& x, const zz_pX& g,
|
|
}
|
|
|
|
|
|
-#ifdef NTL_HAVE_AVX
|
|
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
|
|
|
|
static
|
|
void InnerProduct_AVX(zz_pX& x, const Vec<double>& v, long low, long high,
|
|
@@ -1534,7 +1889,6 @@ void CompMod_AVX(zz_pX& x, const zz_pX&
|
|
|
|
x = t;
|
|
}
|
|
-#endif
|
|
|
|
|
|
|
|
@@ -1570,6 +1924,14 @@ void CompMod(zz_pX& x, const zz_pX& g, c
|
|
break;
|
|
|
|
#endif
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+ case 3:
|
|
+ if (have_avx) {
|
|
+ CompMod_AVX(x, g, A, F);
|
|
+ break;
|
|
+ }
|
|
+ /* FALLTHRU */
|
|
+#endif
|
|
|
|
default:
|
|
LogicError("CompMod: bad strategy");
|
|
--- src/mat_lzz_p.c.orig 2016-06-21 12:46:44.000000000 -0600
|
|
+++ src/mat_lzz_p.c 2016-07-20 19:57:16.776610306 -0600
|
|
@@ -10,6 +10,15 @@
|
|
|
|
#ifdef NTL_HAVE_AVX
|
|
#include <immintrin.h>
|
|
+#define AVX_ACTIVE 1
|
|
+#elif defined(NTL_LOADTIME_CPU)
|
|
+#include <immintrin.h>
|
|
+#define AVX_ACTIVE have_avx
|
|
+
|
|
+static int have_pclmul = -1;
|
|
+static int have_avx = -1;
|
|
+static int have_fma = -1;
|
|
+static int have_avx2 = -1;
|
|
#endif
|
|
|
|
NTL_START_IMPL
|
|
@@ -626,7 +635,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
|
|
|
|
#ifdef NTL_HAVE_LL_TYPE
|
|
|
|
-#ifdef NTL_HAVE_AVX
|
|
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
|
|
|
|
#define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1)
|
|
// max int representable exactly as a double
|
|
@@ -640,19 +649,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
|
|
|
|
|
|
// MUL_ADD(a, b, c): a += b*c
|
|
+#define FMA_MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a)
|
|
+#define AVX_MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
|
|
#ifdef NTL_HAVE_FMA
|
|
-#define MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a)
|
|
+#define MUL_ADD(a, b, c) FMA_MUL_ADD(a, b, c)
|
|
#else
|
|
-#define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
|
|
+#define MUL_ADD(a, b, c) AVX_MUL_ADD(a, b, c)
|
|
#endif
|
|
|
|
-#if 0
|
|
-static
|
|
-void muladd1_by_32(double *x, const double *a, const double *b, long n)
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+
|
|
+AVX_FUNC(void,muladd1_by_32)
|
|
+(double *x, const double *a, const double *b, long n)
|
|
{
|
|
- __m256d avec, bvec;
|
|
+ __m256d acc0=_mm256_load_pd(x + 0*4);
|
|
+ __m256d acc1=_mm256_load_pd(x + 1*4);
|
|
+ __m256d acc2=_mm256_load_pd(x + 2*4);
|
|
+ __m256d acc3=_mm256_load_pd(x + 3*4);
|
|
+ __m256d acc4=_mm256_load_pd(x + 4*4);
|
|
+ __m256d acc5=_mm256_load_pd(x + 5*4);
|
|
+ __m256d acc6=_mm256_load_pd(x + 6*4);
|
|
+ __m256d acc7=_mm256_load_pd(x + 7*4);
|
|
+
|
|
+ long i = 0;
|
|
+ for (; i <= n-4; i +=4) {
|
|
+
|
|
+ // the following code sequences are a bit faster than
|
|
+ // just doing 4 _mm256_broadcast_sd's
|
|
+ // it requires a to point to aligned storage, however
|
|
+
|
|
+#if 1
|
|
+ // this one seems slightly faster
|
|
+ __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0));
|
|
+ __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2));
|
|
+#else
|
|
+ __m256d avec = _mm256_load_pd(a);
|
|
+ __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0);
|
|
+ __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11);
|
|
+
|
|
+#endif
|
|
+
|
|
+ __m256d avec0 = _mm256_permute_pd(a0101, 0);
|
|
+ __m256d avec1 = _mm256_permute_pd(a0101, 0xf);
|
|
+ __m256d avec2 = _mm256_permute_pd(a2323, 0);
|
|
+ __m256d avec3 = _mm256_permute_pd(a2323, 0xf);
|
|
+
|
|
+ a += 4;
|
|
+
|
|
+ __m256d bvec;
|
|
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec0, bvec);
|
|
+
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec1, bvec);
|
|
+
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec2, bvec);
|
|
+
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec3, bvec);
|
|
+ }
|
|
+
|
|
+ for (; i < n; i++) {
|
|
+ __m256d avec = _mm256_broadcast_sd(a); a++;
|
|
+ __m256d bvec;
|
|
+
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec, bvec);
|
|
+ }
|
|
+
|
|
+
|
|
+ _mm256_store_pd(x + 0*4, acc0);
|
|
+ _mm256_store_pd(x + 1*4, acc1);
|
|
+ _mm256_store_pd(x + 2*4, acc2);
|
|
+ _mm256_store_pd(x + 3*4, acc3);
|
|
+ _mm256_store_pd(x + 4*4, acc4);
|
|
+ _mm256_store_pd(x + 5*4, acc5);
|
|
+ _mm256_store_pd(x + 6*4, acc6);
|
|
+ _mm256_store_pd(x + 7*4, acc7);
|
|
+}
|
|
|
|
+FMA_FUNC(void,muladd1_by_32)
|
|
+(double *x, const double *a, const double *b, long n)
|
|
+{
|
|
__m256d acc0=_mm256_load_pd(x + 0*4);
|
|
__m256d acc1=_mm256_load_pd(x + 1*4);
|
|
__m256d acc2=_mm256_load_pd(x + 2*4);
|
|
@@ -662,19 +772,82 @@ void muladd1_by_32(double *x, const doub
|
|
__m256d acc6=_mm256_load_pd(x + 6*4);
|
|
__m256d acc7=_mm256_load_pd(x + 7*4);
|
|
|
|
+ long i = 0;
|
|
+ for (; i <= n-4; i +=4) {
|
|
|
|
- for (long i = 0; i < n; i++) {
|
|
- avec = _mm256_broadcast_sd(a); a++;
|
|
+ // the following code sequences are a bit faster than
|
|
+ // just doing 4 _mm256_broadcast_sd's
|
|
+ // it requires a to point to aligned storage, however
|
|
+
|
|
+#if 1
|
|
+ // this one seems slightly faster
|
|
+ __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0));
|
|
+ __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2));
|
|
+#else
|
|
+ __m256d avec = _mm256_load_pd(a);
|
|
+ __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0);
|
|
+ __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11);
|
|
|
|
+#endif
|
|
|
|
- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
|
|
- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
|
|
- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec);
|
|
- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec);
|
|
- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec);
|
|
- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec);
|
|
- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec);
|
|
- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec);
|
|
+ __m256d avec0 = _mm256_permute_pd(a0101, 0);
|
|
+ __m256d avec1 = _mm256_permute_pd(a0101, 0xf);
|
|
+ __m256d avec2 = _mm256_permute_pd(a2323, 0);
|
|
+ __m256d avec3 = _mm256_permute_pd(a2323, 0xf);
|
|
+
|
|
+ a += 4;
|
|
+
|
|
+ __m256d bvec;
|
|
+
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec0, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec0, bvec);
|
|
+
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec1, bvec);
|
|
+
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec2, bvec);
|
|
+
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec3, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec3, bvec);
|
|
+ }
|
|
+
|
|
+ for (; i < n; i++) {
|
|
+ __m256d avec = _mm256_broadcast_sd(a); a++;
|
|
+ __m256d bvec;
|
|
+
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec, bvec);
|
|
+ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec, bvec);
|
|
}
|
|
|
|
|
|
@@ -688,6 +861,9 @@ void muladd1_by_32(double *x, const doub
|
|
_mm256_store_pd(x + 7*4, acc7);
|
|
}
|
|
|
|
+FMA_RESOLVER(static void,muladd1_by_32,
|
|
+ (double *x, const double *a, const double *b, long n));
|
|
+
|
|
#else
|
|
|
|
static
|
|
@@ -794,7 +970,164 @@ void muladd1_by_32(double *x, const doub
|
|
#endif
|
|
|
|
// experiment: process two rows at a time
|
|
-#if 1
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+AVX_FUNC(void,muladd2_by_32)
|
|
+(double *x, const double *a, const double *b, long n)
|
|
+{
|
|
+ __m256d avec0, avec1, bvec;
|
|
+ __m256d acc00, acc01, acc02, acc03;
|
|
+ __m256d acc10, acc11, acc12, acc13;
|
|
+
|
|
+
|
|
+ // round 0
|
|
+
|
|
+ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
|
|
+ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
|
|
+ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
|
|
+ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
|
|
+
|
|
+ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
|
|
+ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
|
|
+ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
|
|
+ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
|
|
+
|
|
+ for (long i = 0; i < n; i++) {
|
|
+ avec0 = _mm256_broadcast_sd(&a[i]);
|
|
+ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
|
|
+
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec);
|
|
+ }
|
|
+
|
|
+
|
|
+ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
|
|
+ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
|
|
+ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
|
|
+ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
|
|
+
|
|
+ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
|
|
+ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
|
|
+ _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
|
|
+ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
|
|
+
|
|
+ // round 1
|
|
+
|
|
+ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
|
|
+ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
|
|
+ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
|
|
+ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
|
|
+
|
|
+ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
|
|
+ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
|
|
+ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
|
|
+ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
|
|
+
|
|
+ for (long i = 0; i < n; i++) {
|
|
+ avec0 = _mm256_broadcast_sd(&a[i]);
|
|
+ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
|
|
+
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec);
|
|
+ }
|
|
+
|
|
+
|
|
+ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
|
|
+ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
|
|
+ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
|
|
+ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
|
|
+
|
|
+ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
|
|
+ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
|
|
+ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
|
|
+ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
|
|
+
|
|
+}
|
|
+
|
|
+FMA_FUNC(void,muladd2_by_32)
|
|
+(double *x, const double *a, const double *b, long n)
|
|
+{
|
|
+ __m256d avec0, avec1, bvec;
|
|
+ __m256d acc00, acc01, acc02, acc03;
|
|
+ __m256d acc10, acc11, acc12, acc13;
|
|
+
|
|
+
|
|
+ // round 0
|
|
+
|
|
+ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
|
|
+ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
|
|
+ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
|
|
+ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
|
|
+
|
|
+ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
|
|
+ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
|
|
+ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
|
|
+ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
|
|
+
|
|
+ for (long i = 0; i < n; i++) {
|
|
+ avec0 = _mm256_broadcast_sd(&a[i]);
|
|
+ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
|
|
+
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec);
|
|
+ }
|
|
+
|
|
+
|
|
+ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
|
|
+ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
|
|
+ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
|
|
+ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
|
|
+
|
|
+ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
|
|
+ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
|
|
+ _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
|
|
+ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
|
|
+
|
|
+ // round 1
|
|
+
|
|
+ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
|
|
+ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
|
|
+ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
|
|
+ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
|
|
+
|
|
+ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
|
|
+ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
|
|
+ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
|
|
+ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
|
|
+
|
|
+ for (long i = 0; i < n; i++) {
|
|
+ avec0 = _mm256_broadcast_sd(&a[i]);
|
|
+ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
|
|
+
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec);
|
|
+ }
|
|
+
|
|
+
|
|
+ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
|
|
+ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
|
|
+ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
|
|
+ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
|
|
+
|
|
+ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
|
|
+ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
|
|
+ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
|
|
+ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
|
|
+
|
|
+}
|
|
+
|
|
+FMA_RESOLVER(static void,muladd2_by_32,
|
|
+ (double *x, const double *a, const double *b, long n));
|
|
+
|
|
+#else
|
|
+
|
|
static
|
|
void muladd2_by_32(double *x, const double *a, const double *b, long n)
|
|
{
|
|
@@ -870,96 +1203,217 @@ void muladd2_by_32(double *x, const doub
|
|
_mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
|
|
|
|
}
|
|
+#endif
|
|
|
|
-#else
|
|
|
|
-static
|
|
-void muladd2_by_32(double *x, const double *a, const double *b, long n)
|
|
+
|
|
+// experiment: process three rows at a time
|
|
+// NOTE: this makes things slower on an AVX1 platform --- not enough registers
|
|
+// it could be faster on AVX2/FMA, where there should be enough registers
|
|
+
|
|
+#ifdef NTL_LOADTIME_CPU
|
|
+FMA_FUNC(void,muladd3_by_32)
|
|
+(double *x, const double *a, const double *b, long n)
|
|
{
|
|
- long i, j;
|
|
- __m256d bvec;
|
|
+ __m256d avec0, avec1, avec2, bvec;
|
|
__m256d acc00, acc01, acc02, acc03;
|
|
__m256d acc10, acc11, acc12, acc13;
|
|
+ __m256d acc20, acc21, acc22, acc23;
|
|
+
|
|
|
|
+ // round 0
|
|
|
|
- for (j = 0; j < 2; j++) {
|
|
+ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
|
|
+ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
|
|
+ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
|
|
+ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
|
|
|
|
- acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
|
|
- acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
|
|
- acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
|
|
- acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
|
|
+ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
|
|
+ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
|
|
+ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
|
|
+ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
|
|
|
|
- acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
|
|
- acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
|
|
- acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
|
|
- acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
|
|
+ acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
|
|
+ acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
|
|
+ acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
|
|
+ acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);
|
|
|
|
- for (i = 0; i <= n-4; i+=4) {
|
|
- __m256d a0_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0));
|
|
- __m256d a0_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2));
|
|
- __m256d avec00 = _mm256_permute_pd(a0_0101, 0);
|
|
- __m256d avec01 = _mm256_permute_pd(a0_0101, 0xf);
|
|
- __m256d avec02 = _mm256_permute_pd(a0_2323, 0);
|
|
- __m256d avec03 = _mm256_permute_pd(a0_2323, 0xf);
|
|
+ for (long i = 0; i < n; i++) {
|
|
+ avec0 = _mm256_broadcast_sd(&a[i]);
|
|
+ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
|
|
+ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
|
|
|
|
- __m256d a1_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0+MAT_BLK_SZ));
|
|
- __m256d a1_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2+MAT_BLK_SZ));
|
|
- __m256d avec10 = _mm256_permute_pd(a1_0101, 0);
|
|
- __m256d avec11 = _mm256_permute_pd(a1_0101, 0xf);
|
|
- __m256d avec12 = _mm256_permute_pd(a1_2323, 0);
|
|
- __m256d avec13 = _mm256_permute_pd(a1_2323, 0xf);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec);
|
|
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec);
|
|
+ }
|
|
|
|
- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec00, bvec); MUL_ADD(acc10, avec10, bvec);
|
|
- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec00, bvec); MUL_ADD(acc11, avec10, bvec);
|
|
- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec00, bvec); MUL_ADD(acc12, avec10, bvec);
|
|
- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec00, bvec); MUL_ADD(acc13, avec10, bvec);
|
|
|
|
- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec01, bvec); MUL_ADD(acc10, avec11, bvec);
|
|
- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec01, bvec); MUL_ADD(acc11, avec11, bvec);
|
|
- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec01, bvec); MUL_ADD(acc12, avec11, bvec);
|
|
- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec01, bvec); MUL_ADD(acc13, avec11, bvec);
+ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
+ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
+ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
+ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);

- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec02, bvec); MUL_ADD(acc10, avec12, bvec);
- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec02, bvec); MUL_ADD(acc11, avec12, bvec);
- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec02, bvec); MUL_ADD(acc12, avec12, bvec);
- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec02, bvec); MUL_ADD(acc13, avec12, bvec);
+ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
+ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
+ _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
+ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);

- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec03, bvec); MUL_ADD(acc10, avec13, bvec);
- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec03, bvec); MUL_ADD(acc11, avec13, bvec);
- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec03, bvec); MUL_ADD(acc12, avec13, bvec);
- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec03, bvec); MUL_ADD(acc13, avec13, bvec);
- }
+ _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
+ _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
+ _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
+ _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);

- for (; i < n; i++) {
- __m256d avec0 = _mm256_broadcast_sd(&a[i]);
- __m256d avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
+ // round 1

- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
- }
+ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
+ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
+ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
+ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);

+ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
+ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
+ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
+ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);

- _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc00);
- _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc01);
- _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc02);
- _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc03);
+ acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
+ acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
+ acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
+ acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);

- _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc10);
- _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc11);
- _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc12);
- _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc13);
+ for (long i = 0; i < n; i++) {
+ avec0 = _mm256_broadcast_sd(&a[i]);
+ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
+ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);

+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec);
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec);
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec);
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec);
}
+
+
+ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
+ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
+ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
+ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
+
+ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
+ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
+ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
+ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
+
+ _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
+ _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
+ _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
+ _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);
+
}
-#endif

+AVX2_FUNC(void,muladd3_by_32)
+(double *x, const double *a, const double *b, long n)
+{
+ __m256d avec0, avec1, avec2, bvec;
+ __m256d acc00, acc01, acc02, acc03;
+ __m256d acc10, acc11, acc12, acc13;
+ __m256d acc20, acc21, acc22, acc23;
+
+
+ // round 0

+ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
+ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
+ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
+ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);

-// experiment: process three rows at a time
-// NOTE: this makes things slower on an AVX1 platform --- not enough registers
-// it could be faster on AVX2/FMA, where there should be enough registers
+ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
+ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
+ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
+ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
+
+ acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
+ acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
+ acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
+ acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);
+
+ for (long i = 0; i < n; i++) {
+ avec0 = _mm256_broadcast_sd(&a[i]);
+ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
+ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
+
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec);
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec);
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec);
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec);
+ }
+
+
+ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
+ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
+ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
+ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
+
+ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
+ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
+ _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
+ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
+
+ _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
+ _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
+ _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
+ _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
+
+ // round 1
+
+ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
+ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
+ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
+ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
+
+ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
+ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
+ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
+ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
+
+ acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
+ acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
+ acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
+ acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);
+
+ for (long i = 0; i < n; i++) {
+ avec0 = _mm256_broadcast_sd(&a[i]);
+ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
+ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
+
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec);
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec);
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec);
+ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec);
+ }
+
+
+ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
+ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
+ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
+ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
+
+ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
+ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
+ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
+ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
+
+ _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
+ _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
+ _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
+ _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);
+
+}
+
+AVX2_RESOLVER(static void,muladd3_by_32,
+ (double *x, const double *a, const double *b, long n));
+
+#else

static
void muladd3_by_32(double *x, const double *a, const double *b, long n)
@@ -1060,6 +1514,32 @@ void muladd3_by_32(double *x, const doub

}

+#endif
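
The AVX2_RESOLVER declaration just above (like the FMA_RESOLVER declarations further down) binds the target-specific kernels to a single entry point that is resolved once, at load time, instead of testing the CPU on every call. A minimal stand-alone sketch of that pattern, using GCC's ifunc attribute and __builtin_cpu_supports; the names axpy/resolve_axpy are illustrative only and not the patch's actual macro expansion:

#include <cstddef>

extern "C" {

static void axpy_base(double *x, const double *y, double c, size_t n)
{
   for (size_t i = 0; i < n; i++) x[i] += y[i]*c;
}

static void __attribute__((target("fma,avx")))
axpy_fma(double *x, const double *y, double c, size_t n)
{
   for (size_t i = 0; i < n; i++) x[i] += y[i]*c;   // compiler may emit vfmadd here
}

// The dynamic loader calls this resolver before the first call to axpy()
// and patches the symbol to point at whichever body it returns.
static void (*resolve_axpy(void))(double *, const double *, double, size_t)
{
   __builtin_cpu_init();                            // make cpu feature bits valid this early
   return __builtin_cpu_supports("fma") ? axpy_fma : axpy_base;
}

}  // extern "C"

void axpy(double *x, const double *y, double c, size_t n)
   __attribute__((ifunc("resolve_axpy")));          // requires GNU/ELF toolchain support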
+
+#ifdef NTL_LOADTIME_CPU
+
+static inline
+void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
+{
+ long i = first;
+
+ if (have_fma) {
+ // processing three rows at a time is faster
+ for (; i <= last-3; i+=3)
+ muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+ for (; i < last; i++)
+ muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+ } else {
+ // process only two rows at a time: not enough registers :-(
+ for (; i <= last-2; i+=2)
+ muladd2_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+ for (; i < last; i++)
+ muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+ }
+}
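
The have_fma branch above matches the register budget of the kernels: with FMA the three-row kernel keeps every operand in a register, while plain AVX needs an extra temporary for each product and would spill. A rough count, assuming the usual 16 ymm registers of x86_64 (a sketch of the reasoning, not text from the patch):

/*
 * Three rows of a 32-wide block, processed 4 doubles at a time:
 *
 *   3 rows x 4 accumulators  = 12 ymm   (acc00 .. acc23)
 *   3 broadcast A values     =  3 ymm   (avec0 .. avec2)
 *   1 shared B vector        =  1 ymm   (bvec)
 *                              -------
 *                              16 ymm   -- exactly fills the register file,
 *                                          since FMA_MUL_ADD is presumably a
 *                                          single fused vfmadd instruction.
 *
 * Without FMA, a multiply-then-add needs one more ymm for the product, so the
 * same three-row tiling exceeds 16 registers; hence the two-row fallback.
 */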
+
+#else
+
static inline
void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
{
@@ -1079,8 +1559,79 @@ void muladd_all_by_32(long first, long l
#endif
}

+#endif
+

// this assumes n is a multiple of 16
+#ifdef NTL_LOADTIME_CPU
+
+AVX_FUNC(void,muladd_interval)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+ __m256d xvec0, xvec1, xvec2, xvec3;
+ __m256d yvec0, yvec1, yvec2, yvec3;
+
+ __m256d cvec = _mm256_broadcast_sd(&c);
+
+ for (long i = 0; i < n; i += 16, x += 16, y += 16) {
+ xvec0 = _mm256_load_pd(x+0*4);
+ xvec1 = _mm256_load_pd(x+1*4);
+ xvec2 = _mm256_load_pd(x+2*4);
+ xvec3 = _mm256_load_pd(x+3*4);
+
+ yvec0 = _mm256_load_pd(y+0*4);
+ yvec1 = _mm256_load_pd(y+1*4);
+ yvec2 = _mm256_load_pd(y+2*4);
+ yvec3 = _mm256_load_pd(y+3*4);
+
+ AVX_MUL_ADD(xvec0, yvec0, cvec);
+ AVX_MUL_ADD(xvec1, yvec1, cvec);
+ AVX_MUL_ADD(xvec2, yvec2, cvec);
+ AVX_MUL_ADD(xvec3, yvec3, cvec);
+
+ _mm256_store_pd(x + 0*4, xvec0);
+ _mm256_store_pd(x + 1*4, xvec1);
+ _mm256_store_pd(x + 2*4, xvec2);
+ _mm256_store_pd(x + 3*4, xvec3);
+ }
+}
+
+FMA_FUNC(void,muladd_interval)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+ __m256d xvec0, xvec1, xvec2, xvec3;
+ __m256d yvec0, yvec1, yvec2, yvec3;
+
+ __m256d cvec = _mm256_broadcast_sd(&c);
+
+ for (long i = 0; i < n; i += 16, x += 16, y += 16) {
+ xvec0 = _mm256_load_pd(x+0*4);
+ xvec1 = _mm256_load_pd(x+1*4);
+ xvec2 = _mm256_load_pd(x+2*4);
+ xvec3 = _mm256_load_pd(x+3*4);
+
+ yvec0 = _mm256_load_pd(y+0*4);
+ yvec1 = _mm256_load_pd(y+1*4);
+ yvec2 = _mm256_load_pd(y+2*4);
+ yvec3 = _mm256_load_pd(y+3*4);
+
+ FMA_MUL_ADD(xvec0, yvec0, cvec);
+ FMA_MUL_ADD(xvec1, yvec1, cvec);
+ FMA_MUL_ADD(xvec2, yvec2, cvec);
+ FMA_MUL_ADD(xvec3, yvec3, cvec);
+
+ _mm256_store_pd(x + 0*4, xvec0);
+ _mm256_store_pd(x + 1*4, xvec1);
+ _mm256_store_pd(x + 2*4, xvec2);
+ _mm256_store_pd(x + 3*4, xvec3);
+ }
+}
+
+FMA_RESOLVER(static void,muladd_interval,
+ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n));
+
+#else
+
static inline
void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
{
@@ -1111,9 +1662,109 @@ void muladd_interval(double * NTL_RESTRI
_mm256_store_pd(x + 3*4, xvec3);
}
}
+#endif
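
The AVX_MUL_ADD and FMA_MUL_ADD macros used by these kernels are defined elsewhere in the patch; a plausible reading of them, shown only as an assumption to make the two code paths easier to compare:

/* Hypothetical expansions -- the patch's real definitions may differ. */
#include <immintrin.h>

/* AVX1: separate multiply and add, so the product needs its own register
 * and the result is rounded twice. */
#define AVX_MUL_ADD(acc, a, b) \
   ((acc) = _mm256_add_pd((acc), _mm256_mul_pd((a), (b))))

/* FMA: one fused instruction, acc = a*b + acc, no temporary, one rounding. */
#define FMA_MUL_ADD(acc, a, b) \
   ((acc) = _mm256_fmadd_pd((a), (b), (acc)))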

// this one is more general: does not assume that n is a
// multiple of 16
+#ifdef NTL_LOADTIME_CPU
+
+AVX_FUNC(void,muladd_interval1)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+
+ __m256d xvec0, xvec1, xvec2, xvec3;
+ __m256d yvec0, yvec1, yvec2, yvec3;
+ __m256d cvec;
+
+ if (n >= 4)
+ cvec = _mm256_broadcast_sd(&c);
+
+ long i=0;
+ for (; i <= n-16; i += 16, x += 16, y += 16) {
+ xvec0 = _mm256_load_pd(x+0*4);
+ xvec1 = _mm256_load_pd(x+1*4);
+ xvec2 = _mm256_load_pd(x+2*4);
+ xvec3 = _mm256_load_pd(x+3*4);
+
+ yvec0 = _mm256_load_pd(y+0*4);
+ yvec1 = _mm256_load_pd(y+1*4);
+ yvec2 = _mm256_load_pd(y+2*4);
+ yvec3 = _mm256_load_pd(y+3*4);
+
+ AVX_MUL_ADD(xvec0, yvec0, cvec);
+ AVX_MUL_ADD(xvec1, yvec1, cvec);
+ AVX_MUL_ADD(xvec2, yvec2, cvec);
+ AVX_MUL_ADD(xvec3, yvec3, cvec);
+
+ _mm256_store_pd(x + 0*4, xvec0);
+ _mm256_store_pd(x + 1*4, xvec1);
+ _mm256_store_pd(x + 2*4, xvec2);
+ _mm256_store_pd(x + 3*4, xvec3);
+ }
+
+ for (; i <= n-4; i += 4, x += 4, y += 4) {
+ xvec0 = _mm256_load_pd(x+0*4);
+ yvec0 = _mm256_load_pd(y+0*4);
+ AVX_MUL_ADD(xvec0, yvec0, cvec);
+ _mm256_store_pd(x + 0*4, xvec0);
+ }
+
+ for (; i < n; i++, x++, y++) {
+ *x += (*y)*c;
+ }
+}
+
+FMA_FUNC(void,muladd_interval1)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+
+ __m256d xvec0, xvec1, xvec2, xvec3;
+ __m256d yvec0, yvec1, yvec2, yvec3;
+ __m256d cvec;
+
+ if (n >= 4)
+ cvec = _mm256_broadcast_sd(&c);
+
+ long i=0;
+ for (; i <= n-16; i += 16, x += 16, y += 16) {
+ xvec0 = _mm256_load_pd(x+0*4);
+ xvec1 = _mm256_load_pd(x+1*4);
+ xvec2 = _mm256_load_pd(x+2*4);
+ xvec3 = _mm256_load_pd(x+3*4);
+
+ yvec0 = _mm256_load_pd(y+0*4);
+ yvec1 = _mm256_load_pd(y+1*4);
+ yvec2 = _mm256_load_pd(y+2*4);
+ yvec3 = _mm256_load_pd(y+3*4);
+
+ FMA_MUL_ADD(xvec0, yvec0, cvec);
+ FMA_MUL_ADD(xvec1, yvec1, cvec);
+ FMA_MUL_ADD(xvec2, yvec2, cvec);
+ FMA_MUL_ADD(xvec3, yvec3, cvec);
+
+ _mm256_store_pd(x + 0*4, xvec0);
+ _mm256_store_pd(x + 1*4, xvec1);
+ _mm256_store_pd(x + 2*4, xvec2);
+ _mm256_store_pd(x + 3*4, xvec3);
+ }
+
+ for (; i <= n-4; i += 4, x += 4, y += 4) {
+ xvec0 = _mm256_load_pd(x+0*4);
+ yvec0 = _mm256_load_pd(y+0*4);
+ FMA_MUL_ADD(xvec0, yvec0, cvec);
+ _mm256_store_pd(x + 0*4, xvec0);
+ }
+
+ for (; i < n; i++, x++, y++) {
+ *x += (*y)*c;
+ }
+}
+
+FMA_RESOLVER(static void,muladd_interval1,
+ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n));
+
+#else
+
static inline
void muladd_interval1(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
{
@@ -1159,11 +1810,74 @@ void muladd_interval1(double * NTL_RESTR
*x += (*y)*c;
}
}
+#endif

#define AVX_PD_SZ (4)

// experimental: assumes n is a multiple of 4 in the range [0..32]
-#if 1
+#ifdef NTL_LOADTIME_CPU
+
+AVX_FUNC(void,muladd_interval2)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+ n /= 4;
+ if (n <= 0 || n > 8) return;
+
+ x += n*4;
+ y += n*4;
+
+ // n in [1..8]
+
+ __m256d xvec, yvec, cvec;
+
+ cvec = _mm256_broadcast_sd(&c);
+
+ switch (n) {
+ case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec);
+ case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec);
+ case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec);
+ case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec);
+ case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec);
+ case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec);
+ case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec);
+ case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec);
+ }
+
+}
+
+FMA_FUNC(void,muladd_interval2)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+ n /= 4;
+ if (n <= 0 || n > 8) return;
+
+ x += n*4;
+ y += n*4;
+
+ // n in [1..8]
+
+ __m256d xvec, yvec, cvec;
+
+ cvec = _mm256_broadcast_sd(&c);
+
+ switch (n) {
+ case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec);
+ case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec);
+ case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec);
+ case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec);
+ case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec);
+ case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec);
+ case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec);
+ case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec);
+ }
+
+}
+
+FMA_RESOLVER(static void,muladd_interval2,
+ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n));
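
Both muladd_interval2 variants rely on the same trick: the pointers are advanced past the end of the block and a switch with deliberate fall-through then executes exactly n groups of four, counting backwards, so these short fixed-size tails run with no loop overhead. The same pattern on plain scalars, as a small illustration (scale_tail is a made-up name, not part of the patch):

// Unrolled tail handling via switch fall-through (Duff's-device style).
// Processes n elements, 0 <= n <= 8, with no loop at all.
static inline void scale_tail(double *x, const double *y, double c, long n)
{
   if (n <= 0 || n > 8) return;
   x += n;                      // jump past the end ...
   y += n;
   switch (n) {                 // ... and fall through backwards from -n
   case 8: x[-8] += y[-8]*c;    // no break: each case falls into the next
   case 7: x[-7] += y[-7]*c;
   case 6: x[-6] += y[-6]*c;
   case 5: x[-5] += y[-5]*c;
   case 4: x[-4] += y[-4]*c;
   case 3: x[-3] += y[-3]*c;
   case 2: x[-2] += y[-2]*c;
   case 1: x[-1] += y[-1]*c;
   }
}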
+
+#else
+
static inline
void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
{
@@ -1191,13 +1905,6 @@ void muladd_interval2(double * NTL_RESTR
}

}
-#else
-static inline
-void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
-{
- for (long i = 0; i < n; i++)
- x[i] += y[i]*c;
-}
#endif

#endif
@@ -2031,10 +2738,10 @@ void alt_mul_LL(const mat_window_zz_p& X
}


-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)

-static
-void blk_mul_DD(const mat_window_zz_p& X,
+static void __attribute__((target ("avx,pclmul")))
+blk_mul_DD(const mat_window_zz_p& X,
const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
long n = A.NumRows();
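
blk_mul_DD and the other *_DD routines below gain a per-function __attribute__((target ("avx,pclmul"))), which tells GCC to compile just that function with AVX enabled while the rest of the translation unit stays at the baseline ISA; the caller is then responsible for reaching it only when the running CPU actually has AVX. A minimal stand-alone illustration of the attribute (sum4 is a made-up name):

#include <immintrin.h>

// Compiled with AVX even if the file is built for plain x86_64; it must only
// be called after a runtime check says AVX is available.
static double __attribute__((target("avx")))
sum4(const double *p)
{
   __m256d v  = _mm256_loadu_pd(p);              // 4 doubles at once
   __m128d lo = _mm256_castpd256_pd128(v);       // low half
   __m128d hi = _mm256_extractf128_pd(v, 1);     // high half
   __m128d s  = _mm_add_pd(lo, hi);              // pairwise sums
   return _mm_cvtsd_f64(s) + _mm_cvtsd_f64(_mm_unpackhi_pd(s, s));
}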
@@ -2401,8 +3108,9 @@ void mul_base (const mat_window_zz_p& X,

long V = MAT_BLK_SZ*4;

-#ifdef NTL_HAVE_AVX
- if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined (NTL_LOADTIME_CPU)
+ if (AVX_ACTIVE &&
+ p-1 <= MAX_DBL_INT &&
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
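
AVX_ACTIVE is not defined in this part of the patch; judging from the have_fma test used earlier, it presumably collapses to a constant in compile-time AVX builds and to a run-time flag under NTL_LOADTIME_CPU, roughly along these lines (a guess, for orientation only):

/* Hypothetical definition -- not taken from the patch. */
#ifdef NTL_LOADTIME_CPU
#  define AVX_ACTIVE (have_avx)      /* flag filled in by the load-time CPU check */
#else
#  define AVX_ACTIVE (1)             /* NTL_HAVE_AVX build: always on */
#endif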

@@ -2482,7 +3190,8 @@ void mul_strassen(const mat_window_zz_p&
// this code determines if mul_base triggers blk_mul_DD,
// in which case a higher crossover is used

-#if (defined(NTL_HAVE_LL_TYPE) && defined(NTL_HAVE_AVX))
+#if (defined(NTL_HAVE_LL_TYPE) && (defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)))
+ if (AVX_ACTIVE)
{
long V = MAT_BLK_SZ*4;
long p = zz_p::modulus();
@@ -2982,10 +3691,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con



-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)

-static
-void alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
+static void __attribute__((target ("avx,pclmul")))
+alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
long n = A.NumRows();

@@ -3151,10 +3860,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co



-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)

-static
-void blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
+static void __attribute__((target ("avx,pclmul")))
+blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
long n = A.NumRows();

@@ -3912,8 +4621,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
else if (n/MAT_BLK_SZ < 4) {
long V = 64;

-#ifdef NTL_HAVE_AVX
- if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+ if (AVX_ACTIVE &&
+ p-1 <= MAX_DBL_INT &&
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

@@ -3938,8 +4648,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
else {
long V = 4*MAT_BLK_SZ;

-#ifdef NTL_HAVE_AVX
- if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+ if (AVX_ACTIVE &&
+ p-1 <= MAX_DBL_INT &&
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

@@ -4345,10 +5056,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p&



-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)

-static
-void alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
+static void __attribute__((target ("avx,pclmul")))
+alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
vec_zz_p *xp, bool trans, bool relax)
{
long n = A.NumRows();
@@ -4535,10 +5246,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p&



-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)

-static
-void blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
+static void __attribute__((target ("avx,pclmul")))
+blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
vec_zz_p *xp, bool trans, bool relax)
{
long n = A.NumRows();
@@ -5349,8 +6060,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
else if (n/MAT_BLK_SZ < 4) {
long V = 64;

-#ifdef NTL_HAVE_AVX
- if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+ if (AVX_ACTIVE &&
+ p-1 <= MAX_DBL_INT &&
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

@@ -5375,8 +6087,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
else {
long V = 4*MAT_BLK_SZ;

-#ifdef NTL_HAVE_AVX
- if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+ if (AVX_ACTIVE &&
+ p-1 <= MAX_DBL_INT &&
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

@@ -5622,7 +6335,7 @@ long elim_basic(const mat_zz_p& A, mat_z
#ifdef NTL_HAVE_LL_TYPE


-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)


static inline
@@ -7075,8 +7788,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
else {
long V = 4*MAT_BLK_SZ;

-#ifdef NTL_HAVE_AVX
- if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+ if (AVX_ACTIVE &&
+ p-1 <= MAX_DBL_INT &&
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

--- src/QuickTest.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/QuickTest.c 2016-07-20 19:57:16.777610226 -0600
@@ -339,6 +339,9 @@ cerr << "Performance Options:\n";
cerr << "NTL_PCLMUL\n";
#endif

+#ifdef NTL_LOADTIME_CPU
+ cerr << "NTL_LOADTIME_CPU\n";
+#endif

cerr << "\n\n";

--- src/WizardAux.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/WizardAux 2016-07-20 19:57:16.777610226 -0600
@@ -94,6 +94,7 @@ system("make InitSettings");
'NTL_PCLMUL' => 0,
'NTL_FFT_BIGTAB' => 0,
'NTL_FFT_LAZYMUL' => 0,
+'NTL_LOADTIME_CPU' => 0,

'WIZARD_HACK' => '#define NTL_WIZARD_HACK',