--- doc/config.txt.orig 2016-06-21 12:46:44.000000000 -0600
+++ doc/config.txt 2016-07-20 19:57:16.756611904 -0600
@@ -292,6 +292,7 @@
 NTL_GF2X_NOINLINE=off
 NTL_GF2X_ALTCODE=off
 NTL_GF2X_ALTCODE1=off
 NTL_PCLMUL=off
+NTL_LOADTIME_CPU=off
 GMP_INCDIR=$(GMP_PREFIX)/include
 GMP_LIBDIR=$(GMP_PREFIX)/lib
@@ -644,6 +645,10 @@
 NTL_PCLMUL=off
 # switch to enable the PCLMUL instruction on x86 machines for faster arithmetic
 # over GF(2)[X] (without relying on the gf2x package)
+NTL_LOADTIME_CPU=off
+
+# switch to check CPU characteristics at load time and use routines
+# optimized for the executing CPU.
 ########## More GMP Options:
--- include/NTL/config.h.orig 2016-06-21 12:46:44.000000000 -0600
+++ include/NTL/config.h 2016-07-20 19:57:16.766611105 -0600
@@ -625,6 +625,23 @@ using the configure script.
 #endif
+#if 0
+#define NTL_LOADTIME_CPU
+
+/*
+ * With this flag enabled, detect advanced CPU features at load time instead
+ * of at compile time. This flag is intended for distributions, so that they
+ * can compile for the lowest common denominator CPU, but still support newer
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ *
+ * To re-build after changing this flag:
+ *   rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a
+ */
+
+#endif
+
--- include/NTL/ctools.h.orig 2016-06-21 12:46:44.000000000 -0600
+++ include/NTL/ctools.h 2016-07-20 19:57:16.767611025 -0600
@@ -473,6 +473,137 @@ char *_ntl_make_aligned(char *p, long al
 // this should be big enough to satisfy any SIMD instructions,
 // and it should also be as big as a cache line
+/* Determine CPU characteristics at runtime */
+#ifdef NTL_LOADTIME_CPU
+#if !defined(__x86_64__)
+#error Runtime CPU support is only available on x86_64.
+#endif
+#ifndef __GNUC__
+#error Runtime CPU support is only available with GCC.
+#endif
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)
+#error Runtime CPU support is only available with GCC 4.6 or later.
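The hunks that follow implement this with a one-time CPUID probe plus GCC's target and ifunc function attributes: each hot routine gets a baseline build and one or more target("...") variants, and a resolver picks among them when the library is loaded. The probe and the bit_PCLMUL/bit_AVX/bit_FMA/bit_AVX2 masks come from GCC's <cpuid.h>, and the vector bodies later in the patch use the usual intrinsic headers (<wmmintrin.h>, <immintrin.h>). Below is a minimal standalone sketch of the same pattern, for orientation only; the demo_sum* and probe_cpu names are illustrative and are not part of NTL or of this patch.

#include <cpuid.h>                 // __get_cpuid and the bit_* masks (GCC)

#ifndef bit_AVX
#define bit_AVX (1 << 28)          // ECX bit 28 of CPUID leaf 1
#endif

static int have_avx_demo = -1;     // -1 = not probed yet

static void probe_cpu(void)
{
   unsigned int eax, ebx, ecx, edx;
   if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
      have_avx_demo = ((ecx & bit_AVX) != 0);
   else
      have_avx_demo = 0;
}

// baseline version, compiled for the distribution's default CPU
static long demo_sum_base(const long *a, long n)
{
   long s = 0;
   for (long i = 0; i < n; i++) s += a[i];
   return s;
}

// same routine, but the compiler may use AVX instructions in this body
static long __attribute__((target ("avx"))) demo_sum_avx(const long *a, long n)
{
   long s = 0;
   for (long i = 0; i < n; i++) s += a[i];
   return s;
}

// the resolver runs once, at load time, and the dynamic loader binds the
// public name to whichever pointer it returns
extern "C" {
static long (*resolve_demo_sum(void))(const long *, long)
{
   if (have_avx_demo < 0) probe_cpu();
   return have_avx_demo ? &demo_sum_avx : &demo_sum_base;
}
}

// declaration only: every call to demo_sum goes through the resolved pointer
long __attribute__((ifunc ("resolve_demo_sum"))) demo_sum(const long *a, long n);

Note that the have_* flags in the patch are plain ints with a constant initializer (-1) and are filled in by the resolver itself: ifunc resolvers run during relocation, before ordinary static initialization, so nothing more elaborate can be relied on at that point.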
+#endif + +#include +#ifndef bit_PCLMUL +#define bit_PCLMUL (1 << 1) +#endif +#ifndef bit_AVX +#define bit_AVX (1 << 28) +#endif +#ifndef bit_FMA +#define bit_FMA (1 << 12) +#endif +#ifndef bit_AVX2 +#define bit_AVX2 (1 << 5) +#endif + +#define BASE_FUNC(type,name) static type name##_base +#define TARGET_FUNC(arch,suffix,type,name) \ + static type __attribute__((target (arch))) name##_##suffix +#define PCLMUL_FUNC(type,name) TARGET_FUNC("pclmul",pclmul,type,name) +#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul",avx,type,name) +#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul",fma,type,name) +#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul",avx2,type,name) +#define PCLMUL_RESOLVER(type,name,params) \ + extern "C" { \ + static void __attribute__((optimize ("O0"))) \ + (*resolve_##name (void))(void) { \ + if (__builtin_expect(have_pclmul, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ + have_pclmul = ((ecx & bit_PCLMUL) != 0); \ + have_avx = ((ecx & bit_AVX) != 0); \ + have_fma = ((ecx & bit_FMA) != 0); \ + } else { \ + have_pclmul = 0; \ + have_avx = 0; \ + have_fma = 0; \ + } \ + } \ + if (have_avx) return (void (*)(void))&name##_avx; \ + if (have_pclmul) return (void (*)(void))&name##_pclmul; \ + return (void (*)(void))&name##_base; \ + } \ + } \ + type __attribute__((ifunc ("resolve_" #name))) name params +#define AVX_RESOLVER(type,name,params) \ + extern "C" { \ + static void __attribute__((optimize ("O0"))) \ + (*resolve_##name (void))(void) { \ + if (__builtin_expect(have_pclmul, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ + have_pclmul = ((ecx & bit_PCLMUL) != 0); \ + have_avx = ((ecx & bit_AVX) != 0); \ + have_fma = ((ecx & bit_FMA) != 0); \ + } else { \ + have_pclmul = 0; \ + have_avx = 0; \ + have_fma = 0; \ + } \ + } \ + return have_avx \ + ? (void (*)(void))&name##_avx \ + : (void (*)(void))&name##_base; \ + } \ + } \ + type __attribute__((ifunc ("resolve_" #name))) name params +#define FMA_RESOLVER(type,name,params) \ + extern "C" { \ + static void __attribute__((optimize ("O0"))) \ + (*resolve_##name (void))(void) { \ + if (__builtin_expect(have_pclmul, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ + have_pclmul = ((ecx & bit_PCLMUL) != 0); \ + have_avx = ((ecx & bit_AVX) != 0); \ + have_fma = ((ecx & bit_FMA) != 0); \ + } else { \ + have_pclmul = 0; \ + have_avx = 0; \ + have_fma = 0; \ + } \ + } \ + return have_fma \ + ? (void (*)(void))&name##_fma \ + : (void (*)(void))&name##_avx; \ + } \ + } \ + type __attribute__((ifunc ("resolve_" #name))) name params +#define AVX2_RESOLVER(type,name,params) \ + extern "C" { \ + static void __attribute__((optimize ("O0"))) \ + (*resolve_##name (void))(void) { \ + if (__builtin_expect(have_avx2, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { \ + have_avx2 = ((ebx & bit_AVX2) != 0); \ + } else { \ + have_avx2 = 0; \ + } \ + } \ + if (__builtin_expect(have_pclmul, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ + have_pclmul = ((ecx & bit_PCLMUL) != 0); \ + have_avx = ((ecx & bit_AVX) != 0); \ + have_fma = ((ecx & bit_FMA) != 0); \ + } else { \ + have_pclmul = 0; \ + have_avx = 0; \ + have_fma = 0; \ + } \ + } \ + unsigned int eax, ebx, ecx, edx; \ + return have_avx2 \ + ? 
(void (*)(void))&name##_avx2 \ + : (void (*)(void))&name##_fma; \ + } \ + } \ + type __attribute__((ifunc ("resolve_" #name))) name params +#endif --- include/NTL/def_config.h.orig 2016-06-21 12:46:44.000000000 -0600 +++ include/NTL/def_config.h 2016-07-20 19:57:16.767611025 -0600 @@ -625,6 +625,22 @@ using the configure script. #endif +#if 0 +#define NTL_LOADTIME_CPU + +/* + * With this flag enabled, detect advanced CPU features at load time instead + * of at compile time. This flag is intended for distributions, so that they + * can compile for the lowest common denominator CPU, but still support newer + * CPUs. + * + * This flag is useful only on x86_64 platforms with gcc 4.8 or later. + * + * To re-build after changing this flag: + * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a + */ + +#endif --- src/cfile.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/cfile 2016-07-20 19:57:16.768610945 -0600 @@ -625,6 +625,23 @@ using the configure script. #endif +#if @{NTL_LOADTIME_CPU} +#define NTL_LOADTIME_CPU + +/* + * With this flag enabled, detect advanced CPU features at load time instead + * of at compile time. This flag is intended for distributions, so that they + * can compile for the lowest common denominator CPU, but still support newer + * CPUs. + * + * This flag is useful only on x86_64 platforms with gcc 4.8 or later. + * + * To re-build after changing this flag: + * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a + */ + +#endif + @{WIZARD_HACK} --- src/DispSettings.c.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/DispSettings.c 2016-07-20 19:57:16.768610945 -0600 @@ -191,6 +191,10 @@ cout << "Performance Options:\n"; cout << "NTL_PCLMUL\n"; #endif +#ifdef NTL_LOADTIME_CPU + cout << "NTL_LOADTIME_CPU\n"; +#endif + cout << "***************************/\n"; cout << "\n\n"; --- src/DoConfig.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/DoConfig 2016-07-20 19:57:16.769610865 -0600 @@ -1,7 +1,7 @@ # This is a perl script, invoked from a shell # use warnings; # this doesn't work on older versions of perl - +use Config; %MakeFlag = ( @@ -83,6 +83,7 @@ 'NTL_RANGE_CHECK' => 'off', 'NTL_FFT_BIGTAB' => 'off', 'NTL_FFT_LAZYMUL' => 'off', +'NTL_LOADTIME_CPU' => 'off', ); @@ -149,6 +150,15 @@ if ($ConfigFlag{'NTL_THREADS'} eq 'on' & } +# special processing: NTL_LOADTIME_CPU on x86/x86_64 only and => NTL_GF2X_NOINLINE + +if ($ConfigFlag{'NTL_LOADTIME_CPU'} eq 'on') { + if (!$Config{archname} =~ /x86_64/) { + die "Error: NTL_LOADTIME_CPU currently only available with x86_64...sorry\n"; + } + $ConfigFlag{'NTL_GF2X_NOINLINE'} = 'on'; +} + # some special MakeVal values that are determined by SHARED --- src/GF2X1.c.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/GF2X1.c 2016-07-20 19:57:16.770610785 -0600 @@ -19,7 +19,7 @@ // simple scaling factor for some crossover points: // we use a lower crossover of the underlying multiplication // is faster -#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL)) +#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL) || defined (NTL_LOADTIME_CPU)) #define XOVER_SCALE (1L) #else #define XOVER_SCALE (2L) --- src/GF2X.c.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/GF2X.c 2016-07-20 19:57:16.771610706 -0600 @@ -31,6 +31,22 @@ pclmul_mul1 (unsigned long *c, unsigned __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0)); _mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0)); } +#elif defined (NTL_LOADTIME_CPU) + +#include + +static int have_pclmul = -1; +static int have_avx = -1; +static int have_fma = -1; + +#define NTL_INLINE 
inline + +#define pclmul_mul1(c,a,b) do { \ + __m128i aa = _mm_setr_epi64( _mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0)); \ + __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0)); \ + _mm_storeu_si128((__m128i*)(c), _mm_clmulepi64_si128(aa, bb, 0)); \ +} while (0) + #else @@ -579,6 +595,27 @@ void add(GF2X& x, const GF2X& a, const G +#ifdef NTL_LOADTIME_CPU + +BASE_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) +{ + NTL_EFF_BB_MUL_CODE0 +} + +PCLMUL_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) +{ + pclmul_mul1(c, a, b); +} + +AVX_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) +{ + pclmul_mul1(c, a, b); +} + +PCLMUL_RESOLVER(static void,mul1,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); + +#else + static void mul1(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) { @@ -592,6 +629,7 @@ NTL_EFF_BB_MUL_CODE0 } +#endif #ifdef NTL_GF2X_NOINLINE @@ -616,6 +654,51 @@ NTL_EFF_BB_MUL_CODE0 #endif +#ifdef NTL_LOADTIME_CPU + +BASE_FUNC(void,Mul1) +(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) +{ + NTL_EFF_BB_MUL_CODE1 +} + +PCLMUL_FUNC(void,Mul1) +(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) +{ + long i; + unsigned long carry, prod[2]; + + carry = 0; + for (i = 0; i < sb; i++) { + pclmul_mul1(prod, bp[i], a); + cp[i] = carry ^ prod[0]; + carry = prod[1]; + } + + cp[sb] = carry; +} + +AVX_FUNC(void,Mul1) +(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) +{ + long i; + unsigned long carry, prod[2]; + + carry = 0; + for (i = 0; i < sb; i++) { + pclmul_mul1(prod, bp[i], a); + cp[i] = carry ^ prod[0]; + carry = prod[1]; + } + + cp[sb] = carry; +} + +PCLMUL_RESOLVER(static void,Mul1, + (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)); + +#else + static void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) { @@ -643,6 +726,53 @@ NTL_EFF_BB_MUL_CODE1 } +#endif + +#ifdef NTL_LOADTIME_CPU + +BASE_FUNC(void,AddMul1) +(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) +{ + NTL_EFF_BB_MUL_CODE2 +} + +PCLMUL_FUNC(void,AddMul1) +(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) +{ + long i; + unsigned long carry, prod[2]; + + carry = 0; + for (i = 0; i < sb; i++) { + pclmul_mul1(prod, bp[i], a); + cp[i] ^= carry ^ prod[0]; + carry = prod[1]; + } + + cp[sb] ^= carry; +} + +AVX_FUNC(void,AddMul1) +(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) +{ + long i; + unsigned long carry, prod[2]; + + carry = 0; + for (i = 0; i < sb; i++) { + pclmul_mul1(prod, bp[i], a); + cp[i] ^= carry ^ prod[0]; + carry = prod[1]; + } + + cp[sb] ^= carry; +} + +PCLMUL_RESOLVER(static void,AddMul1, + (_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)); + +#else + static void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) { @@ -671,6 +801,52 @@ NTL_EFF_BB_MUL_CODE2 } +#endif + +#ifdef NTL_LOADTIME_CPU + +BASE_FUNC(void,Mul1_short) +(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) +{ + NTL_EFF_SHORT_BB_MUL_CODE1 +} + +PCLMUL_FUNC(void,Mul1_short) +(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) +{ + long i; + unsigned long carry, prod[2]; + + carry = 0; + for (i = 0; i < sb; i++) { + pclmul_mul1(prod, bp[i], a); + cp[i] = carry ^ prod[0]; + carry = prod[1]; + } + + cp[sb] = carry; +} + +AVX_FUNC(void,Mul1_short) +(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) +{ + long i; + unsigned long carry, prod[2]; + + carry = 0; + for (i = 0; i < sb; i++) { + pclmul_mul1(prod, bp[i], a); + cp[i] = carry ^ prod[0]; + carry = 
prod[1]; + } + + cp[sb] = carry; +} + +PCLMUL_RESOLVER(static void,Mul1_short, + (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)); + +#else static void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) @@ -699,9 +875,29 @@ NTL_EFF_SHORT_BB_MUL_CODE1 } +#endif + + +#ifdef NTL_LOADTIME_CPU + +BASE_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) +{ + NTL_EFF_HALF_BB_MUL_CODE0 +} + +PCLMUL_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) +{ + pclmul_mul1(c, a, b); +} +AVX_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) +{ + pclmul_mul1(c, a, b); +} +PCLMUL_RESOLVER(static void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); +#else static void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) @@ -716,6 +912,8 @@ NTL_EFF_HALF_BB_MUL_CODE0 } +#endif + // mul2...mul8 hard-code 2x2...8x8 word multiplies. // I adapted these routines from LiDIA (except mul3, see below). @@ -1627,6 +1825,77 @@ static const _ntl_ulong sqrtab[256] = { +#ifdef NTL_LOADTIME_CPU + +BASE_FUNC(void,sqr)(GF2X& c, const GF2X& a) +{ + long sa = a.xrep.length(); + if (sa <= 0) { + clear(c); + return; + } + + c.xrep.SetLength(sa << 1); + _ntl_ulong *cp = c.xrep.elts(); + const _ntl_ulong *ap = a.xrep.elts(); + + for (long i = sa-1; i >= 0; i--) { + _ntl_ulong *c = cp + (i << 1); + _ntl_ulong a = ap[i]; + _ntl_ulong hi, lo; + + NTL_BB_SQR_CODE + + c[0] = lo; + c[1] = hi; + } + + c.normalize(); + return; +} + +PCLMUL_FUNC(void,sqr)(GF2X& c, const GF2X& a) +{ + long sa = a.xrep.length(); + if (sa <= 0) { + clear(c); + return; + } + + c.xrep.SetLength(sa << 1); + _ntl_ulong *cp = c.xrep.elts(); + const _ntl_ulong *ap = a.xrep.elts(); + + for (long i = sa-1; i >= 0; i--) + pclmul_mul1 (cp + (i << 1), ap[i], ap[i]); + + c.normalize(); + return; +} + +AVX_FUNC(void,sqr)(GF2X& c, const GF2X& a) +{ + long sa = a.xrep.length(); + if (sa <= 0) { + clear(c); + return; + } + + c.xrep.SetLength(sa << 1); + _ntl_ulong *cp = c.xrep.elts(); + const _ntl_ulong *ap = a.xrep.elts(); + + for (long i = sa-1; i >= 0; i--) + pclmul_mul1 (cp + (i << 1), ap[i], ap[i]); + + c.normalize(); + return; +} + +PCLMUL_RESOLVER(void,sqr,(GF2X& c, const GF2X& a)); + +#else + static inline void sqr1(_ntl_ulong *c, _ntl_ulong a) { @@ -1667,6 +1936,7 @@ void sqr(GF2X& c, const GF2X& a) return; } +#endif void LeftShift(GF2X& c, const GF2X& a, long n) --- src/InitSettings.c.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/InitSettings.c 2016-07-20 19:57:16.772610626 -0600 @@ -156,6 +156,11 @@ int main() cout << "NTL_RANGE_CHECK=0\n"; #endif +#ifdef NTL_LOADTIME_CPU + cout << "NTL_LOADTIME_CPU=1\n"; +#else + cout << "NTL_LOADTIME_CPU=0\n"; +#endif // the following is synthetically defined #ifdef NTL_LONGLONG_SP_MULMOD --- src/lzz_pX1.c.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/lzz_pX1.c 2016-07-20 19:57:16.773610546 -0600 @@ -4,6 +4,12 @@ #ifdef NTL_HAVE_AVX #include +#elif defined(NTL_LOADTIME_CPU) +#include + +static int have_pclmul = -1; +static int have_avx = -1; +static int have_fma = -1; #endif @@ -1076,6 +1082,175 @@ void Comp3Mod(zz_pX& x1, zz_pX& x2, zz_p +#ifdef NTL_LOADTIME_CPU + +BASE_FUNC(void,build) +(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F) +{ + altH.orig = &H; + altH.mem.kill(); + altH.row.kill(); + + if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; } + + altH.n = F.n; + altH.m = H.H.length()-1; + + long p = zz_p::modulus(); + long n = altH.n; + long m = altH.m; + + + if (cast_unsigned(m) <= 
(~(0UL))/cast_unsigned(p-1) && + cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) { + altH.strategy = 1; + altH.pinv_L = sp_PrepRem(p); + } + else { + altH.strategy = 2; + altH.pinv_LL = make_sp_ll_reduce_struct(p); + } + + + if (altH.strategy == 1 || altH.strategy == 2) { + + altH.row.SetLength(n); + long **row = altH.row.elts(); + + const long AllocAmt = 1L << 18; + + long BlockSize = (AllocAmt + m - 1)/m; + long NumBlocks = (n + BlockSize - 1)/BlockSize; + + altH.mem.SetLength(NumBlocks); + + for (long i = 0; i < NumBlocks; i++) { + long first = i*BlockSize; + long last = min(n, first + BlockSize); + altH.mem[i].SetLength((last-first)*m); + for (long j = first; j < last; j++) { + row[j] = altH.mem[i].elts() + (j-first)*m; + } + } + + for (long i = 0; i < m; i++) { + const zz_p* ptr = H.H[i].rep.elts(); + long len = H.H[i].rep.length(); + for (long j = 0; j < len; j++) + row[j][i] = rep(ptr[j]); + for (long j = len; j < n; j++) + row[j][i] = 0; + } + } +} + +AVX_FUNC(void,build) +(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F) +{ + altH.orig = &H; + altH.mem.kill(); + altH.row.kill(); + altH.dmem.kill(); + altH.drow.kill(); + + if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; } + + altH.n = F.n; + altH.m = H.H.length()-1; + + long p = zz_p::modulus(); + long n = altH.n; + long m = altH.m; + + if (n >= 128 && m <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1) && + m*(p-1) <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1)) { + altH.strategy = 3; + altH.pinv_L = sp_PrepRem(p); + } + else if (cast_unsigned(m) <= (~(0UL))/cast_unsigned(p-1) && + cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) { + altH.strategy = 1; + altH.pinv_L = sp_PrepRem(p); + } + else { + altH.strategy = 2; + altH.pinv_LL = make_sp_ll_reduce_struct(p); + } + + + if (altH.strategy == 1 || altH.strategy == 2) { + + altH.row.SetLength(n); + long **row = altH.row.elts(); + + const long AllocAmt = 1L << 18; + + long BlockSize = (AllocAmt + m - 1)/m; + long NumBlocks = (n + BlockSize - 1)/BlockSize; + + altH.mem.SetLength(NumBlocks); + + for (long i = 0; i < NumBlocks; i++) { + long first = i*BlockSize; + long last = min(n, first + BlockSize); + altH.mem[i].SetLength((last-first)*m); + for (long j = first; j < last; j++) { + row[j] = altH.mem[i].elts() + (j-first)*m; + } + } + + for (long i = 0; i < m; i++) { + const zz_p* ptr = H.H[i].rep.elts(); + long len = H.H[i].rep.length(); + for (long j = 0; j < len; j++) + row[j][i] = rep(ptr[j]); + for (long j = len; j < n; j++) + row[j][i] = 0; + } + } else { + + // sanity check + if (m >= (1L << (NTL_BITS_PER_LONG-8))) ResourceError("zz_pXAltArgument: overflow"); + + long npanels = (n+15)/16; + long panel_size = 16*m; + + const long AllocAmt = 1L << 18; + + long BlockSize = (AllocAmt + panel_size - 1)/panel_size; + long NumBlocks = (npanels + BlockSize - 1)/BlockSize; + + altH.dmem.SetLength(NumBlocks); + altH.drow.SetLength(npanels); + double **drow = altH.drow.elts(); + + for (long i = 0; i < NumBlocks; i++) { + long first = i*BlockSize; + long last = min(npanels, first + BlockSize); + altH.dmem[i].SetLength((last-first)*panel_size); + + double *ptr = altH.dmem[i].get(); + + for (long j = first; j < last; j++) + drow[j] = ptr + (j-first)*panel_size; + } + + for (long i = 0; i < m; i++) { + const zz_p *ptr = H.H[i].rep.elts(); + long len = H.H[i].rep.length(); + for (long j = 0; j < len; j++) + drow[j/16][(i*16) + (j%16)] = rep(ptr[j]); + for (long j = len; j < npanels*16; j++) + drow[j/16][(i*16) + (j%16)] = 0; + } + 
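Strategy 3, selected above when n >= 128 and m*(p-1)^2 stays below 2^NTL_DOUBLE_PRECISION (so a length-m dot product of residues is exact in double arithmetic), repacks the coefficients of H into 16-row panels of doubles so that mul16rowsD below can use aligned 4-wide vector loads. A minimal sketch of that packing map, with hypothetical names (pack_panels is not an NTL routine), assuming the same drow indexing as the loop just above:

#include <cstdio>
#include <vector>

// coefficient j of column i lands in panel j/16 at offset i*16 + (j%16)
void pack_panels(const std::vector< std::vector<long> >& M,   // M[i][j]: m columns of length n
                 long n, long m,
                 std::vector< std::vector<double> >& panels)
{
   long npanels = (n + 15) / 16;
   panels.assign(npanels, std::vector<double>(16 * m, 0.0));   // zero-pad the tail panel
   for (long i = 0; i < m; i++)
      for (long j = 0; j < n; j++)
         panels[j / 16][i * 16 + (j % 16)] = double(M[i][j]);
}

int main()
{
   long n = 20, m = 3;
   std::vector< std::vector<long> > M(m, std::vector<long>(n));
   for (long i = 0; i < m; i++)
      for (long j = 0; j < n; j++)
         M[i][j] = 100 * i + j;

   std::vector< std::vector<double> > panels;
   pack_panels(M, n, m, panels);

   // coefficient j=17 of column i=2 sits in panel 1 at offset 2*16 + 1
   std::printf("%g (expected %ld)\n", panels[1][2 * 16 + 1], M[2][17]);
   return 0;
}

Zero-filling the tail of the last panel, as the loop above also does, keeps the unused lanes from contributing to the dot products.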
} +} + +AVX_RESOLVER(void,build, + (zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F)); + +#else + void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F) { altH.orig = &H; @@ -1194,11 +1369,82 @@ void build(zz_pXAltArgument& altH, const #endif } +#endif + #ifdef NTL_HAVE_LL_TYPE +#ifdef NTL_LOADTIME_CPU + +AVX_FUNC(void,mul16rowsD) +(double *x, const double *a, const double *b, long n) +{ + __m256d avec0, avec1, avec2, avec3; + + __m256d acc0 = _mm256_setzero_pd(); + __m256d acc1 = _mm256_setzero_pd(); + __m256d acc2 = _mm256_setzero_pd(); + __m256d acc3 = _mm256_setzero_pd(); + + __m256d bvec; + + for (long i = 0; i < n; i++) { + bvec = _mm256_broadcast_sd(&b[i]); + + avec0 = _mm256_load_pd(a); a += 4; + avec1 = _mm256_load_pd(a); a += 4; + avec2 = _mm256_load_pd(a); a += 4; + avec3 = _mm256_load_pd(a); a += 4; + + acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0); + acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1); + acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2); + acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3); + } + + _mm256_store_pd(x + 0*4, acc0); + _mm256_store_pd(x + 1*4, acc1); + _mm256_store_pd(x + 2*4, acc2); + _mm256_store_pd(x + 3*4, acc3); +} + +FMA_FUNC(void,mul16rowsD) +(double *x, const double *a, const double *b, long n) +{ + __m256d avec0, avec1, avec2, avec3; + + __m256d acc0 = _mm256_setzero_pd(); + __m256d acc1 = _mm256_setzero_pd(); + __m256d acc2 = _mm256_setzero_pd(); + __m256d acc3 = _mm256_setzero_pd(); + + __m256d bvec; + + for (long i = 0; i < n; i++) { + bvec = _mm256_broadcast_sd(&b[i]); + + avec0 = _mm256_load_pd(a); a += 4; + avec1 = _mm256_load_pd(a); a += 4; + avec2 = _mm256_load_pd(a); a += 4; + avec3 = _mm256_load_pd(a); a += 4; + + acc0 = _mm256_fmadd_pd(avec0, bvec, acc0); + acc1 = _mm256_fmadd_pd(avec1, bvec, acc1); + acc2 = _mm256_fmadd_pd(avec2, bvec, acc2); + acc3 = _mm256_fmadd_pd(avec3, bvec, acc3); + } + + _mm256_store_pd(x + 0*4, acc0); + _mm256_store_pd(x + 1*4, acc1); + _mm256_store_pd(x + 2*4, acc2); + _mm256_store_pd(x + 3*4, acc3); +} + +FMA_RESOLVER(static void,mul16rowsD, + (double *x, const double *a, const double *b, long n)); + +#elif defined(NTL_HAVE_AVX) -#ifdef NTL_HAVE_AVX static void mul16rowsD(double *x, const double *a, const double *b, long n) { @@ -1243,6 +1489,114 @@ void mul16rowsD(double *x, const double _mm256_store_pd(x + 3*4, acc3); } +#endif + +#ifdef NTL_LOADTIME_CPU + +AVX_FUNC(void,mul16rows2D) +(double *x, double *x_, const double *a, const double *b, const double *b_, long n) +{ + __m256d avec0, avec1, avec2, avec3; + + __m256d acc0 = _mm256_setzero_pd(); + __m256d acc1 = _mm256_setzero_pd(); + __m256d acc2 = _mm256_setzero_pd(); + __m256d acc3 = _mm256_setzero_pd(); + + __m256d acc0_ = _mm256_setzero_pd(); + __m256d acc1_ = _mm256_setzero_pd(); + __m256d acc2_ = _mm256_setzero_pd(); + __m256d acc3_ = _mm256_setzero_pd(); + + + __m256d bvec; + __m256d bvec_; + + for (long i = 0; i < n; i++) { + bvec = _mm256_broadcast_sd(&b[i]); + bvec_ = _mm256_broadcast_sd(&b_[i]); + + avec0 = _mm256_load_pd(a); a += 4; + avec1 = _mm256_load_pd(a); a += 4; + avec2 = _mm256_load_pd(a); a += 4; + avec3 = _mm256_load_pd(a); a += 4; + + acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0); + acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1); + acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2); + acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3); + + acc0_ = _mm256_add_pd(_mm256_mul_pd(avec0, bvec_), acc0_); + acc1_ = 
_mm256_add_pd(_mm256_mul_pd(avec1, bvec_), acc1_); + acc2_ = _mm256_add_pd(_mm256_mul_pd(avec2, bvec_), acc2_); + acc3_ = _mm256_add_pd(_mm256_mul_pd(avec3, bvec_), acc3_); + } + + _mm256_store_pd(x + 0*4, acc0); + _mm256_store_pd(x + 1*4, acc1); + _mm256_store_pd(x + 2*4, acc2); + _mm256_store_pd(x + 3*4, acc3); + + _mm256_store_pd(x_ + 0*4, acc0_); + _mm256_store_pd(x_ + 1*4, acc1_); + _mm256_store_pd(x_ + 2*4, acc2_); + _mm256_store_pd(x_ + 3*4, acc3_); +} + +FMA_FUNC(void,mul16rows2D) +(double *x, double *x_, const double *a, const double *b, const double *b_, long n) +{ + __m256d avec0, avec1, avec2, avec3; + + __m256d acc0 = _mm256_setzero_pd(); + __m256d acc1 = _mm256_setzero_pd(); + __m256d acc2 = _mm256_setzero_pd(); + __m256d acc3 = _mm256_setzero_pd(); + + __m256d acc0_ = _mm256_setzero_pd(); + __m256d acc1_ = _mm256_setzero_pd(); + __m256d acc2_ = _mm256_setzero_pd(); + __m256d acc3_ = _mm256_setzero_pd(); + + + __m256d bvec; + __m256d bvec_; + + for (long i = 0; i < n; i++) { + bvec = _mm256_broadcast_sd(&b[i]); + bvec_ = _mm256_broadcast_sd(&b_[i]); + + avec0 = _mm256_load_pd(a); a += 4; + avec1 = _mm256_load_pd(a); a += 4; + avec2 = _mm256_load_pd(a); a += 4; + avec3 = _mm256_load_pd(a); a += 4; + + acc0 = _mm256_fmadd_pd(avec0, bvec, acc0); + acc1 = _mm256_fmadd_pd(avec1, bvec, acc1); + acc2 = _mm256_fmadd_pd(avec2, bvec, acc2); + acc3 = _mm256_fmadd_pd(avec3, bvec, acc3); + + acc0_ = _mm256_fmadd_pd(avec0, bvec_, acc0_); + acc1_ = _mm256_fmadd_pd(avec1, bvec_, acc1_); + acc2_ = _mm256_fmadd_pd(avec2, bvec_, acc2_); + acc3_ = _mm256_fmadd_pd(avec3, bvec_, acc3_); + } + + _mm256_store_pd(x + 0*4, acc0); + _mm256_store_pd(x + 1*4, acc1); + _mm256_store_pd(x + 2*4, acc2); + _mm256_store_pd(x + 3*4, acc3); + + _mm256_store_pd(x_ + 0*4, acc0_); + _mm256_store_pd(x_ + 1*4, acc1_); + _mm256_store_pd(x_ + 2*4, acc2_); + _mm256_store_pd(x_ + 3*4, acc3_); +} + +FMA_RESOLVER(static void,mul16rows2D, + (double *x, double *x_, const double *a, const double *b, const double *b_, long n)); + +#elif defined(NTL_HAVE_AVX) static void mul16rows2D(double *x, double *x_, const double *a, const double *b, const double *b_, long n) { @@ -1309,6 +1663,7 @@ void mul16rows2D(double *x, double *x_, _mm256_store_pd(x_ + 3*4, acc3_); } +#endif #endif @@ -1422,7 +1777,7 @@ void CompMod_L(zz_pX& x, const zz_pX& g, } -#ifdef NTL_HAVE_AVX +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) static void InnerProduct_AVX(zz_pX& x, const Vec& v, long low, long high, @@ -1534,7 +1889,6 @@ void CompMod_AVX(zz_pX& x, const zz_pX& x = t; } -#endif @@ -1570,6 +1924,14 @@ void CompMod(zz_pX& x, const zz_pX& g, c break; #endif +#ifdef NTL_LOADTIME_CPU + case 3: + if (have_avx) { + CompMod_AVX(x, g, A, F); + break; + } + /* FALLTHRU */ +#endif default: LogicError("CompMod: bad strategy"); --- src/mat_lzz_p.c.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/mat_lzz_p.c 2016-07-20 19:57:16.776610306 -0600 @@ -10,6 +10,15 @@ #ifdef NTL_HAVE_AVX #include +#define AVX_ACTIVE 1 +#elif defined(NTL_LOADTIME_CPU) +#include +#define AVX_ACTIVE have_avx + +static int have_pclmul = -1; +static int have_avx = -1; +static int have_fma = -1; +static int have_avx2 = -1; #endif NTL_START_IMPL @@ -626,7 +635,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A, #ifdef NTL_HAVE_LL_TYPE -#ifdef NTL_HAVE_AVX +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1) // max int representable exactly as a double @@ -640,19 +649,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A, // MUL_ADD(a, b, 
c): a += b*c +#define FMA_MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a) +#define AVX_MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c)) #ifdef NTL_HAVE_FMA -#define MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a) +#define MUL_ADD(a, b, c) FMA_MUL_ADD(a, b, c) #else -#define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c)) +#define MUL_ADD(a, b, c) AVX_MUL_ADD(a, b, c) #endif -#if 0 -static -void muladd1_by_32(double *x, const double *a, const double *b, long n) +#ifdef NTL_LOADTIME_CPU + +AVX_FUNC(void,muladd1_by_32) +(double *x, const double *a, const double *b, long n) { - __m256d avec, bvec; + __m256d acc0=_mm256_load_pd(x + 0*4); + __m256d acc1=_mm256_load_pd(x + 1*4); + __m256d acc2=_mm256_load_pd(x + 2*4); + __m256d acc3=_mm256_load_pd(x + 3*4); + __m256d acc4=_mm256_load_pd(x + 4*4); + __m256d acc5=_mm256_load_pd(x + 5*4); + __m256d acc6=_mm256_load_pd(x + 6*4); + __m256d acc7=_mm256_load_pd(x + 7*4); + + long i = 0; + for (; i <= n-4; i +=4) { + + // the following code sequences are a bit faster than + // just doing 4 _mm256_broadcast_sd's + // it requires a to point to aligned storage, however + +#if 1 + // this one seems slightly faster + __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0)); + __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2)); +#else + __m256d avec = _mm256_load_pd(a); + __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0); + __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11); + +#endif + + __m256d avec0 = _mm256_permute_pd(a0101, 0); + __m256d avec1 = _mm256_permute_pd(a0101, 0xf); + __m256d avec2 = _mm256_permute_pd(a2323, 0); + __m256d avec3 = _mm256_permute_pd(a2323, 0xf); + + a += 4; + + __m256d bvec; + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec0, bvec); + + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec1, bvec); + + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec2, bvec); + + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec3, bvec); + bvec = 
_mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec3, bvec); + } + + for (; i < n; i++) { + __m256d avec = _mm256_broadcast_sd(a); a++; + __m256d bvec; + + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec, bvec); + } + + + _mm256_store_pd(x + 0*4, acc0); + _mm256_store_pd(x + 1*4, acc1); + _mm256_store_pd(x + 2*4, acc2); + _mm256_store_pd(x + 3*4, acc3); + _mm256_store_pd(x + 4*4, acc4); + _mm256_store_pd(x + 5*4, acc5); + _mm256_store_pd(x + 6*4, acc6); + _mm256_store_pd(x + 7*4, acc7); +} +FMA_FUNC(void,muladd1_by_32) +(double *x, const double *a, const double *b, long n) +{ __m256d acc0=_mm256_load_pd(x + 0*4); __m256d acc1=_mm256_load_pd(x + 1*4); __m256d acc2=_mm256_load_pd(x + 2*4); @@ -662,19 +772,82 @@ void muladd1_by_32(double *x, const doub __m256d acc6=_mm256_load_pd(x + 6*4); __m256d acc7=_mm256_load_pd(x + 7*4); + long i = 0; + for (; i <= n-4; i +=4) { - for (long i = 0; i < n; i++) { - avec = _mm256_broadcast_sd(a); a++; + // the following code sequences are a bit faster than + // just doing 4 _mm256_broadcast_sd's + // it requires a to point to aligned storage, however + +#if 1 + // this one seems slightly faster + __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0)); + __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2)); +#else + __m256d avec = _mm256_load_pd(a); + __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0); + __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11); +#endif - bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec); - bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec); - bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec); - bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec); - bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec); - bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec); - bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec); - bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec); + __m256d avec0 = _mm256_permute_pd(a0101, 0); + __m256d avec1 = _mm256_permute_pd(a0101, 0xf); + __m256d avec2 = _mm256_permute_pd(a2323, 0); + __m256d avec3 = _mm256_permute_pd(a2323, 0xf); + + a += 4; + + __m256d bvec; + + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec0, bvec); + + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec1, bvec); + bvec = 
_mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec1, bvec); + + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec2, bvec); + + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec3, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec3, bvec); + } + + for (; i < n; i++) { + __m256d avec = _mm256_broadcast_sd(a); a++; + __m256d bvec; + + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec, bvec); + bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec, bvec); } @@ -688,6 +861,9 @@ void muladd1_by_32(double *x, const doub _mm256_store_pd(x + 7*4, acc7); } +FMA_RESOLVER(static void,muladd1_by_32, + (double *x, const double *a, const double *b, long n)); + #else static @@ -794,7 +970,164 @@ void muladd1_by_32(double *x, const doub #endif // experiment: process two rows at a time -#if 1 +#ifdef NTL_LOADTIME_CPU +AVX_FUNC(void,muladd2_by_32) +(double *x, const double *a, const double *b, long n) +{ + __m256d avec0, avec1, bvec; + __m256d acc00, acc01, acc02, acc03; + __m256d acc10, acc11, acc12, acc13; + + + // round 0 + + acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); + acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); + acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); + acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); + + acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); + acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); + acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); + acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); + + for (long i = 0; i < n; i++) { + avec0 = _mm256_broadcast_sd(&a[i]); + avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); + + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); 
AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec); + } + + + _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); + _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); + _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); + _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); + + _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); + _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); + _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12); + _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); + + // round 1 + + acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); + acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); + acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); + acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); + + acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); + acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); + acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); + acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); + + for (long i = 0; i < n; i++) { + avec0 = _mm256_broadcast_sd(&a[i]); + avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); + + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec); + } + + + _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); + _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); + _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); + _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); + + _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); + _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); + _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); + _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); + +} + +FMA_FUNC(void,muladd2_by_32) +(double *x, const double *a, const double *b, long n) +{ + __m256d avec0, avec1, bvec; + __m256d acc00, acc01, acc02, acc03; + __m256d acc10, acc11, acc12, acc13; + + + // round 0 + + acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); + acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); + acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); + acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); + + acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); + acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); + acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); + acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); + + for (long i = 0; i < n; i++) { + avec0 = _mm256_broadcast_sd(&a[i]); + avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); + + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); + } + + + _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); + _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); + _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); + _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); + + _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); + _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); + _mm256_store_pd(x + 2*4 + 
1*MAT_BLK_SZ, acc12); + _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); + + // round 1 + + acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); + acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); + acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); + acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); + + acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); + acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); + acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); + acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); + + for (long i = 0; i < n; i++) { + avec0 = _mm256_broadcast_sd(&a[i]); + avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); + + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); + } + + + _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); + _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); + _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); + _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); + + _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); + _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); + _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); + _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); + +} + +FMA_RESOLVER(static void,muladd2_by_32, + (double *x, const double *a, const double *b, long n)); + +#else + static void muladd2_by_32(double *x, const double *a, const double *b, long n) { @@ -870,96 +1203,217 @@ void muladd2_by_32(double *x, const doub _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); } +#endif -#else -static -void muladd2_by_32(double *x, const double *a, const double *b, long n) + +// experiment: process three rows at a time +// NOTE: this makes things slower on an AVX1 platform --- not enough registers +// it could be faster on AVX2/FMA, where there should be enough registers + +#ifdef NTL_LOADTIME_CPU +FMA_FUNC(void,muladd3_by_32) +(double *x, const double *a, const double *b, long n) { - long i, j; - __m256d bvec; + __m256d avec0, avec1, avec2, bvec; __m256d acc00, acc01, acc02, acc03; __m256d acc10, acc11, acc12, acc13; + __m256d acc20, acc21, acc22, acc23; + + // round 0 - for (j = 0; j < 2; j++) { + acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); + acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); + acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); + acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); - acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); - acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); - acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); - acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); + acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); + acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); + acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); + acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); - acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); - acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); - acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); - acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); + acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ); + acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ); + acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ); + acc23=_mm256_load_pd(x + 3*4 + 
2*MAT_BLK_SZ); - for (i = 0; i <= n-4; i+=4) { - __m256d a0_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0)); - __m256d a0_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2)); - __m256d avec00 = _mm256_permute_pd(a0_0101, 0); - __m256d avec01 = _mm256_permute_pd(a0_0101, 0xf); - __m256d avec02 = _mm256_permute_pd(a0_2323, 0); - __m256d avec03 = _mm256_permute_pd(a0_2323, 0xf); + for (long i = 0; i < n; i++) { + avec0 = _mm256_broadcast_sd(&a[i]); + avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); + avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); - __m256d a1_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0+MAT_BLK_SZ)); - __m256d a1_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2+MAT_BLK_SZ)); - __m256d avec10 = _mm256_permute_pd(a1_0101, 0); - __m256d avec11 = _mm256_permute_pd(a1_0101, 0xf); - __m256d avec12 = _mm256_permute_pd(a1_2323, 0); - __m256d avec13 = _mm256_permute_pd(a1_2323, 0xf); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); + } - bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec00, bvec); MUL_ADD(acc10, avec10, bvec); - bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec00, bvec); MUL_ADD(acc11, avec10, bvec); - bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec00, bvec); MUL_ADD(acc12, avec10, bvec); - bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec00, bvec); MUL_ADD(acc13, avec10, bvec); - bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec01, bvec); MUL_ADD(acc10, avec11, bvec); - bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec01, bvec); MUL_ADD(acc11, avec11, bvec); - bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec01, bvec); MUL_ADD(acc12, avec11, bvec); - bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec01, bvec); MUL_ADD(acc13, avec11, bvec); + _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); + _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); + _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); + _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); - bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec02, bvec); MUL_ADD(acc10, avec12, bvec); - bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec02, bvec); MUL_ADD(acc11, avec12, bvec); - bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec02, bvec); MUL_ADD(acc12, avec12, bvec); - bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec02, bvec); MUL_ADD(acc13, avec12, bvec); + _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); + _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); + _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12); + _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); - bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec03, bvec); MUL_ADD(acc10, 
avec13, bvec); - bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec03, bvec); MUL_ADD(acc11, avec13, bvec); - bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec03, bvec); MUL_ADD(acc12, avec13, bvec); - bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec03, bvec); MUL_ADD(acc13, avec13, bvec); - } + _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20); + _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21); + _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22); + _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23); - for (; i < n; i++) { - __m256d avec0 = _mm256_broadcast_sd(&a[i]); - __m256d avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); + // round 1 - bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); - bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); - bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); - bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); - } + acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); + acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); + acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); + acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); + acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); + acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); + acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); + acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); - _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc00); - _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc01); - _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc02); - _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc03); + acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ); + acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ); + acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ); + acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ); - _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc10); - _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc11); - _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc12); - _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc13); + for (long i = 0; i < n; i++) { + avec0 = _mm256_broadcast_sd(&a[i]); + avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); + avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); } + + + _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); + _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); + _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); + _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); + + _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); + _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); + _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); + _mm256_store_pd(x + 7*4 + 
1*MAT_BLK_SZ, acc13); + + _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20); + _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21); + _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22); + _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23); + } -#endif +AVX2_FUNC(void,muladd3_by_32) +(double *x, const double *a, const double *b, long n) +{ + __m256d avec0, avec1, avec2, bvec; + __m256d acc00, acc01, acc02, acc03; + __m256d acc10, acc11, acc12, acc13; + __m256d acc20, acc21, acc22, acc23; + + + // round 0 + acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); + acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); + acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); + acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); -// experiment: process three rows at a time -// NOTE: this makes things slower on an AVX1 platform --- not enough registers -// it could be faster on AVX2/FMA, where there should be enough registers + acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); + acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); + acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); + acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); + + acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ); + acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ); + acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ); + acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ); + + for (long i = 0; i < n; i++) { + avec0 = _mm256_broadcast_sd(&a[i]); + avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); + avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); + + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); + } + + + _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); + _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); + _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); + _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); + + _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); + _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); + _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12); + _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); + + _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20); + _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21); + _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22); + _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23); + + // round 1 + + acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); + acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); + acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); + acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); + + acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); + acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); + acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); + acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); + + acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ); + acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ); + acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ); + acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ); + + for (long i = 0; i < n; i++) { + avec0 = _mm256_broadcast_sd(&a[i]); + avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); + avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); + + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); 
FMA_MUL_ADD(acc20, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); + bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); + } + + + _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); + _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); + _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); + _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); + + _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); + _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); + _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); + _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); + + _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20); + _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21); + _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22); + _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23); + +} + +AVX2_RESOLVER(static void,muladd3_by_32, + (double *x, const double *a, const double *b, long n)); + +#else static void muladd3_by_32(double *x, const double *a, const double *b, long n) @@ -1060,6 +1514,32 @@ void muladd3_by_32(double *x, const doub } +#endif + +#ifdef NTL_LOADTIME_CPU + +static inline +void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n) +{ + long i = first; + + if (have_fma) { + // processing three rows at a time is faster + for (; i <= last-3; i+=3) + muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); + for (; i < last; i++) + muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); + } else { + // process only two rows at a time: not enough registers :-( + for (; i <= last-2; i+=2) + muladd2_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); + for (; i < last; i++) + muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); + } +} + +#else + static inline void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n) { @@ -1079,8 +1559,79 @@ void muladd_all_by_32(long first, long l #endif } +#endif + // this assumes n is a multiple of 16 +#ifdef NTL_LOADTIME_CPU + +AVX_FUNC(void,muladd_interval) +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +{ + __m256d xvec0, xvec1, xvec2, xvec3; + __m256d yvec0, yvec1, yvec2, yvec3; + + __m256d cvec = _mm256_broadcast_sd(&c); + + for (long i = 0; i < n; i += 16, x += 16, y += 16) { + xvec0 = _mm256_load_pd(x+0*4); + xvec1 = _mm256_load_pd(x+1*4); + xvec2 = _mm256_load_pd(x+2*4); + xvec3 = _mm256_load_pd(x+3*4); + + yvec0 = _mm256_load_pd(y+0*4); + yvec1 = _mm256_load_pd(y+1*4); + yvec2 = _mm256_load_pd(y+2*4); + yvec3 = _mm256_load_pd(y+3*4); + + AVX_MUL_ADD(xvec0, yvec0, cvec); + AVX_MUL_ADD(xvec1, yvec1, cvec); + AVX_MUL_ADD(xvec2, yvec2, cvec); + AVX_MUL_ADD(xvec3, yvec3, cvec); + + _mm256_store_pd(x + 0*4, xvec0); + _mm256_store_pd(x + 1*4, xvec1); + _mm256_store_pd(x + 2*4, xvec2); + _mm256_store_pd(x + 3*4, xvec3); + } +} + +FMA_FUNC(void,muladd_interval) +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +{ + __m256d xvec0, xvec1, xvec2, xvec3; + __m256d yvec0, yvec1, yvec2, yvec3; + + __m256d cvec = _mm256_broadcast_sd(&c); + + for (long i = 0; i < n; i += 16, x += 16, y += 16) { + xvec0 = _mm256_load_pd(x+0*4); + xvec1 = _mm256_load_pd(x+1*4); + xvec2 = _mm256_load_pd(x+2*4); + xvec3 = 
_mm256_load_pd(x+3*4); + + yvec0 = _mm256_load_pd(y+0*4); + yvec1 = _mm256_load_pd(y+1*4); + yvec2 = _mm256_load_pd(y+2*4); + yvec3 = _mm256_load_pd(y+3*4); + + FMA_MUL_ADD(xvec0, yvec0, cvec); + FMA_MUL_ADD(xvec1, yvec1, cvec); + FMA_MUL_ADD(xvec2, yvec2, cvec); + FMA_MUL_ADD(xvec3, yvec3, cvec); + + _mm256_store_pd(x + 0*4, xvec0); + _mm256_store_pd(x + 1*4, xvec1); + _mm256_store_pd(x + 2*4, xvec2); + _mm256_store_pd(x + 3*4, xvec3); + } +} + +FMA_RESOLVER(static void,muladd_interval, + (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); + +#else + static inline void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) { @@ -1111,9 +1662,109 @@ void muladd_interval(double * NTL_RESTRI _mm256_store_pd(x + 3*4, xvec3); } } +#endif // this one is more general: does not assume that n is a // multiple of 16 +#ifdef NTL_LOADTIME_CPU + +AVX_FUNC(void,muladd_interval1) +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +{ + + __m256d xvec0, xvec1, xvec2, xvec3; + __m256d yvec0, yvec1, yvec2, yvec3; + __m256d cvec; + + if (n >= 4) + cvec = _mm256_broadcast_sd(&c); + + long i=0; + for (; i <= n-16; i += 16, x += 16, y += 16) { + xvec0 = _mm256_load_pd(x+0*4); + xvec1 = _mm256_load_pd(x+1*4); + xvec2 = _mm256_load_pd(x+2*4); + xvec3 = _mm256_load_pd(x+3*4); + + yvec0 = _mm256_load_pd(y+0*4); + yvec1 = _mm256_load_pd(y+1*4); + yvec2 = _mm256_load_pd(y+2*4); + yvec3 = _mm256_load_pd(y+3*4); + + AVX_MUL_ADD(xvec0, yvec0, cvec); + AVX_MUL_ADD(xvec1, yvec1, cvec); + AVX_MUL_ADD(xvec2, yvec2, cvec); + AVX_MUL_ADD(xvec3, yvec3, cvec); + + _mm256_store_pd(x + 0*4, xvec0); + _mm256_store_pd(x + 1*4, xvec1); + _mm256_store_pd(x + 2*4, xvec2); + _mm256_store_pd(x + 3*4, xvec3); + } + + for (; i <= n-4; i += 4, x += 4, y += 4) { + xvec0 = _mm256_load_pd(x+0*4); + yvec0 = _mm256_load_pd(y+0*4); + AVX_MUL_ADD(xvec0, yvec0, cvec); + _mm256_store_pd(x + 0*4, xvec0); + } + + for (; i < n; i++, x++, y++) { + *x += (*y)*c; + } +} + +FMA_FUNC(void,muladd_interval1) +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +{ + + __m256d xvec0, xvec1, xvec2, xvec3; + __m256d yvec0, yvec1, yvec2, yvec3; + __m256d cvec; + + if (n >= 4) + cvec = _mm256_broadcast_sd(&c); + + long i=0; + for (; i <= n-16; i += 16, x += 16, y += 16) { + xvec0 = _mm256_load_pd(x+0*4); + xvec1 = _mm256_load_pd(x+1*4); + xvec2 = _mm256_load_pd(x+2*4); + xvec3 = _mm256_load_pd(x+3*4); + + yvec0 = _mm256_load_pd(y+0*4); + yvec1 = _mm256_load_pd(y+1*4); + yvec2 = _mm256_load_pd(y+2*4); + yvec3 = _mm256_load_pd(y+3*4); + + FMA_MUL_ADD(xvec0, yvec0, cvec); + FMA_MUL_ADD(xvec1, yvec1, cvec); + FMA_MUL_ADD(xvec2, yvec2, cvec); + FMA_MUL_ADD(xvec3, yvec3, cvec); + + _mm256_store_pd(x + 0*4, xvec0); + _mm256_store_pd(x + 1*4, xvec1); + _mm256_store_pd(x + 2*4, xvec2); + _mm256_store_pd(x + 3*4, xvec3); + } + + for (; i <= n-4; i += 4, x += 4, y += 4) { + xvec0 = _mm256_load_pd(x+0*4); + yvec0 = _mm256_load_pd(y+0*4); + FMA_MUL_ADD(xvec0, yvec0, cvec); + _mm256_store_pd(x + 0*4, xvec0); + } + + for (; i < n; i++, x++, y++) { + *x += (*y)*c; + } +} + +FMA_RESOLVER(static void,muladd_interval1, + (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); + +#else + static inline void muladd_interval1(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) { @@ -1159,11 +1810,74 @@ void muladd_interval1(double * NTL_RESTR *x += (*y)*c; } } +#endif #define AVX_PD_SZ (4) // experimental: assumes n is a multiple of 4 in the range [0..32] -#if 1 +#ifdef 
NTL_LOADTIME_CPU + +AVX_FUNC(void,muladd_interval2) +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +{ + n /= 4; + if (n <= 0 || n > 8) return; + + x += n*4; + y += n*4; + + // n in [1..8] + + __m256d xvec, yvec, cvec; + + cvec = _mm256_broadcast_sd(&c); + + switch (n) { + case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec); + case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec); + case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec); + case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec); + case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec); + case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec); + case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec); + case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec); + } + +} + +FMA_FUNC(void,muladd_interval2) +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +{ + n /= 4; + if (n <= 0 || n > 8) return; + + x += n*4; + y += n*4; + + // n in [1..8] + + __m256d xvec, yvec, cvec; + + cvec = _mm256_broadcast_sd(&c); + + switch (n) { + case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec); + case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec); + case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec); + case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec); + case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec); + case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec); + case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec); + case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec); + } + +} + +FMA_RESOLVER(static void,muladd_interval2, + (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); + +#else + static inline void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) { @@ -1191,13 +1905,6 @@ void muladd_interval2(double * NTL_RESTR } } -#else -static inline -void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) -{ - for (long i = 0; i < n; i++) - x[i] += y[i]*c; -} #endif #endif @@ -2031,10 +2738,10 @@ void alt_mul_LL(const mat_window_zz_p& X } -#ifdef NTL_HAVE_AVX +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) -static -void blk_mul_DD(const mat_window_zz_p& X, +static void __attribute__((target ("avx,pclmul"))) +blk_mul_DD(const mat_window_zz_p& X, const const_mat_window_zz_p& A, const const_mat_window_zz_p& B) { long n = A.NumRows(); @@ -2401,8 
+3108,9 @@ void mul_base (const mat_window_zz_p& X, long V = MAT_BLK_SZ*4; -#ifdef NTL_HAVE_AVX - if (p-1 <= MAX_DBL_INT && +#if defined(NTL_HAVE_AVX) || defined (NTL_LOADTIME_CPU) + if (AVX_ACTIVE && + p-1 <= MAX_DBL_INT && V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { @@ -2482,7 +3190,8 @@ void mul_strassen(const mat_window_zz_p& // this code determines if mul_base triggers blk_mul_DD, // in which case a higher crossover is used -#if (defined(NTL_HAVE_LL_TYPE) && defined(NTL_HAVE_AVX)) +#if (defined(NTL_HAVE_LL_TYPE) && (defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU))) + if (AVX_ACTIVE) { long V = MAT_BLK_SZ*4; long p = zz_p::modulus(); @@ -2982,10 +3691,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con -#ifdef NTL_HAVE_AVX +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) -static -void alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) +static void __attribute__((target ("avx,pclmul"))) +alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) { long n = A.NumRows(); @@ -3151,10 +3860,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co -#ifdef NTL_HAVE_AVX +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) -static -void blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) +static void __attribute__((target ("avx,pclmul"))) +blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) { long n = A.NumRows(); @@ -3912,8 +4621,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c else if (n/MAT_BLK_SZ < 4) { long V = 64; -#ifdef NTL_HAVE_AVX - if (p-1 <= MAX_DBL_INT && +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + if (AVX_ACTIVE && + p-1 <= MAX_DBL_INT && V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { @@ -3938,8 +4648,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c else { long V = 4*MAT_BLK_SZ; -#ifdef NTL_HAVE_AVX - if (p-1 <= MAX_DBL_INT && +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + if (AVX_ACTIVE && + p-1 <= MAX_DBL_INT && V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { @@ -4345,10 +5056,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p& -#ifdef NTL_HAVE_AVX +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) -static -void alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, +static void __attribute__((target ("avx,pclmul"))) +alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, vec_zz_p *xp, bool trans, bool relax) { long n = A.NumRows(); @@ -4535,10 +5246,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p& -#ifdef NTL_HAVE_AVX +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) -static -void blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, +static void __attribute__((target ("avx,pclmul"))) +blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, vec_zz_p *xp, bool trans, bool relax) { long n = A.NumRows(); @@ -5349,8 +6060,9 @@ void tri(zz_p& d, const mat_zz_p& A, con else if (n/MAT_BLK_SZ < 4) { long V = 64; -#ifdef NTL_HAVE_AVX - if (p-1 <= MAX_DBL_INT && +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + if (AVX_ACTIVE && + p-1 <= MAX_DBL_INT && V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { @@ -5375,8 +6087,9 @@ void tri(zz_p& d, const mat_zz_p& A, con else { long V = 4*MAT_BLK_SZ; -#ifdef NTL_HAVE_AVX - if (p-1 <= MAX_DBL_INT && +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + if (AVX_ACTIVE && + p-1 <= MAX_DBL_INT && V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { @@ -5622,7 +6335,7 @@ long elim_basic(const mat_zz_p& A, mat_z #ifdef NTL_HAVE_LL_TYPE 
-#ifdef NTL_HAVE_AVX +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) static inline @@ -7075,8 +7788,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i else { long V = 4*MAT_BLK_SZ; -#ifdef NTL_HAVE_AVX - if (p-1 <= MAX_DBL_INT && +#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + if (AVX_ACTIVE && + p-1 <= MAX_DBL_INT && V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { --- src/QuickTest.c.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/QuickTest.c 2016-07-20 19:57:16.777610226 -0600 @@ -339,6 +339,9 @@ cerr << "Performance Options:\n"; cerr << "NTL_PCLMUL\n"; #endif +#ifdef NTL_LOADTIME_CPU + cerr << "NTL_LOADTIME_CPU\n"; +#endif cerr << "\n\n"; --- src/WizardAux.orig 2016-06-21 12:46:44.000000000 -0600 +++ src/WizardAux 2016-07-20 19:57:16.777610226 -0600 @@ -94,6 +94,7 @@ system("make InitSettings"); 'NTL_PCLMUL' => 0, 'NTL_FFT_BIGTAB' => 0, 'NTL_FFT_LAZYMUL' => 0, +'NTL_LOADTIME_CPU' => 0, 'WIZARD_HACK' => '#define NTL_WIZARD_HACK',
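Note on the dispatch mechanism used above (illustrative sketch, not part of the patch): the *_FUNC/*_RESOLVER macros expand to per-ISA function bodies plus a GNU ifunc resolver, so the dynamic linker selects the routine for the executing CPU once, at load time, via CPUID. The minimal standalone example below shows that same pattern with hypothetical names (scale_add, scale_add_base, scale_add_fma, resolve_scale_add); it assumes x86_64, a reasonably recent GCC, and an ELF/glibc toolchain with IFUNC support, and should build with something like "g++ -O2 demo.cpp".

    #include <cpuid.h>   // __get_cpuid()
    #include <cstdio>

    #ifndef bit_FMA
    #define bit_FMA (1 << 12)   // CPUID leaf 1, ECX bit 12
    #endif

    // Baseline implementation, compiled for the default (lowest) ISA.
    static void scale_add_base(double *x, const double *y, double c, long n)
    {
       for (long i = 0; i < n; i++)
          x[i] += y[i]*c;
    }

    // Same routine, but GCC may use AVX/FMA instructions in this function only.
    static void __attribute__((target("fma,avx")))
    scale_add_fma(double *x, const double *y, double c, long n)
    {
       for (long i = 0; i < n; i++)
          x[i] += y[i]*c;
    }

    // ifunc resolver: runs when the dynamic linker binds scale_add and returns
    // the implementation matching the executing CPU.
    extern "C" {
    static void (*resolve_scale_add(void))(double *, const double *, double, long)
    {
       unsigned int eax, ebx, ecx, edx;
       int have_fma = 0;
       if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
          have_fma = ((ecx & bit_FMA) != 0);
       return have_fma ? &scale_add_fma : &scale_add_base;
    }
    }

    // Public entry point; every call goes through the pointer chosen by the
    // resolver, with no per-call CPU check.
    void scale_add(double *x, const double *y, double c, long n)
       __attribute__((ifunc("resolve_scale_add")));

    int main()
    {
       double x[4] = {1, 2, 3, 4}, y[4] = {10, 20, 30, 40};
       scale_add(x, y, 0.5, 4);
       std::printf("%g %g %g %g\n", x[0], x[1], x[2], x[3]);
       return 0;
    }

The larger kernels touched by the patch (blk_mul_DD, alt_inv_DD, blk_inv_DD, alt_tri_DD, blk_tri_DD) use the complementary pattern instead: they are compiled with __attribute__((target ("avx,pclmul"))) and reached only behind the AVX_ACTIVE runtime check added at their call sites, so no resolver macro is needed for them.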