diff --git a/.gitignore b/.gitignore index 18b4a37..5ea72f0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1 @@ -/ntl-9.4.0.tar.gz -/ntl-9.6.2.tar.gz -/ntl-9.6.4.tar.gz -/ntl-9.7.0.tar.gz -/ntl-9.8.0.tar.gz -/ntl-9.9.1.tar.gz -/ntl-9.10.0.tar.gz -/ntl-9.11.0.tar.gz -/ntl-10.1.0.tar.gz -/ntl-10.3.0.tar.gz -/ntl-10.5.0.tar.gz +/ntl-*.tar.gz diff --git a/ntl-loadtime-cpu.patch b/ntl-loadtime-cpu.patch index f4838e5..3aa7c78 100644 --- a/ntl-loadtime-cpu.patch +++ b/ntl-loadtime-cpu.patch @@ -1,6 +1,6 @@ ---- doc/config.txt.orig 2017-07-07 09:05:14.000000000 -0600 -+++ doc/config.txt 2017-09-09 12:10:10.877839678 -0600 -@@ -337,6 +337,7 @@ NTL_AVOID_BRANCHING=off +--- doc/config.txt.orig 2018-04-08 12:23:07.000000000 -0600 ++++ doc/config.txt 2018-04-09 11:00:49.604177774 -0600 +@@ -367,6 +367,7 @@ NTL_AVOID_BRANCHING=off NTL_GF2X_NOINLINE=off NTL_GF2X_ALTCODE=off NTL_GF2X_ALTCODE1=off @@ -8,7 +8,7 @@ GMP_INCDIR=$(GMP_PREFIX)/include GMP_LIBDIR=$(GMP_PREFIX)/lib -@@ -634,6 +635,10 @@ NTL_GF2X_ALTCODE1=off +@@ -680,6 +681,10 @@ NTL_GF2X_ALTCODE1=off # Yet another alternative implementation for GF2X multiplication. @@ -19,9 +19,9 @@ ########## More GMP Options: ---- include/NTL/config.h.orig 2017-07-07 09:05:14.000000000 -0600 -+++ include/NTL/config.h 2017-09-09 12:10:10.891839641 -0600 -@@ -525,6 +525,20 @@ using the configure script. +--- include/NTL/config.h.orig 2018-04-08 12:23:07.000000000 -0600 ++++ include/NTL/config.h 2018-04-09 11:00:49.628177715 -0600 +@@ -517,6 +517,20 @@ to be defined. Of course, to unset a f #endif @@ -42,9 +42,9 @@ ---- include/NTL/ctools.h.orig 2017-07-07 09:05:14.000000000 -0600 -+++ include/NTL/ctools.h 2017-09-09 13:36:16.915768457 -0600 -@@ -498,6 +498,166 @@ char *_ntl_make_aligned(char *p, long al +--- include/NTL/ctools.h.orig 2018-04-08 12:23:06.000000000 -0600 ++++ include/NTL/ctools.h 2018-04-09 14:24:06.057491526 -0600 +@@ -509,6 +509,155 @@ char *_ntl_make_aligned(char *p, long al // and it should also be as big as a cache line @@ -85,10 +85,9 @@ +#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul,ssse3",avx,type,name) +#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul,ssse3",fma,type,name) +#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul,ssse3",avx2,type,name) -+#define SSSE3_RESOLVER(type,name,params) \ ++#define SSSE3_RESOLVER(st,type,name,params) \ + extern "C" { \ -+ static void __attribute__((optimize ("O0"))) \ -+ (*resolve_##name (void))(void) { \ ++ static type (*resolve_##name(void)) params { \ + if (__builtin_expect(have_avx2, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { \ @@ -105,16 +104,15 @@ + have_ssse3 = 0; \ + } \ + } \ -+ if (have_avx2) return (void (*)(void))&name##_avx2; \ -+ if (have_ssse3) return (void (*)(void))&name##_ssse3; \ -+ return (void (*)(void))&name##_base; \ ++ if (have_avx2) return &name##_avx2; \ ++ if (have_ssse3) return &name##_ssse3; \ ++ return &name##_base; \ + } \ + } \ -+ type __attribute__((ifunc ("resolve_" #name))) name params -+#define PCLMUL_RESOLVER(type,name,params) \ ++ st type __attribute__((ifunc ("resolve_" #name))) name params ++#define PCLMUL_RESOLVER(st,type,name,params) \ + extern "C" { \ -+ static void __attribute__((optimize ("O0"))) \ -+ (*resolve_##name (void))(void) { \ ++ static type (*resolve_##name(void)) params { \ + if (__builtin_expect(have_pclmul, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ @@ -127,16 +125,15 @@ + have_fma = 0; \ + } \ + } \ -+ if (have_avx) return (void (*)(void))&name##_avx; \ -+ if (have_pclmul) return (void (*)(void))&name##_pclmul; \ -+ return (void (*)(void))&name##_base; \ ++ if (have_avx) return &name##_avx; \ ++ if (have_pclmul) return &name##_pclmul; \ ++ return &name##_base; \ + } \ + } \ -+ type __attribute__((ifunc ("resolve_" #name))) name params -+#define AVX_RESOLVER(type,name,params) \ ++ st type __attribute__((ifunc ("resolve_" #name))) name params ++#define AVX_RESOLVER(st,type,name,params) \ + extern "C" { \ -+ static void __attribute__((optimize ("O0"))) \ -+ (*resolve_##name (void))(void) { \ ++ static type (*resolve_##name(void)) params { \ + if (__builtin_expect(have_pclmul, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ @@ -149,16 +146,13 @@ + have_fma = 0; \ + } \ + } \ -+ return have_avx \ -+ ? (void (*)(void))&name##_avx \ -+ : (void (*)(void))&name##_base; \ ++ return have_avx ? &name##_avx : &name##_base; \ + } \ + } \ -+ type __attribute__((ifunc ("resolve_" #name))) name params -+#define FMA_RESOLVER(type,name,params) \ ++ st type __attribute__((ifunc ("resolve_" #name))) name params ++#define FMA_RESOLVER(st,type,name,params) \ + extern "C" { \ -+ static void __attribute__((optimize ("O0"))) \ -+ (*resolve_##name (void))(void) { \ ++ static type (*resolve_##name(void)) params { \ + if (__builtin_expect(have_pclmul, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ @@ -171,16 +165,13 @@ + have_fma = 0; \ + } \ + } \ -+ return have_fma \ -+ ? (void (*)(void))&name##_fma \ -+ : (void (*)(void))&name##_avx; \ ++ return have_fma ? &name##_fma : &name##_avx; \ + } \ + } \ -+ type __attribute__((ifunc ("resolve_" #name))) name params -+#define AVX2_RESOLVER(type,name,params) \ ++ st type __attribute__((ifunc ("resolve_" #name))) name params ++#define AVX2_RESOLVER(st,type,name,params) \ + extern "C" { \ -+ static void __attribute__((optimize ("O0"))) \ -+ (*resolve_##name (void))(void) { \ ++ static type (*resolve_##name(void)) params { \ + if (__builtin_expect(have_avx2, 0) < 0) { \ + unsigned int eax, ebx, ecx, edx; \ + if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { \ @@ -201,40 +192,16 @@ + have_fma = 0; \ + } \ + } \ -+ return have_avx2 \ -+ ? (void (*)(void))&name##_avx2 \ -+ : (void (*)(void))&name##_fma; \ ++ return have_avx2 ? &name##_avx2 : &name##_fma; \ + } \ + } \ -+ type __attribute__((ifunc ("resolve_" #name))) name params ++ st type __attribute__((ifunc ("resolve_" #name))) name params +#endif #ifdef NTL_HAVE_BUILTIN_CLZL ---- include/NTL/def_config.h.orig 2017-07-07 09:05:14.000000000 -0600 -+++ include/NTL/def_config.h 2017-09-09 12:10:10.892839638 -0600 -@@ -525,6 +525,19 @@ using the configure script. - #endif - - -+#if 0 -+#define NTL_LOADTIME_CPU -+ -+/* -+ * With this flag enabled, detect advanced CPU features at load time instead -+ * of at compile time. This flag is intended for distributions, so that they -+ * can compile for the lowest common denominator CPU, but still support newer -+ * CPUs. -+ * -+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later. -+ */ -+ -+#endif - - - ---- include/NTL/MatPrime.h.orig 2017-07-07 09:05:14.000000000 -0600 -+++ include/NTL/MatPrime.h 2017-09-09 12:10:10.892839638 -0600 +--- include/NTL/MatPrime.h.orig 2018-04-08 12:23:07.000000000 -0600 ++++ include/NTL/MatPrime.h 2018-04-09 11:00:57.950157056 -0600 @@ -20,7 +20,7 @@ NTL_OPEN_NNS @@ -244,18 +211,18 @@ #define NTL_MatPrime_NBITS (23) #else #define NTL_MatPrime_NBITS NTL_SP_NBITS ---- include/NTL/REPORT_ALL_FEATURES.h.orig 2017-07-07 09:05:15.000000000 -0600 -+++ include/NTL/REPORT_ALL_FEATURES.h 2017-09-09 12:11:10.313683979 -0600 -@@ -39,3 +39,6 @@ - std::cerr << "NTL_HAVE_COPY_TRAITS2\n"; +--- include/NTL/REPORT_ALL_FEATURES.h.orig 2018-04-08 12:23:07.000000000 -0600 ++++ include/NTL/REPORT_ALL_FEATURES.h 2018-04-09 11:00:57.951157054 -0600 +@@ -51,3 +51,6 @@ + std::cerr << "NTL_HAVE_POSIX_TIME\n"; #endif +#ifdef NTL_LOADTIME_CPU + std::cerr << "NTL_LOADTIME_CPU\n"; +#endif ---- src/cfile.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/cfile 2017-09-09 12:10:10.892839638 -0600 -@@ -480,6 +480,20 @@ using the configure script. +--- src/cfile.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/cfile 2018-04-09 11:00:57.951157054 -0600 +@@ -471,6 +471,20 @@ to be defined. Of course, to unset a f #elif @{NTL_GF2X_ALTCODE1} #define NTL_GF2X_ALTCODE1 @@ -276,9 +243,9 @@ /* * Yest another alternative strategy for implementing GF2X ---- src/DispSettings.cpp.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/DispSettings.cpp 2017-09-09 12:10:10.892839638 -0600 -@@ -168,6 +168,10 @@ cout << "Performance Options:\n"; +--- src/DispSettings.cpp.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/DispSettings.cpp 2018-04-09 11:00:57.952157051 -0600 +@@ -179,6 +179,10 @@ cout << "Performance Options:\n"; cout << "NTL_GF2X_NOINLINE\n"; #endif @@ -289,18 +256,17 @@ cout << "***************************/\n"; cout << "\n\n"; ---- src/DoConfig.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/DoConfig 2017-09-09 12:10:10.892839638 -0600 -@@ -1,7 +1,7 @@ +--- src/DoConfig.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/DoConfig 2018-04-09 11:03:15.877814660 -0600 +@@ -1,6 +1,7 @@ # This is a perl script, invoked from a shell - # use warnings; # this doesn't work on older versions of perl -- + use warnings; # this doesn't work on older versions of perl +use Config; - %MakeFlag = ( -@@ -86,6 +86,7 @@ + system("echo '*** CompilerOutput.log ***' > CompilerOutput.log"); +@@ -90,6 +91,7 @@ system("echo '*** CompilerOutput.log *** 'NTL_GF2X_NOINLINE' => 'off', 'NTL_GF2X_ALTCODE' => 'off', 'NTL_GF2X_ALTCODE1' => 'off', @@ -308,9 +274,9 @@ ); -@@ -195,6 +196,15 @@ if ($ConfigFlag{'NTL_THREAD_BOOST'} eq ' - } +@@ -222,6 +224,15 @@ if ($MakeFlag{'SHARED'} eq 'off') { + } +# special processing: NTL_LOADTIME_CPU on x86/x86_64 only and => NTL_GF2X_NOINLINE + @@ -322,11 +288,11 @@ +} + - # some special MakeVal values that are determined by SHARED ---- src/GF2X1.cpp.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/GF2X1.cpp 2017-09-09 12:10:10.893839636 -0600 -@@ -19,7 +19,7 @@ + } +--- src/GF2X1.cpp.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/GF2X1.cpp 2018-04-09 11:00:57.955157044 -0600 +@@ -18,7 +18,7 @@ // simple scaling factor for some crossover points: // we use a lower crossover of the underlying multiplication // is faster @@ -335,9 +301,9 @@ #define XOVER_SCALE (1L) #else #define XOVER_SCALE (2L) ---- src/GF2X.cpp.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/GF2X.cpp 2017-09-09 12:10:10.893839636 -0600 -@@ -28,6 +28,22 @@ pclmul_mul1 (unsigned long *c, unsigned +--- src/GF2X.cpp.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/GF2X.cpp 2018-04-09 14:24:53.705349749 -0600 +@@ -27,6 +27,22 @@ pclmul_mul1 (unsigned long *c, unsigned _mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0)); } @@ -381,7 +347,7 @@ + pclmul_mul1(c, a, b); +} + -+PCLMUL_RESOLVER(static void,mul1,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); ++PCLMUL_RESOLVER(static,void,mul1,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); + +#else + @@ -440,7 +406,7 @@ + cp[sb] = carry; +} + -+PCLMUL_RESOLVER(static void,Mul1, ++PCLMUL_RESOLVER(static,void,Mul1, + (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)); + +#else @@ -448,8 +414,8 @@ static void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) { -@@ -619,6 +702,53 @@ NTL_EFF_BB_MUL_CODE1 - +@@ -620,6 +703,53 @@ NTL_EFF_BB_MUL_CODE1 + // warning #13200: No EMMS instruction before return } +#endif @@ -494,7 +460,7 @@ + cp[sb] ^= carry; +} + -+PCLMUL_RESOLVER(static void,AddMul1, ++PCLMUL_RESOLVER(static,void,AddMul1, + (_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)); + +#else @@ -502,7 +468,7 @@ static void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) { -@@ -647,6 +777,52 @@ NTL_EFF_BB_MUL_CODE2 +@@ -648,6 +778,52 @@ NTL_EFF_BB_MUL_CODE2 } @@ -548,15 +514,15 @@ + cp[sb] = carry; +} + -+PCLMUL_RESOLVER(static void,Mul1_short, ++PCLMUL_RESOLVER(static,void,Mul1_short, + (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)); + +#else static void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) -@@ -675,10 +851,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1 - +@@ -677,10 +853,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1 + // warning #13200: No EMMS instruction before return } +#endif @@ -580,14 +546,14 @@ + pclmul_mul1(c, a, b); +} + -+PCLMUL_RESOLVER(static void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); ++PCLMUL_RESOLVER(static,void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); + +#else + static void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) { -@@ -692,6 +889,7 @@ NTL_EFF_HALF_BB_MUL_CODE0 +@@ -694,6 +891,7 @@ NTL_EFF_HALF_BB_MUL_CODE0 } @@ -595,7 +561,7 @@ // mul2...mul8 hard-code 2x2...8x8 word multiplies. // I adapted these routines from LiDIA (except mul3, see below). -@@ -1603,6 +1801,77 @@ static const _ntl_ulong sqrtab[256] = { +@@ -1611,6 +1809,77 @@ static const _ntl_ulong sqrtab[256] = { @@ -666,14 +632,14 @@ + return; +} + -+PCLMUL_RESOLVER(void,sqr,(GF2X& c, const GF2X& a)); ++PCLMUL_RESOLVER(,void,sqr,(GF2X& c, const GF2X& a)); + +#else + static inline void sqr1(_ntl_ulong *c, _ntl_ulong a) { -@@ -1643,6 +1912,7 @@ void sqr(GF2X& c, const GF2X& a) +@@ -1651,6 +1920,7 @@ void sqr(GF2X& c, const GF2X& a) return; } @@ -681,9 +647,9 @@ void LeftShift(GF2X& c, const GF2X& a, long n) ---- src/InitSettings.cpp.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/InitSettings.cpp 2017-09-09 12:10:10.894839633 -0600 -@@ -166,6 +166,11 @@ int main() +--- src/InitSettings.cpp.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/InitSettings.cpp 2018-04-09 11:00:57.956157041 -0600 +@@ -172,6 +172,11 @@ int main() cout << "NTL_RANGE_CHECK=0\n"; #endif @@ -695,9 +661,9 @@ // the following are not actual config flags, but help // in the Wizard logic ---- src/mat_lzz_p.cpp.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/mat_lzz_p.cpp 2017-09-09 12:10:10.895839630 -0600 -@@ -10,6 +10,15 @@ +--- src/mat_lzz_p.cpp.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/mat_lzz_p.cpp 2018-04-09 15:00:11.078401207 -0600 +@@ -9,6 +9,15 @@ #ifdef NTL_HAVE_AVX #include @@ -713,7 +679,7 @@ #endif NTL_START_IMPL -@@ -626,7 +635,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A, +@@ -625,7 +634,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A, #ifdef NTL_HAVE_LL_TYPE @@ -722,7 +688,7 @@ #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1) // max int representable exactly as a double -@@ -640,18 +649,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A, +@@ -639,18 +648,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A, // MUL_ADD(a, b, c): a += b*c @@ -755,7 +721,7 @@ + + long i = 0; + for (; i <= n-4; i +=4) { - ++ + // the following code sequences are a bit faster than + // just doing 4 _mm256_broadcast_sd's + // it requires a to point to aligned storage, however @@ -770,7 +736,7 @@ + __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11); + +#endif - ++ + __m256d avec0 = _mm256_permute_pd(a0101, 0); + __m256d avec1 = _mm256_permute_pd(a0101, 0xf); + __m256d avec2 = _mm256_permute_pd(a2323, 0); @@ -788,7 +754,7 @@ + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec0, bvec); -+ + + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec1, bvec); @@ -797,7 +763,7 @@ + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec1, bvec); -+ + + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec2, bvec); + bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec2, bvec); @@ -848,7 +814,7 @@ __m256d acc0=_mm256_load_pd(x + 0*4); __m256d acc1=_mm256_load_pd(x + 1*4); __m256d acc2=_mm256_load_pd(x + 2*4); -@@ -661,10 +772,179 @@ void muladd1_by_32(double *x, const doub +@@ -660,10 +771,179 @@ void muladd1_by_32(double *x, const doub __m256d acc6=_mm256_load_pd(x + 6*4); __m256d acc7=_mm256_load_pd(x + 7*4); @@ -860,7 +826,7 @@ + // the following code sequences are a bit faster than + // just doing 4 _mm256_broadcast_sd's + // it requires a to point to aligned storage, however - ++ +#if 1 + // this one seems slightly faster + __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0)); @@ -943,7 +909,7 @@ + _mm256_store_pd(x + 7*4, acc7); +} + -+FMA_RESOLVER(static void,muladd1_by_32, ++FMA_RESOLVER(static,void,muladd1_by_32, + (double *x, const double *a, const double *b, long n)); + +#else @@ -995,7 +961,7 @@ + bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec0, bvec); + bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec0, bvec); -+ + + bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec1, bvec); + bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec1, bvec); @@ -1030,7 +996,7 @@ bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec); bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec); -@@ -687,6 +967,75 @@ void muladd1_by_32(double *x, const doub +@@ -686,6 +966,75 @@ void muladd1_by_32(double *x, const doub _mm256_store_pd(x + 7*4, acc7); } @@ -1098,7 +1064,7 @@ + _mm256_store_pd(x + 3*4, acc3); +} + -+FMA_RESOLVER(static void,muladd1_by_16, ++FMA_RESOLVER(static,void,muladd1_by_16, + (double *x, const double *a, const double *b, long n)); + +#else @@ -1106,7 +1072,7 @@ static void muladd1_by_16(double *x, const double *a, const double *b, long n) { -@@ -717,6 +1066,165 @@ void muladd1_by_16(double *x, const doub +@@ -716,6 +1065,165 @@ void muladd1_by_16(double *x, const doub _mm256_store_pd(x + 3*4, acc3); } @@ -1265,14 +1231,14 @@ + +} + -+FMA_RESOLVER(static void,muladd2_by_32, ++FMA_RESOLVER(static,void,muladd2_by_32, + (double *x, const double *a, const double *b, long n)); + +#else // experiment: process two rows at a time static -@@ -795,6 +1303,211 @@ void muladd2_by_32(double *x, const doub +@@ -794,6 +1302,211 @@ void muladd2_by_32(double *x, const doub } @@ -1477,14 +1443,14 @@ + +} + -+AVX2_RESOLVER(static void,muladd3_by_32, ++AVX2_RESOLVER(static,void,muladd3_by_32, + (double *x, const double *a, const double *b, long n)); + +#else // experiment: process three rows at a time // NOTE: this makes things slower on an AVX1 platform --- not enough registers -@@ -899,8 +1612,10 @@ void muladd3_by_32(double *x, const doub +@@ -898,8 +1611,10 @@ void muladd3_by_32(double *x, const doub } @@ -1497,7 +1463,7 @@ { __m256d avec0, avec1, bvec; __m256d acc00, acc01, acc02, acc03; -@@ -923,10 +1638,10 @@ void muladd2_by_16(double *x, const doub +@@ -922,10 +1637,10 @@ void muladd2_by_16(double *x, const doub avec0 = _mm256_broadcast_sd(&a[i]); avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); @@ -1512,7 +1478,7 @@ } -@@ -942,8 +1657,8 @@ void muladd2_by_16(double *x, const doub +@@ -941,8 +1656,8 @@ void muladd2_by_16(double *x, const doub } @@ -1523,7 +1489,7 @@ { __m256d avec0, avec1, avec2, bvec; __m256d acc00, acc01, acc02, acc03; -@@ -973,10 +1688,10 @@ void muladd3_by_16(double *x, const doub +@@ -972,10 +1687,10 @@ void muladd3_by_16(double *x, const doub avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); @@ -1538,7 +1504,7 @@ } -@@ -997,6 +1712,30 @@ void muladd3_by_16(double *x, const doub +@@ -996,6 +1711,30 @@ void muladd3_by_16(double *x, const doub } @@ -1569,7 +1535,7 @@ static inline void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n) { -@@ -1016,6 +1755,30 @@ void muladd_all_by_32(long first, long l +@@ -1015,6 +1754,30 @@ void muladd_all_by_32(long first, long l #endif } @@ -1600,7 +1566,7 @@ static inline void muladd_all_by_16(long first, long last, double *x, const double *a, const double *b, long n) -@@ -1036,6 +1799,8 @@ void muladd_all_by_16(long first, long l +@@ -1035,6 +1798,8 @@ void muladd_all_by_16(long first, long l #endif } @@ -1609,12 +1575,11 @@ static inline void muladd_all_by_32_width(long first, long last, double *x, const double *a, const double *b, long n, long width) { -@@ -1045,7 +1810,74 @@ void muladd_all_by_32_width(long first, - muladd_all_by_16(first, last, x, a, b, n); - } +@@ -1050,6 +1815,72 @@ void muladd_all_by_32_width(long first, + + // this assumes n is a multiple of 16 +#ifdef NTL_LOADTIME_CPU -+ +AVX_FUNC(void,muladd_interval) +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +{ @@ -1651,7 +1616,7 @@ +{ + __m256d xvec0, xvec1, xvec2, xvec3; + __m256d yvec0, yvec1, yvec2, yvec3; - ++ + __m256d cvec = _mm256_broadcast_sd(&c); + + for (long i = 0; i < n; i += 16, x += 16, y += 16) { @@ -1677,21 +1642,19 @@ + } +} + -+FMA_RESOLVER(static void,muladd_interval, ++FMA_RESOLVER(static,void,muladd_interval, + (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); -+ +#else - - // this assumes n is a multiple of 16 static inline -@@ -1079,6 +1911,107 @@ void muladd_interval(double * NTL_RESTRI + void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) + { +@@ -1080,7 +1911,105 @@ void muladd_interval(double * NTL_RESTRI + _mm256_store_pd(x + 3*4, xvec3); } } - +#endif + +#ifdef NTL_LOADTIME_CPU -+ +AVX_FUNC(void,muladd_interval1) +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +{ @@ -1741,7 +1704,7 @@ +FMA_FUNC(void,muladd_interval1) +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +{ -+ + + __m256d xvec0, xvec1, xvec2, xvec3; + __m256d yvec0, yvec1, yvec2, yvec3; + __m256d cvec; @@ -1784,98 +1747,22 @@ + } +} + -+FMA_RESOLVER(static void,muladd_interval1, ++FMA_RESOLVER(static,void,muladd_interval1, + (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); + +#else -+ // this one is more general: does not assume that n is a // multiple of 16 static inline -@@ -1127,8 +2060,73 @@ void muladd_interval1(double * NTL_RESTR - } - } +@@ -1131,6 +2060,7 @@ void muladd_interval1(double * NTL_RESTR -+#endif -+ - #define AVX_PD_SZ (4) - -+#ifdef NTL_LOADTIME_CPU -+ -+AVX_FUNC(void,muladd_interval2) -+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) -+{ -+ n /= 4; -+ if (n <= 0 || n > 8) return; -+ -+ x += n*4; -+ y += n*4; -+ -+ // n in [1..8] -+ -+ __m256d xvec, yvec, cvec; -+ -+ cvec = _mm256_broadcast_sd(&c); -+ -+ switch (n) { -+ case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec); -+ case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec); -+ case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec); -+ case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec); -+ case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec); -+ case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec); -+ case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec); -+ case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec); -+ } -+ -+} -+ -+FMA_FUNC(void,muladd_interval2) -+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) -+{ -+ n /= 4; -+ if (n <= 0 || n > 8) return; -+ -+ x += n*4; -+ y += n*4; -+ -+ // n in [1..8] -+ -+ __m256d xvec, yvec, cvec; -+ -+ cvec = _mm256_broadcast_sd(&c); -+ -+ switch (n) { -+ case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec); -+ case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec); -+ case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec); -+ case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec); -+ case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec); -+ case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec); -+ case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec); -+ case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec); -+ } -+ -+} -+ -+FMA_RESOLVER(static void,muladd_interval2, -+ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); -+ -+#else -+ - // experimental: assumes n is a multiple of 4 in the range [0..32] - #if 1 - static inline -@@ -1169,6 +2167,8 @@ void muladd_interval2(double * NTL_RESTR #endif - +#endif -+ - #define DO_MUL(a, b) ((unsigned long) (long(a)*long(b))) -@@ -2743,10 +3743,10 @@ void alt_mul_LL(const mat_window_zz_p& X + //#define DO_MUL(a, b) ((unsigned long) (long(a)*long(b))) +@@ -2716,10 +3646,10 @@ void alt_mul_LL(const mat_window_zz_p& X } @@ -1889,7 +1776,7 @@ const const_mat_window_zz_p& A, const const_mat_window_zz_p& B) { long n = A.NumRows(); -@@ -3085,12 +4085,13 @@ void mul_base (const mat_window_zz_p& X, +@@ -3058,12 +3988,13 @@ void mul_base (const mat_window_zz_p& X, long p = zz_p::modulus(); long V = MAT_BLK_SZ*4; @@ -1905,7 +1792,7 @@ p-1 <= MAX_DBL_INT && V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) -@@ -3185,7 +4186,8 @@ void mul_strassen(const mat_window_zz_p& +@@ -3158,7 +4089,8 @@ void mul_strassen(const mat_window_zz_p& // this code determines if mul_base triggers blk_mul_DD, // in which case a higher crossover is used @@ -1915,7 +1802,7 @@ { long V = MAT_BLK_SZ*4; long p = zz_p::modulus(); -@@ -3685,10 +4687,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con +@@ -3658,10 +4590,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con @@ -1929,7 +1816,7 @@ { long n = A.NumRows(); -@@ -3854,10 +4856,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co +@@ -3827,10 +4759,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co @@ -1943,7 +1830,7 @@ { long n = A.NumRows(); -@@ -4615,8 +5617,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c +@@ -4588,8 +5520,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c else if (n/MAT_BLK_SZ < 4) { long V = 64; @@ -1955,7 +1842,7 @@ V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { -@@ -4641,8 +5644,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c +@@ -4614,8 +5547,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c else { long V = 4*MAT_BLK_SZ; @@ -1967,7 +1854,7 @@ V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { -@@ -5048,10 +6052,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p& +@@ -5021,10 +5955,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p& @@ -1981,7 +1868,7 @@ vec_zz_p *xp, bool trans, bool relax) { long n = A.NumRows(); -@@ -5238,10 +6242,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p& +@@ -5211,10 +6145,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p& @@ -1995,7 +1882,7 @@ vec_zz_p *xp, bool trans, bool relax) { long n = A.NumRows(); -@@ -6052,8 +7056,9 @@ void tri(zz_p& d, const mat_zz_p& A, con +@@ -6025,8 +6959,9 @@ void tri(zz_p& d, const mat_zz_p& A, con else if (n/MAT_BLK_SZ < 4) { long V = 64; @@ -2007,7 +1894,7 @@ V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { -@@ -6078,8 +7083,9 @@ void tri(zz_p& d, const mat_zz_p& A, con +@@ -6051,8 +6986,9 @@ void tri(zz_p& d, const mat_zz_p& A, con else { long V = 4*MAT_BLK_SZ; @@ -2019,7 +1906,7 @@ V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { -@@ -6325,7 +7331,7 @@ long elim_basic(const mat_zz_p& A, mat_z +@@ -6298,7 +7234,7 @@ long elim_basic(const mat_zz_p& A, mat_z #ifdef NTL_HAVE_LL_TYPE @@ -2028,7 +1915,7 @@ static inline -@@ -7778,8 +8784,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i +@@ -7751,8 +8687,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i else { long V = 4*MAT_BLK_SZ; @@ -2040,9 +1927,9 @@ V <= (MAX_DBL_INT-(p-1))/(p-1) && V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { ---- src/QuickTest.cpp.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/QuickTest.cpp 2017-09-09 12:10:10.895839630 -0600 -@@ -310,6 +310,9 @@ cerr << "Performance Options:\n"; +--- src/QuickTest.cpp.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/QuickTest.cpp 2018-04-09 11:00:57.958157036 -0600 +@@ -316,6 +316,9 @@ cerr << "Performance Options:\n"; cerr << "NTL_GF2X_NOINLINE\n"; #endif @@ -2052,8 +1939,8 @@ cerr << "\n\n"; ---- src/WizardAux.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/WizardAux 2017-09-09 12:10:10.895839630 -0600 +--- src/WizardAux.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/WizardAux 2018-04-09 11:00:57.958157036 -0600 @@ -89,6 +89,7 @@ system("$ARGV[0] InitSettings"); 'NTL_GF2X_NOINLINE' => 0, 'NTL_FFT_BIGTAB' => 0, @@ -2062,9 +1949,9 @@ 'WIZARD_HACK' => '#define NTL_WIZARD_HACK', ---- src/ZZ.cpp.orig 2017-07-07 09:05:14.000000000 -0600 -+++ src/ZZ.cpp 2017-09-18 18:41:27.125503871 -0600 -@@ -12,6 +12,13 @@ +--- src/ZZ.cpp.orig 2018-04-08 12:23:06.000000000 -0600 ++++ src/ZZ.cpp 2018-04-09 14:35:45.184455758 -0600 +@@ -14,6 +14,13 @@ #elif defined(NTL_HAVE_SSSE3) #include #include @@ -2078,7 +1965,7 @@ #endif -@@ -2106,6 +2113,481 @@ struct RandomStream_impl { +@@ -2351,6 +2358,591 @@ struct RandomStream_impl { }; @@ -2101,6 +1988,9 @@ +#define SSSE3_START _mm_setzero_si128() +#define AVX2_START _mm256_set_epi64x(0,1,0,0) + ++#define SSSE3_NONCE(nonce) _mm_set_epi64x(nonce,0) ++#define AVX2_NONCE(nonce) _mm256_set_epi64x(nonce, 1, nonce, 0) ++ +#define SSSE3_STOREU_VEC(m,r) _mm_storeu_si128((__m128i*)(m), r) +#define AVX2_STOREU_VEC(m,r) _mm256_storeu_si256((__m256i*)(m), r) + @@ -2168,11 +2058,11 @@ +#define SSSE3_SZ_VEC (16) +#define AVX2_SZ_VEC (32) + -+#define SSSE3_RANSTREAM_BUFSZ (1024) -+// must be a multiple of 8*SSE3_SZ_VEC ++#define SSSE3_RANSTREAM_NCHUNKS (4) ++// leads to a BUFSZ of 512 + -+#define AVX2_RANSTREAM_BUFSZ (1024) -+// must be a multiple of 8*AVX2_SZ_VEC ++#define AVX2_RANSTREAM_NCHUNKS (2) ++// leads to a BUFSZ of 512 + +#define SSSE3_DQROUND_VECTORS_VEC(a,b,c,d) \ + a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_16(d); \ @@ -2199,7 +2089,13 @@ + b = AVX2_ROR_VEC_V3(b); c = AVX2_ROR_VEC_V2(c); d = AVX2_ROR_VEC_V1(d); + +#define SSSE3_RANSTREAM_STATESZ (4*SSSE3_SZ_VEC) -+#define AVX2_RANSTREAM_STATESZ (4*AVX2_SZ_VEC) ++#define AVX2_RANSTREAM_STATESZ (4*AVX2_SZ_VEC) ++ ++#define SSSE3_RANSTREAM_CHUNKSZ (2*SSSE3_RANSTREAM_STATESZ) ++#define AVX2_RANSTREAM_CHUNKSZ (2*AVX2_RANSTREAM_STATESZ) ++ ++#define SSSE3_RANSTREAM_BUFSZ (SSSE3_RANSTREAM_NCHUNKS*SSSE3_RANSTREAM_CHUNKSZ) ++#define AVX2_RANSTREAM_BUFSZ (AVX2_RANSTREAM_NCHUNKS*AVX2_RANSTREAM_CHUNKSZ) + +static void allocate_space(AlignedArray &state_store, + AlignedArray &buf_store) @@ -2276,7 +2172,7 @@ + AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3); +} + -+SSSE3_RESOLVER(static void, randomstream_impl_init, ++SSSE3_RESOLVER(static, void, randomstream_impl_init, + (_ntl_uint32 *state, AlignedArray &state_store, + AlignedArray &buf_store, const unsigned char *key)); + @@ -2285,6 +2181,7 @@ + unsigned char *buf, + AlignedArray &state_store __attribute__((unused)), + AlignedArray &buf_store __attribute__((unused)), ++ long &chunk_count __attribute__((unused)), + unsigned char *NTL_RESTRICT res, + long n, + long pos) @@ -2333,6 +2230,7 @@ + unsigned char *buf_ignored __attribute__((unused)), + AlignedArray &state_store, + AlignedArray &buf_store, ++ long &chunk_count, + unsigned char *NTL_RESTRICT res, + long n, + long pos) @@ -2364,8 +2262,9 @@ + + long i = 0; + for (; i <= n-SSSE3_RANSTREAM_BUFSZ; i += SSSE3_RANSTREAM_BUFSZ) { ++ chunk_count |= SSSE3_RANSTREAM_NCHUNKS; // disable small buffer strategy + -+ for (long j = 0; j < SSSE3_RANSTREAM_BUFSZ/(8*SSSE3_SZ_VEC); j++) { ++ for (long j = 0; j < SSSE3_RANSTREAM_NCHUNKS; j++) { + ssse3_ivec_t v0=d0, v1=d1, v2=d2, v3=d3; + ssse3_ivec_t v4=d0, v5=d1, v6=d2, v7=SSSE3_ADD_VEC_64(d3, SSSE3_DELTA); + @@ -2383,7 +2282,20 @@ + } + + if (i < n) { -+ for (long j = 0; j < SSSE3_RANSTREAM_BUFSZ/(8*SSSE3_SZ_VEC); j++) { ++ ++ long nchunks; ++ ++ if (chunk_count < SSSE3_RANSTREAM_NCHUNKS) { ++ nchunks = long(cast_unsigned((n-i)+SSSE3_RANSTREAM_CHUNKSZ-1)/SSSE3_RANSTREAM_CHUNKSZ); ++ chunk_count += nchunks; ++ } ++ else ++ nchunks = SSSE3_RANSTREAM_NCHUNKS; ++ ++ long pos_offset = SSSE3_RANSTREAM_BUFSZ - nchunks*SSSE3_RANSTREAM_CHUNKSZ; ++ buf += pos_offset; ++ ++ for (long j = 0; j < nchunks; j++) { + ssse3_ivec_t v0=d0, v1=d1, v2=d2, v3=d3; + ssse3_ivec_t v4=d0, v5=d1, v6=d2, v7=SSSE3_ADD_VEC_64(d3, SSSE3_DELTA); + @@ -2398,8 +2310,8 @@ + d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA); + } + -+ pos = n-i; -+ std::memcpy(&res[i], &buf[0], pos); ++ pos = n-i+pos_offset; ++ std::memcpy(&res[i], &buf[0], n-i); + } + + SSSE3_STORE_VEC(state + 3*SSSE3_SZ_VEC, d3); @@ -2412,6 +2324,7 @@ + unsigned char *buf_ignored __attribute__((unused)), + AlignedArray &state_store, + AlignedArray &buf_store, ++ long &chunk_count, + unsigned char *NTL_RESTRICT res, + long n, + long pos) @@ -2443,8 +2356,9 @@ + + long i = 0; + for (; i <= n-AVX2_RANSTREAM_BUFSZ; i += AVX2_RANSTREAM_BUFSZ) { ++ chunk_count |= AVX2_RANSTREAM_NCHUNKS; // disable small buffer strategy + -+ for (long j = 0; j < AVX2_RANSTREAM_BUFSZ/(8*AVX2_SZ_VEC); j++) { ++ for (long j = 0; j < AVX2_RANSTREAM_NCHUNKS; j++) { + avx2_ivec_t v0=d0, v1=d1, v2=d2, v3=d3; + avx2_ivec_t v4=d0, v5=d1, v6=d2, v7=AVX2_ADD_VEC_64(d3, AVX2_DELTA); + @@ -2462,7 +2376,20 @@ + } + + if (i < n) { -+ for (long j = 0; j < AVX2_RANSTREAM_BUFSZ/(8*AVX2_SZ_VEC); j++) { ++ ++ long nchunks; ++ ++ if (chunk_count < AVX2_RANSTREAM_NCHUNKS) { ++ nchunks = long(cast_unsigned((n-i)+AVX2_RANSTREAM_CHUNKSZ-1)/AVX2_RANSTREAM_CHUNKSZ); ++ chunk_count += nchunks; ++ } ++ else ++ nchunks = AVX2_RANSTREAM_NCHUNKS; ++ ++ long pos_offset = AVX2_RANSTREAM_BUFSZ - nchunks*AVX2_RANSTREAM_CHUNKSZ; ++ buf += pos_offset; ++ ++ for (long j = 0; j < nchunks; j++) { + avx2_ivec_t v0=d0, v1=d1, v2=d2, v3=d3; + avx2_ivec_t v4=d0, v5=d1, v6=d2, v7=AVX2_ADD_VEC_64(d3, AVX2_DELTA); + @@ -2477,8 +2404,8 @@ + d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA); + } + -+ pos = n-i; -+ std::memcpy(&res[i], &buf[0], pos); ++ pos = n-i+pos_offset; ++ std::memcpy(&res[i], &buf[0], n-i); + } + + AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3); @@ -2486,17 +2413,75 @@ + return pos; +} + -+SSSE3_RESOLVER(static long, randomstream_get_bytes, ++SSSE3_RESOLVER(static, long, randomstream_get_bytes, + (_ntl_uint32 *state, unsigned char *buf, + AlignedArray &state_store, + AlignedArray &buf_store, ++ long &chunk_count, + unsigned char *NTL_RESTRICT res, + long n, + long pos)); + ++BASE_FUNC(void, randomstream_set_nonce) ++(_ntl_uint32 *state, ++ AlignedArray &state_store __attribute__((unused)), ++ long &chunk_count __attribute__((unused)), ++ unsigned long nonce) ++{ ++ _ntl_uint32 nonce0, nonce1; ++ ++ nonce0 = nonce; ++ nonce0 = INT32MASK(nonce0); ++ ++ nonce1 = 0; ++ ++#if (NTL_BITS_PER_LONG > 32) ++ nonce1 = nonce >> 32; ++ nonce1 = INT32MASK(nonce1); ++#endif ++ ++ state[12] = 0; ++ state[13] = 0; ++ state[14] = nonce0; ++ state[15] = nonce1; ++} ++ ++SSSE3_FUNC(void, randomstream_set_nonce) ++(_ntl_uint32 *state_ignored __attribute__((unused)), ++ AlignedArray &state_store, ++ long &chunk_count, ++ unsigned long nonce) ++{ ++ unsigned char *state = state_store.elts(); ++ ssse3_ivec_t d3; ++ d3 = SSSE3_NONCE(nonce); ++ SSSE3_STORE_VEC(state + 3*SSSE3_SZ_VEC, d3); ++ chunk_count = 0; ++} ++ ++AVX2_FUNC(void, randomstream_set_nonce) ++(_ntl_uint32 *state_ignored __attribute__((unused)), ++ AlignedArray &state_store, ++ long &chunk_count, ++ unsigned long nonce) ++{ ++ unsigned char *state = state_store.elts(); ++ avx2_ivec_t d3; ++ d3 = AVX2_NONCE(nonce); ++ AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3); ++ chunk_count = 0; ++} ++ ++SSSE3_RESOLVER(, void, randomstream_set_nonce, ++ (_ntl_uint32 *state, ++ AlignedArray &state_store, ++ long &chunk_count, ++ unsigned long nonce)); ++ +struct RandomStream_impl { + AlignedArray state_store; + AlignedArray buf_store; ++ long chunk_count; + _ntl_uint32 state[16]; + unsigned char buf[64]; + @@ -2504,6 +2489,7 @@ + RandomStream_impl(const unsigned char *key) + { + randomstream_impl_init(state, state_store, buf_store, key); ++ chunk_count = 0; + } + + RandomStream_impl(const RandomStream_impl& other) @@ -2523,6 +2509,7 @@ + std::memcpy(state_store.elts(), other.state_store.elts(), SSSE3_RANSTREAM_STATESZ); + std::memcpy(buf_store.elts(), other.buf_store.elts(), SSSE3_RANSTREAM_BUFSZ); + } ++ chunk_count = other.chunk_count; + return *this; + } + @@ -2548,12 +2535,22 @@ + } + } + ++ // bytes are generated in chunks of RANSTREAM_BUFSZ bytes, except that ++ // initially, we may generate a few chunks of RANSTREAM_CHUNKSZ ++ // bytes. This optimizes a bit for short bursts following a reset. ++ + long + get_bytes(unsigned char *NTL_RESTRICT res, + long n, long pos) + { + return randomstream_get_bytes(state, buf, state_store, buf_store, -+ res, n, pos); ++ chunk_count, res, n, pos); ++ } ++ ++ void ++ set_nonce(unsigned long nonce) ++ { ++ randomstream_set_nonce(state, state_store, chunk_count, nonce); + } +}; + diff --git a/ntl.spec b/ntl.spec index 9f7ee7c..d7ea354 100644 --- a/ntl.spec +++ b/ntl.spec @@ -10,8 +10,8 @@ Summary: High-performance algorithms for vectors, matrices, and polynomials Name: ntl -Version: 10.5.0 -Release: 2%{?dist} +Version: 11.0.0 +Release: 1%{?dist} License: LGPLv2+ URL: http://shoup.net/ntl/ @@ -70,9 +70,6 @@ Requires: %{name}-devel%{?_isa} = %{version}-%{release} %build pushd src -# We eventually want to set NTL_STD_CXX14=on and NTL_SAFE_VECTORS=on, but that -# involves a change in semantics to vector and matrix assignment operations -# that latte-integrale, at least, is not yet prepared for. ./configure \ CXX="${CXX-g++}" \ CXXFLAGS="%{optflags} -fPIC" \ @@ -81,9 +78,10 @@ pushd src DOCDIR=%{_docdir} \ INCLUDEDIR=%{_includedir} \ LIBDIR=%{_libdir} \ + LDLIBS="-lpthread -lm" \ NATIVE=off \ %{?gf2x:NTL_GF2X_LIB=on} \ - NTL_DISABLE_TLS_HACK=on \ + NTL_STD_CXX14=on \ %ifarch x86_64 NTL_LOADTIME_CPU=on \ TUNE=x86 \ @@ -135,13 +133,12 @@ done %endif -%post -p /sbin/ldconfig -%postun -p /sbin/ldconfig +%ldconfig_scriptlets %files %doc README %license doc/copying.txt -%{_libdir}/libntl.so.35* +%{_libdir}/libntl.so.36* %files devel %doc doc/* @@ -155,6 +152,9 @@ done %changelog +* Sat Jun 2 2018 Jerry James - 11.0.0-1 +- ntl-11.0.0 + * Thu Feb 08 2018 Fedora Release Engineering - 10.5.0-2 - Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild diff --git a/sources b/sources index a858e08..ce0b963 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (ntl-10.5.0.tar.gz) = b299dfc29005079470972c2a9ca02acd0ebdbc31ff8923df02f3627dbc66daa0f527226972cef032e1e488c4272554634a96456e94653fdf8b01356160319aa0 +SHA512 (ntl-11.0.0.tar.gz) = 495a07db6ac92ec41b9c660e53d2a714f635f042c48453c59fc6524ee3e64f4ca52878bd2b96e1a21b1c1a39bdb68314ca5b3fc36c75e7d616a33c6d8c7f8e5a