diff --git a/.gitignore b/.gitignore
index 18b4a37..5ea72f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1 @@
-/ntl-9.4.0.tar.gz
-/ntl-9.6.2.tar.gz
-/ntl-9.6.4.tar.gz
-/ntl-9.7.0.tar.gz
-/ntl-9.8.0.tar.gz
-/ntl-9.9.1.tar.gz
-/ntl-9.10.0.tar.gz
-/ntl-9.11.0.tar.gz
-/ntl-10.1.0.tar.gz
-/ntl-10.3.0.tar.gz
-/ntl-10.5.0.tar.gz
+/ntl-*.tar.gz
diff --git a/ntl-loadtime-cpu.patch b/ntl-loadtime-cpu.patch
index f4838e5..3aa7c78 100644
--- a/ntl-loadtime-cpu.patch
+++ b/ntl-loadtime-cpu.patch
@@ -1,6 +1,6 @@
---- doc/config.txt.orig	2017-07-07 09:05:14.000000000 -0600
-+++ doc/config.txt	2017-09-09 12:10:10.877839678 -0600
-@@ -337,6 +337,7 @@ NTL_AVOID_BRANCHING=off
+--- doc/config.txt.orig	2018-04-08 12:23:07.000000000 -0600
++++ doc/config.txt	2018-04-09 11:00:49.604177774 -0600
+@@ -367,6 +367,7 @@ NTL_AVOID_BRANCHING=off
  NTL_GF2X_NOINLINE=off
  NTL_GF2X_ALTCODE=off
  NTL_GF2X_ALTCODE1=off
@@ -8,7 +8,7 @@
  
  GMP_INCDIR=$(GMP_PREFIX)/include
  GMP_LIBDIR=$(GMP_PREFIX)/lib
-@@ -634,6 +635,10 @@ NTL_GF2X_ALTCODE1=off
+@@ -680,6 +681,10 @@ NTL_GF2X_ALTCODE1=off
  
  # Yet another alternative implementation for GF2X multiplication.
  
@@ -19,9 +19,9 @@
  
  
  ########## More GMP Options:
---- include/NTL/config.h.orig	2017-07-07 09:05:14.000000000 -0600
-+++ include/NTL/config.h	2017-09-09 12:10:10.891839641 -0600
-@@ -525,6 +525,20 @@ using the configure script.
+--- include/NTL/config.h.orig	2018-04-08 12:23:07.000000000 -0600
++++ include/NTL/config.h	2018-04-09 11:00:49.628177715 -0600
+@@ -517,6 +517,20 @@ to be defined.  Of course,  to unset a f
  #endif
  
  
@@ -42,9 +42,9 @@
  
  
  
---- include/NTL/ctools.h.orig	2017-07-07 09:05:14.000000000 -0600
-+++ include/NTL/ctools.h	2017-09-09 13:36:16.915768457 -0600
-@@ -498,6 +498,166 @@ char *_ntl_make_aligned(char *p, long al
+--- include/NTL/ctools.h.orig	2018-04-08 12:23:06.000000000 -0600
++++ include/NTL/ctools.h	2018-04-09 14:24:06.057491526 -0600
+@@ -509,6 +509,155 @@ char *_ntl_make_aligned(char *p, long al
  // and it should also be as big as a cache line
  
  
@@ -85,10 +85,9 @@
 +#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul,ssse3",avx,type,name)
 +#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul,ssse3",fma,type,name)
 +#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul,ssse3",avx2,type,name)
-+#define SSSE3_RESOLVER(type,name,params)                                \
++#define SSSE3_RESOLVER(st,type,name,params)                             \
 +   extern "C" {                                                         \
-+      static void __attribute__((optimize ("O0")))                      \
-+         (*resolve_##name (void))(void) {                               \
++      static type (*resolve_##name(void)) params {                      \
 +         if (__builtin_expect(have_avx2, 0) < 0) {                      \
 +            unsigned int eax, ebx, ecx, edx;                            \
 +            if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) {               \
@@ -105,16 +104,15 @@
 +               have_ssse3 = 0;                                          \
 +            }                                                           \
 +         }                                                              \
-+         if (have_avx2) return (void (*)(void))&name##_avx2;            \
-+         if (have_ssse3) return (void (*)(void))&name##_ssse3;          \
-+         return (void (*)(void))&name##_base;                           \
++         if (have_avx2) return &name##_avx2;                            \
++         if (have_ssse3) return &name##_ssse3;                          \
++         return &name##_base;                                           \
 +      }                                                                 \
 +   }                                                                    \
-+   type __attribute__((ifunc ("resolve_" #name))) name params
-+#define PCLMUL_RESOLVER(type,name,params)                               \
++   st type __attribute__((ifunc ("resolve_" #name))) name params
++#define PCLMUL_RESOLVER(st,type,name,params)                            \
 +   extern "C" {                                                         \
-+      static void __attribute__((optimize ("O0")))                      \
-+         (*resolve_##name (void))(void) {                               \
++      static type (*resolve_##name(void)) params {                      \
 +         if (__builtin_expect(have_pclmul, 0) < 0) {                    \
 +            unsigned int eax, ebx, ecx, edx;                            \
 +            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {               \
@@ -127,16 +125,15 @@
 +               have_fma = 0;                                            \
 +            }                                                           \
 +         }                                                              \
-+         if (have_avx) return (void (*)(void))&name##_avx;              \
-+         if (have_pclmul) return (void (*)(void))&name##_pclmul;        \
-+         return (void (*)(void))&name##_base;                           \
++         if (have_avx) return &name##_avx;                              \
++         if (have_pclmul) return &name##_pclmul;                        \
++         return &name##_base;                                           \
 +      }                                                                 \
 +   }                                                                    \
-+   type __attribute__((ifunc ("resolve_" #name))) name params
-+#define AVX_RESOLVER(type,name,params)                                  \
++   st type __attribute__((ifunc ("resolve_" #name))) name params
++#define AVX_RESOLVER(st,type,name,params)                               \
 +   extern "C" {                                                         \
-+      static void __attribute__((optimize ("O0")))                      \
-+         (*resolve_##name (void))(void) {                               \
++      static type (*resolve_##name(void)) params {                      \
 +         if (__builtin_expect(have_pclmul, 0) < 0) {                    \
 +            unsigned int eax, ebx, ecx, edx;                            \
 +            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {               \
@@ -149,16 +146,13 @@
 +               have_fma = 0;                                            \
 +            }                                                           \
 +         }                                                              \
-+         return have_avx                                                \
-+               ? (void (*)(void))&name##_avx                            \
-+               : (void (*)(void))&name##_base;                          \
++         return have_avx ? &name##_avx : &name##_base;                  \
 +      }                                                                 \
 +   }                                                                    \
-+   type __attribute__((ifunc ("resolve_" #name))) name params
-+#define FMA_RESOLVER(type,name,params)                                  \
++   st type __attribute__((ifunc ("resolve_" #name))) name params
++#define FMA_RESOLVER(st,type,name,params)                               \
 +   extern "C" {                                                         \
-+      static void __attribute__((optimize ("O0")))                      \
-+         (*resolve_##name (void))(void) {                               \
++      static type (*resolve_##name(void)) params {                      \
 +         if (__builtin_expect(have_pclmul, 0) < 0) {                    \
 +            unsigned int eax, ebx, ecx, edx;                            \
 +            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {               \
@@ -171,16 +165,13 @@
 +               have_fma = 0;                                            \
 +            }                                                           \
 +         }                                                              \
-+         return have_fma                                                \
-+               ? (void (*)(void))&name##_fma                            \
-+               : (void (*)(void))&name##_avx;                           \
++         return have_fma ? &name##_fma : &name##_avx;                   \
 +      }                                                                 \
 +   }                                                                    \
-+   type __attribute__((ifunc ("resolve_" #name))) name params
-+#define AVX2_RESOLVER(type,name,params)                                 \
++   st type __attribute__((ifunc ("resolve_" #name))) name params
++#define AVX2_RESOLVER(st,type,name,params)                              \
 +   extern "C" {                                                         \
-+      static void __attribute__((optimize ("O0")))                      \
-+         (*resolve_##name (void))(void) {                               \
++      static type (*resolve_##name(void)) params {                      \
 +         if (__builtin_expect(have_avx2, 0) < 0) {                      \
 +            unsigned int eax, ebx, ecx, edx;                            \
 +            if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) {               \
@@ -201,40 +192,16 @@
 +               have_fma = 0;                                            \
 +            }                                                           \
 +         }                                                              \
-+         return have_avx2                                               \
-+               ? (void (*)(void))&name##_avx2                           \
-+               : (void (*)(void))&name##_fma;                           \
++         return have_avx2 ? &name##_avx2 : &name##_fma;                 \
 +      }                                                                 \
 +   }                                                                    \
-+   type __attribute__((ifunc ("resolve_" #name))) name params
++   st type __attribute__((ifunc ("resolve_" #name))) name params
 +#endif
  
  #ifdef NTL_HAVE_BUILTIN_CLZL
  
---- include/NTL/def_config.h.orig	2017-07-07 09:05:14.000000000 -0600
-+++ include/NTL/def_config.h	2017-09-09 12:10:10.892839638 -0600
-@@ -525,6 +525,19 @@ using the configure script.
- #endif
- 
- 
-+#if 0
-+#define NTL_LOADTIME_CPU
-+
-+/*
-+ * With this flag enabled, detect advanced CPU features at load time instead
-+ * of at compile time.  This flag is intended for distributions, so that they
-+ * can compile for the lowest common denominator CPU, but still support newer
-+ * CPUs.
-+ *
-+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
-+ */
-+
-+#endif
- 
- 
- 
---- include/NTL/MatPrime.h.orig	2017-07-07 09:05:14.000000000 -0600
-+++ include/NTL/MatPrime.h	2017-09-09 12:10:10.892839638 -0600
+--- include/NTL/MatPrime.h.orig	2018-04-08 12:23:07.000000000 -0600
++++ include/NTL/MatPrime.h	2018-04-09 11:00:57.950157056 -0600
 @@ -20,7 +20,7 @@ NTL_OPEN_NNS
  
  
@@ -244,18 +211,18 @@
  #define NTL_MatPrime_NBITS (23)
  #else
  #define NTL_MatPrime_NBITS NTL_SP_NBITS
---- include/NTL/REPORT_ALL_FEATURES.h.orig	2017-07-07 09:05:15.000000000 -0600
-+++ include/NTL/REPORT_ALL_FEATURES.h	2017-09-09 12:11:10.313683979 -0600
-@@ -39,3 +39,6 @@
-    std::cerr << "NTL_HAVE_COPY_TRAITS2\n";
+--- include/NTL/REPORT_ALL_FEATURES.h.orig	2018-04-08 12:23:07.000000000 -0600
++++ include/NTL/REPORT_ALL_FEATURES.h	2018-04-09 11:00:57.951157054 -0600
+@@ -51,3 +51,6 @@
+    std::cerr << "NTL_HAVE_POSIX_TIME\n";
  #endif
  
 +#ifdef NTL_LOADTIME_CPU
 +   std::cerr << "NTL_LOADTIME_CPU\n";
 +#endif
---- src/cfile.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/cfile	2017-09-09 12:10:10.892839638 -0600
-@@ -480,6 +480,20 @@ using the configure script.
+--- src/cfile.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/cfile	2018-04-09 11:00:57.951157054 -0600
+@@ -471,6 +471,20 @@ to be defined.  Of course,  to unset a f
  #elif @{NTL_GF2X_ALTCODE1}
  #define NTL_GF2X_ALTCODE1
  
@@ -276,9 +243,9 @@
  
  /*
   * Yest another alternative strategy for implementing GF2X
---- src/DispSettings.cpp.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/DispSettings.cpp	2017-09-09 12:10:10.892839638 -0600
-@@ -168,6 +168,10 @@ cout << "Performance Options:\n";
+--- src/DispSettings.cpp.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/DispSettings.cpp	2018-04-09 11:00:57.952157051 -0600
+@@ -179,6 +179,10 @@ cout << "Performance Options:\n";
     cout << "NTL_GF2X_NOINLINE\n";
  #endif
  
@@ -289,18 +256,17 @@
  
     cout << "***************************/\n";
     cout << "\n\n";
---- src/DoConfig.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/DoConfig	2017-09-09 12:10:10.892839638 -0600
-@@ -1,7 +1,7 @@
+--- src/DoConfig.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/DoConfig	2018-04-09 11:03:15.877814660 -0600
+@@ -1,6 +1,7 @@
  # This is a perl script, invoked from a shell
  
- # use warnings;  # this doesn't work on older versions of perl
--
+ use warnings;  # this doesn't work on older versions of perl
 +use Config;
  
  
- %MakeFlag = (
-@@ -86,6 +86,7 @@
+ system("echo '*** CompilerOutput.log ***' > CompilerOutput.log");
+@@ -90,6 +91,7 @@ system("echo '*** CompilerOutput.log ***
  'NTL_GF2X_NOINLINE'       => 'off',
  'NTL_GF2X_ALTCODE'        => 'off',
  'NTL_GF2X_ALTCODE1'       => 'off',
@@ -308,9 +274,9 @@
  
  
  );
-@@ -195,6 +196,15 @@ if ($ConfigFlag{'NTL_THREAD_BOOST'} eq '
- }
+@@ -222,6 +224,15 @@ if ($MakeFlag{'SHARED'} eq 'off') {
  
+    }
  
 +# special processing: NTL_LOADTIME_CPU on x86/x86_64 only and => NTL_GF2X_NOINLINE
 +
@@ -322,11 +288,11 @@
 +}
 +
  
- # some special MakeVal values that are determined by SHARED
  
---- src/GF2X1.cpp.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/GF2X1.cpp	2017-09-09 12:10:10.893839636 -0600
-@@ -19,7 +19,7 @@
+ }
+--- src/GF2X1.cpp.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/GF2X1.cpp	2018-04-09 11:00:57.955157044 -0600
+@@ -18,7 +18,7 @@
  // simple scaling factor for some crossover points:
  // we use a lower crossover of the underlying multiplication
  // is faster  
@@ -335,9 +301,9 @@
  #define XOVER_SCALE (1L)
  #else
  #define XOVER_SCALE (2L)
---- src/GF2X.cpp.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/GF2X.cpp	2017-09-09 12:10:10.893839636 -0600
-@@ -28,6 +28,22 @@ pclmul_mul1 (unsigned long *c, unsigned
+--- src/GF2X.cpp.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/GF2X.cpp	2018-04-09 14:24:53.705349749 -0600
+@@ -27,6 +27,22 @@ pclmul_mul1 (unsigned long *c, unsigned
     _mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0));
  }
  
@@ -381,7 +347,7 @@
 +   pclmul_mul1(c, a, b);
 +}
 +
-+PCLMUL_RESOLVER(static void,mul1,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b));
++PCLMUL_RESOLVER(static,void,mul1,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b));
 +
 +#else
 +
@@ -440,7 +406,7 @@
 +   cp[sb] = carry;
 +}
 +
-+PCLMUL_RESOLVER(static void,Mul1,
++PCLMUL_RESOLVER(static,void,Mul1,
 +    (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a));
 +
 +#else
@@ -448,8 +414,8 @@
  static 
  void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
  {
-@@ -619,6 +702,53 @@ NTL_EFF_BB_MUL_CODE1
- 
+@@ -620,6 +703,53 @@ NTL_EFF_BB_MUL_CODE1
+ // warning #13200: No EMMS instruction before return
  }
  
 +#endif
@@ -494,7 +460,7 @@
 +   cp[sb] ^= carry;
 +}
 +
-+PCLMUL_RESOLVER(static void,AddMul1,
++PCLMUL_RESOLVER(static,void,AddMul1,
 +    (_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a));
 +
 +#else
@@ -502,7 +468,7 @@
  static 
  void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
  {
-@@ -647,6 +777,52 @@ NTL_EFF_BB_MUL_CODE2
+@@ -648,6 +778,52 @@ NTL_EFF_BB_MUL_CODE2
  
  }
  
@@ -548,15 +514,15 @@
 +   cp[sb] = carry;
 +}
 +
-+PCLMUL_RESOLVER(static void,Mul1_short,
++PCLMUL_RESOLVER(static,void,Mul1_short,
 +    (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a));
 +
 +#else
  
  static 
  void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
-@@ -675,10 +851,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1
- 
+@@ -677,10 +853,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1
+ // warning #13200: No EMMS instruction before return
  }
  
 +#endif
@@ -580,14 +546,14 @@
 +   pclmul_mul1(c, a, b);
 +}
 + 
-+PCLMUL_RESOLVER(static void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b));
++PCLMUL_RESOLVER(static,void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b));
 + 
 +#else
 + 
  static 
  void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
  {
-@@ -692,6 +889,7 @@ NTL_EFF_HALF_BB_MUL_CODE0
+@@ -694,6 +891,7 @@ NTL_EFF_HALF_BB_MUL_CODE0
  
  }
  
@@ -595,7 +561,7 @@
  
  // mul2...mul8 hard-code 2x2...8x8 word multiplies.
  // I adapted these routines from LiDIA (except mul3, see below).
-@@ -1603,6 +1801,77 @@ static const _ntl_ulong sqrtab[256] = {
+@@ -1611,6 +1809,77 @@ static const _ntl_ulong sqrtab[256] = {
  
  
  
@@ -666,14 +632,14 @@
 +   return;
 +}
 +
-+PCLMUL_RESOLVER(void,sqr,(GF2X& c, const GF2X& a));
++PCLMUL_RESOLVER(,void,sqr,(GF2X& c, const GF2X& a));
 +
 +#else
 +
  static inline
  void sqr1(_ntl_ulong *c, _ntl_ulong a)
  {
-@@ -1643,6 +1912,7 @@ void sqr(GF2X& c, const GF2X& a)
+@@ -1651,6 +1920,7 @@ void sqr(GF2X& c, const GF2X& a)
     return;
  }
  
@@ -681,9 +647,9 @@
  
  
  void LeftShift(GF2X& c, const GF2X& a, long n)
---- src/InitSettings.cpp.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/InitSettings.cpp	2017-09-09 12:10:10.894839633 -0600
-@@ -166,6 +166,11 @@ int main()
+--- src/InitSettings.cpp.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/InitSettings.cpp	2018-04-09 11:00:57.956157041 -0600
+@@ -172,6 +172,11 @@ int main()
     cout << "NTL_RANGE_CHECK=0\n";
  #endif
  
@@ -695,9 +661,9 @@
  
  // the following are not actual config flags, but help
  // in the Wizard logic
---- src/mat_lzz_p.cpp.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/mat_lzz_p.cpp	2017-09-09 12:10:10.895839630 -0600
-@@ -10,6 +10,15 @@
+--- src/mat_lzz_p.cpp.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/mat_lzz_p.cpp	2018-04-09 15:00:11.078401207 -0600
+@@ -9,6 +9,15 @@
  
  #ifdef NTL_HAVE_AVX
  #include <immintrin.h>
@@ -713,7 +679,7 @@
  #endif
  
  NTL_START_IMPL
-@@ -626,7 +635,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
+@@ -625,7 +634,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
  
  #ifdef NTL_HAVE_LL_TYPE
  
@@ -722,7 +688,7 @@
  
  #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1)
  // max int representable exactly as a double
-@@ -640,18 +649,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
+@@ -639,18 +648,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
  
  
  // MUL_ADD(a, b, c): a += b*c
@@ -755,7 +721,7 @@
 +
 +   long i = 0;
 +   for (; i <= n-4; i +=4) {
- 
++
 +      // the following code sequences are a bit faster than
 +      // just doing 4 _mm256_broadcast_sd's
 +      // it requires a to point to aligned storage, however
@@ -770,7 +736,7 @@
 +      __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11);
 +
 +#endif
- 
++
 +      __m256d avec0 = _mm256_permute_pd(a0101, 0);
 +      __m256d avec1 = _mm256_permute_pd(a0101, 0xf);
 +      __m256d avec2 = _mm256_permute_pd(a2323, 0);
@@ -788,7 +754,7 @@
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec0, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec0, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec0, bvec);
-+
+ 
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec1, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec1, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec1, bvec);
@@ -797,7 +763,7 @@
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec1, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec1, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec1, bvec);
-+
+ 
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec2, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec2, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec2, bvec);
@@ -848,7 +814,7 @@
     __m256d acc0=_mm256_load_pd(x + 0*4);
     __m256d acc1=_mm256_load_pd(x + 1*4);
     __m256d acc2=_mm256_load_pd(x + 2*4);
-@@ -661,10 +772,179 @@ void muladd1_by_32(double *x, const doub
+@@ -660,10 +771,179 @@ void muladd1_by_32(double *x, const doub
     __m256d acc6=_mm256_load_pd(x + 6*4);
     __m256d acc7=_mm256_load_pd(x + 7*4);
  
@@ -860,7 +826,7 @@
 +      // the following code sequences are a bit faster than
 +      // just doing 4 _mm256_broadcast_sd's
 +      // it requires a to point to aligned storage, however
- 
++
 +#if 1
 +     // this one seems slightly faster
 +      __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0));
@@ -943,7 +909,7 @@
 +   _mm256_store_pd(x + 7*4, acc7);
 +}
 +
-+FMA_RESOLVER(static void,muladd1_by_32,
++FMA_RESOLVER(static,void,muladd1_by_32,
 +   (double *x, const double *a, const double *b, long n));
 +
 +#else
@@ -995,7 +961,7 @@
 +      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec0, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec0, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec0, bvec);
-+
+ 
 +      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec1, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec1, bvec);
 +      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec1, bvec);
@@ -1030,7 +996,7 @@
  
        bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
        bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
-@@ -687,6 +967,75 @@ void muladd1_by_32(double *x, const doub
+@@ -686,6 +966,75 @@ void muladd1_by_32(double *x, const doub
     _mm256_store_pd(x + 7*4, acc7);
  }
  
@@ -1098,7 +1064,7 @@
 +   _mm256_store_pd(x + 3*4, acc3);
 +}
 +
-+FMA_RESOLVER(static void,muladd1_by_16,
++FMA_RESOLVER(static,void,muladd1_by_16,
 +   (double *x, const double *a, const double *b, long n));
 +
 +#else
@@ -1106,7 +1072,7 @@
  static
  void muladd1_by_16(double *x, const double *a, const double *b, long n)
  {
-@@ -717,6 +1066,165 @@ void muladd1_by_16(double *x, const doub
+@@ -716,6 +1065,165 @@ void muladd1_by_16(double *x, const doub
     _mm256_store_pd(x + 3*4, acc3);
  }
  
@@ -1265,14 +1231,14 @@
 +
 +}
 +
-+FMA_RESOLVER(static void,muladd2_by_32,
++FMA_RESOLVER(static,void,muladd2_by_32,
 +   (double *x, const double *a, const double *b, long n));
 +
 +#else
  
  // experiment: process two rows at a time
  static
-@@ -795,6 +1303,211 @@ void muladd2_by_32(double *x, const doub
+@@ -794,6 +1302,211 @@ void muladd2_by_32(double *x, const doub
  
  }
  
@@ -1477,14 +1443,14 @@
 +
 +}
 +
-+AVX2_RESOLVER(static void,muladd3_by_32,
++AVX2_RESOLVER(static,void,muladd3_by_32,
 +   (double *x, const double *a, const double *b, long n));
 +
 +#else
  
  // experiment: process three rows at a time
  // NOTE: this makes things slower on an AVX1 platform --- not enough registers
-@@ -899,8 +1612,10 @@ void muladd3_by_32(double *x, const doub
+@@ -898,8 +1611,10 @@ void muladd3_by_32(double *x, const doub
  
  }
  
@@ -1497,7 +1463,7 @@
  {
     __m256d avec0, avec1, bvec;
     __m256d acc00, acc01, acc02, acc03;
-@@ -923,10 +1638,10 @@ void muladd2_by_16(double *x, const doub
+@@ -922,10 +1637,10 @@ void muladd2_by_16(double *x, const doub
        avec0 = _mm256_broadcast_sd(&a[i]); 
        avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
  
@@ -1512,7 +1478,7 @@
     }
  
  
-@@ -942,8 +1657,8 @@ void muladd2_by_16(double *x, const doub
+@@ -941,8 +1656,8 @@ void muladd2_by_16(double *x, const doub
  
  }
  
@@ -1523,7 +1489,7 @@
  {
     __m256d avec0, avec1, avec2, bvec;
     __m256d acc00, acc01, acc02, acc03;
-@@ -973,10 +1688,10 @@ void muladd3_by_16(double *x, const doub
+@@ -972,10 +1687,10 @@ void muladd3_by_16(double *x, const doub
        avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
        avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); 
  
@@ -1538,7 +1504,7 @@
     }
  
  
-@@ -997,6 +1712,30 @@ void muladd3_by_16(double *x, const doub
+@@ -996,6 +1711,30 @@ void muladd3_by_16(double *x, const doub
  
  }
  
@@ -1569,7 +1535,7 @@
  static inline
  void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
  {
-@@ -1016,6 +1755,30 @@ void muladd_all_by_32(long first, long l
+@@ -1015,6 +1754,30 @@ void muladd_all_by_32(long first, long l
  #endif
  }
  
@@ -1600,7 +1566,7 @@
  
  static inline
  void muladd_all_by_16(long first, long last, double *x, const double *a, const double *b, long n)
-@@ -1036,6 +1799,8 @@ void muladd_all_by_16(long first, long l
+@@ -1035,6 +1798,8 @@ void muladd_all_by_16(long first, long l
  #endif
  }
  
@@ -1609,12 +1575,11 @@
  static inline
  void muladd_all_by_32_width(long first, long last, double *x, const double *a, const double *b, long n, long width)
  {
-@@ -1045,7 +1810,74 @@ void muladd_all_by_32_width(long first,
-       muladd_all_by_16(first, last, x, a, b, n);
- }
+@@ -1050,6 +1815,72 @@ void muladd_all_by_32_width(long first,
  
+ 
+ // this assumes n is a multiple of 16
 +#ifdef NTL_LOADTIME_CPU
-+
 +AVX_FUNC(void,muladd_interval)
 +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
 +{
@@ -1651,7 +1616,7 @@
 +{
 +   __m256d xvec0, xvec1, xvec2, xvec3;
 +   __m256d yvec0, yvec1, yvec2, yvec3;
- 
++
 +   __m256d cvec = _mm256_broadcast_sd(&c);
 +
 +   for (long i = 0; i < n; i += 16, x += 16, y += 16) {
@@ -1677,21 +1642,19 @@
 +   }
 +}
 +
-+FMA_RESOLVER(static void,muladd_interval,
++FMA_RESOLVER(static,void,muladd_interval,
 +   (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n));
-+
 +#else
- 
- // this assumes n is a multiple of 16
  static inline
-@@ -1079,6 +1911,107 @@ void muladd_interval(double * NTL_RESTRI
+ void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+ {
+@@ -1080,7 +1911,105 @@ void muladd_interval(double * NTL_RESTRI
+       _mm256_store_pd(x + 3*4, xvec3);
     }
  }
- 
 +#endif
 +
 +#ifdef NTL_LOADTIME_CPU
-+
 +AVX_FUNC(void,muladd_interval1)
 +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
 +{
@@ -1741,7 +1704,7 @@
 +FMA_FUNC(void,muladd_interval1)
 +(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
 +{
-+
+ 
 +   __m256d xvec0, xvec1, xvec2, xvec3;
 +   __m256d yvec0, yvec1, yvec2, yvec3;
 +   __m256d cvec;
@@ -1784,98 +1747,22 @@
 +   }
 +}
 +
-+FMA_RESOLVER(static void,muladd_interval1,
++FMA_RESOLVER(static,void,muladd_interval1,
 +   (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n));
 +
 +#else
-+
  // this one is more general: does not assume that n is a
  // multiple of 16
  static inline
-@@ -1127,8 +2060,73 @@ void muladd_interval1(double * NTL_RESTR
-    }
- }
+@@ -1131,6 +2060,7 @@ void muladd_interval1(double * NTL_RESTR
  
-+#endif
-+
- #define AVX_PD_SZ (4)
- 
-+#ifdef NTL_LOADTIME_CPU
-+
-+AVX_FUNC(void,muladd_interval2)
-+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
-+{
-+   n /= 4;
-+   if (n <= 0 || n > 8) return;
-+
-+   x += n*4;
-+   y += n*4;
-+
-+   // n in [1..8]
-+
-+   __m256d xvec, yvec, cvec;
-+
-+   cvec = _mm256_broadcast_sd(&c);
-+
-+   switch (n) {
-+   case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec);
-+   case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec);
-+   case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec);
-+   case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec);
-+   case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec);
-+   case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec);
-+   case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec);
-+   case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec);
-+   }
-+   
-+}
-+
-+FMA_FUNC(void,muladd_interval2)
-+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
-+{
-+   n /= 4;
-+   if (n <= 0 || n > 8) return;
-+
-+   x += n*4;
-+   y += n*4;
-+
-+   // n in [1..8]
-+
-+   __m256d xvec, yvec, cvec;
-+
-+   cvec = _mm256_broadcast_sd(&c);
-+
-+   switch (n) {
-+   case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec);
-+   case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec);
-+   case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec);
-+   case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec);
-+   case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec);
-+   case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec);
-+   case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec);
-+   case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec);
-+   }
-+   
-+}
-+
-+FMA_RESOLVER(static void,muladd_interval2,
-+   (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n));
-+
-+#else
-+
- // experimental: assumes n is a multiple of 4 in the range [0..32]
- #if 1
- static inline
-@@ -1169,6 +2167,8 @@ void muladd_interval2(double * NTL_RESTR
  
  #endif
- 
 +#endif
-+
  
- #define DO_MUL(a, b) ((unsigned long) (long(a)*long(b)))
  
-@@ -2743,10 +3743,10 @@ void alt_mul_LL(const mat_window_zz_p& X
+ //#define DO_MUL(a, b) ((unsigned long) (long(a)*long(b)))
+@@ -2716,10 +3646,10 @@ void alt_mul_LL(const mat_window_zz_p& X
  }  
  
  
@@ -1889,7 +1776,7 @@
                  const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)  
  {  
     long n = A.NumRows();  
-@@ -3085,12 +4085,13 @@ void mul_base (const mat_window_zz_p& X,
+@@ -3058,12 +3988,13 @@ void mul_base (const mat_window_zz_p& X,
     long p = zz_p::modulus();
     long V = MAT_BLK_SZ*4;
  
@@ -1905,7 +1792,7 @@
         p-1 <= MAX_DBL_INT &&
         V <= (MAX_DBL_INT-(p-1))/(p-1) &&
         V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) 
-@@ -3185,7 +4186,8 @@ void mul_strassen(const mat_window_zz_p&
+@@ -3158,7 +4089,8 @@ void mul_strassen(const mat_window_zz_p&
      // this code determines if mul_base triggers blk_mul_DD,
      // in which case a higher crossover is used
  
@@ -1915,7 +1802,7 @@
      {
         long V = MAT_BLK_SZ*4;
         long p = zz_p::modulus();
-@@ -3685,10 +4687,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con
+@@ -3658,10 +4590,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con
  
  
  
@@ -1929,7 +1816,7 @@
  {
     long n = A.NumRows();
  
-@@ -3854,10 +4856,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co
+@@ -3827,10 +4759,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co
  
  
  
@@ -1943,7 +1830,7 @@
  {
     long n = A.NumRows();
  
-@@ -4615,8 +5617,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
+@@ -4588,8 +5520,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
     else if (n/MAT_BLK_SZ < 4) {
        long V = 64;
  
@@ -1955,7 +1842,7 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
-@@ -4641,8 +5644,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
+@@ -4614,8 +5547,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
     else {
        long V = 4*MAT_BLK_SZ;
  
@@ -1967,7 +1854,7 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
-@@ -5048,10 +6052,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p&
+@@ -5021,10 +5955,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p&
  
  
  
@@ -1981,7 +1868,7 @@
                 vec_zz_p *xp, bool trans, bool relax)
  {
     long n = A.NumRows();
-@@ -5238,10 +6242,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p&
+@@ -5211,10 +6145,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p&
  
  
  
@@ -1995,7 +1882,7 @@
                 vec_zz_p *xp, bool trans, bool relax)
  {
     long n = A.NumRows();
-@@ -6052,8 +7056,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
+@@ -6025,8 +6959,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
     else if (n/MAT_BLK_SZ < 4) {
        long V = 64;
  
@@ -2007,7 +1894,7 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
-@@ -6078,8 +7083,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
+@@ -6051,8 +6986,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
     else {
        long V = 4*MAT_BLK_SZ;
  
@@ -2019,7 +1906,7 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
-@@ -6325,7 +7331,7 @@ long elim_basic(const mat_zz_p& A, mat_z
+@@ -6298,7 +7234,7 @@ long elim_basic(const mat_zz_p& A, mat_z
  #ifdef NTL_HAVE_LL_TYPE
  
  
@@ -2028,7 +1915,7 @@
  
  
  static inline
-@@ -7778,8 +8784,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
+@@ -7751,8 +8687,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
     else {
        long V = 4*MAT_BLK_SZ;
  
@@ -2040,9 +1927,9 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
---- src/QuickTest.cpp.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/QuickTest.cpp	2017-09-09 12:10:10.895839630 -0600
-@@ -310,6 +310,9 @@ cerr << "Performance Options:\n";
+--- src/QuickTest.cpp.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/QuickTest.cpp	2018-04-09 11:00:57.958157036 -0600
+@@ -316,6 +316,9 @@ cerr << "Performance Options:\n";
     cerr << "NTL_GF2X_NOINLINE\n";
  #endif
  
@@ -2052,8 +1939,8 @@
  
     cerr << "\n\n";
  
---- src/WizardAux.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/WizardAux	2017-09-09 12:10:10.895839630 -0600
+--- src/WizardAux.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/WizardAux	2018-04-09 11:00:57.958157036 -0600
 @@ -89,6 +89,7 @@ system("$ARGV[0] InitSettings");
  'NTL_GF2X_NOINLINE'    => 0,
  'NTL_FFT_BIGTAB'       => 0,
@@ -2062,9 +1949,9 @@
  
  'WIZARD_HACK'          => '#define NTL_WIZARD_HACK',
  
---- src/ZZ.cpp.orig	2017-07-07 09:05:14.000000000 -0600
-+++ src/ZZ.cpp	2017-09-18 18:41:27.125503871 -0600
-@@ -12,6 +12,13 @@
+--- src/ZZ.cpp.orig	2018-04-08 12:23:06.000000000 -0600
++++ src/ZZ.cpp	2018-04-09 14:35:45.184455758 -0600
+@@ -14,6 +14,13 @@
  #elif defined(NTL_HAVE_SSSE3)
  #include <emmintrin.h>
  #include <tmmintrin.h>
@@ -2078,7 +1965,7 @@
  #endif
  
  
-@@ -2106,6 +2113,481 @@ struct RandomStream_impl {
+@@ -2351,6 +2358,591 @@ struct RandomStream_impl {
  };
  
  
@@ -2101,6 +1988,9 @@
 +#define SSSE3_START  _mm_setzero_si128()
 +#define AVX2_START   _mm256_set_epi64x(0,1,0,0)
 +
++#define SSSE3_NONCE(nonce)	_mm_set_epi64x(nonce,0)
++#define AVX2_NONCE(nonce)	_mm256_set_epi64x(nonce, 1, nonce, 0)
++
 +#define SSSE3_STOREU_VEC(m,r)	_mm_storeu_si128((__m128i*)(m), r)
 +#define AVX2_STOREU_VEC(m,r)	_mm256_storeu_si256((__m256i*)(m), r)
 +
@@ -2168,11 +2058,11 @@
 +#define SSSE3_SZ_VEC (16)
 +#define AVX2_SZ_VEC (32)
 +
-+#define SSSE3_RANSTREAM_BUFSZ (1024)
-+// must be a multiple of 8*SSE3_SZ_VEC
++#define SSSE3_RANSTREAM_NCHUNKS (4)
++// leads to a BUFSZ of 512
 +
-+#define AVX2_RANSTREAM_BUFSZ (1024)
-+// must be a multiple of 8*AVX2_SZ_VEC
++#define AVX2_RANSTREAM_NCHUNKS (2)
++// leads to a BUFSZ of 512
 +
 +#define SSSE3_DQROUND_VECTORS_VEC(a,b,c,d)				\
 +  a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_16(d); \
@@ -2199,7 +2089,13 @@
 +  b = AVX2_ROR_VEC_V3(b); c = AVX2_ROR_VEC_V2(c); d = AVX2_ROR_VEC_V1(d);
 +
 +#define SSSE3_RANSTREAM_STATESZ (4*SSSE3_SZ_VEC)
-+#define AVX2_RANSTREAM_STATESZ (4*AVX2_SZ_VEC)
++#define AVX2_RANSTREAM_STATESZ  (4*AVX2_SZ_VEC)
++
++#define SSSE3_RANSTREAM_CHUNKSZ (2*SSSE3_RANSTREAM_STATESZ)
++#define AVX2_RANSTREAM_CHUNKSZ  (2*AVX2_RANSTREAM_STATESZ)
++
++#define SSSE3_RANSTREAM_BUFSZ  (SSSE3_RANSTREAM_NCHUNKS*SSSE3_RANSTREAM_CHUNKSZ)
++#define AVX2_RANSTREAM_BUFSZ   (AVX2_RANSTREAM_NCHUNKS*AVX2_RANSTREAM_CHUNKSZ)
 +
 +static void allocate_space(AlignedArray<unsigned char> &state_store,
 +			   AlignedArray<unsigned char> &buf_store)
@@ -2276,7 +2172,7 @@
 +  AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3);
 +}
 +
-+SSSE3_RESOLVER(static void, randomstream_impl_init,
++SSSE3_RESOLVER(static, void, randomstream_impl_init,
 +  (_ntl_uint32 *state, AlignedArray<unsigned char> &state_store,
 +   AlignedArray<unsigned char> &buf_store, const unsigned char *key));
 +
@@ -2285,6 +2181,7 @@
 + unsigned char *buf,
 + AlignedArray<unsigned char> &state_store __attribute__((unused)),
 + AlignedArray<unsigned char> &buf_store __attribute__((unused)),
++ long &chunk_count __attribute__((unused)),
 + unsigned char *NTL_RESTRICT res,
 + long n,
 + long pos)
@@ -2333,6 +2230,7 @@
 + unsigned char *buf_ignored __attribute__((unused)),
 + AlignedArray<unsigned char> &state_store,
 + AlignedArray<unsigned char> &buf_store,
++ long &chunk_count,
 + unsigned char *NTL_RESTRICT res,
 + long n,
 + long pos)
@@ -2364,8 +2262,9 @@
 +
 +   long i = 0;
 +   for (;  i <= n-SSSE3_RANSTREAM_BUFSZ; i += SSSE3_RANSTREAM_BUFSZ) {
++      chunk_count |= SSSE3_RANSTREAM_NCHUNKS;  // disable small buffer strategy
 +
-+      for (long j = 0; j < SSSE3_RANSTREAM_BUFSZ/(8*SSSE3_SZ_VEC); j++) {
++      for (long j = 0; j < SSSE3_RANSTREAM_NCHUNKS; j++) {
 +	 ssse3_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
 +	 ssse3_ivec_t v4=d0, v5=d1, v6=d2, v7=SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
 +
@@ -2383,7 +2282,20 @@
 +   }
 +
 +   if (i < n) {
-+      for (long j = 0; j < SSSE3_RANSTREAM_BUFSZ/(8*SSSE3_SZ_VEC); j++) {
++      
++      long nchunks;
++
++      if (chunk_count < SSSE3_RANSTREAM_NCHUNKS) {
++         nchunks = long(cast_unsigned((n-i)+SSSE3_RANSTREAM_CHUNKSZ-1)/SSSE3_RANSTREAM_CHUNKSZ);
++         chunk_count += nchunks;
++      }
++      else
++         nchunks = SSSE3_RANSTREAM_NCHUNKS;
++
++      long pos_offset = SSSE3_RANSTREAM_BUFSZ - nchunks*SSSE3_RANSTREAM_CHUNKSZ;
++      buf += pos_offset;
++
++      for (long j = 0; j < nchunks; j++) {
 +	 ssse3_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
 +	 ssse3_ivec_t v4=d0, v5=d1, v6=d2, v7=SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
 +
@@ -2398,8 +2310,8 @@
 +	 d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
 +      }
 +
-+      pos = n-i;
-+      std::memcpy(&res[i], &buf[0], pos);
++      pos = n-i+pos_offset;
++      std::memcpy(&res[i], &buf[0], n-i);
 +   }
 +
 +   SSSE3_STORE_VEC(state + 3*SSSE3_SZ_VEC, d3);
@@ -2412,6 +2324,7 @@
 + unsigned char *buf_ignored __attribute__((unused)),
 + AlignedArray<unsigned char> &state_store,
 + AlignedArray<unsigned char> &buf_store,
++ long &chunk_count,
 + unsigned char *NTL_RESTRICT res,
 + long n,
 + long pos)
@@ -2443,8 +2356,9 @@
 +
 +   long i = 0;
 +   for (;  i <= n-AVX2_RANSTREAM_BUFSZ; i += AVX2_RANSTREAM_BUFSZ) {
++      chunk_count |= AVX2_RANSTREAM_NCHUNKS;  // disable small buffer strategy
 +
-+      for (long j = 0; j < AVX2_RANSTREAM_BUFSZ/(8*AVX2_SZ_VEC); j++) {
++      for (long j = 0; j < AVX2_RANSTREAM_NCHUNKS; j++) {
 +	 avx2_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
 +	 avx2_ivec_t v4=d0, v5=d1, v6=d2, v7=AVX2_ADD_VEC_64(d3, AVX2_DELTA);
 +
@@ -2462,7 +2376,20 @@
 +   }
 +
 +   if (i < n) {
-+      for (long j = 0; j < AVX2_RANSTREAM_BUFSZ/(8*AVX2_SZ_VEC); j++) {
++
++      long nchunks;
++
++      if (chunk_count < AVX2_RANSTREAM_NCHUNKS) {
++         nchunks = long(cast_unsigned((n-i)+AVX2_RANSTREAM_CHUNKSZ-1)/AVX2_RANSTREAM_CHUNKSZ);
++         chunk_count += nchunks;
++      }
++      else
++         nchunks = AVX2_RANSTREAM_NCHUNKS;
++
++      long pos_offset = AVX2_RANSTREAM_BUFSZ - nchunks*AVX2_RANSTREAM_CHUNKSZ;
++      buf += pos_offset;
++
++      for (long j = 0; j < nchunks; j++) {
 +	 avx2_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
 +	 avx2_ivec_t v4=d0, v5=d1, v6=d2, v7=AVX2_ADD_VEC_64(d3, AVX2_DELTA);
 +
@@ -2477,8 +2404,8 @@
 +	 d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA);
 +      }
 +
-+      pos = n-i;
-+      std::memcpy(&res[i], &buf[0], pos);
++      pos = n-i+pos_offset;
++      std::memcpy(&res[i], &buf[0], n-i);
 +   }
 +
 +   AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3);
@@ -2486,17 +2413,75 @@
 +   return pos;
 +}
 +
-+SSSE3_RESOLVER(static long, randomstream_get_bytes,
++SSSE3_RESOLVER(static, long, randomstream_get_bytes,
 +  (_ntl_uint32 *state, unsigned char *buf,
 +   AlignedArray<unsigned char> &state_store,
 +   AlignedArray<unsigned char> &buf_store,
++   long &chunk_count,
 +   unsigned char *NTL_RESTRICT res,
 +   long n,
 +   long pos));
 +
++BASE_FUNC(void, randomstream_set_nonce)
++(_ntl_uint32 *state,
++ AlignedArray<unsigned char> &state_store __attribute__((unused)),
++ long &chunk_count __attribute__((unused)),
++ unsigned long nonce)
++{
++   _ntl_uint32 nonce0, nonce1;
++
++   nonce0 = nonce;
++   nonce0 = INT32MASK(nonce0);
++
++   nonce1 = 0;
++
++#if (NTL_BITS_PER_LONG > 32)
++   nonce1 = nonce >> 32;
++   nonce1 = INT32MASK(nonce1);
++#endif
++
++   state[12] = 0;
++   state[13] = 0;
++   state[14] = nonce0;
++   state[15] = nonce1;
++}
++
++SSSE3_FUNC(void, randomstream_set_nonce)
++(_ntl_uint32 *state_ignored __attribute__((unused)),
++ AlignedArray<unsigned char> &state_store,
++ long &chunk_count,
++ unsigned long nonce)
++{
++   unsigned char *state = state_store.elts();
++   ssse3_ivec_t d3;
++   d3 = SSSE3_NONCE(nonce);
++   SSSE3_STORE_VEC(state + 3*SSSE3_SZ_VEC, d3);
++   chunk_count = 0;
++}
++
++AVX2_FUNC(void, randomstream_set_nonce)
++(_ntl_uint32 *state_ignored __attribute__((unused)),
++ AlignedArray<unsigned char> &state_store,
++ long &chunk_count,
++ unsigned long nonce)
++{
++   unsigned char *state = state_store.elts();
++   avx2_ivec_t d3;
++   d3 = AVX2_NONCE(nonce);
++   AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3);
++   chunk_count = 0;
++}
++
++SSSE3_RESOLVER(, void, randomstream_set_nonce,
++  (_ntl_uint32 *state,
++   AlignedArray<unsigned char> &state_store,
++   long &chunk_count,
++   unsigned long nonce));
++
 +struct RandomStream_impl {
 +   AlignedArray<unsigned char> state_store;
 +   AlignedArray<unsigned char> buf_store;
++   long chunk_count;
 +   _ntl_uint32 state[16];
 +   unsigned char buf[64];
 +
@@ -2504,6 +2489,7 @@
 +   RandomStream_impl(const unsigned char *key)
 +   {
 +     randomstream_impl_init(state, state_store, buf_store, key);
++     chunk_count = 0;
 +   }
 +
 +   RandomStream_impl(const RandomStream_impl& other)
@@ -2523,6 +2509,7 @@
 +         std::memcpy(state_store.elts(), other.state_store.elts(), SSSE3_RANSTREAM_STATESZ);
 +         std::memcpy(buf_store.elts(), other.buf_store.elts(), SSSE3_RANSTREAM_BUFSZ);
 +      }
++      chunk_count = other.chunk_count;
 +      return *this;
 +   }
 +
@@ -2548,12 +2535,22 @@
 +      }
 +   }
 +
++   // bytes are generated in chunks of RANSTREAM_BUFSZ bytes, except that
++   // initially, we may generate a few chunks of RANSTREAM_CHUNKSZ
++   // bytes.  This optimizes a bit for short bursts following a reset.
++
 +   long
 +   get_bytes(unsigned char *NTL_RESTRICT res,
 +             long n, long pos)
 +   {
 +      return randomstream_get_bytes(state, buf, state_store, buf_store,
-+				    res, n, pos);
++				    chunk_count, res, n, pos);
++   }
++
++   void
++   set_nonce(unsigned long nonce)
++   {
++      randomstream_set_nonce(state, state_store, chunk_count, nonce);
 +   }
 +};
 +
diff --git a/ntl.spec b/ntl.spec
index 9f7ee7c..d7ea354 100644
--- a/ntl.spec
+++ b/ntl.spec
@@ -10,8 +10,8 @@
 
 Summary: High-performance algorithms for vectors, matrices, and polynomials 
 Name:    ntl 
-Version: 10.5.0
-Release: 2%{?dist}
+Version: 11.0.0
+Release: 1%{?dist}
 
 License: LGPLv2+
 URL:     http://shoup.net/ntl/ 
@@ -70,9 +70,6 @@ Requires: %{name}-devel%{?_isa} = %{version}-%{release}
 
 %build
 pushd src
-# We eventually want to set NTL_STD_CXX14=on and NTL_SAFE_VECTORS=on, but that
-# involves a change in semantics to vector and matrix assignment operations
-# that latte-integrale, at least, is not yet prepared for.
 ./configure \
   CXX="${CXX-g++}" \
   CXXFLAGS="%{optflags} -fPIC" \
@@ -81,9 +78,10 @@ pushd src
   DOCDIR=%{_docdir} \
   INCLUDEDIR=%{_includedir} \
   LIBDIR=%{_libdir} \
+  LDLIBS="-lpthread -lm" \
   NATIVE=off \
   %{?gf2x:NTL_GF2X_LIB=on} \
-  NTL_DISABLE_TLS_HACK=on \
+  NTL_STD_CXX14=on \
 %ifarch x86_64
   NTL_LOADTIME_CPU=on \
   TUNE=x86 \
@@ -135,13 +133,12 @@ done
 %endif
 
 
-%post -p /sbin/ldconfig
-%postun -p /sbin/ldconfig
+%ldconfig_scriptlets
 
 %files
 %doc README
 %license doc/copying.txt
-%{_libdir}/libntl.so.35*
+%{_libdir}/libntl.so.36*
 
 %files devel 
 %doc doc/*
@@ -155,6 +152,9 @@ done
 
 
 %changelog
+* Sat Jun  2 2018 Jerry James <loganjerry@gmail.com> - 11.0.0-1
+- ntl-11.0.0
+
 * Thu Feb 08 2018 Fedora Release Engineering <releng@fedoraproject.org> - 10.5.0-2
 - Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild
 
diff --git a/sources b/sources
index a858e08..ce0b963 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-SHA512 (ntl-10.5.0.tar.gz) = b299dfc29005079470972c2a9ca02acd0ebdbc31ff8923df02f3627dbc66daa0f527226972cef032e1e488c4272554634a96456e94653fdf8b01356160319aa0
+SHA512 (ntl-11.0.0.tar.gz) = 495a07db6ac92ec41b9c660e53d2a714f635f042c48453c59fc6524ee3e64f4ca52878bd2b96e1a21b1c1a39bdb68314ca5b3fc36c75e7d616a33c6d8c7f8e5a