ntl-10.5.0

This commit is contained in:
Jerry James 2017-09-28 18:37:10 -06:00
parent 2828a242e4
commit 7e405f74ed
4 changed files with 603 additions and 59 deletions

1
.gitignore vendored
View File

@ -8,3 +8,4 @@
/ntl-9.11.0.tar.gz
/ntl-10.1.0.tar.gz
/ntl-10.3.0.tar.gz
/ntl-10.5.0.tar.gz

View File

@ -1,6 +1,6 @@
--- doc/config.txt.orig 2016-11-18 11:39:17.000000000 -0700
+++ doc/config.txt 2017-01-12 15:07:36.859377026 -0700
@@ -300,6 +300,7 @@ NTL_AVOID_BRANCHING=off
--- doc/config.txt.orig 2017-07-07 09:05:14.000000000 -0600
+++ doc/config.txt 2017-09-09 12:10:10.877839678 -0600
@@ -337,6 +337,7 @@ NTL_AVOID_BRANCHING=off
NTL_GF2X_NOINLINE=off
NTL_GF2X_ALTCODE=off
NTL_GF2X_ALTCODE1=off
@ -8,7 +8,7 @@
GMP_INCDIR=$(GMP_PREFIX)/include
GMP_LIBDIR=$(GMP_PREFIX)/lib
@@ -597,6 +598,10 @@ NTL_GF2X_ALTCODE1=off
@@ -634,6 +635,10 @@ NTL_GF2X_ALTCODE1=off
# Yet another alternative implementation for GF2X multiplication.
@ -19,9 +19,9 @@
########## More GMP Options:
--- include/NTL/config.h.orig 2016-11-18 11:39:17.000000000 -0700
+++ include/NTL/config.h 2017-01-12 15:07:36.860377023 -0700
@@ -475,6 +475,20 @@ using the configure script.
--- include/NTL/config.h.orig 2017-07-07 09:05:14.000000000 -0600
+++ include/NTL/config.h 2017-09-09 12:10:10.891839641 -0600
@@ -525,6 +525,20 @@ using the configure script.
#endif
@ -42,9 +42,9 @@
--- include/NTL/ctools.h.orig 2016-11-18 11:39:16.000000000 -0700
+++ include/NTL/ctools.h 2017-01-12 15:07:36.861377020 -0700
@@ -447,6 +447,136 @@ char *_ntl_make_aligned(char *p, long al
--- include/NTL/ctools.h.orig 2017-07-07 09:05:14.000000000 -0600
+++ include/NTL/ctools.h 2017-09-09 13:36:16.915768457 -0600
@@ -498,6 +498,166 @@ char *_ntl_make_aligned(char *p, long al
// and it should also be as big as a cache line
@ -61,6 +61,9 @@
+#endif
+
+#include <cpuid.h>
+#ifndef bit_SSSE3
+#define bit_SSSE3 (1 << 9)
+#endif
+#ifndef bit_PCLMUL
+#define bit_PCLMUL (1 << 1)
+#endif
@ -77,10 +80,37 @@
+#define BASE_FUNC(type,name) static type name##_base
+#define TARGET_FUNC(arch,suffix,type,name) \
+ static type __attribute__((target (arch))) name##_##suffix
+#define PCLMUL_FUNC(type,name) TARGET_FUNC("pclmul",pclmul,type,name)
+#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul",avx,type,name)
+#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul",fma,type,name)
+#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul",avx2,type,name)
+#define SSSE3_FUNC(type,name) TARGET_FUNC("ssse3",ssse3,type,name)
+#define PCLMUL_FUNC(type,name) TARGET_FUNC("pclmul,ssse3",pclmul,type,name)
+#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul,ssse3",avx,type,name)
+#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul,ssse3",fma,type,name)
+#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul,ssse3",avx2,type,name)
+#define SSSE3_RESOLVER(type,name,params) \
+ extern "C" { \
+ static void __attribute__((optimize ("O0"))) \
+ (*resolve_##name (void))(void) { \
+ if (__builtin_expect(have_avx2, 0) < 0) { \
+ unsigned int eax, ebx, ecx, edx; \
+ if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { \
+ have_avx2 = ((ebx & bit_AVX2) != 0); \
+ } else { \
+ have_avx2 = 0; \
+ } \
+ } \
+ if (__builtin_expect(have_ssse3, 0) < 0) { \
+ unsigned int eax, ebx, ecx, edx; \
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \
+ have_ssse3 = ((ecx & bit_SSSE3) != 0); \
+ } else { \
+ have_ssse3 = 0; \
+ } \
+ } \
+ if (have_avx2) return (void (*)(void))&name##_avx2; \
+ if (have_ssse3) return (void (*)(void))&name##_ssse3; \
+ return (void (*)(void))&name##_base; \
+ } \
+ } \
+ type __attribute__((ifunc ("resolve_" #name))) name params
+#define PCLMUL_RESOLVER(type,name,params) \
+ extern "C" { \
+ static void __attribute__((optimize ("O0"))) \
@ -181,9 +211,9 @@
#ifdef NTL_HAVE_BUILTIN_CLZL
--- include/NTL/def_config.h.orig 2016-11-18 11:39:16.000000000 -0700
+++ include/NTL/def_config.h 2017-01-12 15:07:36.861377020 -0700
@@ -475,6 +475,19 @@ using the configure script.
--- include/NTL/def_config.h.orig 2017-07-07 09:05:14.000000000 -0600
+++ include/NTL/def_config.h 2017-09-09 12:10:10.892839638 -0600
@@ -525,6 +525,19 @@ using the configure script.
#endif
@ -203,8 +233,8 @@
--- include/NTL/MatPrime.h.orig 2016-11-18 11:39:16.000000000 -0700
+++ include/NTL/MatPrime.h 2017-01-12 16:15:17.307205250 -0700
--- include/NTL/MatPrime.h.orig 2017-07-07 09:05:14.000000000 -0600
+++ include/NTL/MatPrime.h 2017-09-09 12:10:10.892839638 -0600
@@ -20,7 +20,7 @@ NTL_OPEN_NNS
@ -214,11 +244,20 @@
#define NTL_MatPrime_NBITS (23)
#else
#define NTL_MatPrime_NBITS NTL_SP_NBITS
--- src/cfile.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/cfile 2017-01-12 15:07:36.862377017 -0700
@@ -475,6 +475,20 @@ using the configure script.
--- include/NTL/REPORT_ALL_FEATURES.h.orig 2017-07-07 09:05:15.000000000 -0600
+++ include/NTL/REPORT_ALL_FEATURES.h 2017-09-09 12:11:10.313683979 -0600
@@ -39,3 +39,6 @@
std::cerr << "NTL_HAVE_COPY_TRAITS2\n";
#endif
+#ifdef NTL_LOADTIME_CPU
+ std::cerr << "NTL_LOADTIME_CPU\n";
+#endif
--- src/cfile.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/cfile 2017-09-09 12:10:10.892839638 -0600
@@ -480,6 +480,20 @@ using the configure script.
#elif @{NTL_GF2X_ALTCODE1}
#define NTL_GF2X_ALTCODE1
+#if @{NTL_LOADTIME_CPU}
+#define NTL_LOADTIME_CPU
@ -235,11 +274,11 @@
+#endif
+
@{WIZARD_HACK}
--- src/DispSettings.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/DispSettings.cpp 2017-01-12 15:07:36.863377014 -0700
@@ -164,6 +164,10 @@ cout << "Performance Options:\n";
/*
* Yest another alternative strategy for implementing GF2X
--- src/DispSettings.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/DispSettings.cpp 2017-09-09 12:10:10.892839638 -0600
@@ -168,6 +168,10 @@ cout << "Performance Options:\n";
cout << "NTL_GF2X_NOINLINE\n";
#endif
@ -250,8 +289,8 @@
cout << "***************************/\n";
cout << "\n\n";
--- src/DoConfig.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/DoConfig 2017-01-12 15:07:36.864377011 -0700
--- src/DoConfig.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/DoConfig 2017-09-09 12:10:10.892839638 -0600
@@ -1,7 +1,7 @@
# This is a perl script, invoked from a shell
@ -261,7 +300,7 @@
%MakeFlag = (
@@ -82,6 +82,7 @@
@@ -86,6 +86,7 @@
'NTL_GF2X_NOINLINE' => 'off',
'NTL_GF2X_ALTCODE' => 'off',
'NTL_GF2X_ALTCODE1' => 'off',
@ -269,7 +308,7 @@
);
@@ -191,6 +192,15 @@ if ($ConfigFlag{'NTL_THREAD_BOOST'} eq '
@@ -195,6 +196,15 @@ if ($ConfigFlag{'NTL_THREAD_BOOST'} eq '
}
@ -285,8 +324,8 @@
# some special MakeVal values that are determined by SHARED
--- src/GF2X1.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/GF2X1.cpp 2017-01-12 15:07:36.866377005 -0700
--- src/GF2X1.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/GF2X1.cpp 2017-09-09 12:10:10.893839636 -0600
@@ -19,7 +19,7 @@
// simple scaling factor for some crossover points:
// we use a lower crossover of the underlying multiplication
@ -296,8 +335,8 @@
#define XOVER_SCALE (1L)
#else
#define XOVER_SCALE (2L)
--- src/GF2X.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/GF2X.cpp 2017-01-12 15:07:36.867377002 -0700
--- src/GF2X.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/GF2X.cpp 2017-09-09 12:10:10.893839636 -0600
@@ -28,6 +28,22 @@ pclmul_mul1 (unsigned long *c, unsigned
_mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0));
}
@ -321,7 +360,7 @@
#else
@@ -576,6 +592,27 @@ void add(GF2X& x, const GF2X& a, const G
@@ -556,6 +572,27 @@ void add(GF2X& x, const GF2X& a, const G
@ -349,7 +388,7 @@
static NTL_INLINE
void mul1(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
{
@@ -588,6 +625,7 @@ NTL_EFF_BB_MUL_CODE0
@@ -568,6 +605,7 @@ NTL_EFF_BB_MUL_CODE0
}
@ -357,7 +396,7 @@
#ifdef NTL_GF2X_NOINLINE
@@ -612,6 +650,51 @@ NTL_EFF_BB_MUL_CODE0
@@ -592,6 +630,51 @@ NTL_EFF_BB_MUL_CODE0
#endif
@ -409,7 +448,7 @@
static
void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
{
@@ -639,6 +722,53 @@ NTL_EFF_BB_MUL_CODE1
@@ -619,6 +702,53 @@ NTL_EFF_BB_MUL_CODE1
}
@ -463,7 +502,7 @@
static
void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
{
@@ -667,6 +797,52 @@ NTL_EFF_BB_MUL_CODE2
@@ -647,6 +777,52 @@ NTL_EFF_BB_MUL_CODE2
}
@ -516,7 +555,7 @@
static
void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
@@ -695,10 +871,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1
@@ -675,10 +851,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1
}
@ -548,7 +587,7 @@
static
void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
{
@@ -712,6 +909,7 @@ NTL_EFF_HALF_BB_MUL_CODE0
@@ -692,6 +889,7 @@ NTL_EFF_HALF_BB_MUL_CODE0
}
@ -556,7 +595,7 @@
// mul2...mul8 hard-code 2x2...8x8 word multiplies.
// I adapted these routines from LiDIA (except mul3, see below).
@@ -1623,6 +1821,77 @@ static const _ntl_ulong sqrtab[256] = {
@@ -1603,6 +1801,77 @@ static const _ntl_ulong sqrtab[256] = {
@ -634,7 +673,7 @@
static inline
void sqr1(_ntl_ulong *c, _ntl_ulong a)
{
@@ -1663,6 +1932,7 @@ void sqr(GF2X& c, const GF2X& a)
@@ -1643,6 +1912,7 @@ void sqr(GF2X& c, const GF2X& a)
return;
}
@ -642,9 +681,9 @@
void LeftShift(GF2X& c, const GF2X& a, long n)
--- src/InitSettings.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/InitSettings.cpp 2017-01-12 15:07:36.867377002 -0700
@@ -148,6 +148,11 @@ int main()
--- src/InitSettings.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/InitSettings.cpp 2017-09-09 12:10:10.894839633 -0600
@@ -166,6 +166,11 @@ int main()
cout << "NTL_RANGE_CHECK=0\n";
#endif
@ -656,8 +695,8 @@
// the following are not actual config flags, but help
// in the Wizard logic
--- src/mat_lzz_p.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/mat_lzz_p.cpp 2017-01-12 21:47:53.774949563 -0700
--- src/mat_lzz_p.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/mat_lzz_p.cpp 2017-09-09 12:10:10.895839630 -0600
@@ -10,6 +10,15 @@
#ifdef NTL_HAVE_AVX
@ -2001,9 +2040,9 @@
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
--- src/QuickTest.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/QuickTest.cpp 2017-01-12 15:07:36.883376955 -0700
@@ -316,6 +316,9 @@ cerr << "Performance Options:\n";
--- src/QuickTest.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/QuickTest.cpp 2017-09-09 12:10:10.895839630 -0600
@@ -310,6 +310,9 @@ cerr << "Performance Options:\n";
cerr << "NTL_GF2X_NOINLINE\n";
#endif
@ -2013,9 +2052,9 @@
cerr << "\n\n";
--- src/WizardAux.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/WizardAux 2017-01-12 15:07:36.883376955 -0700
@@ -88,6 +88,7 @@ system("$ARGV[0] InitSettings");
--- src/WizardAux.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/WizardAux 2017-09-09 12:10:10.895839630 -0600
@@ -89,6 +89,7 @@ system("$ARGV[0] InitSettings");
'NTL_GF2X_NOINLINE' => 0,
'NTL_FFT_BIGTAB' => 0,
'NTL_FFT_LAZYMUL' => 0,
@ -2023,3 +2062,501 @@
'WIZARD_HACK' => '#define NTL_WIZARD_HACK',
--- src/ZZ.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/ZZ.cpp 2017-09-18 18:41:27.125503871 -0600
@@ -12,6 +12,13 @@
#elif defined(NTL_HAVE_SSSE3)
#include <emmintrin.h>
#include <tmmintrin.h>
+#elif defined(NTL_LOADTIME_CPU)
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+static int have_avx2 = -1;
+static int have_ssse3 = -1;
#endif
@@ -2106,6 +2113,481 @@ struct RandomStream_impl {
};
+#elif defined(NTL_LOADTIME_CPU)
+
+// round selector, specified values:
+// 8: low security - high speed
+// 12: mid security - mid speed
+// 20: high security - low speed
+#ifndef CHACHA_RNDS
+#define CHACHA_RNDS 20
+#endif
+
+typedef __m128i ssse3_ivec_t;
+typedef __m256i avx2_ivec_t;
+
+#define SSSE3_DELTA _mm_set_epi32(0,0,0,1)
+#define AVX2_DELTA _mm256_set_epi64x(0,2,0,2)
+
+#define SSSE3_START _mm_setzero_si128()
+#define AVX2_START _mm256_set_epi64x(0,1,0,0)
+
+#define SSSE3_STOREU_VEC(m,r) _mm_storeu_si128((__m128i*)(m), r)
+#define AVX2_STOREU_VEC(m,r) _mm256_storeu_si256((__m256i*)(m), r)
+
+#define SSSE3_STORE_VEC(m,r) _mm_store_si128((__m128i*)(m), r)
+#define AVX2_STORE_VEC(m,r) _mm256_store_si256((__m256i*)(m), r)
+
+#define SSSE3_LOAD_VEC(r,m) r = _mm_load_si128((const __m128i *)(m))
+#define AVX2_LOAD_VEC(r,m) r = _mm256_load_si256((const __m256i *)(m))
+
+#define SSSE3_LOADU_VEC_128(r, m) r = _mm_loadu_si128((const __m128i*)(m))
+#define AVX2_LOADU_VEC_128(r, m) r = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)(m)))
+
+#define SSSE3_ADD_VEC_32(a,b) _mm_add_epi32(a, b)
+#define AVX2_ADD_VEC_32(a,b) _mm256_add_epi32(a, b)
+
+#define SSSE3_ADD_VEC_64(a,b) _mm_add_epi64(a, b)
+#define AVX2_ADD_VEC_64(a,b) _mm256_add_epi64(a, b)
+
+#define SSSE3_XOR_VEC(a,b) _mm_xor_si128(a, b)
+#define AVX2_XOR_VEC(a,b) _mm256_xor_si256(a, b)
+
+#define SSSE3_ROR_VEC_V1(x) _mm_shuffle_epi32(x,_MM_SHUFFLE(0,3,2,1))
+#define AVX2_ROR_VEC_V1(x) _mm256_shuffle_epi32(x,_MM_SHUFFLE(0,3,2,1))
+
+#define SSSE3_ROR_VEC_V2(x) _mm_shuffle_epi32(x,_MM_SHUFFLE(1,0,3,2))
+#define AVX2_ROR_VEC_V2(x) _mm256_shuffle_epi32(x,_MM_SHUFFLE(1,0,3,2))
+
+#define SSSE3_ROR_VEC_V3(x) _mm_shuffle_epi32(x,_MM_SHUFFLE(2,1,0,3))
+#define AVX2_ROR_VEC_V3(x) _mm256_shuffle_epi32(x,_MM_SHUFFLE(2,1,0,3))
+
+#define SSSE3_ROL_VEC_7(x) SSSE3_XOR_VEC(_mm_slli_epi32(x, 7), _mm_srli_epi32(x,25))
+#define AVX2_ROL_VEC_7(x) AVX2_XOR_VEC(_mm256_slli_epi32(x, 7), _mm256_srli_epi32(x,25))
+
+#define SSSE3_ROL_VEC_12(x) SSSE3_XOR_VEC(_mm_slli_epi32(x,12), _mm_srli_epi32(x,20))
+#define AVX2_ROL_VEC_12(x) AVX2_XOR_VEC(_mm256_slli_epi32(x,12), _mm256_srli_epi32(x,20))
+
+#define SSSE3_ROL_VEC_8(x) _mm_shuffle_epi8(x,_mm_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3))
+#define AVX2_ROL_VEC_8(x) _mm256_shuffle_epi8(x,_mm256_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3,14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3))
+
+#define SSSE3_ROL_VEC_16(x) _mm_shuffle_epi8(x,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2))
+#define AVX2_ROL_VEC_16(x) _mm256_shuffle_epi8(x,_mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2))
+
+#define SSSE3_WRITEU_VEC(op, d, v0, v1, v2, v3) \
+ SSSE3_STOREU_VEC(op + (d + 0*4), v0); \
+ SSSE3_STOREU_VEC(op + (d + 4*4), v1); \
+ SSSE3_STOREU_VEC(op + (d + 8*4), v2); \
+ SSSE3_STOREU_VEC(op + (d +12*4), v3);
+#define AVX2_WRITEU_VEC(op, d, v0, v1, v2, v3) \
+ AVX2_STOREU_VEC(op + (d + 0*4), _mm256_permute2x128_si256(v0, v1, 0x20)); \
+ AVX2_STOREU_VEC(op + (d + 8*4), _mm256_permute2x128_si256(v2, v3, 0x20)); \
+ AVX2_STOREU_VEC(op + (d +16*4), _mm256_permute2x128_si256(v0, v1, 0x31)); \
+ AVX2_STOREU_VEC(op + (d +24*4), _mm256_permute2x128_si256(v2, v3, 0x31));
+
+#define SSSE3_WRITE_VEC(op, d, v0, v1, v2, v3) \
+ SSSE3_STORE_VEC(op + (d + 0*4), v0); \
+ SSSE3_STORE_VEC(op + (d + 4*4), v1); \
+ SSSE3_STORE_VEC(op + (d + 8*4), v2); \
+ SSSE3_STORE_VEC(op + (d +12*4), v3);
+#define AVX2_WRITE_VEC(op, d, v0, v1, v2, v3) \
+ AVX2_STORE_VEC(op + (d + 0*4), _mm256_permute2x128_si256(v0, v1, 0x20)); \
+ AVX2_STORE_VEC(op + (d + 8*4), _mm256_permute2x128_si256(v2, v3, 0x20)); \
+ AVX2_STORE_VEC(op + (d +16*4), _mm256_permute2x128_si256(v0, v1, 0x31)); \
+ AVX2_STORE_VEC(op + (d +24*4), _mm256_permute2x128_si256(v2, v3, 0x31));
+
+#define SSSE3_SZ_VEC (16)
+#define AVX2_SZ_VEC (32)
+
+#define SSSE3_RANSTREAM_BUFSZ (1024)
+// must be a multiple of 8*SSSE3_SZ_VEC
+
+#define AVX2_RANSTREAM_BUFSZ (1024)
+// must be a multiple of 8*AVX2_SZ_VEC
+
+#define SSSE3_DQROUND_VECTORS_VEC(a,b,c,d) \
+ a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_16(d); \
+ c = SSSE3_ADD_VEC_32(c,d); b = SSSE3_XOR_VEC(b,c); b = SSSE3_ROL_VEC_12(b); \
+ a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_8(d); \
+ c = SSSE3_ADD_VEC_32(c,d); b = SSSE3_XOR_VEC(b,c); b = SSSE3_ROL_VEC_7(b); \
+ b = SSSE3_ROR_VEC_V1(b); c = SSSE3_ROR_VEC_V2(c); d = SSSE3_ROR_VEC_V3(d); \
+ a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_16(d); \
+ c = SSSE3_ADD_VEC_32(c,d); b = SSSE3_XOR_VEC(b,c); b = SSSE3_ROL_VEC_12(b); \
+ a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_8(d); \
+ c = SSSE3_ADD_VEC_32(c,d); b = SSSE3_XOR_VEC(b,c); b = SSSE3_ROL_VEC_7(b); \
+ b = SSSE3_ROR_VEC_V3(b); c = SSSE3_ROR_VEC_V2(c); d = SSSE3_ROR_VEC_V1(d);
+
+#define AVX2_DQROUND_VECTORS_VEC(a,b,c,d) \
+ a = AVX2_ADD_VEC_32(a,b); d = AVX2_XOR_VEC(d,a); d = AVX2_ROL_VEC_16(d); \
+ c = AVX2_ADD_VEC_32(c,d); b = AVX2_XOR_VEC(b,c); b = AVX2_ROL_VEC_12(b); \
+ a = AVX2_ADD_VEC_32(a,b); d = AVX2_XOR_VEC(d,a); d = AVX2_ROL_VEC_8(d); \
+ c = AVX2_ADD_VEC_32(c,d); b = AVX2_XOR_VEC(b,c); b = AVX2_ROL_VEC_7(b); \
+ b = AVX2_ROR_VEC_V1(b); c = AVX2_ROR_VEC_V2(c); d = AVX2_ROR_VEC_V3(d); \
+ a = AVX2_ADD_VEC_32(a,b); d = AVX2_XOR_VEC(d,a); d = AVX2_ROL_VEC_16(d); \
+ c = AVX2_ADD_VEC_32(c,d); b = AVX2_XOR_VEC(b,c); b = AVX2_ROL_VEC_12(b); \
+ a = AVX2_ADD_VEC_32(a,b); d = AVX2_XOR_VEC(d,a); d = AVX2_ROL_VEC_8(d); \
+ c = AVX2_ADD_VEC_32(c,d); b = AVX2_XOR_VEC(b,c); b = AVX2_ROL_VEC_7(b); \
+ b = AVX2_ROR_VEC_V3(b); c = AVX2_ROR_VEC_V2(c); d = AVX2_ROR_VEC_V1(d);
+
+#define SSSE3_RANSTREAM_STATESZ (4*SSSE3_SZ_VEC)
+#define AVX2_RANSTREAM_STATESZ (4*AVX2_SZ_VEC)
+
+static void allocate_space(AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store)
+{
+ if (have_avx2) {
+ state_store.SetLength(AVX2_RANSTREAM_STATESZ);
+ buf_store.SetLength(AVX2_RANSTREAM_BUFSZ);
+ } else {
+ state_store.SetLength(SSSE3_RANSTREAM_STATESZ);
+ buf_store.SetLength(SSSE3_RANSTREAM_BUFSZ);
+ }
+};
+
+BASE_FUNC(void, randomstream_impl_init)
+(_ntl_uint32 *state,
+ AlignedArray<unsigned char> &state_store __attribute__((unused)),
+ AlignedArray<unsigned char> &buf_store __attribute__((unused)),
+ const unsigned char *key)
+{
+ salsa20_init(state, key);
+}
+
+SSSE3_FUNC(void, randomstream_impl_init)
+(_ntl_uint32 *state_ignored __attribute__((unused)),
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ const unsigned char *key)
+{
+ allocate_space(state_store, buf_store);
+
+ unsigned char *state = state_store.elts();
+
+ unsigned int chacha_const[] = {
+ 0x61707865,0x3320646E,0x79622D32,0x6B206574
+ };
+
+ ssse3_ivec_t d0, d1, d2, d3;
+ SSSE3_LOADU_VEC_128(d0, chacha_const);
+ SSSE3_LOADU_VEC_128(d1, key);
+ SSSE3_LOADU_VEC_128(d2, key+16);
+
+ d3 = SSSE3_START;
+
+ SSSE3_STORE_VEC(state + 0*SSSE3_SZ_VEC, d0);
+ SSSE3_STORE_VEC(state + 1*SSSE3_SZ_VEC, d1);
+ SSSE3_STORE_VEC(state + 2*SSSE3_SZ_VEC, d2);
+ SSSE3_STORE_VEC(state + 3*SSSE3_SZ_VEC, d3);
+}
+
+AVX2_FUNC(void, randomstream_impl_init)
+(_ntl_uint32 *state_ignored __attribute__((unused)),
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ const unsigned char *key)
+{
+ allocate_space(state_store, buf_store);
+
+ unsigned char *state = state_store.elts();
+
+ unsigned int chacha_const[] = {
+ 0x61707865,0x3320646E,0x79622D32,0x6B206574
+ };
+
+ avx2_ivec_t d0, d1, d2, d3;
+ AVX2_LOADU_VEC_128(d0, chacha_const);
+ AVX2_LOADU_VEC_128(d1, key);
+ AVX2_LOADU_VEC_128(d2, key+16);
+
+ d3 = AVX2_START;
+
+ AVX2_STORE_VEC(state + 0*AVX2_SZ_VEC, d0);
+ AVX2_STORE_VEC(state + 1*AVX2_SZ_VEC, d1);
+ AVX2_STORE_VEC(state + 2*AVX2_SZ_VEC, d2);
+ AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3);
+}
+
+SSSE3_RESOLVER(static void, randomstream_impl_init,
+ (_ntl_uint32 *state, AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store, const unsigned char *key));
+
+BASE_FUNC(long, randomstream_get_bytes)
+(_ntl_uint32 *state,
+ unsigned char *buf,
+ AlignedArray<unsigned char> &state_store __attribute__((unused)),
+ AlignedArray<unsigned char> &buf_store __attribute__((unused)),
+ unsigned char *NTL_RESTRICT res,
+ long n,
+ long pos)
+{
+ if (n < 0) LogicError("RandomStream::get: bad args");
+
+ long i, j;
+
+ if (n <= 64-pos) {
+ for (i = 0; i < n; i++) res[i] = buf[pos+i];
+ pos += n;
+ return pos;
+ }
+
+ // read remainder of buffer
+ for (i = 0; i < 64-pos; i++) res[i] = buf[pos+i];
+ n -= 64-pos;
+ res += 64-pos;
+ pos = 64;
+
+ _ntl_uint32 wdata[16];
+
+ // read 64-byte chunks
+ for (i = 0; i <= n-64; i += 64) {
+ salsa20_apply(state, wdata);
+ for (j = 0; j < 16; j++)
+ FROMLE(res + i + 4*j, wdata[j]);
+ }
+
+ if (i < n) {
+ salsa20_apply(state, wdata);
+
+ for (j = 0; j < 16; j++)
+ FROMLE(buf + 4*j, wdata[j]);
+
+ pos = n-i;
+ for (j = 0; j < pos; j++)
+ res[i+j] = buf[j];
+ }
+
+ return pos;
+}
+
+SSSE3_FUNC(long, randomstream_get_bytes)
+(_ntl_uint32 *state_ignored __attribute__((unused)),
+ unsigned char *buf_ignored __attribute__((unused)),
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ unsigned char *NTL_RESTRICT res,
+ long n,
+ long pos)
+{
+ if (n < 0) LogicError("RandomStream::get: bad args");
+ if (n == 0) return pos;
+
+ unsigned char *NTL_RESTRICT buf = buf_store.elts();
+
+ if (n <= SSSE3_RANSTREAM_BUFSZ-pos) {
+ std::memcpy(&res[0], &buf[pos], n);
+ pos += n;
+ return pos;
+ }
+
+ unsigned char *NTL_RESTRICT state = state_store.elts();
+
+ ssse3_ivec_t d0, d1, d2, d3;
+ SSSE3_LOAD_VEC(d0, state + 0*SSSE3_SZ_VEC);
+ SSSE3_LOAD_VEC(d1, state + 1*SSSE3_SZ_VEC);
+ SSSE3_LOAD_VEC(d2, state + 2*SSSE3_SZ_VEC);
+ SSSE3_LOAD_VEC(d3, state + 3*SSSE3_SZ_VEC);
+
+ // read remainder of buffer
+ std::memcpy(&res[0], &buf[pos], SSSE3_RANSTREAM_BUFSZ-pos);
+ n -= SSSE3_RANSTREAM_BUFSZ-pos;
+ res += SSSE3_RANSTREAM_BUFSZ-pos;
+ pos = SSSE3_RANSTREAM_BUFSZ;
+
+ long i = 0;
+ for (; i <= n-SSSE3_RANSTREAM_BUFSZ; i += SSSE3_RANSTREAM_BUFSZ) {
+
+ for (long j = 0; j < SSSE3_RANSTREAM_BUFSZ/(8*SSSE3_SZ_VEC); j++) {
+ ssse3_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
+ ssse3_ivec_t v4=d0, v5=d1, v6=d2, v7=SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+
+ for (long k = 0; k < CHACHA_RNDS/2; k++) {
+ SSSE3_DQROUND_VECTORS_VEC(v0,v1,v2,v3)
+ SSSE3_DQROUND_VECTORS_VEC(v4,v5,v6,v7)
+ }
+
+ SSSE3_WRITEU_VEC(res+i+j*(8*SSSE3_SZ_VEC), 0, SSSE3_ADD_VEC_32(v0,d0), SSSE3_ADD_VEC_32(v1,d1), SSSE3_ADD_VEC_32(v2,d2), SSSE3_ADD_VEC_32(v3,d3))
+ d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+ SSSE3_WRITEU_VEC(res+i+j*(8*SSSE3_SZ_VEC), 4*SSSE3_SZ_VEC, SSSE3_ADD_VEC_32(v4,d0), SSSE3_ADD_VEC_32(v5,d1), SSSE3_ADD_VEC_32(v6,d2), SSSE3_ADD_VEC_32(v7,d3))
+ d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+ }
+
+ }
+
+ if (i < n) {
+ for (long j = 0; j < SSSE3_RANSTREAM_BUFSZ/(8*SSSE3_SZ_VEC); j++) {
+ ssse3_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
+ ssse3_ivec_t v4=d0, v5=d1, v6=d2, v7=SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+
+ for (long k = 0; k < CHACHA_RNDS/2; k++) {
+ SSSE3_DQROUND_VECTORS_VEC(v0,v1,v2,v3)
+ SSSE3_DQROUND_VECTORS_VEC(v4,v5,v6,v7)
+ }
+
+ SSSE3_WRITE_VEC(buf+j*(8*SSSE3_SZ_VEC), 0, SSSE3_ADD_VEC_32(v0,d0), SSSE3_ADD_VEC_32(v1,d1), SSSE3_ADD_VEC_32(v2,d2), SSSE3_ADD_VEC_32(v3,d3))
+ d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+ SSSE3_WRITE_VEC(buf+j*(8*SSSE3_SZ_VEC), 4*SSSE3_SZ_VEC, SSSE3_ADD_VEC_32(v4,d0), SSSE3_ADD_VEC_32(v5,d1), SSSE3_ADD_VEC_32(v6,d2), SSSE3_ADD_VEC_32(v7,d3))
+ d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+ }
+
+ pos = n-i;
+ std::memcpy(&res[i], &buf[0], pos);
+ }
+
+ SSSE3_STORE_VEC(state + 3*SSSE3_SZ_VEC, d3);
+
+ return pos;
+}
+
+AVX2_FUNC(long, randomstream_get_bytes)
+(_ntl_uint32 *state_ignored __attribute__((unused)),
+ unsigned char *buf_ignored __attribute__((unused)),
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ unsigned char *NTL_RESTRICT res,
+ long n,
+ long pos)
+{
+ if (n < 0) LogicError("RandomStream::get: bad args");
+ if (n == 0) return pos;
+
+ unsigned char *NTL_RESTRICT buf = buf_store.elts();
+
+ if (n <= AVX2_RANSTREAM_BUFSZ-pos) {
+ std::memcpy(&res[0], &buf[pos], n);
+ pos += n;
+ return pos;
+ }
+
+ unsigned char *NTL_RESTRICT state = state_store.elts();
+
+ avx2_ivec_t d0, d1, d2, d3;
+ AVX2_LOAD_VEC(d0, state + 0*AVX2_SZ_VEC);
+ AVX2_LOAD_VEC(d1, state + 1*AVX2_SZ_VEC);
+ AVX2_LOAD_VEC(d2, state + 2*AVX2_SZ_VEC);
+ AVX2_LOAD_VEC(d3, state + 3*AVX2_SZ_VEC);
+
+ // read remainder of buffer
+ std::memcpy(&res[0], &buf[pos], AVX2_RANSTREAM_BUFSZ-pos);
+ n -= AVX2_RANSTREAM_BUFSZ-pos;
+ res += AVX2_RANSTREAM_BUFSZ-pos;
+ pos = AVX2_RANSTREAM_BUFSZ;
+
+ long i = 0;
+ for (; i <= n-AVX2_RANSTREAM_BUFSZ; i += AVX2_RANSTREAM_BUFSZ) {
+
+ for (long j = 0; j < AVX2_RANSTREAM_BUFSZ/(8*AVX2_SZ_VEC); j++) {
+ avx2_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
+ avx2_ivec_t v4=d0, v5=d1, v6=d2, v7=AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+
+ for (long k = 0; k < CHACHA_RNDS/2; k++) {
+ AVX2_DQROUND_VECTORS_VEC(v0,v1,v2,v3)
+ AVX2_DQROUND_VECTORS_VEC(v4,v5,v6,v7)
+ }
+
+ AVX2_WRITEU_VEC(res+i+j*(8*AVX2_SZ_VEC), 0, AVX2_ADD_VEC_32(v0,d0), AVX2_ADD_VEC_32(v1,d1), AVX2_ADD_VEC_32(v2,d2), AVX2_ADD_VEC_32(v3,d3))
+ d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+ AVX2_WRITEU_VEC(res+i+j*(8*AVX2_SZ_VEC), 4*AVX2_SZ_VEC, AVX2_ADD_VEC_32(v4,d0), AVX2_ADD_VEC_32(v5,d1), AVX2_ADD_VEC_32(v6,d2), AVX2_ADD_VEC_32(v7,d3))
+ d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+ }
+
+ }
+
+ if (i < n) {
+ for (long j = 0; j < AVX2_RANSTREAM_BUFSZ/(8*AVX2_SZ_VEC); j++) {
+ avx2_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
+ avx2_ivec_t v4=d0, v5=d1, v6=d2, v7=AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+
+ for (long k = 0; k < CHACHA_RNDS/2; k++) {
+ AVX2_DQROUND_VECTORS_VEC(v0,v1,v2,v3)
+ AVX2_DQROUND_VECTORS_VEC(v4,v5,v6,v7)
+ }
+
+ AVX2_WRITE_VEC(buf+j*(8*AVX2_SZ_VEC), 0, AVX2_ADD_VEC_32(v0,d0), AVX2_ADD_VEC_32(v1,d1), AVX2_ADD_VEC_32(v2,d2), AVX2_ADD_VEC_32(v3,d3))
+ d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+ AVX2_WRITE_VEC(buf+j*(8*AVX2_SZ_VEC), 4*AVX2_SZ_VEC, AVX2_ADD_VEC_32(v4,d0), AVX2_ADD_VEC_32(v5,d1), AVX2_ADD_VEC_32(v6,d2), AVX2_ADD_VEC_32(v7,d3))
+ d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+ }
+
+ pos = n-i;
+ std::memcpy(&res[i], &buf[0], pos);
+ }
+
+ AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3);
+
+ return pos;
+}
+
+SSSE3_RESOLVER(static long, randomstream_get_bytes,
+ (_ntl_uint32 *state, unsigned char *buf,
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ unsigned char *NTL_RESTRICT res,
+ long n,
+ long pos));
+
+struct RandomStream_impl {
+ AlignedArray<unsigned char> state_store;
+ AlignedArray<unsigned char> buf_store;
+ _ntl_uint32 state[16];
+ unsigned char buf[64];
+
+ explicit
+ RandomStream_impl(const unsigned char *key)
+ {
+ randomstream_impl_init(state, state_store, buf_store, key);
+ }
+
+ RandomStream_impl(const RandomStream_impl& other)
+ {
+ if (have_avx2 || have_ssse3) {
+ allocate_space(state_store, buf_store);
+ }
+ *this = other;
+ }
+
+ RandomStream_impl& operator=(const RandomStream_impl& other)
+ {
+ if (have_avx2) {
+ std::memcpy(state_store.elts(), other.state_store.elts(), AVX2_RANSTREAM_STATESZ);
+ std::memcpy(buf_store.elts(), other.buf_store.elts(), AVX2_RANSTREAM_BUFSZ);
+ } else if (have_ssse3) {
+ std::memcpy(state_store.elts(), other.state_store.elts(), SSSE3_RANSTREAM_STATESZ);
+ std::memcpy(buf_store.elts(), other.buf_store.elts(), SSSE3_RANSTREAM_BUFSZ);
+ }
+ return *this;
+ }
+
+ const unsigned char *
+ get_buf() const
+ {
+ if (have_avx2 || have_ssse3) {
+ return buf_store.elts();
+ } else {
+ return &buf[0];
+ }
+ }
+
+ long
+ get_buf_len() const
+ {
+ if (have_avx2) {
+ return AVX2_RANSTREAM_BUFSZ;
+ } else if (have_ssse3) {
+ return SSSE3_RANSTREAM_BUFSZ;
+ } else {
+ return 64;
+ }
+ }
+
+ long
+ get_bytes(unsigned char *NTL_RESTRICT res,
+ long n, long pos)
+ {
+ return randomstream_get_bytes(state, buf, state_store, buf_store,
+ res, n, pos);
+ }
+};
+
#else
struct RandomStream_impl {

View File

@ -10,8 +10,8 @@
Summary: High-performance algorithms for vectors, matrices, and polynomials
Name: ntl
Version: 10.3.0
Release: 3%{?dist}
Version: 10.5.0
Release: 1%{?dist}
License: LGPLv2+
URL: http://shoup.net/ntl/
@ -70,6 +70,9 @@ Requires: %{name}-devel%{?_isa} = %{version}-%{release}
%build
pushd src
# We eventually want to set NTL_STD_CXX14=on and NTL_SAFE_VECTORS=on, but that
# involves a change in semantics to vector and matrix assignment operations
# that latte-integrale, at least, is not yet prepared for.
./configure \
CXX="${CXX-g++}" \
CXXFLAGS="%{optflags} -fPIC" \
@ -138,7 +141,7 @@ done
%files
%doc README
%license doc/copying.txt
%{_libdir}/libntl.so.33*
%{_libdir}/libntl.so.35*
%files devel
%doc doc/*
@ -152,6 +155,9 @@ done
%changelog
* Thu Sep 28 2017 Jerry James <loganjerry@gmail.com> - 10.5.0-1
- ntl-10.5.0
* Thu Aug 03 2017 Fedora Release Engineering <releng@fedoraproject.org> - 10.3.0-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild

View File

@ -1 +1 @@
SHA512 (ntl-10.3.0.tar.gz) = a335e088829658df690c511fe8119e6f5d025ffa7b2d8b4c13b9b7aedac36efae838c66d2bfa70ef208bef9224d93448255d697e987c99c7b4928e1bbf0b9aa3
SHA512 (ntl-10.5.0.tar.gz) = b299dfc29005079470972c2a9ca02acd0ebdbc31ff8923df02f3627dbc66daa0f527226972cef032e1e488c4272554634a96456e94653fdf8b01356160319aa0