ntl-10.5.0

This commit is contained in:
Jerry James 2017-09-28 18:37:10 -06:00
parent 2828a242e4
commit 7e405f74ed
4 changed files with 603 additions and 59 deletions

1
.gitignore vendored
View File

@ -8,3 +8,4 @@
/ntl-9.11.0.tar.gz
/ntl-10.1.0.tar.gz
/ntl-10.3.0.tar.gz
/ntl-10.5.0.tar.gz

View File

@ -1,6 +1,6 @@
--- doc/config.txt.orig 2016-11-18 11:39:17.000000000 -0700
+++ doc/config.txt 2017-01-12 15:07:36.859377026 -0700
@@ -300,6 +300,7 @@ NTL_AVOID_BRANCHING=off
--- doc/config.txt.orig 2017-07-07 09:05:14.000000000 -0600
+++ doc/config.txt 2017-09-09 12:10:10.877839678 -0600
@@ -337,6 +337,7 @@ NTL_AVOID_BRANCHING=off
NTL_GF2X_NOINLINE=off
NTL_GF2X_ALTCODE=off
NTL_GF2X_ALTCODE1=off
@ -8,7 +8,7 @@
GMP_INCDIR=$(GMP_PREFIX)/include
GMP_LIBDIR=$(GMP_PREFIX)/lib
@@ -597,6 +598,10 @@ NTL_GF2X_ALTCODE1=off
@@ -634,6 +635,10 @@ NTL_GF2X_ALTCODE1=off
# Yet another alternative implementation for GF2X multiplication.
@ -19,9 +19,9 @@
########## More GMP Options:
--- include/NTL/config.h.orig 2016-11-18 11:39:17.000000000 -0700
+++ include/NTL/config.h 2017-01-12 15:07:36.860377023 -0700
@@ -475,6 +475,20 @@ using the configure script.
--- include/NTL/config.h.orig 2017-07-07 09:05:14.000000000 -0600
+++ include/NTL/config.h 2017-09-09 12:10:10.891839641 -0600
@@ -525,6 +525,20 @@ using the configure script.
#endif
@ -42,9 +42,9 @@
--- include/NTL/ctools.h.orig 2016-11-18 11:39:16.000000000 -0700
+++ include/NTL/ctools.h 2017-01-12 15:07:36.861377020 -0700
@@ -447,6 +447,136 @@ char *_ntl_make_aligned(char *p, long al
--- include/NTL/ctools.h.orig 2017-07-07 09:05:14.000000000 -0600
+++ include/NTL/ctools.h 2017-09-09 13:36:16.915768457 -0600
@@ -498,6 +498,166 @@ char *_ntl_make_aligned(char *p, long al
// and it should also be as big as a cache line
@ -61,6 +61,9 @@
+#endif
+
+#include <cpuid.h>
+#ifndef bit_SSSE3
+#define bit_SSSE3 (1 << 9)
+#endif
+#ifndef bit_PCLMUL
+#define bit_PCLMUL (1 << 1)
+#endif
@ -77,10 +80,37 @@
+#define BASE_FUNC(type,name) static type name##_base
+#define TARGET_FUNC(arch,suffix,type,name) \
+ static type __attribute__((target (arch))) name##_##suffix
+#define PCLMUL_FUNC(type,name) TARGET_FUNC("pclmul",pclmul,type,name)
+#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul",avx,type,name)
+#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul",fma,type,name)
+#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul",avx2,type,name)
+#define SSSE3_FUNC(type,name) TARGET_FUNC("ssse3",ssse3,type,name)
+#define PCLMUL_FUNC(type,name) TARGET_FUNC("pclmul,ssse3",pclmul,type,name)
+#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul,ssse3",avx,type,name)
+#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul,ssse3",fma,type,name)
+#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul,ssse3",avx2,type,name)
+#define SSSE3_RESOLVER(type,name,params) \
+ extern "C" { \
+ static void __attribute__((optimize ("O0"))) \
+ (*resolve_##name (void))(void) { \
+ if (__builtin_expect(have_avx2, 0) < 0) { \
+ unsigned int eax, ebx, ecx, edx; \
+ if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { \
+ have_avx2 = ((ebx & bit_AVX2) != 0); \
+ } else { \
+ have_avx2 = 0; \
+ } \
+ } \
+ if (__builtin_expect(have_ssse3, 0) < 0) { \
+ unsigned int eax, ebx, ecx, edx; \
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \
+ have_ssse3 = ((ecx & bit_SSSE3) != 0); \
+ } else { \
+ have_ssse3 = 0; \
+ } \
+ } \
+ if (have_avx2) return (void (*)(void))&name##_avx2; \
+ if (have_ssse3) return (void (*)(void))&name##_ssse3; \
+ return (void (*)(void))&name##_base; \
+ } \
+ } \
+ type __attribute__((ifunc ("resolve_" #name))) name params
+#define PCLMUL_RESOLVER(type,name,params) \
+ extern "C" { \
+ static void __attribute__((optimize ("O0"))) \
@ -181,9 +211,9 @@
#ifdef NTL_HAVE_BUILTIN_CLZL
--- include/NTL/def_config.h.orig 2016-11-18 11:39:16.000000000 -0700
+++ include/NTL/def_config.h 2017-01-12 15:07:36.861377020 -0700
@@ -475,6 +475,19 @@ using the configure script.
--- include/NTL/def_config.h.orig 2017-07-07 09:05:14.000000000 -0600
+++ include/NTL/def_config.h 2017-09-09 12:10:10.892839638 -0600
@@ -525,6 +525,19 @@ using the configure script.
#endif
@ -203,8 +233,8 @@
--- include/NTL/MatPrime.h.orig 2016-11-18 11:39:16.000000000 -0700
+++ include/NTL/MatPrime.h 2017-01-12 16:15:17.307205250 -0700
--- include/NTL/MatPrime.h.orig 2017-07-07 09:05:14.000000000 -0600
+++ include/NTL/MatPrime.h 2017-09-09 12:10:10.892839638 -0600
@@ -20,7 +20,7 @@ NTL_OPEN_NNS
@ -214,11 +244,20 @@
#define NTL_MatPrime_NBITS (23)
#else
#define NTL_MatPrime_NBITS NTL_SP_NBITS
--- src/cfile.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/cfile 2017-01-12 15:07:36.862377017 -0700
@@ -475,6 +475,20 @@ using the configure script.
--- include/NTL/REPORT_ALL_FEATURES.h.orig 2017-07-07 09:05:15.000000000 -0600
+++ include/NTL/REPORT_ALL_FEATURES.h 2017-09-09 12:11:10.313683979 -0600
@@ -39,3 +39,6 @@
std::cerr << "NTL_HAVE_COPY_TRAITS2\n";
#endif
+#ifdef NTL_LOADTIME_CPU
+ std::cerr << "NTL_LOADTIME_CPU\n";
+#endif
--- src/cfile.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/cfile 2017-09-09 12:10:10.892839638 -0600
@@ -480,6 +480,20 @@ using the configure script.
#elif @{NTL_GF2X_ALTCODE1}
#define NTL_GF2X_ALTCODE1
+#if @{NTL_LOADTIME_CPU}
+#define NTL_LOADTIME_CPU
@ -235,11 +274,11 @@
+#endif
+
@{WIZARD_HACK}
--- src/DispSettings.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/DispSettings.cpp 2017-01-12 15:07:36.863377014 -0700
@@ -164,6 +164,10 @@ cout << "Performance Options:\n";
/*
* Yest another alternative strategy for implementing GF2X
--- src/DispSettings.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/DispSettings.cpp 2017-09-09 12:10:10.892839638 -0600
@@ -168,6 +168,10 @@ cout << "Performance Options:\n";
cout << "NTL_GF2X_NOINLINE\n";
#endif
@ -250,8 +289,8 @@
cout << "***************************/\n";
cout << "\n\n";
--- src/DoConfig.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/DoConfig 2017-01-12 15:07:36.864377011 -0700
--- src/DoConfig.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/DoConfig 2017-09-09 12:10:10.892839638 -0600
@@ -1,7 +1,7 @@
# This is a perl script, invoked from a shell
@ -261,7 +300,7 @@
%MakeFlag = (
@@ -82,6 +82,7 @@
@@ -86,6 +86,7 @@
'NTL_GF2X_NOINLINE' => 'off',
'NTL_GF2X_ALTCODE' => 'off',
'NTL_GF2X_ALTCODE1' => 'off',
@ -269,7 +308,7 @@
);
@@ -191,6 +192,15 @@ if ($ConfigFlag{'NTL_THREAD_BOOST'} eq '
@@ -195,6 +196,15 @@ if ($ConfigFlag{'NTL_THREAD_BOOST'} eq '
}
@ -285,8 +324,8 @@
# some special MakeVal values that are determined by SHARED
--- src/GF2X1.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/GF2X1.cpp 2017-01-12 15:07:36.866377005 -0700
--- src/GF2X1.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/GF2X1.cpp 2017-09-09 12:10:10.893839636 -0600
@@ -19,7 +19,7 @@
// simple scaling factor for some crossover points:
// we use a lower crossover of the underlying multiplication
@ -296,8 +335,8 @@
#define XOVER_SCALE (1L)
#else
#define XOVER_SCALE (2L)
--- src/GF2X.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/GF2X.cpp 2017-01-12 15:07:36.867377002 -0700
--- src/GF2X.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/GF2X.cpp 2017-09-09 12:10:10.893839636 -0600
@@ -28,6 +28,22 @@ pclmul_mul1 (unsigned long *c, unsigned
_mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0));
}
@ -321,7 +360,7 @@
#else
@@ -576,6 +592,27 @@ void add(GF2X& x, const GF2X& a, const G
@@ -556,6 +572,27 @@ void add(GF2X& x, const GF2X& a, const G
@ -349,7 +388,7 @@
static NTL_INLINE
void mul1(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
{
@@ -588,6 +625,7 @@ NTL_EFF_BB_MUL_CODE0
@@ -568,6 +605,7 @@ NTL_EFF_BB_MUL_CODE0
}
@ -357,7 +396,7 @@
#ifdef NTL_GF2X_NOINLINE
@@ -612,6 +650,51 @@ NTL_EFF_BB_MUL_CODE0
@@ -592,6 +630,51 @@ NTL_EFF_BB_MUL_CODE0
#endif
@ -409,7 +448,7 @@
static
void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
{
@@ -639,6 +722,53 @@ NTL_EFF_BB_MUL_CODE1
@@ -619,6 +702,53 @@ NTL_EFF_BB_MUL_CODE1
}
@ -463,7 +502,7 @@
static
void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
{
@@ -667,6 +797,52 @@ NTL_EFF_BB_MUL_CODE2
@@ -647,6 +777,52 @@ NTL_EFF_BB_MUL_CODE2
}
@ -516,7 +555,7 @@
static
void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
@@ -695,10 +871,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1
@@ -675,10 +851,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1
}
@ -548,7 +587,7 @@
static
void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
{
@@ -712,6 +909,7 @@ NTL_EFF_HALF_BB_MUL_CODE0
@@ -692,6 +889,7 @@ NTL_EFF_HALF_BB_MUL_CODE0
}
@ -556,7 +595,7 @@
// mul2...mul8 hard-code 2x2...8x8 word multiplies.
// I adapted these routines from LiDIA (except mul3, see below).
@@ -1623,6 +1821,77 @@ static const _ntl_ulong sqrtab[256] = {
@@ -1603,6 +1801,77 @@ static const _ntl_ulong sqrtab[256] = {
@ -634,7 +673,7 @@
static inline
void sqr1(_ntl_ulong *c, _ntl_ulong a)
{
@@ -1663,6 +1932,7 @@ void sqr(GF2X& c, const GF2X& a)
@@ -1643,6 +1912,7 @@ void sqr(GF2X& c, const GF2X& a)
return;
}
@ -642,9 +681,9 @@
void LeftShift(GF2X& c, const GF2X& a, long n)
--- src/InitSettings.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/InitSettings.cpp 2017-01-12 15:07:36.867377002 -0700
@@ -148,6 +148,11 @@ int main()
--- src/InitSettings.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/InitSettings.cpp 2017-09-09 12:10:10.894839633 -0600
@@ -166,6 +166,11 @@ int main()
cout << "NTL_RANGE_CHECK=0\n";
#endif
@ -656,8 +695,8 @@
// the following are not actual config flags, but help
// in the Wizard logic
--- src/mat_lzz_p.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/mat_lzz_p.cpp 2017-01-12 21:47:53.774949563 -0700
--- src/mat_lzz_p.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/mat_lzz_p.cpp 2017-09-09 12:10:10.895839630 -0600
@@ -10,6 +10,15 @@
#ifdef NTL_HAVE_AVX
@ -2001,9 +2040,9 @@
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
--- src/QuickTest.cpp.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/QuickTest.cpp 2017-01-12 15:07:36.883376955 -0700
@@ -316,6 +316,9 @@ cerr << "Performance Options:\n";
--- src/QuickTest.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/QuickTest.cpp 2017-09-09 12:10:10.895839630 -0600
@@ -310,6 +310,9 @@ cerr << "Performance Options:\n";
cerr << "NTL_GF2X_NOINLINE\n";
#endif
@ -2013,9 +2052,9 @@
cerr << "\n\n";
--- src/WizardAux.orig 2016-11-18 11:39:15.000000000 -0700
+++ src/WizardAux 2017-01-12 15:07:36.883376955 -0700
@@ -88,6 +88,7 @@ system("$ARGV[0] InitSettings");
--- src/WizardAux.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/WizardAux 2017-09-09 12:10:10.895839630 -0600
@@ -89,6 +89,7 @@ system("$ARGV[0] InitSettings");
'NTL_GF2X_NOINLINE' => 0,
'NTL_FFT_BIGTAB' => 0,
'NTL_FFT_LAZYMUL' => 0,
@ -2023,3 +2062,501 @@
'WIZARD_HACK' => '#define NTL_WIZARD_HACK',
--- src/ZZ.cpp.orig 2017-07-07 09:05:14.000000000 -0600
+++ src/ZZ.cpp 2017-09-18 18:41:27.125503871 -0600
@@ -12,6 +12,13 @@
#elif defined(NTL_HAVE_SSSE3)
#include <emmintrin.h>
#include <tmmintrin.h>
+#elif defined(NTL_LOADTIME_CPU)
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+static int have_avx2 = -1;
+static int have_ssse3 = -1;
#endif
@@ -2106,6 +2113,481 @@ struct RandomStream_impl {
};
+#elif defined(NTL_LOADTIME_CPU)
+
+// round selector, specified values:
+// 8: low security - high speed
+// 12: mid security - mid speed
+// 20: high security - low speed
+#ifndef CHACHA_RNDS
+#define CHACHA_RNDS 20
+#endif
+
+typedef __m128i ssse3_ivec_t;
+typedef __m256i avx2_ivec_t;
+
+#define SSSE3_DELTA _mm_set_epi32(0,0,0,1)
+#define AVX2_DELTA _mm256_set_epi64x(0,2,0,2)
+
+#define SSSE3_START _mm_setzero_si128()
+#define AVX2_START _mm256_set_epi64x(0,1,0,0)
+
+#define SSSE3_STOREU_VEC(m,r) _mm_storeu_si128((__m128i*)(m), r)
+#define AVX2_STOREU_VEC(m,r) _mm256_storeu_si256((__m256i*)(m), r)
+
+#define SSSE3_STORE_VEC(m,r) _mm_store_si128((__m128i*)(m), r)
+#define AVX2_STORE_VEC(m,r) _mm256_store_si256((__m256i*)(m), r)
+
+#define SSSE3_LOAD_VEC(r,m) r = _mm_load_si128((const __m128i *)(m))
+#define AVX2_LOAD_VEC(r,m) r = _mm256_load_si256((const __m256i *)(m))
+
+#define SSSE3_LOADU_VEC_128(r, m) r = _mm_loadu_si128((const __m128i*)(m))
+#define AVX2_LOADU_VEC_128(r, m) r = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)(m)))
+
+#define SSSE3_ADD_VEC_32(a,b) _mm_add_epi32(a, b)
+#define AVX2_ADD_VEC_32(a,b) _mm256_add_epi32(a, b)
+
+#define SSSE3_ADD_VEC_64(a,b) _mm_add_epi64(a, b)
+#define AVX2_ADD_VEC_64(a,b) _mm256_add_epi64(a, b)
+
+#define SSSE3_XOR_VEC(a,b) _mm_xor_si128(a, b)
+#define AVX2_XOR_VEC(a,b) _mm256_xor_si256(a, b)
+
+#define SSSE3_ROR_VEC_V1(x) _mm_shuffle_epi32(x,_MM_SHUFFLE(0,3,2,1))
+#define AVX2_ROR_VEC_V1(x) _mm256_shuffle_epi32(x,_MM_SHUFFLE(0,3,2,1))
+
+#define SSSE3_ROR_VEC_V2(x) _mm_shuffle_epi32(x,_MM_SHUFFLE(1,0,3,2))
+#define AVX2_ROR_VEC_V2(x) _mm256_shuffle_epi32(x,_MM_SHUFFLE(1,0,3,2))
+
+#define SSSE3_ROR_VEC_V3(x) _mm_shuffle_epi32(x,_MM_SHUFFLE(2,1,0,3))
+#define AVX2_ROR_VEC_V3(x) _mm256_shuffle_epi32(x,_MM_SHUFFLE(2,1,0,3))
+
+#define SSSE3_ROL_VEC_7(x) SSSE3_XOR_VEC(_mm_slli_epi32(x, 7), _mm_srli_epi32(x,25))
+#define AVX2_ROL_VEC_7(x) AVX2_XOR_VEC(_mm256_slli_epi32(x, 7), _mm256_srli_epi32(x,25))
+
+#define SSSE3_ROL_VEC_12(x) SSSE3_XOR_VEC(_mm_slli_epi32(x,12), _mm_srli_epi32(x,20))
+#define AVX2_ROL_VEC_12(x) AVX2_XOR_VEC(_mm256_slli_epi32(x,12), _mm256_srli_epi32(x,20))
+
+#define SSSE3_ROL_VEC_8(x) _mm_shuffle_epi8(x,_mm_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3))
+#define AVX2_ROL_VEC_8(x) _mm256_shuffle_epi8(x,_mm256_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3,14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3))
+
+#define SSSE3_ROL_VEC_16(x) _mm_shuffle_epi8(x,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2))
+#define AVX2_ROL_VEC_16(x) _mm256_shuffle_epi8(x,_mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2))
+
+#define SSSE3_WRITEU_VEC(op, d, v0, v1, v2, v3) \
+ SSSE3_STOREU_VEC(op + (d + 0*4), v0); \
+ SSSE3_STOREU_VEC(op + (d + 4*4), v1); \
+ SSSE3_STOREU_VEC(op + (d + 8*4), v2); \
+ SSSE3_STOREU_VEC(op + (d +12*4), v3);
+#define AVX2_WRITEU_VEC(op, d, v0, v1, v2, v3) \
+ AVX2_STOREU_VEC(op + (d + 0*4), _mm256_permute2x128_si256(v0, v1, 0x20)); \
+ AVX2_STOREU_VEC(op + (d + 8*4), _mm256_permute2x128_si256(v2, v3, 0x20)); \
+ AVX2_STOREU_VEC(op + (d +16*4), _mm256_permute2x128_si256(v0, v1, 0x31)); \
+ AVX2_STOREU_VEC(op + (d +24*4), _mm256_permute2x128_si256(v2, v3, 0x31));
+
+#define SSSE3_WRITE_VEC(op, d, v0, v1, v2, v3) \
+ SSSE3_STORE_VEC(op + (d + 0*4), v0); \
+ SSSE3_STORE_VEC(op + (d + 4*4), v1); \
+ SSSE3_STORE_VEC(op + (d + 8*4), v2); \
+ SSSE3_STORE_VEC(op + (d +12*4), v3);
+#define AVX2_WRITE_VEC(op, d, v0, v1, v2, v3) \
+ AVX2_STORE_VEC(op + (d + 0*4), _mm256_permute2x128_si256(v0, v1, 0x20)); \
+ AVX2_STORE_VEC(op + (d + 8*4), _mm256_permute2x128_si256(v2, v3, 0x20)); \
+ AVX2_STORE_VEC(op + (d +16*4), _mm256_permute2x128_si256(v0, v1, 0x31)); \
+ AVX2_STORE_VEC(op + (d +24*4), _mm256_permute2x128_si256(v2, v3, 0x31));
+
+#define SSSE3_SZ_VEC (16)
+#define AVX2_SZ_VEC (32)
+
+#define SSSE3_RANSTREAM_BUFSZ (1024)
+// must be a multiple of 8*SSSE3_SZ_VEC
+
+#define AVX2_RANSTREAM_BUFSZ (1024)
+// must be a multiple of 8*AVX2_SZ_VEC
+
+#define SSSE3_DQROUND_VECTORS_VEC(a,b,c,d) \
+ a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_16(d); \
+ c = SSSE3_ADD_VEC_32(c,d); b = SSSE3_XOR_VEC(b,c); b = SSSE3_ROL_VEC_12(b); \
+ a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_8(d); \
+ c = SSSE3_ADD_VEC_32(c,d); b = SSSE3_XOR_VEC(b,c); b = SSSE3_ROL_VEC_7(b); \
+ b = SSSE3_ROR_VEC_V1(b); c = SSSE3_ROR_VEC_V2(c); d = SSSE3_ROR_VEC_V3(d); \
+ a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_16(d); \
+ c = SSSE3_ADD_VEC_32(c,d); b = SSSE3_XOR_VEC(b,c); b = SSSE3_ROL_VEC_12(b); \
+ a = SSSE3_ADD_VEC_32(a,b); d = SSSE3_XOR_VEC(d,a); d = SSSE3_ROL_VEC_8(d); \
+ c = SSSE3_ADD_VEC_32(c,d); b = SSSE3_XOR_VEC(b,c); b = SSSE3_ROL_VEC_7(b); \
+ b = SSSE3_ROR_VEC_V3(b); c = SSSE3_ROR_VEC_V2(c); d = SSSE3_ROR_VEC_V1(d);
+
+#define AVX2_DQROUND_VECTORS_VEC(a,b,c,d) \
+ a = AVX2_ADD_VEC_32(a,b); d = AVX2_XOR_VEC(d,a); d = AVX2_ROL_VEC_16(d); \
+ c = AVX2_ADD_VEC_32(c,d); b = AVX2_XOR_VEC(b,c); b = AVX2_ROL_VEC_12(b); \
+ a = AVX2_ADD_VEC_32(a,b); d = AVX2_XOR_VEC(d,a); d = AVX2_ROL_VEC_8(d); \
+ c = AVX2_ADD_VEC_32(c,d); b = AVX2_XOR_VEC(b,c); b = AVX2_ROL_VEC_7(b); \
+ b = AVX2_ROR_VEC_V1(b); c = AVX2_ROR_VEC_V2(c); d = AVX2_ROR_VEC_V3(d); \
+ a = AVX2_ADD_VEC_32(a,b); d = AVX2_XOR_VEC(d,a); d = AVX2_ROL_VEC_16(d); \
+ c = AVX2_ADD_VEC_32(c,d); b = AVX2_XOR_VEC(b,c); b = AVX2_ROL_VEC_12(b); \
+ a = AVX2_ADD_VEC_32(a,b); d = AVX2_XOR_VEC(d,a); d = AVX2_ROL_VEC_8(d); \
+ c = AVX2_ADD_VEC_32(c,d); b = AVX2_XOR_VEC(b,c); b = AVX2_ROL_VEC_7(b); \
+ b = AVX2_ROR_VEC_V3(b); c = AVX2_ROR_VEC_V2(c); d = AVX2_ROR_VEC_V1(d);
+
+#define SSSE3_RANSTREAM_STATESZ (4*SSSE3_SZ_VEC)
+#define AVX2_RANSTREAM_STATESZ (4*AVX2_SZ_VEC)
+
+static void allocate_space(AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store)
+{
+ if (have_avx2) {
+ state_store.SetLength(AVX2_RANSTREAM_STATESZ);
+ buf_store.SetLength(AVX2_RANSTREAM_BUFSZ);
+ } else {
+ state_store.SetLength(SSSE3_RANSTREAM_STATESZ);
+ buf_store.SetLength(SSSE3_RANSTREAM_BUFSZ);
+ }
+};
+
+BASE_FUNC(void, randomstream_impl_init)
+(_ntl_uint32 *state,
+ AlignedArray<unsigned char> &state_store __attribute__((unused)),
+ AlignedArray<unsigned char> &buf_store __attribute__((unused)),
+ const unsigned char *key)
+{
+ salsa20_init(state, key);
+}
+
+SSSE3_FUNC(void, randomstream_impl_init)
+(_ntl_uint32 *state_ignored __attribute__((unused)),
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ const unsigned char *key)
+{
+ allocate_space(state_store, buf_store);
+
+ unsigned char *state = state_store.elts();
+
+ unsigned int chacha_const[] = {
+ 0x61707865,0x3320646E,0x79622D32,0x6B206574
+ };
+
+ ssse3_ivec_t d0, d1, d2, d3;
+ SSSE3_LOADU_VEC_128(d0, chacha_const);
+ SSSE3_LOADU_VEC_128(d1, key);
+ SSSE3_LOADU_VEC_128(d2, key+16);
+
+ d3 = SSSE3_START;
+
+ SSSE3_STORE_VEC(state + 0*SSSE3_SZ_VEC, d0);
+ SSSE3_STORE_VEC(state + 1*SSSE3_SZ_VEC, d1);
+ SSSE3_STORE_VEC(state + 2*SSSE3_SZ_VEC, d2);
+ SSSE3_STORE_VEC(state + 3*SSSE3_SZ_VEC, d3);
+}
+
+AVX2_FUNC(void, randomstream_impl_init)
+(_ntl_uint32 *state_ignored __attribute__((unused)),
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ const unsigned char *key)
+{
+ allocate_space(state_store, buf_store);
+
+ unsigned char *state = state_store.elts();
+
+ unsigned int chacha_const[] = {
+ 0x61707865,0x3320646E,0x79622D32,0x6B206574
+ };
+
+ avx2_ivec_t d0, d1, d2, d3;
+ AVX2_LOADU_VEC_128(d0, chacha_const);
+ AVX2_LOADU_VEC_128(d1, key);
+ AVX2_LOADU_VEC_128(d2, key+16);
+
+ d3 = AVX2_START;
+
+ AVX2_STORE_VEC(state + 0*AVX2_SZ_VEC, d0);
+ AVX2_STORE_VEC(state + 1*AVX2_SZ_VEC, d1);
+ AVX2_STORE_VEC(state + 2*AVX2_SZ_VEC, d2);
+ AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3);
+}
+
+SSSE3_RESOLVER(static void, randomstream_impl_init,
+ (_ntl_uint32 *state, AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store, const unsigned char *key));
+
+BASE_FUNC(long, randomstream_get_bytes)
+(_ntl_uint32 *state,
+ unsigned char *buf,
+ AlignedArray<unsigned char> &state_store __attribute__((unused)),
+ AlignedArray<unsigned char> &buf_store __attribute__((unused)),
+ unsigned char *NTL_RESTRICT res,
+ long n,
+ long pos)
+{
+ if (n < 0) LogicError("RandomStream::get: bad args");
+
+ long i, j;
+
+ if (n <= 64-pos) {
+ for (i = 0; i < n; i++) res[i] = buf[pos+i];
+ pos += n;
+ return pos;
+ }
+
+ // read remainder of buffer
+ for (i = 0; i < 64-pos; i++) res[i] = buf[pos+i];
+ n -= 64-pos;
+ res += 64-pos;
+ pos = 64;
+
+ _ntl_uint32 wdata[16];
+
+ // read 64-byte chunks
+ for (i = 0; i <= n-64; i += 64) {
+ salsa20_apply(state, wdata);
+ for (j = 0; j < 16; j++)
+ FROMLE(res + i + 4*j, wdata[j]);
+ }
+
+ if (i < n) {
+ salsa20_apply(state, wdata);
+
+ for (j = 0; j < 16; j++)
+ FROMLE(buf + 4*j, wdata[j]);
+
+ pos = n-i;
+ for (j = 0; j < pos; j++)
+ res[i+j] = buf[j];
+ }
+
+ return pos;
+}
+
+SSSE3_FUNC(long, randomstream_get_bytes)
+(_ntl_uint32 *state_ignored __attribute__((unused)),
+ unsigned char *buf_ignored __attribute__((unused)),
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ unsigned char *NTL_RESTRICT res,
+ long n,
+ long pos)
+{
+ if (n < 0) LogicError("RandomStream::get: bad args");
+ if (n == 0) return pos;
+
+ unsigned char *NTL_RESTRICT buf = buf_store.elts();
+
+ if (n <= SSSE3_RANSTREAM_BUFSZ-pos) {
+ std::memcpy(&res[0], &buf[pos], n);
+ pos += n;
+ return pos;
+ }
+
+ unsigned char *NTL_RESTRICT state = state_store.elts();
+
+ ssse3_ivec_t d0, d1, d2, d3;
+ SSSE3_LOAD_VEC(d0, state + 0*SSSE3_SZ_VEC);
+ SSSE3_LOAD_VEC(d1, state + 1*SSSE3_SZ_VEC);
+ SSSE3_LOAD_VEC(d2, state + 2*SSSE3_SZ_VEC);
+ SSSE3_LOAD_VEC(d3, state + 3*SSSE3_SZ_VEC);
+
+ // read remainder of buffer
+ std::memcpy(&res[0], &buf[pos], SSSE3_RANSTREAM_BUFSZ-pos);
+ n -= SSSE3_RANSTREAM_BUFSZ-pos;
+ res += SSSE3_RANSTREAM_BUFSZ-pos;
+ pos = SSSE3_RANSTREAM_BUFSZ;
+
+ long i = 0;
+ for (; i <= n-SSSE3_RANSTREAM_BUFSZ; i += SSSE3_RANSTREAM_BUFSZ) {
+
+ for (long j = 0; j < SSSE3_RANSTREAM_BUFSZ/(8*SSSE3_SZ_VEC); j++) {
+ ssse3_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
+ ssse3_ivec_t v4=d0, v5=d1, v6=d2, v7=SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+
+ for (long k = 0; k < CHACHA_RNDS/2; k++) {
+ SSSE3_DQROUND_VECTORS_VEC(v0,v1,v2,v3)
+ SSSE3_DQROUND_VECTORS_VEC(v4,v5,v6,v7)
+ }
+
+ SSSE3_WRITEU_VEC(res+i+j*(8*SSSE3_SZ_VEC), 0, SSSE3_ADD_VEC_32(v0,d0), SSSE3_ADD_VEC_32(v1,d1), SSSE3_ADD_VEC_32(v2,d2), SSSE3_ADD_VEC_32(v3,d3))
+ d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+ SSSE3_WRITEU_VEC(res+i+j*(8*SSSE3_SZ_VEC), 4*SSSE3_SZ_VEC, SSSE3_ADD_VEC_32(v4,d0), SSSE3_ADD_VEC_32(v5,d1), SSSE3_ADD_VEC_32(v6,d2), SSSE3_ADD_VEC_32(v7,d3))
+ d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+ }
+
+ }
+
+ if (i < n) {
+ for (long j = 0; j < SSSE3_RANSTREAM_BUFSZ/(8*SSSE3_SZ_VEC); j++) {
+ ssse3_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
+ ssse3_ivec_t v4=d0, v5=d1, v6=d2, v7=SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+
+ for (long k = 0; k < CHACHA_RNDS/2; k++) {
+ SSSE3_DQROUND_VECTORS_VEC(v0,v1,v2,v3)
+ SSSE3_DQROUND_VECTORS_VEC(v4,v5,v6,v7)
+ }
+
+ SSSE3_WRITE_VEC(buf+j*(8*SSSE3_SZ_VEC), 0, SSSE3_ADD_VEC_32(v0,d0), SSSE3_ADD_VEC_32(v1,d1), SSSE3_ADD_VEC_32(v2,d2), SSSE3_ADD_VEC_32(v3,d3))
+ d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+ SSSE3_WRITE_VEC(buf+j*(8*SSSE3_SZ_VEC), 4*SSSE3_SZ_VEC, SSSE3_ADD_VEC_32(v4,d0), SSSE3_ADD_VEC_32(v5,d1), SSSE3_ADD_VEC_32(v6,d2), SSSE3_ADD_VEC_32(v7,d3))
+ d3 = SSSE3_ADD_VEC_64(d3, SSSE3_DELTA);
+ }
+
+ pos = n-i;
+ std::memcpy(&res[i], &buf[0], pos);
+ }
+
+ SSSE3_STORE_VEC(state + 3*SSSE3_SZ_VEC, d3);
+
+ return pos;
+}
+
+AVX2_FUNC(long, randomstream_get_bytes)
+(_ntl_uint32 *state_ignored __attribute__((unused)),
+ unsigned char *buf_ignored __attribute__((unused)),
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ unsigned char *NTL_RESTRICT res,
+ long n,
+ long pos)
+{
+ if (n < 0) LogicError("RandomStream::get: bad args");
+ if (n == 0) return pos;
+
+ unsigned char *NTL_RESTRICT buf = buf_store.elts();
+
+ if (n <= AVX2_RANSTREAM_BUFSZ-pos) {
+ std::memcpy(&res[0], &buf[pos], n);
+ pos += n;
+ return pos;
+ }
+
+ unsigned char *NTL_RESTRICT state = state_store.elts();
+
+ avx2_ivec_t d0, d1, d2, d3;
+ AVX2_LOAD_VEC(d0, state + 0*AVX2_SZ_VEC);
+ AVX2_LOAD_VEC(d1, state + 1*AVX2_SZ_VEC);
+ AVX2_LOAD_VEC(d2, state + 2*AVX2_SZ_VEC);
+ AVX2_LOAD_VEC(d3, state + 3*AVX2_SZ_VEC);
+
+ // read remainder of buffer
+ std::memcpy(&res[0], &buf[pos], AVX2_RANSTREAM_BUFSZ-pos);
+ n -= AVX2_RANSTREAM_BUFSZ-pos;
+ res += AVX2_RANSTREAM_BUFSZ-pos;
+ pos = AVX2_RANSTREAM_BUFSZ;
+
+ long i = 0;
+ for (; i <= n-AVX2_RANSTREAM_BUFSZ; i += AVX2_RANSTREAM_BUFSZ) {
+
+ for (long j = 0; j < AVX2_RANSTREAM_BUFSZ/(8*AVX2_SZ_VEC); j++) {
+ avx2_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
+ avx2_ivec_t v4=d0, v5=d1, v6=d2, v7=AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+
+ for (long k = 0; k < CHACHA_RNDS/2; k++) {
+ AVX2_DQROUND_VECTORS_VEC(v0,v1,v2,v3)
+ AVX2_DQROUND_VECTORS_VEC(v4,v5,v6,v7)
+ }
+
+ AVX2_WRITEU_VEC(res+i+j*(8*AVX2_SZ_VEC), 0, AVX2_ADD_VEC_32(v0,d0), AVX2_ADD_VEC_32(v1,d1), AVX2_ADD_VEC_32(v2,d2), AVX2_ADD_VEC_32(v3,d3))
+ d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+ AVX2_WRITEU_VEC(res+i+j*(8*AVX2_SZ_VEC), 4*AVX2_SZ_VEC, AVX2_ADD_VEC_32(v4,d0), AVX2_ADD_VEC_32(v5,d1), AVX2_ADD_VEC_32(v6,d2), AVX2_ADD_VEC_32(v7,d3))
+ d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+ }
+
+ }
+
+ if (i < n) {
+ for (long j = 0; j < AVX2_RANSTREAM_BUFSZ/(8*AVX2_SZ_VEC); j++) {
+ avx2_ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
+ avx2_ivec_t v4=d0, v5=d1, v6=d2, v7=AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+
+ for (long k = 0; k < CHACHA_RNDS/2; k++) {
+ AVX2_DQROUND_VECTORS_VEC(v0,v1,v2,v3)
+ AVX2_DQROUND_VECTORS_VEC(v4,v5,v6,v7)
+ }
+
+ AVX2_WRITE_VEC(buf+j*(8*AVX2_SZ_VEC), 0, AVX2_ADD_VEC_32(v0,d0), AVX2_ADD_VEC_32(v1,d1), AVX2_ADD_VEC_32(v2,d2), AVX2_ADD_VEC_32(v3,d3))
+ d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+ AVX2_WRITE_VEC(buf+j*(8*AVX2_SZ_VEC), 4*AVX2_SZ_VEC, AVX2_ADD_VEC_32(v4,d0), AVX2_ADD_VEC_32(v5,d1), AVX2_ADD_VEC_32(v6,d2), AVX2_ADD_VEC_32(v7,d3))
+ d3 = AVX2_ADD_VEC_64(d3, AVX2_DELTA);
+ }
+
+ pos = n-i;
+ std::memcpy(&res[i], &buf[0], pos);
+ }
+
+ AVX2_STORE_VEC(state + 3*AVX2_SZ_VEC, d3);
+
+ return pos;
+}
+
+SSSE3_RESOLVER(static long, randomstream_get_bytes,
+ (_ntl_uint32 *state, unsigned char *buf,
+ AlignedArray<unsigned char> &state_store,
+ AlignedArray<unsigned char> &buf_store,
+ unsigned char *NTL_RESTRICT res,
+ long n,
+ long pos));
+
+struct RandomStream_impl {
+ AlignedArray<unsigned char> state_store;
+ AlignedArray<unsigned char> buf_store;
+ _ntl_uint32 state[16];
+ unsigned char buf[64];
+
+ explicit
+ RandomStream_impl(const unsigned char *key)
+ {
+ randomstream_impl_init(state, state_store, buf_store, key);
+ }
+
+ RandomStream_impl(const RandomStream_impl& other)
+ {
+ if (have_avx2 || have_ssse3) {
+ allocate_space(state_store, buf_store);
+ }
+ *this = other;
+ }
+
+ RandomStream_impl& operator=(const RandomStream_impl& other)
+ {
+ if (have_avx2) {
+ std::memcpy(state_store.elts(), other.state_store.elts(), AVX2_RANSTREAM_STATESZ);
+ std::memcpy(buf_store.elts(), other.buf_store.elts(), AVX2_RANSTREAM_BUFSZ);
+ } else if (have_ssse3) {
+ std::memcpy(state_store.elts(), other.state_store.elts(), SSSE3_RANSTREAM_STATESZ);
+ std::memcpy(buf_store.elts(), other.buf_store.elts(), SSSE3_RANSTREAM_BUFSZ);
+ }
+ return *this;
+ }
+
+ const unsigned char *
+ get_buf() const
+ {
+ if (have_avx2 || have_ssse3) {
+ return buf_store.elts();
+ } else {
+ return &buf[0];
+ }
+ }
+
+ long
+ get_buf_len() const
+ {
+ if (have_avx2) {
+ return AVX2_RANSTREAM_BUFSZ;
+ } else if (have_ssse3) {
+ return SSSE3_RANSTREAM_BUFSZ;
+ } else {
+ return 64;
+ }
+ }
+
+ long
+ get_bytes(unsigned char *NTL_RESTRICT res,
+ long n, long pos)
+ {
+ return randomstream_get_bytes(state, buf, state_store, buf_store,
+ res, n, pos);
+ }
+};
+
#else
struct RandomStream_impl {

View File

@ -10,8 +10,8 @@
Summary: High-performance algorithms for vectors, matrices, and polynomials
Name: ntl
Version: 10.3.0
Release: 3%{?dist}
Version: 10.5.0
Release: 1%{?dist}
License: LGPLv2+
URL: http://shoup.net/ntl/
@ -70,6 +70,9 @@ Requires: %{name}-devel%{?_isa} = %{version}-%{release}
%build
pushd src
# We eventually want to set NTL_STD_CXX14=on and NTL_SAFE_VECTORS=on, but that
# involves a change in semantics to vector and matrix assignment operations
# that latte-integrale, at least, is not yet prepared for.
./configure \
CXX="${CXX-g++}" \
CXXFLAGS="%{optflags} -fPIC" \
@ -138,7 +141,7 @@ done
%files
%doc README
%license doc/copying.txt
%{_libdir}/libntl.so.33*
%{_libdir}/libntl.so.35*
%files devel
%doc doc/*
@ -152,6 +155,9 @@ done
%changelog
* Thu Sep 28 2017 Jerry James <loganjerry@gmail.com> - 10.5.0-1
- ntl-10.5.0
* Thu Aug 03 2017 Fedora Release Engineering <releng@fedoraproject.org> - 10.3.0-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild

View File

@ -1 +1 @@
SHA512 (ntl-10.3.0.tar.gz) = a335e088829658df690c511fe8119e6f5d025ffa7b2d8b4c13b9b7aedac36efae838c66d2bfa70ef208bef9224d93448255d697e987c99c7b4928e1bbf0b9aa3
SHA512 (ntl-10.5.0.tar.gz) = b299dfc29005079470972c2a9ca02acd0ebdbc31ff8923df02f3627dbc66daa0f527226972cef032e1e488c4272554634a96456e94653fdf8b01356160319aa0