ntl-10.1.0.

This commit is contained in:
Jerry James 2016-10-20 22:54:40 -06:00
parent 14995b0b61
commit a5419949fa
4 changed files with 101 additions and 105 deletions

1
.gitignore vendored
View File

@ -6,3 +6,4 @@
/ntl-9.9.1.tar.gz
/ntl-9.10.0.tar.gz
/ntl-9.11.0.tar.gz
/ntl-10.1.0.tar.gz

View File

@ -1,16 +1,16 @@
--- doc/config.txt.orig 2016-06-21 12:46:44.000000000 -0600
+++ doc/config.txt 2016-07-20 19:57:16.756611904 -0600
@@ -292,6 +292,7 @@ NTL_GF2X_NOINLINE=off
--- doc/config.txt.orig 2016-10-14 07:53:33.000000000 -0600
+++ doc/config.txt 2016-10-19 19:06:48.837854327 -0600
@@ -300,6 +300,7 @@ NTL_AVOID_BRANCHING=off
NTL_GF2X_NOINLINE=off
NTL_GF2X_ALTCODE=off
NTL_GF2X_ALTCODE1=off
NTL_PCLMUL=off
+NTL_LOADTIME_CPU=off
GMP_INCDIR=$(GMP_PREFIX)/include
GMP_LIBDIR=$(GMP_PREFIX)/lib
@@ -644,6 +645,10 @@ NTL_PCLMUL=off
# switch to enable the PCLMUL instruction on x86 machines for faster arithmetic
# over GF(2)[X] (without relying on the gf2x package)
@@ -597,6 +598,10 @@ NTL_GF2X_ALTCODE1=off
# Yet another alternative implementation for GF2X multiplication.
+NTL_LOADTIME_CPU=off
+
@ -19,9 +19,9 @@
########## More GMP Options:
--- include/NTL/config.h.orig 2016-06-21 12:46:44.000000000 -0600
+++ include/NTL/config.h 2016-07-20 19:57:16.766611105 -0600
@@ -625,6 +625,23 @@ using the configure script.
--- include/NTL/config.h.orig 2016-10-14 07:53:33.000000000 -0600
+++ include/NTL/config.h 2016-10-19 19:03:46.635500859 -0600
@@ -475,6 +475,20 @@ using the configure script.
#endif
@ -35,9 +35,6 @@
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ *
+ * To re-build after changing this flag:
+ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a
+ */
+
+#endif
@ -45,12 +42,12 @@
--- include/NTL/ctools.h.orig 2016-06-21 12:46:44.000000000 -0600
+++ include/NTL/ctools.h 2016-07-20 19:57:16.767611025 -0600
@@ -473,6 +473,137 @@ char *_ntl_make_aligned(char *p, long al
// this should be big enough to satisfy any SIMD instructions,
--- include/NTL/ctools.h.orig 2016-10-14 07:53:32.000000000 -0600
+++ include/NTL/ctools.h 2016-10-19 19:03:46.636500779 -0600
@@ -447,6 +447,136 @@ char *_ntl_make_aligned(char *p, long al
// and it should also be as big as a cache line
+/* Determine CPU characteristics at runtime */
+#ifdef NTL_LOADTIME_CPU
+#if !defined(__x86_64__)
@ -174,7 +171,6 @@
+ have_fma = 0; \
+ } \
+ } \
+ unsigned int eax, ebx, ecx, edx; \
+ return have_avx2 \
+ ? (void (*)(void))&name##_avx2 \
+ : (void (*)(void))&name##_fma; \
@ -183,11 +179,11 @@
+ type __attribute__((ifunc ("resolve_" #name))) name params
+#endif
#ifdef NTL_HAVE_BUILTIN_CLZL
--- include/NTL/def_config.h.orig 2016-06-21 12:46:44.000000000 -0600
+++ include/NTL/def_config.h 2016-07-20 19:57:16.767611025 -0600
@@ -625,6 +625,22 @@ using the configure script.
--- include/NTL/def_config.h.orig 2016-10-14 07:53:33.000000000 -0600
+++ include/NTL/def_config.h 2016-10-19 19:03:46.637500698 -0600
@@ -475,6 +475,19 @@ using the configure script.
#endif
@ -201,18 +197,15 @@
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ *
+ * To re-build after changing this flag:
+ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a
+ */
+
+#endif
--- src/cfile.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/cfile 2016-07-20 19:57:16.768610945 -0600
@@ -625,6 +625,23 @@ using the configure script.
--- src/cfile.orig 2016-10-14 07:53:32.000000000 -0600
+++ src/cfile 2016-10-19 19:03:46.637500698 -0600
@@ -475,6 +475,20 @@ using the configure script.
#endif
@ -226,9 +219,6 @@
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ *
+ * To re-build after changing this flag:
+ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a
+ */
+
+#endif
@ -236,10 +226,10 @@
@{WIZARD_HACK}
--- src/DispSettings.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/DispSettings.c 2016-07-20 19:57:16.768610945 -0600
@@ -191,6 +191,10 @@ cout << "Performance Options:\n";
cout << "NTL_PCLMUL\n";
--- src/DispSettings.cpp.orig 2016-10-14 07:53:32.000000000 -0600
+++ src/DispSettings.cpp 2016-10-19 19:03:46.637500698 -0600
@@ -164,6 +164,10 @@ cout << "Performance Options:\n";
cout << "NTL_GF2X_NOINLINE\n";
#endif
+#ifdef NTL_LOADTIME_CPU
@ -249,8 +239,8 @@
cout << "***************************/\n";
cout << "\n\n";
--- src/DoConfig.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/DoConfig 2016-07-20 19:57:16.769610865 -0600
--- src/DoConfig.orig 2016-10-14 07:53:32.000000000 -0600
+++ src/DoConfig 2016-10-19 19:09:03.299035983 -0600
@@ -1,7 +1,7 @@
# This is a perl script, invoked from a shell
@ -260,15 +250,15 @@
%MakeFlag = (
@@ -83,6 +83,7 @@
'NTL_RANGE_CHECK' => 'off',
'NTL_FFT_BIGTAB' => 'off',
'NTL_FFT_LAZYMUL' => 'off',
@@ -82,6 +82,7 @@
'NTL_GF2X_NOINLINE' => 'off',
'NTL_GF2X_ALTCODE' => 'off',
'NTL_GF2X_ALTCODE1' => 'off',
+'NTL_LOADTIME_CPU' => 'off',
);
@@ -149,6 +150,15 @@ if ($ConfigFlag{'NTL_THREADS'} eq 'on' &
);
@@ -191,6 +192,15 @@ if ($ConfigFlag{'NTL_THREAD_BOOST'} eq '
}
@ -284,23 +274,23 @@
# some special MakeVal values that are determined by SHARED
--- src/GF2X1.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/GF2X1.c 2016-07-20 19:57:16.770610785 -0600
--- src/GF2X1.cpp.orig 2016-10-19 19:03:46.640500457 -0600
+++ src/GF2X1.cpp 2016-10-19 19:10:34.740694338 -0600
@@ -19,7 +19,7 @@
// simple scaling factor for some crossover points:
// we use a lower crossover of the underlying multiplication
// is faster
-#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL))
+#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL) || defined (NTL_LOADTIME_CPU))
-#if (defined(NTL_GF2X_LIB) || defined(NTL_HAVE_PCLMUL))
+#if (defined(NTL_GF2X_LIB) || defined(NTL_HAVE_PCLMUL) || defined(NTL_LOADTIME_CPU))
#define XOVER_SCALE (1L)
#else
#define XOVER_SCALE (2L)
--- src/GF2X.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/GF2X.c 2016-07-20 19:57:16.771610706 -0600
@@ -31,6 +31,22 @@ pclmul_mul1 (unsigned long *c, unsigned
__m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0));
--- src/GF2X.cpp.orig 2016-10-14 07:53:31.000000000 -0600
+++ src/GF2X.cpp 2016-10-19 19:46:20.799482817 -0600
@@ -28,6 +28,22 @@ pclmul_mul1 (unsigned long *c, unsigned
_mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0));
}
+#elif defined (NTL_LOADTIME_CPU)
+
+#include <wmmintrin.h>
@ -311,16 +301,16 @@
+
+#define NTL_INLINE inline
+
+#define pclmul_mul1(c,a,b) do { \
+#define pclmul_mul1(c,a,b) do { \
+ __m128i aa = _mm_setr_epi64( _mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0)); \
+ __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0)); \
+ _mm_storeu_si128((__m128i*)(c), _mm_clmulepi64_si128(aa, bb, 0)); \
+ _mm_storeu_si128((__m128i*)(c), _mm_clmulepi64_si128(aa, bb, 0)); \
+} while (0)
+
#else
@@ -579,6 +595,27 @@ void add(GF2X& x, const GF2X& a, const G
@@ -576,6 +592,27 @@ void add(GF2X& x, const GF2X& a, const G
@ -345,10 +335,10 @@
+
+#else
+
static
static NTL_INLINE
void mul1(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
{
@@ -592,6 +629,7 @@ NTL_EFF_BB_MUL_CODE0
@@ -588,6 +625,7 @@ NTL_EFF_BB_MUL_CODE0
}
@ -356,7 +346,7 @@
#ifdef NTL_GF2X_NOINLINE
@@ -616,6 +654,51 @@ NTL_EFF_BB_MUL_CODE0
@@ -612,6 +650,51 @@ NTL_EFF_BB_MUL_CODE0
#endif
@ -408,7 +398,7 @@
static
void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
{
@@ -643,6 +726,53 @@ NTL_EFF_BB_MUL_CODE1
@@ -639,6 +722,53 @@ NTL_EFF_BB_MUL_CODE1
}
@ -462,7 +452,7 @@
static
void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
{
@@ -671,6 +801,52 @@ NTL_EFF_BB_MUL_CODE2
@@ -667,6 +797,52 @@ NTL_EFF_BB_MUL_CODE2
}
@ -515,15 +505,16 @@
static
void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
@@ -699,9 +875,29 @@ NTL_EFF_SHORT_BB_MUL_CODE1
@@ -695,10 +871,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1
}
+#endif
+
+
+#ifdef NTL_LOADTIME_CPU
+
+BASE_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
+{
+ NTL_EFF_HALF_BB_MUL_CODE0
@ -533,28 +524,28 @@
+{
+ pclmul_mul1(c, a, b);
+}
+
+AVX_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
+{
+ pclmul_mul1(c, a, b);
+}
+
+PCLMUL_RESOLVER(static void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b));
+
+#else
+
static
void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
@@ -716,6 +912,8 @@ NTL_EFF_HALF_BB_MUL_CODE0
{
@@ -712,6 +909,7 @@ NTL_EFF_HALF_BB_MUL_CODE0
}
+#endif
+
// mul2...mul8 hard-code 2x2...8x8 word multiplies.
// I adapted these routines from LiDIA (except mul3, see below).
@@ -1627,6 +1825,77 @@ static const _ntl_ulong sqrtab[256] = {
@@ -1623,6 +1821,77 @@ static const _ntl_ulong sqrtab[256] = {
@ -632,7 +623,7 @@
static inline
void sqr1(_ntl_ulong *c, _ntl_ulong a)
{
@@ -1667,6 +1936,7 @@ void sqr(GF2X& c, const GF2X& a)
@@ -1663,6 +1932,7 @@ void sqr(GF2X& c, const GF2X& a)
return;
}
@ -640,9 +631,9 @@
void LeftShift(GF2X& c, const GF2X& a, long n)
--- src/InitSettings.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/InitSettings.c 2016-07-20 19:57:16.772610626 -0600
@@ -156,6 +156,11 @@ int main()
--- src/InitSettings.cpp.orig 2016-10-14 07:53:32.000000000 -0600
+++ src/InitSettings.cpp 2016-10-19 19:03:46.643500216 -0600
@@ -148,6 +148,11 @@ int main()
cout << "NTL_RANGE_CHECK=0\n";
#endif
@ -652,10 +643,10 @@
+ cout << "NTL_LOADTIME_CPU=0\n";
+#endif
// the following is synthetically defined
#ifdef NTL_LONGLONG_SP_MULMOD
--- src/lzz_pX1.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/lzz_pX1.c 2016-07-20 19:57:16.773610546 -0600
// the following are not actual config flags, but help
// in the Wizard logic
--- src/lzz_pX1.cpp.orig 2016-10-14 07:53:32.000000000 -0600
+++ src/lzz_pX1.cpp 2016-10-19 19:03:46.644500136 -0600
@@ -4,6 +4,12 @@
#ifdef NTL_HAVE_AVX
@ -1084,8 +1075,8 @@
default:
LogicError("CompMod: bad strategy");
--- src/mat_lzz_p.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/mat_lzz_p.c 2016-07-20 19:57:16.776610306 -0600
--- src/mat_lzz_p.cpp.orig 2016-10-14 07:53:32.000000000 -0600
+++ src/mat_lzz_p.cpp 2016-10-19 19:03:46.647499895 -0600
@@ -10,6 +10,15 @@
#ifdef NTL_HAVE_AVX
@ -2128,7 +2119,7 @@
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
@@ -2482,7 +3190,8 @@ void mul_strassen(const mat_window_zz_p&
@@ -2466,7 +3174,8 @@ void mul_strassen(const mat_window_zz_p&
// this code determines if mul_base triggers blk_mul_DD,
// in which case a higher crossover is used
@ -2138,7 +2129,7 @@
{
long V = MAT_BLK_SZ*4;
long p = zz_p::modulus();
@@ -2982,10 +3691,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con
@@ -2966,10 +3675,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con
@ -2152,7 +2143,7 @@
{
long n = A.NumRows();
@@ -3151,10 +3860,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co
@@ -3135,10 +3844,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co
@ -2166,7 +2157,7 @@
{
long n = A.NumRows();
@@ -3912,8 +4621,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
@@ -3896,8 +4605,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
else if (n/MAT_BLK_SZ < 4) {
long V = 64;
@ -2178,7 +2169,7 @@
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
@@ -3938,8 +4648,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
@@ -3922,8 +4632,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
else {
long V = 4*MAT_BLK_SZ;
@ -2190,7 +2181,7 @@
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
@@ -4345,10 +5056,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p&
@@ -4329,10 +5040,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p&
@ -2204,7 +2195,7 @@
vec_zz_p *xp, bool trans, bool relax)
{
long n = A.NumRows();
@@ -4535,10 +5246,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p&
@@ -4519,10 +5230,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p&
@ -2218,7 +2209,7 @@
vec_zz_p *xp, bool trans, bool relax)
{
long n = A.NumRows();
@@ -5349,8 +6060,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
@@ -5333,8 +6044,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
else if (n/MAT_BLK_SZ < 4) {
long V = 64;
@ -2230,7 +2221,7 @@
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
@@ -5375,8 +6087,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
@@ -5359,8 +6071,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
else {
long V = 4*MAT_BLK_SZ;
@ -2242,7 +2233,7 @@
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
@@ -5622,7 +6335,7 @@ long elim_basic(const mat_zz_p& A, mat_z
@@ -5606,7 +6319,7 @@ long elim_basic(const mat_zz_p& A, mat_z
#ifdef NTL_HAVE_LL_TYPE
@ -2251,7 +2242,7 @@
static inline
@@ -7075,8 +7788,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
@@ -7059,8 +7772,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
else {
long V = 4*MAT_BLK_SZ;
@ -2263,10 +2254,10 @@
V <= (MAX_DBL_INT-(p-1))/(p-1) &&
V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
--- src/QuickTest.c.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/QuickTest.c 2016-07-20 19:57:16.777610226 -0600
@@ -339,6 +339,9 @@ cerr << "Performance Options:\n";
cerr << "NTL_PCLMUL\n";
--- src/QuickTest.cpp.orig 2016-10-14 07:53:32.000000000 -0600
+++ src/QuickTest.cpp 2016-10-19 19:03:46.647499895 -0600
@@ -316,6 +316,9 @@ cerr << "Performance Options:\n";
cerr << "NTL_GF2X_NOINLINE\n";
#endif
+#ifdef NTL_LOADTIME_CPU
@ -2275,10 +2266,10 @@
cerr << "\n\n";
--- src/WizardAux.orig 2016-06-21 12:46:44.000000000 -0600
+++ src/WizardAux 2016-07-20 19:57:16.777610226 -0600
@@ -94,6 +94,7 @@ system("make InitSettings");
'NTL_PCLMUL' => 0,
--- src/WizardAux.orig 2016-10-14 07:53:32.000000000 -0600
+++ src/WizardAux 2016-10-19 19:03:46.647499895 -0600
@@ -88,6 +88,7 @@ system("$ARGV[0] InitSettings");
'NTL_GF2X_NOINLINE' => 0,
'NTL_FFT_BIGTAB' => 0,
'NTL_FFT_LAZYMUL' => 0,
+'NTL_LOADTIME_CPU' => 0,

View File

@ -10,10 +10,10 @@
Summary: High-performance algorithms for vectors, matrices, and polynomials
Name: ntl
Version: 9.11.0
Version: 10.1.0
Release: 1%{?dist}
License: GPLv2+
License: LGPLv2+
URL: http://shoup.net/ntl/
Source0: http://shoup.net/ntl/%{name}-%{version}.tar.gz
@ -79,11 +79,12 @@ pushd src
LIBDIR=%{_libdir} \
NATIVE=off \
%{?gf2x:NTL_GF2X_LIB=on} \
NTL_PCLMUL=off \
NTL_DISABLE_TLS_HACK=on \
%ifarch x86_64
NTL_LOADTIME_CPU=on \
WIZARD=off \
TUNE=x86 \
%else
TUNE=generic \
%endif
SHARED=on
popd
@ -136,7 +137,7 @@ done
%files
%doc README
%license doc/copying.txt
%{_libdir}/libntl.so.29*
%{_libdir}/libntl.so.31*
%files devel
%doc doc/*
@ -150,6 +151,9 @@ done
%changelog
* Thu Oct 20 2016 Jerry James <loganjerry@gmail.com> - 10.1.0-1
- ntl-10.1.0
* Mon Sep 5 2016 Jerry James <loganjerry@gmail.com> - 9.11.0-1
- ntl-9.11.0

View File

@ -1 +1 @@
e87daf6ca33fbbb628df1984303f3e2c ntl-9.11.0.tar.gz
16b3449335163a753d45b5f1231bee23 ntl-10.1.0.tar.gz