68#ifndef INCLUDED_volk_32f_tanh_32f_a_H
69#define INCLUDED_volk_32f_tanh_32f_a_H
/*
 * Body fragment of volk_32f_tanh_32f_generic (listing lines 82-86; the
 * signature and the closing braces are not part of this excerpt).
 *
 * Walks num_points floats starting at aVector and stores tanhf() of each
 * one into the matching position starting at cVector.
 */
82 unsigned int number = 0;
83 float* cPtr = cVector;
84 const float* aPtr = aVector;
85 for (; number < num_points; number++) {
/* Read one input, write one output; both cursors advance by one element. */
86 *cPtr++ = tanhf(*aPtr++);
/*
 * Fragment of volk_32f_tanh_32f_series (listing lines 98-108). The first
 * branch of the if/else chain, the bodies of the branches and the statements
 * that store the result are missing from this excerpt.
 */
98 float* cPtr = cVector;
99 const float* aPtr = aVector;
100 for (
unsigned int number = 0; number < num_points; number++) {
/*
 * Inputs at or below -4.97 take a dedicated branch (body not visible here).
 * NOTE(review): -4.97 is a double literal, so *aPtr is promoted to double
 * for this comparison.
 */
103 else if (*aPtr <= -4.97)
/*
 * With x = *aPtr and x2 = x*x:
 *   a = x * (135135 + 17325*x2 + 378*x2^2 + x2^3)
 *   b =      135135 + 62370*x2 + 3150*x2^2 + 28*x2^3
 * NOTE(review): presumably the numerator and denominator of a rational
 * approximation of tanh(x); the line combining a and b is not visible in
 * this excerpt -- confirm against the full source.
 */
106 float x2 = (*aPtr) * (*aPtr);
107 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
108 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
119#include <xmmintrin.h>
/*
 * Fragment of volk_32f_tanh_32f_a_sse (listing lines 124-163). The signature,
 * the openings of the a/b expressions, any pointer increments and whatever
 * follows the vector loop are missing from this excerpt.
 *
 * The loop runs num_points / 4 times, handling one __m128 (four floats) per
 * iteration.
 */
124 unsigned int number = 0;
125 const unsigned int quarterPoints = num_points / 4;
127 float* cPtr = cVector;
128 const float* aPtr = aVector;
130 __m128 aVal, cVal, x2, a, b;
131 __m128 const1, const2, const3, const4, const5, const6;
/* Polynomial coefficients, each broadcast to all four lanes. */
132 const1 = _mm_set_ps1(135135.0f);
133 const2 = _mm_set_ps1(17325.0f);
134 const3 = _mm_set_ps1(378.0f);
135 const4 = _mm_set_ps1(62370.0f);
136 const5 = _mm_set_ps1(3150.0f);
137 const6 = _mm_set_ps1(28.0f);
138 for (; number < quarterPoints; number++) {
/*
 * _mm_load_ps is the aligned SSE load; presumably aPtr must be 16-byte
 * aligned here -- confirm against the callers.
 */
140 aVal = _mm_load_ps(aPtr);
/* x2 = x * x, lane by lane. */
141 x2 = _mm_mul_ps(aVal, aVal);
/*
 * Tail of a nested expression whose opening is not visible:
 * const2 + x2 * (const3 + x2), i.e. 17325 + x2 * (378 + x2).
 * Presumably part of the assignment to a.
 */
147 _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
/*
 * Tail of a second nested expression: x2 * (const5 + x2 * const6),
 * i.e. x2 * (3150 + 28 * x2). Presumably part of the assignment to b.
 */
153 _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
/* Result is the lane-wise quotient a / b. */
155 cVal = _mm_div_ps(a, b);
/* Aligned store of four results to cPtr. */
157 _mm_store_ps(cPtr, cVal);
/* Index of the first element not covered by the four-wide loop. */
163 number = quarterPoints * 4;
170#include <immintrin.h>
/*
 * Fragment of volk_32f_tanh_32f_a_avx (listing lines 175-218). The signature,
 * the openings of the a/b expressions, any pointer increments and whatever
 * follows the vector loop are missing from this excerpt.
 *
 * The loop runs num_points / 8 times, handling one __m256 (eight floats) per
 * iteration.
 */
175 unsigned int number = 0;
176 const unsigned int eighthPoints = num_points / 8;
178 float* cPtr = cVector;
179 const float* aPtr = aVector;
181 __m256 aVal, cVal, x2, a, b;
182 __m256 const1, const2, const3, const4, const5, const6;
/* Polynomial coefficients, each broadcast to all eight lanes. */
183 const1 = _mm256_set1_ps(135135.0f);
184 const2 = _mm256_set1_ps(17325.0f);
185 const3 = _mm256_set1_ps(378.0f);
186 const4 = _mm256_set1_ps(62370.0f);
187 const5 = _mm256_set1_ps(3150.0f);
188 const6 = _mm256_set1_ps(28.0f);
189 for (; number < eighthPoints; number++) {
/*
 * _mm256_load_ps is the aligned AVX load; presumably aPtr must be 32-byte
 * aligned here -- confirm against the callers.
 */
191 aVal = _mm256_load_ps(aPtr);
/* x2 = x * x, lane by lane. */
192 x2 = _mm256_mul_ps(aVal, aVal);
/*
 * Tail of a nested expression whose opening is not visible:
 * const2 + x2 * (const3 + x2), i.e. 17325 + x2 * (378 + x2).
 * Presumably part of the assignment to a.
 */
199 _mm256_add_ps(const2,
200 _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
/*
 * Tail of a second nested expression: const5 + x2 * const6,
 * i.e. 3150 + 28 * x2. Presumably part of the assignment to b.
 */
208 _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
/* Result is the lane-wise quotient a / b. */
210 cVal = _mm256_div_ps(a, b);
/* Aligned store of eight results to cPtr. */
212 _mm256_store_ps(cPtr, cVal);
/* Index of the first element not covered by the eight-wide loop. */
218 number = eighthPoints * 8;
223#if LV_HAVE_AVX && LV_HAVE_FMA
224#include <immintrin.h>
/*
 * Fragment of volk_32f_tanh_32f_a_avx_fma (listing lines 227-262). The return
 * type line, the openings of the a/b expressions, any pointer increments and
 * whatever follows the vector loop are missing from this excerpt.
 *
 * Parameters as visible below:
 *   cVector    - destination floats
 *   aVector    - source floats
 *   num_points - element count; the vector loop covers num_points / 8
 *                groups of eight
 */
227volk_32f_tanh_32f_a_avx_fma(
float* cVector,
const float* aVector,
unsigned int num_points)
229 unsigned int number = 0;
230 const unsigned int eighthPoints = num_points / 8;
232 float* cPtr = cVector;
233 const float* aPtr = aVector;
235 __m256 aVal, cVal, x2, a, b;
236 __m256 const1, const2, const3, const4, const5, const6;
/* Polynomial coefficients, each broadcast to all eight lanes. */
237 const1 = _mm256_set1_ps(135135.0f);
238 const2 = _mm256_set1_ps(17325.0f);
239 const3 = _mm256_set1_ps(378.0f);
240 const4 = _mm256_set1_ps(62370.0f);
241 const5 = _mm256_set1_ps(3150.0f);
242 const6 = _mm256_set1_ps(28.0f);
243 for (; number < eighthPoints; number++) {
/*
 * _mm256_load_ps is the aligned AVX load; presumably aPtr must be 32-byte
 * aligned here -- confirm against the callers.
 */
245 aVal = _mm256_load_ps(aPtr);
/* x2 = x * x, lane by lane. */
246 x2 = _mm256_mul_ps(aVal, aVal);
/*
 * Argument tail of a call whose opening is not visible. The visible inner
 * fused multiply-add forms x2 * (const3 + x2) + const2; x2 and const1 are
 * the other visible arguments. Presumably part of the assignment to a.
 */
250 x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
/*
 * Argument tail of a second call: the nested fused multiply-adds form
 * x2 * (x2 * const6 + const5) + const4; x2 and const1 are the other visible
 * arguments. Presumably part of the assignment to b.
 */
252 x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
/* Result is the lane-wise quotient a / b. */
254 cVal = _mm256_div_ps(a, b);
/* Aligned store of eight results to cPtr. */
256 _mm256_store_ps(cPtr, cVal);
/* Index of the first element not covered by the eight-wide loop. */
262 number = eighthPoints * 8;
270#ifndef INCLUDED_volk_32f_tanh_32f_u_H
271#define INCLUDED_volk_32f_tanh_32f_u_H
280#include <xmmintrin.h>
/*
 * Fragment of volk_32f_tanh_32f_u_sse (listing lines 285-324). The signature,
 * the openings of the a/b expressions, any pointer increments and whatever
 * follows the vector loop are missing from this excerpt.
 *
 * The loop runs num_points / 4 times, handling one __m128 (four floats) per
 * iteration, using the unaligned load/store intrinsics.
 */
285 unsigned int number = 0;
286 const unsigned int quarterPoints = num_points / 4;
288 float* cPtr = cVector;
289 const float* aPtr = aVector;
291 __m128 aVal, cVal, x2, a, b;
292 __m128 const1, const2, const3, const4, const5, const6;
/* Polynomial coefficients, each broadcast to all four lanes. */
293 const1 = _mm_set_ps1(135135.0f);
294 const2 = _mm_set_ps1(17325.0f);
295 const3 = _mm_set_ps1(378.0f);
296 const4 = _mm_set_ps1(62370.0f);
297 const5 = _mm_set_ps1(3150.0f);
298 const6 = _mm_set_ps1(28.0f);
299 for (; number < quarterPoints; number++) {
/* Unaligned load of four inputs from aPtr. */
301 aVal = _mm_loadu_ps(aPtr);
/* x2 = x * x, lane by lane. */
302 x2 = _mm_mul_ps(aVal, aVal);
/*
 * Tail of a nested expression whose opening is not visible:
 * const2 + x2 * (const3 + x2), i.e. 17325 + x2 * (378 + x2).
 * Presumably part of the assignment to a.
 */
308 _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
/*
 * Tail of a second nested expression: x2 * (const5 + x2 * const6),
 * i.e. x2 * (3150 + 28 * x2). Presumably part of the assignment to b.
 */
314 _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
/* Result is the lane-wise quotient a / b. */
316 cVal = _mm_div_ps(a, b);
/* Unaligned store of four results to cPtr. */
318 _mm_storeu_ps(cPtr, cVal);
/* Index of the first element not covered by the four-wide loop. */
324 number = quarterPoints * 4;
331#include <immintrin.h>
/*
 * Fragment of volk_32f_tanh_32f_u_avx (listing lines 336-379). The signature,
 * the openings of the a/b expressions, any pointer increments and whatever
 * follows the vector loop are missing from this excerpt.
 *
 * The loop runs num_points / 8 times, handling one __m256 (eight floats) per
 * iteration, using the unaligned load/store intrinsics.
 */
336 unsigned int number = 0;
337 const unsigned int eighthPoints = num_points / 8;
339 float* cPtr = cVector;
340 const float* aPtr = aVector;
342 __m256 aVal, cVal, x2, a, b;
343 __m256 const1, const2, const3, const4, const5, const6;
/* Polynomial coefficients, each broadcast to all eight lanes. */
344 const1 = _mm256_set1_ps(135135.0f);
345 const2 = _mm256_set1_ps(17325.0f);
346 const3 = _mm256_set1_ps(378.0f);
347 const4 = _mm256_set1_ps(62370.0f);
348 const5 = _mm256_set1_ps(3150.0f);
349 const6 = _mm256_set1_ps(28.0f);
350 for (; number < eighthPoints; number++) {
/* Unaligned load of eight inputs from aPtr. */
352 aVal = _mm256_loadu_ps(aPtr);
/* x2 = x * x, lane by lane. */
353 x2 = _mm256_mul_ps(aVal, aVal);
/*
 * Tail of a nested expression whose opening is not visible:
 * const2 + x2 * (const3 + x2), i.e. 17325 + x2 * (378 + x2).
 * Presumably part of the assignment to a.
 */
360 _mm256_add_ps(const2,
361 _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
/*
 * Tail of a second nested expression: const5 + x2 * const6,
 * i.e. 3150 + 28 * x2. Presumably part of the assignment to b.
 */
369 _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
/* Result is the lane-wise quotient a / b. */
371 cVal = _mm256_div_ps(a, b);
/* Unaligned store of eight results to cPtr. */
373 _mm256_storeu_ps(cPtr, cVal);
/* Index of the first element not covered by the eight-wide loop. */
379 number = eighthPoints * 8;
384#if LV_HAVE_AVX && LV_HAVE_FMA
385#include <immintrin.h>
/*
 * Fragment of volk_32f_tanh_32f_u_avx_fma (listing lines 388-423). The return
 * type line, the openings of the a/b expressions, any pointer increments and
 * whatever follows the vector loop are missing from this excerpt.
 *
 * Parameters as visible below:
 *   cVector    - destination floats
 *   aVector    - source floats
 *   num_points - element count; the vector loop covers num_points / 8
 *                groups of eight
 */
388volk_32f_tanh_32f_u_avx_fma(
float* cVector,
const float* aVector,
unsigned int num_points)
390 unsigned int number = 0;
391 const unsigned int eighthPoints = num_points / 8;
393 float* cPtr = cVector;
394 const float* aPtr = aVector;
396 __m256 aVal, cVal, x2, a, b;
397 __m256 const1, const2, const3, const4, const5, const6;
/* Polynomial coefficients, each broadcast to all eight lanes. */
398 const1 = _mm256_set1_ps(135135.0f);
399 const2 = _mm256_set1_ps(17325.0f);
400 const3 = _mm256_set1_ps(378.0f);
401 const4 = _mm256_set1_ps(62370.0f);
402 const5 = _mm256_set1_ps(3150.0f);
403 const6 = _mm256_set1_ps(28.0f);
404 for (; number < eighthPoints; number++) {
/* Unaligned load of eight inputs from aPtr. */
406 aVal = _mm256_loadu_ps(aPtr);
/* x2 = x * x, lane by lane. */
407 x2 = _mm256_mul_ps(aVal, aVal);
/*
 * Argument tail of a call whose opening is not visible. The visible inner
 * fused multiply-add forms x2 * (const3 + x2) + const2; x2 and const1 are
 * the other visible arguments. Presumably part of the assignment to a.
 */
411 x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
/*
 * Argument tail of a second call: the nested fused multiply-adds form
 * x2 * (x2 * const6 + const5) + const4; x2 and const1 are the other visible
 * arguments. Presumably part of the assignment to b.
 */
413 x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
/* Result is the lane-wise quotient a / b. */
415 cVal = _mm256_div_ps(a, b);
/* Unaligned store of eight results to cPtr. */
417 _mm256_storeu_ps(cPtr, cVal);
/* Index of the first element not covered by the eight-wide loop. */
423 number = eighthPoints * 8;
static void volk_32f_tanh_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:334
static void volk_32f_tanh_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:80
static void volk_32f_tanh_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:173
static void volk_32f_tanh_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:122
static void volk_32f_tanh_32f_series(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:96
static void volk_32f_tanh_32f_u_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:283