Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_tanh_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
68#ifndef INCLUDED_volk_32f_tanh_32f_a_H
69#define INCLUDED_volk_32f_tanh_32f_a_H
70
71#include <inttypes.h>
72#include <math.h>
73#include <stdio.h>
74#include <string.h>
75
76
77#ifdef LV_HAVE_GENERIC
78
79static inline void
80volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
81{
82 unsigned int number = 0;
83 float* cPtr = cVector;
84 const float* aPtr = aVector;
85 for (; number < num_points; number++) {
86 *cPtr++ = tanhf(*aPtr++);
87 }
88}
89
90#endif /* LV_HAVE_GENERIC */
91
92
93#ifdef LV_HAVE_GENERIC
94
/*
 * Approximates tanh(x) for each of the num_points inputs in aVector and
 * stores the results in cVector.
 *
 * Inputs above 4.97 produce 1 and inputs at or below -4.97 produce -1.
 * All other inputs are evaluated with the rational approximation
 *
 *   x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *   ----------------------------------------
 *   135135 + 62370 x^2 + 3150 x^4 + 28 x^6
 *
 * The input pointer is advanced once per element on every path. Advancing
 * it only on the polynomial path would leave it stuck on the first
 * saturated input, so every later output would be derived from that same
 * element.
 */
static inline void
volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    for (unsigned int number = 0; number < num_points; number++) {
        const float x = *aPtr++;
        if (x > 4.97) {
            *cPtr++ = 1;
        } else if (x <= -4.97) {
            *cPtr++ = -1;
        } else {
            const float x2 = x * x;
            const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
114
115#endif /* LV_HAVE_GENERIC */
116
117
118#ifdef LV_HAVE_SSE
119#include <xmmintrin.h>
120
/*
 * SSE kernel using aligned loads/stores (_mm_load_ps / _mm_store_ps), so
 * aVector and cVector must be 16-byte aligned.
 *
 * Processes four floats per iteration with the rational approximation
 *   x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *   / (135135 + 62370 x^2 + 3150 x^4 + 28 x^6)
 * and then saturates lanes whose input is above 4.97 to 1 and lanes whose
 * input is at or below -4.97 to -1, using the same thresholds as
 * volk_32f_tanh_32f_series. Without that step the approximation leaves
 * [-1, 1] for large |x| and turns into NaN once x^6 overflows.
 *
 * The remaining num_points % 4 elements are handled by
 * volk_32f_tanh_32f_series.
 */
static inline void
volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Coefficients of the numerator (const1..3) and denominator (const1, 4..6). */
    const __m128 const1 = _mm_set_ps1(135135.0f);
    const __m128 const2 = _mm_set_ps1(17325.0f);
    const __m128 const3 = _mm_set_ps1(378.0f);
    const __m128 const4 = _mm_set_ps1(62370.0f);
    const __m128 const5 = _mm_set_ps1(3150.0f);
    const __m128 const6 = _mm_set_ps1(28.0f);

    /* Saturation thresholds and the values substituted beyond them. */
    const __m128 upperBound = _mm_set_ps1(4.97f);
    const __m128 lowerBound = _mm_set_ps1(-4.97f);
    const __m128 plusOne = _mm_set_ps1(1.0f);
    const __m128 minusOne = _mm_set_ps1(-1.0f);

    for (unsigned int number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 x2 = _mm_mul_ps(aVal, aVal);
        const __m128 a = _mm_mul_ps(
            aVal,
            _mm_add_ps(
                const1,
                _mm_mul_ps(x2,
                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        const __m128 b = _mm_add_ps(
            const1,
            _mm_mul_ps(
                x2,
                _mm_add_ps(const4,
                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        __m128 cVal = _mm_div_ps(a, b);

        /* Per-lane select: all-ones mask lanes take the saturated value. */
        const __m128 isHigh = _mm_cmpgt_ps(aVal, upperBound);
        const __m128 isLow = _mm_cmple_ps(aVal, lowerBound);
        cVal = _mm_or_ps(_mm_and_ps(isHigh, plusOne), _mm_andnot_ps(isHigh, cVal));
        cVal = _mm_or_ps(_mm_and_ps(isLow, minusOne), _mm_andnot_ps(isLow, cVal));

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - quarterPoints * 4);
}
166#endif /* LV_HAVE_SSE */
167
168
169#ifdef LV_HAVE_AVX
170#include <immintrin.h>
171
/*
 * AVX kernel using aligned loads/stores (_mm256_load_ps / _mm256_store_ps),
 * so aVector and cVector must be 32-byte aligned.
 *
 * Processes eight floats per iteration with the rational approximation
 *   x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *   / (135135 + 62370 x^2 + 3150 x^4 + 28 x^6)
 * and then saturates lanes whose input is above 4.97 to 1 and lanes whose
 * input is at or below -4.97 to -1, using the same thresholds as
 * volk_32f_tanh_32f_series. Without that step the approximation leaves
 * [-1, 1] for large |x| and turns into NaN once x^6 overflows.
 *
 * The remaining num_points % 8 elements are handled by
 * volk_32f_tanh_32f_series.
 */
static inline void
volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Coefficients of the numerator (const1..3) and denominator (const1, 4..6). */
    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);

    /* Saturation thresholds and the values substituted beyond them. */
    const __m256 upperBound = _mm256_set1_ps(4.97f);
    const __m256 lowerBound = _mm256_set1_ps(-4.97f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        const __m256 x2 = _mm256_mul_ps(aVal, aVal);
        const __m256 a = _mm256_mul_ps(
            aVal,
            _mm256_add_ps(
                const1,
                _mm256_mul_ps(
                    x2,
                    _mm256_add_ps(const2,
                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        const __m256 b = _mm256_add_ps(
            const1,
            _mm256_mul_ps(
                x2,
                _mm256_add_ps(
                    const4,
                    _mm256_mul_ps(x2,
                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        __m256 cVal = _mm256_div_ps(a, b);

        /* blendv picks the second operand in lanes where the mask is set. */
        const __m256 isHigh = _mm256_cmp_ps(aVal, upperBound, _CMP_GT_OQ);
        const __m256 isLow = _mm256_cmp_ps(aVal, lowerBound, _CMP_LE_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, isHigh);
        cVal = _mm256_blendv_ps(cVal, minusOne, isLow);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - eighthPoints * 8);
}
221#endif /* LV_HAVE_AVX */
222
223#if LV_HAVE_AVX && LV_HAVE_FMA
224#include <immintrin.h>
225
/*
 * AVX + FMA kernel using aligned loads/stores (_mm256_load_ps /
 * _mm256_store_ps), so aVector and cVector must be 32-byte aligned.
 *
 * Processes eight floats per iteration with the rational approximation
 *   x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *   / (135135 + 62370 x^2 + 3150 x^4 + 28 x^6)
 * evaluated with fused multiply-adds, and then saturates lanes whose input
 * is above 4.97 to 1 and lanes whose input is at or below -4.97 to -1,
 * using the same thresholds as volk_32f_tanh_32f_series. Without that step
 * the approximation leaves [-1, 1] for large |x| and turns into NaN once
 * x^6 overflows.
 *
 * The remaining num_points % 8 elements are handled by
 * volk_32f_tanh_32f_series.
 */
static inline void
volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Coefficients of the numerator (const1..3) and denominator (const1, 4..6). */
    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);

    /* Saturation thresholds and the values substituted beyond them. */
    const __m256 upperBound = _mm256_set1_ps(4.97f);
    const __m256 lowerBound = _mm256_set1_ps(-4.97f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        const __m256 x2 = _mm256_mul_ps(aVal, aVal);
        const __m256 a = _mm256_mul_ps(
            aVal,
            _mm256_fmadd_ps(
                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
        const __m256 b = _mm256_fmadd_ps(
            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

        __m256 cVal = _mm256_div_ps(a, b);

        /* blendv picks the second operand in lanes where the mask is set. */
        const __m256 isHigh = _mm256_cmp_ps(aVal, upperBound, _CMP_GT_OQ);
        const __m256 isLow = _mm256_cmp_ps(aVal, lowerBound, _CMP_LE_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, isHigh);
        cVal = _mm256_blendv_ps(cVal, minusOne, isLow);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - eighthPoints * 8);
}
265#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
266
267#endif /* INCLUDED_volk_32f_tanh_32f_a_H */
268
269
270#ifndef INCLUDED_volk_32f_tanh_32f_u_H
271#define INCLUDED_volk_32f_tanh_32f_u_H
272
273#include <inttypes.h>
274#include <math.h>
275#include <stdio.h>
276#include <string.h>
277
278
279#ifdef LV_HAVE_SSE
280#include <xmmintrin.h>
281
/*
 * SSE kernel using unaligned loads/stores (_mm_loadu_ps / _mm_storeu_ps),
 * so aVector and cVector need no particular alignment.
 *
 * Processes four floats per iteration with the rational approximation
 *   x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *   / (135135 + 62370 x^2 + 3150 x^4 + 28 x^6)
 * and then saturates lanes whose input is above 4.97 to 1 and lanes whose
 * input is at or below -4.97 to -1, using the same thresholds as
 * volk_32f_tanh_32f_series. Without that step the approximation leaves
 * [-1, 1] for large |x| and turns into NaN once x^6 overflows.
 *
 * The remaining num_points % 4 elements are handled by
 * volk_32f_tanh_32f_series.
 */
static inline void
volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Coefficients of the numerator (const1..3) and denominator (const1, 4..6). */
    const __m128 const1 = _mm_set_ps1(135135.0f);
    const __m128 const2 = _mm_set_ps1(17325.0f);
    const __m128 const3 = _mm_set_ps1(378.0f);
    const __m128 const4 = _mm_set_ps1(62370.0f);
    const __m128 const5 = _mm_set_ps1(3150.0f);
    const __m128 const6 = _mm_set_ps1(28.0f);

    /* Saturation thresholds and the values substituted beyond them. */
    const __m128 upperBound = _mm_set_ps1(4.97f);
    const __m128 lowerBound = _mm_set_ps1(-4.97f);
    const __m128 plusOne = _mm_set_ps1(1.0f);
    const __m128 minusOne = _mm_set_ps1(-1.0f);

    for (unsigned int number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr);
        const __m128 x2 = _mm_mul_ps(aVal, aVal);
        const __m128 a = _mm_mul_ps(
            aVal,
            _mm_add_ps(
                const1,
                _mm_mul_ps(x2,
                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        const __m128 b = _mm_add_ps(
            const1,
            _mm_mul_ps(
                x2,
                _mm_add_ps(const4,
                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        __m128 cVal = _mm_div_ps(a, b);

        /* Per-lane select: all-ones mask lanes take the saturated value. */
        const __m128 isHigh = _mm_cmpgt_ps(aVal, upperBound);
        const __m128 isLow = _mm_cmple_ps(aVal, lowerBound);
        cVal = _mm_or_ps(_mm_and_ps(isHigh, plusOne), _mm_andnot_ps(isHigh, cVal));
        cVal = _mm_or_ps(_mm_and_ps(isLow, minusOne), _mm_andnot_ps(isLow, cVal));

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - quarterPoints * 4);
}
327#endif /* LV_HAVE_SSE */
328
329
330#ifdef LV_HAVE_AVX
331#include <immintrin.h>
332
/*
 * AVX kernel using unaligned loads/stores (_mm256_loadu_ps /
 * _mm256_storeu_ps), so aVector and cVector need no particular alignment.
 *
 * Processes eight floats per iteration with the rational approximation
 *   x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *   / (135135 + 62370 x^2 + 3150 x^4 + 28 x^6)
 * and then saturates lanes whose input is above 4.97 to 1 and lanes whose
 * input is at or below -4.97 to -1, using the same thresholds as
 * volk_32f_tanh_32f_series. Without that step the approximation leaves
 * [-1, 1] for large |x| and turns into NaN once x^6 overflows.
 *
 * The remaining num_points % 8 elements are handled by
 * volk_32f_tanh_32f_series.
 */
static inline void
volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Coefficients of the numerator (const1..3) and denominator (const1, 4..6). */
    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);

    /* Saturation thresholds and the values substituted beyond them. */
    const __m256 upperBound = _mm256_set1_ps(4.97f);
    const __m256 lowerBound = _mm256_set1_ps(-4.97f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        const __m256 x2 = _mm256_mul_ps(aVal, aVal);
        const __m256 a = _mm256_mul_ps(
            aVal,
            _mm256_add_ps(
                const1,
                _mm256_mul_ps(
                    x2,
                    _mm256_add_ps(const2,
                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        const __m256 b = _mm256_add_ps(
            const1,
            _mm256_mul_ps(
                x2,
                _mm256_add_ps(
                    const4,
                    _mm256_mul_ps(x2,
                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        __m256 cVal = _mm256_div_ps(a, b);

        /* blendv picks the second operand in lanes where the mask is set. */
        const __m256 isHigh = _mm256_cmp_ps(aVal, upperBound, _CMP_GT_OQ);
        const __m256 isLow = _mm256_cmp_ps(aVal, lowerBound, _CMP_LE_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, isHigh);
        cVal = _mm256_blendv_ps(cVal, minusOne, isLow);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - eighthPoints * 8);
}
382#endif /* LV_HAVE_AVX */
383
384#if LV_HAVE_AVX && LV_HAVE_FMA
385#include <immintrin.h>
386
/*
 * AVX + FMA kernel using unaligned loads/stores (_mm256_loadu_ps /
 * _mm256_storeu_ps), so aVector and cVector need no particular alignment.
 *
 * Processes eight floats per iteration with the rational approximation
 *   x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *   / (135135 + 62370 x^2 + 3150 x^4 + 28 x^6)
 * evaluated with fused multiply-adds, and then saturates lanes whose input
 * is above 4.97 to 1 and lanes whose input is at or below -4.97 to -1,
 * using the same thresholds as volk_32f_tanh_32f_series. Without that step
 * the approximation leaves [-1, 1] for large |x| and turns into NaN once
 * x^6 overflows.
 *
 * The remaining num_points % 8 elements are handled by
 * volk_32f_tanh_32f_series.
 */
static inline void
volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Coefficients of the numerator (const1..3) and denominator (const1, 4..6). */
    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);

    /* Saturation thresholds and the values substituted beyond them. */
    const __m256 upperBound = _mm256_set1_ps(4.97f);
    const __m256 lowerBound = _mm256_set1_ps(-4.97f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        const __m256 x2 = _mm256_mul_ps(aVal, aVal);
        const __m256 a = _mm256_mul_ps(
            aVal,
            _mm256_fmadd_ps(
                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
        const __m256 b = _mm256_fmadd_ps(
            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

        __m256 cVal = _mm256_div_ps(a, b);

        /* blendv picks the second operand in lanes where the mask is set. */
        const __m256 isHigh = _mm256_cmp_ps(aVal, upperBound, _CMP_GT_OQ);
        const __m256 isLow = _mm256_cmp_ps(aVal, lowerBound, _CMP_LE_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, isHigh);
        cVal = _mm256_blendv_ps(cVal, minusOne, isLow);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - eighthPoints * 8);
}
426#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
427
428#endif /* INCLUDED_volk_32f_tanh_32f_u_H */
static void volk_32f_tanh_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:334
static void volk_32f_tanh_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:80
static void volk_32f_tanh_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:173
static void volk_32f_tanh_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:122
static void volk_32f_tanh_32f_series(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:96
static void volk_32f_tanh_32f_u_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:283