Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
 
volk_32f_tan_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

#ifndef INCLUDED_volk_32f_tan_32f_a_H
#define INCLUDED_volk_32f_tan_32f_a_H
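
/*
 * Overview (added commentary, not part of the original kernel sources): each SIMD
 * kernel below follows the same scheme.  The input is folded to |x|; an even multiple
 * of pi/4 is subtracted in two pieces (pio4A + pio4B) to reduce the angle to roughly
 * [-pi/4, pi/4]; the reduced angle is divided by 8 so that a short polynomial in t^2
 * approximates 2*(1 - cos(t)); three double-angle steps s <- s*(4 - s) undo that
 * division; sin and cos are then recovered, corrected for the quadrant of the original
 * input, and tan = sin/cos is stored.
 */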
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;
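
    // Constants shared by every lane (added commentary): m4pi is 4/pi; pio4A + pio4B
    // together approximate pi/4, split into a coarse and a fine part so the multiple
    // of pi/4 can be subtracted in two, more accurate steps; cp1..cp5 are the series
    // coefficients of 2*(1 - cos(t)) in powers of t^2.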
    m4pi = _mm256_set1_ps(1.273239545);
    pio4A = _mm256_set1_ps(0.78515625);
    pio4B = _mm256_set1_ps(0.241876e-3);
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
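        // At this point s ~= 2*(1 - cos(t)) for the fully reduced angle t (the three
        // doublings above undo the divide-by-8).  Halving gives 1 - cos(t), from which
        // |sin(t)| = sqrt((2 - s) * s) and cos(t) = 1 - s are recovered below.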
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);
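
        // Quadrant fix-up (added commentary): condition1 marks lanes where sin and cos
        // must trade places, condition2 lanes where the sign of sine is flipped (based
        // on the octant index q and on the sign of the original input), and condition3
        // lanes where the sign of cosine is flipped.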
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
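
/*
 * For reference, the following is a minimal scalar sketch of the algorithm the SIMD
 * kernels in this file implement.  It is added for illustration only: the function
 * name is made up (it is not part of the VOLK API), it assumes moderate |x| so the
 * octant index fits in an int, and it expresses the sign selection with plain logical
 * operations rather than the SIMD mask arithmetic used in the kernels.
 */
static inline float tan_32f_scalar_model(float x)
{
    const float m4pi = 1.273239545f;  /* 4/pi */
    const float pio4A = 0.78515625f;  /* coarse part of pi/4 */
    const float pio4B = 0.241876e-3f; /* fine part of pi/4 */
    const float cp1 = 1.0f, cp2 = 0.83333333e-1f, cp3 = 0.2777778e-2f,
                cp4 = 0.49603e-4f, cp5 = 0.551e-6f;

    float s = fabsf(x);
    int q = (int)floorf(s * m4pi); /* octant index of |x| */
    int r = q + (q & 1);           /* round up to an even multiple of pi/4 */
    s -= r * pio4A;                /* subtract r*pi/4 in two pieces for accuracy */
    s -= r * pio4B;                /* reduced angle now lies in about (-pi/4, pi/4) */

    s /= 8.0f; /* shrink the angle so the short series below stays accurate */
    s *= s;
    /* series for 2*(1 - cos(t)) in powers of t^2 */
    s = ((((s * cp5 - cp4) * s + cp3) * s - cp2) * s + cp1) * s;
    for (int i = 0; i < 3; i++) {
        s = s * (4.0f - s); /* double-angle step: 2(1 - cos t) -> 2(1 - cos 2t) */
    }
    s /= 2.0f; /* s = 1 - cos(reduced angle) */

    float sine = sqrtf((2.0f - s) * s); /* |sin| from 1 - cos */
    float cosine = 1.0f - s;

    if (((q + 1) & 2) != 0) { /* octants where sin and cos trade places */
        float t = cosine;
        cosine = sine;
        sine = t;
    }
    if (((q & 4) != 0) != (x < 0.0f)) { /* sign of sine */
        sine = -sine;
    }
    if (((q + 2) & 4) != 0) { /* sign of cosine */
        cosine = -cosine;
    }
    return sine / cosine;
}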

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);
    pio4A = _mm256_set1_ps(0.78515625);
    pio4B = _mm256_set1_ps(0.241876e-3);
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);
    pio4A = _mm_set1_ps(0.78515625);
    pio4B = _mm_set1_ps(0.241876e-3);
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_store_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */


#endif /* INCLUDED_volk_32f_tan_32f_a_H */

#ifndef INCLUDED_volk_32f_tan_32f_u_H
#define INCLUDED_volk_32f_tan_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);
    pio4A = _mm256_set1_ps(0.78515625);
    pio4B = _mm256_set1_ps(0.241876e-3);
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);
    pio4A = _mm256_set1_ps(0.78515625);
    pio4B = _mm256_set1_ps(0.241876e-3);
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for unaligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);
    pio4A = _mm_set1_ps(0.78515625);
    pio4B = _mm_set1_ps(0.241876e-3);
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_storeu_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void
volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;
    float* bVectorPtr = bVector;
    const float* aVectorPtr = aVector;

    float32x4_t b_vec;
    float32x4_t a_vec;

    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aVectorPtr);
        // Prefetch next one, speeds things up
        __VOLK_PREFETCH(aVectorPtr + 4);
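        // _vtanq_f32() comes from volk_neon_intrinsics.h and approximates tan()
        // for all four lanes at once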
        b_vec = _vtanq_f32(a_vec);
        vst1q_f32(bVectorPtr, b_vec);
        // move pointers ahead
        bVectorPtr += 4;
        aVectorPtr += 4;
    }

    // Deal with the rest
    for (number = quarter_points * 4; number < num_points; number++) {
        *bVectorPtr++ = tanf(*aVectorPtr++);
    }
}
#endif /* LV_HAVE_NEON */


#endif /* INCLUDED_volk_32f_tan_32f_u_H */
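
/*
 * Illustrative usage (added sketch, not taken from this header): in application code
 * the kernel is normally reached through the generated dispatcher volk_32f_tan_32f(),
 * which selects the best implementation for the running machine.  The buffer length
 * and sample values below are arbitrary example choices.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   float* in = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   for (unsigned int i = 0; i < N; i++) {
 *       in[i] = -1.0f + 2.0f * (float)i / (float)N;  // sample points in [-1, 1)
 *   }
 *   volk_32f_tan_32f(out, in, N);
 *   volk_free(in);
 *   volk_free(out);
 */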